author     Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2024-05-20 16:37:59 +0000
committer  Android Build Coastguard Worker <android-build-coastguard-worker@google.com>  2024-05-20 16:37:59 +0000
commit     f280dc28ba4fd713d8b92243fc97dd32b79bd902 (patch)
tree       aaa7cb313ca956a5e7b01f65223311730f0c5388
parent     1db05b5d41d80b78de1acafa6f061af6dac689f2 (diff)
parent     b1b8c8771490b286182357d1f2f8418a47e3297c (diff)
download   icing-f280dc28ba4fd713d8b92243fc97dd32b79bd902.tar.gz

Snap for 11861033 from b1b8c8771490b286182357d1f2f8418a47e3297c to androidx-concurrent-release
Change-Id: Ib053f36b0fd3bf7585788b84dfc6cc3314011d59
Diffstat (all file modes -rw-r--r--):
.gitignore | 1
Android.bp | 2
AndroidManifest.xml | 2
CMakeLists.txt | 24
OWNERS | 2
build.gradle | 73
icing/absl_ports/annotate.cc | 2
icing/absl_ports/arraysize_macros.h | 40
icing/absl_ports/ascii_str_to_lower.cc (renamed from icing/absl_ports/status_imports.h) | 17
icing/absl_ports/ascii_str_to_lower.h | 38
icing/absl_ports/status_test.cc | 53
icing/absl_ports/str_cat.cc | 5
icing/absl_ports/str_join.cc | 41
icing/absl_ports/str_join.h | 11
icing/document-builder.h | 138
icing/file/destructible-directory.h | 74
icing/file/destructible-directory_test.cc | 118
icing/file/destructible-file.h | 72
icing/file/destructible-file_test.cc | 117
icing/file/file-backed-bitmap.cc | 24
icing/file/file-backed-bitmap.h | 5
icing/file/file-backed-proto-log.h | 478
icing/file/file-backed-proto-log_benchmark.cc | 169
icing/file/file-backed-proto-log_test.cc | 420
icing/file/file-backed-proto.h | 59
icing/file/file-backed-proto_test.cc | 16
icing/file/file-backed-vector.h | 695
icing/file/file-backed-vector_benchmark.cc | 158
icing/file/file-backed-vector_test.cc | 1020
icing/file/filesystem.cc | 169
icing/file/filesystem.h | 20
icing/file/filesystem_test.cc | 43
icing/file/memory-mapped-file-leak_test.cc | 72
icing/file/memory-mapped-file.cc | 354
icing/file/memory-mapped-file.h | 282
icing/file/memory-mapped-file_test.cc | 668
icing/file/mock-filesystem.h | 16
icing/file/persistent-hash-map.cc | 750
icing/file/persistent-hash-map.h | 529
icing/file/persistent-hash-map_test.cc | 1577
icing/file/persistent-storage.cc | 55
icing/file/persistent-storage.h | 369
icing/file/portable-file-backed-proto-log.h | 1263
icing/file/portable-file-backed-proto-log_benchmark.cc | 343
icing/file/portable-file-backed-proto-log_test.cc | 1265
icing/file/posting_list/flash-index-storage-header.h | 122
icing/file/posting_list/flash-index-storage.cc | 661
icing/file/posting_list/flash-index-storage.h | 381
icing/file/posting_list/flash-index-storage_test.cc | 610
icing/file/posting_list/index-block.cc | 333
icing/file/posting_list/index-block.h | 369
icing/file/posting_list/index-block_test.cc | 357
icing/file/posting_list/posting-list-accessor.cc | 136
icing/file/posting_list/posting-list-accessor.h | 118
icing/file/posting_list/posting-list-common.h | 33
icing/file/posting_list/posting-list-free.h (renamed from icing/index/posting-list-free.h) | 58
icing/file/posting_list/posting-list-free_test.cc (renamed from icing/index/posting-list-free_test.cc) | 86
icing/file/posting_list/posting-list-identifier.cc | 27
icing/file/posting_list/posting-list-identifier.h | 120
icing/file/posting_list/posting-list-used.cc | 58
icing/file/posting_list/posting-list-used.h | 174
icing/file/posting_list/posting-list-utils.cc (renamed from icing/index/posting-list-utils.cc) | 23
icing/file/posting_list/posting-list-utils.h (renamed from icing/index/posting-list-utils.h) | 21
icing/file/version-util.cc | 150
icing/file/version-util.h | 115
icing/file/version-util_test.cc | 484
icing/icing-search-engine-test-jni-layer.cc (renamed from icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc) | 5
icing/icing-search-engine-with-icu-file_test.cc | 46
icing/icing-search-engine.cc | 2360
icing/icing-search-engine.h | 380
icing/icing-search-engine_backwards_compatibility_test.cc | 569
icing/icing-search-engine_benchmark.cc | 1027
icing/icing-search-engine_delete_test.cc | 768
icing/icing-search-engine_flush_benchmark.cc | 199
icing/icing-search-engine_fuzz_test.cc | 30
icing/icing-search-engine_initialization_test.cc | 6030
icing/icing-search-engine_optimize_test.cc | 1855
icing/icing-search-engine_put_test.cc | 481
icing/icing-search-engine_schema_test.cc | 3159
icing/icing-search-engine_search_test.cc | 7173
icing/icing-search-engine_suggest_test.cc | 1601
icing/icing-search-engine_test.cc | 3936
icing/index/data-indexing-handler.h | 69
icing/index/hit/doc-hit-info.cc | 62
icing/index/hit/doc-hit-info.h | 51
icing/index/hit/doc-hit-info_test.cc | 134
icing/index/hit/hit.cc | 65
icing/index/hit/hit.h | 116
icing/index/hit/hit_test.cc | 151
icing/index/index-processor.cc | 96
icing/index/index-processor.h | 91
icing/index/index-processor_benchmark.cc | 315
icing/index/index-processor_test.cc | 1516
icing/index/index.cc | 293
icing/index/index.h | 206
icing/index/index_test.cc | 2563
icing/index/integer-section-indexing-handler.cc | 112
icing/index/integer-section-indexing-handler.h | 71
icing/index/integer-section-indexing-handler_test.cc | 601
icing/index/iterator/doc-hit-info-iterator-all-document-id.cc | 8
icing/index/iterator/doc-hit-info-iterator-all-document-id.h | 14
icing/index/iterator/doc-hit-info-iterator-all-document-id_test.cc | 55
icing/index/iterator/doc-hit-info-iterator-and.cc | 76
icing/index/iterator/doc-hit-info-iterator-and.h | 48
icing/index/iterator/doc-hit-info-iterator-and_test.cc | 451
icing/index/iterator/doc-hit-info-iterator-filter.cc | 112
icing/index/iterator/doc-hit-info-iterator-filter.h | 25
icing/index/iterator/doc-hit-info-iterator-filter_test.cc | 559
icing/index/iterator/doc-hit-info-iterator-none.h | 52
icing/index/iterator/doc-hit-info-iterator-not.cc | 56
icing/index/iterator/doc-hit-info-iterator-not.h | 18
icing/index/iterator/doc-hit-info-iterator-not_test.cc | 56
icing/index/iterator/doc-hit-info-iterator-or.cc | 87
icing/index/iterator/doc-hit-info-iterator-or.h | 63
icing/index/iterator/doc-hit-info-iterator-or_test.cc | 426
icing/index/iterator/doc-hit-info-iterator-property-in-document.cc | 65
icing/index/iterator/doc-hit-info-iterator-property-in-document.h | 73
icing/index/iterator/doc-hit-info-iterator-property-in-schema.cc | 103
icing/index/iterator/doc-hit-info-iterator-property-in-schema.h | 80
icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc | 269
icing/index/iterator/doc-hit-info-iterator-section-restrict.cc | 245
icing/index/iterator/doc-hit-info-iterator-section-restrict.h | 63
icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc | 452
icing/index/iterator/doc-hit-info-iterator-term.cc | 125
icing/index/iterator/doc-hit-info-iterator-term.h | 108
icing/index/iterator/doc-hit-info-iterator-test-util.h | 131
icing/index/iterator/doc-hit-info-iterator.h | 221
icing/index/iterator/doc-hit-info-iterator_benchmark.cc | 10
icing/index/iterator/section-restrict-data.cc | 82
icing/index/iterator/section-restrict-data.h | 98
icing/index/lite-index.cc | 457
icing/index/lite-index.h | 269
icing/index/lite/doc-hit-info-iterator-term-lite.cc | 217
icing/index/lite/doc-hit-info-iterator-term-lite.h | 173
icing/index/lite/lite-index-header.h (renamed from icing/legacy/index/icing-lite-index-header.h) | 37
icing/index/lite/lite-index-options.cc (renamed from icing/legacy/index/icing-lite-index-options.cc) | 34
icing/index/lite/lite-index-options.h (renamed from icing/legacy/index/icing-lite-index-options.h) | 20
icing/index/lite/lite-index.cc | 716
icing/index/lite/lite-index.h | 444
icing/index/lite/lite-index_test.cc | 741
icing/index/lite/lite-index_thread-safety_test.cc | 399
icing/index/lite/term-id-hit-pair.h | 85
icing/index/main/doc-hit-info-iterator-term-main.cc | 218
icing/index/main/doc-hit-info-iterator-term-main.h | 204
icing/index/main/main-index-merger.cc | 305
icing/index/main/main-index-merger.h | 49
icing/index/main/main-index-merger_test.cc | 382
icing/index/main/main-index.cc | 858
icing/index/main/main-index.h | 350
icing/index/main/main-index_test.cc | 710
icing/index/main/posting-list-hit-accessor.cc | 123
icing/index/main/posting-list-hit-accessor.h | 101
icing/index/main/posting-list-hit-accessor_test.cc | 366
icing/index/main/posting-list-hit-serializer.cc | 714
icing/index/main/posting-list-hit-serializer.h | 345
icing/index/main/posting-list-hit-serializer_test.cc | 731
icing/index/numeric/doc-hit-info-iterator-numeric.h | 85
icing/index/numeric/dummy-numeric-index.h | 351
icing/index/numeric/integer-index-bucket-util.cc | 205
icing/index/numeric/integer-index-bucket-util.h | 81
icing/index/numeric/integer-index-bucket-util_test.cc | 1112
icing/index/numeric/integer-index-data.h | 59
icing/index/numeric/integer-index-storage.cc | 1180
icing/index/numeric/integer-index-storage.h | 506
icing/index/numeric/integer-index-storage_benchmark.cc | 407
icing/index/numeric/integer-index-storage_test.cc | 2161
icing/index/numeric/integer-index.cc | 651
icing/index/numeric/integer-index.h | 409
icing/index/numeric/integer-index_test.cc | 2598
icing/index/numeric/numeric-index.h | 204
icing/index/numeric/posting-list-integer-index-accessor.cc | 164
icing/index/numeric/posting-list-integer-index-accessor.h | 130
icing/index/numeric/posting-list-integer-index-accessor_test.cc | 535
icing/index/numeric/posting-list-integer-index-serializer.cc | 512
icing/index/numeric/posting-list-integer-index-serializer.h | 338
icing/index/numeric/posting-list-integer-index-serializer_test.cc | 491
icing/index/posting-list-used.cc | 613
icing/index/posting-list-used.h | 321
icing/index/posting-list-used_test.cc | 537
icing/index/property-existence-indexing-handler.cc | 127
icing/index/property-existence-indexing-handler.h | 86
icing/index/property-existence-indexing-handler_test.cc | 524
icing/index/string-section-indexing-handler.cc | 114
icing/index/string-section-indexing-handler.h | 77
icing/index/term-indexing-handler.cc | 146
icing/index/term-indexing-handler.h | 97
icing/index/term-indexing-handler_test.cc | 664
icing/index/term-metadata.h | 11
icing/jni.lds | 9
icing/jni/icing-search-engine-jni.cc | 496
icing/jni/jni-cache.cc | 7
icing/jni/jni-cache.h | 12
icing/jni/scoped-primitive-array-critical.h | 86
icing/jni/scoped-primitive-array-critical_test.cc | 140
icing/jni/scoped-utf-chars.h | 81
icing/jni/scoped-utf-chars_test.cc | 126
icing/join/aggregation-scorer.cc | 139
icing/join/aggregation-scorer.h | 41
icing/join/aggregation-scorer_test.cc | 215
icing/join/doc-join-info.cc | 49
icing/join/doc-join-info.h | 66
icing/join/doc-join-info_test.cc | 96
icing/join/document-id-to-join-info.h | 67
icing/join/join-children-fetcher.cc | 39
icing/join/join-children-fetcher.h | 73
icing/join/join-children-fetcher_test.cc | 83
icing/join/join-processor.cc | 270
icing/join/join-processor.h | 88
icing/join/join-processor_test.cc | 930
icing/join/posting-list-join-data-accessor.h | 211
icing/join/posting-list-join-data-accessor_test.cc | 435
icing/join/posting-list-join-data-serializer.h | 803
icing/join/posting-list-join-data-serializer_test.cc | 653
icing/join/qualified-id-join-index-impl-v1.cc | 476
icing/join/qualified-id-join-index-impl-v1.h | 327
icing/join/qualified-id-join-index-impl-v1_test.cc | 931
icing/join/qualified-id-join-index-impl-v2.cc | 681
icing/join/qualified-id-join-index-impl-v2.h | 369
icing/join/qualified-id-join-index-impl-v2_test.cc | 1414
icing/join/qualified-id-join-index.h | 187
icing/join/qualified-id-join-indexing-handler-v1_test.cc | 558
icing/join/qualified-id-join-indexing-handler.cc | 179
icing/join/qualified-id-join-indexing-handler.h | 78
icing/join/qualified-id-join-indexing-handler_test.cc | 829
icing/join/qualified-id.cc | 110
icing/join/qualified-id.h | 65
icing/join/qualified-id_test.cc | 159
icing/legacy/core/icing-core-types.h | 3
icing/legacy/core/icing-string-util.cc | 17
icing/legacy/core/icing-string-util.h | 9
icing/legacy/core/icing-timer.h | 3
icing/legacy/index/icing-array-storage.cc | 32
icing/legacy/index/icing-array-storage.h | 3
icing/legacy/index/icing-bit-util.h | 5
icing/legacy/index/icing-common-types.h | 129
icing/legacy/index/icing-dynamic-trie.cc | 560
icing/legacy/index/icing-dynamic-trie.h | 109
icing/legacy/index/icing-dynamic-trie_test.cc | 1450
icing/legacy/index/icing-filesystem.cc | 89
icing/legacy/index/icing-filesystem.h | 13
icing/legacy/index/icing-flash-bitmap.cc | 25
icing/legacy/index/icing-flash-bitmap.h | 4
icing/legacy/index/icing-mmapper.cc | 11
icing/legacy/index/icing-mmapper.h | 4
icing/legacy/index/icing-mock-filesystem.h | 225
icing/legacy/index/icing-storage-file.cc | 14
icing/legacy/index/icing-storage.h | 1
icing/monkey_test/icing-monkey-test-runner.cc | 525
icing/monkey_test/icing-monkey-test-runner.h | 79
icing/monkey_test/icing-search-engine_monkey_test.cc | 99
icing/monkey_test/in-memory-icing-search-engine.cc | 352
icing/monkey_test/in-memory-icing-search-engine.h | 167
icing/monkey_test/monkey-test-common-words.h | 284
icing/monkey_test/monkey-test-generators.cc | 346
icing/monkey_test/monkey-test-generators.h | 127
icing/monkey_test/monkey-test-util.h | 68
icing/monkey_test/monkey-tokenized-document.h | 38
icing/performance-configuration.cc | 41
icing/performance-configuration.h | 19
icing/portable/endian.h | 208
icing/portable/equals-proto.h | 2
icing/portable/gzip_stream.cc | 313
icing/portable/gzip_stream.h | 177
icing/portable/platform.h | 106
icing/query/advanced_query_parser/abstract-syntax-tree-test-utils.h | 108
icing/query/advanced_query_parser/abstract-syntax-tree.h | 184
icing/query/advanced_query_parser/abstract-syntax-tree_test.cc | 143
icing/query/advanced_query_parser/function.cc | 77
icing/query/advanced_query_parser/function.h | 66
icing/query/advanced_query_parser/function_test.cc | 332
icing/query/advanced_query_parser/lexer.cc | 270
icing/query/advanced_query_parser/lexer.h | 169
icing/query/advanced_query_parser/lexer_fuzz_test.cc | 37
icing/query/advanced_query_parser/lexer_test.cc | 698
icing/query/advanced_query_parser/param.h | 57
icing/query/advanced_query_parser/parser.cc | 449
icing/query/advanced_query_parser/parser.h | 141
icing/query/advanced_query_parser/parser_integration_test.cc | 1012
icing/query/advanced_query_parser/parser_test.cc | 1087
icing/query/advanced_query_parser/pending-value.cc | 44
icing/query/advanced_query_parser/pending-value.h | 160
icing/query/advanced_query_parser/query-visitor.cc | 963
icing/query/advanced_query_parser/query-visitor.h | 327
icing/query/advanced_query_parser/query-visitor_test.cc | 4112
icing/query/advanced_query_parser/util/string-util.cc | 106
icing/query/advanced_query_parser/util/string-util.h | 49
icing/query/advanced_query_parser/util/string-util_test.cc | 125
icing/query/query-features.h | 63
icing/query/query-processor.cc | 169
icing/query/query-processor.h | 47
icing/query/query-processor_benchmark.cc | 227
icing/query/query-processor_test.cc | 2897
icing/query/query-results.h | 46
icing/query/query-terms.h | 10
icing/query/query-utils.cc | 42
icing/query/query-utils.h | 30
icing/query/suggestion-processor.cc | 311
icing/query/suggestion-processor.h | 78
icing/query/suggestion-processor_test.cc | 722
icing/result/page-result-state.h | 15
icing/result/page-result.h | 46
icing/result/projection-tree.cc | 50
icing/result/projection-tree.h | 61
icing/result/projection-tree_test.cc | 118
icing/result/projector.cc | 62
icing/result/projector.h | 36
icing/result/result-adjustment-info.cc | 64
icing/result/result-adjustment-info.h | 53
icing/result/result-adjustment-info_test.cc | 198
icing/result/result-retriever-v2.cc | 268
icing/result/result-retriever-v2.h | 111
icing/result/result-retriever-v2_group-result-limiter_test.cc | 1163
icing/result/result-retriever-v2_projection_test.cc | 1957
icing/result/result-retriever-v2_snippet_test.cc | 1162
icing/result/result-retriever-v2_test.cc | 1012
icing/result/result-retriever.cc | 96
icing/result/result-retriever.h | 95
icing/result/result-retriever_test.cc | 586
icing/result/result-state-manager.cc | 223
icing/result/result-state-manager.h | 113
icing/result/result-state-manager_test.cc | 1755
icing/result/result-state-manager_thread-safety_test.cc | 458
icing/result/result-state-v2.cc | 84
icing/result/result-state-v2.h | 175
icing/result/result-state-v2_test.cc | 409
icing/result/result-state.cc | 70
icing/result/result-state.h | 81
icing/result/result-state_test.cc | 214
icing/result/snippet-retriever-test-jni-layer.cc | 36
icing/result/snippet-retriever.cc | 630
icing/result/snippet-retriever_benchmark.cc | 333
icing/result/snippet-retriever_test.cc | 1711
icing/schema-builder.h | 227
icing/schema/backup-schema-producer.cc | 164
icing/schema/backup-schema-producer.h | 55
icing/schema/backup-schema-producer_test.cc | 737
icing/schema/joinable-property-manager-builder_test.cc | 446
icing/schema/joinable-property-manager.cc | 203
icing/schema/joinable-property-manager.h | 160
icing/schema/joinable-property-manager_test.cc | 519
icing/schema/joinable-property.h | 132
icing/schema/property-util.cc | 137
icing/schema/property-util.h | 212
icing/schema/property-util_test.cc | 253
icing/schema/schema-property-iterator.cc | 198
icing/schema/schema-property-iterator.h | 222
icing/schema/schema-property-iterator_test.cc | 3905
icing/schema/schema-store.cc | 824
icing/schema/schema-store.h | 406
icing/schema/schema-store_test.cc | 2919
icing/schema/schema-type-manager.cc | 108
icing/schema/schema-type-manager.h | 79
icing/schema/schema-type-manager_test.cc | 356
icing/schema/schema-util.cc | 974
icing/schema/schema-util.h | 274
icing/schema/schema-util_test.cc | 5468
icing/schema/section-manager-builder_test.cc | 341
icing/schema/section-manager.cc | 391
icing/schema/section-manager.h | 109
icing/schema/section-manager_test.cc | 1244
icing/schema/section.h | 92
icing/scoring/advanced_scoring/advanced-scorer.cc | 68
icing/scoring/advanced_scoring/advanced-scorer.h | 92
icing/scoring/advanced_scoring/advanced-scorer_fuzz_test.cc | 70
icing/scoring/advanced_scoring/advanced-scorer_test.cc | 1039
icing/scoring/advanced_scoring/score-expression.cc | 521
icing/scoring/advanced_scoring/score-expression.h | 348
icing/scoring/advanced_scoring/score-expression_test.cc | 353
icing/scoring/advanced_scoring/scoring-visitor.cc | 191
icing/scoring/advanced_scoring/scoring-visitor.h | 108
icing/scoring/bm25f-calculator.cc | 248
icing/scoring/bm25f-calculator.h | 177
icing/scoring/priority-queue-scored-document-hits-ranker.h | 128
icing/scoring/priority-queue-scored-document-hits-ranker_test.cc | 255
icing/scoring/ranker.cc | 132
icing/scoring/ranker.h | 25
icing/scoring/ranker_benchmark.cc | 4
icing/scoring/score-and-rank_benchmark.cc | 224
icing/scoring/scored-document-hit.cc | 30
icing/scoring/scored-document-hit.h | 80
icing/scoring/scored-document-hit_test.cc | 77
icing/scoring/scored-document-hits-ranker.h | 62
icing/scoring/scorer-factory.cc | 242
icing/scoring/scorer-factory.h | 49
icing/scoring/scorer-test-utils.h | 77
icing/scoring/scorer.cc | 100
icing/scoring/scorer.h | 36
icing/scoring/scorer_test.cc | 613
icing/scoring/scoring-processor.cc | 44
icing/scoring/scoring-processor.h | 19
icing/scoring/scoring-processor_test.cc | 876
icing/scoring/section-weights.cc | 151
icing/scoring/section-weights.h | 96
icing/scoring/section-weights_test.cc | 447
icing/store/corpus-associated-scoring-data.h | 79
icing/store/corpus-id.h | 32
icing/store/document-associated-score-data.h | 34
icing/store/document-filter-data.h | 1
icing/store/document-id.h | 7
icing/store/document-log-creator.cc | 205
icing/store/document-log-creator.h | 85
icing/store/document-store.cc | 1828
icing/store/document-store.h | 554
icing/store/document-store_benchmark.cc | 342
icing/store/document-store_test.cc | 4034
icing/store/dynamic-trie-key-mapper.h | 334
icing/store/dynamic-trie-key-mapper_test.cc | 67
icing/store/key-mapper.h | 265
icing/store/key-mapper_benchmark.cc | 323
icing/store/key-mapper_test.cc | 181
icing/store/namespace-fingerprint-identifier.cc | 73
icing/store/namespace-fingerprint-identifier.h | 72
icing/store/namespace-fingerprint-identifier_test.cc | 148
icing/store/namespace-id.h | 1
icing/store/persistent-hash-map-key-mapper.h | 206
icing/store/persistent-hash-map-key-mapper_test.cc | 52
icing/store/suggestion-result-checker-impl.h | 154
icing/store/suggestion-result-checker.h | 44
icing/store/usage-store.cc | 262
icing/store/usage-store.h | 205
icing/store/usage-store_test.cc | 628
icing/testing/always-false-suggestion-result-checker-impl.h | 36
icing/testing/always-true-suggestion-result-checker-impl.h | 36
icing/testing/common-matchers.cc | 124
icing/testing/common-matchers.h | 436
icing/testing/fake-clock.h | 25
icing/testing/fake-clock_test.cc | 13
icing/testing/hit-test-utils.cc | 59
icing/testing/hit-test-utils.h | 43
icing/testing/icu-data-file-helper.cc (renamed from icing/helpers/icu/icu-data-file-helper.cc) | 4
icing/testing/icu-data-file-helper.h (renamed from icing/helpers/icu/icu-data-file-helper.h) | 6
icing/testing/icu-i18n-test-utils.cc | 2
icing/testing/jni-test-helpers.h | 2
icing/testing/numeric/normal-distribution-number-generator.h | 42
icing/testing/numeric/number-generator.h | 39
icing/testing/numeric/uniform-distribution-integer-generator.h | 41
icing/testing/random-string.cc | 54
icing/testing/random-string.h | 32
icing/testing/random-string_test.cc | 54
icing/testing/schema-generator.h | 42
icing/testing/snippet-helpers.cc | 80
icing/testing/snippet-helpers.h | 60
icing/text_classifier/lib3/utils/base/logging.h | 1
icing/text_classifier/lib3/utils/base/statusor.h | 97
icing/text_classifier/lib3/utils/java/jni-base.cc | 6
icing/text_classifier/lib3/utils/java/jni-base.h | 4
icing/text_classifier/lib3/utils/java/jni-helper.h | 17
icing/tokenization/combined-tokenizer_test.cc | 262
icing/tokenization/icu/icu-language-segmenter-factory.cc | 5
icing/tokenization/icu/icu-language-segmenter.cc | 265
icing/tokenization/icu/icu-language-segmenter.h | 35
icing/tokenization/icu/icu-language-segmenter_test.cc | 942
icing/tokenization/language-segmenter-factory.h | 8
icing/tokenization/language-segmenter-iterator-test-jni-layer.cc | 37
icing/tokenization/language-segmenter-iterator_test.cc | 135
icing/tokenization/language-segmenter.h | 79
icing/tokenization/language-segmenter_benchmark.cc | 16
icing/tokenization/plain-tokenizer-test-jni-layer.cc | 36
icing/tokenization/plain-tokenizer.cc | 39
icing/tokenization/plain-tokenizer_test.cc | 472
icing/tokenization/raw-query-tokenizer.cc | 452
icing/tokenization/raw-query-tokenizer_test.cc | 677
icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc (renamed from icing/jni/reverse-jni-break-iterator.cc) | 6
icing/tokenization/reverse_jni/reverse-jni-break-iterator.h (renamed from icing/jni/reverse-jni-break-iterator.h) | 8
icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc | 4
icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc | 37
icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h | 46
icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc | 324
icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h | 2
icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc (renamed from icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc) | 601
icing/tokenization/rfc822-tokenizer.cc | 798
icing/tokenization/rfc822-tokenizer.h | 38
icing/tokenization/rfc822-tokenizer_test.cc | 992
icing/tokenization/simple/space-language-segmenter-factory.cc | 41
icing/tokenization/simple/space-language-segmenter.cc | 205
icing/tokenization/simple/space-language-segmenter.h | 58
icing/tokenization/simple/space-language-segmenter_test.cc | 114
icing/tokenization/token.h | 37
icing/tokenization/tokenizer-factory.cc | 23
icing/tokenization/tokenizer-factory.h | 2
icing/tokenization/tokenizer.h | 42
icing/tokenization/verbatim-tokenizer.cc | 144
icing/tokenization/verbatim-tokenizer.h | 41
icing/tokenization/verbatim-tokenizer_test.cc | 210
icing/tools/document-store-dump.cc | 119
icing/tools/document-store-dump.h | 35
icing/tools/icing-tool.cc | 306
icing/transform/icu/icu-normalizer.cc | 188
icing/transform/icu/icu-normalizer.h | 47
icing/transform/icu/icu-normalizer_benchmark.cc | 163
icing/transform/icu/icu-normalizer_test.cc | 193
icing/transform/map/map-normalizer.cc | 112
icing/transform/map/map-normalizer.h | 12
icing/transform/map/map-normalizer_benchmark.cc | 102
icing/transform/map/map-normalizer_test.cc | 99
icing/transform/map/normalization-map.cc | 26
icing/transform/map/normalization-map.h | 2
icing/transform/normalizer.h | 12
icing/transform/simple/none-normalizer-factory.cc | 53
icing/transform/simple/none-normalizer.h | 51
icing/transform/simple/none-normalizer_test.cc | 74
icing/util/bit-util.h | 82
icing/util/bit-util_test.cc | 145
icing/util/character-iterator.cc | 269
icing/util/character-iterator.h | 116
icing/util/character-iterator_test.cc | 266
icing/util/clock.cc | 19
icing/util/clock.h | 70
icing/util/crc32.h | 6
icing/util/data-loss.h | 36
icing/util/document-validator.cc | 33
icing/util/document-validator.h | 10
icing/util/document-validator_test.cc | 305
icing/util/encode-util.cc | 50
icing/util/encode-util.h | 45
icing/util/encode-util_test.cc | 91
icing/util/fingerprint-util.cc | 48
icing/util/fingerprint-util.h | 47
icing/util/fingerprint-util_test.cc | 75
icing/util/i18n-utils.cc | 25
icing/util/i18n-utils.h | 10
icing/util/logging.cc | 125
icing/util/logging.h | 140
icing/util/logging_raw.cc | 104
icing/util/logging_raw.h | 34
icing/util/logging_test.cc | 158
icing/util/math-util.h | 2
icing/util/snippet-helpers.cc | 94
icing/util/snippet-helpers.h | 60
icing/util/tokenized-document.cc | 92
icing/util/tokenized-document.h | 92
icing/util/tokenized-document_test.cc | 455
java/build.gradle | 88
java/src/com/google/android/icing/IcingSearchEngine.java | 438
java/src/com/google/android/icing/IcingSearchEngineImpl.java | 331
java/src/com/google/android/icing/IcingSearchEngineInterface.java | 148
java/src/com/google/android/icing/IcingSearchEngineUtils.java | 471
java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java | 666
lint-baseline.xml | 487
nativeLib/build.gradle | 43
proto/icing/index/numeric/wildcard-property-storage.proto | 22
proto/icing/proto/debug.proto | 137
proto/icing/proto/document.proto | 76
proto/icing/proto/document_wrapper.proto | 4
proto/icing/proto/initialize.proto | 105
proto/icing/proto/internal/optimize.proto | 29
proto/icing/proto/logging.proto | 364
proto/icing/proto/optimize.proto | 55
proto/icing/proto/persist.proto | 22
proto/icing/proto/schema.proto | 183
proto/icing/proto/scoring.proto | 115
proto/icing/proto/search.proto | 460
proto/icing/proto/status.proto | 11
proto/icing/proto/storage.proto | 187
proto/icing/proto/usage.proto | 69
synced_AOSP_CL_number.txt | 1
556 files changed, 163055 insertions, 20125 deletions
diff --git a/.gitignore b/.gitignore
index f57bd5e..962fbd5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@
# Files
*.iml
+*.cmake.gen
\ No newline at end of file
diff --git a/Android.bp b/Android.bp
index 7982c4f..82b7b59 100644
--- a/Android.bp
+++ b/Android.bp
@@ -50,6 +50,8 @@ cc_defaults {
"-funsigned-char",
"-fvisibility=hidden",
+
+ "-Bsymbolic",
],
}
diff --git a/AndroidManifest.xml b/AndroidManifest.xml
deleted file mode 100644
index 7377c53..0000000
--- a/AndroidManifest.xml
+++ /dev/null
@@ -1,2 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<manifest package="com.google.android.icing" />
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0830783..4b7c752 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,9 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-cmake_minimum_required(VERSION 3.10.2)
+cmake_minimum_required(VERSION 3.22.1)
+
+project(icing)
add_definitions("-DICING_REVERSE_JNI_SEGMENTATION=1")
+set(VERSION_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/icing/jni.lds")
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_SHARED_LINKER_FLAGS
+ "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--gc-sections -Wl,--version-script=${VERSION_SCRIPT}")
set(
Protobuf_PREBUILTS_DIR
@@ -45,7 +51,7 @@ add_subdirectory("${Protobuf_SOURCE_DIR}/cmake" ${Protobuf_TARGET_BINARY_DIR})
# Compile libandroidicu
set(ICU_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../icu/libandroidicu")
set(ICU_TARGET_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/icu-target")
-add_subdirectory(${ICU_SOURCE_DIR} ${ICU_TARGET_BINARY_DIR})
+add_subdirectory("${ICU_SOURCE_DIR}/static_shim" ${ICU_TARGET_BINARY_DIR})
# Glob Icing proto sources. Results look like this: icing/proto/document.proto
file(
@@ -55,7 +61,10 @@ file(
"*.proto")
message(STATUS "Icing_PROTO_FILES=${Icing_PROTO_FILES}")
+
# Run protoc on Icing_PROTO_FILES to generate pb.cc and pb.h files
+# The DEPENDS section of add_custom_command could trigger a remake if any proto
+# source file has been updated.
file(MAKE_DIRECTORY ${Icing_PROTO_GEN_DIR})
foreach(FILE ${Icing_PROTO_FILES})
# Find the name of the proto file without the .proto extension
@@ -68,10 +77,10 @@ foreach(FILE ${Icing_PROTO_FILES})
"${Icing_PROTO_GEN_DIR}/${FILE_NOEXT}.pb.h"
COMMAND ${Protobuf_PROTOC_PATH}
--proto_path "${CMAKE_CURRENT_SOURCE_DIR}/proto"
- --cpp_out ${Icing_PROTO_GEN_DIR}
+ --cpp_out "lite:${Icing_PROTO_GEN_DIR}"
${FILE}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
- DEPENDS ${Protobuf_PROTOC_PATH}
+ DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/proto/${FILE}
)
endforeach()
message(STATUS "Icing_PROTO_SOURCES=${Icing_PROTO_SOURCES}")
@@ -89,6 +98,11 @@ file(
# Glob expressions
icing/*.cc icing/*.h
)
+
+# TODO(b/170611579): When supporting cmake v3.12 or higher, use CONFIGURE_DEPENDS
+# in the glob and remove this section.
+include(synced_AOSP_CL_number.txt)
+
# Exclude the same types of files as Android.bp. See the comments there.
list(FILTER Icing_CC_SOURCES EXCLUDE REGEX "^icing/.*[^a-zA-Z0-9]test[^a-zA-Z0-9].*$")
list(FILTER Icing_CC_SOURCES EXCLUDE REGEX "^icing/.*_benchmark\.cc$")
@@ -116,4 +130,4 @@ target_include_directories(icing PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
target_include_directories(icing PRIVATE ${Icing_PROTO_GEN_DIR})
target_include_directories(icing PRIVATE "${Protobuf_SOURCE_DIR}/src")
target_include_directories(icing PRIVATE "${ICU_SOURCE_DIR}/include")
-target_link_libraries(icing protobuf::libprotobuf libandroidicu log)
+target_link_libraries(icing protobuf::libprotobuf-lite libandroidicu log z)
diff --git a/OWNERS b/OWNERS
new file mode 100644
index 0000000..93c8e30
--- /dev/null
+++ b/OWNERS
@@ -0,0 +1,2 @@
+adorokhine@google.com
+tjbarron@google.com
diff --git a/build.gradle b/build.gradle
new file mode 100644
index 0000000..97cc5e1
--- /dev/null
+++ b/build.gradle
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2020 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import androidx.build.SdkHelperKt
+
+plugins {
+ id("AndroidXPlugin")
+ id("AndroidXRepackagePlugin")
+ id("java-library")
+ id("com.google.protobuf")
+}
+
+repackage {
+ // Must match what is in frameworks/support/appsearch/appsearch-external-protobuf/build.gradle
+ addRelocation {
+ sourcePackage = "com.google.protobuf"
+ targetPackage = "com.google.android.icing.protobuf"
+ }
+}
+
+sourceSets {
+ main {
+ java.srcDir 'java/src/'
+ proto.srcDir 'proto/'
+ }
+}
+
+dependencies {
+ compileOnly("androidx.annotation:annotation:1.1.0")
+ compileOnly(SdkHelperKt.getSdkDependency(project))
+ compileOnly(libs.protobufLite)
+}
+
+afterEvaluate {
+ lint {
+ lintOptions {
+ // protobuf generates unannotated methods
+ disable("UnknownNullness")
+ }
+ }
+}
+
+protobuf {
+ protoc {
+ artifact = libs.protobufCompiler.get()
+ }
+ generateProtoTasks {
+ all().each { task ->
+ task.builtins {
+ java {
+ option 'lite'
+ }
+ }
+ }
+ }
+}
+
+androidx {
+ mavenVersion = LibraryVersions.APPSEARCH
+}
diff --git a/icing/absl_ports/annotate.cc b/icing/absl_ports/annotate.cc
index d283e13..dfe5566 100644
--- a/icing/absl_ports/annotate.cc
+++ b/icing/absl_ports/annotate.cc
@@ -33,7 +33,7 @@ libtextclassifier3::Status Annotate(const libtextclassifier3::Status& s,
std::string new_msg =
(!s.error_message().empty())
- ? absl_ports::StrCat(s.error_message(), kErrorSeparator, msg)
+ ? absl_ports::StrCat(msg, kErrorSeparator, s.error_message())
: std::string(msg);
return libtextclassifier3::Status(s.CanonicalCode(), new_msg);
}
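
This reordering puts the newest annotation first and the original status message last. A standalone sketch of the resulting behavior (kErrorSeparator's actual value lives elsewhere in annotate.cc; "; " is assumed here purely for illustration):

    #include <string>
    #include <string_view>

    // Mirrors the new ordering: newest context first, original cause last.
    std::string AnnotateMessageSketch(std::string_view existing_msg,
                                      std::string_view new_msg) {
      if (existing_msg.empty()) return std::string(new_msg);
      return std::string(new_msg) + "; " + std::string(existing_msg);
    }
    // AnnotateMessageSketch("root cause", "while indexing")
    //   -> "while indexing; root cause"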
diff --git a/icing/absl_ports/arraysize_macros.h b/icing/absl_ports/arraysize_macros.h
new file mode 100644
index 0000000..e09c019
--- /dev/null
+++ b/icing/absl_ports/arraysize_macros.h
@@ -0,0 +1,40 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_ABSL_PORTS_ARRAYSIZE_MACROS_H_
+#define ICING_ABSL_PORTS_ARRAYSIZE_MACROS_H_
+
+#include <cstddef>
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+// ABSL_PORT_ARRAYSIZE()
+//
+// Returns the number of elements in an array as a compile-time constant, which
+// can be used in defining new arrays. If you use this macro on a pointer by
+// mistake, you will get a compile-time error.
+#define ABSL_PORT_ARRAYSIZE(array) (sizeof(absl_ports::ArraySizeHelper(array)))
+
+// Note: this internal template function declaration is used by ABSL_PORT_ARRAYSIZE.
+// The function doesn't need a definition, as we only use its type.
+template <typename T, size_t N>
+auto ArraySizeHelper(const T (&array)[N]) -> char (&)[N];
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
+
+#endif  // ICING_ABSL_PORTS_ARRAYSIZE_MACROS_H_
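
A short usage sketch of the macro (hypothetical values, placed inside icing::lib so the absl_ports:: qualification inside the macro resolves):

    #include "icing/absl_ports/arraysize_macros.h"

    namespace icing {
    namespace lib {

    constexpr int kPrimes[] = {2, 3, 5, 7, 11};
    // ArraySizeHelper deduces N and returns char (&)[N], so sizeof(...) == N
    // and the whole expression is a compile-time constant.
    static_assert(ABSL_PORT_ARRAYSIZE(kPrimes) == 5, "bound is deduced");
    int counts[ABSL_PORT_ARRAYSIZE(kPrimes)] = {};  // usable as an array bound

    // const int* p = kPrimes;
    // ABSL_PORT_ARRAYSIZE(p);  // would not compile: no array bound to deduce

    }  // namespace lib
    }  // namespace icing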
diff --git a/icing/absl_ports/status_imports.h b/icing/absl_ports/ascii_str_to_lower.cc
index 3a97fd6..f181751 100644
--- a/icing/absl_ports/status_imports.h
+++ b/icing/absl_ports/ascii_str_to_lower.cc
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Google LLC
+// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -12,21 +12,21 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_ABSL_PORTS_STATUS_IMPORTS_H_
-#define ICING_ABSL_PORTS_STATUS_IMPORTS_H_
+#include "icing/absl_ports/ascii_str_to_lower.h"
-#include "icing/text_classifier/lib3/utils/base/status.h"
+#include <cctype>
+#include <string>
namespace icing {
namespace lib {
namespace absl_ports {
-// TODO(b/144458732) Delete this file once visibility on TC3 Status has been
-// granted to the sample app.
-using Status = libtextclassifier3::Status;
+void AsciiStrToLower(std::string* s) {
+ for (auto& ch : *s) {
+ ch = std::tolower(static_cast<unsigned char>(ch));
+ }
+}
} // namespace absl_ports
} // namespace lib
} // namespace icing
-
-#endif // ICING_ABSL_PORTS_STATUS_IMPORTS_H_
diff --git a/icing/absl_ports/ascii_str_to_lower.h b/icing/absl_ports/ascii_str_to_lower.h
new file mode 100644
index 0000000..0233fa8
--- /dev/null
+++ b/icing/absl_ports/ascii_str_to_lower.h
@@ -0,0 +1,39 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_ABSL_PORTS_ASCII_STR_TO_LOWER_H_
+#define ICING_ABSL_PORTS_ASCII_STR_TO_LOWER_H_
+
+#include <string>
+#include <string_view>
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+// Converts the characters in `s` to lowercase, changing the contents of `s`.
+void AsciiStrToLower(std::string* s);
+
+// Creates a lowercase string from a given std::string_view.
+inline std::string AsciiStrToLower(std::string_view s) {
+ std::string result(s);
+ AsciiStrToLower(&result);
+ return result;
+}
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
+
+#endif  // ICING_ABSL_PORTS_ASCII_STR_TO_LOWER_H_
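
The in-place overload in the .cc casts each character through unsigned char before std::tolower, since passing a negative plain char (possible for non-ASCII input where char is signed) is undefined behavior. A brief usage sketch of the two overloads:

    #include <string>
    #include "icing/absl_ports/ascii_str_to_lower.h"

    void LowercaseExample() {
      std::string s = "Hello WORLD";
      icing::lib::absl_ports::AsciiStrToLower(&s);  // in place: "hello world"
      // The string_view overload returns a lowered copy instead:
      std::string t =
          icing::lib::absl_ports::AsciiStrToLower("MiXeD");  // "mixed"
    }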
diff --git a/icing/absl_ports/status_test.cc b/icing/absl_ports/status_test.cc
new file mode 100644
index 0000000..1909302
--- /dev/null
+++ b/icing/absl_ports/status_test.cc
@@ -0,0 +1,53 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/proto/document.pb.h"
+
+namespace icing {
+namespace lib {
+
+TEST(StatusTest, StatusOrOfProtoConstructorTest) {
+ libtextclassifier3::StatusOr<DocumentProto> status_or =
+ absl_ports::InvalidArgumentError("test");
+ libtextclassifier3::StatusOr<DocumentProto> new_status_or = status_or;
+}
+
+TEST(StatusTest, StatusOrOfProtoMoveConstructorTest) {
+ libtextclassifier3::StatusOr<DocumentProto> status_or =
+ absl_ports::InvalidArgumentError("test");
+ libtextclassifier3::StatusOr<DocumentProto> new_status_or =
+ std::move(status_or);
+}
+
+TEST(StatusTest, StatusOrOfProtoAssignmentTest) {
+ libtextclassifier3::StatusOr<DocumentProto> status_or =
+ absl_ports::InvalidArgumentError("test");
+ libtextclassifier3::StatusOr<DocumentProto> new_status_or;
+ new_status_or = status_or;
+}
+
+TEST(StatusTest, StatusOrOfProtoMoveAssignmentTest) {
+ libtextclassifier3::StatusOr<DocumentProto> status_or =
+ absl_ports::InvalidArgumentError("test");
+ libtextclassifier3::StatusOr<DocumentProto> new_status_or;
+ new_status_or = std::move(status_or);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/absl_ports/str_cat.cc b/icing/absl_ports/str_cat.cc
index 2cf020d..8695824 100644
--- a/icing/absl_ports/str_cat.cc
+++ b/icing/absl_ports/str_cat.cc
@@ -175,9 +175,8 @@ void StrAppendPieces(std::string* dest, std::vector<std::string_view> pieces) {
for (std::string_view s : pieces) {
result_size += s.length();
}
- // Create result with enough room to fit all operands.
- std::string result;
- result.__resize_default_init(result_size);
+ // Resize dest with enough room to fit all operands.
+ dest->__resize_default_init(result_size);
char* out = &(*dest)[old_size];
for (std::string_view s : pieces) {
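
The bug fixed here: the old code grew a local string, result, while the subsequent writes went through a pointer into dest's unchanged, too-small buffer. A self-contained sketch of the corrected pattern, with plain resize standing in for the libc++-specific __resize_default_init:

    #include <cstring>
    #include <string>
    #include <string_view>
    #include <vector>

    void StrAppendPiecesSketch(std::string* dest,
                               std::vector<std::string_view> pieces) {
      size_t old_size = dest->size();
      size_t result_size = old_size;
      for (std::string_view s : pieces) result_size += s.length();
      dest->resize(result_size);       // grow dest itself, not a local copy
      char* out = &(*dest)[old_size];  // now points into valid storage
      for (std::string_view s : pieces) {
        std::memcpy(out, s.data(), s.length());
        out += s.length();
      }
    }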
diff --git a/icing/absl_ports/str_join.cc b/icing/absl_ports/str_join.cc
new file mode 100644
index 0000000..2d105ca
--- /dev/null
+++ b/icing/absl_ports/str_join.cc
@@ -0,0 +1,41 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/absl_ports/str_join.h"
+
+namespace icing {
+namespace lib {
+namespace absl_ports {
+
+std::vector<std::string_view> StrSplit(std::string_view text,
+ std::string_view sep) {
+ std::vector<std::string_view> substrings;
+ size_t separator_position = text.find(sep);
+ size_t current_start = 0;
+ size_t current_end = separator_position;
+ while (separator_position != std::string_view::npos) {
+ substrings.push_back(
+ text.substr(current_start, current_end - current_start));
+ current_start = current_end + sep.length();
+ separator_position = text.find(sep, current_start);
+ current_end = separator_position;
+ }
+ current_end = text.length();
+ substrings.push_back(text.substr(current_start, current_end - current_start));
+ return substrings;
+}
+
+} // namespace absl_ports
+} // namespace lib
+} // namespace icing
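
Note that the loop always pushes one final piece after the last separator, so StrSplit never returns an empty vector, and the returned string_views alias the input buffer. A usage sketch:

    #include <string>
    #include <string_view>
    #include <vector>
    #include "icing/absl_ports/str_join.h"

    void SplitExample() {
      std::string csv = "a,b,";  // must outlive the returned views
      std::vector<std::string_view> parts =
          icing::lib::absl_ports::StrSplit(csv, ",");
      // parts == {"a", "b", ""}: a trailing separator yields an empty piece,
      // and an empty input yields {""} rather than {}.
    }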
diff --git a/icing/absl_ports/str_join.h b/icing/absl_ports/str_join.h
index 7c8936a..5277bca 100644
--- a/icing/absl_ports/str_join.h
+++ b/icing/absl_ports/str_join.h
@@ -17,6 +17,7 @@
#include <string>
#include <string_view>
+#include <vector>
#include "icing/absl_ports/str_cat.h"
@@ -92,6 +93,11 @@ std::string StrJoin(Iterator first, Iterator last, std::string_view sep,
return result;
}
+template <typename Iterator>
+std::string StrJoin(Iterator first, Iterator last, std::string_view sep) {
+ return absl_ports::StrJoin(first, last, sep, DefaultFormatter());
+}
+
template <typename Container, typename Formatter>
std::string StrJoin(const Container& container, std::string_view sep,
Formatter&& formatter) {
@@ -104,8 +110,11 @@ std::string StrJoin(const Container& container, std::string_view sep) {
return absl_ports::StrJoin(container, sep, DefaultFormatter());
}
+std::vector<std::string_view> StrSplit(std::string_view text,
+ std::string_view sep);
+
} // namespace absl_ports
} // namespace lib
} // namespace icing
-#endif // ICING_ABSL_PORTS_STR_JOIN_H_
+#endif  // ICING_ABSL_PORTS_STR_JOIN_H_
\ No newline at end of file
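
The added iterator-range overload lets callers join a sub-range without spelling out DefaultFormatter. A sketch pairing it with the StrSplit declared in this header:

    #include <string>
    #include <string_view>
    #include <vector>
    #include "icing/absl_ports/str_join.h"

    void JoinExample() {
      std::vector<std::string_view> parts =
          icing::lib::absl_ports::StrSplit("x|y|z", "|");
      // New overload: no explicit formatter argument required.
      std::string tail =
          icing::lib::absl_ports::StrJoin(parts.begin() + 1, parts.end(), "|");
      // tail == "y|z"
    }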
diff --git a/icing/document-builder.h b/icing/document-builder.h
index 4c95b89..44500f9 100644
--- a/icing/document-builder.h
+++ b/icing/document-builder.h
@@ -71,11 +71,6 @@ class DocumentBuilder {
return *this;
}
- DocumentBuilder& ClearCustomProperties() {
- document_.clear_custom_properties();
- return *this;
- }
-
// Takes a property name and any number of string values.
template <typename... V>
DocumentBuilder& AddStringProperty(std::string property_name,
@@ -83,26 +78,25 @@ class DocumentBuilder {
return AddStringProperty(std::move(property_name), {string_values...});
}
- // Takes a custom property name and any number of string values.
- template <typename... V>
- DocumentBuilder& AddCustomStringProperty(std::string property_name,
- V... string_values) {
- return AddCustomStringProperty(std::move(property_name),
- {string_values...});
+ // Takes a property name and iterator of int64_t values.
+ template <typename InputIt>
+ DocumentBuilder& AddInt64Property(std::string property_name, InputIt first,
+ InputIt last) {
+ auto property = document_.add_properties();
+ property->set_name(std::move(property_name));
+ for (InputIt it = first; it != last; ++it) {
+ property->mutable_int64_values()->Add(*it);
+ }
+ return *this;
}
// Takes a property name and any number of int64_t values.
template <typename... V>
DocumentBuilder& AddInt64Property(std::string property_name,
V... int64_values) {
- return AddInt64Property(std::move(property_name), {int64_values...});
- }
-
- // Takes a custom property name and any number of int64_t values.
- template <typename... V>
- DocumentBuilder& AddCustomInt64Property(std::string property_name,
- V... int64_values) {
- return AddCustomInt64Property(std::move(property_name), {int64_values...});
+ std::initializer_list<int64_t> int64_values_list = {int64_values...};
+ return AddInt64Property(std::move(property_name), int64_values_list.begin(),
+ int64_values_list.end());
}
// Takes a property name and any number of double values.
@@ -112,14 +106,6 @@ class DocumentBuilder {
return AddDoubleProperty(std::move(property_name), {double_values...});
}
- // Takes a custom property name and any number of double values.
- template <typename... V>
- DocumentBuilder& AddCustomDoubleProperty(std::string property_name,
- V... double_values) {
- return AddCustomDoubleProperty(std::move(property_name),
- {double_values...});
- }
-
// Takes a property name and any number of boolean values.
template <typename... V>
DocumentBuilder& AddBooleanProperty(std::string property_name,
@@ -127,28 +113,12 @@ class DocumentBuilder {
return AddBooleanProperty(std::move(property_name), {boolean_values...});
}
- // Takes a custom property name and any number of boolean values.
- template <typename... V>
- DocumentBuilder& AddCustomBooleanProperty(std::string property_name,
- V... boolean_values) {
- return AddCustomBooleanProperty(std::move(property_name),
- {boolean_values...});
- }
-
// Takes a property name and any number of bytes values.
template <typename... V>
DocumentBuilder& AddBytesProperty(std::string property_name,
V... bytes_values) {
return AddBytesProperty(std::move(property_name), {bytes_values...});
}
-
- // Takes a custom property name and any number of bytes values.
- template <typename... V>
- DocumentBuilder& AddCustomBytesProperty(std::string property_name,
- V... bytes_values) {
- return AddCustomBytesProperty(std::move(property_name), {bytes_values...});
- }
-
// Takes a property name and any number of document values.
template <typename... V>
DocumentBuilder& AddDocumentProperty(std::string property_name,
@@ -156,14 +126,6 @@ class DocumentBuilder {
return AddDocumentProperty(std::move(property_name), {document_values...});
}
- // Takes a custom property name and any number of document values.
- template <typename... V>
- DocumentBuilder& AddCustomDocumentProperty(std::string property_name,
- V&&... document_values) {
- return AddCustomDocumentProperty(std::move(property_name),
- {document_values...});
- }
-
DocumentProto Build() const { return document_; }
private:
@@ -180,37 +142,6 @@ class DocumentBuilder {
return *this;
}
- DocumentBuilder& AddCustomStringProperty(
- std::string property_name,
- std::initializer_list<std::string_view> string_values) {
- auto custom_property = document_.add_custom_properties();
- custom_property->set_name(std::move(property_name));
- for (std::string_view string_value : string_values) {
- custom_property->mutable_string_values()->Add(std::string(string_value));
- }
- return *this;
- }
-
- DocumentBuilder& AddInt64Property(
- std::string property_name, std::initializer_list<int64_t> int64_values) {
- auto property = document_.add_properties();
- property->set_name(std::move(property_name));
- for (int64_t int64_value : int64_values) {
- property->mutable_int64_values()->Add(int64_value);
- }
- return *this;
- }
-
- DocumentBuilder& AddCustomInt64Property(
- std::string property_name, std::initializer_list<int64_t> int64_values) {
- auto custom_property = document_.add_custom_properties();
- custom_property->set_name(std::move(property_name));
- for (int64_t int64_value : int64_values) {
- custom_property->mutable_int64_values()->Add(int64_value);
- }
- return *this;
- }
-
DocumentBuilder& AddDoubleProperty(
std::string property_name, std::initializer_list<double> double_values) {
auto property = document_.add_properties();
@@ -221,16 +152,6 @@ class DocumentBuilder {
return *this;
}
- DocumentBuilder& AddCustomDoubleProperty(
- std::string property_name, std::initializer_list<double> double_values) {
- auto custom_property = document_.add_custom_properties();
- custom_property->set_name(std::move(property_name));
- for (double double_value : double_values) {
- custom_property->mutable_double_values()->Add(double_value);
- }
- return *this;
- }
-
DocumentBuilder& AddBooleanProperty(
std::string property_name, std::initializer_list<bool> boolean_values) {
auto property = document_.add_properties();
@@ -241,16 +162,6 @@ class DocumentBuilder {
return *this;
}
- DocumentBuilder& AddCustomBooleanProperty(
- std::string property_name, std::initializer_list<bool> boolean_values) {
- auto custom_property = document_.add_custom_properties();
- custom_property->set_name(std::move(property_name));
- for (bool boolean_value : boolean_values) {
- custom_property->mutable_boolean_values()->Add(boolean_value);
- }
- return *this;
- }
-
DocumentBuilder& AddBytesProperty(
std::string property_name,
std::initializer_list<std::string> bytes_values) {
@@ -262,17 +173,6 @@ class DocumentBuilder {
return *this;
}
- DocumentBuilder& AddCustomBytesProperty(
- std::string property_name,
- std::initializer_list<std::string> bytes_values) {
- auto custom_property = document_.add_custom_properties();
- custom_property->set_name(std::move(property_name));
- for (const std::string& bytes_value : bytes_values) {
- custom_property->mutable_bytes_values()->Add(std::string(bytes_value));
- }
- return *this;
- }
-
DocumentBuilder& AddDocumentProperty(
std::string property_name,
std::initializer_list<DocumentProto> document_values) {
@@ -283,18 +183,6 @@ class DocumentBuilder {
}
return *this;
}
-
- DocumentBuilder& AddCustomDocumentProperty(
- std::string property_name,
- std::initializer_list<DocumentProto> document_values) {
- auto custom_property = document_.add_custom_properties();
- custom_property->set_name(std::move(property_name));
- for (DocumentProto document_value : document_values) {
- custom_property->mutable_document_values()->Add(
- std::move(document_value));
- }
- return *this;
- }
};
} // namespace lib
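
A usage sketch of the new iterator-range AddInt64Property next to the unchanged variadic overload, which now delegates through the same code path. SetKey and SetSchema are existing DocumentBuilder setters; the schema and property names below are made up for the example:

    #include <cstdint>
    #include <vector>
    #include "icing/document-builder.h"

    icing::lib::DocumentProto MakeScoredDoc(
        const std::vector<int64_t>& scores) {
      return icing::lib::DocumentBuilder()
          .SetKey("namespace", "uri/1")
          .SetSchema("ScoreType")
          // New overload: copy values from any iterator range.
          .AddInt64Property("scores", scores.begin(), scores.end())
          // Variadic overload, routed through the iterator version.
          .AddInt64Property("version", 2)
          .Build();
    }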
diff --git a/icing/file/destructible-directory.h b/icing/file/destructible-directory.h
new file mode 100644
index 0000000..9a8bd4b
--- /dev/null
+++ b/icing/file/destructible-directory.h
@@ -0,0 +1,77 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_DESTRUCTIBLE_DIRECTORY_H_
+#define ICING_FILE_DESTRUCTIBLE_DIRECTORY_H_
+
+#include "icing/file/filesystem.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+// A convenient RAII class which will recursively create the directory at the
+// specified file path and delete it upon destruction.
+class DestructibleDirectory {
+ public:
+ explicit DestructibleDirectory(const Filesystem* filesystem, std::string dir)
+ : filesystem_(filesystem), dir_(std::move(dir)) {
+ is_valid_ = filesystem_->CreateDirectoryRecursively(dir_.c_str());
+ }
+
+ DestructibleDirectory(const DestructibleDirectory&) = delete;
+ DestructibleDirectory& operator=(const DestructibleDirectory&) = delete;
+
+ DestructibleDirectory(DestructibleDirectory&& rhs)
+ : filesystem_(nullptr), is_valid_(false) {
+ Swap(rhs);
+ }
+
+ DestructibleDirectory& operator=(DestructibleDirectory&& rhs) {
+ Swap(rhs);
+ return *this;
+ }
+
+ ~DestructibleDirectory() {
+ if (filesystem_ != nullptr &&
+ !filesystem_->DeleteDirectoryRecursively(dir_.c_str())) {
+ // Swallow deletion failures as there's nothing actionable to do about
+ // them.
+ ICING_LOG(WARNING) << "Unable to delete temporary directory: " << dir_;
+ }
+ }
+
+ const std::string& dir() const { return dir_; }
+
+ bool is_valid() const { return is_valid_; }
+
+ private:
+ void Swap(DestructibleDirectory& other) {
+ std::swap(filesystem_, other.filesystem_);
+ std::swap(dir_, other.dir_);
+ std::swap(is_valid_, other.is_valid_);
+ }
+
+ const Filesystem* filesystem_;
+ std::string dir_;
+ bool is_valid_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_DESTRUCTIBLE_DIRECTORY_H_
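
Because moves are swap-based, a moved-from object carries the previous directory and deletes it on destruction, which the tests below rely on. A usage sketch of the RAII contract (the path is illustrative):

    #include "icing/file/destructible-directory.h"
    #include "icing/file/filesystem.h"

    void UseScratchDir(const icing::lib::Filesystem& filesystem) {
      icing::lib::DestructibleDirectory scratch(
          &filesystem, "/data/local/tmp/icing-scratch");
      if (!scratch.is_valid()) return;  // recursive creation failed
      // ... write temporary files under scratch.dir() ...
    }  // destructor removes the whole tree, even on early return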
diff --git a/icing/file/destructible-directory_test.cc b/icing/file/destructible-directory_test.cc
new file mode 100644
index 0000000..dae74ff
--- /dev/null
+++ b/icing/file/destructible-directory_test.cc
@@ -0,0 +1,118 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/destructible-directory.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+
+TEST(DestructibleDirectoryTest, DeletesDirectoryProperly) {
+ Filesystem filesystem;
+ std::string dir_path = GetTestTempDir() + "/dir1";
+ std::string file_path = dir_path + "/file1";
+
+ {
+ // 1. Create a file in the directory.
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(dir_path.c_str()));
+ ScopedFd sfd(filesystem.OpenForWrite(file_path.c_str()));
+ ASSERT_TRUE(sfd.is_valid());
+ int i = 127;
+ ASSERT_TRUE(filesystem.Write(sfd.get(), &i, sizeof(i)));
+ }
+
+ {
+ // 2. Open the directory with a DestructibleDirectory
+ DestructibleDirectory destructible(&filesystem, dir_path);
+ EXPECT_TRUE(destructible.is_valid());
+ EXPECT_THAT(destructible.dir(), Eq(dir_path));
+ }
+
+ // 3. Ensure that the file and directory don't exist.
+ EXPECT_FALSE(filesystem.FileExists(file_path.c_str()));
+ EXPECT_FALSE(filesystem.DirectoryExists(dir_path.c_str()));
+}
+
+TEST(DestructibleDirectoryTest, MoveAssignDeletesDirectoryProperly) {
+ Filesystem filesystem;
+ std::string filepath1 = GetTestTempDir() + "/dir1";
+ std::string filepath2 = GetTestTempDir() + "/dir2";
+
+ // 1. Create dir1
+ DestructibleDirectory destructible1(&filesystem, filepath1);
+ ASSERT_TRUE(destructible1.is_valid());
+ ASSERT_TRUE(filesystem.DirectoryExists(filepath1.c_str()));
+
+ {
+ // 2. Create dir2
+ DestructibleDirectory destructible2(&filesystem, filepath2);
+ ASSERT_TRUE(destructible2.is_valid());
+
+ // Move assign destructible2 into destructible1
+ destructible1 = std::move(destructible2);
+ }
+
+ // 3. dir1 shouldn't exist because it was destroyed when destructible1 was
+ // move assigned to.
+ EXPECT_FALSE(filesystem.DirectoryExists(filepath1.c_str()));
+
+ // 4. dir2 should still exist because it moved into destructible1 from
+ // destructible2.
+ EXPECT_TRUE(filesystem.DirectoryExists(filepath2.c_str()));
+}
+
+TEST(DestructibleDirectoryTest, MoveConstructionDeletesDirectoryProperly) {
+ Filesystem filesystem;
+ std::string filepath1 = GetTestTempDir() + "/dir1";
+
+  // 1. Declare destructible1; it'll be assigned soon anyway.
+ std::unique_ptr<DestructibleDirectory> destructible1;
+ {
+    // 2. Create dir1
+ DestructibleDirectory destructible2(&filesystem, filepath1);
+ ASSERT_TRUE(destructible2.is_valid());
+
+ // Move construct destructible1 from destructible2
+ destructible1 =
+ std::make_unique<DestructibleDirectory>(std::move(destructible2));
+ }
+
+ // 3. dir1 should still exist because it moved into destructible1 from
+ // destructible2.
+ EXPECT_TRUE(destructible1->is_valid());
+ EXPECT_TRUE(filesystem.DirectoryExists(filepath1.c_str()));
+
+ {
+ // 4. Move construct destructible3 from destructible1
+ DestructibleDirectory destructible3(std::move(*destructible1));
+ EXPECT_TRUE(destructible3.is_valid());
+ }
+
+ // 5. dir1 shouldn't exist because it was destroyed when destructible3 was
+ // destroyed.
+ EXPECT_FALSE(filesystem.DirectoryExists(filepath1.c_str()));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
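One subtlety these move tests pin down: because move assignment is implemented as a swap, the destination's old directory is not deleted at the moment of assignment, only when the moved-from object is destroyed. A short sketch under that assumption (paths illustrative):

DestructibleDirectory a(&filesystem, "/tmp/old");
{
  DestructibleDirectory b(&filesystem, "/tmp/new");
  a = std::move(b);  // a now owns /tmp/new; b holds /tmp/old.
}                    // b destroyed here: /tmp/old is deleted.
// /tmp/new survives until a itself is destroyed.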
diff --git a/icing/file/destructible-file.h b/icing/file/destructible-file.h
new file mode 100644
index 0000000..006dcb4
--- /dev/null
+++ b/icing/file/destructible-file.h
@@ -0,0 +1,72 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_DESTRUCTIBLE_FILE_H_
+#define ICING_FILE_DESTRUCTIBLE_FILE_H_
+
+#include <unistd.h>
+
+#include <string>
+
+#include "icing/file/filesystem.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+// A convenient RAII class that opens the specified file for writing and
+// deletes the underlying file upon destruction.
+class DestructibleFile {
+ public:
+ explicit DestructibleFile(const std::string& filepath,
+ const Filesystem* filesystem)
+ : filesystem_(filesystem), filepath_(filepath) {
+ fd_ = filesystem_->OpenForWrite(filepath_.c_str());
+ }
+
+ DestructibleFile(const DestructibleFile&) = delete;
+ DestructibleFile(DestructibleFile&& other) : filesystem_(nullptr), fd_(-1) {
+ *this = std::move(other);
+ }
+
+ DestructibleFile& operator=(const DestructibleFile&) = delete;
+ DestructibleFile& operator=(DestructibleFile&& other) {
+ std::swap(fd_, other.fd_);
+ std::swap(filesystem_, other.filesystem_);
+ std::swap(filepath_, other.filepath_);
+ return *this;
+ }
+
+ ~DestructibleFile() {
+ if (is_valid()) {
+ close(fd_);
+ if (!filesystem_->DeleteFile(filepath_.c_str())) {
+ ICING_VLOG(1) << "Failed to delete file " << filepath_;
+ }
+ }
+ }
+
+ bool is_valid() const { return fd_ >= 0; }
+ int get_fd() const { return fd_; }
+
+ private:
+ const Filesystem* filesystem_;
+ std::string filepath_;
+ int fd_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_DESTRUCTIBLE_FILE_H_
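Usage mirrors DestructibleDirectory, except that the constructor takes the path first and the open descriptor is exposed via get_fd(). A minimal sketch (path and function name are illustrative):

void WriteThrowawayData(const Filesystem* filesystem) {
  DestructibleFile tmp("/data/local/tmp/throwaway", filesystem);
  if (!tmp.is_valid()) {
    return;  // OpenForWrite failed.
  }
  int value = 42;
  filesystem->Write(tmp.get_fd(), &value, sizeof(value));
}  // tmp goes out of scope here: the fd is closed and the file deleted.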
diff --git a/icing/file/destructible-file_test.cc b/icing/file/destructible-file_test.cc
new file mode 100644
index 0000000..61316d1
--- /dev/null
+++ b/icing/file/destructible-file_test.cc
@@ -0,0 +1,117 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/destructible-file.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+TEST(DestructibleFileTest, DeletesFileProperly) {
+ Filesystem filesystem;
+ std::string filepath1 = GetTestTempDir() + "/file1";
+
+ {
+ // 1. Create the file
+ ScopedFd sfd(filesystem.OpenForWrite(filepath1.c_str()));
+ ASSERT_TRUE(sfd.is_valid());
+ int i = 127;
+ ASSERT_TRUE(filesystem.Write(sfd.get(), &i, sizeof(i)));
+ }
+
+ {
+ // 2. Open with a Destructible file.
+ DestructibleFile destructible(filepath1, &filesystem);
+ ASSERT_TRUE(destructible.is_valid());
+ }
+
+ // 3. Ensure that the file doesn't exist.
+ EXPECT_FALSE(filesystem.FileExists(filepath1.c_str()));
+}
+
+TEST(DestructibleFileTest, MoveAssignDeletesFileProperly) {
+ Filesystem filesystem;
+ std::string filepath1 = GetTestTempDir() + "/file1";
+ std::string filepath2 = GetTestTempDir() + "/file2";
+
+ // 1. Create file1
+ DestructibleFile destructible1(filepath1, &filesystem);
+ ASSERT_TRUE(destructible1.is_valid());
+ int i = 127;
+ ASSERT_TRUE(filesystem.Write(destructible1.get_fd(), &i, sizeof(i)));
+
+ {
+ // 2. Create file2
+ DestructibleFile destructible2(filepath2, &filesystem);
+ ASSERT_TRUE(destructible2.is_valid());
+ i = 458;
+ ASSERT_TRUE(filesystem.Write(destructible2.get_fd(), &i, sizeof(i)));
+
+ // Move assign destructible2 into destructible1
+ destructible1 = std::move(destructible2);
+ }
+
+ // 3. file1 shouldn't exist because it was destroyed when destructible1 was
+ // move assigned to.
+ EXPECT_FALSE(filesystem.FileExists(filepath1.c_str()));
+
+ // 4. file2 should still exist because it moved into destructible1 from
+ // destructible2.
+ EXPECT_TRUE(filesystem.FileExists(filepath2.c_str()));
+}
+
+TEST(DestructibleFileTest, MoveConstructionDeletesFileProperly) {
+ Filesystem filesystem;
+ std::string filepath1 = GetTestTempDir() + "/file1";
+
+  // 1. Declare destructible1; it'll be assigned soon anyway.
+ std::unique_ptr<DestructibleFile> destructible1;
+ {
+ // 2. Create file1
+ DestructibleFile destructible2(filepath1, &filesystem);
+ ASSERT_TRUE(destructible2.is_valid());
+ int i = 458;
+ ASSERT_TRUE(filesystem.Write(destructible2.get_fd(), &i, sizeof(i)));
+
+ // Move construct destructible1 from destructible2
+ destructible1 =
+ std::make_unique<DestructibleFile>(std::move(destructible2));
+ }
+
+ // 3. file1 should still exist because it moved into destructible1 from
+ // destructible2.
+ ASSERT_TRUE(destructible1->is_valid());
+ EXPECT_TRUE(filesystem.FileExists(filepath1.c_str()));
+
+ {
+ // 4. Move construct destructible3 from destructible1
+ DestructibleFile destructible3(std::move(*destructible1));
+ ASSERT_TRUE(destructible3.is_valid());
+ }
+
+ // 5. file1 shouldn't exist because it was destroyed when destructible3 was
+ // destroyed.
+ EXPECT_FALSE(filesystem.FileExists(filepath1.c_str()));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/file-backed-bitmap.cc b/icing/file/file-backed-bitmap.cc
index f1e568c..bdcfc79 100644
--- a/icing/file/file-backed-bitmap.cc
+++ b/icing/file/file-backed-bitmap.cc
@@ -47,10 +47,14 @@ FileBackedBitmap::Create(const Filesystem* filesystem,
"mmap strategy.");
}
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile mmapper,
+ MemoryMappedFile::Create(*filesystem, file_path, mmap_strategy));
+
auto bitmap = std::unique_ptr<FileBackedBitmap>(
- new FileBackedBitmap(filesystem, file_path, mmap_strategy));
+ new FileBackedBitmap(filesystem, file_path, std::move(mmapper)));
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = bitmap->Initialize();
if (!status.ok()) {
@@ -62,10 +66,10 @@ FileBackedBitmap::Create(const Filesystem* filesystem,
FileBackedBitmap::FileBackedBitmap(const Filesystem* filesystem,
std::string_view file_path,
- MemoryMappedFile::Strategy mmap_strategy)
+ MemoryMappedFile&& mmapper)
: filesystem_(filesystem),
file_path_(file_path),
- mmapper_(new MemoryMappedFile(*filesystem, file_path, mmap_strategy)) {}
+ mmapper_(std::make_unique<MemoryMappedFile>(std::move(mmapper))) {}
FileBackedBitmap::~FileBackedBitmap() {
// Only update if we have auto_sync setup, otherwise the checksum will be
@@ -122,7 +126,7 @@ libtextclassifier3::Status FileBackedBitmap::FileBackedBitmap::Initialize() {
<< " of size: " << file_size;
}
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = mmapper_->Remap(0, file_size);
if (!status.ok()) {
@@ -198,7 +202,7 @@ int FileBackedBitmap::NumBits() const {
libtextclassifier3::Status FileBackedBitmap::Set(int bit_index,
bool bit_value) {
if (bit_index >= NumBits()) {
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = GrowTo(bit_index);
if (!status.ok()) {
@@ -261,7 +265,7 @@ libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) {
file_path_.c_str(), new_file_size));
}
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
if (!status.ok()) {
@@ -269,8 +273,8 @@ libtextclassifier3::Status FileBackedBitmap::GrowTo(int new_num_bits) {
return status;
}
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Grew file %s to new size %zd", file_path_.c_str(), new_file_size);
+ ICING_VLOG(1) << "Grew file " << file_path_ << " to new size "
+ << new_file_size;
mutable_header()->state = Header::ChecksumState::kStale;
return libtextclassifier3::Status::OK;
}
@@ -281,7 +285,7 @@ libtextclassifier3::Status FileBackedBitmap::TruncateTo(int new_num_bits) {
}
const size_t new_file_size = FileSizeForBits(new_num_bits);
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = mmapper_->Remap(0, new_file_size);
if (!status.ok()) {
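The bitmap change above follows the broader migration in this snapshot: MemoryMappedFile is no longer constructed directly but obtained from a fallible Create() factory, so mmap setup errors surface as a Status rather than a half-initialized object. A sketch of the calling pattern (function name and sizes are illustrative):

libtextclassifier3::Status ReadMappedRegion(const Filesystem& filesystem,
                                            const std::string& file_path) {
  ICING_ASSIGN_OR_RETURN(
      MemoryMappedFile mmapped_file,
      MemoryMappedFile::Create(filesystem, file_path,
                               MemoryMappedFile::Strategy::READ_ONLY));
  ICING_RETURN_IF_ERROR(mmapped_file.Remap(0, 1024));  // offset, size
  // ... read through mmapped_file.region() ...
  return libtextclassifier3::Status::OK;
}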
diff --git a/icing/file/file-backed-bitmap.h b/icing/file/file-backed-bitmap.h
index e3d98ad..beba14e 100644
--- a/icing/file/file-backed-bitmap.h
+++ b/icing/file/file-backed-bitmap.h
@@ -175,8 +175,9 @@ class FileBackedBitmap {
Header* mutable_header();
// Use FileBackedBitmap::Create() to instantiate.
- FileBackedBitmap(const Filesystem* filesystem, std::string_view file_path,
- MemoryMappedFile::Strategy mmap_strategy);
+ explicit FileBackedBitmap(const Filesystem* filesystem,
+ std::string_view file_path,
+ MemoryMappedFile&& mmapper);
// Verify the contents of the bitmap and get ready for read/write operations.
//
diff --git a/icing/file/file-backed-proto-log.h b/icing/file/file-backed-proto-log.h
index 62943b8..095f832 100644
--- a/icing/file/file-backed-proto-log.h
+++ b/icing/file/file-backed-proto-log.h
@@ -14,16 +14,14 @@
// File-backed log of protos with append-only writes and position based reads.
//
-// There should only be one instance of a FileBackedProtoLog of the same file at
-// a time; using multiple instances at the same time may lead to undefined
-// behavior.
+// The implementation in this file is deprecated and replaced by
+// portable-file-backed-proto-log.h.
//
-// The entire checksum is computed on initialization to verify the contents are
-// valid. On failure, the log will be truncated to the last verified state when
-// PersistToDisk() was called. If the log cannot successfully restore the last
-// state due to disk corruption or some other inconsistency, then the entire log
-// will be lost.
+// This deprecated implementation has been made read-only for the purposes of
+// migration; writing and erasing this format of log is no longer supported and
+// the methods to accomplish this have been removed.
//
+// The details of the format are as follows:
// Each proto written to the file will have a metadata written just before it.
// The metadata consists of
// {
@@ -31,49 +29,30 @@
// 3 bytes of the proto size
// n bytes of the proto itself
// }
-//
-// Example usage:
-// ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
-// FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path_,
-// options));
-// auto proto_log = create_result.proto_log;
-//
-// Document document;
-// document.set_namespace("com.google.android.example");
-// document.set_uri("www.google.com");
-//
-// int64_t document_offset = proto_log->WriteProto(document));
-// Document same_document = proto_log->ReadProto(document_offset));
-// proto_log->PersistToDisk();
-//
// TODO(b/136514769): Add versioning to the header and a UpgradeToVersion
// migration method.
-
#ifndef ICING_FILE_FILE_BACKED_PROTO_LOG_H_
#define ICING_FILE_FILE_BACKED_PROTO_LOG_H_
-#include <cstddef>
#include <cstdint>
-#include <cstring>
#include <memory>
#include <string>
#include <string_view>
-#include <utility>
-#include <vector>
-#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include <google/protobuf/io/gzip_stream.h>
-#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/legacy/core/icing-string-util.h"
+#include "icing/portable/gzip_stream.h"
+#include "icing/portable/platform.h"
#include "icing/portable/zlib.h"
#include "icing/util/crc32.h"
+#include "icing/util/data-loss.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
+#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
namespace icing {
namespace lib {
@@ -110,10 +89,6 @@ class FileBackedProtoLog {
// Header stored at the beginning of the file before the rest of the log
// contents. Stores metadata on the log.
- //
- // TODO(b/139375388): Migrate the Header struct to a proto. This makes
- // migrations easier since we don't need to worry about different size padding
- // (which would affect the checksum) and different endians.
struct Header {
static constexpr int32_t kMagic = 0xf4c6f67a;
@@ -151,11 +126,15 @@ class FileBackedProtoLog {
// A successfully initialized log.
std::unique_ptr<FileBackedProtoLog<ProtoT>> proto_log;
- // Whether there was some data loss while initializing from a previous
- // state. This can happen if the file is corrupted or some previously added
- // data was unpersisted. This may be used to signal that any derived data
- // off of the proto log may need to be regenerated.
- bool data_loss;
+ // The data status after initializing from a previous state. Data loss can
+ // happen if the file is corrupted or some previously added data was
+ // unpersisted. This may be used to signal that any derived data off of the
+ // proto log may need to be regenerated.
+ DataLoss data_loss;
+
+ bool has_data_loss() {
+ return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
+ }
};
// Factory method to create, initialize, and return a FileBackedProtoLog. Will
@@ -166,9 +145,10 @@ class FileBackedProtoLog {
// log saves these checkpointed "good" states when PersistToDisk() is called
// or the log is safely destructed. If the log rewinds successfully to the
// last-good state, then the returned CreateResult.data_loss indicates
- // there was some data loss so that any derived data may know that it
- // needs to be updated. If the log re-initializes successfully without any
- // data loss, the boolean will be false.
+  // whether there was data loss and, if so, what kind it was (partial or
+  // complete), so that any derived data may know that it needs to be updated.
+  // If the log re-initializes successfully without any data loss,
+  // CreateResult.data_loss will be NONE.
//
// Params:
// filesystem: Handles system level calls
@@ -188,45 +168,15 @@ class FileBackedProtoLog {
FileBackedProtoLog(const FileBackedProtoLog&) = delete;
FileBackedProtoLog& operator=(const FileBackedProtoLog&) = delete;
- // This will update the checksum of the log as well.
- ~FileBackedProtoLog();
-
- // Writes the serialized proto to the underlying file. Writes are applied
- // directly to the underlying file. Users do not need to sync the file after
- // writing.
- //
- // Returns:
- // Offset of the newly appended proto in file on success
- // INVALID_ARGUMENT if proto is too large, as decided by
- // Options.max_proto_size
- // INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
-
// Reads out a proto located at file_offset from the file.
//
// Returns:
// A proto on success
+ // NOT_FOUND if the proto at the given offset has been erased
// OUT_OF_RANGE_ERROR if file_offset exceeds file size
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
- // Calculates and returns the disk usage in bytes. Rounds up to the nearest
- // block size.
- //
- // Returns:
- // Disk usage on success
- // INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
-
- // Returns the file size of all the elements held in the log. File size is in
- // bytes. This excludes the size of any internal metadata of the log, e.g. the
- // log's header.
- //
- // Returns:
- // File size on success
- // INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
-
// An iterator helping to find offsets of all the protos in file.
// Example usage:
//
@@ -236,10 +186,11 @@ class FileBackedProtoLog {
// }
class Iterator {
public:
- Iterator(const Filesystem& filesystem, const std::string& file_path,
- int64_t initial_offset);
+ explicit Iterator(const Filesystem& filesystem,
+ const std::string& file_path, int64_t initial_offset,
+ MemoryMappedFile&& mmapped_file);
- // Advances to the position of next proto.
+  // Advances to the position of the next proto, whether or not it has been
+  // erased.
//
// Returns:
// OK on success
@@ -263,73 +214,7 @@ class FileBackedProtoLog {
// Returns an iterator of current proto log. The caller needs to keep the
// proto log unchanged while using the iterator, otherwise unexpected
// behaviors could happen.
- Iterator GetIterator();
-
- // Persists all changes since initialization or the last call to
- // PersistToDisk(). Any changes that aren't persisted may be lost if the
- // system fails to close safely.
- //
- // Example use case:
- //
- // Document document;
- // document.set_namespace("com.google.android.example");
- // document.set_uri("www.google.com");
- //
- // {
- // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
- // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
- // options));
- // auto proto_log = std::move(create_result.proto_log);
- //
- // int64_t document_offset = proto_log->WriteProto(document));
- //
- // // We lose the document here since it wasn't persisted.
- // // *SYSTEM CRASH*
- // }
- //
- // {
- // // Can still successfully create after a crash since the log can
- // // rewind/truncate to recover into a previously good state
- // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
- // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
- // options));
- // auto proto_log = std::move(create_result.proto_log);
- //
- // // Lost the proto since we didn't PersistToDisk before the crash
- // proto_log->ReadProto(document_offset)); // INVALID_ARGUMENT error
- //
- // int64_t document_offset = proto_log->WriteProto(document));
- //
- // // Persisted this time, so we should be ok.
- // ICING_ASSERT_OK(proto_log->PersistToDisk());
- // }
- //
- // {
- // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
- // FileBackedProtoLog<DocumentProto>::Create(filesystem, file_path,
- // options));
- // auto proto_log = std::move(create_result.proto_log);
- //
- // // SUCCESS
- // Document same_document = proto_log->ReadProto(document_offset));
- // }
- //
- // NOTE: Since all protos are already written to the file directly, this
- // just updates the checksum and rewind position. Without these updates,
- // future initializations will truncate the file and discard unpersisted
- // changes.
- //
- // Returns:
- // OK on success
- // INTERNAL_ERROR on IO error
- libtextclassifier3::Status PersistToDisk();
-
- // Calculates the checksum of the log contents. Excludes the header content.
- //
- // Returns:
- // Crc of the log content
- // INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+ libtextclassifier3::StatusOr<Iterator> GetIterator();
private:
// Object can only be instantiated via the ::Create factory.
@@ -339,7 +224,7 @@ class FileBackedProtoLog {
// Initializes a new proto log.
//
// Returns:
- // std::unique_ptr<FileBackedProtoLog> that can be used immediately
+  //   CreateResult on success
// INTERNAL_ERROR on IO error
static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
const Filesystem* filesystem, const std::string& file_path,
@@ -350,7 +235,7 @@ class FileBackedProtoLog {
// content will be lost.
//
// Returns:
- // std::unique_ptr<FileBackedProtoLog> that can be used immediately
+  //   CreateResult on success
// INTERNAL_ERROR on IO error or internal inconsistencies in the file
// INVALID_ARGUMENT_ERROR if options aren't consistent with previous
// instances
@@ -369,6 +254,28 @@ class FileBackedProtoLog {
const Filesystem* filesystem, const std::string& file_path,
Crc32 initial_crc, int64_t start, int64_t end);
+ static bool IsEmptyBuffer(const char* buffer, int size) {
+ return std::all_of(buffer, buffer + size,
+ [](const char byte) { return byte == 0; });
+ }
+
+ // Helper function to get stored proto size from the metadata.
+ // Metadata format: 8 bits magic + 24 bits size
+ static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
+
+ // Helper function to get stored proto magic from the metadata.
+ // Metadata format: 8 bits magic + 24 bits size
+ static uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
+
+ // Reads out the metadata of a proto located at file_offset from the file.
+ //
+ // Returns:
+ // Proto's metadata on success
+ // OUT_OF_RANGE_ERROR if file_offset exceeds file_size
+ // INTERNAL_ERROR if the metadata is invalid or any IO errors happen
+ static libtextclassifier3::StatusOr<int> ReadProtoMetadata(
+ MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
+
// Magic number added in front of every proto. Used when reading out protos
// as a first check for corruption in each entry in the file. Even if there is
// a corruption, the best we can do is roll back to our last recovery point
@@ -386,31 +293,17 @@ class FileBackedProtoLog {
static_assert(kMaxProtoSize <= 0x00FFFFFF,
"kMaxProtoSize doesn't fit in 3 bytes");
- // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
- static constexpr int kDeflateCompressionLevel = 3;
-
// Chunks of the file to mmap at a time, so we don't mmap the entire file.
- static constexpr int kMmapChunkSize = 4 * 1024;
+  // Only used on 32-bit devices.
+ static constexpr int kMmapChunkSize = 4 * 1024 * 1024; // 4MiB
ScopedFd fd_;
const Filesystem* const filesystem_;
const std::string file_path_;
-
- // Reads out the metadata of a proto located at file_offset from the file.
- //
- // Returns:
- // Proto's metadata on success
- // OUT_OF_RANGE_ERROR if file_offset exceeds file_size
- // INTERNAL_ERROR if the metadata is invalid or any IO errors happen
- static libtextclassifier3::StatusOr<int> ReadProtoMetadata(
- MemoryMappedFile* mmapped_file, int64_t file_offset, int64_t file_size);
std::unique_ptr<Header> header_;
};
template <typename ProtoT>
-constexpr uint8_t FileBackedProtoLog<ProtoT>::kProtoMagic;
-
-template <typename ProtoT>
FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem,
const std::string& file_path,
std::unique_ptr<Header> header)
@@ -421,15 +314,6 @@ FileBackedProtoLog<ProtoT>::FileBackedProtoLog(const Filesystem* filesystem,
}
template <typename ProtoT>
-FileBackedProtoLog<ProtoT>::~FileBackedProtoLog() {
- if (!PersistToDisk().ok()) {
- ICING_LOG(WARNING)
- << "Error persisting to disk during destruction of FileBackedProtoLog: "
- << file_path_;
- }
-}
-
-template <typename ProtoT>
libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::CreateResult>
FileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
const std::string& file_path,
@@ -485,7 +369,7 @@ FileBackedProtoLog<ProtoT>::InitializeNewFile(const Filesystem* filesystem,
std::unique_ptr<FileBackedProtoLog<ProtoT>>(
new FileBackedProtoLog<ProtoT>(filesystem, file_path,
std::move(header))),
- /*data_loss=*/false};
+ /*data_loss=*/DataLoss::NONE};
return create_result;
}
@@ -535,15 +419,15 @@ FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem,
}
header->max_proto_size = options.max_proto_size;
- bool data_loss = false;
+ DataLoss data_loss = DataLoss::NONE;
ICING_ASSIGN_OR_RETURN(Crc32 calculated_log_checksum,
ComputeChecksum(filesystem, file_path, Crc32(),
sizeof(Header), file_size));
+
// Double check that the log checksum is the same as the one that was
// persisted last time. If not, we start recovery logic.
if (header->log_checksum != calculated_log_checksum.Get()) {
- // Need to rewind the proto log since the checksums don't match
- data_loss = true;
+ // Need to rewind the proto log since the checksums don't match.
// Worst case, we have to rewind the entire log back to just the header
int64_t last_known_good = sizeof(Header);
@@ -559,10 +443,12 @@ FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem,
// Check if it matches our last rewind state. If so, this becomes our last
// good state and we can safely truncate and recover from here.
last_known_good = header->rewind_offset;
+ data_loss = DataLoss::PARTIAL;
} else {
// Otherwise, we're going to truncate the entire log and this resets the
// checksum to an empty log state.
header->log_checksum = 0;
+ data_loss = DataLoss::COMPLETE;
}
if (!filesystem->Truncate(file_path.c_str(), last_known_good)) {
@@ -570,8 +456,8 @@ FileBackedProtoLog<ProtoT>::InitializeExistingFile(const Filesystem* filesystem,
absl_ports::StrCat("Error truncating file: ", file_path));
}
- ICING_LOG(INFO) << "Truncated '" << file_path << "' to size "
- << last_known_good;
+ ICING_LOG(WARNING) << "Truncated '" << file_path << "' to size "
+ << last_known_good;
}
CreateResult create_result = {
@@ -587,8 +473,10 @@ template <typename ProtoT>
libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum(
const Filesystem* filesystem, const std::string& file_path,
Crc32 initial_crc, int64_t start, int64_t end) {
- auto mmapped_file = MemoryMappedFile(*filesystem, file_path,
- MemoryMappedFile::Strategy::READ_ONLY);
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(*filesystem, file_path,
+ MemoryMappedFile::Strategy::READ_ONLY));
Crc32 new_crc(initial_crc.Get());
if (start < 0) {
@@ -598,6 +486,14 @@ libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum(
file_path.c_str(), static_cast<long long>(start)));
}
+ if (end < start) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Ending checksum offset of file '%s' must be greater than start "
+ "'%lld', was '%lld'",
+ file_path.c_str(), static_cast<long long>(start),
+ static_cast<long long>(end)));
+ }
+
int64_t file_size = filesystem->GetFileSize(file_path.c_str());
if (end > file_size) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
@@ -607,103 +503,57 @@ libtextclassifier3::StatusOr<Crc32> FileBackedProtoLog<ProtoT>::ComputeChecksum(
static_cast<long long>(end)));
}
- for (int i = start; i < end; i += kMmapChunkSize) {
- // Don't read past the file size.
- int next_chunk_size = kMmapChunkSize;
- if ((i + kMmapChunkSize) >= end) {
- next_chunk_size = end - i;
+ Architecture architecture = GetArchitecture();
+ switch (architecture) {
+ case Architecture::BIT_64: {
+ // Don't mmap in chunks here since mmapping can be harmful on 64-bit
+ // devices where mmap/munmap calls need the mmap write semaphore, which
+ // blocks mmap/munmap/mprotect and all page faults from executing while
+      // they run. On 64-bit devices, this doesn't actually load into memory;
+      // it just makes the file faultable, so mapping the whole file should be
+      // ok.
+ // b/185822878.
+ ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
+ auto mmap_str = std::string_view(mmapped_file.region(), end - start);
+ new_crc.Append(mmap_str);
+ break;
}
-
- ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
-
- auto mmap_str = std::string_view(mmapped_file.region(), next_chunk_size);
- new_crc.Append(mmap_str);
- }
-
- return new_crc;
-}
-
-template <typename ProtoT>
-libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::WriteProto(
- const ProtoT& proto) {
- int64_t proto_size = proto.ByteSizeLong();
- int32_t metadata;
- int metadata_size = sizeof(metadata);
- int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
-
- if (proto_size > header_->max_proto_size) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "proto_size, %lld, was too large to write. Max is %d",
- static_cast<long long>(proto_size), header_->max_proto_size));
- }
-
- // At this point, we've guaranteed that proto_size is under kMaxProtoSize (see
- // ::Create), so we can safely store it in an int.
- int final_size = 0;
-
- std::string proto_str;
- google::protobuf::io::StringOutputStream proto_stream(&proto_str);
-
- if (header_->compress) {
- google::protobuf::io::GzipOutputStream::Options options;
- options.format = google::protobuf::io::GzipOutputStream::ZLIB;
- options.compression_level = kDeflateCompressionLevel;
-
- google::protobuf::io::GzipOutputStream compressing_stream(&proto_stream,
- options);
-
- bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
- compressing_stream.Close();
-
- if (!success) {
- return absl_ports::InternalError("Error compressing proto.");
+ case Architecture::BIT_32:
+ [[fallthrough]];
+ case Architecture::UNKNOWN: {
+      // 32-bit devices only have 4GiB of address space. Mmap in chunks so we
+      // don't use up too much of it at once. If the architecture is unknown,
+      // also chunk it, since we're not sure what the device can handle.
+ for (int i = start; i < end; i += kMmapChunkSize) {
+ // Don't read past the file size.
+ int next_chunk_size = kMmapChunkSize;
+ if ((i + kMmapChunkSize) >= end) {
+ next_chunk_size = end - i;
+ }
+
+ ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
+
+ auto mmap_str =
+ std::string_view(mmapped_file.region(), next_chunk_size);
+ new_crc.Append(mmap_str);
+ }
+ break;
}
-
- final_size = proto_str.size();
-
- // In case the compressed proto is larger than the original proto, we also
- // can't write it.
- if (final_size > header_->max_proto_size) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Compressed proto size, %d, was greater than "
- "max_proto_size, %d",
- final_size, header_->max_proto_size));
- }
- } else {
- // Serialize the proto directly into the write buffer at an offset of the
- // metadata.
- proto.SerializeToZeroCopyStream(&proto_stream);
- final_size = proto_str.size();
- }
-
- // 1st byte for magic, next 3 bytes for proto size.
- metadata = (kProtoMagic << 24) | final_size;
-
- // Actually write metadata, has to be done after we know the possibly
- // compressed proto size
- if (!filesystem_->Write(fd_.get(), &metadata, metadata_size)) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Failed to write proto metadata to: ", file_path_));
- }
-
- // Write the serialized proto
- if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Failed to write proto to: ", file_path_));
}
- return current_position;
+ return new_crc;
}
template <typename ProtoT>
libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
int64_t file_offset) const {
int64_t file_size = filesystem_->GetFileSize(fd_.get());
- MemoryMappedFile mmapped_file(*filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_ONLY);
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(*filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_ONLY));
if (file_offset >= file_size) {
- // file_size points to the next byte to write at, so subtract one to get the
- // inclusive, actual size of file.
+ // file_size points to the next byte to write at, so subtract one to get
+ // the inclusive, actual size of file.
return absl_ports::OutOfRangeError(
IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
"out of range of the file size, %lld",
@@ -716,17 +566,22 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
int metadata, ReadProtoMetadata(&mmapped_file, file_offset, file_size));
// Copy out however many bytes it says the proto is
- int stored_size = metadata & 0x00FFFFFF;
+ int stored_size = GetProtoSize(metadata);
ICING_RETURN_IF_ERROR(
mmapped_file.Remap(file_offset + sizeof(metadata), stored_size));
- google::protobuf::io::ArrayInputStream proto_stream(
- mmapped_file.mutable_region(), stored_size);
+
+ if (IsEmptyBuffer(mmapped_file.region(), mmapped_file.region_size())) {
+ return absl_ports::NotFoundError("The proto data has been erased.");
+ }
+
+ google::protobuf::io::ArrayInputStream proto_stream(mmapped_file.mutable_region(),
+ stored_size);
// Deserialize proto
ProtoT proto;
if (header_->compress) {
- google::protobuf::io::GzipInputStream decompress_stream(&proto_stream);
+ protobuf_ports::GzipInputStream decompress_stream(&proto_stream);
proto.ParseFromZeroCopyStream(&decompress_stream);
} else {
proto.ParseFromZeroCopyStream(&proto_stream);
@@ -736,32 +591,11 @@ libtextclassifier3::StatusOr<ProtoT> FileBackedProtoLog<ProtoT>::ReadProto(
}
template <typename ProtoT>
-libtextclassifier3::StatusOr<int64_t> FileBackedProtoLog<ProtoT>::GetDiskUsage()
- const {
- int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
- if (size == Filesystem::kBadFileSize) {
- return absl_ports::InternalError("Failed to get disk usage of proto log");
- }
- return size;
-}
-
-template <typename ProtoT>
-libtextclassifier3::StatusOr<int64_t>
-FileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
- int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
- if (total_file_size == Filesystem::kBadFileSize) {
- return absl_ports::InternalError(
- "Failed to get file size of elments in the proto log");
- }
- return total_file_size - sizeof(Header);
-}
-
-template <typename ProtoT>
FileBackedProtoLog<ProtoT>::Iterator::Iterator(const Filesystem& filesystem,
const std::string& file_path,
- int64_t initial_offset)
- : mmapped_file_(filesystem, file_path,
- MemoryMappedFile::Strategy::READ_ONLY),
+ int64_t initial_offset,
+ MemoryMappedFile&& mmapped_file)
+ : mmapped_file_(std::move(mmapped_file)),
initial_offset_(initial_offset),
current_offset_(kInvalidOffset),
file_size_(filesystem.GetFileSize(file_path.c_str())) {
@@ -781,8 +615,7 @@ libtextclassifier3::Status FileBackedProtoLog<ProtoT>::Iterator::Advance() {
ICING_ASSIGN_OR_RETURN(
int metadata,
ReadProtoMetadata(&mmapped_file_, current_offset_, file_size_));
- int proto_size = metadata & 0x00FFFFFF;
- current_offset_ += sizeof(metadata) + proto_size;
+ current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
}
if (current_offset_ < file_size_) {
@@ -801,9 +634,14 @@ int64_t FileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
}
template <typename ProtoT>
-typename FileBackedProtoLog<ProtoT>::Iterator
+libtextclassifier3::StatusOr<typename FileBackedProtoLog<ProtoT>::Iterator>
FileBackedProtoLog<ProtoT>::GetIterator() {
- return Iterator(*filesystem_, file_path_, /*initial_offset=*/sizeof(Header));
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(*filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_ONLY));
+ return Iterator(*filesystem_, file_path_,
+ /*initial_offset=*/sizeof(Header), std::move(mmapped_file));
}
template <typename ProtoT>
@@ -829,7 +667,7 @@ libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
ICING_RETURN_IF_ERROR(mmapped_file->Remap(file_offset, metadata_size));
memcpy(&metadata, mmapped_file->region(), metadata_size);
// Checks magic number
- uint8_t stored_k_proto_magic = metadata >> 24;
+ uint8_t stored_k_proto_magic = GetProtoMagic(metadata);
if (stored_k_proto_magic != kProtoMagic) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
@@ -838,50 +676,6 @@ libtextclassifier3::StatusOr<int> FileBackedProtoLog<ProtoT>::ReadProtoMetadata(
return metadata;
}
-template <typename ProtoT>
-libtextclassifier3::Status FileBackedProtoLog<ProtoT>::PersistToDisk() {
- int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
- if (file_size == header_->rewind_offset) {
- // No changes made, don't need to update the checksum.
- return libtextclassifier3::Status::OK;
- }
-
- int64_t new_content_size = file_size - header_->rewind_offset;
- Crc32 crc;
- if (new_content_size < 0) {
- // File shrunk, recalculate the entire checksum.
- ICING_ASSIGN_OR_RETURN(
- crc, ComputeChecksum(filesystem_, file_path_, Crc32(), sizeof(Header),
- file_size));
- } else {
- // Append new changes to the existing checksum.
- ICING_ASSIGN_OR_RETURN(
- crc,
- ComputeChecksum(filesystem_, file_path_, Crc32(header_->log_checksum),
- header_->rewind_offset, file_size));
- }
-
- header_->log_checksum = crc.Get();
- header_->rewind_offset = file_size;
- header_->header_checksum = header_->CalculateHeaderChecksum();
-
- if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
- sizeof(Header))) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Failed to update header to: ", file_path_));
- }
-
- return libtextclassifier3::Status::OK;
-}
-
-template <typename ProtoT>
-libtextclassifier3::StatusOr<Crc32>
-FileBackedProtoLog<ProtoT>::ComputeChecksum() {
- return FileBackedProtoLog<ProtoT>::ComputeChecksum(
- filesystem_, file_path_, Crc32(), /*start=*/sizeof(Header),
- /*end=*/filesystem_->GetFileSize(file_path_.c_str()));
-}
-
} // namespace lib
} // namespace icing
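Putting the surviving read-only surface together: Create() now reports a DataLoss value checkable via has_data_loss(), GetIterator() is fallible, and ReadProto() returns NOT_FOUND for erased (zeroed-out) entries. A sketch of draining a legacy log during migration, assuming the IsNotFound predicate from absl_ports/canonical_errors.h (function name and error handling are illustrative):

libtextclassifier3::Status DrainLegacyLog(
    FileBackedProtoLog<DocumentProto>* proto_log) {
  ICING_ASSIGN_OR_RETURN(auto iterator, proto_log->GetIterator());
  while (iterator.Advance().ok()) {
    auto proto_or = proto_log->ReadProto(iterator.GetOffset());
    if (absl_ports::IsNotFound(proto_or.status())) {
      continue;  // This entry was erased; skip it.
    }
    ICING_RETURN_IF_ERROR(proto_or.status());
    // ... append proto_or.ValueOrDie() to the replacement portable log ...
  }
  return libtextclassifier3::Status::OK;
}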
diff --git a/icing/file/file-backed-proto-log_benchmark.cc b/icing/file/file-backed-proto-log_benchmark.cc
deleted file mode 100644
index 26e0fb0..0000000
--- a/icing/file/file-backed-proto-log_benchmark.cc
+++ /dev/null
@@ -1,169 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cstdint>
-#include <random>
-
-#include "testing/base/public/benchmark.h"
-#include "gmock/gmock.h"
-#include "icing/document-builder.h"
-#include "icing/file/file-backed-proto-log.h"
-#include "icing/file/filesystem.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/proto/document.pb.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/random-string.h"
-#include "icing/testing/tmp-directory.h"
-
-// go/microbenchmarks
-//
-// To build and run on a local machine:
-// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
-// icing/file:file-backed-proto-log_benchmark
-//
-// $ blaze-bin/icing/file/file-backed-proto-log_benchmark
-// --benchmarks=all
-//
-//
-// To build and run on an Android device (must be connected and rooted):
-// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
-// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
-// icing/file:file-backed-proto-log_benchmark
-//
-// $ adb root
-//
-// $ adb push
-// blaze-bin/icing/file/file-backed-proto-log_benchmark
-// /data/local/tmp/
-//
-// $ adb shell /data/local/tmp/file-backed-proto-log-benchmark
-// --benchmarks=all
-
-namespace icing {
-namespace lib {
-
-namespace {
-
-static void BM_Write(benchmark::State& state) {
- const Filesystem filesystem;
- int string_length = state.range(0);
- const std::string file_path = IcingStringUtil::StringPrintf(
- "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
- int max_proto_size = (1 << 24) - 1; // 16 MiB
- bool compress = true;
-
- // Make sure it doesn't already exist.
- filesystem.DeleteFile(file_path.c_str());
-
- auto proto_log =
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem, file_path,
- FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
- .ValueOrDie()
- .proto_log;
-
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
- std::default_random_engine random;
- const std::string rand_str =
- RandomString(kAlNumAlphabet, string_length, &random);
-
- auto document_properties = document.add_properties();
- document_properties->set_name("string property");
- document_properties->add_string_values(rand_str);
-
- for (auto _ : state) {
- testing::DoNotOptimize(proto_log->WriteProto(document));
- }
- state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
- string_length);
-
- // Cleanup after ourselves
- filesystem.DeleteFile(file_path.c_str());
-}
-BENCHMARK(BM_Write)
- ->Arg(1)
- ->Arg(32)
- ->Arg(512)
- ->Arg(1024)
- ->Arg(4 * 1024)
- ->Arg(8 * 1024)
- ->Arg(16 * 1024)
- ->Arg(32 * 1024)
- ->Arg(256 * 1024)
- ->Arg(2 * 1024 * 1024)
- ->Arg(8 * 1024 * 1024)
- ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is
- // 16MiB, and we need some extra space for the
- // rest of the document properties
-
-static void BM_Read(benchmark::State& state) {
- const Filesystem filesystem;
- int string_length = state.range(0);
- const std::string file_path = IcingStringUtil::StringPrintf(
- "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
- int max_proto_size = (1 << 24) - 1; // 16 MiB
- bool compress = true;
-
- // Make sure it doesn't already exist.
- filesystem.DeleteFile(file_path.c_str());
-
- auto proto_log =
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem, file_path,
- FileBackedProtoLog<DocumentProto>::Options(compress, max_proto_size))
- .ValueOrDie()
- .proto_log;
-
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
- std::default_random_engine random;
- const std::string rand_str =
- RandomString(kAlNumAlphabet, string_length, &random);
-
- auto document_properties = document.add_properties();
- document_properties->set_name("string property");
- document_properties->add_string_values(rand_str);
-
- ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset,
- proto_log->WriteProto(document));
-
- for (auto _ : state) {
- testing::DoNotOptimize(proto_log->ReadProto(write_offset));
- }
- state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
- string_length);
-
- // Cleanup after ourselves
- filesystem.DeleteFile(file_path.c_str());
-}
-BENCHMARK(BM_Read)
- ->Arg(1)
- ->Arg(32)
- ->Arg(512)
- ->Arg(1024)
- ->Arg(4 * 1024)
- ->Arg(8 * 1024)
- ->Arg(16 * 1024)
- ->Arg(32 * 1024)
- ->Arg(256 * 1024)
- ->Arg(2 * 1024 * 1024)
- ->Arg(8 * 1024 * 1024)
- ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is
- // 16MiB, and we need some extra space for the
- // rest of the document properties
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/file/file-backed-proto-log_test.cc b/icing/file/file-backed-proto-log_test.cc
index 3a9060d..eccb0c7 100644
--- a/icing/file/file-backed-proto-log_test.cc
+++ b/icing/file/file-backed-proto-log_test.cc
@@ -19,10 +19,7 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/file/mock-filesystem.h"
-#include "icing/portable/equals-proto.h"
#include "icing/proto/document.pb.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
@@ -32,14 +29,7 @@ namespace lib {
namespace {
-using ::icing::lib::portable_equals_proto::EqualsProto;
-using ::testing::A;
-using ::testing::Eq;
-using ::testing::Gt;
-using ::testing::Not;
using ::testing::NotNull;
-using ::testing::Pair;
-using ::testing::Return;
class FileBackedProtoLogTest : public ::testing::Test {
protected:
@@ -48,7 +38,10 @@ class FileBackedProtoLogTest : public ::testing::Test {
// https://stackoverflow.com/a/47368753
FileBackedProtoLogTest() {}
- void SetUp() override { file_path_ = GetTestTempDir() + "/proto_log"; }
+ void SetUp() override {
+ file_path_ = GetTestTempDir() + "/proto_log";
+ filesystem_.DeleteFile(file_path_.c_str());
+ }
void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); }
@@ -74,7 +67,7 @@ TEST_F(FileBackedProtoLogTest, Initialize) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
EXPECT_THAT(create_result.proto_log, NotNull());
- EXPECT_FALSE(create_result.data_loss);
+ EXPECT_FALSE(create_result.has_data_loss());
// Can't recreate the same file with different options.
ASSERT_THAT(FileBackedProtoLog<DocumentProto>::Create(
@@ -84,193 +77,6 @@ TEST_F(FileBackedProtoLogTest, Initialize) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(FileBackedProtoLogTest, WriteProtoTooLarge) {
- int max_proto_size = 1;
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size)));
- auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
-
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
- // Proto is too large for the max_proto_size_in
- ASSERT_THAT(proto_log->WriteProto(document),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-TEST_F(FileBackedProtoLogTest, ReadProtoWrongKProtoMagic) {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
-
- // Write a proto
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(int64_t file_offset,
- proto_log->WriteProto(document));
-
- // The 4 bytes of metadata that just doesn't have the same kProtoMagic
- // specified in file-backed-proto-log.h
- uint32_t wrong_magic = 0x7E000000;
-
- // Sanity check that we opened the file correctly
- int fd = filesystem_.OpenForWrite(file_path_.c_str());
- ASSERT_GT(fd, 0);
-
- // Write the wrong kProtoMagic in, kProtoMagics are stored at the beginning of
- // a proto entry.
- filesystem_.PWrite(fd, file_offset, &wrong_magic, sizeof(wrong_magic));
-
- ASSERT_THAT(proto_log->ReadProto(file_offset),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
-}
-
-TEST_F(FileBackedProtoLogTest, ReadWriteUncompressedProto) {
- int last_offset;
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(
- /*compress_in=*/false, max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
-
- // Write the first proto
- DocumentProto document1 =
- DocumentBuilder().SetKey("namespace1", "uri1").Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(int written_position,
- proto_log->WriteProto(document1));
-
- int document1_offset = written_position;
-
- // Check that what we read is what we wrote
- ASSERT_THAT(proto_log->ReadProto(written_position),
- IsOkAndHolds(EqualsProto(document1)));
-
- // Write a second proto that's close to the max size. Leave some room for
- // the rest of the proto properties.
- std::string long_str(max_proto_size_ - 1024, 'a');
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace2", "uri2")
- .AddStringProperty("long_str", long_str)
- .Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(written_position,
- proto_log->WriteProto(document2));
-
- int document2_offset = written_position;
- last_offset = written_position;
- ASSERT_GT(document2_offset, document1_offset);
-
- // Check the second proto
- ASSERT_THAT(proto_log->ReadProto(written_position),
- IsOkAndHolds(EqualsProto(document2)));
-
- ICING_ASSERT_OK(proto_log->PersistToDisk());
- }
-
- {
- // Make a new proto_log with the same file_path, and make sure we
- // can still write to the same underlying file.
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(
- /*compress_in=*/false, max_proto_size_)));
- auto recreated_proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
-
- // Write a third proto
- DocumentProto document3 =
- DocumentBuilder().SetKey("namespace3", "uri3").Build();
-
- ASSERT_THAT(recreated_proto_log->WriteProto(document3),
- IsOkAndHolds(Gt(last_offset)));
- }
-}
-
-TEST_F(FileBackedProtoLogTest, ReadWriteCompressedProto) {
- int last_offset;
-
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(
- /*compress_in=*/true, max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
-
- // Write the first proto
- DocumentProto document1 =
- DocumentBuilder().SetKey("namespace1", "uri1").Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(int written_position,
- proto_log->WriteProto(document1));
-
- int document1_offset = written_position;
-
- // Check that what we read is what we wrote
- ASSERT_THAT(proto_log->ReadProto(written_position),
- IsOkAndHolds(EqualsProto(document1)));
-
- // Write a second proto that's close to the max size. Leave some room for
- // the rest of the proto properties.
- std::string long_str(max_proto_size_ - 1024, 'a');
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace2", "uri2")
- .AddStringProperty("long_str", long_str)
- .Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(written_position,
- proto_log->WriteProto(document2));
-
- int document2_offset = written_position;
- last_offset = written_position;
- ASSERT_GT(document2_offset, document1_offset);
-
- // Check the second proto
- ASSERT_THAT(proto_log->ReadProto(written_position),
- IsOkAndHolds(EqualsProto(document2)));
-
- ICING_ASSERT_OK(proto_log->PersistToDisk());
- }
-
- {
- // Make a new proto_log with the same file_path, and make sure we
- // can still write to the same underlying file.
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(
- /*compress_in=*/true, max_proto_size_)));
- auto recreated_proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
-
- // Write a third proto
- DocumentProto document3 =
- DocumentBuilder().SetKey("namespace3", "uri3").Build();
-
- ASSERT_THAT(recreated_proto_log->WriteProto(document3),
- IsOkAndHolds(Gt(last_offset)));
- }
-}
-
TEST_F(FileBackedProtoLogTest, CorruptHeader) {
{
ICING_ASSERT_OK_AND_ASSIGN(
@@ -280,7 +86,7 @@ TEST_F(FileBackedProtoLogTest, CorruptHeader) {
FileBackedProtoLog<DocumentProto>::Options(compress_,
max_proto_size_)));
auto recreated_proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
+ EXPECT_FALSE(create_result.has_data_loss());
int corrupt_offset =
offsetof(FileBackedProtoLog<DocumentProto>::Header, rewind_offset);
@@ -300,220 +106,6 @@ TEST_F(FileBackedProtoLogTest, CorruptHeader) {
}
}
-TEST_F(FileBackedProtoLogTest, CorruptContent) {
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
-
- DocumentProto document =
- DocumentBuilder().SetKey("namespace1", "uri1").Build();
-
- // Write and persist an document.
- ICING_ASSERT_OK_AND_ASSIGN(int document_offset,
- proto_log->WriteProto(document));
- ICING_ASSERT_OK(proto_log->PersistToDisk());
-
- // "Corrupt" the content written in the log.
- document.set_uri("invalid");
- std::string serialized_document = document.SerializeAsString();
- filesystem_.PWrite(file_path_.c_str(), document_offset,
- serialized_document.data(), serialized_document.size());
- }
-
- {
- // We can recover, but we have data loss.
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_TRUE(create_result.data_loss);
-
- // Lost everything in the log since the rewind position doesn't help if
- // there's been data corruption within the persisted region
- ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()),
- sizeof(FileBackedProtoLog<DocumentProto>::Header));
- }
-}
-
-TEST_F(FileBackedProtoLogTest, PersistToDisk) {
- DocumentProto document1 =
- DocumentBuilder().SetKey("namespace1", "uri1").Build();
- DocumentProto document2 =
- DocumentBuilder().SetKey("namespace2", "uri2").Build();
- int document1_offset, document2_offset;
- int log_size;
-
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
-
- // Write and persist the first proto
- ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
- proto_log->WriteProto(document1));
- ICING_ASSERT_OK(proto_log->PersistToDisk());
-
- // Write, but don't explicitly persist the second proto
- ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
- proto_log->WriteProto(document2));
-
- // Check that what we read is what we wrote
- ASSERT_THAT(proto_log->ReadProto(document1_offset),
- IsOkAndHolds(EqualsProto(document1)));
- ASSERT_THAT(proto_log->ReadProto(document2_offset),
- IsOkAndHolds(EqualsProto(document2)));
-
- log_size = filesystem_.GetFileSize(file_path_.c_str());
- ASSERT_GT(log_size, 0);
- }
-
- {
- // The header rewind position and checksum aren't updated in this "system
- // crash" scenario.
-
- std::string bad_proto =
- "some incomplete proto that we didn't finish writing before the system "
- "crashed";
- filesystem_.PWrite(file_path_.c_str(), log_size, bad_proto.data(),
- bad_proto.size());
-
- // Double check that we actually wrote something to the underlying file
- ASSERT_GT(filesystem_.GetFileSize(file_path_.c_str()), log_size);
- }
-
- {
- // We can recover, but we have data loss
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- ASSERT_TRUE(create_result.data_loss);
-
- // Check that everything was persisted across instances
- ASSERT_THAT(proto_log->ReadProto(document1_offset),
- IsOkAndHolds(EqualsProto(document1)));
- ASSERT_THAT(proto_log->ReadProto(document2_offset),
- IsOkAndHolds(EqualsProto(document2)));
-
- // We correctly rewound to the last good state.
- ASSERT_EQ(log_size, filesystem_.GetFileSize(file_path_.c_str()));
- }
-}
-
-TEST_F(FileBackedProtoLogTest, Iterator) {
- DocumentProto document1 =
- DocumentBuilder().SetKey("namespace", "uri1").Build();
- DocumentProto document2 =
- DocumentBuilder().SetKey("namespace", "uri2").Build();
-
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
-
- {
- // Empty iterator
- auto iterator = proto_log->GetIterator();
- ASSERT_THAT(iterator.Advance(),
- StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
- }
-
- {
- // Iterates through some documents
- ICING_ASSERT_OK(proto_log->WriteProto(document1));
- ICING_ASSERT_OK(proto_log->WriteProto(document2));
- auto iterator = proto_log->GetIterator();
- // 1st proto
- ICING_ASSERT_OK(iterator.Advance());
- ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
- IsOkAndHolds(EqualsProto(document1)));
- // 2nd proto
- ICING_ASSERT_OK(iterator.Advance());
- ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
- IsOkAndHolds(EqualsProto(document2)));
- // Tries to advance
- ASSERT_THAT(iterator.Advance(),
- StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
- }
-
- {
- // Iterator with bad filesystem
- MockFilesystem mock_filesystem;
- ON_CALL(mock_filesystem, GetFileSize(A<const char *>()))
- .WillByDefault(Return(Filesystem::kBadFileSize));
- FileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
- mock_filesystem, file_path_, /*initial_offset=*/0);
- ASSERT_THAT(bad_iterator.Advance(),
- StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
- }
-}
-
-TEST_F(FileBackedProtoLogTest, ComputeChecksum) {
- DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
- Crc32 checksum;
-
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
-
- ICING_EXPECT_OK(proto_log->WriteProto(document));
-
- ICING_ASSERT_OK_AND_ASSIGN(checksum, proto_log->ComputeChecksum());
-
- // Calling it twice with no changes should get us the same checksum
- EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
- }
-
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- FileBackedProtoLog<DocumentProto>::CreateResult create_result,
- FileBackedProtoLog<DocumentProto>::Create(
- &filesystem_, file_path_,
- FileBackedProtoLog<DocumentProto>::Options(compress_,
- max_proto_size_)));
- auto proto_log = std::move(create_result.proto_log);
- EXPECT_FALSE(create_result.data_loss);
-
- // Checksum should be consistent across instances
- EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
-
- // PersistToDisk shouldn't affect the checksum value
- ICING_EXPECT_OK(proto_log->PersistToDisk());
- EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
-
- // Check that modifying the log leads to a different checksum
- ICING_EXPECT_OK(proto_log->WriteProto(document));
- EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum))));
- }
-}
-
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/file/file-backed-proto.h b/icing/file/file-backed-proto.h
index aede8de..8c5743b 100644
--- a/icing/file/file-backed-proto.h
+++ b/icing/file/file-backed-proto.h
@@ -22,6 +22,7 @@
#ifndef ICING_FILE_FILE_BACKED_PROTO_H_
#define ICING_FILE_FILE_BACKED_PROTO_H_
+#include <algorithm>
#include <cstdint>
#include <memory>
#include <string>
@@ -37,6 +38,7 @@
#include "icing/legacy/core/icing-string-util.h"
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
@@ -63,6 +65,24 @@ class FileBackedProto {
  // file_path : Must be a path within a directory that already exists.
FileBackedProto(const Filesystem& filesystem, std::string_view file_path);
+ // Reset the internal file_path for the file backed proto.
+ // Example use:
+ // auto file_backed_proto1 = *FileBackedProto<Proto>::Create(...);
+  //   auto file_backed_proto2 = *FileBackedProto<Proto>::Create(...);
+ // filesystem.SwapFiles(file1, file2);
+ // file_backed_proto1.SetSwappedFilepath(file2);
+  //   file_backed_proto2.SetSwappedFilepath(file1);
+ void SetSwappedFilepath(std::string_view swapped_to_file_path) {
+ file_path_ = swapped_to_file_path;
+ }
+
+ // Computes the checksum of the proto stored in this file and returns it.
+ // RETURNS:
+ // - the checksum of the proto or 0 if the file is empty/non-existent
+ // - INTERNAL_ERROR if an IO error or a corruption was encountered.
+ libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const
+ ICING_LOCKS_EXCLUDED(mutex_);
+
// Returns a reference to the proto read from the file. It
// internally caches the read proto so that future calls are fast.
//
@@ -83,7 +103,7 @@ class FileBackedProto {
//
// TODO(cassiewang) The implementation today loses old data if Write() fails.
// We should write to a tmp file first and rename the file to fix this.
- // TODO(samzheng) Change to Write(ProtoT&& proto)
+ // TODO(cassiewang) Change to Write(ProtoT&& proto)
libtextclassifier3::Status Write(std::unique_ptr<ProtoT> proto)
ICING_LOCKS_EXCLUDED(mutex_);
@@ -92,6 +112,11 @@ class FileBackedProto {
FileBackedProto& operator=(const FileBackedProto&) = delete;
private:
+ // Internal method to handle reading the proto from disk.
+ // Requires the caller to hold an exclusive lock on mutex_.
+ libtextclassifier3::StatusOr<const ProtoT*> ReadInternal() const
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
// Upper bound of file-size that is supported.
static constexpr int32_t kMaxFileSize = 1 * 1024 * 1024; // 1 MiB.
@@ -99,9 +124,11 @@ class FileBackedProto {
mutable absl_ports::shared_mutex mutex_;
const Filesystem* const filesystem_;
- const std::string file_path_;
+ std::string file_path_;
mutable std::unique_ptr<ProtoT> cached_proto_ ICING_GUARDED_BY(mutex_);
+
+ mutable std::unique_ptr<Header> cached_header_ ICING_GUARDED_BY(mutex_);
};
template <typename ProtoT>
@@ -113,12 +140,35 @@ FileBackedProto<ProtoT>::FileBackedProto(const Filesystem& filesystem,
: filesystem_(&filesystem), file_path_(file_path) {}
template <typename ProtoT>
+libtextclassifier3::StatusOr<Crc32> FileBackedProto<ProtoT>::ComputeChecksum()
+ const {
+ absl_ports::unique_lock l(&mutex_);
+ if (cached_proto_ == nullptr) {
+ auto read_status = ReadInternal();
+ if (!read_status.ok()) {
+ if (absl_ports::IsNotFound(read_status.status())) {
+ // File doesn't exist. So simply return 0.
+ return Crc32();
+ }
+ return read_status.status();
+ }
+ }
+ return Crc32(cached_header_->proto_checksum);
+}
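
// Example use (illustrative sketch, not part of the patch; assumes a
// FileBackedProto<DocumentProto> named file_proto in a function returning a
// Status):
//   ICING_ASSIGN_OR_RETURN(Crc32 checksum, file_proto.ComputeChecksum());
//   ICING_VLOG(1) << "Proto checksum: " << checksum.Get();
// A missing file yields Crc32() (i.e. 0) rather than an error; the header is
// cached on the first read, so repeated calls are cheap.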
+
+template <typename ProtoT>
libtextclassifier3::StatusOr<const ProtoT*> FileBackedProto<ProtoT>::Read()
const {
ICING_VLOG(1) << "Reading proto from file: " << file_path_;
absl_ports::unique_lock l(&mutex_);
+ return ReadInternal();
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<const ProtoT*>
+FileBackedProto<ProtoT>::ReadInternal() const {
// Return cached proto if we've already read from disk.
if (cached_proto_ != nullptr) {
ICING_VLOG(1) << "Reusing cached proto for file: " << file_path_;
@@ -146,8 +196,7 @@ libtextclassifier3::StatusOr<const ProtoT*> FileBackedProto<ProtoT>::Read()
<< " of size: " << file_size;
Header header;
- if (!filesystem_->PRead(fd.get(), &header, sizeof(Header),
- /*offset=*/0)) {
+ if (!filesystem_->PRead(fd.get(), &header, sizeof(Header), /*offset=*/0)) {
return absl_ports::InternalError(
absl_ports::StrCat("Unable to read header of: ", file_path_));
}
@@ -182,6 +231,7 @@ libtextclassifier3::StatusOr<const ProtoT*> FileBackedProto<ProtoT>::Read()
ICING_VLOG(1) << "Successfully read proto from file: " << file_path_;
cached_proto_ = std::move(proto);
+ cached_header_ = std::make_unique<Header>(std::move(header));
return cached_proto_.get();
}
@@ -242,6 +292,7 @@ libtextclassifier3::Status FileBackedProto<ProtoT>::Write(
ICING_VLOG(1) << "Successfully wrote proto to file: " << file_path_;
cached_proto_ = std::move(new_proto);
+ cached_header_ = std::make_unique<Header>(std::move(header));
return libtextclassifier3::Status::OK;
}
diff --git a/icing/file/file-backed-proto_test.cc b/icing/file/file-backed-proto_test.cc
index 7f994fb..009af52 100644
--- a/icing/file/file-backed-proto_test.cc
+++ b/icing/file/file-backed-proto_test.cc
@@ -45,7 +45,7 @@ TEST_F(FileBackedProtoTest, SimpleReadWriteTest) {
DocumentBuilder().SetKey("namespace", "google.com").Build();
FileBackedProto<DocumentProto> file_proto(filesystem_, filename_);
- ICING_ASSERT_OK(file_proto.Write(absl::make_unique<DocumentProto>(document)));
+ ICING_ASSERT_OK(file_proto.Write(std::make_unique<DocumentProto>(document)));
EXPECT_THAT(file_proto.Read(), IsOkAndHolds(Pointee(EqualsProto(document))));
// Multiple reads work.
EXPECT_THAT(file_proto.Read(), IsOkAndHolds(Pointee(EqualsProto(document))));
@@ -61,7 +61,7 @@ TEST_F(FileBackedProtoTest, DataPersistsAcrossMultipleInstancesTest) {
EXPECT_THAT(file_proto.Read(), Not(IsOk())); // Nothing to read.
ICING_ASSERT_OK(
- file_proto.Write(absl::make_unique<DocumentProto>(document)));
+ file_proto.Write(std::make_unique<DocumentProto>(document)));
EXPECT_THAT(file_proto.Read(),
IsOkAndHolds(Pointee(EqualsProto(document))));
}
@@ -84,12 +84,12 @@ TEST_F(FileBackedProtoTest, MultipleUpdatesToProtoTest) {
{
FileBackedProto<DocumentProto> file_proto(filesystem_, filename_);
ICING_ASSERT_OK(
- file_proto.Write(absl::make_unique<DocumentProto>(googleProto)));
+ file_proto.Write(std::make_unique<DocumentProto>(googleProto)));
EXPECT_THAT(file_proto.Read(),
IsOkAndHolds(Pointee(EqualsProto(googleProto))));
ICING_ASSERT_OK(
- file_proto.Write(absl::make_unique<DocumentProto>(youtubeProto)));
+ file_proto.Write(std::make_unique<DocumentProto>(youtubeProto)));
EXPECT_THAT(file_proto.Read(),
IsOkAndHolds(Pointee(EqualsProto(youtubeProto))));
}
@@ -100,12 +100,12 @@ TEST_F(FileBackedProtoTest, MultipleUpdatesToProtoTest) {
IsOkAndHolds(Pointee(EqualsProto(youtubeProto))));
ICING_ASSERT_OK(
- file_proto.Write(absl::make_unique<DocumentProto>(wazeProto)));
+ file_proto.Write(std::make_unique<DocumentProto>(wazeProto)));
EXPECT_THAT(file_proto.Read(),
IsOkAndHolds(Pointee(EqualsProto(wazeProto))));
ICING_ASSERT_OK(
- file_proto.Write(absl::make_unique<DocumentProto>(googleProto)));
+ file_proto.Write(std::make_unique<DocumentProto>(googleProto)));
EXPECT_THAT(file_proto.Read(),
IsOkAndHolds(Pointee(EqualsProto(googleProto))));
}
@@ -117,7 +117,7 @@ TEST_F(FileBackedProtoTest, InvalidFilenameTest) {
FileBackedProto<DocumentProto> file_proto(filesystem_, "");
EXPECT_THAT(file_proto.Read(), Not(IsOk()));
- EXPECT_THAT(file_proto.Write(absl::make_unique<DocumentProto>(document)),
+ EXPECT_THAT(file_proto.Write(std::make_unique<DocumentProto>(document)),
Not(IsOk()));
}
@@ -128,7 +128,7 @@ TEST_F(FileBackedProtoTest, FileCorruptionTest) {
{
FileBackedProto<DocumentProto> file_proto(filesystem_, filename_);
ICING_ASSERT_OK(
- file_proto.Write(absl::make_unique<DocumentProto>(document)));
+ file_proto.Write(std::make_unique<DocumentProto>(document)));
EXPECT_THAT(file_proto.Read(),
IsOkAndHolds(Pointee(EqualsProto(document))));
}
diff --git a/icing/file/file-backed-vector.h b/icing/file/file-backed-vector.h
index 27d03b2..7408e8b 100644
--- a/icing/file/file-backed-vector.h
+++ b/icing/file/file-backed-vector.h
@@ -56,10 +56,15 @@
#ifndef ICING_FILE_FILE_BACKED_VECTOR_H_
#define ICING_FILE_FILE_BACKED_VECTOR_H_
-#include <stdint.h>
#include <sys/mman.h>
+#include <unistd.h>
+#include <algorithm>
+#include <cinttypes>
#include <cstdint>
+#include <cstring>
+#include <functional>
+#include <limits>
#include <memory>
#include <string>
#include <utility>
@@ -72,6 +77,7 @@
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
#include "icing/legacy/core/icing-string-util.h"
+#include "icing/portable/platform.h"
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/math-util.h"
@@ -83,6 +89,9 @@ namespace lib {
template <typename T>
class FileBackedVector {
public:
+ class MutableArrayView;
+ class MutableView;
+
// Header stored at the beginning of the file before the rest of the vector
// elements. Stores metadata on the vector.
struct Header {
@@ -133,15 +142,38 @@ class FileBackedVector {
kHeaderChecksumOffset,
"");
- Crc32 crc;
- std::string_view header_str(
- reinterpret_cast<const char*>(this),
- offsetof(FileBackedVector::Header, header_checksum));
- crc.Append(header_str);
- return crc.Get();
+ return Crc32(std::string_view(
+ reinterpret_cast<const char*>(this),
+ offsetof(FileBackedVector::Header, header_checksum)))
+ .Get();
}
};
+ // Absolute max file size for FileBackedVector.
+ // - We memory map the whole file, so file size ~= memory size.
+  // - On 32-bit platforms, the virtual memory address space is 4GB. To avoid
+  //   exhausting it, we set a smaller file size limit for 32-bit platforms.
+#ifdef ICING_ARCH_BIT_64
+ static constexpr int32_t kMaxFileSize =
+ std::numeric_limits<int32_t>::max(); // 2^31-1 Bytes, ~2.1 GB
+#else
+ static constexpr int32_t kMaxFileSize =
+ (1 << 28) + Header::kHeaderSize; // 2^28 + 12 Bytes, ~256 MiB
+#endif
+
+  // Size of element type T. The value is the same as sizeof(T), but we should
+  // avoid using sizeof(T) directly in our codebase to prevent unexpected
+  // unsigned integer casting.
+ static constexpr int32_t kElementTypeSize = static_cast<int32_t>(sizeof(T));
+ static_assert(sizeof(T) <= (1 << 10));
+
+  // Absolute max # of elements allowed. Since we are using int32_t to store
+  // num_elements, the max value is 2^31-1. Still, the actual max # of elements
+  // is determined by max_file_size, kMaxFileSize, kElementTypeSize, and
+  // Header::kHeaderSize.
+ static constexpr int32_t kMaxNumElements =
+ std::numeric_limits<int32_t>::max();
+
// Creates a new FileBackedVector to read/write content to.
//
// filesystem: Object to make system level calls
@@ -149,9 +181,44 @@ class FileBackedVector {
// within a directory that already exists.
// mmap_strategy : Strategy/optimizations to access the content in the vector,
// see MemoryMappedFile::Strategy for more details
+  // max_file_size: Maximum file size for FileBackedVector, default
+  //                kMaxFileSize. Note that this value won't be written into
+  //                the header, so the maximum file size is always specified
+  //                at runtime and the caller should make sure the value is
+  //                correct and reasonable. It is also cached in the
+  //                MemoryMappedFile member, so we can always call
+  //                mmapped_file_->max_file_size() to get it.
+  //                The range should be
+  //                [Header::kHeaderSize + kElementTypeSize, kMaxFileSize], and
+  //                (max_file_size - Header::kHeaderSize) / kElementTypeSize is
+  //                the max # of elements that can be stored.
+  // pre_mapping_mmap_size: pre-mapping size of MemoryMappedFile, default 0.
+  //                        Pre-mapping a large memory region for the file lets
+  //                        us grow the underlying file later without remapping
+  //                        too frequently, reducing the cost of system calls
+  //                        and memory paging after remap. The user should
+  //                        specify a reasonable size to save remapping cost
+  //                        without exhausting memory all at once up front.
+  //                        Note: if the file exists and pre_mapping_mmap_size
+  //                        is smaller than file_size - Header::kHeaderSize,
+  //                        then it still pre-maps file_size -
+  //                        Header::kHeaderSize to make all existing elements
+  //                        available.
+ // TODO(b/247671531): figure out pre_mapping_mmap_size for each
+ // FileBackedVector use case.
+ //
+ // Return:
+ // FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
+ // checksum.
+ // INTERNAL_ERROR on I/O errors.
+ // INVALID_ARGUMENT_ERROR if max_file_size is incorrect.
+ // UNIMPLEMENTED_ERROR if created with strategy READ_WRITE_MANUAL_SYNC.
static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
Create(const Filesystem& filesystem, const std::string& file_path,
- MemoryMappedFile::Strategy mmap_strategy);
+ MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size = kMaxFileSize,
+ int32_t pre_mapping_mmap_size = 0);
// Deletes the FileBackedVector
//
@@ -169,23 +236,144 @@ class FileBackedVector {
// synced by the system and the checksum will be updated.
~FileBackedVector();
- // Accesses the element at idx.
+ // Gets a copy of the element at idx.
+ //
+ // This is useful if you think the FileBackedVector may grow before you need
+ // to access this return value. When the FileBackedVector grows, the
+ // underlying mmap will be unmapped and remapped, which will invalidate any
+ // pointers to the previously mapped region. Getting a copy will avoid
+ // referencing the now-invalidated region.
//
// Returns:
- // OUT_OF_RANGE_ERROR if idx < 0 or > num_elements()
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements()
+ libtextclassifier3::StatusOr<T> GetCopy(int32_t idx) const;
+
+ // Gets an immutable pointer to the element at idx.
+ //
+ // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the pointer
+ // returned by Get.
+ //
+ // This is useful if you do not think the FileBackedVector will grow before
+ // you need to reference this value, and you want to avoid a copy. When the
+ // FileBackedVector grows, the underlying mmap will be unmapped and remapped,
+ // which will invalidate this pointer to the previously mapped region.
+ //
+ // Returns:
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements()
libtextclassifier3::StatusOr<const T*> Get(int32_t idx) const;
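
  // Example use (illustrative sketch, not part of the original header; assumes
  // a FileBackedVector<int>* vec with at least 6 elements):
  //   ICING_ASSIGN_OR_RETURN(int copied, vec->GetCopy(/*idx=*/5));
  //   ICING_RETURN_IF_ERROR(vec->Append(42));  // May grow and remap the file.
  //   // `copied` stays valid across the remap; a pointer from Get() may not:
  //   ICING_ASSIGN_OR_RETURN(const int* ptr, vec->Get(/*idx=*/5));
  //   // *ptr is only safe to use before the next Set/Append/Allocate call.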
+ // Gets a MutableView to the element at idx.
+ //
+ // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the
+ // reference returned by MutableView::Get().
+ //
+ // This is useful if you do not think the FileBackedVector will grow before
+ // you need to reference this value, and you want to mutate the underlying
+ // data directly. When the FileBackedVector grows, the underlying mmap will be
+ // unmapped and remapped, which will invalidate this MutableView to the
+ // previously mapped region.
+ //
+ // Returns:
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx >= num_elements()
+ libtextclassifier3::StatusOr<MutableView> GetMutable(int32_t idx);
+
+ // Gets a MutableArrayView to the elements at range [idx, idx + len).
+ //
+ // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the
+ // reference/pointer returned by MutableArrayView::operator[]/data().
+ //
+ // This is useful if you do not think the FileBackedVector will grow before
+ // you need to reference this value, and you want to mutate the underlying
+ // data directly. When the FileBackedVector grows, the underlying mmap will be
+ // unmapped and remapped, which will invalidate this MutableArrayView to the
+ // previously mapped region.
+ //
+ // Returns:
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx + len > num_elements()
+ libtextclassifier3::StatusOr<MutableArrayView> GetMutable(int32_t idx,
+ int32_t len);
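
  // Example use (illustrative sketch; assumes a FileBackedVector<int>* vec
  // with at least 8 elements):
  //   ICING_ASSIGN_OR_RETURN(FileBackedVector<int>::MutableArrayView mav,
  //                          vec->GetMutable(/*idx=*/4, /*len=*/4));
  //   int new_values[] = {1, 2, 3, 4};
  //   mav.SetArray(/*idx=*/0, new_values, /*arr_len=*/4);
  // SetArray marks each touched index dirty, keeping the partial crc correct.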
+
// Writes the value at idx.
//
+ // May grow the underlying file and mmapped region as needed to fit the new
+ // value. If it does grow, then any pointers/references to previous values
+ // returned from Get/GetMutable/Allocate may be invalidated.
+ //
// Returns:
- // OUT_OF_RANGE_ERROR if idx < 0 or file cannot be grown idx size
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx > kMaxIndex or file cannot be grown
+ // to fit idx + 1 elements
libtextclassifier3::Status Set(int32_t idx, const T& value);
- // Resizes to first len elements. The crc is not updated on truncation.
+  // Sets [idx, idx + len) to a single value.
+ //
+ // May grow the underlying file and mmapped region as needed to fit the new
+ // value. If it does grow, then any pointers/references to previous values
+ // returned from Get/GetMutable/Allocate may be invalidated.
+ //
+ // Returns:
+ // OUT_OF_RANGE_ERROR if idx < 0 or idx + len > kMaxNumElements or file
+ // cannot be grown to fit idx + len elements
+ libtextclassifier3::Status Set(int32_t idx, int32_t len, const T& value);
+
+ // Appends the value to the end of the vector.
+ //
+ // May grow the underlying file and mmapped region as needed to fit the new
+ // value. If it does grow, then any pointers/references to previous values
+ // returned from Get/GetMutable/Allocate may be invalidated.
+ //
+ // Returns:
+  //   OUT_OF_RANGE_ERROR if the file cannot be grown (i.e. it has reached
+  //   mmapped_file_->max_file_size())
+ libtextclassifier3::Status Append(const T& value) {
+ return Set(header_->num_elements, value);
+ }
+
+  // Allocates space of the given length at the end of the vector and returns
+  // a MutableArrayView to it.
+ //
+ // May grow the underlying file and mmapped region as needed to fit the new
+ // value. If it does grow, then any pointers/references to previous values
+ // returned from Get/GetMutable/Allocate may be invalidated.
+ //
+ // WARNING: Subsequent calls to Set/Append/Allocate may invalidate the
+ // reference/pointer returned by MutableArrayView::operator[]/data().
+ //
+ // This is useful if you do not think the FileBackedVector will grow before
+ // you need to reference this value, and you want to allocate adjacent spaces
+ // for multiple elements and mutate the underlying data directly. When the
+ // FileBackedVector grows, the underlying mmap will be unmapped and remapped,
+ // which will invalidate this MutableArrayView to the previously mapped
+ // region.
//
// Returns:
- // OUT_OF_RANGE_ERROR if len < 0 or >= num_elements()
- libtextclassifier3::Status TruncateTo(int32_t len);
+  //   OUT_OF_RANGE_ERROR if len <= 0 or the file cannot be grown (i.e. it has
+  //   reached mmapped_file_->max_file_size())
+ libtextclassifier3::StatusOr<MutableArrayView> Allocate(int32_t len);
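
  // Example use (illustrative sketch; assumes a FileBackedVector<char>* vec):
  //   ICING_ASSIGN_OR_RETURN(FileBackedVector<char>::MutableArrayView block,
  //                          vec->Allocate(/*len=*/3));
  //   block[0] = 'a';  // Non-const operator[] marks the index dirty first.
  //   block[1] = 'b';
  //   block[2] = 'c';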
+
+  // Resizes to the first new_num_elements elements. The crc is cleared on
+  // truncation and will be updated on destruction, or once the client calls
+  // ComputeChecksum() or PersistToDisk().
+  //
+  // Returns:
+  //   OUT_OF_RANGE_ERROR if new_num_elements < 0 or
+  //     new_num_elements >= num_elements()
+ libtextclassifier3::Status TruncateTo(int32_t new_num_elements);
+
+ // Sorts the vector within range [begin_idx, end_idx).
+ // It handles SetDirty properly for the file-backed-vector.
+ //
+ // Returns:
+ // OUT_OF_RANGE_ERROR if (0 <= begin_idx < end_idx <= num_elements()) does
+ // not hold
+ libtextclassifier3::Status Sort(int32_t begin_idx, int32_t end_idx);
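
  // Example use (illustrative sketch; assumes a non-empty
  // FileBackedVector<int>* vec):
  //   ICING_RETURN_IF_ERROR(
  //       vec->Sort(/*begin_idx=*/0, /*end_idx=*/vec->num_elements()));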
+
+  // Marks idx as changed iff idx < changes_end_, so a later ComputeChecksum()
+  // can update the checksum from the cached changes without rescanning
+  // [0, changes_end_).
+  //
+  // If the buffer size exceeds the kPartialCrcLimitDiv threshold, then clear
+  // all change buffers and set changes_end_ to 0, indicating that the checksum
+  // should be recomputed from idx 0 (the beginning). Otherwise cache the
+  // change.
+ void SetDirty(int32_t idx);
// Flushes content to underlying file.
//
@@ -211,22 +399,78 @@ class FileBackedVector {
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
+ // Updates checksum of the vector contents and returns it.
+ //
+ // Returns:
+ // INTERNAL_ERROR if the vector's internal state is inconsistent
+ libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+
// Accessors.
const T* array() const {
return reinterpret_cast<const T*>(mmapped_file_->region());
}
- T* mutable_array() const {
- return reinterpret_cast<T*>(mmapped_file_->mutable_region());
- }
-
int32_t num_elements() const { return header_->num_elements; }
- // Updates checksum of the vector contents and returns it.
- //
- // Returns:
- // INTERNAL_ERROR if the vector's internal state is inconsistent
- libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+ public:
+ class MutableArrayView {
+ public:
+ const T& operator[](int32_t idx) const { return data_[idx]; }
+ T& operator[](int32_t idx) {
+ SetDirty(idx);
+ return data_[idx];
+ }
+
+ const T* data() const { return data_; }
+
+ int32_t size() const { return len_; }
+
+    // Sets the mutable array slice (starting at idx) from the given element
+    // array. It handles SetDirty properly for the file-backed-vector when
+    // modifying elements.
+ //
+ // REQUIRES: arr is valid && arr_len >= 0 && idx >= 0 && idx + arr_len <=
+ // size(), otherwise the behavior is undefined.
+ void SetArray(int32_t idx, const T* arr, int32_t arr_len) {
+ for (int32_t i = 0; i < arr_len; ++i) {
+ SetDirty(idx + i);
+ data_[idx + i] = arr[i];
+ }
+ }
+
+ private:
+ MutableArrayView(FileBackedVector<T>* vector, T* data, int32_t len)
+ : vector_(vector),
+ data_(data),
+ original_idx_(data - vector->array()),
+ len_(len) {}
+
+ void SetDirty(int32_t idx) { vector_->SetDirty(original_idx_ + idx); }
+
+ // Does not own. For SetDirty only.
+ FileBackedVector<T>* vector_;
+
+ // data_ points at vector_->mutable_array()[original_idx_]
+ T* data_;
+ int32_t original_idx_;
+ int32_t len_;
+
+ friend class FileBackedVector;
+ };
+
+ class MutableView {
+ public:
+ const T& Get() const { return mutable_array_view_[0]; }
+ T& Get() { return mutable_array_view_[0]; }
+
+ private:
+ MutableView(FileBackedVector<T>* vector, T* data)
+ : mutable_array_view_(vector, data, 1) {}
+
+ MutableArrayView mutable_array_view_;
+
+ friend class FileBackedVector;
+ };
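
  // Example use of MutableView (illustrative sketch; assumes a
  // FileBackedVector<int>* vec with at least 1 element):
  //   ICING_ASSIGN_OR_RETURN(FileBackedVector<int>::MutableView mutable_view,
  //                          vec->GetMutable(/*idx=*/0));
  //   mutable_view.Get() = 99;  // Marks index 0 dirty before writing.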
private:
// We track partial updates to the array for crc updating. This
@@ -239,24 +483,27 @@ class FileBackedVector {
// Grow file by at least this many elements if array is growable.
static constexpr int64_t kGrowElements = 1u << 14; // 16K
- // Max number of elements that can be held by the vector.
- static constexpr int64_t kMaxNumElements = 1u << 20; // 1M
+ // Absolute max index allowed.
+ static constexpr int32_t kMaxIndex = kMaxNumElements - 1;
// Can only be created through the factory ::Create function
- FileBackedVector(const Filesystem& filesystem, const std::string& file_path,
- std::unique_ptr<Header> header,
- std::unique_ptr<MemoryMappedFile> mmapped_file);
+ explicit FileBackedVector(const Filesystem& filesystem,
+ const std::string& file_path,
+ std::unique_ptr<Header> header,
+ MemoryMappedFile&& mmapped_file);
// Initialize a new FileBackedVector, and create the file.
static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
InitializeNewFile(const Filesystem& filesystem, const std::string& file_path,
- ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy);
+ ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size, int32_t pre_mapping_mmap_size);
// Initialize a FileBackedVector from an existing file.
static libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
InitializeExistingFile(const Filesystem& filesystem,
const std::string& file_path, ScopedFd fd,
- MemoryMappedFile::Strategy mmap_strategy);
+ MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size, int32_t pre_mapping_mmap_size);
// Grows the underlying file to hold at least num_elements
//
@@ -264,6 +511,10 @@ class FileBackedVector {
// OUT_OF_RANGE_ERROR if we can't grow to the specified size
libtextclassifier3::Status GrowIfNecessary(int32_t num_elements);
+ T* mutable_array() const {
+ return reinterpret_cast<T*>(mmapped_file_->mutable_region());
+ }
+
// Cached constructor params.
const Filesystem* const filesystem_;
const std::string file_path_;
@@ -281,26 +532,33 @@ class FileBackedVector {
// Buffer of the original elements that have been changed since the last crc
// update. Will be cleared if the size grows too big.
std::string saved_original_buffer_;
-
- // Keep track of all pages we touched so we can write them back to
- // disk.
- std::vector<bool> dirty_pages_;
};
template <typename T>
+constexpr int32_t FileBackedVector<T>::kMaxFileSize;
+
+template <typename T>
+constexpr int32_t FileBackedVector<T>::kElementTypeSize;
+
+template <typename T>
+constexpr int32_t FileBackedVector<T>::kMaxNumElements;
+
+template <typename T>
constexpr int32_t FileBackedVector<T>::kPartialCrcLimitDiv;
template <typename T>
constexpr int64_t FileBackedVector<T>::kGrowElements;
template <typename T>
-constexpr int64_t FileBackedVector<T>::kMaxNumElements;
+constexpr int32_t FileBackedVector<T>::kMaxIndex;
template <typename T>
libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
FileBackedVector<T>::Create(const Filesystem& filesystem,
const std::string& file_path,
- MemoryMappedFile::Strategy mmap_strategy) {
+ MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size,
+ int32_t pre_mapping_mmap_size) {
if (mmap_strategy == MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC) {
// FileBackedVector's behavior of growing the file underneath the mmap is
// inherently broken with MAP_PRIVATE. Growing the vector requires extending
@@ -313,6 +571,14 @@ FileBackedVector<T>::Create(const Filesystem& filesystem,
"mmap strategy.");
}
+ if (max_file_size < Header::kHeaderSize + kElementTypeSize ||
+ max_file_size > kMaxFileSize) {
+ // FileBackedVector should be able to store at least 1 element, so
+ // max_file_size should be at least Header::kHeaderSize + kElementTypeSize.
+ return absl_ports::InvalidArgumentError(
+ "Invalid max file size for FileBackedVector");
+ }
+
ScopedFd fd(filesystem.OpenForWrite(file_path.c_str()));
if (!fd.is_valid()) {
return absl_ports::InternalError(
@@ -325,41 +591,56 @@ FileBackedVector<T>::Create(const Filesystem& filesystem,
absl_ports::StrCat("Bad file size for file ", file_path));
}
+ if (max_file_size < file_size) {
+ return absl_ports::InvalidArgumentError(
+ "Max file size should not be smaller than the existing file size");
+ }
+
const bool new_file = file_size == 0;
if (new_file) {
return InitializeNewFile(filesystem, file_path, std::move(fd),
- mmap_strategy);
+ mmap_strategy, max_file_size,
+ pre_mapping_mmap_size);
}
return InitializeExistingFile(filesystem, file_path, std::move(fd),
- mmap_strategy);
+ mmap_strategy, max_file_size,
+ pre_mapping_mmap_size);
}
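
// Example use of Create with the new parameters (illustrative sketch; the
// sizes are arbitrary and assume a Filesystem `filesystem` and a
// std::string `file_path`):
//   ICING_ASSERT_OK_AND_ASSIGN(
//       std::unique_ptr<FileBackedVector<int>> vec,
//       FileBackedVector<int>::Create(
//           filesystem, file_path,
//           MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
//           /*max_file_size=*/1 << 20, /*pre_mapping_mmap_size=*/1 << 16));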
template <typename T>
libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
-FileBackedVector<T>::InitializeNewFile(
- const Filesystem& filesystem, const std::string& file_path, ScopedFd fd,
- MemoryMappedFile::Strategy mmap_strategy) {
+FileBackedVector<T>::InitializeNewFile(const Filesystem& filesystem,
+ const std::string& file_path,
+ ScopedFd fd,
+ MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size,
+ int32_t pre_mapping_mmap_size) {
// Create header.
auto header = std::make_unique<Header>();
header->magic = FileBackedVector<T>::Header::kMagic;
- header->element_size = sizeof(T);
+ header->element_size = kElementTypeSize;
header->header_checksum = header->CalculateHeaderChecksum();
// We use Write() here, instead of writing through the mmapped region
// created below, so we can gracefully handle errors that occur when the
// disk is full. See b/77309668 for details.
if (!filesystem.PWrite(fd.get(), /*offset=*/0, header.get(),
- sizeof(Header))) {
+ Header::kHeaderSize)) {
return absl_ports::InternalError("Failed to write header");
}
- // Constructor of MemoryMappedFile doesn't actually call mmap(), mmap()
- // happens on MemoryMappedFile::Remap(). So having a potentially unflushed fd
- // at this point shouldn't run into issues with a mmap of the same file. But
- // we'll close the fd just in case.
+  // Close the fd since the constructor of MemoryMappedFile calls mmap(), and
+  // we need to flush the fd before mmap().
fd.reset();
- auto mmapped_file =
- std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy);
+
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(filesystem, file_path, mmap_strategy,
+ max_file_size,
+ /*pre_mapping_file_offset=*/Header::kHeaderSize,
+ /*pre_mapping_mmap_size=*/
+ std::min(max_file_size - Header::kHeaderSize,
+ pre_mapping_mmap_size)));
return std::unique_ptr<FileBackedVector<T>>(new FileBackedVector<T>(
filesystem, file_path, std::move(header), std::move(mmapped_file)));
@@ -369,15 +650,21 @@ template <typename T>
libtextclassifier3::StatusOr<std::unique_ptr<FileBackedVector<T>>>
FileBackedVector<T>::InitializeExistingFile(
const Filesystem& filesystem, const std::string& file_path,
- const ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy) {
+ const ScopedFd fd, MemoryMappedFile::Strategy mmap_strategy,
+ int32_t max_file_size, int32_t pre_mapping_mmap_size) {
int64_t file_size = filesystem.GetFileSize(file_path.c_str());
- if (file_size < sizeof(FileBackedVector<T>::Header)) {
+ if (file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Bad file size for file ", file_path));
+ }
+
+ if (file_size < Header::kHeaderSize) {
return absl_ports::InternalError(
absl_ports::StrCat("File header too short for ", file_path));
}
auto header = std::make_unique<Header>();
- if (!filesystem.PRead(fd.get(), header.get(), sizeof(Header),
+ if (!filesystem.PRead(fd.get(), header.get(), Header::kHeaderSize,
/*offset=*/0)) {
return absl_ports::InternalError(
absl_ports::StrCat("Failed to read header of ", file_path));
@@ -391,34 +678,50 @@ FileBackedVector<T>::InitializeExistingFile(
absl_ports::StrCat("Invalid header kMagic for ", file_path));
}
- // Mmap the content of the vector, excluding the header so its easier to
- // access elements from the mmapped region
- auto mmapped_file =
- std::make_unique<MemoryMappedFile>(filesystem, file_path, mmap_strategy);
- ICING_RETURN_IF_ERROR(
- mmapped_file->Remap(sizeof(Header), file_size - sizeof(Header)));
-
// Check header
if (header->header_checksum != header->CalculateHeaderChecksum()) {
- return absl_ports::InternalError(
+ return absl_ports::FailedPreconditionError(
absl_ports::StrCat("Invalid header crc for ", file_path));
}
- if (header->element_size != sizeof(T)) {
+ if (header->element_size != kElementTypeSize) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
- "Inconsistent element size, expected %zd, actual %d", sizeof(T),
+ "Inconsistent element size, expected %d, actual %d", kElementTypeSize,
header->element_size));
}
+ int64_t min_file_size =
+ static_cast<int64_t>(header->num_elements) * kElementTypeSize +
+ Header::kHeaderSize;
+ if (min_file_size > file_size) {
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Inconsistent file size, expected %" PRId64 ", actual %" PRId64,
+ min_file_size, file_size));
+ }
+
+  // Mmap the content of the vector, excluding the header so it's easier to
+  // access elements from the mmapped region.
+ // Although users can specify their own pre_mapping_mmap_size, we should make
+ // sure that the pre-map size is at least file_size - Header::kHeaderSize to
+ // make all existing elements available.
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(
+ filesystem, file_path, mmap_strategy, max_file_size,
+ /*pre_mapping_file_offset=*/Header::kHeaderSize,
+ /*pre_mapping_mmap_size=*/
+ std::max(
+ file_size - Header::kHeaderSize,
+ static_cast<int64_t>(std::min(max_file_size - Header::kHeaderSize,
+ pre_mapping_mmap_size)))));
+
// Check vector contents
- Crc32 vector_checksum;
- std::string_view vector_contents(
- reinterpret_cast<const char*>(mmapped_file->region()),
- header->num_elements * sizeof(T));
- vector_checksum.Append(vector_contents);
+ Crc32 vector_checksum(
+ std::string_view(reinterpret_cast<const char*>(mmapped_file.region()),
+ header->num_elements * kElementTypeSize));
if (vector_checksum.Get() != header->vector_checksum) {
- return absl_ports::InternalError(
+ return absl_ports::FailedPreconditionError(
absl_ports::StrCat("Invalid vector contents for ", file_path));
}
@@ -437,14 +740,15 @@ libtextclassifier3::Status FileBackedVector<T>::Delete(
}
template <typename T>
-FileBackedVector<T>::FileBackedVector(
- const Filesystem& filesystem, const std::string& file_path,
- std::unique_ptr<Header> header,
- std::unique_ptr<MemoryMappedFile> mmapped_file)
+FileBackedVector<T>::FileBackedVector(const Filesystem& filesystem,
+ const std::string& file_path,
+ std::unique_ptr<Header> header,
+ MemoryMappedFile&& mmapped_file)
: filesystem_(&filesystem),
file_path_(file_path),
header_(std::move(header)),
- mmapped_file_(std::move(mmapped_file)),
+ mmapped_file_(
+ std::make_unique<MemoryMappedFile>(std::move(mmapped_file))),
changes_end_(header_->num_elements) {}
template <typename T>
@@ -460,6 +764,13 @@ FileBackedVector<T>::~FileBackedVector() {
}
template <typename T>
+libtextclassifier3::StatusOr<T> FileBackedVector<T>::GetCopy(
+ int32_t idx) const {
+ ICING_ASSIGN_OR_RETURN(const T* value, Get(idx));
+ return *value;
+}
+
+template <typename T>
libtextclassifier3::StatusOr<const T*> FileBackedVector<T>::Get(
int32_t idx) const {
if (idx < 0) {
@@ -477,54 +788,111 @@ libtextclassifier3::StatusOr<const T*> FileBackedVector<T>::Get(
}
template <typename T>
+libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableView>
+FileBackedVector<T>::GetMutable(int32_t idx) {
+ if (idx < 0) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
+ }
+
+ if (idx >= header_->num_elements) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Index, %d, was greater than vector size, %d", idx,
+ header_->num_elements));
+ }
+
+ return MutableView(this, &mutable_array()[idx]);
+}
+
+template <typename T>
+libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableArrayView>
+FileBackedVector<T>::GetMutable(int32_t idx, int32_t len) {
+ if (idx < 0) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
+ }
+
+ if (idx > header_->num_elements - len) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Index with len, %d %d, was greater than vector size, %d", idx, len,
+ header_->num_elements));
+ }
+
+ return MutableArrayView(this, &mutable_array()[idx], len);
+}
+
+template <typename T>
libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx,
const T& value) {
+ return Set(idx, 1, value);
+}
+
+template <typename T>
+libtextclassifier3::Status FileBackedVector<T>::Set(int32_t idx, int32_t len,
+ const T& value) {
if (idx < 0) {
return absl_ports::OutOfRangeError(
IcingStringUtil::StringPrintf("Index, %d, was less than 0", idx));
}
- int32_t start_byte = idx * sizeof(T);
-
- ICING_RETURN_IF_ERROR(GrowIfNecessary(idx + 1));
+ if (len <= 0) {
+ return absl_ports::OutOfRangeError("Invalid set length");
+ }
- if (idx + 1 > header_->num_elements) {
- header_->num_elements = idx + 1;
+ if (idx > kMaxNumElements - len) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Length %d (with index %d), was too long "
+ "for max num elements allowed, %d",
+ len, idx, kMaxNumElements));
}
- if (mutable_array()[idx] == value) {
- // No need to update
- return libtextclassifier3::Status::OK;
+ ICING_RETURN_IF_ERROR(GrowIfNecessary(idx + len));
+
+ if (idx + len > header_->num_elements) {
+ header_->num_elements = idx + len;
}
- // Cache original value to update crcs.
- if (idx < changes_end_) {
- // If we exceed kPartialCrcLimitDiv, clear changes_end_ to
- // revert to full CRC.
- if ((saved_original_buffer_.size() + sizeof(T)) *
- FileBackedVector<T>::kPartialCrcLimitDiv >
- changes_end_ * sizeof(T)) {
- ICING_VLOG(2) << "FileBackedVector change tracking limit exceeded";
- changes_.clear();
- saved_original_buffer_.clear();
- changes_end_ = 0;
- header_->vector_checksum = 0;
- } else {
- changes_.push_back(idx);
- saved_original_buffer_.append(
- reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte,
- sizeof(T));
+ for (int32_t i = 0; i < len; ++i) {
+ if (array()[idx + i] == value) {
+ // No need to update
+ continue;
}
+
+ SetDirty(idx + i);
+ mutable_array()[idx + i] = value;
}
- mutable_array()[idx] = value;
return libtextclassifier3::Status::OK;
}
template <typename T>
+libtextclassifier3::StatusOr<typename FileBackedVector<T>::MutableArrayView>
+FileBackedVector<T>::Allocate(int32_t len) {
+ if (len <= 0) {
+ return absl_ports::OutOfRangeError("Invalid allocate length");
+ }
+
+ if (len > kMaxNumElements - header_->num_elements) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Cannot allocate %d elements", len));
+ }
+
+ // Although header_->num_elements + len doesn't exceed kMaxNumElements, the
+  // actual max # of elements is determined by mmapped_file_->max_file_size(),
+ // kElementTypeSize, and kHeaderSize. Thus, it is still possible to fail to
+ // grow the file.
+ ICING_RETURN_IF_ERROR(GrowIfNecessary(header_->num_elements + len));
+
+ int32_t start_idx = header_->num_elements;
+ header_->num_elements += len;
+
+ return MutableArrayView(this, &mutable_array()[start_idx], len);
+}
+
+template <typename T>
libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
int32_t num_elements) {
- if (sizeof(T) == 0) {
+ if (kElementTypeSize == 0) {
// Growing is a no-op
return libtextclassifier3::Status::OK;
}
@@ -533,32 +901,35 @@ libtextclassifier3::Status FileBackedVector<T>::GrowIfNecessary(
return libtextclassifier3::Status::OK;
}
- if (num_elements > FileBackedVector<T>::kMaxNumElements) {
+ if (num_elements > (mmapped_file_->max_file_size() - Header::kHeaderSize) /
+ kElementTypeSize) {
return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
- "%d exceeds maximum number of elements allowed, %lld", num_elements,
- static_cast<long long>(FileBackedVector<T>::kMaxNumElements)));
+        "%d elements' total size exceeds the maximum bytes of elements "
+ "%" PRId64 " bytes",
+ num_elements, mmapped_file_->max_file_size() - Header::kHeaderSize));
}
- int64_t current_file_size = filesystem_->GetFileSize(file_path_.c_str());
- int64_t least_file_size_needed = sizeof(Header) + num_elements * sizeof(T);
-
- if (least_file_size_needed <= current_file_size) {
- // Our underlying file can hold the target num_elements cause we've grown
- // before
+ int32_t least_file_size_needed =
+ Header::kHeaderSize + num_elements * kElementTypeSize; // Won't overflow
+ if (least_file_size_needed <= mmapped_file_->available_size()) {
return libtextclassifier3::Status::OK;
}
- // Otherwise, we need to grow. Grow to kGrowElements boundary.
- least_file_size_needed = math_util::RoundUpTo(
- least_file_size_needed,
- int64_t{FileBackedVector<T>::kGrowElements * sizeof(T)});
- if (!filesystem_->Grow(file_path_.c_str(), least_file_size_needed)) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Couldn't grow file ", file_path_));
- }
-
- ICING_RETURN_IF_ERROR(mmapped_file_->Remap(
- sizeof(Header), least_file_size_needed - sizeof(Header)));
+ int64_t round_up_file_size_needed = math_util::RoundUpTo(
+ int64_t{least_file_size_needed},
+ int64_t{FileBackedVector<T>::kGrowElements} * kElementTypeSize);
+
+ // Call GrowAndRemapIfNecessary. It handles file growth internally and remaps
+ // intelligently.
+ // We've ensured that least_file_size_needed (for num_elements) doesn't exceed
+ // mmapped_file_->max_file_size(), but it is still possible that
+  // round_up_file_size_needed exceeds it, so use the smaller of the two as
+  // new_mmap_size.
+ ICING_RETURN_IF_ERROR(mmapped_file_->GrowAndRemapIfNecessary(
+ /*new_file_offset=*/Header::kHeaderSize,
+ /*new_mmap_size=*/std::min(round_up_file_size_needed,
+ mmapped_file_->max_file_size()) -
+ Header::kHeaderSize));
return libtextclassifier3::Status::OK;
}
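
// Worked example of the growth arithmetic above (illustrative): with T = int
// (kElementTypeSize = 4), Header::kHeaderSize = 12, and kGrowElements = 2^14,
// growing to 100 elements needs 12 + 100 * 4 = 412 bytes, which RoundUpTo
// rounds to 16384 * 4 = 65536 bytes; the new mmap then covers
// min(65536, max_file_size()) - 12 bytes past the header.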
@@ -577,11 +948,58 @@ libtextclassifier3::Status FileBackedVector<T>::TruncateTo(
new_num_elements, header_->num_elements));
}
+ ICING_VLOG(2)
+ << "FileBackedVector truncating, need to recalculate entire checksum";
+ changes_.clear();
+ saved_original_buffer_.clear();
+ changes_end_ = 0;
+ header_->vector_checksum = 0;
+
header_->num_elements = new_num_elements;
return libtextclassifier3::Status::OK;
}
template <typename T>
+libtextclassifier3::Status FileBackedVector<T>::Sort(int32_t begin_idx,
+ int32_t end_idx) {
+ if (begin_idx < 0 || begin_idx >= end_idx ||
+ end_idx > header_->num_elements) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Invalid sort index, %d, %d", begin_idx, end_idx));
+ }
+ for (int32_t i = begin_idx; i < end_idx; ++i) {
+ SetDirty(i);
+ }
+ std::sort(mutable_array() + begin_idx, mutable_array() + end_idx);
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename T>
+void FileBackedVector<T>::SetDirty(int32_t idx) {
+ // Cache original value to update crcs.
+ if (idx >= 0 && idx < changes_end_) {
+ // If we exceed kPartialCrcLimitDiv, clear changes_end_ to
+ // revert to full CRC.
+ if ((saved_original_buffer_.size() + kElementTypeSize) *
+ FileBackedVector<T>::kPartialCrcLimitDiv >
+ changes_end_ * kElementTypeSize) {
+ ICING_VLOG(2) << "FileBackedVector change tracking limit exceeded";
+ changes_.clear();
+ saved_original_buffer_.clear();
+ changes_end_ = 0;
+ header_->vector_checksum = 0;
+ } else {
+ int32_t start_byte = idx * kElementTypeSize;
+
+ changes_.push_back(idx);
+ saved_original_buffer_.append(
+ reinterpret_cast<char*>(const_cast<T*>(array())) + start_byte,
+ kElementTypeSize);
+ }
+ }
+}
+
+template <typename T>
libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
// First apply the modified area. Keep a bitmap of already updated
// regions so we don't double-update.
@@ -592,8 +1010,7 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
int num_truncated = 0;
int num_overlapped = 0;
int num_duplicate = 0;
- for (size_t i = 0; i < changes_.size(); i++) {
- const int32_t change_offset = changes_[i];
+ for (const int32_t change_offset : changes_) {
if (change_offset > changes_end_) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Failed to update crc, change offset %d, changes_end_ %d",
@@ -607,9 +1024,10 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
}
// Turn change buffer into change^original.
- const char* buffer_end = &saved_original_buffer_[cur_offset + sizeof(T)];
- const char* cur_array =
- reinterpret_cast<const char*>(array()) + change_offset * sizeof(T);
+ const char* buffer_end =
+ &saved_original_buffer_[cur_offset + kElementTypeSize];
+ const char* cur_array = reinterpret_cast<const char*>(array()) +
+ change_offset * kElementTypeSize;
// Now xor in. SSE acceleration please?
for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
cur++, cur_array++) {
@@ -621,9 +1039,9 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
bool overlap = false;
uint32_t cur_element = change_offset;
for (char* cur = &saved_original_buffer_[cur_offset]; cur < buffer_end;
- cur_element++, cur += sizeof(T)) {
+ cur_element++, cur += kElementTypeSize) {
if (updated[cur_element]) {
- memset(cur, 0, sizeof(T));
+ memset(cur, 0, kElementTypeSize);
overlap = true;
} else {
updated[cur_element] = true;
@@ -634,10 +1052,11 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
// Apply update to crc.
if (new_update) {
// Explicitly create the string_view with length
- std::string_view xored_str(buffer_end - sizeof(T), sizeof(T));
+ std::string_view xored_str(buffer_end - kElementTypeSize,
+ kElementTypeSize);
if (!cur_crc
- .UpdateWithXor(xored_str, changes_end_ * sizeof(T),
- change_offset * sizeof(T))
+ .UpdateWithXor(xored_str, changes_end_ * kElementTypeSize,
+ change_offset * kElementTypeSize)
.ok()) {
return absl_ports::InternalError(IcingStringUtil::StringPrintf(
"Failed to update crc, change offset %d, change "
@@ -651,8 +1070,9 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
} else {
num_duplicate++;
}
- cur_offset += sizeof(T);
+ cur_offset += kElementTypeSize;
}
+
if (!changes_.empty()) {
ICING_VLOG(2) << IcingStringUtil::StringPrintf(
"Array update partial crcs %d truncated %d overlapped %d duplicate %d",
@@ -663,8 +1083,9 @@ libtextclassifier3::StatusOr<Crc32> FileBackedVector<T>::ComputeChecksum() {
if (changes_end_ < header_->num_elements) {
// Explicitly create the string_view with length
std::string_view update_str(
- reinterpret_cast<const char*>(array()) + changes_end_ * sizeof(T),
- (header_->num_elements - changes_end_) * sizeof(T));
+ reinterpret_cast<const char*>(array()) +
+ changes_end_ * kElementTypeSize,
+ (header_->num_elements - changes_end_) * kElementTypeSize);
cur_crc.Append(update_str);
ICING_VLOG(2) << IcingStringUtil::StringPrintf(
"Array update tail crc offset %d -> %d", changes_end_,
@@ -689,7 +1110,7 @@ libtextclassifier3::Status FileBackedVector<T>::PersistToDisk() {
header_->header_checksum = header_->CalculateHeaderChecksum();
if (!filesystem_->PWrite(file_path_.c_str(), /*offset=*/0, header_.get(),
- sizeof(Header))) {
+ Header::kHeaderSize)) {
return absl_ports::InternalError("Failed to sync header");
}
@@ -723,7 +1144,11 @@ libtextclassifier3::StatusOr<int64_t> FileBackedVector<T>::GetElementsFileSize()
return absl_ports::InternalError(
"Failed to get file size of elements in the file-backed vector");
}
- return total_file_size - sizeof(Header);
+ if (total_file_size < Header::kHeaderSize) {
+ return absl_ports::InternalError(
+ "File size should not be smaller than header size");
+ }
+ return total_file_size - Header::kHeaderSize;
}
} // namespace lib
diff --git a/icing/file/file-backed-vector_benchmark.cc b/icing/file/file-backed-vector_benchmark.cc
new file mode 100644
index 0000000..0447e93
--- /dev/null
+++ b/icing/file/file-backed-vector_benchmark.cc
@@ -0,0 +1,158 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <limits>
+#include <memory>
+#include <random>
+#include <string>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/destructible-directory.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+class FileBackedVectorBenchmark {
+ public:
+ explicit FileBackedVectorBenchmark()
+ : base_dir_(GetTestTempDir() + "/file_backed_vector_benchmark"),
+ file_path_(base_dir_ + "/test_vector"),
+ ddir_(&filesystem_, base_dir_),
+ random_engine_(/*seed=*/12345) {}
+
+ const Filesystem& filesystem() const { return filesystem_; }
+ const std::string& file_path() const { return file_path_; }
+ std::default_random_engine& random_engine() { return random_engine_; }
+
+ private:
+ Filesystem filesystem_;
+ std::string base_dir_;
+ std::string file_path_;
+ DestructibleDirectory ddir_;
+
+ std::default_random_engine random_engine_;
+};
+
+// Benchmark Set() without extending the vector, i.e. the index should be in
+// the range [0, num_elts - 1].
+void BM_Set(benchmark::State& state) {
+ int num_elts = state.range(0);
+
+ FileBackedVectorBenchmark fbv_benchmark;
+
+ fbv_benchmark.filesystem().DeleteFile(fbv_benchmark.file_path().c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> fbv,
+ FileBackedVector<int>::Create(
+ fbv_benchmark.filesystem(), fbv_benchmark.file_path(),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ // Extend to num_elts
+ ICING_ASSERT_OK(fbv->Set(num_elts - 1, 0));
+
+ std::uniform_int_distribution<> distrib(0, num_elts - 1);
+ for (auto _ : state) {
+ int idx = distrib(fbv_benchmark.random_engine());
+ ICING_ASSERT_OK(fbv->Set(idx, idx));
+ }
+}
+BENCHMARK(BM_Set)
+ ->Arg(1 << 10)
+ ->Arg(1 << 11)
+ ->Arg(1 << 12)
+ ->Arg(1 << 13)
+ ->Arg(1 << 14)
+ ->Arg(1 << 15)
+ ->Arg(1 << 16)
+ ->Arg(1 << 17)
+ ->Arg(1 << 18)
+ ->Arg(1 << 19)
+ ->Arg(1 << 20);
+
+// Benchmark a single Append(), equivalent to Set(fbv->num_elements(), val),
+// which extends the vector every iteration.
+void BM_Append(benchmark::State& state) {
+ FileBackedVectorBenchmark fbv_benchmark;
+
+ fbv_benchmark.filesystem().DeleteFile(fbv_benchmark.file_path().c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> fbv,
+ FileBackedVector<int>::Create(
+ fbv_benchmark.filesystem(), fbv_benchmark.file_path(),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ std::uniform_int_distribution<> distrib(0, std::numeric_limits<int>::max());
+ for (auto _ : state) {
+ ICING_ASSERT_OK(fbv->Append(distrib(fbv_benchmark.random_engine())));
+ }
+}
+BENCHMARK(BM_Append);
+
+// Benchmark appending many elements.
+void BM_AppendMany(benchmark::State& state) {
+ int num = state.range(0);
+
+ FileBackedVectorBenchmark fbv_benchmark;
+
+ for (auto _ : state) {
+ state.PauseTiming();
+ fbv_benchmark.filesystem().DeleteFile(fbv_benchmark.file_path().c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> fbv,
+ FileBackedVector<int>::Create(
+ fbv_benchmark.filesystem(), fbv_benchmark.file_path(),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ state.ResumeTiming();
+
+ for (int i = 0; i < num; ++i) {
+ ICING_ASSERT_OK(fbv->Append(i));
+ }
+
+    // Since the destructor calls PersistToDisk, we reset the unique pointer
+    // to invoke the destructor instead of calling PersistToDisk explicitly,
+    // so PersistToDisk is called only once.
+ fbv.reset();
+ }
+}
+BENCHMARK(BM_AppendMany)
+ ->Arg(1 << 5)
+ ->Arg(1 << 6)
+ ->Arg(1 << 7)
+ ->Arg(1 << 8)
+ ->Arg(1 << 9)
+ ->Arg(1 << 10)
+ ->Arg(1 << 11)
+ ->Arg(1 << 12)
+ ->Arg(1 << 13)
+ ->Arg(1 << 14)
+ ->Arg(1 << 15)
+ ->Arg(1 << 16)
+ ->Arg(1 << 17)
+ ->Arg(1 << 18)
+ ->Arg(1 << 19)
+ ->Arg(1 << 20);
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/file-backed-vector_test.cc b/icing/file/file-backed-vector_test.cc
index 7561b57..524bbc1 100644
--- a/icing/file/file-backed-vector_test.cc
+++ b/icing/file/file-backed-vector_test.cc
@@ -14,25 +14,35 @@
#include "icing/file/file-backed-vector.h"
-#include <errno.h>
+#include <unistd.h>
#include <algorithm>
+#include <cerrno>
#include <cstdint>
+#include <limits>
#include <memory>
+#include <string>
#include <string_view>
#include <vector>
+#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
+#include "icing/file/mock-filesystem.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
+using ::testing::ElementsAre;
using ::testing::Eq;
+using ::testing::IsTrue;
+using ::testing::Lt;
+using ::testing::Not;
using ::testing::Pointee;
+using ::testing::SizeIs;
namespace icing {
namespace lib {
@@ -55,24 +65,36 @@ class FileBackedVectorTest : public testing::Test {
// Helper method to loop over some data and insert into the vector at some idx
template <typename T>
- void Insert(FileBackedVector<T>* vector, int32_t idx, std::string data) {
- for (int i = 0; i < data.length(); ++i) {
+ void Insert(FileBackedVector<T>* vector, int32_t idx,
+ const std::vector<T>& data) {
+ for (int i = 0; i < data.size(); ++i) {
ICING_ASSERT_OK(vector->Set(idx + i, data.at(i)));
}
}
+ void Insert(FileBackedVector<char>* vector, int32_t idx, std::string data) {
+ Insert(vector, idx, std::vector<char>(data.begin(), data.end()));
+ }
+
// Helper method to retrieve data from the beginning of the vector
template <typename T>
- std::string_view Get(FileBackedVector<T>* vector, int32_t expected_len) {
+ std::vector<T> Get(FileBackedVector<T>* vector, int32_t idx,
+ int32_t expected_len) {
+ return std::vector<T>(vector->array() + idx,
+ vector->array() + idx + expected_len);
+ }
+
+ std::string_view Get(FileBackedVector<char>* vector, int32_t expected_len) {
return Get(vector, 0, expected_len);
}
- template <typename T>
- std::string_view Get(FileBackedVector<T>* vector, int32_t idx,
+ std::string_view Get(FileBackedVector<char>* vector, int32_t idx,
int32_t expected_len) {
return std::string_view(vector->array() + idx, expected_len);
}
+ const Filesystem& filesystem() const { return filesystem_; }
+
Filesystem filesystem_;
std::string file_path_;
int fd_;
@@ -96,6 +118,79 @@ TEST_F(FileBackedVectorTest, Create) {
}
}
+TEST_F(FileBackedVectorTest, CreateWithInvalidStrategy) {
+ // Create a vector with unimplemented strategy
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_MANUAL_SYNC),
+ StatusIs(libtextclassifier3::StatusCode::UNIMPLEMENTED));
+}
+
+TEST_F(FileBackedVectorTest, CreateWithCustomMaxFileSize) {
+ int32_t header_size = FileBackedVector<char>::Header::kHeaderSize;
+
+ // Create a vector with invalid max_file_size
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/header_size - 1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/header_size + sizeof(char) - 1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ {
+ // Create a vector with max_file_size that allows only 1 element.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto vector, FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/header_size + sizeof(char) * 1));
+ ICING_ASSERT_OK(vector->Set(0, 'a'));
+ }
+
+ {
+ // We can create it again with larger max_file_size, as long as it is not
+ // greater than kMaxFileSize.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto vector, FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/header_size + sizeof(char) * 2));
+ EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a'))));
+ ICING_ASSERT_OK(vector->Set(1, 'b'));
+ }
+
+ // We cannot create it again with max_file_size < current_file_size, even if
+ // it is a valid value.
+ int64_t current_file_size = filesystem_.GetFileSize(file_path_.c_str());
+ ASSERT_THAT(current_file_size, Eq(header_size + sizeof(char) * 2));
+ ASSERT_THAT(current_file_size - 1, Not(Lt(header_size + sizeof(char))));
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/current_file_size - 1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ {
+ // We can create it again with max_file_size == current_file_size.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto vector, FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/current_file_size));
+ EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a'))));
+ EXPECT_THAT(vector->Get(1), IsOkAndHolds(Pointee(Eq('b'))));
+ }
+}
+
TEST_F(FileBackedVectorTest, SimpleShared) {
// Create a vector and add some data.
ICING_ASSERT_OK_AND_ASSIGN(
@@ -132,7 +227,7 @@ TEST_F(FileBackedVectorTest, SimpleShared) {
ASSERT_THAT(FileBackedVector<char>::Create(
filesystem_, file_path_,
MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
// Get it back into an ok state
filesystem_.PWrite(file_path_.data(),
@@ -158,8 +253,8 @@ TEST_F(FileBackedVectorTest, SimpleShared) {
// Truncate the content
ICING_EXPECT_OK(vector->TruncateTo(0));
- // We don't automatically update the crc when we truncate.
- EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(good_crc));
+ // Crc is cleared after truncation and reset to 0.
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
EXPECT_EQ(0u, vector->num_elements());
}
@@ -188,6 +283,432 @@ TEST_F(FileBackedVectorTest, Get) {
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
}
+TEST_F(FileBackedVectorTest, SetWithoutGrowing) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
+
+ std::string original = "abcde";
+ Insert(vector.get(), /*idx=*/0, original);
+ ASSERT_THAT(vector->num_elements(), Eq(original.length()));
+ ASSERT_THAT(Get(vector.get(), /*idx=*/0, /*expected_len=*/5), Eq(original));
+
+ ICING_EXPECT_OK(vector->Set(/*idx=*/1, /*len=*/3, 'z'));
+ EXPECT_THAT(vector->num_elements(), Eq(5));
+ EXPECT_THAT(Get(vector.get(), /*idx=*/0, /*expected_len=*/5), Eq("azzze"));
+}
+
+TEST_F(FileBackedVectorTest, SetWithGrowing) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
+
+ std::string original = "abcde";
+ Insert(vector.get(), /*idx=*/0, original);
+ ASSERT_THAT(vector->num_elements(), Eq(original.length()));
+ ASSERT_THAT(Get(vector.get(), /*idx=*/0, /*expected_len=*/5), Eq(original));
+
+ ICING_EXPECT_OK(vector->Set(/*idx=*/3, /*len=*/4, 'z'));
+ EXPECT_THAT(vector->num_elements(), Eq(7));
+ EXPECT_THAT(Get(vector.get(), /*idx=*/0, /*expected_len=*/7), Eq("abczzzz"));
+}
+
+TEST_F(FileBackedVectorTest, SetInvalidArguments) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ EXPECT_THAT(vector->Set(/*idx=*/0, /*len=*/-1, 'z'),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Set(/*idx=*/0, /*len=*/0, 'z'),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Set(/*idx=*/-1, /*len=*/2, 'z'),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Set(/*idx=*/100,
+ /*len=*/std::numeric_limits<int32_t>::max(), 'z'),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(FileBackedVectorTest, MutableView) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::string(1000, 'a'));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2620640643U)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(FileBackedVector<char>::MutableView mutable_elt,
+ vector->GetMutable(3));
+
+ mutable_elt.Get() = 'b';
+ EXPECT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('b'))));
+
+ mutable_elt.Get() = 'c';
+ EXPECT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('c'))));
+}
+
+TEST_F(FileBackedVectorTest, MutableViewShouldSetDirty) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::string(1000, 'a'));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2620640643U)));
+
+ std::string_view reconstructed_view =
+ std::string_view(vector->array(), vector->num_elements());
+
+ ICING_ASSERT_OK_AND_ASSIGN(FileBackedVector<char>::MutableView mutable_elt,
+ vector->GetMutable(3));
+
+  // Mutate the element via MutableView.
+  // If non-const Get() is called, MutableView should mark the element index
+  // dirty so that ComputeChecksum() picks up the change and computes the
+  // checksum correctly. Validate against a string_view reconstructed over the
+  // same underlying array.
+ mutable_elt.Get() = 'b';
+ ASSERT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('b'))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc1, vector->ComputeChecksum());
+ Crc32 full_crc1;
+ full_crc1.Append(reconstructed_view);
+ EXPECT_THAT(crc1, Eq(full_crc1));
+
+ // Mutate and test again.
+ mutable_elt.Get() = 'c';
+ ASSERT_THAT(vector->Get(3), IsOkAndHolds(Pointee(Eq('c'))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc2, vector->ComputeChecksum());
+ Crc32 full_crc2;
+ full_crc2.Append(reconstructed_view);
+ EXPECT_THAT(crc2, Eq(full_crc2));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayView) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ constexpr int kArrayViewOffset = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, /*len=*/3));
+ EXPECT_THAT(mutable_arr, SizeIs(3));
+
+ mutable_arr[0] = 2;
+ mutable_arr[1] = 3;
+ mutable_arr[2] = 4;
+
+ EXPECT_THAT(vector->Get(kArrayViewOffset + 0), IsOkAndHolds(Pointee(Eq(2))));
+ EXPECT_THAT(mutable_arr.data()[0], Eq(2));
+
+ EXPECT_THAT(vector->Get(kArrayViewOffset + 1), IsOkAndHolds(Pointee(Eq(3))));
+ EXPECT_THAT(mutable_arr.data()[1], Eq(3));
+
+ EXPECT_THAT(vector->Get(kArrayViewOffset + 2), IsOkAndHolds(Pointee(Eq(4))));
+ EXPECT_THAT(mutable_arr.data()[2], Eq(4));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayViewSetArray) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ constexpr int kArrayViewOffset = 3;
+ constexpr int kArrayViewLen = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, kArrayViewLen));
+
+ std::vector<int> change1{2, 3, 4};
+ mutable_arr.SetArray(/*idx=*/0, change1.data(), change1.size());
+ EXPECT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen),
+ ElementsAre(2, 3, 4, 1, 1));
+
+ std::vector<int> change2{5, 6};
+ mutable_arr.SetArray(/*idx=*/2, change2.data(), change2.size());
+ EXPECT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen),
+ ElementsAre(2, 3, 5, 6, 1));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayViewSetArrayWithZeroLength) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ constexpr int kArrayViewOffset = 3;
+ constexpr int kArrayViewLen = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, kArrayViewLen));
+
+ // Zero arr_len should work and change nothing
+ std::vector<int> change{2, 3};
+ mutable_arr.SetArray(/*idx=*/0, change.data(), /*arr_len=*/0);
+ EXPECT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen),
+ ElementsAre(1, 1, 1, 1, 1));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayViewIndexOperatorShouldSetDirty) {
+ // Create an array with some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ std::string_view reconstructed_view(
+ reinterpret_cast<const char*>(vector->array()),
+ vector->num_elements() * sizeof(int));
+
+ constexpr int kArrayViewOffset = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, /*len=*/3));
+
+  // Use operator[] to mutate elements.
+  // If non-const operator[] is called, MutableArrayView should mark the
+  // element index dirty so that ComputeChecksum() picks up the change and
+  // computes the checksum correctly. Validate against a string_view
+  // reconstructed over the same underlying array.
+ mutable_arr[0] = 2;
+ ASSERT_THAT(vector->Get(kArrayViewOffset + 0), IsOkAndHolds(Pointee(Eq(2))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc1, vector->ComputeChecksum());
+ EXPECT_THAT(crc1, Eq(Crc32(reconstructed_view)));
+
+ mutable_arr[1] = 3;
+ ASSERT_THAT(vector->Get(kArrayViewOffset + 1), IsOkAndHolds(Pointee(Eq(3))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc2, vector->ComputeChecksum());
+ EXPECT_THAT(crc2, Eq(Crc32(reconstructed_view)));
+
+ mutable_arr[2] = 4;
+ ASSERT_THAT(vector->Get(kArrayViewOffset + 2), IsOkAndHolds(Pointee(Eq(4))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc3, vector->ComputeChecksum());
+ EXPECT_THAT(crc3, Eq(Crc32(reconstructed_view)));
+
+ // Change the same position. It should set dirty again.
+ mutable_arr[0] = 5;
+ ASSERT_THAT(vector->Get(kArrayViewOffset + 0), IsOkAndHolds(Pointee(Eq(5))));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc4, vector->ComputeChecksum());
+ EXPECT_THAT(crc4, Eq(Crc32(reconstructed_view)));
+}
+
+TEST_F(FileBackedVectorTest, MutableArrayViewSetArrayShouldSetDirty) {
+ // Create an array with some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::vector<int>(/*count=*/100, /*value=*/1));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(2494890115U)));
+
+ std::string_view reconstructed_view(
+ reinterpret_cast<const char*>(vector->array()),
+ vector->num_elements() * sizeof(int));
+
+ constexpr int kArrayViewOffset = 3;
+ constexpr int kArrayViewLen = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FileBackedVector<int>::MutableArrayView mutable_arr,
+ vector->GetMutable(kArrayViewOffset, kArrayViewLen));
+
+ std::vector<int> change{2, 3, 4};
+ mutable_arr.SetArray(/*idx=*/0, change.data(), change.size());
+ ASSERT_THAT(Get(vector.get(), kArrayViewOffset, kArrayViewLen),
+ ElementsAre(2, 3, 4, 1, 1));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc, vector->ComputeChecksum());
+ EXPECT_THAT(crc, Eq(Crc32(reconstructed_view)));
+}
+
+TEST_F(FileBackedVectorTest, Append) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ ICING_EXPECT_OK(vector->Append('a'));
+ EXPECT_THAT(vector->num_elements(), Eq(1));
+ EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(Eq('a'))));
+
+ ICING_EXPECT_OK(vector->Append('b'));
+ EXPECT_THAT(vector->num_elements(), Eq(2));
+ EXPECT_THAT(vector->Get(1), IsOkAndHolds(Pointee(Eq('b'))));
+}
+
+TEST_F(FileBackedVectorTest, AppendAfterSet) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ ICING_ASSERT_OK(vector->Set(9, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(10));
+ ICING_EXPECT_OK(vector->Append('a'));
+ EXPECT_THAT(vector->num_elements(), Eq(11));
+ EXPECT_THAT(vector->Get(10), IsOkAndHolds(Pointee(Eq('a'))));
+}
+
+TEST_F(FileBackedVectorTest, AppendAfterTruncate) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::string(1000, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(1000));
+
+ ICING_ASSERT_OK(vector->TruncateTo(5));
+ ICING_EXPECT_OK(vector->Append('a'));
+ EXPECT_THAT(vector->num_elements(), Eq(6));
+ EXPECT_THAT(vector->Get(5), IsOkAndHolds(Pointee(Eq('a'))));
+}
+
+TEST_F(FileBackedVectorTest, AppendShouldFailIfExceedingMaxFileSize) {
+ int32_t max_file_size = (1 << 10) - 1;
+ int32_t max_num_elements =
+ (max_file_size - FileBackedVector<char>::Header::kHeaderSize) /
+ sizeof(char);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));
+ ICING_ASSERT_OK(vector->Set(max_num_elements - 1, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(max_num_elements));
+
+ EXPECT_THAT(vector->Append('a'),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(FileBackedVectorTest, Allocate) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ typename FileBackedVector<char>::MutableArrayView mutable_arr,
+ vector->Allocate(3));
+ EXPECT_THAT(vector->num_elements(), Eq(3));
+ EXPECT_THAT(mutable_arr, SizeIs(3));
+ std::string change = "abc";
+ mutable_arr.SetArray(/*idx=*/0, /*arr=*/change.data(), /*arr_len=*/3);
+ EXPECT_THAT(Get(vector.get(), /*idx=*/0, /*expected_len=*/3), Eq(change));
+}
+
+TEST_F(FileBackedVectorTest, AllocateAfterSet) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ ICING_ASSERT_OK(vector->Set(9, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(10));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ typename FileBackedVector<char>::MutableArrayView mutable_arr,
+ vector->Allocate(3));
+ EXPECT_THAT(vector->num_elements(), Eq(13));
+ EXPECT_THAT(mutable_arr, SizeIs(3));
+ std::string change = "abc";
+ mutable_arr.SetArray(/*idx=*/0, /*arr=*/change.data(), /*arr_len=*/3);
+ EXPECT_THAT(Get(vector.get(), /*idx=*/10, /*expected_len=*/3), Eq(change));
+}
+
+TEST_F(FileBackedVectorTest, AllocateAfterTruncate) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), /*idx=*/0, std::string(1000, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(1000));
+
+ ICING_ASSERT_OK(vector->TruncateTo(5));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ typename FileBackedVector<char>::MutableArrayView mutable_arr,
+ vector->Allocate(3));
+ EXPECT_THAT(vector->num_elements(), Eq(8));
+ std::string change = "abc";
+ mutable_arr.SetArray(/*idx=*/0, /*arr=*/change.data(), /*arr_len=*/3);
+ EXPECT_THAT(Get(vector.get(), /*idx=*/5, /*expected_len=*/3), Eq(change));
+}
+
+TEST_F(FileBackedVectorTest, AllocateInvalidLengthShouldFail) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ASSERT_THAT(vector->num_elements(), Eq(0));
+
+ EXPECT_THAT(vector->Allocate(-1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->num_elements(), Eq(0));
+
+ EXPECT_THAT(vector->Allocate(0),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->num_elements(), Eq(0));
+}
+
+TEST_F(FileBackedVectorTest, AllocateShouldFailIfExceedingMaxFileSize) {
+ int32_t max_file_size = (1 << 10) - 1;
+ int32_t max_num_elements =
+ (max_file_size - FileBackedVector<char>::Header::kHeaderSize) /
+ sizeof(char);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));
+ ICING_ASSERT_OK(vector->Set(max_num_elements - 3, 'z'));
+ ASSERT_THAT(vector->num_elements(), Eq(max_num_elements - 2));
+
+ EXPECT_THAT(vector->Allocate(3),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Allocate(2), IsOk());
+}
+
TEST_F(FileBackedVectorTest, IncrementalCrc_NonOverlappingChanges) {
int num_elements = 1000;
int incremental_size = 3;
@@ -265,30 +786,58 @@ TEST_F(FileBackedVectorTest, IncrementalCrc_OverlappingChanges) {
}
}
+TEST_F(FileBackedVectorTest, SetIntMaxShouldReturnOutOfRangeError) {
+ // Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int32_t>> vector,
+ FileBackedVector<int32_t>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
+
+  // This is an edge case: since Set() calls GrowIfNecessary(idx + 1), we have
+  // to make sure Set() handles idx == INT32_MAX correctly instead of letting
+  // idx + 1 overflow.
+ EXPECT_THAT(vector->Set(std::numeric_limits<int32_t>::max(), 1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
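+
+// A minimal sketch (not FileBackedVector's real code) of an overflow-safe
+// bound check for the edge case above: compare idx against the limit before
+// ever computing idx + 1, so the addition can never wrap around int32_t.
+[[maybe_unused]] bool IndexInRangeSketch(int32_t idx,
+                                         int32_t max_num_elements) {
+  // Rejects idx == INT32_MAX without ever evaluating idx + 1.
+  return idx >= 0 && idx <= max_num_elements - 1;
+}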
+
TEST_F(FileBackedVectorTest, Grow) {
- // This is the same value as FileBackedVector::kMaxNumElts
- constexpr int32_t kMaxNumElts = 1U << 20;
+ int32_t max_file_size = (1 << 20) - 1;
+ int32_t header_size = FileBackedVector<int32_t>::Header::kHeaderSize;
+ int32_t element_type_size = static_cast<int32_t>(sizeof(int32_t));
+
+  // Max file size includes the size of the header and the elements, so the
+  // max # of elements will be (max_file_size - header_size) /
+  // element_type_size.
+  //
+  // Also ensure that (max_file_size - header_size) is not a multiple of
+  // element_type_size, in order to test that the desired # of elements is
+  // computed with floor division rather than ceiling.
+ ASSERT_THAT((max_file_size - header_size) % element_type_size, Not(Eq(0)));
+ int32_t max_num_elements = (max_file_size - header_size) / element_type_size;
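+  // Worked example, assuming header_size is a multiple of 4: max_file_size =
+  // 2^20 - 1 = 1048575 and element_type_size = 4, so (max_file_size -
+  // header_size) % 4 == 3, and the floor division above drops the 3 leftover
+  // bytes instead of rounding up to a partial element.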
ASSERT_TRUE(filesystem_.Truncate(fd_, 0));
- // Create an array and add some data.
+ // Create a vector and add some data.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<FileBackedVector<char>> vector,
- FileBackedVector<char>::Create(
+ std::unique_ptr<FileBackedVector<int32_t>> vector,
+ FileBackedVector<int32_t>::Create(
filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));
EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
-
- EXPECT_THAT(vector->Set(kMaxNumElts + 11, 'a'),
+  // max_num_elements is the allowed max # of elements, so the valid index
+  // range is [0, max_num_elements - 1].
+ EXPECT_THAT(vector->Set(max_num_elements, 1),
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
- EXPECT_THAT(vector->Set(-1, 'a'),
+ EXPECT_THAT(vector->Set(-1, 1),
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Set(max_num_elements - 1, 1), IsOk());
- uint32_t start = kMaxNumElts - 13;
- Insert(vector.get(), start, "abcde");
+ int32_t start = max_num_elements - 5;
+ std::vector<int32_t> data{1, 2, 3, 4, 5};
+ Insert(vector.get(), start, data);
// Crc works?
- const Crc32 good_crc(1134899064U);
+ const Crc32 good_crc(650981917U);
EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(good_crc));
// PersistToDisk does nothing bad, and ensures the content is still there
@@ -300,12 +849,12 @@ TEST_F(FileBackedVectorTest, Grow) {
vector.reset();
ICING_ASSERT_OK_AND_ASSIGN(
- vector, FileBackedVector<char>::Create(
- filesystem_, file_path_,
- MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ vector,
+ FileBackedVector<int32_t>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size));
- std::string expected = "abcde";
- EXPECT_EQ(expected, Get(vector.get(), start, expected.length()));
+ EXPECT_THAT(Get(vector.get(), start, data.size()), Eq(data));
}
TEST_F(FileBackedVectorTest, GrowsInChunks) {
@@ -318,25 +867,32 @@ TEST_F(FileBackedVectorTest, GrowsInChunks) {
filesystem_, file_path_,
MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
- // Our initial file size should just be the size of the header
- EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
- Eq(sizeof(FileBackedVector<char>::Header)));
-
- // Once we add something though, we'll grow to kGrowElements big
- Insert(vector.get(), 0, "a");
- EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
- Eq(kGrowElements * sizeof(int)));
+ // Our initial file size should just be the size of the header. Disk usage
+ // will indicate that one block has been allocated, which contains the header.
+ int header_size = sizeof(FileBackedVector<char>::Header);
+ int page_size = getpagesize();
+ EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(header_size));
+ EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(page_size));
+
+  // Once we add something though, we'll grow to be kGrowElements big. From
+  // this point on, file size and disk usage should be the same because
+  // growing will explicitly allocate the number of blocks needed to
+  // accommodate the file.
+ Insert(vector.get(), 0, {1});
+ int file_size = 1 * kGrowElements * sizeof(int);
+ EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size));
+ EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size));
// Should still be the same size, don't need to grow underlying file
- Insert(vector.get(), 1, "b");
- EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
- Eq(kGrowElements * sizeof(int)));
+ Insert(vector.get(), 1, {2});
+ EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size));
+ EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size));
// Now we grow by a kGrowElements chunk, so the underlying file is 2
// kGrowElements big
- Insert(vector.get(), 2, std::string(kGrowElements, 'c'));
- EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
- Eq(kGrowElements * 2 * sizeof(int)));
+ file_size = 2 * kGrowElements * sizeof(int);
+ Insert(vector.get(), 2, std::vector<int>(kGrowElements, 3));
+ EXPECT_THAT(filesystem_.GetFileSize(fd_), Eq(file_size));
+ EXPECT_THAT(filesystem_.GetDiskUsage(fd_), Eq(file_size));
// Destroy/persist the contents.
vector.reset();
@@ -409,10 +965,10 @@ TEST_F(FileBackedVectorTest, TruncateTo) {
EXPECT_EQ(1, vector->num_elements());
EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(31158534)));
- // Truncating doesn't cause the checksum to be updated.
+  // Truncating clears the checksum and resets it to 0.
ICING_EXPECT_OK(vector->TruncateTo(0));
EXPECT_EQ(0, vector->num_elements());
- EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(31158534)));
+ EXPECT_THAT(vector->ComputeChecksum(), IsOkAndHolds(Crc32(0)));
// Can't truncate past end.
EXPECT_THAT(vector->TruncateTo(100),
@@ -423,6 +979,386 @@ TEST_F(FileBackedVectorTest, TruncateTo) {
StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
}
+TEST_F(FileBackedVectorTest, TruncateAndReReadFile) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<float>> vector,
+ FileBackedVector<float>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ ICING_ASSERT_OK(vector->Set(0, 1.0));
+ ICING_ASSERT_OK(vector->Set(1, 2.0));
+ ICING_ASSERT_OK(vector->Set(2, 2.0));
+ ICING_ASSERT_OK(vector->Set(3, 2.0));
+ ICING_ASSERT_OK(vector->Set(4, 2.0));
+ } // Destroying the vector should trigger a checksum of the 5 elements
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<float>> vector,
+ FileBackedVector<float>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ EXPECT_EQ(5, vector->num_elements());
+ ICING_EXPECT_OK(vector->TruncateTo(4));
+ EXPECT_EQ(4, vector->num_elements());
+ } // Destroying the vector should update the checksum to 4 elements
+
+ // Creating again should double check that our checksum of 4 elements matches
+ // what was previously saved.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<float>> vector,
+ FileBackedVector<float>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ EXPECT_EQ(vector->num_elements(), 4);
+ }
+}
+
+TEST_F(FileBackedVectorTest, Sort) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSERT_OK(vector->Set(0, 5));
+ ICING_ASSERT_OK(vector->Set(1, 4));
+ ICING_ASSERT_OK(vector->Set(2, 2));
+ ICING_ASSERT_OK(vector->Set(3, 3));
+ ICING_ASSERT_OK(vector->Set(4, 1));
+
+ // Sort vector range [1, 4) (excluding 4).
+ EXPECT_THAT(vector->Sort(/*begin_idx=*/1, /*end_idx=*/4), IsOk());
+  // Verify that the sorted range is sorted and the other elements remain
+  // unchanged.
+ EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(5)));
+ EXPECT_THAT(vector->Get(1), IsOkAndHolds(Pointee(2)));
+ EXPECT_THAT(vector->Get(2), IsOkAndHolds(Pointee(3)));
+ EXPECT_THAT(vector->Get(3), IsOkAndHolds(Pointee(4)));
+ EXPECT_THAT(vector->Get(4), IsOkAndHolds(Pointee(1)));
+
+  // Sort again, this time with end_idx = num_elements().
+ EXPECT_THAT(vector->Sort(/*begin_idx=*/0, /*end_idx=*/vector->num_elements()),
+ IsOk());
+ EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(1)));
+ EXPECT_THAT(vector->Get(1), IsOkAndHolds(Pointee(2)));
+ EXPECT_THAT(vector->Get(2), IsOkAndHolds(Pointee(3)));
+ EXPECT_THAT(vector->Get(3), IsOkAndHolds(Pointee(4)));
+ EXPECT_THAT(vector->Get(4), IsOkAndHolds(Pointee(5)));
+}
+
+TEST_F(FileBackedVectorTest, SortByInvalidIndexShouldReturnOutOfRangeError) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSERT_OK(vector->Set(0, 5));
+ ICING_ASSERT_OK(vector->Set(1, 4));
+ ICING_ASSERT_OK(vector->Set(2, 2));
+ ICING_ASSERT_OK(vector->Set(3, 3));
+ ICING_ASSERT_OK(vector->Set(4, 1));
+
+ EXPECT_THAT(vector->Sort(/*begin_idx=*/-1, /*end_idx=*/4),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Sort(/*begin_idx=*/0, /*end_idx=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Sort(/*begin_idx=*/3, /*end_idx=*/3),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Sort(/*begin_idx=*/3, /*end_idx=*/1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Sort(/*begin_idx=*/5, /*end_idx=*/5),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(vector->Sort(/*begin_idx=*/3, /*end_idx=*/6),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(FileBackedVectorTest, SortShouldSetDirtyCorrectly) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSERT_OK(vector->Set(0, 5));
+ ICING_ASSERT_OK(vector->Set(1, 4));
+ ICING_ASSERT_OK(vector->Set(2, 2));
+ ICING_ASSERT_OK(vector->Set(3, 3));
+ ICING_ASSERT_OK(vector->Set(4, 1));
+ } // Destroying the vector should trigger a checksum of the 5 elements
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ // Sort vector range [1, 4) (excluding 4).
+ EXPECT_THAT(vector->Sort(/*begin_idx=*/1, /*end_idx=*/4), IsOk());
+ } // Destroying the vector should update the checksum
+
+ // Creating again should check that the checksum after sorting matches what
+ // was previously saved. This tests the correctness of SetDirty() for sorted
+ // elements.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<int>> vector,
+ FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+  // Verify that the sorted range is sorted and the other elements remain
+  // unchanged.
+ EXPECT_THAT(vector->Get(0), IsOkAndHolds(Pointee(5)));
+ EXPECT_THAT(vector->Get(1), IsOkAndHolds(Pointee(2)));
+ EXPECT_THAT(vector->Get(2), IsOkAndHolds(Pointee(3)));
+ EXPECT_THAT(vector->Get(3), IsOkAndHolds(Pointee(4)));
+ EXPECT_THAT(vector->Get(4), IsOkAndHolds(Pointee(1)));
+}
+
+TEST_F(FileBackedVectorTest, SetDirty) {
+ // 1. Create a vector and add some data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), 0, "abcd");
+
+ std::string_view reconstructed_view =
+ std::string_view(vector->array(), vector->num_elements());
+
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc1, vector->ComputeChecksum());
+ Crc32 full_crc_before_overwrite;
+ full_crc_before_overwrite.Append(reconstructed_view);
+ EXPECT_THAT(crc1, Eq(full_crc_before_overwrite));
+
+ // 2. Manually overwrite the values of the first two elements.
+ std::string corrupted_content = "ef";
+ ASSERT_THAT(
+ filesystem_.PWrite(fd_, /*offset=*/sizeof(FileBackedVector<char>::Header),
+ corrupted_content.c_str(), corrupted_content.length()),
+ IsTrue());
+ ASSERT_THAT(Get(vector.get(), 0, 4), Eq("efcd"));
+ Crc32 full_crc_after_overwrite;
+ full_crc_after_overwrite.Append(reconstructed_view);
+ ASSERT_THAT(full_crc_before_overwrite, Not(Eq(full_crc_after_overwrite)));
+
+ // 3. Without calling SetDirty(), the checksum will be recomputed incorrectly.
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc2, vector->ComputeChecksum());
+ EXPECT_THAT(crc2, Not(Eq(full_crc_after_overwrite)));
+
+ // 4. Call SetDirty()
+ vector->SetDirty(0);
+ vector->SetDirty(1);
+
+  // 5. The checksum should be computed correctly after calling SetDirty()
+  //    with the correct indices.
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 crc3, vector->ComputeChecksum());
+ EXPECT_THAT(crc3, Eq(full_crc_after_overwrite));
+}
+
+TEST_F(FileBackedVectorTest, InitFileTooSmallForHeaderFails) {
+ {
+ // 1. Create a vector with a few elements.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), 0, "A");
+ Insert(vector.get(), 1, "Z");
+ ASSERT_THAT(vector->PersistToDisk(), IsOk());
+ }
+
+ // 2. Shrink the file to be smaller than the header.
+ filesystem_.Truncate(fd_, sizeof(FileBackedVector<char>::Header) - 1);
+
+ {
+ // 3. Attempt to create the file and confirm that it fails.
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ }
+}
+
+TEST_F(FileBackedVectorTest, InitWrongDataSizeFails) {
+ {
+ // 1. Create a vector with a few elements.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), 0, "A");
+ Insert(vector.get(), 1, "Z");
+ ASSERT_THAT(vector->PersistToDisk(), IsOk());
+ }
+
+ {
+ // 2. Attempt to create the file with a different element size and confirm
+ // that it fails.
+ EXPECT_THAT(FileBackedVector<int>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ }
+}
+
+TEST_F(FileBackedVectorTest, InitCorruptHeaderFails) {
+ {
+ // 1. Create a vector with a few elements.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), 0, "A");
+ Insert(vector.get(), 1, "Z");
+ ASSERT_THAT(vector->PersistToDisk(), IsOk());
+ }
+
+ // 2. Modify the header, but don't update the checksum. This would be similar
+ // to corruption of the header.
+ FileBackedVector<char>::Header header;
+ ASSERT_THAT(filesystem_.PRead(fd_, &header, sizeof(header), /*offset=*/0),
+ IsTrue());
+ header.num_elements = 1;
+ ASSERT_THAT(filesystem_.PWrite(fd_, /*offset=*/0, &header, sizeof(header)),
+ IsTrue());
+
+ {
+ // 3. Attempt to create the file with a header that doesn't match its
+ // checksum and confirm that it fails.
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ }
+}
+
+TEST_F(FileBackedVectorTest, InitHeaderElementSizeTooBigFails) {
+ {
+ // 1. Create a vector with a few elements.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), 0, "A");
+ Insert(vector.get(), 1, "Z");
+ ASSERT_THAT(vector->PersistToDisk(), IsOk());
+ }
+
+ // 2. Modify the header so that the number of elements exceeds the actual size
+ // of the underlying file.
+ FileBackedVector<char>::Header header;
+ ASSERT_THAT(filesystem_.PRead(fd_, &header, sizeof(header), /*offset=*/0),
+ IsTrue());
+ int64_t file_size = filesystem_.GetFileSize(fd_);
+ int64_t allocated_elements_size = file_size - sizeof(header);
+ header.num_elements = (allocated_elements_size / sizeof(char)) + 1;
+ header.header_checksum = header.CalculateHeaderChecksum();
+ ASSERT_THAT(filesystem_.PWrite(fd_, /*offset=*/0, &header, sizeof(header)),
+ IsTrue());
+
+ {
+ // 3. Attempt to create the file with num_elements that is larger than the
+ // underlying file and confirm that it fails.
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ }
+}
+
+TEST_F(FileBackedVectorTest, InitCorruptElementsFails) {
+ {
+ // 1. Create a vector with a few elements.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), 0, "A");
+ Insert(vector.get(), 1, "Z");
+ ASSERT_THAT(vector->PersistToDisk(), IsOk());
+ }
+
+ // 2. Overwrite the values of the first two elements.
+ std::string corrupted_content = "BY";
+ ASSERT_THAT(
+ filesystem_.PWrite(fd_, /*offset=*/sizeof(FileBackedVector<char>::Header),
+ corrupted_content.c_str(), corrupted_content.length()),
+ IsTrue());
+
+ {
+ // 3. Attempt to create the file with elements that don't match their
+ // checksum and confirm that it fails.
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ }
+}
+
+TEST_F(FileBackedVectorTest, InitNormalSucceeds) {
+ {
+ // 1. Create a vector with a few elements.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ Insert(vector.get(), 0, "A");
+ Insert(vector.get(), 1, "Z");
+ ASSERT_THAT(vector->PersistToDisk(), IsOk());
+ }
+
+ {
+ // 2. Attempt to create the file with a completely valid header and elements
+ // region. This should succeed.
+ EXPECT_THAT(FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC),
+ IsOk());
+ }
+}
+
+TEST_F(FileBackedVectorTest, InitFromExistingFileShouldPreMapAtLeastFileSize) {
+ {
+ // 1. Create a vector with a few elements.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<char>::kMaxFileSize));
+ Insert(vector.get(), 10000, "A");
+ Insert(vector.get(), 10001, "Z");
+ ASSERT_THAT(vector->PersistToDisk(), IsOk());
+ }
+
+ {
+ // 2. Attempt to create the file with pre_mapping_mmap_size < file_size. It
+ // should still pre-map file_size, so we can pass the checksum
+ // verification when initializing and get the correct contents.
+ int64_t file_size = filesystem_.GetFileSize(file_path_.c_str());
+ int pre_mapping_mmap_size = 10;
+ ASSERT_THAT(pre_mapping_mmap_size, Lt(file_size));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> vector,
+ FileBackedVector<char>::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<char>::kMaxFileSize, pre_mapping_mmap_size));
+ EXPECT_THAT(Get(vector.get(), /*idx=*/10000, /*expected_len=*/2), Eq("AZ"));
+ }
+}
+
} // namespace
} // namespace lib
diff --git a/icing/file/filesystem.cc b/icing/file/filesystem.cc
index 4a76c01..cd905e7 100644
--- a/icing/file/filesystem.cc
+++ b/icing/file/filesystem.cc
@@ -16,7 +16,6 @@
#include <dirent.h>
#include <dlfcn.h>
-#include <errno.h>
#include <fcntl.h>
#include <fnmatch.h>
#include <pthread.h>
@@ -26,6 +25,7 @@
#include <unistd.h>
#include <algorithm>
+#include <cerrno>
#include <cstdint>
#include <unordered_set>
@@ -63,18 +63,16 @@ void LogOpenFileDescriptors() {
constexpr int kMaxFileDescriptorsToStat = 4096;
struct rlimit rlim = {0, 0};
if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "getrlimit() failed (errno=%d)", errno);
+ ICING_LOG(ERROR) << "getrlimit() failed (errno=" << errno << ")";
return;
}
int fd_lim = rlim.rlim_cur;
if (fd_lim > kMaxFileDescriptorsToStat) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Maximum number of file descriptors (%d) too large.", fd_lim);
+ ICING_LOG(ERROR) << "Maximum number of file descriptors (" << fd_lim
+ << ") too large.";
fd_lim = kMaxFileDescriptorsToStat;
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Listing up to %d file descriptors.", fd_lim);
+ ICING_LOG(INFO) << "Listing up to " << fd_lim << " file descriptors.";
// Verify that /proc/self/fd is a directory. If not, procfs is not mounted or
// inaccessible for some other reason. In that case, there's no point trying
@@ -96,15 +94,12 @@ void LogOpenFileDescriptors() {
if (len >= 0) {
// Zero-terminate the buffer, because readlink() won't.
target[len < target_size ? len : target_size - 1] = '\0';
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> \"%s\"", fd,
- target);
+ ICING_LOG(INFO) << "fd " << fd << " -> \"" << target << "\"";
} else if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> ? (errno=%d)",
- fd, errno);
+ ICING_LOG(ERROR) << "fd " << fd << " -> ? (errno=" << errno << ")";
}
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "File descriptor list complete.");
+ ICING_LOG(INFO) << "File descriptor list complete.";
}
// Logs an error formatted as: desc1 + file_name + desc2 + strerror(errnum).
@@ -113,8 +108,11 @@ void LogOpenFileDescriptors() {
// file descriptors (see LogOpenFileDescriptors() above).
void LogOpenError(const char* desc1, const char* file_name, const char* desc2,
int errnum) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "%s%s%s%s", desc1, file_name, desc2, strerror(errnum));
+ if (errnum == ENOENT) {
+ ICING_VLOG(1) << desc1 << file_name << desc2 << strerror(errnum);
+ } else {
+ ICING_LOG(ERROR) << desc1 << file_name << desc2 << strerror(errnum);
+ }
if (errnum == EMFILE) {
LogOpenFileDescriptors();
}
@@ -133,6 +131,9 @@ bool ListDirectoryInternal(const char* dir_name,
return false;
}
+  // According to the Linux man page
+  // (https://man7.org/linux/man-pages/man3/readdir.3.html#RETURN_VALUE), the
+  // returned dirent may be statically allocated, so don't free it.
dirent* p;
// readdir's implementation seems to be thread safe.
while ((p = readdir(dir)) != nullptr) {
@@ -155,8 +156,7 @@ bool ListDirectoryInternal(const char* dir_name,
}
}
if (closedir(dir) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Error closing %s: %s", dir_name, strerror(errno));
+    ICING_LOG(ERROR) << "Error closing " << dir_name << ": " << strerror(errno);
}
return true;
}
@@ -179,11 +179,10 @@ void ScopedFd::reset(int fd) {
const int64_t Filesystem::kBadFileSize;
bool Filesystem::DeleteFile(const char* file_name) const {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf("Deleting file %s", file_name);
+ ICING_VLOG(1) << "Deleting file " << file_name;
int ret = unlink(file_name);
if (ret != 0 && errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Deleting file %s failed: %s", file_name, strerror(errno));
+    ICING_LOG(ERROR) << "Deleting file " << file_name
+                     << " failed: " << strerror(errno);
return false;
}
return true;
@@ -192,8 +191,7 @@ bool Filesystem::DeleteFile(const char* file_name) const {
bool Filesystem::DeleteDirectory(const char* dir_name) const {
int ret = rmdir(dir_name);
if (ret != 0 && errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Deleting directory %s failed: %s", dir_name, strerror(errno));
+    ICING_LOG(ERROR) << "Deleting directory " << dir_name
+                     << " failed: " << strerror(errno);
return false;
}
return true;
@@ -206,8 +204,7 @@ bool Filesystem::DeleteDirectoryRecursively(const char* dir_name) const {
if (errno == ENOENT) {
return true; // If directory didn't exist, this was successful.
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Stat %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Stat " << dir_name << " failed: " << strerror(errno);
return false;
}
vector<std::string> entries;
@@ -220,8 +217,7 @@ bool Filesystem::DeleteDirectoryRecursively(const char* dir_name) const {
++i) {
std::string filename = std::string(dir_name) + '/' + *i;
if (stat(filename.c_str(), &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Stat %s failed: %s", filename.c_str(), strerror(errno));
+ ICING_LOG(ERROR) << "Stat " << filename << " failed: " << strerror(errno);
success = false;
} else if (S_ISDIR(st.st_mode)) {
success = DeleteDirectoryRecursively(filename.c_str()) && success;
@@ -244,8 +240,7 @@ bool Filesystem::FileExists(const char* file_name) const {
exists = S_ISREG(st.st_mode) != 0;
} else {
if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", file_name, strerror(errno));
+      ICING_LOG(ERROR) << "Unable to stat file " << file_name << ": "
+                       << strerror(errno);
}
exists = false;
}
@@ -259,8 +254,7 @@ bool Filesystem::DirectoryExists(const char* dir_name) const {
exists = S_ISDIR(st.st_mode) != 0;
} else {
if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to stat directory %s: %s", dir_name, strerror(errno));
+      ICING_LOG(ERROR) << "Unable to stat directory " << dir_name << ": "
+                       << strerror(errno);
}
exists = false;
}
@@ -316,8 +310,7 @@ bool Filesystem::GetMatchingFiles(const char* glob,
int basename_idx = GetBasenameIndex(glob);
if (basename_idx == 0) {
// We need a directory.
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Expected directory, no matching files for: %s", glob);
+ ICING_VLOG(1) << "Expected directory, no matching files for: " << glob;
return true;
}
const char* basename_glob = glob + basename_idx;
@@ -372,8 +365,11 @@ int Filesystem::OpenForRead(const char* file_name) const {
int64_t Filesystem::GetFileSize(int fd) const {
struct stat st;
if (fstat(fd, &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
- strerror(errno));
+ if (errno == ENOENT) {
+ ICING_VLOG(1) << "Unable to stat file: " << strerror(errno);
+ } else {
+ ICING_LOG(WARNING) << "Unable to stat file: " << strerror(errno);
+ }
return kBadFileSize;
}
return st.st_size;
@@ -383,11 +379,9 @@ int64_t Filesystem::GetFileSize(const char* filename) const {
struct stat st;
if (stat(filename, &st) < 0) {
if (errno == ENOENT) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", filename, strerror(errno));
+      ICING_VLOG(1) << "Unable to stat file " << filename << ": "
+                    << strerror(errno);
} else {
- ICING_LOG(WARNING) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", filename, strerror(errno));
+      ICING_LOG(WARNING) << "Unable to stat file " << filename << ": "
+                         << strerror(errno);
}
return kBadFileSize;
}
@@ -396,8 +390,7 @@ int64_t Filesystem::GetFileSize(const char* filename) const {
bool Filesystem::Truncate(int fd, int64_t new_size) const {
if (ftruncate(fd, new_size) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to truncate file: %s", strerror(errno));
+ ICING_LOG(ERROR) << "Unable to truncate file: " << strerror(errno);
return false;
}
lseek(fd, new_size, SEEK_SET);
@@ -416,8 +409,7 @@ bool Filesystem::Truncate(const char* filename, int64_t new_size) const {
bool Filesystem::Grow(int fd, int64_t new_size) const {
if (ftruncate(fd, new_size) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to grow file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to grow file: " << strerror(errno);
return false;
}
@@ -442,8 +434,7 @@ bool Filesystem::Write(int fd, const void* data, size_t data_size) const {
size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
ssize_t wrote = write(fd, data, chunk_size);
if (wrote < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad write: " << strerror(errno);
return false;
}
data = static_cast<const uint8_t*>(data) + wrote;
@@ -464,6 +455,68 @@ bool Filesystem::Write(const char* filename, const void* data,
return success;
}
+bool Filesystem::CopyFile(const char* src, const char* dst) const {
+ ScopedFd src_fd(OpenForRead(src));
+
+ std::string dir = GetDirname(dst);
+ if (!CreateDirectoryRecursively(dir.c_str())) {
+ return false;
+ }
+ ScopedFd dst_fd(OpenForWrite(dst));
+
+ if (!src_fd.is_valid() || !dst_fd.is_valid()) {
+ return false;
+ }
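+  // Copy by reading the entire source file into one in-memory buffer and
+  // writing it back out in a single pass.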
+ uint64_t size = GetFileSize(*src_fd);
+ std::unique_ptr<uint8_t[]> buf = std::make_unique<uint8_t[]>(size);
+ if (!Read(*src_fd, buf.get(), size)) {
+ return false;
+ }
+ return Write(*dst_fd, buf.get(), size);
+}
+
+bool Filesystem::CopyDirectory(const char* src_dir, const char* dst_dir,
+ bool recursive) const {
+ DIR* dir = opendir(src_dir);
+ if (!dir) {
+ LogOpenError("Unable to open directory ", src_dir, ": ", errno);
+ return false;
+ }
+
+ dirent* p;
+ // readdir's implementation seems to be thread safe.
+ while ((p = readdir(dir)) != nullptr) {
+ std::string file_name(p->d_name);
+ if (file_name == "." || file_name == "..") {
+ continue;
+ }
+
+ std::string full_src_path = absl_ports::StrCat(src_dir, "/", p->d_name);
+ std::string full_dst_path = absl_ports::StrCat(dst_dir, "/", p->d_name);
+
+    // CopyFile() creates any missing parent directories on the destination
+    // path, so copying a regular file implicitly creates its directory; no
+    // explicit directory copy is needed here.
+ if (p->d_type != DT_DIR) {
+ if (!CopyFile(full_src_path.c_str(), full_dst_path.c_str())) {
+ return false;
+ }
+ }
+
+ // Recurse down directories, if requested.
+ if (recursive && (p->d_type == DT_DIR)) {
+ std::string src_sub_dir = absl_ports::StrCat(src_dir, "/", p->d_name);
+ std::string dst_sub_dir = absl_ports::StrCat(dst_dir, "/", p->d_name);
+ if (!CopyDirectory(src_sub_dir.c_str(), dst_sub_dir.c_str(), recursive)) {
+ return false;
+ }
+ }
+ }
+ if (closedir(dir) != 0) {
+ ICING_LOG(ERROR) << "Error closing " << src_dir << ": " << strerror(errno);
+ }
+ return true;
+}
+
bool Filesystem::PWrite(int fd, off_t offset, const void* data,
size_t data_size) const {
size_t write_len = data_size;
@@ -472,8 +525,7 @@ bool Filesystem::PWrite(int fd, off_t offset, const void* data,
size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
ssize_t wrote = pwrite(fd, data, chunk_size, offset);
if (wrote < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad write: " << strerror(errno);
return false;
}
data = static_cast<const uint8_t*>(data) + wrote;
@@ -498,8 +550,7 @@ bool Filesystem::PWrite(const char* filename, off_t offset, const void* data,
bool Filesystem::Read(int fd, void* buf, size_t buf_size) const {
ssize_t read_status = read(fd, buf, buf_size);
if (read_status < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad read: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad read: " << strerror(errno);
return false;
}
return true;
@@ -519,8 +570,7 @@ bool Filesystem::Read(const char* filename, void* buf, size_t buf_size) const {
bool Filesystem::PRead(int fd, void* buf, size_t buf_size, off_t offset) const {
ssize_t read_status = pread(fd, buf, buf_size, offset);
if (read_status < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad read: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad read: " << strerror(errno);
return false;
}
return true;
@@ -546,8 +596,7 @@ bool Filesystem::DataSync(int fd) const {
#endif
if (result < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to sync data: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to sync data: " << strerror(errno);
return false;
}
return true;
@@ -555,9 +604,7 @@ bool Filesystem::DataSync(int fd) const {
bool Filesystem::RenameFile(const char* old_name, const char* new_name) const {
if (rename(old_name, new_name) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to rename file %s to %s: %s", old_name, new_name,
- strerror(errno));
+    ICING_LOG(ERROR) << "Unable to rename file " << old_name << " to "
+                     << new_name << ": " << strerror(errno);
return false;
}
return true;
@@ -595,8 +642,7 @@ bool Filesystem::CreateDirectory(const char* dir_name) const {
if (mkdir(dir_name, S_IRUSR | S_IWUSR | S_IXUSR) == 0) {
success = true;
} else {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Creating directory %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Creating directory " << dir_name << " failed: " << strerror(errno);
}
}
return success;
@@ -616,8 +662,7 @@ bool Filesystem::CreateDirectoryRecursively(const char* dir_name) const {
int64_t Filesystem::GetDiskUsage(int fd) const {
struct stat st;
if (fstat(fd, &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file: " << strerror(errno);
return kBadFileSize;
}
return st.st_blocks * kStatBlockSize;
@@ -626,8 +671,7 @@ int64_t Filesystem::GetDiskUsage(int fd) const {
int64_t Filesystem::GetFileDiskUsage(const char* path) const {
struct stat st;
if (stat(path, &st) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
- path, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat " << path << ": " << strerror(errno);
return kBadFileSize;
}
return st.st_blocks * kStatBlockSize;
@@ -636,8 +680,7 @@ int64_t Filesystem::GetFileDiskUsage(const char* path) const {
int64_t Filesystem::GetDiskUsage(const char* path) const {
struct stat st;
if (stat(path, &st) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
- path, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat " << path << ": " << strerror(errno);
return kBadFileSize;
}
int64_t result = st.st_blocks * kStatBlockSize;
diff --git a/icing/file/filesystem.h b/icing/file/filesystem.h
index b85f3a0..dd2c5d1 100644
--- a/icing/file/filesystem.h
+++ b/icing/file/filesystem.h
@@ -17,11 +17,9 @@
#ifndef ICING_FILE_FILESYSTEM_H_
#define ICING_FILE_FILESYSTEM_H_
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-
#include <cstdint>
+#include <cstdio>
+#include <cstring>
#include <memory>
#include <string>
#include <unordered_set>
@@ -83,8 +81,15 @@ class Filesystem {
// success or if the directory did not yet exist.
virtual bool DeleteDirectoryRecursively(const char* dir_name) const;
+ // Copies the src file to the dst file.
+ virtual bool CopyFile(const char* src, const char* dst) const;
+
+ // Copies the src directory and its contents to the dst dir.
+ virtual bool CopyDirectory(const char* src_dir, const char* dst_dir,
+ bool recursive) const;
+
// Returns true if a file exists. False if the file doesn't exist.
- // If there is an error getting stat on the file, it logs the error and //
+ // If there is an error getting stat on the file, it logs the error and
// asserts.
virtual bool FileExists(const char* file_name) const;
@@ -228,6 +233,11 @@ class Filesystem {
// Increments to_increment by size if size is valid, or sets to_increment
// to kBadFileSize if either size or to_increment is kBadFileSize.
static void IncrementByOrSetInvalid(int64_t size, int64_t* to_increment);
+
+  // Returns -1 if file_size is invalid (i.e. kBadFileSize); otherwise
+  // returns file_size unchanged.
+ static int64_t SanitizeFileSize(int64_t file_size) {
+ return (file_size != kBadFileSize) ? file_size : -1;
+ }
};
// LINT.ThenChange(//depot/google3/icing/file/mock-filesystem.h)
diff --git a/icing/file/filesystem_test.cc b/icing/file/filesystem_test.cc
index 492a50d..214180e 100644
--- a/icing/file/filesystem_test.cc
+++ b/icing/file/filesystem_test.cc
@@ -38,6 +38,7 @@ using ::testing::Gt;
using ::testing::Le;
using ::testing::Ne;
using ::testing::UnorderedElementsAre;
+using ::testing::UnorderedElementsAreArray;
namespace icing {
namespace lib {
@@ -450,5 +451,47 @@ TEST_F(FilesystemTest, ReadWrite) {
EXPECT_THAT(hello, Eq("hello"));
}
+TEST_F(FilesystemTest, CopyDirectory) {
+ Filesystem filesystem;
+
+ // File structure:
+ // <temp_dir>/
+ // src_dir/
+ // file1
+ // file2
+ // sub_dir/
+ // file3
+ const std::string src_dir = temp_dir_ + "/src_dir";
+ const std::string sub_dir = "sub_dir";
+ const std::string sub_dir_path = src_dir + "/" + sub_dir;
+ vector<std::string> some_files = {"file1", "file2", sub_dir + "/file3"};
+
+ // Make sure there is no pre-existing test-dir structure
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(src_dir.c_str()));
+
+ // Setup a test-dir structure
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(
+ sub_dir_path.c_str())); // deepest path for test
+ CreateTestFiles(some_files, src_dir);
+
+ const std::string dst_dir = temp_dir_ + "/dst_dir";
+ EXPECT_TRUE(filesystem.CopyDirectory(src_dir.c_str(), dst_dir.c_str(),
+ /*recursive=*/true));
+
+ vector<std::string> src_dir_files;
+ EXPECT_TRUE(filesystem.ListDirectory(src_dir.c_str(), /*exclude=*/{},
+ /*recursive=*/true, &src_dir_files));
+
+ vector<std::string> dst_dir_files;
+ EXPECT_TRUE(filesystem.ListDirectory(dst_dir.c_str(), /*exclude=*/{},
+ /*recursive=*/true, &dst_dir_files));
+
+ EXPECT_THAT(dst_dir_files, UnorderedElementsAreArray(src_dir_files));
+
+ // Clean up
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(src_dir.c_str()));
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(dst_dir.c_str()));
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/file/memory-mapped-file-leak_test.cc b/icing/file/memory-mapped-file-leak_test.cc
deleted file mode 100644
index 598fb61..0000000
--- a/icing/file/memory-mapped-file-leak_test.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/file/memory-mapped-file.h"
-
-#include "perftools/profiles/collector/heap/alloc_recorder.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/file/filesystem.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/recorder-test-utils.h"
-#include "icing/testing/tmp-directory.h"
-
-namespace icing {
-namespace lib {
-namespace {
-
-namespace heap_profile = ::perftools::profiles::collector::heap;
-
-using testing::Le;
-
-TEST(MemoryMappedFileTest, MMapMemoryLeak) {
- std::string test_dir = GetTestTempDir();
- std::string recorder_dir = test_dir + "/recorder";
- Filesystem filesystem;
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(recorder_dir.c_str()));
-
- ASSERT_TRUE(heap_profile::AllocRecorderStartWithMmapTracking(recorder_dir));
- {
- std::string mmfile_dir = test_dir + "/file";
- ASSERT_TRUE(filesystem.CreateDirectoryRecursively(mmfile_dir.c_str()));
- MemoryMappedFile mmfile(filesystem, mmfile_dir + "/mmfile",
- MemoryMappedFile::READ_WRITE_AUTO_SYNC);
- // How this works:
- // We request a 500-byte mapping starting at the 101st byte of the file.
- // But(!), mmap only accepts offsets that are multiples of page size. So
- // instead mmfile will create a 600-byte mapping starting at the 1st byte of
- // file and then return the address of the 101st byte within that mapping.
- // For this reason, total bytes and peak bytes will be 600 bytes.
- //
- // When mmfile goes out of scope it needs to munmap the mapping that it
- // created. But, remember that the mapping is larger (600 bytes) than what
- // we requested (500 bytes)! So mmfile needs to remember the actual size of
- // the mapping, NOT the requested size. Calling munmap with the correct size
- // will ensure that total_inuse_bytes is 0 after mmfile goes out of scope.
- // Calling munmap with the requested size would still keep 100 bytes of the
- // mapping around!
- mmfile.Remap(100, 500);
- }
- heap_profile::AllocRecorderStop();
-
- // Mmap only affects bytes measurements.
- ProfileInfo profile_info = SummarizeProfileProto(recorder_dir + ".0.pb.gz");
- EXPECT_THAT(profile_info.total_alloc_bytes, Le(600));
- EXPECT_THAT(profile_info.peak_bytes, Le(600));
- EXPECT_THAT(profile_info.inuse_bytes, Le(0));
-}
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/file/memory-mapped-file.cc b/icing/file/memory-mapped-file.cc
index 34365a9..43ed030 100644
--- a/icing/file/memory-mapped-file.cc
+++ b/icing/file/memory-mapped-file.cc
@@ -12,108 +12,156 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// TODO(cassiewang) Add unit-tests to this class.
-
#include "icing/file/memory-mapped-file.h"
#include <sys/mman.h>
#include <cerrno>
+#include <cinttypes>
+#include <memory>
#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/file/filesystem.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/util/math-util.h"
+#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
+/* static */ libtextclassifier3::StatusOr<MemoryMappedFile>
+MemoryMappedFile::Create(const Filesystem& filesystem,
+ std::string_view file_path, Strategy mmap_strategy,
+ int64_t max_file_size) {
+ if (max_file_size <= 0 || max_file_size > kMaxFileSize) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Invalid max file size %" PRId64 " for MemoryMappedFile",
+ max_file_size));
+ }
+
+ const std::string file_path_str(file_path);
+ int64_t file_size = filesystem.FileExists(file_path_str.c_str())
+ ? filesystem.GetFileSize(file_path_str.c_str())
+ : 0;
+ if (file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Bad file size for file ", file_path));
+ }
+
+ return MemoryMappedFile(filesystem, file_path, mmap_strategy, max_file_size,
+ file_size);
+}
+
+/* static */ libtextclassifier3::StatusOr<MemoryMappedFile>
+MemoryMappedFile::Create(const Filesystem& filesystem,
+ std::string_view file_path, Strategy mmap_strategy,
+ int64_t max_file_size, int64_t pre_mapping_file_offset,
+ int64_t pre_mapping_mmap_size) {
+ if (max_file_size <= 0 || max_file_size > kMaxFileSize) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Invalid max file size %" PRId64 " for MemoryMappedFile",
+ max_file_size));
+ }
+
+  // We need at least pre_mapping_file_offset + pre_mapping_mmap_size bytes for
+  // the underlying file size, so max_file_size should be at least
+  // pre_mapping_file_offset + pre_mapping_mmap_size. The comparison is
+  // rearranged as offset > max - size to avoid signed integer overflow.
+ if (pre_mapping_file_offset < 0 || pre_mapping_mmap_size < 0 ||
+ pre_mapping_file_offset > max_file_size - pre_mapping_mmap_size) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Invalid pre-mapping file offset %" PRId64 " and mmap size %" PRId64
+ " with max file size %" PRId64 "for MemoryMappedFile",
+ pre_mapping_file_offset, pre_mapping_mmap_size, max_file_size));
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile mmapped_file,
+ Create(filesystem, file_path, mmap_strategy, max_file_size));
+
+ if (pre_mapping_mmap_size > 0) {
+ ICING_RETURN_IF_ERROR(
+ mmapped_file.RemapImpl(pre_mapping_file_offset, pre_mapping_mmap_size));
+ }
+
+ return std::move(mmapped_file);
+}
+
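The rearranged comparison above (offset > max - size instead of offset + size > max) is the standard way to bounds-check without risking signed overflow, which is undefined behavior in C++. A minimal standalone sketch of the same check, with hypothetical names:

#include <cassert>
#include <cstdint>
#include <limits>

// Returns true iff offset and size are non-negative and offset + size <= max.
// Assuming max >= 0, computing max - size cannot overflow once size >= 0,
// whereas offset + size could wrap around for large inputs.
bool FitsInRange(int64_t offset, int64_t size, int64_t max) {
  return offset >= 0 && size >= 0 && offset <= max - size;
}

int main() {
  assert(FitsInRange(99, 1, 100));
  assert(!FitsInRange(99, 2, 100));
  // A naive offset + size > max check would overflow here.
  assert(!FitsInRange(99, std::numeric_limits<int64_t>::max(), 100));
  return 0;
}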
MemoryMappedFile::MemoryMappedFile(const Filesystem& filesystem,
- const std::string_view file_path,
- Strategy mmap_strategy)
+ std::string_view file_path,
+ Strategy mmap_strategy,
+ int64_t max_file_size, int64_t file_size)
: filesystem_(&filesystem),
file_path_(file_path),
- strategy_(mmap_strategy) {}
+ strategy_(mmap_strategy),
+ max_file_size_(max_file_size),
+ file_size_(file_size),
+ mmap_result_(nullptr),
+ file_offset_(0),
+ mmap_size_(0),
+ alignment_adjustment_(0) {}
+
+MemoryMappedFile::MemoryMappedFile(MemoryMappedFile&& other)
+  // Make sure that mmap_result_ is a nullptr before we call Swap. We don't
+  // care what values the remaining members hold before we swap them into
+  // other, but if mmap_result_ held a garbage non-null value, then other
+  // would try to unmap memory at that address when it is destroyed!
+ : mmap_result_(nullptr) {
+ Swap(&other);
+}
+
+MemoryMappedFile& MemoryMappedFile::operator=(MemoryMappedFile&& other) {
+  // Swap all of our members with other's. This ensures both that this now
+  // holds other's previous resources and that our previous resources are
+  // properly freed when other is destroyed at the end of this function.
+ Swap(&other);
+ return *this;
+}
MemoryMappedFile::~MemoryMappedFile() { Unmap(); }
void MemoryMappedFile::Unmap() {
if (mmap_result_ != nullptr) {
- munmap(mmap_result_, adjusted_mmap_size_);
+ munmap(mmap_result_, adjusted_mmap_size());
mmap_result_ = nullptr;
}
file_offset_ = 0;
- region_ = nullptr;
- region_size_ = 0;
- adjusted_mmap_size_ = 0;
+ mmap_size_ = 0;
+ alignment_adjustment_ = 0;
}
-libtextclassifier3::Status MemoryMappedFile::Remap(size_t file_offset,
- size_t mmap_size) {
- // First unmap any previously mmapped region.
- Unmap();
+libtextclassifier3::Status MemoryMappedFile::Remap(int64_t file_offset,
+ int64_t mmap_size) {
+ return RemapImpl(file_offset, mmap_size);
+}
- if (mmap_size == 0) {
- // Nothing more to do.
- return libtextclassifier3::Status::OK;
+libtextclassifier3::Status MemoryMappedFile::GrowAndRemapIfNecessary(
+ int64_t new_file_offset, int64_t new_mmap_size) {
+  // We need at least new_file_offset + new_mmap_size bytes for the underlying
+  // file size, and it should not exceed max_file_size_. The comparison is
+  // rearranged as offset > max - size to avoid signed integer overflow.
+ if (new_file_offset < 0 || new_mmap_size < 0 ||
+ new_file_offset > max_file_size_ - new_mmap_size) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Invalid new file offset %" PRId64 " and new mmap size %" PRId64
+ " with max file size %" PRId64 "for MemoryMappedFile",
+ new_file_offset, new_mmap_size, max_file_size_));
}
- size_t aligned_offset =
- math_util::RoundDownTo(file_offset, system_page_size());
- size_t alignment_adjustment = file_offset - aligned_offset;
- size_t adjusted_mmap_size = alignment_adjustment + mmap_size;
-
- int mmap_flags = 0;
- // Determines if the mapped region should just be readable or also writable.
- int protection_flags = 0;
- ScopedFd fd;
- switch (strategy_) {
- case Strategy::READ_ONLY: {
- mmap_flags = MAP_PRIVATE;
- protection_flags = PROT_READ;
- fd.reset(filesystem_->OpenForRead(file_path_.c_str()));
- break;
- }
- case Strategy::READ_WRITE_AUTO_SYNC: {
- mmap_flags = MAP_SHARED;
- protection_flags = PROT_READ | PROT_WRITE;
- fd.reset(filesystem_->OpenForWrite(file_path_.c_str()));
- break;
- }
- case Strategy::READ_WRITE_MANUAL_SYNC: {
- mmap_flags = MAP_PRIVATE;
- protection_flags = PROT_READ | PROT_WRITE;
- // TODO(cassiewang) MAP_PRIVATE effectively makes it a read-only file.
- // figure out if we can open this file in read-only mode.
- fd.reset(filesystem_->OpenForWrite(file_path_.c_str()));
- break;
- }
- default:
- return absl_ports::UnknownError(IcingStringUtil::StringPrintf(
- "Invalid value in switch statement: %d", strategy_));
- }
-
- if (!fd.is_valid()) {
- return absl_ports::InternalError(absl_ports::StrCat(
- "Unable to open file meant to be mmapped: ", file_path_));
+ if (new_mmap_size == 0) {
+ // Unmap any previously mmapped region.
+ Unmap();
+ return libtextclassifier3::Status::OK;
}
- mmap_result_ = mmap(nullptr, adjusted_mmap_size, protection_flags, mmap_flags,
- fd.get(), aligned_offset);
+ ICING_RETURN_IF_ERROR(GrowFileSize(new_file_offset + new_mmap_size));
- if (mmap_result_ == MAP_FAILED) {
- mmap_result_ = nullptr;
- return absl_ports::InternalError(absl_ports::StrCat(
- "Failed to mmap region due to error: ", strerror(errno)));
+ if (new_file_offset != file_offset_ || new_mmap_size > mmap_size_) {
+ ICING_RETURN_IF_ERROR(RemapImpl(new_file_offset, new_mmap_size));
}
- file_offset_ = file_offset;
- region_ = reinterpret_cast<char*>(mmap_result_) + alignment_adjustment;
- region_size_ = mmap_size;
- adjusted_mmap_size_ = adjusted_mmap_size;
return libtextclassifier3::Status::OK;
}
@@ -123,13 +171,27 @@ libtextclassifier3::Status MemoryMappedFile::PersistToDisk() {
"Attempting to PersistToDisk on a read-only file: ", file_path_));
}
- if (region_ == nullptr) {
+ if (mmap_result_ == nullptr) {
// Nothing mapped to sync.
return libtextclassifier3::Status::OK;
}
+ // Sync actual file size via system call.
+ int64_t actual_file_size = filesystem_->GetFileSize(file_path_.c_str());
+ if (actual_file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError("Unable to retrieve file size");
+ }
+ file_size_ = actual_file_size;
+
if (strategy_ == Strategy::READ_WRITE_AUTO_SYNC &&
- msync(mmap_result_, adjusted_mmap_size_, MS_SYNC) != 0) {
+      // adjusted_mmap_size(), which is the mmap size after alignment
+      // adjustment, may be larger than the actual underlying file size since we
+      // can pre-mmap a large memory region before growing the file. Therefore,
+      // we sync std::min(file_size_ - adjusted_offset(), adjusted_mmap_size())
+      // bytes.
+ msync(mmap_result_,
+ std::min(file_size_ - adjusted_offset(), adjusted_mmap_size()),
+ MS_SYNC) != 0) {
return absl_ports::InternalError(
absl_ports::StrCat("Unable to sync file using msync(): ", file_path_));
}
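To make the min() above concrete, here is a small numeric check under assumed values (page size 4096, a 6000-byte file, and a mapping requested at offset 5000 that was pre-mapped larger than the file):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const int64_t page_size = 4096;    // assumed system page size
  const int64_t file_size = 6000;    // actual bytes in the file
  const int64_t file_offset = 5000;  // requested mapping offset
  const int64_t mmap_size = 8192;    // pre-mapped region size

  const int64_t adjusted_offset = (file_offset / page_size) * page_size;
  const int64_t alignment_adjustment = file_offset - adjusted_offset;
  const int64_t adjusted_mmap_size = alignment_adjustment + mmap_size;

  // adjusted_mmap_size (9096) runs past end-of-file, so the msync length is
  // capped at the bytes that actually exist past the aligned offset.
  const int64_t msync_len =
      std::min(file_size - adjusted_offset, adjusted_mmap_size);
  assert(adjusted_offset == 4096 && alignment_adjustment == 904);
  assert(msync_len == 1904);
  return 0;
}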
@@ -139,7 +201,13 @@ libtextclassifier3::Status MemoryMappedFile::PersistToDisk() {
// can't be synced using msync(). So, we have to directly write to the
// underlying file to update it.
if (strategy_ == Strategy::READ_WRITE_MANUAL_SYNC &&
- !filesystem_->PWrite(file_path_.c_str(), 0, region(), region_size())) {
+      // Contents before file_offset_ won't be modified by the caller, so we
+      // only need to PWrite contents starting at file_offset_. mmap_size_ may
+      // be larger than the actual underlying file size since we can pre-mmap a
+      // large memory region before growing the file. Therefore, we write
+      // std::min(mmap_size_, file_size_ - file_offset_) bytes.
+ !filesystem_->PWrite(file_path_.c_str(), file_offset_, region(),
+ std::min(mmap_size_, file_size_ - file_offset_))) {
return absl_ports::InternalError(
absl_ports::StrCat("Unable to sync file using PWrite(): ", file_path_));
}
@@ -160,12 +228,162 @@ libtextclassifier3::Status MemoryMappedFile::OptimizeFor(
madvise_flag = MADV_SEQUENTIAL;
}
- if (madvise(mmap_result_, adjusted_mmap_size_, madvise_flag) != 0) {
+ if (madvise(mmap_result_, adjusted_mmap_size(), madvise_flag) != 0) {
return absl_ports::InternalError(absl_ports::StrCat(
"Unable to madvise file ", file_path_, "; Error: ", strerror(errno)));
}
return libtextclassifier3::Status::OK;
}
+libtextclassifier3::Status MemoryMappedFile::GrowFileSize(
+ int64_t new_file_size) {
+ // Early return if new_file_size doesn't exceed the cached file size
+ // (file_size_). It saves a system call for getting the actual file size and
+ // reduces latency significantly.
+ if (new_file_size <= file_size_) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ if (new_file_size > max_file_size_) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "new file size %" PRId64 " exceeds maximum file size allowed, %" PRId64
+ " bytes",
+ new_file_size, max_file_size_));
+ }
+
+ // Sync actual file size via system call.
+ int64_t actual_file_size = filesystem_->GetFileSize(file_path_.c_str());
+ if (actual_file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError("Unable to retrieve file size");
+ }
+ file_size_ = actual_file_size;
+
+ // Early return again if new_file_size doesn't exceed actual_file_size. It
+ // saves system calls for opening and closing file descriptor.
+ if (new_file_size <= actual_file_size) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ if (strategy_ == Strategy::READ_ONLY) {
+ return absl_ports::FailedPreconditionError(absl_ports::StrCat(
+ "Attempting to grow a read-only file: ", file_path_));
+ }
+
+  // We use Write here rather than Grow because Grow doesn't actually allocate
+  // an underlying disk block. This can lead to problems with mmap: an access
+  // through the mapping has no effective way to signal that the disk block
+  // could not be allocated, and the process crashes (SIGBUS) instead. Write
+  // forces the allocation of these blocks, ensuring that any failure to grow
+  // surfaces here.
+ int64_t page_size = system_page_size();
+ auto buf = std::make_unique<uint8_t[]>(page_size);
+ int64_t size_to_write = std::min(page_size - (file_size_ % page_size),
+ new_file_size - file_size_);
+ ScopedFd sfd(filesystem_->OpenForAppend(file_path_.c_str()));
+ if (!sfd.is_valid()) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Couldn't open file ", file_path_));
+ }
+ while (size_to_write > 0 && file_size_ < new_file_size) {
+ if (!filesystem_->Write(sfd.get(), buf.get(), size_to_write)) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Couldn't grow file ", file_path_));
+ }
+ file_size_ += size_to_write;
+ size_to_write = std::min(page_size - (file_size_ % page_size),
+ new_file_size - file_size_);
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
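A standalone sketch of the same grow-by-writing idea using raw POSIX calls rather than the icing Filesystem wrapper (all names here are illustrative): appending zeroed chunks forces block allocation, whereas ftruncate() alone would leave a sparse file whose pages can SIGBUS through an mmap when allocation later fails.

#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>

#include <algorithm>
#include <cstdint>
#include <vector>

// Grows the file at path to at least new_size bytes by appending zeros,
// so that every byte is backed by an allocated disk block.
bool GrowWithZeroWrites(const char* path, int64_t new_size) {
  int fd = open(path, O_WRONLY | O_APPEND | O_CREAT, 0600);
  if (fd < 0) return false;
  struct stat st;
  if (fstat(fd, &st) < 0) {
    close(fd);
    return false;
  }
  int64_t size = st.st_size;
  std::vector<char> zeros(4096, 0);
  while (size < new_size) {
    int64_t chunk = std::min<int64_t>(zeros.size(), new_size - size);
    ssize_t written = write(fd, zeros.data(), chunk);
    if (written <= 0) {  // allocation failure surfaces here, not via SIGBUS
      close(fd);
      return false;
    }
    size += written;
  }
  close(fd);
  return true;
}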
+libtextclassifier3::Status MemoryMappedFile::RemapImpl(int64_t new_file_offset,
+ int64_t new_mmap_size) {
+ if (new_file_offset < 0) {
+ return absl_ports::OutOfRangeError("Invalid file offset");
+ }
+
+ if (new_mmap_size < 0) {
+ return absl_ports::OutOfRangeError("Invalid mmap size");
+ }
+
+ if (new_mmap_size == 0) {
+ // First unmap any previously mmapped region.
+ Unmap();
+ return libtextclassifier3::Status::OK;
+ }
+
+ int64_t new_aligned_offset =
+ math_util::RoundDownTo(new_file_offset, system_page_size());
+ int64_t new_alignment_adjustment = new_file_offset - new_aligned_offset;
+ int64_t new_adjusted_mmap_size = new_alignment_adjustment + new_mmap_size;
+
+ int mmap_flags = 0;
+ // Determines if the mapped region should just be readable or also writable.
+ int protection_flags = 0;
+ ScopedFd fd;
+ switch (strategy_) {
+ case Strategy::READ_ONLY: {
+ mmap_flags = MAP_PRIVATE;
+ protection_flags = PROT_READ;
+ fd.reset(filesystem_->OpenForRead(file_path_.c_str()));
+ break;
+ }
+ case Strategy::READ_WRITE_AUTO_SYNC: {
+ mmap_flags = MAP_SHARED;
+ protection_flags = PROT_READ | PROT_WRITE;
+ fd.reset(filesystem_->OpenForWrite(file_path_.c_str()));
+ break;
+ }
+ case Strategy::READ_WRITE_MANUAL_SYNC: {
+ mmap_flags = MAP_PRIVATE;
+ protection_flags = PROT_READ | PROT_WRITE;
+ // TODO(cassiewang) MAP_PRIVATE effectively makes it a read-only file.
+ // figure out if we can open this file in read-only mode.
+ fd.reset(filesystem_->OpenForWrite(file_path_.c_str()));
+ break;
+ }
+ default:
+ return absl_ports::UnknownError(IcingStringUtil::StringPrintf(
+ "Invalid value in switch statement: %d", strategy_));
+ }
+
+ if (!fd.is_valid()) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Unable to open file meant to be mmapped: ", file_path_));
+ }
+
+ void* new_mmap_result =
+ mmap(nullptr, new_adjusted_mmap_size, protection_flags, mmap_flags,
+ fd.get(), new_aligned_offset);
+
+ if (new_mmap_result == MAP_FAILED) {
+ new_mmap_result = nullptr;
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to mmap region due to error: ", strerror(errno)));
+ }
+
+ // Now we know that we have successfully created a new mapping. We can free
+ // the old one and switch to the new one.
+ Unmap();
+
+ mmap_result_ = new_mmap_result;
+ file_offset_ = new_file_offset;
+ mmap_size_ = new_mmap_size;
+ alignment_adjustment_ = new_alignment_adjustment;
+ return libtextclassifier3::Status::OK;
+}
+
+void MemoryMappedFile::Swap(MemoryMappedFile* other) {
+ std::swap(filesystem_, other->filesystem_);
+ std::swap(file_path_, other->file_path_);
+ std::swap(strategy_, other->strategy_);
+ std::swap(max_file_size_, other->max_file_size_);
+ std::swap(file_size_, other->file_size_);
+ std::swap(mmap_result_, other->mmap_result_);
+ std::swap(file_offset_, other->file_offset_);
+ std::swap(mmap_size_, other->mmap_size_);
+ std::swap(alignment_adjustment_, other->alignment_adjustment_);
+}
+
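The move operations defined earlier rely on this Swap: the move constructor nulls only the member whose destructor has a side effect (mmap_result_) and then swaps wholesale. A minimal sketch of the same idiom on a simplified owning type (Buffer is hypothetical, not icing code):

#include <cstdlib>
#include <utility>

// Swap-based move idiom: the moved-to object first nulls its owning pointer
// so that, after the swap, the moved-from object's destructor frees nothing
// it doesn't own.
class Buffer {
 public:
  explicit Buffer(std::size_t n) : data_(std::malloc(n)), size_(n) {}
  Buffer(Buffer&& other) : data_(nullptr) { Swap(&other); }
  Buffer& operator=(Buffer&& other) {
    Swap(&other);  // other frees our old allocation when it is destroyed
    return *this;
  }
  ~Buffer() { std::free(data_); }  // free(nullptr) is a harmless no-op

 private:
  void Swap(Buffer* other) {
    std::swap(data_, other->data_);
    std::swap(size_, other->size_);
  }

  void* data_;
  std::size_t size_ = 0;
};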
} // namespace lib
} // namespace icing
diff --git a/icing/file/memory-mapped-file.h b/icing/file/memory-mapped-file.h
index 5a52368..54507af 100644
--- a/icing/file/memory-mapped-file.h
+++ b/icing/file/memory-mapped-file.h
@@ -21,18 +21,54 @@
// faster reads as well as background-sync vs manual-sync of changes to disk.
// For more details, see comments at MemoryMappedFile::Strategy.
//
-// Usage:
+// ** Usage 1: pre-mmap large memory and grow the underlying file internally **
//
-// MemoryMappedFile mmapped_file(filesystem, "/file.pb", READ_WRITE_AUTO_SYNC));
-// mmapped_file->Remap(0, 16* 1024); // load the first 16K of the file.
+// // Create MemoryMappedFile instance.
+// ICING_ASSIGN_OR_RETURN(
+// std::unique_ptr<MemoryMappedFile> mmapped_file,
+// MemoryMappedFile::Create(filesystem, "/file.pb",
+// READ_WRITE_AUTO_SYNC,
+// max_file_size,
+// /*pre_mapping_file_offset=*/0,
+// /*pre_mapping_mmap_size=*/1024 * 1024));
//
+// // Found that we need 4K bytes for the file and mmapped region.
+// mmapped_file->GrowAndRemapIfNecessary(
+// /*new_file_offset=*/0, /*new_mmap_size=*/4 * 1024);
+// char read_byte = mmapped_file->region()[4000];
+// mmapped_file->mutable_region()[4001] = write_byte;
+//
+// mmapped_file->PersistToDisk(); // Optional; immediately writes changes to
+// disk.
+//
+// // Found that we need 2048 * 1024 bytes for the file and mmapped region.
+// mmapped_file->GrowAndRemapIfNecessary(
+// /*new_file_offset=*/0, /*new_mmap_size=*/2048 * 1024);
+// mmapped_file->mutable_region()[2000 * 1024] = write_byte;
+// mmapped_file.reset();
+//
+// ** Usage 2: load by segments **
+//
+// ICING_ASSIGN_OR_RETURN(
+// std::unique_ptr<MemoryMappedFile> mmapped_file,
+// MemoryMappedFile::Create(filesystem, "/file.pb",
+// READ_WRITE_AUTO_SYNC,
+// max_file_size,
+// /*pre_mapping_file_offset=*/0,
+// /*pre_mapping_mmap_size=*/16 * 1024));
+//
+// // load the first 16K.
+// mmapped_file->GrowAndRemapIfNecessary(
+// /*new_file_offset=*/0, /*new_mmap_size=*/16 * 1024);
// char read_byte = mmapped_file->region()[100];
// mmapped_file->mutable_region()[10] = write_byte;
//
// mmapped_file->PersistToDisk(); // Optional; immediately writes changes to
// disk.
//
-// mmapped_file->Remap(16*1024, 16* 1024); // load the next 16K.
+// // load the next 16K.
+// mmapped_file->GrowAndRemapIfNecessary(
+// /*new_file_offset=*/16 * 1024, /*new_mmap_size=*/16 * 1024);
// mmapped_file->mutable_region()[10] = write_byte;
// mmapped_file.reset();
@@ -41,12 +77,14 @@
#include <unistd.h>
+#include <algorithm>
#include <cstdint>
#include <memory>
#include <string>
#include <string_view>
#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/file/filesystem.h"
namespace icing {
@@ -54,8 +92,9 @@ namespace lib {
class MemoryMappedFile {
public:
- static size_t __attribute__((const)) system_page_size() {
- static const size_t page_size = sysconf(_SC_PAGE_SIZE);
+ static int64_t __attribute__((const)) system_page_size() {
+ static const int64_t page_size =
+ static_cast<int64_t>(sysconf(_SC_PAGE_SIZE));
return page_size;
}
@@ -71,23 +110,95 @@ class MemoryMappedFile {
// Memory map a read-write file into a writable memory region. Changes made
// to this region will never be auto-synced to the underlying file. Unless
// the caller explicitly calls PersistToDisk(), all changes will be lost
- // when the
- // MemoryMappedFile is destroyed.
+ // when the MemoryMappedFile is destroyed.
READ_WRITE_MANUAL_SYNC,
};
- // file_path : Full path of the file that needs to be memory-mapped.
- MemoryMappedFile(const Filesystem& filesystem, std::string_view file_path,
- Strategy mmap_strategy);
+ // Absolute max file size, 16 GiB.
+ static constexpr int64_t kMaxFileSize = INT64_C(1) << 34;
+
+ // Default max file size, 1 MiB.
+ static constexpr int64_t kDefaultMaxFileSize = INT64_C(1) << 20;
+
+ // Creates a new MemoryMappedFile to read/write content to.
+ //
+ // filesystem : Object to make system level calls
+ // file_path : Full path of the file that needs to be memory-mapped.
+ // mmap_strategy : Strategy/optimizations to access the content.
+ // max_file_size : Maximum file size for MemoryMappedFile, default
+ // kDefaultMaxFileSize.
+ //
+ // Returns:
+ // A MemoryMappedFile instance on success
+ // OUT_OF_RANGE_ERROR if max_file_size is invalid
+ // INTERNAL_ERROR on I/O error
+ static libtextclassifier3::StatusOr<MemoryMappedFile> Create(
+ const Filesystem& filesystem, std::string_view file_path,
+ Strategy mmap_strategy, int64_t max_file_size = kDefaultMaxFileSize);
+
+ // Creates a new MemoryMappedFile to read/write content to. It remaps when
+ // creating the instance, but doesn't check or grow the actual file size, so
+ // the caller should call GrowAndRemapIfNecessary before accessing region.
+ //
+ // filesystem : Object to make system level calls
+ // file_path : Full path of the file that needs to be memory-mapped.
+ // mmap_strategy : Strategy/optimizations to access the content.
+ // max_file_size : Maximum file size for MemoryMappedFile.
+ // pre_mapping_file_offset : The offset of the file to be memory mapped.
+ // pre_mapping_mmap_size : mmap size for pre-mapping.
+ //
+ // Returns:
+ // A MemoryMappedFile instance on success
+ // OUT_OF_RANGE_ERROR if max_file_size, file_offset, or mmap_size is invalid
+ // INTERNAL_ERROR on I/O error
+ static libtextclassifier3::StatusOr<MemoryMappedFile> Create(
+ const Filesystem& filesystem, std::string_view file_path,
+ Strategy mmap_strategy, int64_t max_file_size,
+ int64_t pre_mapping_file_offset, int64_t pre_mapping_mmap_size);
+
+ // Delete copy constructor and assignment operator.
+ MemoryMappedFile(const MemoryMappedFile& other) = delete;
+ MemoryMappedFile& operator=(const MemoryMappedFile& other) = delete;
+
+ MemoryMappedFile(MemoryMappedFile&& other);
+ MemoryMappedFile& operator=(MemoryMappedFile&& other);
  // Frees any region that is still memory-mapped.
~MemoryMappedFile();
+ // TODO(b/247671531): migrate all callers to use GrowAndRemapIfNecessary and
+ // deprecate this API.
+ //
// Memory-map the newly specified region within the file specified by
// file_offset and mmap_size. Unmaps any previously mmapped region.
+ // It doesn't handle the underlying file growth.
//
// Returns any encountered IO error.
- libtextclassifier3::Status Remap(size_t file_offset, size_t mmap_size);
+ libtextclassifier3::Status Remap(int64_t file_offset, int64_t mmap_size);
+
+ // Attempt to memory-map the newly specified region within the file specified
+ // by new_file_offset and new_mmap_size. It handles mmap and file growth
+ // intelligently.
+  // - Compute the least file size needed according to new_file_offset and
+  //   new_mmap_size, and compare it with the current file size. If file growth
+  //   is required, grow the underlying file (Write), or return an error if
+  //   strategy_ is READ_ONLY.
+ // - If new_file_offset is different from the current file_offset_ or
+ // new_mmap_size is greater than the current mmap_size_, then memory-map
+ // the newly specified region and unmap any previously mmapped region.
+ //
+  // This API is useful for file growth since it grows the underlying file
+  // internally and handles remapping intelligently. By pre-mmapping a large
+  // region, we only need to grow the underlying file (Write) in each round of
+  // growth, without remapping, which significantly reduces the cost of system
+  // calls and of memory paging after a remap.
+ //
+ // Returns:
+ // OK on success
+  //   OUT_OF_RANGE_ERROR if new_file_offset or new_mmap_size is invalid
+ // Any error from GrowFileSize() and RemapImpl()
+ libtextclassifier3::Status GrowAndRemapIfNecessary(int64_t new_file_offset,
+ int64_t new_mmap_size);
  // Unmaps and frees up the region that is currently memory-mapped.
void Unmap();
@@ -126,32 +237,147 @@ class MemoryMappedFile {
};
libtextclassifier3::Status OptimizeFor(AccessPattern access_pattern);
+ Strategy strategy() const { return strategy_; }
+
+ int64_t max_file_size() const { return max_file_size_; }
+
// Accessors to the memory-mapped region. Returns null if nothing is mapped.
- const char* region() const { return region_; }
- char* mutable_region() { return region_; }
+ const char* region() const {
+ return reinterpret_cast<const char*>(mmap_result_) + alignment_adjustment_;
+ }
+ char* mutable_region() {
+ return reinterpret_cast<char*>(mmap_result_) + alignment_adjustment_;
+ }
- size_t region_size() const { return region_size_; }
- Strategy strategy() const { return strategy_; }
+ int64_t file_offset() const { return file_offset_; }
+
+ // TODO(b/247671531): remove this API after migrating all callers to use
+ // GrowAndRemapIfNecessary.
+ int64_t region_size() const { return mmap_size_; }
+
+ // The size that is safe for the client to read/write. This is only valid for
+ // callers that use GrowAndRemapIfNecessary.
+ int64_t available_size() const {
+ return std::min(mmap_size_,
+ std::max(INT64_C(0), file_size_ - file_offset_));
+ }
private:
+ explicit MemoryMappedFile(const Filesystem& filesystem,
+ std::string_view file_path, Strategy mmap_strategy,
+ int64_t max_file_size, int64_t file_size);
+
+ // Grow the underlying file to new_file_size.
+  // Note: it is possible for Write() (implemented in the file system call
+  // library) to grow the underlying file only partially and return an error,
+  // so the cached file_size_ may hold an out-of-date value; still, it is
+  // guaranteed that file_size_ is always smaller than or equal to the actual
+  // file size. In the next round of growing:
+ // - If new_file_size is not greater than file_size_, then we're still
+ // confident that the actual file size is large enough and therefore skip
+ // the grow process.
+ // - If new_file_size is greater than file_size_, then we will invoke the
+ // system call to sync the actual file size. At this moment, file_size_ is
+ // the actual file size and therefore we can grow the underlying file size
+ // correctly.
+ //
+ // Returns:
+ // OK on success
+ // FAILED_PRECONDITION_ERROR if requiring file growth and strategy_ is
+ // READ_ONLY
+  //   OUT_OF_RANGE_ERROR if new_file_size exceeds max_file_size_
+ // INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status GrowFileSize(int64_t new_file_size);
+
+ // Memory-map the newly specified region within the file specified by
+ // new_file_offset and new_mmap_size. Unmaps any previously mmapped region.
+ // It doesn't handle the underlying file growth.
+ //
+ // Returns:
+ // OK on success
+  //   OUT_OF_RANGE_ERROR if new_file_offset or new_mmap_size is invalid
+ // INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status RemapImpl(int64_t new_file_offset,
+ int64_t new_mmap_size);
+
+ // Swaps the contents of this with other.
+ void Swap(MemoryMappedFile* other);
+
+ int64_t adjusted_offset() const {
+ return file_offset_ - alignment_adjustment_;
+ }
+
+ int64_t adjusted_mmap_size() const {
+ return alignment_adjustment_ + mmap_size_;
+ }
+
// Cached constructor params.
- const Filesystem* const filesystem_;
- const std::string file_path_;
- const Strategy strategy_;
+ const Filesystem* filesystem_;
+ std::string file_path_;
+ Strategy strategy_;
- // Offset within the file at which the current memory-mapped region starts.
- size_t file_offset_ = 0;
+ // Raw file related fields:
+ // - max_file_size_
+ // - file_size_
+
+ // Max file size for MemoryMappedFile. It should not exceed the absolute max
+ // size of memory mapped file (kMaxFileSize). It is only used in
+ // GrowAndRemapIfNecessary(), the new API that handles underlying file growth
+ // internally and remaps intelligently.
+ //
+ // Note: max_file_size_ will be specified in runtime and the caller should
+ // make sure its value is correct and reasonable.
+ int64_t max_file_size_;
- // Region that is currently memory-mapped.
- char* region_ = nullptr;
- size_t region_size_ = 0;
+ // Cached file size to avoid calling system call too frequently. It is only
+ // used in GrowAndRemapIfNecessary(), the new API that handles underlying file
+ // growth internally and remaps intelligently.
+ //
+  // Note: it is guaranteed that file_size_ is smaller than or equal to the
+  // actual file size, as long as the underlying file hasn't been truncated or
+  // deleted externally. See GrowFileSize() for more details.
+ int64_t file_size_;
- // The actual size of the region we mmapped. As the requested region might not
- // align with system pages, we often mmap more bytes than requested.
- size_t adjusted_mmap_size_ = 0;
+ // Memory mapped related fields:
+ // - mmap_result_
+ // - file_offset_
+ // - alignment_adjustment_
+ // - mmap_size_
// Raw pointer (or error) returned by calls to mmap().
- void* mmap_result_ = nullptr;
+ void* mmap_result_;
+
+ // Offset within the file at which the current memory-mapped region starts.
+ int64_t file_offset_;
+
+ // Size that is currently memory-mapped.
+  // Note that the mmapped size can be larger than the underlying file size. We
+  // can reduce remapping by pre-mmapping a large region and growing the file
+  // size later. See GrowAndRemapIfNecessary().
+ int64_t mmap_size_;
+
+  // The difference between file_offset_ and the actual adjusted (aligned)
+  // offset.
+  // Since mmap requires the offset to be a multiple of the system page size,
+  // we round file_offset_ down to the nearest page boundary.
+ int64_t alignment_adjustment_;
+
+ // E.g. system_page_size = 5, RemapImpl(/*new_file_offset=*/8, mmap_size)
+ //
+ // File layout: xxxxx xxxxx xxxxx xxxxx xxxxx xx
+ // file_offset_: 8
+ // adjusted_offset(): 5
+ // region()/mutable_region(): |
+ // mmap_result_: |
+ //
+ // alignment_adjustment_: file_offset_ - adjusted_offset()
+ // mmap_size_: mmap_size
+ // region_size(): mmap_size_
+ // available_size(): std::min(mmap_size_,
+ // std::max(0, file_size_ - file_offset_))
+  //   region_range:              [file_offset_, file_offset_ + mmap_size_)
+  //   adjusted_mmap_size():      alignment_adjustment_ + mmap_size_
+  //   adjusted_mmap_range:       [adjusted_offset(), file_offset_ + mmap_size_)
};
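Plugging the diagram's numbers into the accessors, as a quick sanity check (system_page_size = 5, file_offset_ = 8, a 27-byte file, and an assumed mmap_size_ of 12):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  const int64_t page_size = 5;    // from the example above
  const int64_t file_size = 27;   // "xxxxx xxxxx xxxxx xxxxx xxxxx xx"
  const int64_t file_offset = 8;
  const int64_t mmap_size = 12;   // assumed for this check

  const int64_t adjusted_offset = (file_offset / page_size) * page_size;
  const int64_t alignment_adjustment = file_offset - adjusted_offset;
  const int64_t adjusted_mmap_size = alignment_adjustment + mmap_size;
  const int64_t available_size =
      std::min(mmap_size, std::max(INT64_C(0), file_size - file_offset));

  assert(adjusted_offset == 5 && alignment_adjustment == 3);
  assert(adjusted_mmap_size == 15);
  assert(available_size == 12);  // capped by mmap_size, not by the file
  return 0;
}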
} // namespace lib
diff --git a/icing/file/memory-mapped-file_test.cc b/icing/file/memory-mapped-file_test.cc
new file mode 100644
index 0000000..16f76e6
--- /dev/null
+++ b/icing/file/memory-mapped-file_test.cc
@@ -0,0 +1,668 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/memory-mapped-file.h"
+
+#include <cstdint>
+#include <limits>
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+using ::testing::DoDefault;
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::IsNull;
+using ::testing::Le;
+using ::testing::Not;
+using ::testing::NotNull;
+using ::testing::Return;
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+class MemoryMappedFileTest : public ::testing::Test {
+ protected:
+ void SetUp() override { file_path_ = GetTestTempDir() + "/mmap_test_file"; }
+
+ void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); }
+
+ const Filesystem& filesystem() const { return filesystem_; }
+
+ Filesystem filesystem_;
+ std::string file_path_;
+};
+
+TEST_F(MemoryMappedFileTest, Create) {
+ constexpr int max_file_size = 8192;
+  MemoryMappedFile::Strategy strategy =
+      MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC;
+  // Create MemoryMappedFile
+  ICING_ASSERT_OK_AND_ASSIGN(MemoryMappedFile mmapped_file,
+                             MemoryMappedFile::Create(filesystem_, file_path_,
+                                                      strategy, max_file_size));
+
+  EXPECT_THAT(mmapped_file.strategy(), Eq(strategy));
+ EXPECT_THAT(mmapped_file.max_file_size(), Eq(max_file_size));
+ EXPECT_THAT(mmapped_file.region(), IsNull());
+ EXPECT_THAT(mmapped_file.mutable_region(), IsNull());
+ EXPECT_THAT(mmapped_file.file_offset(), Eq(0));
+ EXPECT_THAT(mmapped_file.region_size(), Eq(0));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(0));
+}
+
+TEST_F(MemoryMappedFileTest, CreateFromExistingFile) {
+ int init_file_size = 100;
+ {
+ // Initialize file
+ ScopedFd sfd(filesystem_.OpenForWrite(file_path_.c_str()));
+ ASSERT_TRUE(sfd.is_valid());
+ auto buf = std::make_unique<char[]>(init_file_size);
+ ASSERT_TRUE(filesystem_.Write(sfd.get(), buf.get(), init_file_size));
+ }
+
+ constexpr int max_file_size = 8192;
+  MemoryMappedFile::Strategy strategy =
+      MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC;
+  // Create MemoryMappedFile from an existing file
+  ICING_ASSERT_OK_AND_ASSIGN(MemoryMappedFile mmapped_file,
+                             MemoryMappedFile::Create(filesystem_, file_path_,
+                                                      strategy, max_file_size));
+
+  EXPECT_THAT(mmapped_file.strategy(), Eq(strategy));
+ EXPECT_THAT(mmapped_file.max_file_size(), Eq(max_file_size));
+ EXPECT_THAT(mmapped_file.region(), IsNull());
+ EXPECT_THAT(mmapped_file.mutable_region(), IsNull());
+ EXPECT_THAT(mmapped_file.file_offset(), Eq(0));
+ EXPECT_THAT(mmapped_file.region_size(), Eq(0));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(0));
+}
+
+TEST_F(MemoryMappedFileTest, CreateWithInvalidMaxFileSize) {
+ EXPECT_THAT(
+ MemoryMappedFile::Create(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(MemoryMappedFile::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/MemoryMappedFile::kMaxFileSize + 1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(MemoryMappedFile::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/-1, /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/8192),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(
+ MemoryMappedFile::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/MemoryMappedFile::kMaxFileSize + 1,
+ /*pre_mapping_file_offset=*/0, /*pre_mapping_mmap_size=*/8192),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(MemoryMappedFileTest, CreateWithPreMappingInfo) {
+ constexpr int max_file_size = 8192;
+ constexpr int pre_mapping_file_offset = 99;
+ constexpr int pre_mapping_mmap_size = 2000;
+  MemoryMappedFile::Strategy strategy =
+      MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC;
+  // Create MemoryMappedFile with pre-mapping file_offset and mmap_size
+  ICING_ASSERT_OK_AND_ASSIGN(
+      MemoryMappedFile mmapped_file,
+      MemoryMappedFile::Create(filesystem_, file_path_, strategy, max_file_size,
+                               pre_mapping_file_offset, pre_mapping_mmap_size));
+
+  EXPECT_THAT(mmapped_file.strategy(), Eq(strategy));
+ EXPECT_THAT(mmapped_file.max_file_size(), Eq(max_file_size));
+ EXPECT_THAT(mmapped_file.region(), NotNull());
+ EXPECT_THAT(mmapped_file.mutable_region(), NotNull());
+ EXPECT_THAT(mmapped_file.file_offset(), Eq(pre_mapping_file_offset));
+ EXPECT_THAT(mmapped_file.region_size(), Eq(pre_mapping_mmap_size));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(0));
+
+ // Manually grow the file externally and mutate region. There should be no
+ // memory error.
+ {
+ ScopedFd sfd(filesystem_.OpenForAppend(file_path_.c_str()));
+ ASSERT_TRUE(sfd.is_valid());
+ int grow_size = 4096;
+ auto buf = std::make_unique<char[]>(grow_size);
+ ASSERT_TRUE(filesystem_.Write(sfd.get(), buf.get(), grow_size));
+ }
+ mmapped_file.mutable_region()[0] = 'a';
+ ICING_EXPECT_OK(mmapped_file.PersistToDisk());
+
+ {
+ ScopedFd sfd(filesystem_.OpenForRead(file_path_.c_str()));
+ ASSERT_TRUE(sfd.is_valid());
+ int buf_size = 10;
+ auto buf = std::make_unique<char[]>(buf_size);
+ ASSERT_TRUE(filesystem_.PRead(sfd.get(), buf.get(), buf_size,
+ pre_mapping_file_offset));
+ EXPECT_THAT(buf.get()[0], Eq('a'));
+ }
+}
+
+TEST_F(MemoryMappedFileTest, CreateWithInvalidPreMappingInfo) {
+ int page_size = MemoryMappedFile::system_page_size();
+ int max_file_size = page_size * 2;
+
+ // Negative file_offset
+ EXPECT_THAT(
+ MemoryMappedFile::Create(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ max_file_size,
+ /*pre_mapping_file_offset=*/-1,
+ /*pre_mapping_mmap_size=*/page_size),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ // Negative mmap_size
+ EXPECT_THAT(
+ MemoryMappedFile::Create(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ max_file_size, /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ // pre_mapping_file_offset + pre_mapping_mmap_size > max_file_size.
+ int pre_mapping_file_offset = 99;
+ int pre_mapping_mmap_size = max_file_size - pre_mapping_file_offset + 1;
+ EXPECT_THAT(
+ MemoryMappedFile::Create(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ max_file_size, pre_mapping_file_offset,
+ pre_mapping_mmap_size),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ // Edge cases to make sure the implementation of range check won't have
+ // integer overflow bug.
+ EXPECT_THAT(
+ MemoryMappedFile::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
+ /*pre_mapping_file_offset=*/99,
+ /*pre_mapping_mmap_size=*/std::numeric_limits<int64_t>::max()),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(MemoryMappedFile::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ max_file_size, /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/INT64_C(-1) *
+ (std::numeric_limits<int64_t>::max() - max_file_size)),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(
+ MemoryMappedFile::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
+ /*pre_mapping_file_offset=*/INT64_C(-1) *
+ (std::numeric_limits<int64_t>::max() - max_file_size),
+ /*pre_mapping_mmap_size=*/page_size),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+// TODO(b/247671531): remove this test after deprecating Remap
+TEST_F(MemoryMappedFileTest, RemapZeroMmapSizeShouldUnmap) {
+ // Create MemoryMappedFile
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ MemoryMappedFile::kDefaultMaxFileSize));
+
+ int page_size = MemoryMappedFile::system_page_size();
+ int file_offset = 99;
+ int mmap_size = page_size * 2 - file_offset;
+ ICING_ASSERT_OK(mmapped_file.Remap(file_offset, mmap_size));
+ ASSERT_THAT(mmapped_file.region(), NotNull());
+
+  // Call Remap with any file_offset and mmap_size = 0. The original mmapped
+  // region should be unmapped.
+ ICING_EXPECT_OK(mmapped_file.Remap(file_offset, /*mmap_size=*/0));
+ EXPECT_THAT(mmapped_file.region(), IsNull());
+}
+
+TEST_F(MemoryMappedFileTest, GrowAndRemapIfNecessary) {
+ int page_size = MemoryMappedFile::system_page_size();
+ int pre_mapping_file_offset = 99;
+ int pre_mapping_mmap_size = page_size * 2 - pre_mapping_file_offset;
+ {
+ // Create MemoryMappedFile with pre-mapping file_offset and mmap_size
+ // without growing the file.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(
+ filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ MemoryMappedFile::kDefaultMaxFileSize, pre_mapping_file_offset,
+ pre_mapping_mmap_size));
+ ASSERT_THAT(filesystem_.GetFileSize(file_path_.c_str()), Eq(0));
+ const char* original_region = mmapped_file.region();
+
+    // Call GrowAndRemapIfNecessary with the same file_offset and a new
+    // mmap_size that doesn't exceed pre_mapping_mmap_size. The underlying file
+    // size should grow correctly, but there should be no remap.
+ int new_mmap_size1 = page_size - pre_mapping_file_offset;
+ ICING_EXPECT_OK(mmapped_file.GrowAndRemapIfNecessary(
+ pre_mapping_file_offset, new_mmap_size1));
+
+ EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+ Eq(pre_mapping_file_offset + new_mmap_size1));
+ EXPECT_THAT(mmapped_file.region(), Eq(original_region));
+ EXPECT_THAT(mmapped_file.mutable_region(), Eq(original_region));
+ EXPECT_THAT(mmapped_file.file_offset(), Eq(pre_mapping_file_offset));
+ EXPECT_THAT(mmapped_file.region_size(), Eq(pre_mapping_mmap_size));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(new_mmap_size1));
+
+ // Test it with new_mmap_size2 = pre_mapping_mmap_size
+ int new_mmap_size2 = pre_mapping_mmap_size;
+ ICING_EXPECT_OK(mmapped_file.GrowAndRemapIfNecessary(
+ pre_mapping_file_offset, new_mmap_size2));
+
+ EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+ Eq(pre_mapping_file_offset + new_mmap_size2));
+ EXPECT_THAT(mmapped_file.region(), Eq(original_region));
+ EXPECT_THAT(mmapped_file.mutable_region(), Eq(original_region));
+ EXPECT_THAT(mmapped_file.file_offset(), Eq(pre_mapping_file_offset));
+ EXPECT_THAT(mmapped_file.region_size(), Eq(pre_mapping_mmap_size));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(new_mmap_size2));
+
+ // Write some bytes to region()[0]. It should write the underlying file at
+ // file_offset.
+ mmapped_file.mutable_region()[0] = 'a';
+ ICING_ASSERT_OK(mmapped_file.PersistToDisk());
+ }
+
+ ScopedFd sfd(filesystem_.OpenForRead(file_path_.c_str()));
+ ASSERT_TRUE(sfd.is_valid());
+ int buf_size = 1;
+ auto buf = std::make_unique<char[]>(buf_size);
+ ASSERT_TRUE(filesystem_.PRead(sfd.get(), buf.get(), buf_size,
+ pre_mapping_file_offset));
+ EXPECT_THAT(buf.get()[0], Eq('a'));
+}
+
+TEST_F(MemoryMappedFileTest,
+ GrowAndRemapIfNecessaryExceedingPreMappingMmapSize) {
+ int page_size = MemoryMappedFile::system_page_size();
+ int pre_mapping_file_offset = 99;
+ int pre_mapping_mmap_size = page_size * 2 - pre_mapping_file_offset;
+ // Create MemoryMappedFile with pre-mapping file_offset and mmap_size without
+ // growing the file.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ MemoryMappedFile::kDefaultMaxFileSize,
+ pre_mapping_file_offset, pre_mapping_mmap_size));
+ const char* original_region = mmapped_file.region();
+
+  // Call GrowAndRemapIfNecessary with the same file offset and a new mmap_size
+  // that exceeds pre_mapping_mmap_size (but is still below max_file_size). The
+  // underlying file size should grow correctly and the region should be
+  // remapped.
+ int new_mmap_size = page_size * 3 - pre_mapping_file_offset;
+ ICING_EXPECT_OK(mmapped_file.GrowAndRemapIfNecessary(pre_mapping_file_offset,
+ new_mmap_size));
+
+ EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+ Eq(pre_mapping_file_offset + new_mmap_size));
+ EXPECT_THAT(mmapped_file.region(), Not(Eq(original_region)));
+ EXPECT_THAT(mmapped_file.file_offset(), Eq(pre_mapping_file_offset));
+ EXPECT_THAT(mmapped_file.region_size(), Eq(new_mmap_size));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(new_mmap_size));
+}
+
+TEST_F(MemoryMappedFileTest, GrowAndRemapIfNecessaryDecreasingMmapSize) {
+ int page_size = MemoryMappedFile::system_page_size();
+ int pre_mapping_file_offset = 99;
+ int pre_mapping_mmap_size = page_size * 2 - pre_mapping_file_offset;
+ // Create MemoryMappedFile with pre-mapping file_offset and mmap_size, and
+ // call GrowAndRemapIfNecessary to grow the underlying file.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ MemoryMappedFile::kDefaultMaxFileSize,
+ pre_mapping_file_offset, pre_mapping_mmap_size));
+ ICING_ASSERT_OK(mmapped_file.GrowAndRemapIfNecessary(pre_mapping_file_offset,
+ pre_mapping_mmap_size));
+
+ const char* original_region = mmapped_file.region();
+ int original_file_size = filesystem_.GetFileSize(file_path_.c_str());
+ ASSERT_THAT(original_file_size,
+ Eq(pre_mapping_file_offset + pre_mapping_mmap_size));
+ ASSERT_THAT(mmapped_file.region_size(), Eq(pre_mapping_mmap_size));
+ ASSERT_THAT(mmapped_file.available_size(), Eq(pre_mapping_mmap_size));
+
+  // Call GrowAndRemapIfNecessary with the same file offset and a new mmap_size
+  // smaller than pre_mapping_mmap_size. There should be no file
+  // growth/truncation or remap.
+ int new_mmap_size = page_size - pre_mapping_file_offset;
+ ICING_EXPECT_OK(mmapped_file.GrowAndRemapIfNecessary(pre_mapping_file_offset,
+ new_mmap_size));
+
+ EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+ Eq(original_file_size));
+ EXPECT_THAT(mmapped_file.region(), Eq(original_region));
+ EXPECT_THAT(mmapped_file.file_offset(), Eq(pre_mapping_file_offset));
+ EXPECT_THAT(mmapped_file.region_size(), Eq(pre_mapping_mmap_size));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(pre_mapping_mmap_size));
+}
+
+TEST_F(MemoryMappedFileTest, GrowAndRemapIfNecessaryZeroMmapSizeShouldUnmap) {
+ int page_size = MemoryMappedFile::system_page_size();
+ int pre_mapping_file_offset = 99;
+ int pre_mapping_mmap_size = page_size * 2 - pre_mapping_file_offset;
+ // Create MemoryMappedFile with pre-mapping file_offset and mmap_size, and
+ // call GrowAndRemapIfNecessary to grow the underlying file.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ MemoryMappedFile::kDefaultMaxFileSize,
+ pre_mapping_file_offset, pre_mapping_mmap_size));
+ ICING_ASSERT_OK(mmapped_file.GrowAndRemapIfNecessary(pre_mapping_file_offset,
+ pre_mapping_mmap_size));
+
+ int original_file_size = filesystem_.GetFileSize(file_path_.c_str());
+ ASSERT_THAT(original_file_size,
+ Eq(pre_mapping_file_offset + pre_mapping_mmap_size));
+ ASSERT_THAT(mmapped_file.region(), NotNull());
+ ASSERT_THAT(mmapped_file.region_size(), Eq(pre_mapping_mmap_size));
+ ASSERT_THAT(mmapped_file.available_size(), Eq(pre_mapping_mmap_size));
+
+ // Call GrowAndRemapIfNecessary with any file_offset and new_mmap_size = 0.
+ // There should be no file growth/truncate, but the original mmapped region
+ // should be unmapped.
+ ICING_EXPECT_OK(mmapped_file.GrowAndRemapIfNecessary(pre_mapping_file_offset,
+ /*new_mmap_size=*/0));
+
+ EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+ Eq(original_file_size));
+ EXPECT_THAT(mmapped_file.region(), IsNull());
+ EXPECT_THAT(mmapped_file.file_offset(), Eq(0));
+ EXPECT_THAT(mmapped_file.region_size(), Eq(0));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(0));
+}
+
+TEST_F(MemoryMappedFileTest, GrowAndRemapIfNecessaryChangeOffset) {
+ int page_size = MemoryMappedFile::system_page_size();
+ int pre_mapping_file_offset = 99;
+ int pre_mapping_mmap_size = page_size * 2 - pre_mapping_file_offset;
+ // Create MemoryMappedFile with pre-mapping file_offset and mmap_size, and
+ // call GrowAndRemapIfNecessary to grow the underlying file.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ MemoryMappedFile::kDefaultMaxFileSize,
+ pre_mapping_file_offset, pre_mapping_mmap_size));
+ ICING_ASSERT_OK(mmapped_file.GrowAndRemapIfNecessary(pre_mapping_file_offset,
+ pre_mapping_mmap_size));
+
+ const char* original_region = mmapped_file.region();
+ int original_file_size = filesystem_.GetFileSize(file_path_.c_str());
+ ASSERT_THAT(original_file_size,
+ Eq(pre_mapping_file_offset + pre_mapping_mmap_size));
+ ASSERT_THAT(mmapped_file.region_size(), Eq(pre_mapping_mmap_size));
+ ASSERT_THAT(mmapped_file.available_size(), Eq(pre_mapping_mmap_size));
+
+ // Call GrowAndRemapIfNecessary with different file_offset and new mmap_size
+ // that doesn't require to grow the underlying file. The region should still
+ // be remapped since offset has been changed.
+ int new_file_offset = pre_mapping_file_offset + page_size;
+ int new_mmap_size = page_size * 2 - new_file_offset;
+ ASSERT_THAT(new_file_offset + new_mmap_size, Le(original_file_size));
+ ICING_EXPECT_OK(
+ mmapped_file.GrowAndRemapIfNecessary(new_file_offset, new_mmap_size));
+
+ EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()),
+ Eq(original_file_size));
+ EXPECT_THAT(mmapped_file.region(), Not(Eq(original_region)));
+ EXPECT_THAT(mmapped_file.file_offset(), Eq(new_file_offset));
+ EXPECT_THAT(mmapped_file.region_size(), Eq(new_mmap_size));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(new_mmap_size));
+}
+
+TEST_F(MemoryMappedFileTest, GrowAndRemapIfNecessaryInvalidMmapRegionInfo) {
+ int page_size = MemoryMappedFile::system_page_size();
+ int max_file_size = page_size * 2;
+ // Create MemoryMappedFile with pre-mapping file_offset and mmap_size, and
+ // call GrowAndRemapIfNecessary to grow the underlying file.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ max_file_size,
+ /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/page_size * 2));
+
+ // Negative new_file_offset.
+ EXPECT_THAT(mmapped_file.GrowAndRemapIfNecessary(
+ /*new_file_offset=*/-1,
+ /*new_mmap_size=*/page_size),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ // Negative new_mmap_size.
+ EXPECT_THAT(mmapped_file.GrowAndRemapIfNecessary(
+ /*new_file_offset=*/0,
+ /*new_mmap_size=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ // new_file_offset + new_mmap_size > max_file_size.
+ int new_file_offset = 99;
+ int new_mmap_size = max_file_size - new_file_offset + 1;
+ EXPECT_THAT(
+ mmapped_file.GrowAndRemapIfNecessary(new_file_offset, new_mmap_size),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ // Edge cases to make sure the range check implementation doesn't have an
+ // integer overflow bug.
+ EXPECT_THAT(mmapped_file.GrowAndRemapIfNecessary(
+ /*new_file_offset=*/99,
+ /*new_mmap_size=*/std::numeric_limits<int64_t>::max()),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(mmapped_file.GrowAndRemapIfNecessary(
+ /*new_file_offset=*/0,
+ /*new_mmap_size=*/INT64_C(-1) *
+ (std::numeric_limits<int64_t>::max() - max_file_size)),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(mmapped_file.GrowAndRemapIfNecessary(
+ /*new_file_offset=*/INT64_C(-1) *
+ (std::numeric_limits<int64_t>::max() - max_file_size),
+ /*new_mmap_size=*/page_size),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(MemoryMappedFileTest, RemapFailureStillValidInstance) {
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ int page_size = MemoryMappedFile::system_page_size();
+ int max_file_size = page_size * 10;
+
+ // 1. Create MemoryMappedFile with pre-mapping offset=0 and
+ // mmap_size=page_size. Also call GrowAndRemapIfNecessary to grow the file
+ // size to page_size.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(*mock_filesystem, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ max_file_size,
+ /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/page_size));
+ ICING_ASSERT_OK(
+ mmapped_file.GrowAndRemapIfNecessary(/*new_file_offset=*/0,
+ /*new_mmap_size=*/page_size));
+ ASSERT_THAT(filesystem_.GetFileSize(file_path_.c_str()), Eq(page_size));
+ ASSERT_THAT(mmapped_file.region(), NotNull());
+ ASSERT_THAT(mmapped_file.mutable_region(), NotNull());
+ ASSERT_THAT(mmapped_file.file_offset(), Eq(0));
+ ASSERT_THAT(mmapped_file.region_size(), Eq(page_size));
+ ASSERT_THAT(mmapped_file.available_size(), Eq(page_size));
+ mmapped_file.mutable_region()[page_size - 1] = 'a';
+
+ const char* original_region = mmapped_file.region();
+
+ // 2. Call GrowAndRemapIfNecessary with a different offset and a greater
+ // mmap_size. This tests the case where file growth succeeds but the remap
+ // (RemapImpl) fails.
+ // To make RemapImpl fail, mock OpenForWrite to fail. Note that
+ // OpenForAppend is used when growing the file, so making OpenForWrite fail
+ // doesn't affect file growth.
+ ON_CALL(*mock_filesystem, OpenForWrite(_)).WillByDefault(Return(-1));
+ EXPECT_THAT(
+ mmapped_file.GrowAndRemapIfNecessary(/*new_file_offset=*/1,
+ /*new_mmap_size=*/page_size * 2 - 1),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+
+ // 3. Verify the result. The file size should have grown, but since the
+ // remap failed, mmap-related fields should remain unchanged.
+ EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), Eq(page_size * 2));
+ EXPECT_THAT(mmapped_file.region(), Eq(original_region));
+ EXPECT_THAT(mmapped_file.mutable_region(), Eq(original_region));
+ EXPECT_THAT(mmapped_file.file_offset(), Eq(0));
+ EXPECT_THAT(mmapped_file.region_size(), Eq(page_size));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(page_size));
+ // We should still be able to get the correct content via region.
+ EXPECT_THAT(mmapped_file.region()[page_size - 1], Eq('a'));
+}
+
+TEST_F(MemoryMappedFileTest, BadFileSizeDuringGrowReturnsError) {
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ int page_size = MemoryMappedFile::system_page_size();
+ int max_file_size = page_size * 10;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(*mock_filesystem, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ max_file_size,
+ /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/page_size));
+ ICING_ASSERT_OK(
+ mmapped_file.GrowAndRemapIfNecessary(/*new_file_offset=*/0,
+ /*new_mmap_size=*/page_size));
+ ASSERT_THAT(filesystem_.GetFileSize(file_path_.c_str()), Eq(page_size));
+ ASSERT_THAT(mmapped_file.region(), NotNull());
+ ASSERT_THAT(mmapped_file.mutable_region(), NotNull());
+ ASSERT_THAT(mmapped_file.file_offset(), Eq(0));
+ ASSERT_THAT(mmapped_file.region_size(), Eq(page_size));
+ ASSERT_THAT(mmapped_file.available_size(), Eq(page_size));
+ mmapped_file.mutable_region()[page_size - 1] = 'a';
+
+ const char* original_region = mmapped_file.region();
+
+ // Calling GrowAndRemapIfNecessary with a larger size will cause file
+ // growth. During file growth, we attempt to sync the underlying file size
+ // via GetFileSize to see if growing is actually necessary. Mock GetFileSize
+ // to return an error.
+ ON_CALL(*mock_filesystem, GetFileSize(A<const char*>()))
+ .WillByDefault(Return(Filesystem::kBadFileSize));
+
+ // We should fail gracefully and return an INTERNAL error to indicate that
+ // there was an issue retrieving the file size. The underlying file size and
+ // mmap info should remain unchanged.
+ EXPECT_THAT(
+ mmapped_file.GrowAndRemapIfNecessary(/*new_file_offset=*/0,
+ /*new_mmap_size=*/page_size * 2),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), Eq(page_size));
+ EXPECT_THAT(mmapped_file.region(), Eq(original_region));
+ EXPECT_THAT(mmapped_file.mutable_region(), Eq(original_region));
+ EXPECT_THAT(mmapped_file.file_offset(), Eq(0));
+ EXPECT_THAT(mmapped_file.region_size(), Eq(page_size));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(page_size));
+ // We should still be able to get the correct content via region.
+ EXPECT_THAT(mmapped_file.region()[page_size - 1], Eq('a'));
+}
+
+TEST_F(MemoryMappedFileTest, WriteSucceedsPartiallyAndFailsDuringGrow) {
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ int page_size = MemoryMappedFile::system_page_size();
+ int max_file_size = page_size * 10;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(*mock_filesystem, file_path_,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ max_file_size,
+ /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/max_file_size));
+
+ // 1. Initially the underlying file size is 0. When calling
+ // GrowAndRemapIfNecessary the first time with new_mmap_size = page_size *
+ // 2, Write() should be called twice, and each call should grow the
+ // underlying file by page_size bytes.
+ // Mock the 2nd Write() to write partially (1 byte) and then fail, so the
+ // file will only be grown by page_size + 1 bytes in total.
+ auto write_lambda = [this](int fd, const void* data,
+ size_t data_size) -> bool {
+ EXPECT_THAT(data_size, Gt(1));
+ EXPECT_THAT(this->filesystem_.Write(fd, data, 1), Eq(1));
+ return false;
+ };
+ EXPECT_CALL(*mock_filesystem, Write(A<int>(), A<const void*>(), A<size_t>()))
+ .WillOnce(DoDefault())
+ .WillOnce(write_lambda);
+
+ // 2. Call GrowAndRemapIfNecessary and expect it to fail. The actual file
+ // size should be page_size + 1, but the (cached) file_size_ should be
+ // page_size, since the partially written byte of the 2nd Write() is not
+ // reflected in the cached value.
+ EXPECT_THAT(
+ mmapped_file.GrowAndRemapIfNecessary(/*new_file_offset=*/0,
+ /*new_mmap_size=*/page_size * 2),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), Eq(page_size + 1));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(page_size));
+
+ // 3. Call GrowAndRemapIfNecessary again with new_mmap_size = page_size + 1.
+ // Even though file_size_ only caches page_size and excludes the byte(s)
+ // partially written during the failed previous round of growth, the next
+ // round should sync the actual file size into file_size_ via a system call
+ // and skip Write(), since the actual file size is already large enough for
+ // the new mmap_size.
+ // Note: the WillOnce() expectations above ensure that Write() won't be
+ // called again.
+ ICING_EXPECT_OK(
+ mmapped_file.GrowAndRemapIfNecessary(/*new_file_offset=*/0,
+ /*new_mmap_size=*/page_size + 1));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(page_size + 1));
+
+ // 4. Call GrowAndRemapIfNecessary again with new_mmap_size = page_size * 2.
+ // Even though the current file size is page_size + 1, the next round of
+ // growth should automatically calibrate the file size back to a multiple of
+ // page_size instead of simply appending page_size bytes to the file.
+ EXPECT_CALL(*mock_filesystem, Write(A<int>(), A<const void*>(), A<size_t>()))
+ .WillOnce(DoDefault());
+ ICING_EXPECT_OK(
+ mmapped_file.GrowAndRemapIfNecessary(/*new_file_offset=*/0,
+ /*new_mmap_size=*/page_size * 2));
+ EXPECT_THAT(filesystem_.GetFileSize(file_path_.c_str()), Eq(page_size * 2));
+ EXPECT_THAT(mmapped_file.available_size(), Eq(page_size * 2));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/mock-filesystem.h b/icing/file/mock-filesystem.h
index b89295e..32817d4 100644
--- a/icing/file/mock-filesystem.h
+++ b/icing/file/mock-filesystem.h
@@ -44,6 +44,17 @@ class MockFilesystem : public Filesystem {
return real_filesystem_.DeleteDirectoryRecursively(dir_name);
});
+ ON_CALL(*this, CopyFile)
+ .WillByDefault([this](const char* src, const char* dst) {
+ return real_filesystem_.CopyFile(src, dst);
+ });
+
+ ON_CALL(*this, CopyDirectory)
+ .WillByDefault(
+ [this](const char* src, const char* dst, bool recursive) {
+ return real_filesystem_.CopyDirectory(src, dst, recursive);
+ });
+
ON_CALL(*this, FileExists).WillByDefault([this](const char* file_name) {
return real_filesystem_.FileExists(file_name);
});
@@ -225,6 +236,11 @@ class MockFilesystem : public Filesystem {
MOCK_METHOD(bool, DeleteDirectoryRecursively, (const char* dir_name),
(const));
+ MOCK_METHOD(bool, CopyFile, (const char* src, const char* dst), (const));
+
+ MOCK_METHOD(bool, CopyDirectory,
+ (const char* src, const char* dst, bool recursive), (const));
+
MOCK_METHOD(bool, FileExists, (const char* file_name), (const));
MOCK_METHOD(bool, DirectoryExists, (const char* dir_name), (const));
diff --git a/icing/file/persistent-hash-map.cc b/icing/file/persistent-hash-map.cc
new file mode 100644
index 0000000..6936c45
--- /dev/null
+++ b/icing/file/persistent-hash-map.cc
@@ -0,0 +1,750 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/persistent-hash-map.h"
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/util/crc32.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Helper function to check that the key contains no termination character
+// '\0'.
+libtextclassifier3::Status ValidateKey(std::string_view key) {
+ if (key.find('\0') != std::string_view::npos) { // NOLINT
+ return absl_ports::InvalidArgumentError(
+ "Key cannot contain termination character '\\0'");
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+// Helper function to hash the key to a bucket index.
+//
+// Returns:
+// int32_t: A valid bucket index with range [0, num_buckets - 1].
+// INTERNAL_ERROR if num_buckets == 0
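+//
+// For example (illustrative numbers): with num_buckets = 8, a key hashing to
+// 27 lands in bucket 27 % 8 = 3.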
+libtextclassifier3::StatusOr<int32_t> HashKeyToBucketIndex(
+ std::string_view key, int32_t num_buckets) {
+ if (num_buckets == 0) {
+ return absl_ports::InternalError("Should not have empty bucket");
+ }
+ return static_cast<int32_t>(std::hash<std::string_view>()(key) % num_buckets);
+}
+
+// The following 4 methods are helper functions to get the correct path of
+// metadata/bucket/entry/key-value storages, according to the given working
+// directory path.
+std::string GetMetadataFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
+ ".m");
+}
+
+std::string GetBucketStorageFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
+ ".b");
+}
+
+std::string GetEntryStorageFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
+ ".e");
+}
+
+std::string GetKeyValueStorageFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/", PersistentHashMap::kFilePrefix,
+ ".k");
+}
+
+// Calculates how many buckets we need given num_entries and
+// max_load_factor_percent, rounding the result up to a power of 2.
+//
+// REQUIRES: 0 < num_entries <= Entry::kMaxNumEntries &&
+// max_load_factor_percent > 0
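+//
+// For example (illustrative numbers): num_entries = 1000 and
+// max_load_factor_percent = 100 give ceil(1000 * 100 / 100) = 1000, which is
+// not a power of 2, so the result is rounded up to 1 << 10 = 1024.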
+int32_t CalculateNumBucketsRequired(int32_t num_entries,
+ int32_t max_load_factor_percent) {
+ // Calculate ceil(num_entries * 100 / max_load_factor_percent)
+ int32_t num_entries_100 = num_entries * 100;
+ int32_t num_buckets_required =
+ num_entries_100 / max_load_factor_percent +
+ (num_entries_100 % max_load_factor_percent == 0 ? 0 : 1);
+ if ((num_buckets_required & (num_buckets_required - 1)) != 0) {
+ // Not a power of 2: round up to the next power of 2.
+ return 1 << (32 - __builtin_clz(num_buckets_required));
+ }
+ return num_buckets_required;
+}
+
+} // namespace
+
+bool PersistentHashMap::Options::IsValid() const {
+ if (!(value_type_size > 0 && value_type_size <= kMaxValueTypeSize &&
+ max_num_entries > 0 && max_num_entries <= Entry::kMaxNumEntries &&
+ max_load_factor_percent > 0 && average_kv_byte_size > 0 &&
+ init_num_buckets > 0 && init_num_buckets <= Bucket::kMaxNumBuckets)) {
+ return false;
+ }
+
+ // We've ensured (static_assert) that storing kMaxNumBuckets buckets won't
+ // exceed FileBackedVector::kMaxFileSize, so only need to verify # of buckets
+ // required won't exceed kMaxNumBuckets.
+ if (CalculateNumBucketsRequired(max_num_entries, max_load_factor_percent) >
+ Bucket::kMaxNumBuckets) {
+ return false;
+ }
+
+ // Verify # of key value pairs can fit into kv_storage.
+ if (average_kv_byte_size > kMaxKVTotalByteSize / max_num_entries) {
+ return false;
+ }
+
+ // Verify init_num_buckets is 2's power. Requiring init_num_buckets to be 2^n
+ // guarantees that num_buckets will eventually grow to be exactly
+ // max_num_buckets since CalculateNumBucketsRequired rounds it up to 2^n.
+ if ((init_num_buckets & (init_num_buckets - 1)) != 0) {
+ return false;
+ }
+
+ return true;
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+PersistentHashMap::Create(const Filesystem& filesystem,
+ std::string working_path, Options options) {
+ if (!options.IsValid()) {
+ return absl_ports::InvalidArgumentError(
+ "Invalid PersistentHashMap options");
+ }
+
+ if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) ||
+ !filesystem.FileExists(GetBucketStorageFilePath(working_path).c_str()) ||
+ !filesystem.FileExists(GetEntryStorageFilePath(working_path).c_str()) ||
+ !filesystem.FileExists(
+ GetKeyValueStorageFilePath(working_path).c_str())) {
+ // Discard working_path if any of these files is missing, and reinitialize.
+ if (filesystem.DirectoryExists(working_path.c_str())) {
+ ICING_RETURN_IF_ERROR(Discard(filesystem, working_path));
+ }
+ return InitializeNewFiles(filesystem, std::move(working_path),
+ std::move(options));
+ }
+ return InitializeExistingFiles(filesystem, std::move(working_path),
+ std::move(options));
+}
+
+PersistentHashMap::~PersistentHashMap() {
+ if (!PersistToDisk().ok()) {
+ ICING_LOG(WARNING)
+ << "Failed to persist hash map to disk while destructing "
+ << working_path_;
+ }
+}
+
+libtextclassifier3::Status PersistentHashMap::Put(std::string_view key,
+ const void* value) {
+ SetDirty();
+
+ ICING_RETURN_IF_ERROR(ValidateKey(key));
+ ICING_ASSIGN_OR_RETURN(
+ int32_t bucket_idx,
+ HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
+
+ ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
+ FindEntryIndexByKey(bucket_idx, key));
+ if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
+ // If not found, then insert new key value pair.
+ return Insert(bucket_idx, key, value);
+ }
+
+ // Otherwise, overwrite the value.
+ ICING_ASSIGN_OR_RETURN(const Entry* entry,
+ entry_storage_->Get(idx_pair.target_entry_index));
+
+ int32_t kv_len = key.length() + 1 + info().value_type_size;
+ int32_t value_offset = key.length() + 1;
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<char>::MutableArrayView mutable_kv_arr,
+ kv_storage_->GetMutable(entry->key_value_index(), kv_len));
+ // It is the same key and value_size is fixed, so we can directly overwrite
+ // serialized value.
+ mutable_kv_arr.SetArray(value_offset, reinterpret_cast<const char*>(value),
+ info().value_type_size);
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status PersistentHashMap::GetOrPut(std::string_view key,
+ void* next_value) {
+ ICING_RETURN_IF_ERROR(ValidateKey(key));
+ ICING_ASSIGN_OR_RETURN(
+ int32_t bucket_idx,
+ HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
+
+ ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
+ FindEntryIndexByKey(bucket_idx, key));
+ if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
+ // If not found, then insert new key value pair.
+ SetDirty();
+ return Insert(bucket_idx, key, next_value);
+ }
+
+ // Otherwise, copy the hash map value into next_value.
+ return CopyEntryValue(idx_pair.target_entry_index, next_value);
+}
+
+libtextclassifier3::Status PersistentHashMap::Get(std::string_view key,
+ void* value) const {
+ ICING_RETURN_IF_ERROR(ValidateKey(key));
+ ICING_ASSIGN_OR_RETURN(
+ int32_t bucket_idx,
+ HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
+
+ ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
+ FindEntryIndexByKey(bucket_idx, key));
+ if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ "Key not found in PersistentHashMap ", working_path_));
+ }
+
+ return CopyEntryValue(idx_pair.target_entry_index, value);
+}
+
+libtextclassifier3::Status PersistentHashMap::Delete(std::string_view key) {
+ SetDirty();
+
+ ICING_RETURN_IF_ERROR(ValidateKey(key));
+ ICING_ASSIGN_OR_RETURN(
+ int32_t bucket_idx,
+ HashKeyToBucketIndex(key, bucket_storage_->num_elements()));
+
+ ICING_ASSIGN_OR_RETURN(EntryIndexPair idx_pair,
+ FindEntryIndexByKey(bucket_idx, key));
+ if (idx_pair.target_entry_index == Entry::kInvalidIndex) {
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ "Key not found in PersistentHashMap ", working_path_));
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<Entry>::MutableView mutable_target_entry,
+ entry_storage_->GetMutable(idx_pair.target_entry_index));
+ if (idx_pair.prev_entry_index == Entry::kInvalidIndex) {
+ // If prev_entry_idx is Entry::kInvalidIndex, then target_entry must be the
+ // head element of the entry linked list, and we have to update
+ // bucket->head_entry_index_.
+ //
+ // Before: target_entry (head) -> next_entry -> ...
+ // After: next_entry (head) -> ...
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<Bucket>::MutableView mutable_bucket,
+ bucket_storage_->GetMutable(bucket_idx));
+ if (mutable_bucket.Get().head_entry_index() !=
+ idx_pair.target_entry_index) {
+ return absl_ports::InternalError(
+ "Bucket head entry index is inconsistent with the actual entry linked"
+ "list head. This shouldn't happen");
+ }
+ mutable_bucket.Get().set_head_entry_index(
+ mutable_target_entry.Get().next_entry_index());
+ } else {
+ // Otherwise, connect prev_entry and next_entry, to remove target_entry from
+ // the entry linked list.
+ //
+ // Before: ... -> prev_entry -> target_entry -> next_entry -> ...
+ // After: ... -> prev_entry -> next_entry -> ...
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<Entry>::MutableView mutable_prev_entry,
+ entry_storage_->GetMutable(idx_pair.prev_entry_index));
+ mutable_prev_entry.Get().set_next_entry_index(
+ mutable_target_entry.Get().next_entry_index());
+ }
+
+ // Zero out the key value bytes. This is necessary for the iterator to
+ // iterate through kv_storage and handle deleted keys properly.
+ int32_t kv_len = key.length() + 1 + info().value_type_size;
+ ICING_RETURN_IF_ERROR(kv_storage_->Set(
+ mutable_target_entry.Get().key_value_index(), kv_len, '\0'));
+
+ // Invalidate target_entry
+ mutable_target_entry.Get().set_key_value_index(kInvalidKVIndex);
+ mutable_target_entry.Get().set_next_entry_index(Entry::kInvalidIndex);
+
+ ++(info().num_deleted_entries);
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetDiskUsage() const {
+ ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_disk_usage,
+ bucket_storage_->GetDiskUsage());
+ ICING_ASSIGN_OR_RETURN(int64_t entry_storage_disk_usage,
+ entry_storage_->GetDiskUsage());
+ ICING_ASSIGN_OR_RETURN(int64_t kv_storage_disk_usage,
+ kv_storage_->GetDiskUsage());
+
+ int64_t total = bucket_storage_disk_usage + entry_storage_disk_usage +
+ kv_storage_disk_usage;
+ Filesystem::IncrementByOrSetInvalid(
+ filesystem_.GetDiskUsage(GetMetadataFilePath(working_path_).c_str()),
+ &total);
+
+ if (total < 0 || total == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ "Failed to get disk usage of PersistentHashMap");
+ }
+ return total;
+}
+
+libtextclassifier3::StatusOr<int64_t> PersistentHashMap::GetElementsSize()
+ const {
+ ICING_ASSIGN_OR_RETURN(int64_t bucket_storage_elements_size,
+ bucket_storage_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(int64_t entry_storage_elements_size,
+ entry_storage_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(int64_t kv_storage_elements_size,
+ kv_storage_->GetElementsFileSize());
+ return bucket_storage_elements_size + entry_storage_elements_size +
+ kv_storage_elements_size;
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+PersistentHashMap::InitializeNewFiles(const Filesystem& filesystem,
+ std::string&& working_path,
+ Options&& options) {
+ // PersistentHashMap uses working_path as working directory path.
+ // Create working directory.
+ if (!filesystem.CreateDirectory(working_path.c_str())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to create directory: ", working_path));
+ }
+
+ int32_t max_num_buckets_required =
+ std::max(options.init_num_buckets,
+ CalculateNumBucketsRequired(options.max_num_entries,
+ options.max_load_factor_percent));
+
+ // Initialize bucket_storage
+ int32_t pre_mapping_mmap_size = sizeof(Bucket) * max_num_buckets_required;
+ int32_t max_file_size =
+ pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
+ FileBackedVector<Bucket>::Create(
+ filesystem, GetBucketStorageFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
+ options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
+
+ // Initialize entry_storage
+ pre_mapping_mmap_size = sizeof(Entry) * options.max_num_entries;
+ max_file_size =
+ pre_mapping_mmap_size + FileBackedVector<Entry>::Header::kHeaderSize;
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage,
+ FileBackedVector<Entry>::Create(
+ filesystem, GetEntryStorageFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
+ options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
+
+ // Initialize kv_storage
+ pre_mapping_mmap_size =
+ options.average_kv_byte_size * options.max_num_entries;
+ max_file_size =
+ pre_mapping_mmap_size + FileBackedVector<char>::Header::kHeaderSize;
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<char>> kv_storage,
+ FileBackedVector<char>::Create(
+ filesystem, GetKeyValueStorageFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
+ options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
+
+ // Initialize buckets.
+ ICING_RETURN_IF_ERROR(bucket_storage->Set(
+ /*idx=*/0, /*len=*/options.init_num_buckets, Bucket()));
+ ICING_RETURN_IF_ERROR(bucket_storage->PersistToDisk());
+
+ // Initialize metadata file. Create MemoryMappedFile with pre-mapping, and
+ // call GrowAndRemapIfNecessary to grow the underlying file.
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile metadata_mmapped_file,
+ MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/kMetadataFileSize,
+ /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/kMetadataFileSize));
+ ICING_RETURN_IF_ERROR(metadata_mmapped_file.GrowAndRemapIfNecessary(
+ /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
+
+ // Create instance.
+ auto new_persistent_hash_map =
+ std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
+ filesystem, std::move(working_path), std::move(options),
+ std::move(metadata_mmapped_file), std::move(bucket_storage),
+ std::move(entry_storage), std::move(kv_storage)));
+ // Initialize info content by writing mapped memory directly.
+ Info& info_ref = new_persistent_hash_map->info();
+ info_ref.magic = Info::kMagic;
+ info_ref.value_type_size = new_persistent_hash_map->options_.value_type_size;
+ info_ref.max_load_factor_percent =
+ new_persistent_hash_map->options_.max_load_factor_percent;
+ info_ref.num_deleted_entries = 0;
+ info_ref.num_deleted_key_value_bytes = 0;
+ // Initialize new PersistentStorage. The initial checksums will be computed
+ // and set via InitializeNewStorage.
+ ICING_RETURN_IF_ERROR(new_persistent_hash_map->InitializeNewStorage());
+
+ return new_persistent_hash_map;
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+PersistentHashMap::InitializeExistingFiles(const Filesystem& filesystem,
+ std::string&& working_path,
+ Options&& options) {
+ // Initialize metadata file
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile metadata_mmapped_file,
+ MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/kMetadataFileSize,
+ /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/kMetadataFileSize));
+ if (metadata_mmapped_file.available_size() != kMetadataFileSize) {
+ return absl_ports::FailedPreconditionError("Incorrect metadata file size");
+ }
+
+ int32_t max_num_buckets_required = CalculateNumBucketsRequired(
+ options.max_num_entries, options.max_load_factor_percent);
+
+ // Initialize bucket_storage
+ int32_t pre_mapping_mmap_size = sizeof(Bucket) * max_num_buckets_required;
+ int32_t max_file_size =
+ pre_mapping_mmap_size + FileBackedVector<Bucket>::Header::kHeaderSize;
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
+ FileBackedVector<Bucket>::Create(
+ filesystem, GetBucketStorageFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
+ options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
+
+ // Initialize entry_storage
+ pre_mapping_mmap_size = sizeof(Entry) * options.max_num_entries;
+ max_file_size =
+ pre_mapping_mmap_size + FileBackedVector<Entry>::Header::kHeaderSize;
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage,
+ FileBackedVector<Entry>::Create(
+ filesystem, GetEntryStorageFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
+ options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
+
+ // Initialize kv_storage
+ pre_mapping_mmap_size =
+ options.average_kv_byte_size * options.max_num_entries;
+ max_file_size =
+ pre_mapping_mmap_size + FileBackedVector<char>::Header::kHeaderSize;
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<char>> kv_storage,
+ FileBackedVector<char>::Create(
+ filesystem, GetKeyValueStorageFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, max_file_size,
+ options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
+
+ // Create instance.
+ auto persistent_hash_map =
+ std::unique_ptr<PersistentHashMap>(new PersistentHashMap(
+ filesystem, std::move(working_path), std::move(options),
+ std::move(metadata_mmapped_file), std::move(bucket_storage),
+ std::move(entry_storage), std::move(kv_storage)));
+ // Initialize existing PersistentStorage. Checksums will be validated.
+ ICING_RETURN_IF_ERROR(persistent_hash_map->InitializeExistingStorage());
+
+ // Validate other values of info and options.
+ // The current # of entries should not exceed options_.max_num_entries.
+ // We compute max_file_size of the 3 storages from options_.max_num_entries.
+ // Since we don't recycle the space of deleted entries (and key-value
+ // bytes), they still occupy space in the storages. Even if the # of
+ // "active" entries doesn't exceed options_.max_num_entries, a new key-value
+ // pair to be inserted could still exceed max_file_size.
+ // Therefore, we should use entry_storage_->num_elements() instead of the #
+ // of "active" entries
+ // (i.e. entry_storage_->num_elements() - info().num_deleted_entries) for
+ // this check. This prevents the storages from growing extremely large when
+ // there are many Delete() and Put() operations.
+ if (persistent_hash_map->entry_storage_->num_elements() >
+ persistent_hash_map->options_.max_num_entries) {
+ return absl_ports::FailedPreconditionError(
+ "Current # of entries exceeds max num entries");
+ }
+
+ // Magic should be the same.
+ if (persistent_hash_map->info().magic != Info::kMagic) {
+ return absl_ports::FailedPreconditionError(
+ "PersistentHashMap header magic mismatch");
+ }
+
+ // Value type size should be consistent.
+ if (persistent_hash_map->options_.value_type_size !=
+ persistent_hash_map->info().value_type_size) {
+ return absl_ports::FailedPreconditionError("Incorrect value type size");
+ }
+
+ // Allow max_load_factor_percent to change.
+ if (persistent_hash_map->options_.max_load_factor_percent !=
+ persistent_hash_map->info().max_load_factor_percent) {
+ ICING_VLOG(2) << "Changing max_load_factor_percent from "
+ << persistent_hash_map->info().max_load_factor_percent
+ << " to "
+ << persistent_hash_map->options_.max_load_factor_percent;
+
+ persistent_hash_map->SetInfoDirty();
+ persistent_hash_map->info().max_load_factor_percent =
+ persistent_hash_map->options_.max_load_factor_percent;
+ ICING_RETURN_IF_ERROR(
+ persistent_hash_map->RehashIfNecessary(/*force_rehash=*/false));
+
+ ICING_RETURN_IF_ERROR(persistent_hash_map->PersistToDisk());
+ }
+
+ return persistent_hash_map;
+}
+
+libtextclassifier3::Status PersistentHashMap::PersistStoragesToDisk(
+ bool force) {
+ if (!force && !is_storage_dirty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ ICING_RETURN_IF_ERROR(bucket_storage_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(entry_storage_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(kv_storage_->PersistToDisk());
+ is_storage_dirty_ = false;
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status PersistentHashMap::PersistMetadataToDisk(
+ bool force) {
+ // We can skip persisting metadata to disk only if both info and storage are
+ // clean.
+ if (!force && !is_info_dirty() && !is_storage_dirty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ // Changes should have been applied to the underlying file when using
+ // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an
+ // extra safety step to ensure they are written out.
+ ICING_RETURN_IF_ERROR(metadata_mmapped_file_->PersistToDisk());
+ is_info_dirty_ = false;
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeInfoChecksum(
+ bool force) {
+ if (!force && !is_info_dirty()) {
+ return Crc32(crcs().component_crcs.info_crc);
+ }
+
+ return info().ComputeChecksum();
+}
+
+libtextclassifier3::StatusOr<Crc32> PersistentHashMap::ComputeStoragesChecksum(
+ bool force) {
+ if (!force && !is_storage_dirty()) {
+ return Crc32(crcs().component_crcs.storages_crc);
+ }
+
+ // Compute crcs
+ ICING_ASSIGN_OR_RETURN(Crc32 bucket_storage_crc,
+ bucket_storage_->ComputeChecksum());
+ ICING_ASSIGN_OR_RETURN(Crc32 entry_storage_crc,
+ entry_storage_->ComputeChecksum());
+ ICING_ASSIGN_OR_RETURN(Crc32 kv_storage_crc, kv_storage_->ComputeChecksum());
+
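+ // Note: XOR-combining keeps the combined checksum independent of the
+ // combination order, and a change in any single storage's crc still changes
+ // the combined result.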
+ return Crc32(bucket_storage_crc.Get() ^ entry_storage_crc.Get() ^
+ kv_storage_crc.Get());
+}
+
+libtextclassifier3::StatusOr<PersistentHashMap::EntryIndexPair>
+PersistentHashMap::FindEntryIndexByKey(int32_t bucket_idx,
+ std::string_view key) const {
+ // Iterate through all entries in the bucket, compare keys, and return the
+ // entry index if found.
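+ // Bucket chaining (for reference):
+ //   bucket.head_entry_index -> entry -> entry -> ... -> Entry::kInvalidIndex
+ // where each entry's key_value_index points at its pair in kv_storage.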
+ ICING_ASSIGN_OR_RETURN(const Bucket* bucket,
+ bucket_storage_->Get(bucket_idx));
+
+ int32_t prev_entry_idx = Entry::kInvalidIndex;
+ int32_t curr_entry_idx = bucket->head_entry_index();
+ while (curr_entry_idx != Entry::kInvalidIndex) {
+ ICING_ASSIGN_OR_RETURN(const Entry* entry,
+ entry_storage_->Get(curr_entry_idx));
+ if (entry->key_value_index() == kInvalidKVIndex) {
+ ICING_LOG(ERROR) << "Got an invalid key value index in the persistent "
+ "hash map bucket. This shouldn't happen";
+ return absl_ports::InternalError("Unexpected invalid key value index");
+ }
+ ICING_ASSIGN_OR_RETURN(const char* kv_arr,
+ kv_storage_->Get(entry->key_value_index()));
+ if (key.compare(kv_arr) == 0) {
+ return EntryIndexPair(curr_entry_idx, prev_entry_idx);
+ }
+
+ prev_entry_idx = curr_entry_idx;
+ curr_entry_idx = entry->next_entry_index();
+ }
+
+ return EntryIndexPair(curr_entry_idx, prev_entry_idx);
+}
+
+libtextclassifier3::Status PersistentHashMap::CopyEntryValue(
+ int32_t entry_idx, void* value) const {
+ ICING_ASSIGN_OR_RETURN(const Entry* entry, entry_storage_->Get(entry_idx));
+
+ ICING_ASSIGN_OR_RETURN(const char* kv_arr,
+ kv_storage_->Get(entry->key_value_index()));
+ int32_t value_offset = strlen(kv_arr) + 1;
+ memcpy(value, kv_arr + value_offset, info().value_type_size);
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status PersistentHashMap::Insert(int32_t bucket_idx,
+ std::string_view key,
+ const void* value) {
+ SetDirty();
+
+ // If entry_storage_->num_elements() + 1 exceeds options_.max_num_entries,
+ // then return an error.
+ // We compute max_file_size of the 3 storages from options_.max_num_entries.
+ // Since we don't recycle the space of deleted entries (and key-value
+ // bytes), they still occupy space in the storages. Even if the # of
+ // "active" entries (i.e. size()) doesn't exceed options_.max_num_entries, a
+ // new key-value pair to be inserted could still exceed max_file_size.
+ // Therefore, we should use entry_storage_->num_elements() instead of size()
+ // for this check. This prevents the storages from growing extremely large
+ // when there are many Delete() and Put() operations.
+ if (entry_storage_->num_elements() > options_.max_num_entries - 1) {
+ return absl_ports::ResourceExhaustedError("Cannot insert new entry");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<Bucket>::MutableView mutable_bucket,
+ bucket_storage_->GetMutable(bucket_idx));
+
+ // Append new key value.
+ int32_t new_kv_idx = kv_storage_->num_elements();
+ int32_t kv_len = key.size() + 1 + info().value_type_size;
+ int32_t value_offset = key.size() + 1;
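+ // Serialized layout of each key value pair in kv_storage (for reference):
+ //   [key bytes]['\0' terminator][value bytes, info().value_type_size long]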
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<char>::MutableArrayView mutable_new_kv_arr,
+ kv_storage_->Allocate(kv_len));
+ mutable_new_kv_arr.SetArray(/*idx=*/0, key.data(), key.size());
+ mutable_new_kv_arr.SetArray(/*idx=*/key.size(), "\0", 1);
+ mutable_new_kv_arr.SetArray(/*idx=*/value_offset,
+ reinterpret_cast<const char*>(value),
+ info().value_type_size);
+
+ // Append new entry.
+ int32_t new_entry_idx = entry_storage_->num_elements();
+ ICING_RETURN_IF_ERROR(entry_storage_->Append(
+ Entry(new_kv_idx, mutable_bucket.Get().head_entry_index())));
+ mutable_bucket.Get().set_head_entry_index(new_entry_idx);
+
+ return RehashIfNecessary(/*force_rehash=*/false);
+}
+
+libtextclassifier3::Status PersistentHashMap::RehashIfNecessary(
+ bool force_rehash) {
+ int32_t new_num_bucket = bucket_storage_->num_elements();
+ while (new_num_bucket <= Bucket::kMaxNumBuckets / 2 &&
+ size() > static_cast<int64_t>(new_num_bucket) *
+ info().max_load_factor_percent / 100) {
+ new_num_bucket *= 2;
+ }
+
+ if (!force_rehash && new_num_bucket == bucket_storage_->num_elements()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ SetDirty();
+
+ // Resize and reset buckets.
+ ICING_RETURN_IF_ERROR(
+ bucket_storage_->Set(0, new_num_bucket, Bucket(Entry::kInvalidIndex)));
+
+ // Iterate all key value pairs in kv_storage, rehash and insert.
+ Iterator iter = GetIterator();
+ int32_t entry_idx = 0;
+ while (iter.Advance()) {
+ ICING_ASSIGN_OR_RETURN(int32_t bucket_idx,
+ HashKeyToBucketIndex(iter.GetKey(), new_num_bucket));
+ ICING_ASSIGN_OR_RETURN(FileBackedVector<Bucket>::MutableView mutable_bucket,
+ bucket_storage_->GetMutable(bucket_idx));
+
+ // Update entry and bucket.
+ ICING_RETURN_IF_ERROR(entry_storage_->Set(
+ entry_idx,
+ Entry(iter.GetIndex(), mutable_bucket.Get().head_entry_index())));
+ mutable_bucket.Get().set_head_entry_index(entry_idx);
+
+ ++entry_idx;
+ }
+
+ // Since there may have been deleted entries, after rehashing the # of
+ // vector elements in entry_storage_ may be greater than the actual # of
+ // entries. Therefore, we have to truncate entry_storage_ to the correct
+ // size.
+ if (entry_idx < entry_storage_->num_elements()) {
+ ICING_RETURN_IF_ERROR(entry_storage_->TruncateTo(entry_idx));
+ }
+
+ info().num_deleted_entries = 0;
+
+ return libtextclassifier3::Status::OK;
+}
+
+bool PersistentHashMap::Iterator::Advance() {
+ // Jump over the current key value pair before advancing to the next valid
+ // key value pair. In the first round (after construction), curr_key_len_
+ // is 0, so don't jump over anything.
+ if (curr_key_len_ != 0) {
+ curr_kv_idx_ += curr_key_len_ + 1 + map_->info().value_type_size;
+ curr_key_len_ = 0;
+ }
+
+ // By skipping null chars, we will be automatically handling deleted entries
+ // (which are zeroed out during deletion).
+ for (const char* curr_kv_ptr = map_->kv_storage_->array() + curr_kv_idx_;
+ curr_kv_idx_ < map_->kv_storage_->num_elements();
+ ++curr_kv_ptr, ++curr_kv_idx_) {
+ if (*curr_kv_ptr != '\0') {
+ curr_key_len_ = strlen(curr_kv_ptr);
+ return true;
+ }
+ }
+ return false;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/persistent-hash-map.h b/icing/file/persistent-hash-map.h
new file mode 100644
index 0000000..5f7999d
--- /dev/null
+++ b/icing/file/persistent-hash-map.h
@@ -0,0 +1,529 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_PERSISTENT_HASH_MAP_H_
+#define ICING_FILE_PERSISTENT_HASH_MAP_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// Low-level persistent hash map.
+// It supports variable-length serialized keys + fixed-length serialized
+// values. Key and value can be any type, but callers should serialize the
+// key/value themselves and pass the raw bytes into the hash map, and the
+// serialized key must not contain the termination character '\0'.
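+//
+// Example usage (an illustrative sketch; the working path, value type, and
+// surrounding error handling are made up for this comment):
+//
+//   PersistentHashMap::Options options(/*value_type_size_in=*/sizeof(int));
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<PersistentHashMap> hash_map,
+//       PersistentHashMap::Create(filesystem, "/path/to/dir", options));
+//   int value = 1;
+//   ICING_RETURN_IF_ERROR(hash_map->Put("some-key", &value));
+//   int read_value;
+//   ICING_RETURN_IF_ERROR(hash_map->Get("some-key", &read_value));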
+class PersistentHashMap : public PersistentStorage {
+ public:
+ // For iterating through persistent hash map. The order is not guaranteed.
+ //
+ // Not thread-safe.
+ //
+ // Any change to the underlying persistent hash map invalidates the iterator.
+ class Iterator {
+ public:
+ // Advance to the next entry.
+ //
+ // Returns:
+ // True on success, otherwise false.
+ bool Advance();
+
+ int32_t GetIndex() const { return curr_kv_idx_; }
+
+ // Get the key.
+ //
+ // REQUIRES: the preceding call to Advance() returned true.
+ std::string_view GetKey() const {
+ return std::string_view(map_->kv_storage_->array() + curr_kv_idx_,
+ curr_key_len_);
+ }
+
+ // Get the memory mapped address of the value.
+ //
+ // REQUIRES: the preceding call to Advance() returned true.
+ const void* GetValue() const {
+ return static_cast<const void*>(map_->kv_storage_->array() +
+ curr_kv_idx_ + curr_key_len_ + 1);
+ }
+
+ private:
+ explicit Iterator(const PersistentHashMap* map)
+ : map_(map), curr_kv_idx_(0), curr_key_len_(0) {}
+
+ // Does not own
+ const PersistentHashMap* map_;
+
+ int32_t curr_kv_idx_;
+ int32_t curr_key_len_;
+
+ friend class PersistentHashMap;
+ };
+
+ // Metadata file layout: <Crcs><Info>
+ static constexpr int32_t kCrcsMetadataFileOffset = 0;
+ static constexpr int32_t kInfoMetadataFileOffset =
+ static_cast<int32_t>(sizeof(Crcs));
+
+ struct Info {
+ static constexpr int32_t kMagic = 0x653afd7b;
+
+ int32_t magic;
+ int32_t value_type_size;
+ int32_t max_load_factor_percent;
+ int32_t num_deleted_entries;
+ int32_t num_deleted_key_value_bytes;
+
+ Crc32 ComputeChecksum() const {
+ return Crc32(
+ std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
+ }
+ } __attribute__((packed));
+ static_assert(sizeof(Info) == 20, "");
+
+ static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
+ static_assert(kMetadataFileSize == 32, "");
+
+ // Bucket: stores the index of the head entry of its entry chain.
+ class Bucket {
+ public:
+ // Absolute max # of buckets allowed. Since we're using FileBackedVector to
+ // store buckets, add some static_asserts to ensure numbers here are
+ // compatible with FileBackedVector.
+ static constexpr int32_t kMaxNumBuckets = 1 << 24;
+
+ explicit Bucket(int32_t head_entry_index = Entry::kInvalidIndex)
+ : head_entry_index_(head_entry_index) {}
+
+ // For FileBackedVector
+ bool operator==(const Bucket& other) const {
+ return head_entry_index_ == other.head_entry_index_;
+ }
+
+ int32_t head_entry_index() const { return head_entry_index_; }
+ void set_head_entry_index(int32_t head_entry_index) {
+ head_entry_index_ = head_entry_index;
+ }
+
+ private:
+ int32_t head_entry_index_;
+ } __attribute__((packed));
+ static_assert(sizeof(Bucket) == 4, "");
+ static_assert(sizeof(Bucket) == FileBackedVector<Bucket>::kElementTypeSize,
+ "Bucket type size is inconsistent with FileBackedVector "
+ "element type size");
+ static_assert(Bucket::kMaxNumBuckets <=
+ (FileBackedVector<Bucket>::kMaxFileSize -
+ FileBackedVector<Bucket>::Header::kHeaderSize) /
+ FileBackedVector<Bucket>::kElementTypeSize,
+ "Max # of buckets cannot fit into FileBackedVector");
+
+ // Entry: a node in a bucket's entry chain. It stores the index of its
+ // serialized key value pair in kv_storage and the index of the next entry.
+ class Entry {
+ public:
+ // Absolute max # of entries allowed. Since we're using FileBackedVector to
+ // store entries, add some static_asserts to ensure numbers here are
+ // compatible with FileBackedVector.
+ //
+ // Still, the actual max # of entries is determined by the key-value
+ // storage, since key lengths vary and affect the # of key-value pairs that
+ // can actually be stored.
+ static constexpr int32_t kMaxNumEntries = 1 << 23;
+ static constexpr int32_t kMaxIndex = kMaxNumEntries - 1;
+ static constexpr int32_t kInvalidIndex = -1;
+
+ explicit Entry(int32_t key_value_index, int32_t next_entry_index)
+ : key_value_index_(key_value_index),
+ next_entry_index_(next_entry_index) {}
+
+ bool operator==(const Entry& other) const {
+ return key_value_index_ == other.key_value_index_ &&
+ next_entry_index_ == other.next_entry_index_;
+ }
+
+ int32_t key_value_index() const { return key_value_index_; }
+ void set_key_value_index(int32_t key_value_index) {
+ key_value_index_ = key_value_index;
+ }
+
+ int32_t next_entry_index() const { return next_entry_index_; }
+ void set_next_entry_index(int32_t next_entry_index) {
+ next_entry_index_ = next_entry_index;
+ }
+
+ private:
+ int32_t key_value_index_;
+ int32_t next_entry_index_;
+ } __attribute__((packed));
+ static_assert(sizeof(Entry) == 8, "");
+ static_assert(sizeof(Entry) == FileBackedVector<Entry>::kElementTypeSize,
+ "Entry type size is inconsistent with FileBackedVector "
+ "element type size");
+ static_assert(Entry::kMaxNumEntries <=
+ (FileBackedVector<Entry>::kMaxFileSize -
+ FileBackedVector<Entry>::Header::kHeaderSize) /
+ FileBackedVector<Entry>::kElementTypeSize,
+ "Max # of entries cannot fit into FileBackedVector");
+
+ // Key-value serialized type
+ static constexpr int32_t kMaxKVTotalByteSize = 1 << 28;
+ static constexpr int32_t kMaxKVIndex = kMaxKVTotalByteSize - 1;
+ static constexpr int32_t kInvalidKVIndex = -1;
+ static_assert(sizeof(char) == FileBackedVector<char>::kElementTypeSize,
+ "Char type size is inconsistent with FileBackedVector element "
+ "type size");
+ static_assert(kMaxKVTotalByteSize <=
+ FileBackedVector<char>::kMaxFileSize -
+ FileBackedVector<char>::Header::kHeaderSize,
+ "Max total byte size of key value pairs cannot fit into "
+ "FileBackedVector");
+
+ static constexpr int32_t kMaxValueTypeSize = 1 << 10;
+
+ struct Options {
+ static constexpr int32_t kDefaultMaxLoadFactorPercent = 100;
+ static constexpr int32_t kDefaultAverageKVByteSize = 32;
+ static constexpr int32_t kDefaultInitNumBuckets = 1 << 13;
+
+ explicit Options(
+ int32_t value_type_size_in,
+ int32_t max_num_entries_in = Entry::kMaxNumEntries,
+ int32_t max_load_factor_percent_in = kDefaultMaxLoadFactorPercent,
+ int32_t average_kv_byte_size_in = kDefaultAverageKVByteSize,
+ int32_t init_num_buckets_in = kDefaultInitNumBuckets,
+ bool pre_mapping_fbv_in = false)
+ : value_type_size(value_type_size_in),
+ max_num_entries(max_num_entries_in),
+ max_load_factor_percent(max_load_factor_percent_in),
+ average_kv_byte_size(average_kv_byte_size_in),
+ init_num_buckets(init_num_buckets_in),
+ pre_mapping_fbv(pre_mapping_fbv_in) {}
+
+ bool IsValid() const;
+
+ // (fixed) size of the serialized value type for hash map.
+ int32_t value_type_size;
+
+ // Max # of entries, default Entry::kMaxNumEntries.
+ int32_t max_num_entries;
+
+ // Max load factor (as a percentage) for the hash map. If
+ // load_factor_percent exceeds max_load_factor_percent, then a rehash is
+ // invoked (and the # of buckets is doubled).
+ // load_factor_percent = 100 * num_keys / num_buckets
+ //
+ // Note that a load_factor_percent exceeding 100 is considered valid.
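+ // For example (illustrative numbers): with max_load_factor_percent = 100
+ // and 8 buckets, inserting the 9th key pushes the load factor above 100
+ // and doubles the bucket count to 16.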
+ int32_t max_load_factor_percent;
+
+ // Average byte size of a key value pair. It is used to estimate kv_storage_
+ // pre_mapping_mmap_size.
+ int32_t average_kv_byte_size;
+
+ // Initial # of buckets for the persistent hash map. It should be a power
+ // of 2. It is used when creating a new persistent hash map and ignored
+ // when creating the instance from existing files.
+ int32_t init_num_buckets;
+
+ // Flag indicating whether to memory-map the max possible file size for the
+ // underlying FileBackedVector before growing the actual file size.
+ bool pre_mapping_fbv;
+ };
+
+ static constexpr WorkingPathType kWorkingPathType =
+ WorkingPathType::kDirectory;
+ static constexpr std::string_view kFilePrefix = "persistent_hash_map";
+
+ // Creates a new PersistentHashMap to read/write/delete key value pairs.
+ //
+ // filesystem: Object to make system level calls
+ // working_path: Specifies the working path for PersistentStorage.
+ //               PersistentHashMap uses the working path as its working
+ //               directory, and all related files will be stored under this
+ //               directory. It takes full ownership of working_path_,
+ //               including creation/deletion. It is the caller's
+ //               responsibility to specify a correct working path and to
+ //               avoid mixing different persistent storages together under
+ //               the same path. Also, the caller owns the parent directory
+ //               of working_path_ and is responsible for its
+ //               creation/deletion. See PersistentStorage for more details
+ //               about the concept of working_path.
+ // options: Options instance.
+ //
+ // Returns:
+ // INVALID_ARGUMENT_ERROR if any value in options is invalid.
+ // FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
+ // checksum or any other inconsistency.
+ // INTERNAL_ERROR on I/O errors.
+ // Any FileBackedVector errors.
+ static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ Create(const Filesystem& filesystem, std::string working_path,
+ Options options);
+
+ // Deletes PersistentHashMap under working_path.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ static libtextclassifier3::Status Discard(const Filesystem& filesystem,
+ std::string working_path) {
+ return PersistentStorage::Discard(filesystem, working_path,
+ kWorkingPathType);
+ }
+
+ ~PersistentHashMap() override;
+
+ // Update a key value pair. If the key does not exist, then insert (key,
+ // value) into the storage. Otherwise, overwrite the existing value in the
+ // storage.
+ //
+ // REQUIRES: the buffer pointed to by value must be of value_size()
+ //
+ // Returns:
+ // OK on success
+ // RESOURCE_EXHAUSTED_ERROR if # of entries reach options_.max_num_entries
+ // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
+ // INTERNAL_ERROR on I/O error or any data inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::Status Put(std::string_view key, const void* value);
+
+ // If key does not exist, then insert (key, next_value) into the storage.
+ // Otherwise, copy the hash map value into next_value.
+ //
+ // REQUIRES: the buffer pointed to by next_value must be of value_size()
+ //
+ // Returns:
+ // OK on success
+ // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
+ // INTERNAL_ERROR on I/O error or any data inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::Status GetOrPut(std::string_view key, void* next_value);
+
+ // Get the value by key from the storage. If the key exists, then copy the
+ // hash map value into the value buffer. Otherwise, return NOT_FOUND_ERROR.
+ //
+ // REQUIRES: the buffer pointed to by value must be of value_size()
+ //
+ // Returns:
+ // OK on success
+ // NOT_FOUND_ERROR if the key doesn't exist
+ // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
+ // INTERNAL_ERROR on I/O error or any data inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::Status Get(std::string_view key, void* value) const;
+
+ // Delete the key value pair from the storage. If key doesn't exist, then do
+ // nothing and return NOT_FOUND_ERROR.
+ //
+ // Returns:
+ // OK on success
+ // NOT_FOUND_ERROR if the key doesn't exist
+ // INVALID_ARGUMENT_ERROR if the key is invalid (i.e. contains '\0')
+ // INTERNAL_ERROR on I/O error or any data inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::Status Delete(std::string_view key);
+
+ Iterator GetIterator() const { return Iterator(this); }
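+
+ // Example iteration (an illustrative sketch; iteration order is
+ // unspecified):
+ //
+ //   PersistentHashMap::Iterator itr = hash_map->GetIterator();
+ //   while (itr.Advance()) {
+ //     std::string_view key = itr.GetKey();
+ //     const void* value = itr.GetValue();
+ //   }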
+
+ // Calculates and returns the disk usage (metadata + 3 storages total file
+ // size) in bytes.
+ //
+ // Returns:
+ // Disk usage on success
+ // INTERNAL_ERROR on I/O error
+ libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+
+ // Returns the total file size of all the elements held in the persistent
+ // hash map. File size is in bytes. This excludes the size of any internal
+ // metadata, i.e. crcs/info of persistent hash map, file backed vector's
+ // header.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on I/O error
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;
+
+ int32_t size() const {
+ return entry_storage_->num_elements() - info().num_deleted_entries;
+ }
+
+ bool empty() const { return size() == 0; }
+
+ int32_t num_buckets() const { return bucket_storage_->num_elements(); }
+
+ private:
+ struct EntryIndexPair {
+ int32_t target_entry_index;
+ int32_t prev_entry_index;
+
+ explicit EntryIndexPair(int32_t target_entry_index_in,
+ int32_t prev_entry_index_in)
+ : target_entry_index(target_entry_index_in),
+ prev_entry_index(prev_entry_index_in) {}
+ };
+
+ explicit PersistentHashMap(
+ const Filesystem& filesystem, std::string&& working_path,
+ Options&& options, MemoryMappedFile&& metadata_mmapped_file,
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage,
+ std::unique_ptr<FileBackedVector<char>> kv_storage)
+ : PersistentStorage(filesystem, std::move(working_path),
+ kWorkingPathType),
+ options_(std::move(options)),
+ metadata_mmapped_file_(std::make_unique<MemoryMappedFile>(
+ std::move(metadata_mmapped_file))),
+ bucket_storage_(std::move(bucket_storage)),
+ entry_storage_(std::move(entry_storage)),
+ kv_storage_(std::move(kv_storage)),
+ is_info_dirty_(false),
+ is_storage_dirty_(false) {}
+
+ static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path,
+ Options&& options);
+
+ static libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ InitializeExistingFiles(const Filesystem& filesystem,
+ std::string&& working_path, Options&& options);
+
+ // Flushes contents of all storages to underlying files.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status PersistStoragesToDisk(bool force) override;
+
+ // Flushes contents of metadata file.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status PersistMetadataToDisk(bool force) override;
+
+ // Computes and returns Info checksum.
+ //
+ // Returns:
+ // - Crc of the Info on success
+ libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override;
+
+ // Computes and returns the combined checksum of all storages. Checksums of
+ // bucket_storage_, entry_storage_ and kv_storage_ are combined by XOR.
+ //
+ // Returns:
+ // - Crc of all storages on success
+ // - INTERNAL_ERROR if any data inconsistency
+ libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
+ bool force) override;
+
+ // Finds the index of the target entry (the one that contains the key) in a
+ // bucket (specified by bucket index). Also returns the previous entry
+ // index, since Delete() needs it to update the linked list and head entry
+ // index.
+ //
+ // Returns:
+ // std::pair<int32_t, int32_t>: target entry index and previous entry index
+ // on success. If not found, then target entry
+ // index will be Entry::kInvalidIndex
+ // INTERNAL_ERROR if any content inconsistency
+ // Any FileBackedVector errors
+ libtextclassifier3::StatusOr<EntryIndexPair> FindEntryIndexByKey(
+ int32_t bucket_idx, std::string_view key) const;
+
+  // Copies the hash map value of the entry into the value buffer.
+  //
+  // REQUIRES: entry_idx should be valid.
+  // REQUIRES: the buffer pointed to by value must be of size value_size()
+ //
+ // Returns:
+ // OK on success
+ // Any FileBackedVector errors
+ libtextclassifier3::Status CopyEntryValue(int32_t entry_idx,
+ void* value) const;
+
+  // Inserts a new key-value pair into a bucket (specified by the bucket
+  // index). The caller should specify the desired bucket index and make sure
+  // that the key is not present in the hash map before calling.
+ //
+ // Returns:
+ // OK on success
+ // Any FileBackedVector errors
+ libtextclassifier3::Status Insert(int32_t bucket_idx, std::string_view key,
+ const void* value);
+
+  // Rehash function: if force_rehash is true or the hash map's load factor
+  // exceeds max_load_factor_percent, all keys will be rehashed.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on I/O error or any data inconsistency
+ // Any FileBackedVector errors
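+  //
+  // Worked example: load factor percent is computed as
+  // 100 * num_entries / num_buckets (see the tests), so 150 entries across
+  // 128 buckets give 100 * 150 / 128 ≈ 117; with max_load_factor_percent =
+  // 100 that exceeds the limit and triggers a rehash.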
+ libtextclassifier3::Status RehashIfNecessary(bool force_rehash);
+
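+  // Metadata accessors: Crcs and Info live at fixed offsets
+  // (kCrcsMetadataFileOffset, kInfoMetadataFileOffset) inside the
+  // memory-mapped metadata file, so the accessors below simply reinterpret
+  // those regions.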
+ Crcs& crcs() override {
+ return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() +
+ kCrcsMetadataFileOffset);
+ }
+
+ const Crcs& crcs() const override {
+ return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() +
+ kCrcsMetadataFileOffset);
+ }
+
+ Info& info() {
+ return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() +
+ kInfoMetadataFileOffset);
+ }
+
+ const Info& info() const {
+ return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() +
+ kInfoMetadataFileOffset);
+ }
+
+ void SetInfoDirty() { is_info_dirty_ = true; }
+  // When storage is dirty, info has to be marked dirty as well, so SetDirty()
+  // is exposed to set both at once.
+ void SetDirty() {
+ is_info_dirty_ = true;
+ is_storage_dirty_ = true;
+ }
+
+ bool is_info_dirty() const { return is_info_dirty_; }
+ bool is_storage_dirty() const { return is_storage_dirty_; }
+
+ Options options_;
+
+ std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_;
+
+ // Storages
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage_;
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage_;
+ std::unique_ptr<FileBackedVector<char>> kv_storage_;
+
+ bool is_info_dirty_;
+ bool is_storage_dirty_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_PERSISTENT_HASH_MAP_H_
diff --git a/icing/file/persistent-hash-map_test.cc b/icing/file/persistent-hash-map_test.cc
new file mode 100644
index 0000000..5535629
--- /dev/null
+++ b/icing/file/persistent-hash-map_test.cc
@@ -0,0 +1,1577 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/persistent-hash-map.h"
+
+#include <cstring>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
+
+using ::testing::Contains;
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::IsTrue;
+using ::testing::Key;
+using ::testing::Lt;
+using ::testing::Ne;
+using ::testing::Not;
+using ::testing::Pair;
+using ::testing::Pointee;
+using ::testing::SizeIs;
+using ::testing::UnorderedElementsAre;
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using Bucket = PersistentHashMap::Bucket;
+using Crcs = PersistentStorage::Crcs;
+using Entry = PersistentHashMap::Entry;
+using Info = PersistentHashMap::Info;
+using Options = PersistentHashMap::Options;
+
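+// An arbitrary offset added to on-disk fields to simulate corruption, plus a
+// deliberately tiny initial bucket count so collisions and rehashing are easy
+// to trigger in tests.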
+static constexpr int32_t kCorruptedValueOffset = 3;
+static constexpr int32_t kTestInitNumBuckets = 1;
+
+class PersistentHashMapTest : public ::testing::TestWithParam<bool> {
+ protected:
+ void SetUp() override {
+ base_dir_ = GetTestTempDir() + "/icing";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ working_path_ = base_dir_ + "/persistent_hash_map_test";
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
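+  // Serializes `val` into a byte buffer whose size matches the map's value
+  // type size (sizeof(int) in these tests), suitable for Put()/GetOrPut().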
+ std::vector<char> Serialize(int val) {
+ std::vector<char> ret(sizeof(val));
+ memcpy(ret.data(), &val, sizeof(val));
+ return ret;
+ }
+
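+  // Convenience wrapper around Get() that reads the stored value out as an
+  // int.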
+ libtextclassifier3::StatusOr<int> GetValueByKey(
+ PersistentHashMap* persistent_hash_map, std::string_view key) {
+ int val;
+ ICING_RETURN_IF_ERROR(persistent_hash_map->Get(key, &val));
+ return val;
+ }
+
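+  // Drains `iter` and collects every key-value pair into an unordered_map so
+  // tests can compare against the expected contents.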
+ std::unordered_map<std::string, int> GetAllKeyValuePairs(
+ PersistentHashMap::Iterator&& iter) {
+ std::unordered_map<std::string, int> kvps;
+
+ while (iter.Advance()) {
+ int val;
+ memcpy(&val, iter.GetValue(), sizeof(val));
+ kvps.emplace(iter.GetKey(), val);
+ }
+ return kvps;
+ }
+
+ Filesystem filesystem_;
+ std::string base_dir_;
+ std::string working_path_;
+};
+
+TEST_P(PersistentHashMapTest, OptionsInvalidValueTypeSize) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ ASSERT_TRUE(options.IsValid());
+
+ options.value_type_size = -1;
+ EXPECT_FALSE(options.IsValid());
+
+ options.value_type_size = 0;
+ EXPECT_FALSE(options.IsValid());
+
+ options.value_type_size = PersistentHashMap::kMaxValueTypeSize + 1;
+ EXPECT_FALSE(options.IsValid());
+}
+
+TEST_P(PersistentHashMapTest, OptionsInvalidMaxNumEntries) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ ASSERT_TRUE(options.IsValid());
+
+ options.max_num_entries = -1;
+ EXPECT_FALSE(options.IsValid());
+
+ options.max_num_entries = 0;
+ EXPECT_FALSE(options.IsValid());
+
+ options.max_num_entries = Entry::kMaxNumEntries + 1;
+ EXPECT_FALSE(options.IsValid());
+}
+
+TEST_P(PersistentHashMapTest, OptionsInvalidMaxLoadFactorPercent) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ ASSERT_TRUE(options.IsValid());
+
+ options.max_load_factor_percent = -1;
+ EXPECT_FALSE(options.IsValid());
+
+ options.max_load_factor_percent = 0;
+ EXPECT_FALSE(options.IsValid());
+}
+
+TEST_P(PersistentHashMapTest, OptionsInvalidAverageKVByteSize) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ ASSERT_TRUE(options.IsValid());
+
+ options.average_kv_byte_size = -1;
+ EXPECT_FALSE(options.IsValid());
+
+ options.average_kv_byte_size = 0;
+ EXPECT_FALSE(options.IsValid());
+}
+
+TEST_P(PersistentHashMapTest, OptionsInvalidInitNumBuckets) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ ASSERT_TRUE(options.IsValid());
+
+ options.init_num_buckets = -1;
+ EXPECT_FALSE(options.IsValid());
+
+ options.init_num_buckets = 0;
+ EXPECT_FALSE(options.IsValid());
+
+ options.init_num_buckets = Bucket::kMaxNumBuckets + 1;
+ EXPECT_FALSE(options.IsValid());
+
+  // not a power of 2
+ options.init_num_buckets = 3;
+ EXPECT_FALSE(options.IsValid());
+}
+
+TEST_P(PersistentHashMapTest, OptionsNumBucketsRequiredExceedsMaxNumBuckets) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ ASSERT_TRUE(options.IsValid());
+
+ options.max_num_entries = Entry::kMaxNumEntries;
+ options.max_load_factor_percent = 30;
+ EXPECT_FALSE(options.IsValid());
+}
+
+TEST_P(PersistentHashMapTest,
+ OptionsEstimatedNumKeyValuePairExceedsStorageMaxSize) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ ASSERT_TRUE(options.IsValid());
+
+ options.max_num_entries = 1 << 20;
+ options.average_kv_byte_size = 1 << 20;
+ ASSERT_THAT(static_cast<int64_t>(options.max_num_entries) *
+ options.average_kv_byte_size,
+ Gt(PersistentHashMap::kMaxKVTotalByteSize));
+ EXPECT_FALSE(options.IsValid());
+}
+
+TEST_P(PersistentHashMapTest, InvalidWorkingPath) {
+ EXPECT_THAT(PersistentHashMap::Create(
+ filesystem_, "/dev/null/persistent_hash_map_test",
+ Options(/*value_type_size_in=*/sizeof(int))),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_P(PersistentHashMapTest, CreateWithInvalidOptionsShouldFail) {
+ Options invalid_options(/*value_type_size_in=*/-1);
+ invalid_options.pre_mapping_fbv = GetParam();
+ ASSERT_FALSE(invalid_options.IsValid());
+
+ EXPECT_THAT(
+ PersistentHashMap::Create(filesystem_, working_path_, invalid_options),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(PersistentHashMapTest, InitializeNewFiles) {
+ {
+ ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str()));
+
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_,
+ std::move(options)));
+ EXPECT_THAT(persistent_hash_map, Pointee(IsEmpty()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ // Metadata file should be initialized correctly for both info and crcs
+ // sections.
+ const std::string metadata_file_path = absl_ports::StrCat(
+ working_path_, "/", PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ // Check info section
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ PersistentHashMap::kInfoMetadataFileOffset));
+ EXPECT_THAT(info.magic, Eq(Info::kMagic));
+ EXPECT_THAT(info.value_type_size, Eq(sizeof(int)));
+ EXPECT_THAT(info.max_load_factor_percent,
+ Eq(Options::kDefaultMaxLoadFactorPercent));
+ EXPECT_THAT(info.num_deleted_entries, Eq(0));
+ EXPECT_THAT(info.num_deleted_key_value_bytes, Eq(0));
+
+ // Check crcs section
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ PersistentHashMap::kCrcsMetadataFileOffset));
+  // # of elements in bucket_storage should be 1, so the combined crc over all
+  // storages should be non-zero.
+ EXPECT_THAT(crcs.component_crcs.storages_crc, Ne(0));
+ EXPECT_THAT(crcs.component_crcs.info_crc,
+ Eq(Crc32(std::string_view(reinterpret_cast<const char*>(&info),
+ sizeof(Info)))
+ .Get()));
+ EXPECT_THAT(crcs.all_crc,
+ Eq(Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs.component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get()));
+}
+
+TEST_P(PersistentHashMapTest, InitializeNewFilesWithCustomInitNumBuckets) {
+ int custom_init_num_buckets = 128;
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/custom_init_num_buckets,
+              /*pre_mapping_fbv_in=*/GetParam())));
+ EXPECT_THAT(persistent_hash_map->num_buckets(), Eq(custom_init_num_buckets));
+}
+
+TEST_P(PersistentHashMapTest,
+       InitializeNewFilesWithInitNumBucketsLargerThanNumBucketsRequired) {
+ int init_num_buckets = 65536;
+
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/1,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/init_num_buckets,
+              /*pre_mapping_fbv_in=*/GetParam())));
+ EXPECT_THAT(persistent_hash_map->num_buckets(), Eq(init_num_buckets));
+}
+
+TEST_P(PersistentHashMapTest, InitNumBucketsShouldNotAffectExistingFiles) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+
+ int original_init_num_buckets = 4;
+ {
+ options.init_num_buckets = original_init_num_buckets;
+ ASSERT_TRUE(options.IsValid());
+
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ EXPECT_THAT(persistent_hash_map->num_buckets(),
+ Eq(original_init_num_buckets));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ // Set new init_num_buckets.
+ options.init_num_buckets = 8;
+ ASSERT_TRUE(options.IsValid());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ // # of buckets should still be the original value.
+ EXPECT_THAT(persistent_hash_map->num_buckets(),
+ Eq(original_init_num_buckets));
+}
+
+TEST_P(PersistentHashMapTest,
+ InitializationShouldFailWithoutPersistToDiskOrDestruction) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+
+ // Put some key value pairs.
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("c", Serialize(3).data()));
+ // Call Delete() to change PersistentHashMap metadata info
+ // (num_deleted_entries)
+ ICING_ASSERT_OK(persistent_hash_map->Delete("c"));
+
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+
+ // Without calling PersistToDisk, checksums will not be recomputed or synced
+ // to disk, so initializing another instance on the same files should fail.
+ EXPECT_THAT(PersistentHashMap::Create(filesystem_, working_path_, options),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_P(PersistentHashMapTest, InitializationShouldSucceedWithPersistToDisk) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map1,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+
+ // Put some key value pairs.
+ ICING_ASSERT_OK(persistent_hash_map1->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map1->Put("b", Serialize(2).data()));
+ ICING_ASSERT_OK(persistent_hash_map1->Put("c", Serialize(3).data()));
+ // Call Delete() to change PersistentHashMap metadata info
+ // (num_deleted_entries)
+ ICING_ASSERT_OK(persistent_hash_map1->Delete("c"));
+
+ ASSERT_THAT(persistent_hash_map1, Pointee(SizeIs(2)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map1.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map1.get(), "b"), IsOkAndHolds(2));
+
+ // After calling PersistToDisk, all checksums should be recomputed and synced
+ // correctly to disk, so initializing another instance on the same files
+ // should succeed, and we should be able to get the same contents.
+ ICING_EXPECT_OK(persistent_hash_map1->PersistToDisk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map2,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ EXPECT_THAT(persistent_hash_map2, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map2.get(), "a"), IsOkAndHolds(1));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map2.get(), "b"), IsOkAndHolds(2));
+}
+
+TEST_P(PersistentHashMapTest, InitializationShouldSucceedAfterDestruction) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("c", Serialize(3).data()));
+ // Call Delete() to change PersistentHashMap metadata info
+ // (num_deleted_entries)
+ ICING_ASSERT_OK(persistent_hash_map->Delete("c"));
+
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+ }
+
+ {
+    // The previous instance went out of scope and was destructed. Although we
+    // didn't call PersistToDisk explicitly, the destructor should invoke it,
+    // so initializing another instance on the same files should succeed, and
+    // we should be able to get the same contents.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+ }
+}
+
+TEST_P(PersistentHashMapTest,
+ InitializeExistingFilesWithDifferentMagicShouldFail) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ {
+ // Manually change kMagic and update checksum
+ const std::string metadata_file_path = absl_ports::StrCat(
+ working_path_, "/", PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ PersistentHashMap::kCrcsMetadataFileOffset));
+
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ PersistentHashMap::kInfoMetadataFileOffset));
+
+ // Manually change magic and update checksums.
+ info.magic += kCorruptedValueOffset;
+ crcs.component_crcs.info_crc = info.ComputeChecksum().Get();
+ crcs.all_crc = crcs.component_crcs.ComputeChecksum().Get();
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(),
+ PersistentHashMap::kCrcsMetadataFileOffset,
+ &crcs, sizeof(Crcs)));
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(),
+ PersistentHashMap::kInfoMetadataFileOffset,
+ &info, sizeof(Info)));
+ }
+
+ {
+ // Attempt to create the persistent hash map with different magic. This
+ // should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or =
+ PersistentHashMap::Create(filesystem_, working_path_, options);
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("PersistentHashMap header magic mismatch"));
+ }
+}
+
+TEST_P(PersistentHashMapTest,
+ InitializeExistingFilesWithDifferentValueTypeSizeShouldFail) {
+ {
+ // Create new persistent hash map
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ {
+ // Attempt to create the persistent hash map with different value type size.
+ // This should fail.
+ ASSERT_THAT(sizeof(char), Ne(sizeof(int)));
+
+ Options options(/*value_type_size_in=*/sizeof(char));
+ options.pre_mapping_fbv = GetParam();
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or =
+ PersistentHashMap::Create(filesystem_, working_path_, options);
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Incorrect value type size"));
+ }
+}
+
+TEST_P(PersistentHashMapTest,
+ InitializeExistingFilesWithMaxNumEntriesSmallerThanSizeShouldFail) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+
+ {
+ // Attempt to create the persistent hash map with max num entries smaller
+ // than the current size. This should fail.
+ options.max_num_entries = 1;
+ ASSERT_TRUE(options.IsValid());
+
+ EXPECT_THAT(PersistentHashMap::Create(filesystem_, working_path_, options),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ }
+
+ // Delete 1 kvp.
+ ICING_ASSERT_OK(persistent_hash_map->Delete("a"));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+
+ {
+ // Attempt to create the persistent hash map with max num entries:
+ // - Not smaller than current # of active kvps.
+ // - Smaller than # of all inserted kvps (regardless of activeness).
+ // This should fail.
+ options.max_num_entries = 1;
+ ASSERT_TRUE(options.IsValid());
+
+ EXPECT_THAT(PersistentHashMap::Create(filesystem_, working_path_, options),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ }
+}
+
+TEST_P(PersistentHashMapTest, InitializeExistingFilesWithWrongAllCrc) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path = absl_ports::StrCat(
+ working_path_, "/", PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ PersistentHashMap::kCrcsMetadataFileOffset));
+
+ // Manually corrupt all_crc
+ crcs.all_crc += kCorruptedValueOffset;
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(),
+ PersistentHashMap::kCrcsMetadataFileOffset,
+ &crcs, sizeof(Crcs)));
+ metadata_sfd.reset();
+
+ {
+ // Attempt to create the persistent hash map with metadata containing
+ // corrupted all_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or =
+ PersistentHashMap::Create(filesystem_, working_path_, options);
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Invalid all crc"));
+ }
+}
+
+TEST_P(PersistentHashMapTest,
+ InitializeExistingFilesWithCorruptedInfoShouldFail) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path = absl_ports::StrCat(
+ working_path_, "/", PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ PersistentHashMap::kInfoMetadataFileOffset));
+
+ // Modify info, but don't update the checksum. This would be similar to
+ // corruption of info.
+ info.num_deleted_entries += kCorruptedValueOffset;
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(),
+ PersistentHashMap::kInfoMetadataFileOffset,
+ &info, sizeof(Info)));
+ {
+ // Attempt to create the persistent hash map with info that doesn't match
+ // its checksum and confirm that it fails.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or =
+ PersistentHashMap::Create(filesystem_, working_path_, options);
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Invalid info crc"));
+ }
+}
+
+TEST_P(PersistentHashMapTest,
+ InitializeExistingFilesWithCorruptedBucketStorage) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ {
+ // Update bucket storage manually.
+ const std::string bucket_storage_file_path = absl_ports::StrCat(
+ working_path_, "/", PersistentHashMap::kFilePrefix, ".b");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> bucket_storage,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, bucket_storage_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc,
+ bucket_storage->ComputeChecksum());
+ ICING_ASSERT_OK(bucket_storage->Append(Bucket()));
+ ICING_ASSERT_OK(bucket_storage->PersistToDisk());
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc,
+ bucket_storage->ComputeChecksum());
+ ASSERT_THAT(old_crc, Not(Eq(new_crc)));
+ }
+
+ {
+ // Attempt to create the persistent hash map with metadata containing
+ // corrupted bucket_storage_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or =
+ PersistentHashMap::Create(filesystem_, working_path_, options);
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Invalid storages crc"));
+ }
+}
+
+TEST_P(PersistentHashMapTest,
+ InitializeExistingFilesWithCorruptedEntryStorage) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ {
+ // Update entry storage manually.
+ const std::string entry_storage_file_path = absl_ports::StrCat(
+ working_path_, "/", PersistentHashMap::kFilePrefix, ".e");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Entry>> entry_storage,
+ FileBackedVector<Entry>::Create(
+ filesystem_, entry_storage_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc, entry_storage->ComputeChecksum());
+ ICING_ASSERT_OK(entry_storage->Append(
+ Entry(/*key_value_index=*/-1, /*next_entry_index=*/-1)));
+ ICING_ASSERT_OK(entry_storage->PersistToDisk());
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc, entry_storage->ComputeChecksum());
+ ASSERT_THAT(old_crc, Not(Eq(new_crc)));
+ }
+
+ {
+ // Attempt to create the persistent hash map with metadata containing
+ // corrupted entry_storage_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or =
+ PersistentHashMap::Create(filesystem_, working_path_, options);
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Invalid storages crc"));
+ }
+}
+
+TEST_P(PersistentHashMapTest,
+ InitializeExistingFilesWithCorruptedKeyValueStorage) {
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ {
+ // Update kv storage manually.
+ const std::string kv_storage_file_path = absl_ports::StrCat(
+ working_path_, "/", PersistentHashMap::kFilePrefix, ".k");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> kv_storage,
+ FileBackedVector<char>::Create(
+ filesystem_, kv_storage_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc, kv_storage->ComputeChecksum());
+ ICING_ASSERT_OK(kv_storage->Append('z'));
+ ICING_ASSERT_OK(kv_storage->PersistToDisk());
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc, kv_storage->ComputeChecksum());
+ ASSERT_THAT(old_crc, Not(Eq(new_crc)));
+ }
+
+ {
+ // Attempt to create the persistent hash map with metadata containing
+ // corrupted kv_storage_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<PersistentHashMap>>
+ persistent_hash_map_or =
+ PersistentHashMap::Create(filesystem_, working_path_, options);
+ EXPECT_THAT(persistent_hash_map_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(persistent_hash_map_or.status().error_message(),
+ HasSubstr("Invalid storages crc"));
+ }
+}
+
+TEST_P(PersistentHashMapTest,
+ InitializeExistingFilesAllowDifferentMaxLoadFactorPercent) {
+ Options options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+      /*pre_mapping_fbv_in=*/GetParam());
+
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data()));
+
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ {
+ // Set new max_load_factor_percent.
+ options.max_load_factor_percent = 200;
+ ASSERT_TRUE(options.IsValid());
+ ASSERT_THAT(options.max_load_factor_percent,
+ Ne(Options::kDefaultMaxLoadFactorPercent));
+
+    // Attempt to create the persistent hash map with a different max load
+    // factor percent. This should succeed and metadata should be updated
+    // correctly. Also verify that all entries remain unchanged.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ const std::string metadata_file_path = absl_ports::StrCat(
+ working_path_, "/", PersistentHashMap::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ PersistentHashMap::kInfoMetadataFileOffset));
+ EXPECT_THAT(info.max_load_factor_percent,
+ Eq(options.max_load_factor_percent));
+
+  // Crcs should also be updated correctly. We test this by creating an
+  // instance again and making sure it doesn't hit corrupted crcs/info errors.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+}
+
+TEST_P(PersistentHashMapTest,
+ InitializeExistingFilesWithDifferentMaxLoadFactorPercentShouldRehash) {
+ Options options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+      /*pre_mapping_fbv_in=*/GetParam());
+
+ double prev_loading_percent;
+ int prev_num_buckets;
+ {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+ ICING_ASSERT_OK(persistent_hash_map->Put("a", Serialize(1).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("b", Serialize(2).data()));
+ ICING_ASSERT_OK(persistent_hash_map->Put("c", Serialize(3).data()));
+
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(3)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "c"), IsOkAndHolds(3));
+
+ prev_loading_percent = persistent_hash_map->size() * 100.0 /
+ persistent_hash_map->num_buckets();
+ prev_num_buckets = persistent_hash_map->num_buckets();
+ ASSERT_THAT(prev_loading_percent,
+ Not(Gt(Options::kDefaultMaxLoadFactorPercent)));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ {
+ // Set greater max_load_factor_percent.
+ options.max_load_factor_percent = 150;
+ ASSERT_TRUE(options.IsValid());
+ ASSERT_THAT(options.max_load_factor_percent, Gt(prev_loading_percent));
+
+    // Attempt to create the persistent hash map with a max load factor
+    // greater than the previous load. There should be no rehashing, and the
+    // # of buckets should remain the same.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+
+ EXPECT_THAT(persistent_hash_map->num_buckets(), Eq(prev_num_buckets));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+
+ {
+ // Set smaller max_load_factor_percent.
+ options.max_load_factor_percent = 50;
+ ASSERT_TRUE(options.IsValid());
+ ASSERT_THAT(options.max_load_factor_percent, Lt(prev_loading_percent));
+
+    // Attempt to create the persistent hash map with a max load factor
+    // smaller than the previous load. There should be rehashing since the
+    // load exceeds the new limit.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+
+    // After changing max_load_factor_percent, there should be rehashing and
+    // the new load should not be greater than the new max load factor.
+ EXPECT_THAT(persistent_hash_map->size() * 100.0 /
+ persistent_hash_map->num_buckets(),
+ Not(Gt(options.max_load_factor_percent)));
+ EXPECT_THAT(persistent_hash_map->num_buckets(), Ne(prev_num_buckets));
+
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "a"), IsOkAndHolds(1));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "b"), IsOkAndHolds(2));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "c"), IsOkAndHolds(3));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+ }
+}
+
+TEST_P(PersistentHashMapTest, PutAndGet) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+              /*pre_mapping_fbv_in=*/GetParam())));
+
+ EXPECT_THAT(persistent_hash_map, Pointee(IsEmpty()));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-youtube.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_EXPECT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(100).data()));
+ ICING_EXPECT_OK(
+ persistent_hash_map->Put("default-youtube.com", Serialize(50).data()));
+
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(100));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-youtube.com"),
+ IsOkAndHolds(50));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "key-not-exist"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_ASSERT_OK(persistent_hash_map->PersistToDisk());
+}
+
+TEST_P(PersistentHashMapTest, PutShouldOverwriteValueIfKeyExists) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+              /*pre_mapping_fbv_in=*/GetParam())));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(100).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(100));
+
+ ICING_EXPECT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(200).data()));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(200));
+
+ ICING_EXPECT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(300).data()));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(300));
+}
+
+TEST_P(PersistentHashMapTest, ShouldRehash) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+              /*pre_mapping_fbv_in=*/GetParam())));
+
+ int original_num_buckets = persistent_hash_map->num_buckets();
+  // Insert 100 key value pairs. There should be rehashing so that the hash
+  // map's load factor doesn't exceed max_load_factor_percent.
+ for (int i = 0; i < 100; ++i) {
+ std::string key = "default-google.com-" + std::to_string(i);
+ ICING_ASSERT_OK(persistent_hash_map->Put(key, &i));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(i + 1)));
+
+ EXPECT_THAT(persistent_hash_map->size() * 100.0 /
+ persistent_hash_map->num_buckets(),
+ Not(Gt(Options::kDefaultMaxLoadFactorPercent)));
+ }
+ EXPECT_THAT(persistent_hash_map->num_buckets(), Ne(original_num_buckets));
+
+ // After rehashing, we should still be able to get all inserted entries.
+ for (int i = 0; i < 100; ++i) {
+ std::string key = "default-google.com-" + std::to_string(i);
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), key), IsOkAndHolds(i));
+ }
+}
+
+TEST_P(PersistentHashMapTest, GetOrPutShouldPutIfKeyDoesNotExist) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+              /*pre_mapping_fbv_in=*/GetParam())));
+
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ int val = 1;
+ EXPECT_THAT(persistent_hash_map->GetOrPut("default-google.com", &val),
+ IsOk());
+ EXPECT_THAT(val, Eq(1));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(1));
+}
+
+TEST_P(PersistentHashMapTest, GetOrPutShouldGetIfKeyExists) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ ASSERT_THAT(
+ persistent_hash_map->Put("default-google.com", Serialize(1).data()),
+ IsOk());
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(1));
+
+ int val = 2;
+ EXPECT_THAT(persistent_hash_map->GetOrPut("default-google.com", &val),
+ IsOk());
+ EXPECT_THAT(val, Eq(1));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(1));
+}
+
+TEST_P(PersistentHashMapTest, Delete) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+  // Deleting a non-existent key should return a NOT_FOUND error.
+ EXPECT_THAT(persistent_hash_map->Delete("default-google.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(100).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-youtube.com", Serialize(50).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+
+  // Deleting an existing key should succeed.
+ ICING_EXPECT_OK(persistent_hash_map->Delete("default-google.com"));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ // The deleted key should not be found.
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  // The other key should remain unchanged and available.
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-youtube.com"),
+ IsOkAndHolds(50));
+
+  // Insert the deleted key back. It should get the new value.
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(200).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(2)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ IsOkAndHolds(200));
+
+ // Delete again
+ ICING_EXPECT_OK(persistent_hash_map->Delete("default-google.com"));
+ EXPECT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  // The other key should remain unchanged and available.
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-youtube.com"),
+ IsOkAndHolds(50));
+}
+
+TEST_P(PersistentHashMapTest, DeleteMultiple) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ std::unordered_map<std::string, int> existing_keys;
+ std::unordered_set<std::string> deleted_keys;
+ // Insert 100 key value pairs
+ for (int i = 0; i < 100; ++i) {
+ std::string key = "default-google.com-" + std::to_string(i);
+ ICING_ASSERT_OK(persistent_hash_map->Put(key, &i));
+ existing_keys[key] = i;
+ }
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(existing_keys.size())));
+
+  // Delete several keys, mirroring each deletion in the std containers so the
+  // results can be verified below.
+ std::vector<int> delete_target_ids{3, 4, 6, 9, 13, 18, 24, 31, 39, 48, 58};
+ for (const int delete_target_id : delete_target_ids) {
+ std::string key = "default-google.com-" + std::to_string(delete_target_id);
+ ASSERT_THAT(existing_keys, Contains(Key(key)));
+ ASSERT_THAT(GetValueByKey(persistent_hash_map.get(), key),
+ IsOkAndHolds(existing_keys[key]));
+ ICING_EXPECT_OK(persistent_hash_map->Delete(key));
+
+ existing_keys.erase(key);
+ deleted_keys.insert(key);
+ }
+
+ // Deleted keys should not be found.
+ for (const std::string& deleted_key : deleted_keys) {
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), deleted_key),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ }
+ // Other keys should remain unchanged and available
+ for (const auto& [existing_key, existing_value] : existing_keys) {
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), existing_key),
+ IsOkAndHolds(existing_value));
+ }
+ // Verify by iterator as well
+ EXPECT_THAT(GetAllKeyValuePairs(persistent_hash_map->GetIterator()),
+ Eq(existing_keys));
+}
+
+TEST_P(PersistentHashMapTest, DeleteBucketHeadElement) {
+ // Create new persistent hash map
+ // Set max_load_factor_percent as 1000. Load factor percent is calculated as
+ // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of
+ // buckets in an empty PersistentHashMap) and a max_load_factor_percent of
+ // 1000, we would allow the insertion of up to 10 keys before rehashing.
+ // Preventing rehashing makes it much easier to test collisions.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/1000,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-0", Serialize(0).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-1", Serialize(1).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-2", Serialize(2).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(3)));
+ ASSERT_THAT(persistent_hash_map->num_buckets(), Eq(1));
+
+  // Delete the head element of the bucket. Note that in our implementation,
+  // the last added element will become the head element of the bucket.
+ ICING_ASSERT_OK(persistent_hash_map->Delete("default-google.com-2"));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com-0"),
+ IsOkAndHolds(0));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com-1"),
+ IsOkAndHolds(1));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com-2"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_P(PersistentHashMapTest, DeleteBucketIntermediateElement) {
+ // Create new persistent hash map
+ // Set max_load_factor_percent as 1000. Load factor percent is calculated as
+ // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of
+ // buckets in an empty PersistentHashMap) and a max_load_factor_percent of
+ // 1000, we would allow the insertion of up to 10 keys before rehashing.
+ // Preventing rehashing makes it much easier to test collisions.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/1000,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-0", Serialize(0).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-1", Serialize(1).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-2", Serialize(2).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(3)));
+ ASSERT_THAT(persistent_hash_map->num_buckets(), Eq(1));
+
+  // Delete an intermediate element of the bucket.
+ ICING_ASSERT_OK(persistent_hash_map->Delete("default-google.com-1"));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com-0"),
+ IsOkAndHolds(0));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com-1"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com-2"),
+ IsOkAndHolds(2));
+}
+
+TEST_P(PersistentHashMapTest, DeleteBucketTailElement) {
+ // Create new persistent hash map
+ // Set max_load_factor_percent as 1000. Load factor percent is calculated as
+ // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of
+ // buckets in an empty PersistentHashMap) and a max_load_factor_percent of
+ // 1000, we would allow the insertion of up to 10 keys before rehashing.
+ // Preventing rehashing makes it much easier to test collisions.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/1000,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-0", Serialize(0).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-1", Serialize(1).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-2", Serialize(2).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(3)));
+ ASSERT_THAT(persistent_hash_map->num_buckets(), Eq(1));
+
+  // Delete the tail element of the bucket. Note that in our implementation,
+  // the first added element will become the tail element of the bucket.
+ ICING_ASSERT_OK(persistent_hash_map->Delete("default-google.com-0"));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com-0"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com-1"),
+ IsOkAndHolds(1));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com-2"),
+ IsOkAndHolds(2));
+}
+
+TEST_P(PersistentHashMapTest, DeleteBucketOnlySingleElement) {
+ // Create new persistent hash map
+ // Set max_load_factor_percent as 1000. Load factor percent is calculated as
+ // 100 * num_keys / num_buckets. Therefore, with 1 bucket (the initial # of
+ // buckets in an empty PersistentHashMap) and a max_load_factor_percent of
+ // 1000, we would allow the insertion of up to 10 keys before rehashing.
+ // Preventing rehashing makes it much easier to test collisions.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/1000,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(100).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+
+  // Delete the only element of the bucket.
+ ICING_ASSERT_OK(persistent_hash_map->Delete("default-google.com"));
+ ASSERT_THAT(persistent_hash_map, Pointee(IsEmpty()));
+ EXPECT_THAT(GetValueByKey(persistent_hash_map.get(), "default-google.com"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_P(PersistentHashMapTest, OperationsWhenReachingMaxNumEntries) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/1,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com", Serialize(100).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(1)));
+
+  // Putting a new key should fail.
+ EXPECT_THAT(
+ persistent_hash_map->Put("default-youtube.com", Serialize(50).data()),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+  // Modifying an existing key should succeed.
+ EXPECT_THAT(
+ persistent_hash_map->Put("default-google.com", Serialize(200).data()),
+ IsOk());
+
+ // Put after delete should still fail. See the comment in
+ // PersistentHashMap::Insert for more details.
+ ICING_ASSERT_OK(persistent_hash_map->Delete("default-google.com"));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(0)));
+ EXPECT_THAT(
+ persistent_hash_map->Put("default-youtube.com", Serialize(50).data()),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST_P(PersistentHashMapTest, ShouldFailIfKeyContainsTerminationCharacter) {
+ // Create new persistent hash map
+ Options options(/*value_type_size_in=*/sizeof(int));
+ options.pre_mapping_fbv = GetParam();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(filesystem_, working_path_, options));
+
+ const char invalid_key[] = "a\0bc";
+ std::string_view invalid_key_view(invalid_key, 4);
+
+ int val = 1;
+ EXPECT_THAT(persistent_hash_map->Put(invalid_key_view, &val),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(persistent_hash_map->GetOrPut(invalid_key_view, &val),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(persistent_hash_map->Get(invalid_key_view, &val),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(persistent_hash_map->Delete(invalid_key_view),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(PersistentHashMapTest, EmptyHashMapIterator) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ EXPECT_FALSE(persistent_hash_map->GetIterator().Advance());
+}
+
+TEST_P(PersistentHashMapTest, Iterator) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ std::unordered_map<std::string, int> kvps;
+ // Insert 100 key value pairs
+ for (int i = 0; i < 100; ++i) {
+ std::string key = "default-google.com-" + std::to_string(i);
+ ICING_ASSERT_OK(persistent_hash_map->Put(key, &i));
+ kvps.emplace(key, i);
+ }
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(kvps.size())));
+
+ EXPECT_THAT(GetAllKeyValuePairs(persistent_hash_map->GetIterator()),
+ Eq(kvps));
+}
+
+TEST_P(PersistentHashMapTest, IteratorAfterDeletingFirstKeyValuePair) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-0", Serialize(0).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-1", Serialize(1).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-2", Serialize(2).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(3)));
+
+ // Delete the first key value pair.
+ ICING_ASSERT_OK(persistent_hash_map->Delete("default-google.com-0"));
+ EXPECT_THAT(GetAllKeyValuePairs(persistent_hash_map->GetIterator()),
+ UnorderedElementsAre(Pair("default-google.com-1", 1),
+ Pair("default-google.com-2", 2)));
+}
+
+TEST_P(PersistentHashMapTest, IteratorAfterDeletingIntermediateKeyValuePair) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-0", Serialize(0).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-1", Serialize(1).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-2", Serialize(2).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(3)));
+
+ // Delete any intermediate key value pair.
+ ICING_ASSERT_OK(persistent_hash_map->Delete("default-google.com-1"));
+ EXPECT_THAT(GetAllKeyValuePairs(persistent_hash_map->GetIterator()),
+ UnorderedElementsAre(Pair("default-google.com-0", 0),
+ Pair("default-google.com-2", 2)));
+}
+
+TEST_P(PersistentHashMapTest, IteratorAfterDeletingLastKeyValuePair) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-0", Serialize(0).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-1", Serialize(1).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-2", Serialize(2).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(3)));
+
+ // Delete the last key value pair.
+ ICING_ASSERT_OK(persistent_hash_map->Delete("default-google.com-2"));
+ EXPECT_THAT(GetAllKeyValuePairs(persistent_hash_map->GetIterator()),
+ UnorderedElementsAre(Pair("default-google.com-0", 0),
+ Pair("default-google.com-1", 1)));
+}
+
+TEST_P(PersistentHashMapTest, IteratorAfterDeletingAllKeyValuePairs) {
+ // Create new persistent hash map
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem_, working_path_,
+ Options(
+ /*value_type_size_in=*/sizeof(int),
+ /*max_num_entries_in=*/Entry::kMaxNumEntries,
+ /*max_load_factor_percent_in=*/
+ Options::kDefaultMaxLoadFactorPercent,
+ /*average_kv_byte_size_in=*/Options::kDefaultAverageKVByteSize,
+ /*init_num_buckets_in=*/kTestInitNumBuckets,
+ /*pre_mapping_fbv_in=*/GetParam())));
+
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-0", Serialize(0).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-1", Serialize(1).data()));
+ ICING_ASSERT_OK(
+ persistent_hash_map->Put("default-google.com-2", Serialize(2).data()));
+ ASSERT_THAT(persistent_hash_map, Pointee(SizeIs(3)));
+
+ // Delete all key value pairs.
+ ICING_ASSERT_OK(persistent_hash_map->Delete("default-google.com-0"));
+ ICING_ASSERT_OK(persistent_hash_map->Delete("default-google.com-1"));
+ ICING_ASSERT_OK(persistent_hash_map->Delete("default-google.com-2"));
+ ASSERT_THAT(persistent_hash_map, Pointee(IsEmpty()));
+ EXPECT_FALSE(persistent_hash_map->GetIterator().Advance());
+}
+
+INSTANTIATE_TEST_SUITE_P(PersistentHashMapTest, PersistentHashMapTest,
+ testing::Values(true, false));
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/persistent-storage.cc b/icing/file/persistent-storage.cc
new file mode 100644
index 0000000..9a595ef
--- /dev/null
+++ b/icing/file/persistent-storage.cc
@@ -0,0 +1,55 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/persistent-storage.h"
+
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/legacy/core/icing-string-util.h"
+
+namespace icing {
+namespace lib {
+
+/* static */ libtextclassifier3::Status PersistentStorage::Discard(
+ const Filesystem& filesystem, const std::string& working_path,
+ WorkingPathType working_path_type) {
+ switch (working_path_type) {
+ case WorkingPathType::kSingleFile: {
+ if (!filesystem.DeleteFile(working_path.c_str())) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to delete PersistentStorage file: ", working_path));
+ }
+ return libtextclassifier3::Status::OK;
+ }
+ case WorkingPathType::kDirectory: {
+ if (!filesystem.DeleteDirectoryRecursively(working_path.c_str())) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to delete PersistentStorage directory: ", working_path));
+ }
+ return libtextclassifier3::Status::OK;
+ }
+ case WorkingPathType::kDummy:
+ return libtextclassifier3::Status::OK;
+ }
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Unknown working path type %d for PersistentStorage %s",
+ static_cast<int>(working_path_type), working_path.c_str()));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/persistent-storage.h b/icing/file/persistent-storage.h
new file mode 100644
index 0000000..9cb5e4d
--- /dev/null
+++ b/icing/file/persistent-storage.h
@@ -0,0 +1,369 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_PERSISTENT_STORAGE_H_
+#define ICING_FILE_PERSISTENT_STORAGE_H_
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/util/crc32.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+// PersistentStorage: an abstract class for all persistent data structures.
+// - It provides some common persistent file methods, e.g. PersistToDisk.
+// - It encapsulates most of the checksum handling logic (including updates
+//   and validation).
+//
+// Terminology:
+// - Crcs: checksum section
+// - Info: (custom) information for derived class
+// - Metadata: Crcs + Info
+//
+// Usually a persistent data structure will have its own custom Info and
+// storages (single or composite storages) definition. To create a new
+// persistent data structure via PersistentStorage:
+// - Decide what type the working path is (single file or directory). See
+// working_path_ and WorkingPathType for more details.
+// - Create a new class that inherits PersistentStorage:
+// - Declare custom Info and design the metadata section layout.
+// Usually the layout is <Crcs><Info>, and there are 2 common ways to
+// manage metadata section:
+// - Have a separate file for metadata. In this case, the new persistent
+// data structure contains multiple files, so working path should be used
+// as directory path and multiple files will be stored under it. Example:
+// PersistentHashMap.
+// - Have a single file for both metadata and storage data. In this case,
+// the file layout should be <Crcs><Info><Storage Data>, and
+// working path should be used as file path. Example: FileBackedVector.
+// - Handle working path file/directory creation and deletion.
+//     PersistentStorage only provides the static Discard() method. The
+//     derived class should implement the remaining logic, e.g. working path
+//     (file/directory) creation, and checking the conditions for discarding
+//     the working path and starting over with new file(s).
+// - Implement all pure virtual methods:
+// - PersistStoragesToDisk: persist all (composite) storages. In general,
+// the implementation will be calling PersistToDisk for all composite
+// storages.
+// - PersistMetadataToDisk: persist metadata, including Crcs and Info.
+// - If the derived class maintains a concrete Crc and (custom) Info
+// instance, then it should perform write/pwrite into the metadata
+// section.
+// - If the derived class uses memory-mapped region directly for metadata,
+// then it should call MemoryMappedFile::PersistToDisk.
+// - See crcs() for more details.
+// - ComputeInfoChecksum: compute the checksum for custom Info.
+// - ComputeStoragesChecksum: compute the (combined) checksum for all
+// (composite) storages. In general, the implementation will be calling
+// UpdateChecksums for all composite storages and XOR all checksums.
+// - crcs(): provide the reference for PersistentStorage to write checksums.
+// The derived class can either maintain a concrete Crcs instance, or
+// reinterpret_cast the memory-mapped region to Crcs reference. Either
+// choice is fine as long as PersistMetadataToDisk flushes it to disk
+// correctly.
+// - Call either InitializeNewStorage or InitializeExistingStorage when
+//   creating and initializing an instance, depending on whether the storage
+//   is brand new or restored from existing file(s).
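+//
+// An illustrative sketch of a minimal single-file derived class (MyStorage
+// and MyInfo are hypothetical names, not part of this library):
+//
+//   class MyStorage : public PersistentStorage {
+//    public:
+//     struct MyInfo {
+//       int32_t num_elements;
+//     } __attribute__((packed));
+//
+//    protected:
+//     libtextclassifier3::Status PersistMetadataToDisk(bool force) override;
+//     libtextclassifier3::Status PersistStoragesToDisk(bool force) override;
+//     libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(
+//         bool force) override;
+//     libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
+//         bool force) override;
+//     Crcs& crcs() override { return crcs_; }
+//     const Crcs& crcs() const override { return crcs_; }
+//
+//    private:
+//     // Concrete instances flushed to disk by PersistMetadataToDisk.
+//     Crcs crcs_;
+//     MyInfo info_;
+//   };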
+class PersistentStorage {
+ public:
+ enum class WorkingPathType {
+ kSingleFile,
+ kDirectory,
+ kDummy,
+ };
+
+ // Crcs and Info will be written into the metadata section. Info is defined by
+ // the actual implementation of each persistent storage. Usually the Metadata
+ // layout is: <Crcs><Info>
+ struct Crcs {
+ struct ComponentCrcs {
+ uint32_t info_crc;
+ uint32_t storages_crc;
+
+ bool operator==(const ComponentCrcs& other) const {
+ return info_crc == other.info_crc && storages_crc == other.storages_crc;
+ }
+
+ Crc32 ComputeChecksum() const {
+ return Crc32(std::string_view(reinterpret_cast<const char*>(this),
+ sizeof(ComponentCrcs)));
+ }
+ } __attribute__((packed));
+
+ bool operator==(const Crcs& other) const {
+ return all_crc == other.all_crc && component_crcs == other.component_crcs;
+ }
+
+ uint32_t all_crc;
+ ComponentCrcs component_crcs;
+ } __attribute__((packed));
+ static_assert(sizeof(Crcs) == 12, "");
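+
+  // With the packed layout above, the 12-byte Crcs section is:
+  //   bytes 0-3:  all_crc
+  //   bytes 4-7:  component_crcs.info_crc
+  //   bytes 8-11: component_crcs.storages_crc
+  // so in a <Crcs><Info> metadata section, Info starts at byte offset 12.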
+
+ // Deletes working_path according to its type.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+  //   - INVALID_ARGUMENT_ERROR if working_path_type is an unknown type
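+  //
+  // Usage sketch (the path here is hypothetical):
+  //   ICING_RETURN_IF_ERROR(PersistentStorage::Discard(
+  //       filesystem, "/path/to/my_storage", WorkingPathType::kDirectory));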
+ static libtextclassifier3::Status Discard(const Filesystem& filesystem,
+ const std::string& working_path,
+ WorkingPathType working_path_type);
+
+ virtual ~PersistentStorage() = default;
+
+ // Initializes new persistent storage. It computes the initial checksums and
+ // writes into the metadata file.
+ //
+  // Note: either InitializeNewStorage or InitializeExistingStorage should be
+  // invoked after creating a PersistentStorage instance and before using it;
+  // otherwise an uninitialized instance will fail when using persistent
+  // storage features, e.g. PersistToDisk, UpdateChecksums.
+ //
+ // Returns:
+ // - OK on success or already initialized
+ // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending
+ // on actual implementation
+ libtextclassifier3::Status InitializeNewStorage() {
+ if (is_initialized_) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ ICING_RETURN_IF_ERROR(UpdateChecksumsInternal(/*force=*/true));
+ ICING_RETURN_IF_ERROR(PersistStoragesToDisk(/*force=*/true));
+ ICING_RETURN_IF_ERROR(PersistMetadataToDisk(/*force=*/true));
+
+ is_initialized_ = true;
+ return libtextclassifier3::Status::OK;
+ }
+
+ // Initializes persistent storage from existing file(s).
+ //
+ // It enforces the following check(s):
+ // - Validate checksums.
+ //
+ // Note: either InitializeNewStorage or InitializeExistingStorage should be
+  // invoked after creating a PersistentStorage instance and before using it.
+ //
+ // Returns:
+ // - OK on success or already initialized
+ // - FAILED_PRECONDITION_ERROR if checksum validation fails.
+ // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending
+ // on actual implementation
+ libtextclassifier3::Status InitializeExistingStorage() {
+ if (is_initialized_) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ ICING_RETURN_IF_ERROR(ValidateChecksums());
+
+ is_initialized_ = true;
+ return libtextclassifier3::Status::OK;
+ }
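+
+  // A sketch of the typical initialization flow in a derived class's factory
+  // method (names are illustrative):
+  //   auto storage = std::make_unique<MyStorage>(...);
+  //   if (is_new_file) {
+  //     ICING_RETURN_IF_ERROR(storage->InitializeNewStorage());
+  //   } else {
+  //     ICING_RETURN_IF_ERROR(storage->InitializeExistingStorage());
+  //   }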
+
+ // Flushes contents to underlying files.
+  // 1) Updates all checksums with the new data.
+  // 2) Flushes storages.
+  // 3) Flushes metadata.
+ //
+ // Force flag will be passed down to PersistMetadataToDisk,
+ // PersistStoragesToDisk, ComputeInfoChecksum, ComputeStoragesChecksum.
+  // - If force == true, then performs the actual persisting operations and
+  //   recomputes the checksums.
+  // - Otherwise, the derived class can decide for itself whether to skip
+  //   persisting operations or recompute checksums lazily if the storage is
+  //   not dirty.
+ //
+ // Returns:
+ // - OK on success
+ // - FAILED_PRECONDITION_ERROR if PersistentStorage is uninitialized
+ // - Any errors from PersistStoragesToDisk, UpdateChecksums,
+ // PersistMetadataToDisk, depending on actual implementation
+ libtextclassifier3::Status PersistToDisk(bool force = false) {
+ if (!is_initialized_) {
+ return absl_ports::FailedPreconditionError(absl_ports::StrCat(
+ "PersistentStorage ", working_path_, " not initialized"));
+ }
+
+ ICING_RETURN_IF_ERROR(UpdateChecksumsInternal(force));
+ ICING_RETURN_IF_ERROR(PersistStoragesToDisk(force));
+ ICING_RETURN_IF_ERROR(PersistMetadataToDisk(force));
+ return libtextclassifier3::Status::OK;
+ }
+
+ // Updates checksums of all components and returns the overall crc (all_crc)
+ // of the persistent storage.
+ //
+ // Force flag will be passed down ComputeInfoChecksum,
+ // ComputeStoragesChecksum.
+ // - If force == true, then recomputes the checksum.
+  // - Otherwise, the derived class can decide for itself whether to
+  //   recompute the checksum lazily if the storage is not dirty.
+ //
+ // Returns:
+ // - Overall crc of the persistent storage on success
+ // - FAILED_PRECONDITION_ERROR if PersistentStorage is uninitialized
+ // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending
+ // on actual implementation
+ libtextclassifier3::StatusOr<Crc32> UpdateChecksums(bool force = false) {
+ if (!is_initialized_) {
+ return absl_ports::FailedPreconditionError(absl_ports::StrCat(
+ "PersistentStorage ", working_path_, " not initialized"));
+ }
+
+ return UpdateChecksumsInternal(force);
+ }
+
+ protected:
+ explicit PersistentStorage(const Filesystem& filesystem,
+ std::string working_path,
+ WorkingPathType working_path_type)
+ : filesystem_(filesystem),
+ working_path_(std::move(working_path)),
+ working_path_type_(working_path_type),
+ is_initialized_(false) {}
+
+ // Flushes contents of metadata. The implementation should flush Crcs and Info
+ // correctly, depending on whether they're using memory-mapped regions or
+ // concrete instances in the derived class.
+ //
+ // Returns:
+ // - OK on success
+ // - Any other errors, depending on actual implementation
+ virtual libtextclassifier3::Status PersistMetadataToDisk(bool force) = 0;
+
+ // Flushes contents of all storages to underlying files.
+ //
+ // Returns:
+ // - OK on success
+ // - Any other errors, depending on actual implementation
+ virtual libtextclassifier3::Status PersistStoragesToDisk(bool force) = 0;
+
+ // Computes and returns Info checksum.
+ // - If force = true, then recompute the entire checksum.
+  // - Otherwise, the derived class can decide for itself whether to compute
+  //   the checksum lazily if the storage is not dirty.
+ //
+ // This function will be mainly called by UpdateChecksums.
+ //
+ // Returns:
+ // - Crc of the Info on success
+ // - Any other errors, depending on actual implementation
+ virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(
+ bool force) = 0;
+
+ // Computes and returns all storages checksum. If there are multiple storages,
+ // usually we XOR their checksums together to a single checksum.
+ // - If force = true, then recompute the entire checksum.
+  // - Otherwise, the derived class can decide for itself whether to compute
+  //   the checksum lazily if the storage is not dirty.
+ //
+ // This function will be mainly called by UpdateChecksums.
+ //
+ // Returns:
+ // - Crc of all storages on success
+  //   - Any other errors, depending on actual implementation
+ virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
+ bool force) = 0;
+
+ // Returns the Crcs instance reference. The derived class can either own a
+ // concrete Crcs instance, or reinterpret_cast the memory-mapped region to
+ // Crcs reference. PersistMetadataToDisk should flush it to disk correctly.
+ virtual Crcs& crcs() = 0;
+ virtual const Crcs& crcs() const = 0;
+
+ const Filesystem& filesystem_; // Does not own
+ // Path to the storage. It can be a single file path or a directory path
+ // depending on the implementation of the derived class.
+ //
+  // Note that the derived storage class will take full ownership of
+ // working_path_, including creation/deletion. It is the caller's
+ // responsibility to specify correct working path and avoid mixing different
+  // persistent storages together under the same path. Also, the caller owns
+  // the parent directory of working_path_ and is responsible for its
+  // creation/deletion.
+ std::string working_path_;
+ WorkingPathType working_path_type_;
+
+ bool is_initialized_;
+
+ private:
+ // Updates checksums of all components and returns the overall crc (all_crc)
+ // of the persistent storage. Different from UpdateChecksums, it won't check
+ // if PersistentStorage is initialized or not.
+ //
+ // Returns:
+ // - Overall crc of the persistent storage on success
+ // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending
+ // on actual implementation
+ libtextclassifier3::StatusOr<Crc32> UpdateChecksumsInternal(bool force) {
+ Crcs& crcs_ref = crcs();
+ // Compute and update storages + info checksums.
+ ICING_ASSIGN_OR_RETURN(Crc32 info_crc, ComputeInfoChecksum(force));
+ ICING_ASSIGN_OR_RETURN(Crc32 storages_crc, ComputeStoragesChecksum(force));
+ if (crcs_ref.component_crcs.info_crc == info_crc.Get() &&
+ crcs_ref.component_crcs.storages_crc == storages_crc.Get()) {
+ // If info and storages crc haven't changed, then we don't have to update
+ // checksums.
+ return Crc32(crcs_ref.all_crc);
+ }
+
+ crcs_ref.component_crcs.info_crc = info_crc.Get();
+ crcs_ref.component_crcs.storages_crc = storages_crc.Get();
+
+ // Finally compute and update overall checksum.
+ crcs_ref.all_crc = crcs_ref.component_crcs.ComputeChecksum().Get();
+ return Crc32(crcs_ref.all_crc);
+ }
+
+ // Validates all checksums of the persistent storage.
+ //
+ // Returns:
+ // - OK on success
+ // - FAILED_PRECONDITION_ERROR if any checksum is incorrect.
+ // - Any errors from ComputeInfoChecksum, ComputeStoragesChecksum, depending
+ // on actual implementation
+ libtextclassifier3::Status ValidateChecksums() {
+ const Crcs& crcs_ref = crcs();
+ if (crcs_ref.all_crc != crcs_ref.component_crcs.ComputeChecksum().Get()) {
+ return absl_ports::FailedPreconditionError("Invalid all crc");
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 info_crc, ComputeInfoChecksum(/*force=*/true));
+ if (crcs_ref.component_crcs.info_crc != info_crc.Get()) {
+ return absl_ports::FailedPreconditionError("Invalid info crc");
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 storages_crc,
+ ComputeStoragesChecksum(/*force=*/true));
+ if (crcs_ref.component_crcs.storages_crc != storages_crc.Get()) {
+ return absl_ports::FailedPreconditionError("Invalid storages crc");
+ }
+
+ return libtextclassifier3::Status::OK;
+ }
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_PERSISTENT_STORAGE_H_
diff --git a/icing/file/portable-file-backed-proto-log.h b/icing/file/portable-file-backed-proto-log.h
new file mode 100644
index 0000000..a36bd9e
--- /dev/null
+++ b/icing/file/portable-file-backed-proto-log.h
@@ -0,0 +1,1263 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// File-backed log of protos with append-only writes and position-based reads.
+//
+// There should only be one instance of a PortableFileBackedProtoLog of the same
+// file at a time; using multiple instances at the same time may lead to
+// undefined behavior.
+//
+// The entire checksum is computed on initialization to verify the contents are
+// valid. On failure, the log will be truncated to the last verified state when
+// PersistToDisk() was called. If the log cannot successfully restore the last
+// state due to disk corruption or some other inconsistency, then the entire log
+// will be lost.
+//
+// Each proto written to the file has its metadata written just before it.
+// Each entry consists of
+// {
+//   1 byte of kProtoMagic;
+//   3 bytes of the proto size;
+//   n bytes of the proto itself
+// }
+//
+// All metadata is written in a portable format, encoded with htonl before
+// writing to file and decoded with ntohl when reading from file.
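+//
+// For example, an 8-byte proto is stored as the four metadata bytes
+// 0x5C 0x00 0x00 0x08 (kProtoMagic, then the proto size in big-endian),
+// followed by the 8 proto bytes.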
+//
+// Example usage:
+// ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+// PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+// file_path_,
+// options));
+// auto proto_log = create_result.proto_log;
+//
+// Document document;
+// document.set_namespace("com.google.android.example");
+// document.set_uri("www.google.com");
+//
+//  int64_t document_offset = proto_log->WriteProto(document);
+//  Document same_document = proto_log->ReadProto(document_offset);
+// proto_log->PersistToDisk();
+
+#ifndef ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
+#define ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/portable/endian.h"
+#include "icing/portable/gzip_stream.h"
+#include "icing/portable/platform.h"
+#include "icing/portable/zlib.h"
+#include "icing/util/bit-util.h"
+#include "icing/util/crc32.h"
+#include "icing/util/data-loss.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
+
+namespace icing {
+namespace lib {
+
+template <typename ProtoT>
+class PortableFileBackedProtoLog {
+ public:
+ struct Options {
+ // Whether to compress each proto before writing to the proto log.
+ bool compress;
+
+ // Byte-size limit for each proto written to the store. This does not
+ // include the bytes needed for the metadata of each proto.
+ //
+ // NOTE: Currently, we only support protos up to 16MiB. We store the proto
+ // size in 3 bytes within the metadata.
+ //
+ // NOTE: This limit is only enforced for future writes. If the store
+ // previously had a higher limit, then reading older entries could return
+ // larger protos.
+ //
+    // NOTE: max_proto_size is the upper limit for input protos into the
+    // ProtoLog. Even if a proto larger than max_proto_size compresses to a
+    // smaller size, ProtoLog will not accept it. Protos whose compressed
+    // size is larger than max_proto_size are also not accepted.
+ const int32_t max_proto_size;
+
+ // Level of compression if enabled, NO_COMPRESSION = 0, BEST_SPEED = 1,
+ // BEST_COMPRESSION = 9
+ const int32_t compression_level;
+
+ // Must specify values for options.
+ Options() = delete;
+ explicit Options(
+ bool compress_in, const int32_t max_proto_size_in = kMaxProtoSize,
+ const int32_t compression_level_in = kDeflateCompressionLevel)
+ : compress(compress_in),
+ max_proto_size(max_proto_size_in),
+ compression_level(compression_level_in) {}
+ };
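+
+  // A usage sketch of the options above (values are illustrative):
+  //   Options options(/*compress_in=*/true,
+  //                   /*max_proto_size_in=*/1 << 20,  // 1 MiB
+  //                   /*compression_level_in=*/3);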
+
+ // Our internal max for protos.
+ //
+ // WARNING: Changing this to a larger number may invalidate our assumption
+  // that the proto size can safely be stored in the last 3 bytes of the proto
+ // header.
+ static constexpr int kMaxProtoSize = (1 << 24) - 1; // 16MiB
+ static_assert(kMaxProtoSize <= 0x00FFFFFF,
+ "kMaxProtoSize doesn't fit in 3 bytes");
+
+ // Level of compression, BEST_SPEED = 1, BEST_COMPRESSION = 9
+ static constexpr int kDeflateCompressionLevel = 3;
+
+ // Number of bytes we reserve for the heading at the beginning of the proto
+ // log. We reserve this so the header can grow without running into the
+ // contents of the proto log, triggering an unnecessary migration of the data.
+ static constexpr int kHeaderReservedBytes = 256;
+
+ // Header stored at the beginning of the file before the rest of the log
+ // contents. Stores metadata on the log.
+ class Header {
+ public:
+ static constexpr int32_t kMagic = 0xf4c6f67a;
+
+ // We should go directly from 0 to 2 the next time we have to change the
+ // format.
+ static constexpr int32_t kFileFormatVersion = 0;
+
+ uint32_t CalculateHeaderChecksum() const {
+ Crc32 crc;
+
+ // Get a string_view of all the fields of the Header, excluding the
+ // magic_nbytes_ and header_checksum_nbytes_
+ std::string_view header_str(
+ reinterpret_cast<const char*>(this) +
+ offsetof(Header, header_checksum_nbytes_) +
+ sizeof(header_checksum_nbytes_),
+ sizeof(Header) - sizeof(magic_nbytes_) -
+ sizeof(header_checksum_nbytes_));
+ crc.Append(header_str);
+ return crc.Get();
+ }
+
+ int32_t GetMagic() const { return GNetworkToHostL(magic_nbytes_); }
+
+ void SetMagic(int32_t magic_in) {
+ magic_nbytes_ = GHostToNetworkL(magic_in);
+ }
+
+ int32_t GetFileFormatVersion() const {
+ return GNetworkToHostL(file_format_version_nbytes_);
+ }
+
+ void SetFileFormatVersion(int32_t file_format_version_in) {
+ file_format_version_nbytes_ = GHostToNetworkL(file_format_version_in);
+ }
+
+ int32_t GetMaxProtoSize() const {
+ return GNetworkToHostL(max_proto_size_nbytes_);
+ }
+
+ void SetMaxProtoSize(int32_t max_proto_size_in) {
+ max_proto_size_nbytes_ = GHostToNetworkL(max_proto_size_in);
+ }
+
+ int32_t GetLogChecksum() const {
+ return GNetworkToHostL(log_checksum_nbytes_);
+ }
+
+ void SetLogChecksum(int32_t log_checksum_in) {
+ log_checksum_nbytes_ = GHostToNetworkL(log_checksum_in);
+ }
+
+ int64_t GetRewindOffset() const {
+ return GNetworkToHostLL(rewind_offset_nbytes_);
+ }
+
+ void SetRewindOffset(int64_t rewind_offset_in) {
+ rewind_offset_nbytes_ = GHostToNetworkLL(rewind_offset_in);
+ }
+
+ int32_t GetHeaderChecksum() const {
+ return GNetworkToHostL(header_checksum_nbytes_);
+ }
+
+ void SetHeaderChecksum(int32_t header_checksum_in) {
+ header_checksum_nbytes_ = GHostToNetworkL(header_checksum_in);
+ }
+
+ bool GetCompressFlag() const { return GetFlag(kCompressBit); }
+
+ void SetCompressFlag(bool compress) { SetFlag(kCompressBit, compress); }
+
+ bool GetDirtyFlag() const { return GetFlag(kDirtyBit); }
+
+ void SetDirtyFlag(bool dirty) { SetFlag(kDirtyBit, dirty); }
+
+ private:
+ // The least-significant bit offset at which the compress flag is stored in
+    // 'flags_'. Represents whether the protos in the log are compressed
+ // or not.
+ static constexpr int32_t kCompressBit = 0;
+
+ // The least-significant bit offset at which the dirty flag is stored in
+    // 'flags_'. Represents whether the checksummed portion of the log has been
+ // modified after the last checksum was computed.
+ static constexpr int32_t kDirtyBit = 1;
+
+ bool GetFlag(int offset) const {
+ return bit_util::BitfieldGet(flags_, offset, /*len=*/1);
+ }
+
+ void SetFlag(int offset, bool value) {
+ bit_util::BitfieldSet(value, offset, /*len=*/1, &flags_);
+ }
+
+ // Holds the magic as a quick sanity check against file corruption.
+ //
+ // Field is in network-byte order.
+ int32_t magic_nbytes_ = GHostToNetworkL(kMagic);
+
+ // Must be at the beginning after kMagic. Contains the crc checksum of
+ // the following fields.
+ //
+ // Field is in network-byte order.
+ uint32_t header_checksum_nbytes_ = 0;
+
+ // Last known good offset at which the log and its checksum were updated.
+ // If we crash between writing to the log and updating the checksum, we can
+ // try to rewind the log to this offset and verify the checksum is still
+ // valid instead of throwing away the entire log.
+ //
+ // Field is in network-byte order.
+ int64_t rewind_offset_nbytes_ = GHostToNetworkLL(kHeaderReservedBytes);
+
+ // Version number tracking how we serialize the file to disk. If we change
+ // how/what we write to disk, this version should be updated and this class
+ // should handle a migration.
+ //
+ // Currently at kFileFormatVersion.
+ //
+ // Field is in network-byte order.
+ int32_t file_format_version_nbytes_ = 0;
+
+ // The maximum proto size that can be written to the log.
+ //
+ // Field is in network-byte order.
+ int32_t max_proto_size_nbytes_ = 0;
+
+ // Checksum of the log elements, doesn't include the header fields.
+ //
+ // Field is in network-byte order.
+ uint32_t log_checksum_nbytes_ = 0;
+
+ // Bits are used to hold various flags.
+ // Lowest bit is whether the protos are compressed or not.
+ //
+ // Field is only 1 byte, so is byte-order agnostic.
+ uint8_t flags_ = 0;
+
+ // NOTE: New fields should *almost always* be added to the end here. Since
+ // this class may have already been written to disk, appending fields
+ // increases the chances that changes are backwards-compatible.
+ };
+ static_assert(sizeof(Header) <= kHeaderReservedBytes,
+ "Header has grown past our reserved bytes!");
+
+ struct CreateResult {
+ // A successfully initialized log.
+ std::unique_ptr<PortableFileBackedProtoLog<ProtoT>> proto_log;
+
+ // The data status after initializing from a previous state. Data loss can
+ // happen if the file is corrupted or some previously added data was
+ // unpersisted. This may be used to signal that any derived data off of the
+ // proto log may need to be regenerated.
+ DataLoss data_loss = DataLoss::NONE;
+
+ // Whether the proto log had to recalculate the checksum to check its
+ // integrity. This can be avoided if no changes were made or the log was
+ // able to update its checksum before shutting down. But it may have to
+ // recalculate if it's unclear if we crashed after updating the log, but
+ // before updating our checksum.
+ bool recalculated_checksum = false;
+
+ bool has_data_loss() const {
+ return data_loss == DataLoss::PARTIAL || data_loss == DataLoss::COMPLETE;
+ }
+ };
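+
+  // A sketch of inspecting the result after Create (error handling elided):
+  //   ICING_ASSIGN_OR_RETURN(
+  //       CreateResult create_result,
+  //       PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+  //                                                         file_path,
+  //                                                         options));
+  //   if (create_result.has_data_loss()) {
+  //     // Regenerate any data derived from this log.
+  //   }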
+
+ // Factory method to create, initialize, and return a
+ // PortableFileBackedProtoLog. Will create the file if it doesn't exist.
+ //
+ // If on re-initialization the log detects disk corruption or some previously
+ // added data was unpersisted, the log will rewind to the last-good state. The
+ // log saves these checkpointed "good" states when PersistToDisk() is called
+ // or the log is safely destructed. If the log rewinds successfully to the
+ // last-good state, then the returned CreateResult.data_loss indicates
+ // whether it has a data loss and what kind of data loss it is (partial or
+ // complete) so that any derived data may know that it needs to be updated. If
+ // the log re-initializes successfully without any data loss,
+ // CreateResult.data_loss will be NONE.
+ //
+ // Params:
+ // filesystem: Handles system level calls
+ // file_path: Path of the underlying file. Directory of the file should
+ // already exist
+ // options: Configuration options for the proto log
+ //
+ // Returns:
+ // PortableFileBackedProtoLog::CreateResult on success
+ // INVALID_ARGUMENT on an invalid option
+ // INTERNAL_ERROR on IO error
+ static libtextclassifier3::StatusOr<CreateResult> Create(
+ const Filesystem* filesystem, const std::string& file_path,
+ const Options& options);
+
+ // Not copyable
+ PortableFileBackedProtoLog(const PortableFileBackedProtoLog&) = delete;
+ PortableFileBackedProtoLog& operator=(const PortableFileBackedProtoLog&) =
+ delete;
+
+ // This will update the checksum of the log as well.
+ ~PortableFileBackedProtoLog();
+
+ // Writes the serialized proto to the underlying file. Writes are applied
+ // directly to the underlying file. Users do not need to sync the file after
+ // writing.
+ //
+ // Returns:
+ // Offset of the newly appended proto in file on success
+ // INVALID_ARGUMENT if proto is too large, as decided by
+ // Options.max_proto_size
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> WriteProto(const ProtoT& proto);
+
+ // Reads out a proto located at file_offset from the file.
+ //
+ // Returns:
+ // A proto on success
+ // NOT_FOUND if the proto at the given offset has been erased
+ // OUT_OF_RANGE_ERROR if file_offset exceeds file size
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<ProtoT> ReadProto(int64_t file_offset) const;
+
+ // Erases the data of a proto located at file_offset from the file.
+ //
+ // Returns:
+ // OK on success
+ // OUT_OF_RANGE_ERROR if file_offset exceeds file size
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status EraseProto(int64_t file_offset);
+
+ // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+ // block size.
+ //
+ // Returns:
+ // Disk usage on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+
+ // Returns the file size of all the elements held in the log. File size is in
+ // bytes. This excludes the size of any internal metadata of the log, e.g. the
+ // log's header.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
+
+ // An iterator helping to find offsets of all the protos in file.
+ // Example usage:
+ //
+ // while (iterator.Advance().ok()) {
+ // int64_t offset = iterator.GetOffset();
+ // // Do something
+ // }
+ class Iterator {
+ public:
+ Iterator(const Filesystem& filesystem, int fd, int64_t initial_offset);
+
+    // Advances to the position of the next proto, whether it has been
+    // erased or not.
+ //
+ // Returns:
+ // OK on success
+ // OUT_OF_RANGE_ERROR if it reaches the end
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status Advance();
+
+ // Returns the file offset of current proto.
+ int64_t GetOffset();
+
+ private:
+ static constexpr int64_t kInvalidOffset = -1;
+    // Used to read proto metadata.
+    const Filesystem* const filesystem_;
+    // Offset of the first proto.
+    int64_t initial_offset_;
+ int64_t current_offset_;
+ int64_t file_size_;
+ int fd_;
+ };
+
+ // Returns an iterator of current proto log. The caller needs to keep the
+ // proto log unchanged while using the iterator, otherwise unexpected
+ // behaviors could happen.
+ Iterator GetIterator();
+
+ // Persists all changes since initialization or the last call to
+ // PersistToDisk(). Any changes that aren't persisted may be lost if the
+ // system fails to close safely.
+ //
+ // Example use case:
+ //
+ // Document document;
+ // document.set_namespace("com.google.android.example");
+ // document.set_uri("www.google.com");
+ //
+ // {
+ // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+ // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+ // file_path,
+ // options));
+ // auto proto_log = std::move(create_result.proto_log);
+ //
+  //   int64_t document_offset = proto_log->WriteProto(document);
+ //
+ // // We lose the document here since it wasn't persisted.
+ // // *SYSTEM CRASH*
+ // }
+ //
+ // {
+ // // Can still successfully create after a crash since the log can
+ // // rewind/truncate to recover into a previously good state
+ // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+ // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+ // file_path,
+ // options));
+ // auto proto_log = std::move(create_result.proto_log);
+ //
+ // // Lost the proto since we didn't PersistToDisk before the crash
+  //   proto_log->ReadProto(document_offset);  // OUT_OF_RANGE error
+ //
+  //   int64_t document_offset = proto_log->WriteProto(document);
+ //
+ // // Persisted this time, so we should be ok.
+ // ICING_ASSERT_OK(proto_log->PersistToDisk());
+ // }
+ //
+ // {
+ // ICING_ASSERT_OK_AND_ASSIGN(auto create_result,
+ // PortableFileBackedProtoLog<DocumentProto>::Create(filesystem,
+ // file_path,
+ // options));
+ // auto proto_log = std::move(create_result.proto_log);
+ //
+ // // SUCCESS
+  //   Document same_document = proto_log->ReadProto(document_offset);
+ // }
+ //
+ // NOTE: Since all protos are already written to the file directly, this
+ // just updates the checksum and rewind position. Without these updates,
+ // future initializations will truncate the file and discard unpersisted
+ // changes.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status PersistToDisk();
+
+ // Calculates the checksum of the log contents. Excludes the header content.
+ //
+ // Returns:
+ // Crc of the log content
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+
+ private:
+ // Object can only be instantiated via the ::Create factory.
+ PortableFileBackedProtoLog(const Filesystem* filesystem,
+ const std::string& file_path,
+ std::unique_ptr<Header> header,
+ int32_t compression_level);
+
+ // Initializes a new proto log.
+ //
+ // Returns:
+ // std::unique_ptr<CreateResult> on success
+ // INTERNAL_ERROR on IO error
+ static libtextclassifier3::StatusOr<CreateResult> InitializeNewFile(
+ const Filesystem* filesystem, const std::string& file_path,
+ const Options& options);
+
+ // Verifies that the existing proto log is in a good state. If not in a good
+ // state, then the proto log may be truncated to the last good state and
+ // content will be lost.
+ //
+ // Returns:
+ // std::unique_ptr<CreateResult> on success
+ // INTERNAL_ERROR on IO error or internal inconsistencies in the file
+ // INVALID_ARGUMENT_ERROR if options aren't consistent with previous
+ // instances
+ static libtextclassifier3::StatusOr<CreateResult> InitializeExistingFile(
+ const Filesystem* filesystem, const std::string& file_path,
+ const Options& options, int64_t file_size);
+
+ // Takes an initial checksum and updates it with the content between `start`
+ // and `end` offsets in the file.
+ //
+ // Returns:
+ // Crc of the content between `start`, inclusive, and `end`, exclusive.
+ // INTERNAL_ERROR on IO error
+ // INVALID_ARGUMENT_ERROR if start and end aren't within the file size
+ static libtextclassifier3::StatusOr<Crc32> ComputeChecksum(
+ const Filesystem* filesystem, const std::string& file_path,
+ Crc32 initial_crc, int64_t start, int64_t end);
+
+ // Reads out the metadata of a proto located at file_offset from the fd.
+ // Metadata will be returned in host byte order endianness.
+ //
+ // Returns:
+ // Proto's metadata on success
+ // OUT_OF_RANGE_ERROR if file_offset exceeds file_size
+ // INTERNAL_ERROR if the metadata is invalid or any IO errors happen
+ static libtextclassifier3::StatusOr<int32_t> ReadProtoMetadata(
+ const Filesystem* const filesystem, int fd, int64_t file_offset,
+ int64_t file_size);
+
+ // Writes metadata of a proto to the fd. Takes in a host byte order endianness
+ // metadata and converts it into a portable metadata before writing.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on any IO errors
+ static libtextclassifier3::Status WriteProtoMetadata(
+ const Filesystem* filesystem, int fd, int32_t host_order_metadata);
+
+ static bool IsEmptyBuffer(const char* buffer, int size) {
+ return std::all_of(buffer, buffer + size,
+ [](const char byte) { return byte == 0; });
+ }
+
+ // Helper function to get stored proto size from the metadata.
+ // Metadata format: 8 bits magic + 24 bits size
+ static int GetProtoSize(int metadata) { return metadata & 0x00FFFFFF; }
+
+ // Helper function to get stored proto magic from the metadata.
+ // Metadata format: 8 bits magic + 24 bits size
+ static uint8_t GetProtoMagic(int metadata) { return metadata >> 24; }
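+
+  // For example, a metadata word of 0x5C000400 decodes to
+  // GetProtoMagic(metadata) == 0x5C (kProtoMagic) and
+  // GetProtoSize(metadata) == 0x400, i.e. a 1024-byte proto.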
+
+ // Magic number added in front of every proto. Used when reading out protos
+ // as a first check for corruption in each entry in the file. Even if there is
+ // a corruption, the best we can do is roll back to our last recovery point
+ // and throw away un-flushed data. We can discard/reuse this byte if needed so
+ // that we have 4 bytes to store the size of protos, and increase the size of
+ // protos we support.
+ static constexpr uint8_t kProtoMagic = 0x5C;
+
+ // Chunks of the file to mmap at a time, so we don't mmap the entire file.
+ // Only used on 32-bit devices
+ static constexpr int kMmapChunkSize = 4 * 1024 * 1024; // 4MiB
+
+ ScopedFd fd_;
+ const Filesystem* const filesystem_;
+ const std::string file_path_;
+ std::unique_ptr<Header> header_;
+ const int32_t compression_level_;
+};
+
+template <typename ProtoT>
+PortableFileBackedProtoLog<ProtoT>::PortableFileBackedProtoLog(
+ const Filesystem* filesystem, const std::string& file_path,
+ std::unique_ptr<Header> header, int32_t compression_level)
+ : filesystem_(filesystem),
+ file_path_(file_path),
+ header_(std::move(header)),
+ compression_level_(compression_level) {
+ fd_.reset(filesystem_->OpenForAppend(file_path.c_str()));
+}
+
+template <typename ProtoT>
+PortableFileBackedProtoLog<ProtoT>::~PortableFileBackedProtoLog() {
+ if (!PersistToDisk().ok()) {
+ ICING_LOG(WARNING) << "Error persisting to disk during destruction of "
+ "PortableFileBackedProtoLog: "
+ << file_path_;
+ }
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<
+ typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
+PortableFileBackedProtoLog<ProtoT>::Create(const Filesystem* filesystem,
+ const std::string& file_path,
+ const Options& options) {
+ if (options.max_proto_size <= 0) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "options.max_proto_size must be greater than 0, was %d",
+ options.max_proto_size));
+ }
+
+ // Since we store the proto_size in 3 bytes, we can only support protos of up
+ // to 16MiB.
+ if (options.max_proto_size > kMaxProtoSize) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "options.max_proto_size must be under 16MiB, was %d",
+ options.max_proto_size));
+ }
+
+ if (options.compression_level < 0 || options.compression_level > 9) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "options.compression_level must be between 0 and 9 inclusive, was %d",
+ options.compression_level));
+ }
+
+ if (!filesystem->FileExists(file_path.c_str())) {
+ return InitializeNewFile(filesystem, file_path, options);
+ }
+
+ int64_t file_size = filesystem->GetFileSize(file_path.c_str());
+ if (file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Bad file size '", file_path, "'"));
+ }
+
+ if (file_size == 0) {
+ return InitializeNewFile(filesystem, file_path, options);
+ }
+
+ return InitializeExistingFile(filesystem, file_path, options, file_size);
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<
+ typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
+PortableFileBackedProtoLog<ProtoT>::InitializeNewFile(
+ const Filesystem* filesystem, const std::string& file_path,
+ const Options& options) {
+ // Grow to the minimum reserved bytes for the header.
+ if (!filesystem->Truncate(file_path.c_str(), kHeaderReservedBytes)) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to initialize file size: ", file_path));
+ }
+
+ // Create the header
+ std::unique_ptr<Header> header = std::make_unique<Header>();
+ header->SetCompressFlag(options.compress);
+ header->SetMaxProtoSize(options.max_proto_size);
+ header->SetHeaderChecksum(header->CalculateHeaderChecksum());
+
+ if (!filesystem->Write(file_path.c_str(), header.get(), sizeof(Header))) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to write header for file: ", file_path));
+ }
+
+ CreateResult create_result = {
+ std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
+ new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
+ std::move(header),
+ options.compression_level)),
+ /*data_loss=*/DataLoss::NONE, /*recalculated_checksum=*/false};
+
+ return create_result;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<
+ typename PortableFileBackedProtoLog<ProtoT>::CreateResult>
+PortableFileBackedProtoLog<ProtoT>::InitializeExistingFile(
+ const Filesystem* filesystem, const std::string& file_path,
+ const Options& options, int64_t file_size) {
+ bool header_changed = false;
+ if (file_size < kHeaderReservedBytes) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("File header too short for: ", file_path));
+ }
+
+ std::unique_ptr<Header> header = std::make_unique<Header>();
+ if (!filesystem->PRead(file_path.c_str(), header.get(), sizeof(Header),
+ /*offset=*/0)) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to read header for file: ", file_path));
+ }
+
+ // Make sure the header is still valid before we use any of its values. This
+ // is covered by the header_checksum check below, but this is a quick check
+ // that can save us from an extra crc computation.
+ if (header->GetMagic() != Header::kMagic) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Invalid header kMagic for file: ", file_path));
+ }
+
+ if (header->GetHeaderChecksum() != header->CalculateHeaderChecksum()) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Invalid header checksum for: ", file_path));
+ }
+
+ if (header->GetFileFormatVersion() != Header::kFileFormatVersion) {
+ // If this changes, we might need to handle a migration rather than throwing
+ // an error.
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Invalid header file format version: ", file_path));
+ }
+
+ if (header->GetCompressFlag() != options.compress) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Inconsistent compress option, expected %d, actual %d",
+ header->GetCompressFlag(), options.compress));
+ }
+
+ int32_t existing_max_proto_size = header->GetMaxProtoSize();
+ if (existing_max_proto_size > options.max_proto_size) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Max proto size cannot be smaller than previous "
+ "instantiations, previous size %d, wanted size %d",
+ header->GetMaxProtoSize(), options.max_proto_size));
+ } else if (existing_max_proto_size < options.max_proto_size) {
+ // It's fine if our new max size is greater than our previous one. Existing
+ // data is still valid.
+ header->SetMaxProtoSize(options.max_proto_size);
+ header_changed = true;
+ }
+
+ DataLoss data_loss = DataLoss::NONE;
+
+ // If we have any documents in our tail, get rid of them since they're not in
+ // our checksum. Our checksum reflects content up to the rewind offset.
+ if (file_size > header->GetRewindOffset()) {
+ if (!filesystem->Truncate(file_path.c_str(), header->GetRewindOffset())) {
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Failed to truncate '%s' to size %lld", file_path.data(),
+ static_cast<long long>(header->GetRewindOffset())));
+ }
+ data_loss = DataLoss::PARTIAL;
+ }
+
+ bool recalculated_checksum = false;
+
+ // If our dirty flag is set, that means we might have crashed in the middle of
+ // erasing a proto. This could have happened anywhere between:
+ // A. Set dirty flag to true and update header checksum
+ // B. Erase the proto
+ // C. Set dirty flag to false, update log checksum, update header checksum
+ //
+ // Scenario 1: We went down between A and B. Maybe our dirty flag is a
+ // false alarm and we can keep all our data.
+ //
+ // Scenario 2: We went down between B and C. Our data is compromised and
+ // we need to throw everything out.
+ if (header->GetDirtyFlag()) {
+ // Recompute the log's checksum to detect which scenario we're in.
+ ICING_ASSIGN_OR_RETURN(
+ Crc32 calculated_log_checksum,
+ ComputeChecksum(filesystem, file_path, Crc32(),
+ /*start=*/kHeaderReservedBytes, /*end=*/file_size));
+
+ if (header->GetLogChecksum() != calculated_log_checksum.Get()) {
+ // Still doesn't match, we're in Scenario 2. Throw out all our data now
+ // and initialize as a new instance.
+ ICING_ASSIGN_OR_RETURN(CreateResult create_result,
+ InitializeNewFile(filesystem, file_path, options));
+ create_result.data_loss = DataLoss::COMPLETE;
+ create_result.recalculated_checksum = true;
+ return create_result;
+ }
+ // Otherwise we're good, checksum matches our contents so continue
+ // initializing like normal.
+ recalculated_checksum = true;
+
+ // Update our header.
+ header->SetDirtyFlag(false);
+ header_changed = true;
+ }
+
+ if (header_changed) {
+ header->SetHeaderChecksum(header->CalculateHeaderChecksum());
+
+ if (!filesystem->PWrite(file_path.c_str(), /*offset=*/0, header.get(),
+ sizeof(Header))) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to update header to: ", file_path));
+ }
+ }
+
+ CreateResult create_result = {
+ std::unique_ptr<PortableFileBackedProtoLog<ProtoT>>(
+ new PortableFileBackedProtoLog<ProtoT>(filesystem, file_path,
+ std::move(header),
+ options.compression_level)),
+ data_loss, recalculated_checksum};
+
+ return create_result;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<Crc32>
+PortableFileBackedProtoLog<ProtoT>::ComputeChecksum(
+ const Filesystem* filesystem, const std::string& file_path,
+ Crc32 initial_crc, int64_t start, int64_t end) {
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(*filesystem, file_path,
+ MemoryMappedFile::Strategy::READ_ONLY));
+ Crc32 new_crc(initial_crc.Get());
+
+ if (start < 0) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Starting checksum offset of file '%s' must be non-negative, was "
+ "%lld",
+ file_path.c_str(), static_cast<long long>(start)));
+ }
+
+ if (end < start) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Ending checksum offset of file '%s' must not be less than start "
+ "'%lld', was '%lld'",
+ file_path.c_str(), static_cast<long long>(start),
+ static_cast<long long>(end)));
+ }
+
+ int64_t file_size = filesystem->GetFileSize(file_path.c_str());
+ if (end > file_size) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Ending checksum offset of file '%s' must be within "
+ "file size of %lld, was %lld",
+ file_path.c_str(), static_cast<long long>(file_size),
+ static_cast<long long>(end)));
+ }
+
+ Architecture architecture = GetArchitecture();
+ switch (architecture) {
+ case Architecture::BIT_64: {
+ // Don't mmap in chunks here since mmapping can be harmful on 64-bit
+ // devices where mmap/munmap calls need the mmap write semaphore, which
+ // blocks mmap/munmap/mprotect and all page faults from executing while
+ // they run. On 64-bit devices, this doesn't actually load into memory, it
+ // just makes the file faultable. So the whole file should be ok.
+ // b/185822878.
+ ICING_RETURN_IF_ERROR(mmapped_file.Remap(start, end - start));
+ auto mmap_str = std::string_view(mmapped_file.region(), end - start);
+ new_crc.Append(mmap_str);
+ break;
+ }
+ case Architecture::BIT_32:
+ [[fallthrough]];
+ case Architecture::UNKNOWN: {
+ // 32-bit devices only have 4GB of RAM. Mmap in chunks to not use up too
+ // much memory at once. If we're unknown, then also chunk it because we're
+ // not sure what the device can handle.
+      for (int64_t i = start; i < end; i += kMmapChunkSize) {
+ // Don't read past the file size.
+ int next_chunk_size = kMmapChunkSize;
+ if ((i + kMmapChunkSize) >= end) {
+ next_chunk_size = end - i;
+ }
+
+ ICING_RETURN_IF_ERROR(mmapped_file.Remap(i, next_chunk_size));
+
+ auto mmap_str =
+ std::string_view(mmapped_file.region(), next_chunk_size);
+ new_crc.Append(mmap_str);
+ }
+ break;
+ }
+ }
+
+ return new_crc;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+PortableFileBackedProtoLog<ProtoT>::WriteProto(const ProtoT& proto) {
+ int64_t proto_size = proto.ByteSizeLong();
+ int32_t host_order_metadata;
+ int64_t current_position = filesystem_->GetCurrentPosition(fd_.get());
+
+ if (proto_size > header_->GetMaxProtoSize()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "proto_size, %lld, was too large to write. Max is %d",
+ static_cast<long long>(proto_size), header_->GetMaxProtoSize()));
+ }
+
+  // At this point, we've guaranteed that proto_size is under kMaxProtoSize
+  // (see ::Create), so we can safely store it in an int.
+ int final_size = 0;
+
+ std::string proto_str;
+ google::protobuf::io::StringOutputStream proto_stream(&proto_str);
+
+ if (header_->GetCompressFlag()) {
+ protobuf_ports::GzipOutputStream::Options options;
+ options.format = protobuf_ports::GzipOutputStream::ZLIB;
+ options.compression_level = compression_level_;
+
+ protobuf_ports::GzipOutputStream compressing_stream(&proto_stream, options);
+
+ bool success = proto.SerializeToZeroCopyStream(&compressing_stream) &&
+ compressing_stream.Close();
+
+ if (!success) {
+ return absl_ports::InternalError("Error compressing proto.");
+ }
+
+ final_size = proto_str.size();
+
+    // If compression made the proto larger than max_proto_size, we can't
+    // write it either.
+ if (final_size > header_->GetMaxProtoSize()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Compressed proto size, %d, was greater than "
+ "max_proto_size, %d",
+ final_size, header_->GetMaxProtoSize()));
+ }
+ } else {
+    // Serialize the proto directly into the string buffer, uncompressed.
+ proto.SerializeToZeroCopyStream(&proto_stream);
+ final_size = proto_str.size();
+ }
+
+ // 1st byte for magic, next 3 bytes for proto size.
+ host_order_metadata = (kProtoMagic << 24) | final_size;
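+  // Illustrative sketch (magic value hypothetical): if kProtoMagic were 0x5C
+  // and final_size were 0x000102, the packed word would be 0x5C000102;
+  // GetProtoMagic() recovers the top byte and GetProtoSize() the low 24 bits.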
+
+  // Actually write the metadata; this has to be done after we know the
+  // possibly-compressed proto size.
+ ICING_RETURN_IF_ERROR(
+ WriteProtoMetadata(filesystem_, fd_.get(), host_order_metadata));
+
+ // Write the serialized proto
+ if (!filesystem_->Write(fd_.get(), proto_str.data(), proto_str.size())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to write proto to: ", file_path_));
+ }
+
+ return current_position;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<ProtoT>
+PortableFileBackedProtoLog<ProtoT>::ReadProto(int64_t file_offset) const {
+  int64_t file_size = filesystem_->GetFileSize(fd_.get());
+  if (file_size == Filesystem::kBadFileSize) {
+    return absl_ports::OutOfRangeError("Unable to correctly read size.");
+  }
+
+  // Read out the metadata.
+ ICING_ASSIGN_OR_RETURN(
+ int32_t metadata,
+ ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
+
+ // Copy out however many bytes it says the proto is
+ int stored_size = GetProtoSize(metadata);
+ file_offset += sizeof(metadata);
+
+ // Read the compressed proto out.
+ if (file_offset + stored_size > file_size) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+ "out of range of the file size, %lld",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size - 1)));
+ }
+ auto buf = std::make_unique<char[]>(stored_size);
+ if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
+    return absl_ports::InternalError(
+        "Failed to read proto from the proto log.");
+ }
+
+ if (IsEmptyBuffer(buf.get(), stored_size)) {
+ return absl_ports::NotFoundError("The proto data has been erased.");
+ }
+
+ google::protobuf::io::ArrayInputStream proto_stream(buf.get(), stored_size);
+
+ // Deserialize proto
+ ProtoT proto;
+ if (header_->GetCompressFlag()) {
+ protobuf_ports::GzipInputStream decompress_stream(&proto_stream);
+ proto.ParseFromZeroCopyStream(&decompress_stream);
+ } else {
+ proto.ParseFromZeroCopyStream(&proto_stream);
+ }
+
+ return proto;
+}
+
+template <typename ProtoT>
+libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::EraseProto(
+ int64_t file_offset) {
+ int64_t file_size = filesystem_->GetFileSize(fd_.get());
+ if (file_size == Filesystem::kBadFileSize) {
+ return absl_ports::OutOfRangeError("Unable to correctly read size.");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ int32_t metadata,
+ ReadProtoMetadata(filesystem_, fd_.get(), file_offset, file_size));
+ // Copy out however many bytes it says the proto is
+ int stored_size = GetProtoSize(metadata);
+ file_offset += sizeof(metadata);
+ if (file_offset + stored_size > file_size) {
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Trying to read from a location, %lld, "
+ "out of range of the file size, %lld",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size - 1)));
+ }
+ auto buf = std::make_unique<char[]>(stored_size);
+
+ // We need to update the crc checksum if the erased area is before the
+ // rewind position.
+ int32_t new_crc;
+ if (file_offset < header_->GetRewindOffset()) {
+ // Set to "dirty" before we start writing anything.
+ header_->SetDirtyFlag(true);
+ header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
+ if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+ sizeof(Header))) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to update dirty bit of header to: ", file_path_));
+ }
+
+    // We need the xor of the original bytes and the zeros that will replace
+    // them. Since x xor 0 = x, that xor is just the original bytes
+    // themselves.
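+    // Illustrative sketch (byte values hypothetical): erasing {0x12, 0x34}
+    // rewrites the region to {0x00, 0x00}; old xor new is {0x12, 0x34},
+    // which is what UpdateWithXor below folds into the running log checksum
+    // at this position.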
+ // Read the compressed proto out.
+ if (!filesystem_->PRead(fd_.get(), buf.get(), stored_size, file_offset)) {
+      return absl_ports::InternalError(
+          "Failed to read the proto to be erased from the proto log.");
+ }
+ const std::string_view xored_str(buf.get(), stored_size);
+
+ Crc32 crc(header_->GetLogChecksum());
+ ICING_ASSIGN_OR_RETURN(
+ new_crc,
+ crc.UpdateWithXor(xored_str,
+ /*full_data_size=*/header_->GetRewindOffset() -
+ kHeaderReservedBytes,
+ /*position=*/file_offset - kHeaderReservedBytes));
+ }
+
+ // Clear the region.
+ memset(buf.get(), '\0', stored_size);
+ if (!filesystem_->PWrite(fd_.get(), file_offset, buf.get(), stored_size)) {
+    return absl_ports::InternalError(
+        "Failed to erase proto in the proto log.");
+ }
+
+ // If we cleared something in our checksummed area, we should update our
+ // checksum and reset our dirty bit.
+ if (file_offset < header_->GetRewindOffset()) {
+ header_->SetDirtyFlag(false);
+ header_->SetLogChecksum(new_crc);
+ header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
+
+ if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+ sizeof(Header))) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to update header to: ", file_path_));
+ }
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+PortableFileBackedProtoLog<ProtoT>::GetDiskUsage() const {
+ int64_t size = filesystem_->GetDiskUsage(file_path_.c_str());
+ if (size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError("Failed to get disk usage of proto log");
+ }
+ return size;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int64_t>
+PortableFileBackedProtoLog<ProtoT>::GetElementsFileSize() const {
+ int64_t total_file_size = filesystem_->GetFileSize(file_path_.c_str());
+ if (total_file_size == Filesystem::kBadFileSize) {
+ return absl_ports::InternalError(
+        "Failed to get file size of elements in the proto log");
+ }
+ return total_file_size - kHeaderReservedBytes;
+}
+
+template <typename ProtoT>
+PortableFileBackedProtoLog<ProtoT>::Iterator::Iterator(
+ const Filesystem& filesystem, int fd, int64_t initial_offset)
+ : filesystem_(&filesystem),
+ initial_offset_(initial_offset),
+ current_offset_(kInvalidOffset),
+ fd_(fd) {
+ file_size_ = filesystem_->GetFileSize(fd_);
+ if (file_size_ == Filesystem::kBadFileSize) {
+ // Fails all Advance() calls
+ file_size_ = 0;
+ }
+}
+
+template <typename ProtoT>
+libtextclassifier3::Status
+PortableFileBackedProtoLog<ProtoT>::Iterator::Advance() {
+ if (current_offset_ == kInvalidOffset) {
+ // First Advance() call
+ current_offset_ = initial_offset_;
+ } else {
+ // Jumps to the next proto position
+ ICING_ASSIGN_OR_RETURN(
+ int32_t metadata,
+ ReadProtoMetadata(filesystem_, fd_, current_offset_, file_size_));
+ current_offset_ += sizeof(metadata) + GetProtoSize(metadata);
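+    // E.g. a 10-byte proto entry at offset 100 advances the iterator to
+    // 100 + 4 + 10 = 114, the metadata word being 4 bytes.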
+ }
+
+ if (current_offset_ < file_size_) {
+ return libtextclassifier3::Status::OK;
+ } else {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "The next proto offset, %lld, is out of file range [0, %lld)",
+ static_cast<long long>(current_offset_),
+ static_cast<long long>(file_size_)));
+ }
+}
+
+template <typename ProtoT>
+int64_t PortableFileBackedProtoLog<ProtoT>::Iterator::GetOffset() {
+ return current_offset_;
+}
+
+template <typename ProtoT>
+typename PortableFileBackedProtoLog<ProtoT>::Iterator
+PortableFileBackedProtoLog<ProtoT>::GetIterator() {
+ return Iterator(*filesystem_, fd_.get(),
+ /*initial_offset=*/kHeaderReservedBytes);
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<int32_t>
+PortableFileBackedProtoLog<ProtoT>::ReadProtoMetadata(
+ const Filesystem* const filesystem, int fd, int64_t file_offset,
+ int64_t file_size) {
+ // Checks file_offset
+ if (file_offset >= file_size) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "offset, %lld, is out of file range [0, %lld)",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size)));
+ }
+ int32_t portable_metadata;
+ int metadata_size = sizeof(portable_metadata);
+ if (file_offset + metadata_size >= file_size) {
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Wrong metadata offset %lld, metadata doesn't fit in "
+ "with file range [0, %lld)",
+ static_cast<long long>(file_offset),
+ static_cast<long long>(file_size)));
+ }
+
+ if (!filesystem->PRead(fd, &portable_metadata, metadata_size, file_offset)) {
+    return absl_ports::InternalError("Failed to read proto metadata.");
+ }
+
+ // Need to switch it back to host order endianness after reading from disk.
+ int32_t host_order_metadata = GNetworkToHostL(portable_metadata);
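+  // Illustrative sketch (byte values hypothetical): the big-endian on-disk
+  // bytes {0x5C, 0x00, 0x01, 0x02} decode to 0x5C000102 on every host once
+  // GNetworkToHostL swaps them into host order.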
+
+ // Checks magic number
+ uint8_t stored_k_proto_magic = GetProtoMagic(host_order_metadata);
+ if (stored_k_proto_magic != kProtoMagic) {
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Failed to read kProtoMagic, expected %d, actual %d", kProtoMagic,
+ stored_k_proto_magic));
+ }
+
+ return host_order_metadata;
+}
+
+template <typename ProtoT>
+libtextclassifier3::Status
+PortableFileBackedProtoLog<ProtoT>::WriteProtoMetadata(
+ const Filesystem* filesystem, int fd, int32_t host_order_metadata) {
+ // Convert it into portable endian format before writing to disk
+ int32_t portable_metadata = GHostToNetworkL(host_order_metadata);
+ int portable_metadata_size = sizeof(portable_metadata);
+
+ // Write metadata
+ if (!filesystem->Write(fd, &portable_metadata, portable_metadata_size)) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to write proto metadata."));
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename ProtoT>
+libtextclassifier3::Status PortableFileBackedProtoLog<ProtoT>::PersistToDisk() {
+ int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
+ if (file_size == header_->GetRewindOffset()) {
+ // No new protos appended, don't need to update the checksum.
+ return libtextclassifier3::Status::OK;
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
+
+ header_->SetLogChecksum(crc.Get());
+ header_->SetRewindOffset(file_size);
+ header_->SetHeaderChecksum(header_->CalculateHeaderChecksum());
+
+ if (!filesystem_->PWrite(fd_.get(), /*offset=*/0, header_.get(),
+ sizeof(Header)) ||
+ !filesystem_->DataSync(fd_.get())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to update header to: ", file_path_));
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename ProtoT>
+libtextclassifier3::StatusOr<Crc32>
+PortableFileBackedProtoLog<ProtoT>::ComputeChecksum() {
+ int64_t file_size = filesystem_->GetFileSize(file_path_.c_str());
+ int64_t new_content_size = file_size - header_->GetRewindOffset();
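+  // E.g. with a rewind offset of 1000 and a file size of 1400 (values
+  // illustrative), only the 400 bytes of tail content are checksummed and
+  // folded into the cached log checksum below.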
+ Crc32 crc;
+ if (new_content_size == 0) {
+ // No new protos appended, return cached checksum
+ return Crc32(header_->GetLogChecksum());
+ } else if (new_content_size < 0) {
+ // File shrunk, recalculate the entire checksum.
+ ICING_ASSIGN_OR_RETURN(
+ crc,
+ ComputeChecksum(filesystem_, file_path_, Crc32(),
+ /*start=*/kHeaderReservedBytes, /*end=*/file_size));
+ } else {
+ // Append new changes to the existing checksum.
+ ICING_ASSIGN_OR_RETURN(
+ crc, ComputeChecksum(
+ filesystem_, file_path_, Crc32(header_->GetLogChecksum()),
+ /*start=*/header_->GetRewindOffset(), /*end=*/file_size));
+ }
+ return crc;
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_PORTABLE_FILE_BACKED_PROTO_LOG_H_
diff --git a/icing/file/portable-file-backed-proto-log_benchmark.cc b/icing/file/portable-file-backed-proto-log_benchmark.cc
new file mode 100644
index 0000000..d7ea4bb
--- /dev/null
+++ b/icing/file/portable-file-backed-proto-log_benchmark.cc
@@ -0,0 +1,343 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <random>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/document.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/random-string.h"
+#include "icing/testing/tmp-directory.h"
+
+// go/microbenchmarks
+//
+// To build and run on a local machine:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// icing/file:portable-file-backed-proto-log_benchmark
+//
+// $ blaze-bin/icing/file/portable-file-backed-proto-log_benchmark
+// --benchmark_filter=all
+//
+//
+// To build and run on an Android device (must be connected and rooted):
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// icing/file:portable-file-backed-proto-log_benchmark
+//
+// $ adb root
+//
+// $ adb push
+// blaze-bin/icing/file/portable-file-backed-proto-log_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/portable-file-backed-proto-log-benchmark
+// --benchmark_filter=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+void BM_Write(benchmark::State& state) {
+ const Filesystem filesystem;
+ int string_length = state.range(0);
+ const std::string file_path = IcingStringUtil::StringPrintf(
+ "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
+ int max_proto_size = (1 << 24) - 1; // 16 MiB
+ bool compress = true;
+
+ // Make sure it doesn't already exist.
+ filesystem.DeleteFile(file_path.c_str());
+
+ auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem, file_path,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress, max_proto_size))
+ .ValueOrDie()
+ .proto_log;
+
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ std::default_random_engine random;
+ const std::string rand_str =
+ RandomString(kAlNumAlphabet, string_length, &random);
+
+ auto document_properties = document.add_properties();
+ document_properties->set_name("string property");
+ document_properties->add_string_values(rand_str);
+
+ for (auto _ : state) {
+ testing::DoNotOptimize(proto_log->WriteProto(document));
+ }
+ state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+ string_length);
+
+ // Cleanup after ourselves
+ filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_Write)
+ ->Arg(1)
+ ->Arg(32)
+ ->Arg(512)
+ ->Arg(1024)
+ ->Arg(4 * 1024)
+ ->Arg(8 * 1024)
+ ->Arg(16 * 1024)
+ ->Arg(32 * 1024)
+ ->Arg(256 * 1024)
+ ->Arg(2 * 1024 * 1024)
+ ->Arg(8 * 1024 * 1024)
+ ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is
+ // 16MiB, and we need some extra space for the
+ // rest of the document properties
+
+void BM_Read(benchmark::State& state) {
+ const Filesystem filesystem;
+ int string_length = state.range(0);
+ const std::string file_path = IcingStringUtil::StringPrintf(
+ "%s%s%d%s", GetTestTempDir().c_str(), "/proto_", string_length, ".log");
+ int max_proto_size = (1 << 24) - 1; // 16 MiB
+ bool compress = true;
+
+ // Make sure it doesn't already exist.
+ filesystem.DeleteFile(file_path.c_str());
+
+ auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem, file_path,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress, max_proto_size))
+ .ValueOrDie()
+ .proto_log;
+
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ std::default_random_engine random;
+ const std::string rand_str =
+ RandomString(kAlNumAlphabet, string_length, &random);
+
+ auto document_properties = document.add_properties();
+ document_properties->set_name("string property");
+ document_properties->add_string_values(rand_str);
+
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset,
+ proto_log->WriteProto(document));
+
+ for (auto _ : state) {
+ testing::DoNotOptimize(proto_log->ReadProto(write_offset));
+ }
+ state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) *
+ string_length);
+
+ // Cleanup after ourselves
+ filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_Read)
+ ->Arg(1)
+ ->Arg(32)
+ ->Arg(512)
+ ->Arg(1024)
+ ->Arg(4 * 1024)
+ ->Arg(8 * 1024)
+ ->Arg(16 * 1024)
+ ->Arg(32 * 1024)
+ ->Arg(256 * 1024)
+ ->Arg(2 * 1024 * 1024)
+ ->Arg(8 * 1024 * 1024)
+ ->Arg(15 * 1024 * 1024); // We do 15MiB here since our max proto size is
+ // 16MiB, and we need some extra space for the
+ // rest of the document properties
+
+void BM_Erase(benchmark::State& state) {
+ const Filesystem filesystem;
+ const std::string file_path = IcingStringUtil::StringPrintf(
+ "%s%s", GetTestTempDir().c_str(), "/proto.log");
+ int max_proto_size = (1 << 24) - 1; // 16 MiB
+ bool compress = true;
+
+ // Make sure it doesn't already exist.
+ filesystem.DeleteFile(file_path.c_str());
+
+ auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem, file_path,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress, max_proto_size))
+ .ValueOrDie()
+ .proto_log;
+
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ std::default_random_engine random;
+ const std::string rand_str = RandomString(kAlNumAlphabet, /*len=*/1, &random);
+
+ auto document_properties = document.add_properties();
+ document_properties->set_name("string property");
+ document_properties->add_string_values(rand_str);
+
+ for (auto _ : state) {
+ state.PauseTiming();
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t write_offset,
+ proto_log->WriteProto(document));
+ state.ResumeTiming();
+
+ testing::DoNotOptimize(proto_log->EraseProto(write_offset));
+ }
+
+ // Cleanup after ourselves
+ filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_Erase);
+
+void BM_ComputeChecksum(benchmark::State& state) {
+ const Filesystem filesystem;
+ const std::string file_path = GetTestTempDir() + "/proto.log";
+ int max_proto_size = (1 << 24) - 1; // 16 MiB
+ bool compress = true;
+
+ // Make sure it doesn't already exist.
+ filesystem.DeleteFile(file_path.c_str());
+
+ auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem, file_path,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress, max_proto_size))
+ .ValueOrDie()
+ .proto_log;
+
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ // Make each document 1KiB
+ int string_length = 1024;
+ std::default_random_engine random;
+ const std::string rand_str =
+ RandomString(kAlNumAlphabet, string_length, &random);
+
+ auto document_properties = document.add_properties();
+ document_properties->set_name("string property");
+ document_properties->add_string_values(rand_str);
+
+ int num_docs = state.range(0);
+ for (int i = 0; i < num_docs; ++i) {
+ ICING_ASSERT_OK(proto_log->WriteProto(document));
+ }
+
+ for (auto _ : state) {
+ testing::DoNotOptimize(proto_log->ComputeChecksum());
+ }
+
+ // Cleanup after ourselves
+ filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_ComputeChecksum)->Range(1024, 1 << 20);
+
+void BM_ComputeChecksumWithCachedChecksum(benchmark::State& state) {
+ const Filesystem filesystem;
+ const std::string file_path = GetTestTempDir() + "/proto.log";
+ int max_proto_size = (1 << 24) - 1; // 16 MiB
+ bool compress = true;
+
+ // Make sure it doesn't already exist.
+ filesystem.DeleteFile(file_path.c_str());
+
+ auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem, file_path,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress, max_proto_size))
+ .ValueOrDie()
+ .proto_log;
+
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ // Make the document 1KiB
+ int string_length = 1024;
+ std::default_random_engine random;
+ const std::string rand_str =
+ RandomString(kAlNumAlphabet, string_length, &random);
+
+ auto document_properties = document.add_properties();
+ document_properties->set_name("string property");
+ document_properties->add_string_values(rand_str);
+
+ // Write some content and persist. This should update our cached checksum to
+ // include the document.
+ ICING_ASSERT_OK(proto_log->WriteProto(document));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ // This ComputeChecksum call shouldn't need to do any computation since we can
+ // reuse our cached checksum.
+ for (auto _ : state) {
+ testing::DoNotOptimize(proto_log->ComputeChecksum());
+ }
+
+ // Cleanup after ourselves
+ filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_ComputeChecksumWithCachedChecksum);
+
+void BM_ComputeChecksumOnlyForTail(benchmark::State& state) {
+ const Filesystem filesystem;
+ const std::string file_path = GetTestTempDir() + "/proto.log";
+ int max_proto_size = (1 << 24) - 1; // 16 MiB
+ bool compress = true;
+
+ // Make sure it doesn't already exist.
+ filesystem.DeleteFile(file_path.c_str());
+
+ auto proto_log = PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem, file_path,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress, max_proto_size))
+ .ValueOrDie()
+ .proto_log;
+
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ // Make the document 1KiB
+ int string_length = 1024;
+ std::default_random_engine random;
+ const std::string rand_str =
+ RandomString(kAlNumAlphabet, string_length, &random);
+
+ auto document_properties = document.add_properties();
+ document_properties->set_name("string property");
+ document_properties->add_string_values(rand_str);
+
+ // Write some content and persist. This should update our cached checksum to
+ // include the document.
+ ICING_ASSERT_OK(proto_log->WriteProto(document));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ // Write another proto into the tail, but it's not included in our cached
+ // checksum since we didn't call persist.
+ ICING_ASSERT_OK(proto_log->WriteProto(document));
+
+ // ComputeChecksum should be calculating the checksum of the tail and adding
+ // it to the cached checksum we have.
+ for (auto _ : state) {
+ testing::DoNotOptimize(proto_log->ComputeChecksum());
+ }
+
+ // Cleanup after ourselves
+ filesystem.DeleteFile(file_path.c_str());
+}
+BENCHMARK(BM_ComputeChecksumOnlyForTail);
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/portable-file-backed-proto-log_test.cc b/icing/file/portable-file-backed-proto-log_test.cc
new file mode 100644
index 0000000..cc70151
--- /dev/null
+++ b/icing/file/portable-file-backed-proto-log_test.cc
@@ -0,0 +1,1265 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/portable-file-backed-proto-log.h"
+
+#include <cstdint>
+#include <cstdlib>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/document.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::A;
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::HasSubstr;
+using ::testing::Not;
+using ::testing::NotNull;
+using ::testing::Pair;
+using ::testing::Return;
+
+using Header = PortableFileBackedProtoLog<DocumentProto>::Header;
+
+Header ReadHeader(const Filesystem& filesystem,
+                  const std::string& file_path) {
+ Header header;
+ filesystem.PRead(file_path.c_str(), &header, sizeof(Header),
+ /*offset=*/0);
+ return header;
+}
+
+void WriteHeader(const Filesystem& filesystem, const std::string& file_path,
+ Header& header) {
+ filesystem.Write(file_path.c_str(), &header, sizeof(Header));
+}
+
+class PortableFileBackedProtoLogTest : public ::testing::Test {
+ protected:
+  // Adds a user-defined default constructor because a const member variable
+  // may cause the compiler to implicitly delete the default constructor.
+  // https://stackoverflow.com/a/47368753
+ PortableFileBackedProtoLogTest() {}
+
+ void SetUp() override {
+ file_path_ = GetTestTempDir() + "/proto_log";
+ filesystem_.DeleteFile(file_path_.c_str());
+ }
+
+ void TearDown() override { filesystem_.DeleteFile(file_path_.c_str()); }
+
+ const Filesystem filesystem_;
+ std::string file_path_;
+ bool compress_ = true;
+ int32_t compression_level_ =
+ PortableFileBackedProtoLog<DocumentProto>::kDeflateCompressionLevel;
+ int64_t max_proto_size_ = 256 * 1024; // 256 KiB
+};
+
+TEST_F(PortableFileBackedProtoLogTest, Initialize) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ EXPECT_THAT(create_result.proto_log, NotNull());
+ EXPECT_FALSE(create_result.has_data_loss());
+ EXPECT_FALSE(create_result.recalculated_checksum);
+
+ // Can't recreate the same file with different options.
+ ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ !compress_, max_proto_size_, compression_level_)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PortableFileBackedProtoLogTest, InitializeValidatesOptions) {
+ // max_proto_size must be greater than 0
+ int invalid_max_proto_size = 0;
+ ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, invalid_max_proto_size, compression_level_)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // max_proto_size must be under 16 MiB
+ invalid_max_proto_size = 16 * 1024 * 1024;
+ ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, invalid_max_proto_size, compression_level_)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // compression_level must be between 0 and 9 inclusive
+ int invalid_compression_level = -1;
+ ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, invalid_compression_level)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // compression_level must be between 0 and 9 inclusive
+ invalid_compression_level = 10;
+ ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, invalid_compression_level)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ReservedSpaceForHeader) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+
+  // With no protos written yet, the log should be exactly the size of the
+  // reserved header space.
+ ASSERT_EQ(filesystem_.GetFileSize(file_path_.c_str()),
+ PortableFileBackedProtoLog<DocumentProto>::kHeaderReservedBytes);
+}
+
+TEST_F(PortableFileBackedProtoLogTest, WriteProtoTooLarge) {
+ int max_proto_size = 1;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+  // The proto is too large for the configured max_proto_size.
+ ASSERT_THAT(proto_log->WriteProto(document),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ReadProtoWrongKProtoMagic) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write a proto
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t file_offset,
+ proto_log->WriteProto(document));
+
+  // 4 bytes of metadata that don't contain the kProtoMagic specified in
+  // portable-file-backed-proto-log.h.
+ uint32_t wrong_magic = 0x7E000000;
+
+ // Sanity check that we opened the file correctly
+ int fd = filesystem_.OpenForWrite(file_path_.c_str());
+ ASSERT_GT(fd, 0);
+
+  // Write the wrong kProtoMagic in; the magic is stored at the beginning of
+  // each proto entry.
+ filesystem_.PWrite(fd, file_offset, &wrong_magic, sizeof(wrong_magic));
+
+ ASSERT_THAT(proto_log->ReadProto(file_offset),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ReadWriteUncompressedProto) {
+ int last_offset;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/false, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write the first proto
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(int written_position,
+ proto_log->WriteProto(document1));
+
+ int document1_offset = written_position;
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(written_position),
+ IsOkAndHolds(EqualsProto(document1)));
+
+ // Write a second proto that's close to the max size. Leave some room for
+ // the rest of the proto properties.
+ std::string long_str(max_proto_size_ - 1024, 'a');
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .AddStringProperty("long_str", long_str)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(written_position,
+ proto_log->WriteProto(document2));
+
+ int document2_offset = written_position;
+ last_offset = written_position;
+ ASSERT_GT(document2_offset, document1_offset);
+
+ // Check the second proto
+ ASSERT_THAT(proto_log->ReadProto(written_position),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+ }
+
+ {
+ // Make a new proto_log with the same file_path, and make sure we
+ // can still write to the same underlying file.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/false, max_proto_size_, compression_level_)));
+ auto recreated_proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write a third proto
+ DocumentProto document3 =
+ DocumentBuilder().SetKey("namespace3", "uri3").Build();
+
+ ASSERT_THAT(recreated_proto_log->WriteProto(document3),
+ IsOkAndHolds(Gt(last_offset)));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ReadWriteCompressedProto) {
+ int last_offset;
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/true, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write the first proto
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(int written_position,
+ proto_log->WriteProto(document1));
+
+ int document1_offset = written_position;
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(written_position),
+ IsOkAndHolds(EqualsProto(document1)));
+
+ // Write a second proto that's close to the max size. Leave some room for
+ // the rest of the proto properties.
+ std::string long_str(max_proto_size_ - 1024, 'a');
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .AddStringProperty("long_str", long_str)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(written_position,
+ proto_log->WriteProto(document2));
+
+ int document2_offset = written_position;
+ last_offset = written_position;
+ ASSERT_GT(document2_offset, document1_offset);
+
+ // Check the second proto
+ ASSERT_THAT(proto_log->ReadProto(written_position),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+ }
+
+ {
+ // Make a new proto_log with the same file_path, and make sure we
+ // can still write to the same underlying file.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/true, max_proto_size_, compression_level_)));
+ auto recreated_proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write a third proto
+ DocumentProto document3 =
+ DocumentBuilder().SetKey("namespace3", "uri3").Build();
+
+ ASSERT_THAT(recreated_proto_log->WriteProto(document3),
+ IsOkAndHolds(Gt(last_offset)));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ReadWriteDifferentCompressionLevel) {
+ int document1_offset;
+ int document2_offset;
+ int document3_offset;
+
+  // The first proto to write is close to the max size. Leave some room for
+  // the rest of the proto properties.
+ std::string long_str(max_proto_size_ - 1024, 'a');
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .AddStringProperty("long_str", long_str)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace2", "uri2").Build();
+ DocumentProto document3 =
+ DocumentBuilder().SetKey("namespace3", "uri3").Build();
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/true, max_proto_size_,
+ /*compression_level_in=*/3)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write the first proto
+ ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
+ proto_log->WriteProto(document1));
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ IsOkAndHolds(EqualsProto(document1)));
+
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+ }
+
+ // Make a new proto_log with the same file_path but different compression
+ // level, and make sure we can still read from and write to the same
+ // underlying file.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/true, max_proto_size_,
+ /*compression_level_in=*/9)));
+ auto recreated_proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Check the first proto
+ ASSERT_THAT(recreated_proto_log->ReadProto(document1_offset),
+ IsOkAndHolds(EqualsProto(document1)));
+
+ // Write a second proto
+ ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+ recreated_proto_log->WriteProto(document2));
+
+ ASSERT_GT(document2_offset, document1_offset);
+
+ // Check the second proto
+ ASSERT_THAT(recreated_proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ ICING_ASSERT_OK(recreated_proto_log->PersistToDisk());
+ }
+
+ // One more time but with 0 compression level
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/true, max_proto_size_,
+ /*compression_level=*/0)));
+ auto recreated_proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Check the first proto
+ ASSERT_THAT(recreated_proto_log->ReadProto(document1_offset),
+ IsOkAndHolds(EqualsProto(document1)));
+
+ // Check the second proto
+ ASSERT_THAT(recreated_proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ // Write a third proto
+ ICING_ASSERT_OK_AND_ASSIGN(document3_offset,
+ recreated_proto_log->WriteProto(document3));
+
+ ASSERT_GT(document3_offset, document2_offset);
+
+ // Check the third proto
+ ASSERT_THAT(recreated_proto_log->ReadProto(document3_offset),
+ IsOkAndHolds(EqualsProto(document3)));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+ WriteDifferentCompressionLevelDifferentSizes) {
+ int document_log_size_with_compression_3;
+ int document_log_size_with_no_compression;
+
+  // The first proto to write is close to the max size. Leave some room for
+  // the rest of the proto properties.
+ std::string long_str(max_proto_size_ - 1024, 'a');
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .AddStringProperty("long_str", long_str)
+ .Build();
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/true, max_proto_size_,
+ /*compression_level_in=*/3)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write the proto
+ ICING_ASSERT_OK(proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ document_log_size_with_compression_3 =
+ filesystem_.GetFileSize(file_path_.c_str());
+ }
+
+ // Delete the proto_log so we can reuse the file_path
+ filesystem_.DeleteFile(file_path_.c_str());
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ /*compress_in=*/true, max_proto_size_,
+ /*compression_level_in=*/0)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write the proto
+ ICING_ASSERT_OK(proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ document_log_size_with_no_compression =
+ filesystem_.GetFileSize(file_path_.c_str());
+
+    // The uncompressed document log should be larger than the original
+    // compressed document log.
+ ASSERT_GT(document_log_size_with_no_compression,
+ document_log_size_with_compression_3);
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, CorruptHeader) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto recreated_proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.has_data_loss());
+ }
+
+ int corrupt_checksum = 24;
+
+ // Write the corrupted header
+ Header header = ReadHeader(filesystem_, file_path_);
+ header.SetHeaderChecksum(corrupt_checksum);
+ WriteHeader(filesystem_, file_path_, header);
+
+ {
+ // Reinitialize the same proto_log
+ ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL,
+ HasSubstr("Invalid header checksum")));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, DifferentMagic) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto recreated_proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.has_data_loss());
+
+ // Corrupt the magic that's stored at the beginning of the header.
+ int invalid_magic = -1;
+ ASSERT_THAT(invalid_magic, Not(Eq(Header::kMagic)));
+
+ // Write the corrupted header
+ Header header = ReadHeader(filesystem_, file_path_);
+ header.SetMagic(invalid_magic);
+ WriteHeader(filesystem_, file_path_, header);
+ }
+
+ {
+ // Reinitialize the same proto_log
+ ASSERT_THAT(PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL,
+ HasSubstr("Invalid header kMagic")));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+ UnableToDetectCorruptContentWithoutDirtyBit) {
+  // It is intentional that we can't detect this corruption. We're trading
+  // earlier corruption detection for lower initialization latency: by not
+  // recalculating the checksum on initialization, we can initialize much
+  // faster, at the cost of not catching corruption up front. Note that even
+  // if we did detect corruption, there would be nothing we could do except
+  // return an error to clients. We'll still do that, but at some later
+  // point, when the log is accessed and we can't actually deserialize a
+  // proto from it. See the description in cl/374278280 for more details.
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    // Write and persist a document.
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+ proto_log->WriteProto(document));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ // "Corrupt" the content written in the log.
+ document.set_uri("invalid");
+ std::string serialized_document = document.SerializeAsString();
+ ASSERT_TRUE(filesystem_.PWrite(file_path_.c_str(), document_offset,
+ serialized_document.data(),
+ serialized_document.size()));
+ }
+
+ {
+ // We can recover, and we don't have data loss.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.has_data_loss());
+ EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE));
+ EXPECT_FALSE(create_result.recalculated_checksum);
+
+ // We still have the corrupted content in our file, we didn't throw
+ // everything out.
+ EXPECT_THAT(
+ filesystem_.GetFileSize(file_path_.c_str()),
+ Gt(PortableFileBackedProtoLog<DocumentProto>::kHeaderReservedBytes));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+ DetectAndThrowOutCorruptContentWithDirtyBit) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .AddStringProperty("string_property", "foo", "bar")
+ .Build();
+
+    // Write the proto; the destructor will persist it to disk implicitly.
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+ proto_log->WriteProto(document));
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(document_offset),
+ IsOkAndHolds(EqualsProto(document)));
+ }
+
+ {
+ // "Corrupt" the content written in the log. Make the corrupt document
+ // smaller than our original one so we don't accidentally write past our
+ // file.
+ DocumentProto document =
+ DocumentBuilder().SetKey("invalid_namespace", "invalid_uri").Build();
+ std::string serialized_document = document.SerializeAsString();
+ ASSERT_TRUE(filesystem_.PWrite(
+ file_path_.c_str(),
+ PortableFileBackedProtoLog<DocumentProto>::kHeaderReservedBytes,
+ serialized_document.data(), serialized_document.size()));
+
+ Header header = ReadHeader(filesystem_, file_path_);
+
+ // Set dirty bit to true to reflect that something changed in the log.
+ header.SetDirtyFlag(true);
+ header.SetHeaderChecksum(header.CalculateHeaderChecksum());
+
+ WriteHeader(filesystem_, file_path_, header);
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_TRUE(create_result.has_data_loss());
+ EXPECT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
+
+ // We had to recalculate the checksum to detect the corruption.
+ EXPECT_TRUE(create_result.recalculated_checksum);
+
+ // We lost everything, file size is back down to the header.
+ EXPECT_THAT(
+ filesystem_.GetFileSize(file_path_.c_str()),
+ Eq(PortableFileBackedProtoLog<DocumentProto>::kHeaderReservedBytes));
+
+ // At least the log is no longer dirty.
+ Header header = ReadHeader(filesystem_, file_path_);
+ EXPECT_FALSE(header.GetDirtyFlag());
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, DirtyBitFalseAlarmKeepsData) {
+ DocumentProto document =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+ int64_t document_offset;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write and persist the first proto
+ ICING_ASSERT_OK_AND_ASSIGN(document_offset,
+ proto_log->WriteProto(document));
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(document_offset),
+ IsOkAndHolds(EqualsProto(document)));
+ }
+
+ {
+ Header header = ReadHeader(filesystem_, file_path_);
+
+    // Simulate the dirty flag being set to true even though no data has
+    // changed yet, e.g. we crashed between writing the dirty flag and
+    // erasing a proto.
+ header.SetDirtyFlag(true);
+ header.SetHeaderChecksum(header.CalculateHeaderChecksum());
+
+ WriteHeader(filesystem_, file_path_, header);
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.has_data_loss());
+
+ // Even though nothing changed, the false alarm dirty bit should have
+ // triggered us to recalculate our checksum.
+ EXPECT_TRUE(create_result.recalculated_checksum);
+
+ // Check that our document still exists even though dirty bit was true.
+ EXPECT_THAT(proto_log->ReadProto(document_offset),
+ IsOkAndHolds(EqualsProto(document)));
+
+ Header header = ReadHeader(filesystem_, file_path_);
+ EXPECT_FALSE(header.GetDirtyFlag());
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+ PersistToDiskKeepsPersistedDataAndTruncatesExtraData) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace2", "uri2").Build();
+ int document1_offset, document2_offset;
+ int log_size;
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Write and persist the first proto
+ ICING_ASSERT_OK_AND_ASSIGN(document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ // Write, but don't explicitly persist the second proto
+ ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+ proto_log->WriteProto(document2));
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ IsOkAndHolds(EqualsProto(document1)));
+ ASSERT_THAT(proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ log_size = filesystem_.GetFileSize(file_path_.c_str());
+ ASSERT_GT(log_size, 0);
+
+ // PersistToDisk happens implicitly during the destructor.
+ }
+
+ {
+ // The header rewind position and checksum aren't updated in this "system
+ // crash" scenario.
+
+ std::string bad_proto =
+ "some incomplete proto that we didn't finish writing before the "
+ "system crashed";
+ filesystem_.PWrite(file_path_.c_str(), log_size, bad_proto.data(),
+ bad_proto.size());
+
+ // Double check that we actually wrote something to the underlying file
+ ASSERT_GT(filesystem_.GetFileSize(file_path_.c_str()), log_size);
+ }
+
+ {
+ // We can recover, but we have data loss
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_TRUE(create_result.has_data_loss());
+ ASSERT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL));
+ ASSERT_FALSE(create_result.recalculated_checksum);
+
+ // Check that everything was persisted across instances
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ IsOkAndHolds(EqualsProto(document1)));
+ ASSERT_THAT(proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+
+ // We correctly rewound to the last good state.
+ ASSERT_EQ(log_size, filesystem_.GetFileSize(file_path_.c_str()));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+ DirtyBitIsFalseAfterPutAndPersistToDisk) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+ // Write and persist the first proto
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+ proto_log->WriteProto(document));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(document_offset),
+ IsOkAndHolds(EqualsProto(document)));
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+
+ // We previously persisted to disk so everything should be in a perfect
+ // state.
+ EXPECT_FALSE(create_result.has_data_loss());
+ EXPECT_FALSE(create_result.recalculated_checksum);
+
+ Header header = ReadHeader(filesystem_, file_path_);
+ EXPECT_FALSE(header.GetDirtyFlag());
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+ DirtyBitIsFalseAfterDeleteAndPersistToDisk) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+ // Write, delete, and persist the first proto
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+ proto_log->WriteProto(document));
+ ICING_ASSERT_OK(proto_log->EraseProto(document_offset));
+ ICING_ASSERT_OK(proto_log->PersistToDisk());
+
+ // The proto has been erased.
+ ASSERT_THAT(proto_log->ReadProto(document_offset),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+
+ // We previously persisted to disk so everything should be in a perfect
+ // state.
+ EXPECT_FALSE(create_result.has_data_loss());
+ EXPECT_FALSE(create_result.recalculated_checksum);
+
+ Header header = ReadHeader(filesystem_, file_path_);
+ EXPECT_FALSE(header.GetDirtyFlag());
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, DirtyBitIsFalseAfterPutAndDestructor) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    // Write the first proto
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+ proto_log->WriteProto(document));
+
+ // Check that what we read is what we wrote
+ ASSERT_THAT(proto_log->ReadProto(document_offset),
+ IsOkAndHolds(EqualsProto(document)));
+
+ // PersistToDisk is implicitly called as part of the destructor and
+ // PersistToDisk will clear the dirty bit.
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+
+ // We previously persisted to disk so everything should be in a perfect
+ // state.
+ EXPECT_FALSE(create_result.has_data_loss());
+ EXPECT_FALSE(create_result.recalculated_checksum);
+
+ Header header = ReadHeader(filesystem_, file_path_);
+ EXPECT_FALSE(header.GetDirtyFlag());
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest,
+ DirtyBitIsFalseAfterDeleteAndDestructor) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ DocumentProto document =
+ DocumentBuilder().SetKey("namespace1", "uri1").Build();
+
+    // Write and delete the first proto
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document_offset,
+ proto_log->WriteProto(document));
+ ICING_ASSERT_OK(proto_log->EraseProto(document_offset));
+
+ // The proto has been erased.
+ ASSERT_THAT(proto_log->ReadProto(document_offset),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // PersistToDisk is implicitly called as part of the destructor and
+ // PersistToDisk will clear the dirty bit.
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+
+ // We previously persisted to disk so everything should be in a perfect
+ // state.
+ EXPECT_FALSE(create_result.has_data_loss());
+ EXPECT_FALSE(create_result.recalculated_checksum);
+
+ Header header = ReadHeader(filesystem_, file_path_);
+ EXPECT_FALSE(header.GetDirtyFlag());
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, Iterator) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ {
+ // Empty iterator
+ auto iterator = proto_log->GetIterator();
+ ASSERT_THAT(iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ }
+
+ {
+ // Iterates through some documents
+ ICING_ASSERT_OK(proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->WriteProto(document2));
+ auto iterator = proto_log->GetIterator();
+ // 1st proto
+ ICING_ASSERT_OK(iterator.Advance());
+ ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
+ IsOkAndHolds(EqualsProto(document1)));
+ // 2nd proto
+ ICING_ASSERT_OK(iterator.Advance());
+ ASSERT_THAT(proto_log->ReadProto(iterator.GetOffset()),
+ IsOkAndHolds(EqualsProto(document2)));
+ // Tries to advance
+ ASSERT_THAT(iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ }
+
+ {
+ // Iterator with bad filesystem
+ ScopedFd sfd(filesystem_.OpenForRead(file_path_.c_str()));
+ MockFilesystem mock_filesystem;
+ ON_CALL(mock_filesystem, GetFileSize(A<int>()))
+ .WillByDefault(Return(Filesystem::kBadFileSize));
+ PortableFileBackedProtoLog<DocumentProto>::Iterator bad_iterator(
+ mock_filesystem, sfd.get(), /*initial_offset=*/0);
+ ASSERT_THAT(bad_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ComputeChecksum) {
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+ Crc32 checksum;
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ ICING_EXPECT_OK(proto_log->WriteProto(document));
+
+ ICING_ASSERT_OK_AND_ASSIGN(checksum, proto_log->ComputeChecksum());
+
+ // Calling it twice with no changes should get us the same checksum
+ EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+ }
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Checksum should be consistent across instances
+ EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+
+ // PersistToDisk shouldn't affect the checksum value
+ ICING_EXPECT_OK(proto_log->PersistToDisk());
+ EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+
+ // Check that modifying the log leads to a different checksum
+ ICING_EXPECT_OK(proto_log->WriteProto(document));
+ EXPECT_THAT(proto_log->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum))));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, EraseProtoShouldSetZero) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Writes and erases proto
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ // Checks if the erased area is set to 0.
+ int64_t file_size = filesystem_.GetFileSize(file_path_.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ MemoryMappedFile mmapped_file,
+ MemoryMappedFile::Create(filesystem_, file_path_,
+ MemoryMappedFile::Strategy::READ_ONLY));
+
+  // document1_offset + sizeof(int) is the first byte of the proto itself,
+  // where sizeof(int) is the size of the proto metadata.
+ ICING_ASSERT_OK(
+ mmapped_file.Remap(document1_offset + sizeof(int), file_size - 1));
+ for (size_t i = 0; i < mmapped_file.region_size(); ++i) {
+ ASSERT_THAT(mmapped_file.region()[i], Eq(0));
+ }
+}
+
+TEST_F(PortableFileBackedProtoLogTest, EraseProtoShouldReturnNotFound) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Writes 2 protos
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document2_offset,
+ proto_log->WriteProto(document2));
+
+ // Erases the first proto
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ // The first proto has been erased.
+ ASSERT_THAT(proto_log->ReadProto(document1_offset),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ // The second proto should be returned.
+ ASSERT_THAT(proto_log->ReadProto(document2_offset),
+ IsOkAndHolds(EqualsProto(document2)));
+}
+
+TEST_F(PortableFileBackedProtoLogTest, ChecksumShouldBeCorrectWithErasedProto) {
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "uri1").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "uri2").Build();
+ DocumentProto document3 =
+ DocumentBuilder().SetKey("namespace", "uri3").Build();
+ DocumentProto document4 =
+ DocumentBuilder().SetKey("namespace", "uri4").Build();
+
+ int64_t document2_offset;
+ int64_t document3_offset;
+
+ {
+ // Erase data after the rewind position. This won't update the checksum
+ // immediately.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Writes 3 protos
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t document1_offset,
+ proto_log->WriteProto(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(document2_offset,
+ proto_log->WriteProto(document2));
+ ICING_ASSERT_OK_AND_ASSIGN(document3_offset,
+ proto_log->WriteProto(document3));
+
+ // Erases the 1st proto, checksum won't be updated immediately because the
+ // rewind position is 0.
+ ICING_ASSERT_OK(proto_log->EraseProto(document1_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(2175574628))));
+ } // New checksum is updated in destructor.
+
+ {
+ // Erase data before the rewind position. This will update the checksum
+ // immediately.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Erases the 2nd proto that is now before the rewind position. Checksum
+ // is updated.
+ ICING_ASSERT_OK(proto_log->EraseProto(document2_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(790877774))));
+ }
+
+ {
+ // Append data and erase data before the rewind position. This will update
+ // the checksum twice: in EraseProto() and destructor.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ ASSERT_FALSE(create_result.has_data_loss());
+
+ // Append a new document which is after the rewind position.
+ ICING_ASSERT_OK(proto_log->WriteProto(document4));
+
+ // Erases the 3rd proto that is now before the rewind position. Checksum
+ // is updated.
+ ICING_ASSERT_OK(proto_log->EraseProto(document3_offset));
+
+ EXPECT_THAT(proto_log->ComputeChecksum(),
+ IsOkAndHolds(Eq(Crc32(2344803210))));
+ } // Checksum is updated with the newly appended document.
+
+ {
+ // A successful creation means that the checksum matches.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PortableFileBackedProtoLog<DocumentProto>::CreateResult create_result,
+ PortableFileBackedProtoLog<DocumentProto>::Create(
+ &filesystem_, file_path_,
+ PortableFileBackedProtoLog<DocumentProto>::Options(
+ compress_, max_proto_size_, compression_level_)));
+ auto proto_log = std::move(create_result.proto_log);
+ EXPECT_FALSE(create_result.has_data_loss());
+ }
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/posting_list/flash-index-storage-header.h b/icing/file/posting_list/flash-index-storage-header.h
new file mode 100644
index 0000000..6bbf1ba
--- /dev/null
+++ b/icing/file/posting_list/flash-index-storage-header.h
@@ -0,0 +1,122 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_HEADER_H_
+#define ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_HEADER_H_
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+
+namespace icing {
+namespace lib {
+
+// The class used to manage the flash block that contains the header for
+// FlashIndexStorage. This contains information about the index blocks that
+// store the posting lists.
+class HeaderBlock {
+ public:
+ // The class used to access the actual header.
+ struct Header {
+ // A magic used to mark the beginning of a valid header.
+ static constexpr int kMagic = 0xb0780cf4;
+ int magic;
+ int block_size;
+ int last_indexed_docid;
+ // The size of the index_block_infos array.
+ int num_index_block_infos;
+
+ struct IndexBlockInfo {
+ // The size of the posting lists that fit on all the index blocks in this
+ // chain. Each block on this posting list will have posting lists of size
+ // posting_list_bytes.
+ int posting_list_bytes;
+ // The block index of the first block in the free list chain.
+ int free_list_block_index;
+ };
+ // Variable-size array, num_index_block_infos long. Can have a max length
+ // of log(block_size). This array is used to maintain a free list for the
+ // available blocks.
+ IndexBlockInfo index_block_infos[0];
+ };
+
+ // Read HeaderBlock from the specified fd.
+ //
+ // RETURNS:
+ // - HeaderBlock, on success
+ // - INTERNAL if unable to read block_size bytes from fd.
+ static libtextclassifier3::StatusOr<HeaderBlock> Read(
+ const Filesystem* filesystem, int fd, int block_size) {
+ std::unique_ptr<uint8_t[]> buffer = std::make_unique<uint8_t[]>(block_size);
+ if (!filesystem->PRead(fd, buffer.get(), block_size, 0)) {
+ return absl_ports::InternalError("Unable to reader header block!");
+ }
+ return HeaderBlock(filesystem, std::move(buffer), block_size);
+ }
+
+ // Make a new HeaderBlock with the specified size.
+ explicit HeaderBlock(const Filesystem* filesystem, int block_size)
+ : HeaderBlock(filesystem, std::make_unique<uint8_t[]>(block_size),
+ block_size) {
+ std::memset(header_buffer_.get(), 0, block_size);
+ }
+
+ Header* header() const {
+ return reinterpret_cast<Header*>(header_buffer_.get());
+ }
+
+ // Add another entry to the index_block_infos array and return a pointer to
+ // that entry. Returns a nullptr if the index_block_infos array is already
+ // at a max size.
+ Header::IndexBlockInfo* AddIndexBlockInfo() {
+ if (size() + sizeof(Header::IndexBlockInfo) > block_size_) {
+ return nullptr;
+ }
+ ++header()->num_index_block_infos;
+ return header()->index_block_infos + (header()->num_index_block_infos - 1);
+ }
+
+ // Returns the size of the header block currently in use.
+ int size() const {
+ return sizeof(Header) +
+ header()->num_index_block_infos * sizeof(Header::IndexBlockInfo);
+ }
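+  // For example (illustrative arithmetic only): the fixed Header is 16 bytes
+  // (four ints, per the static_assert below) and each IndexBlockInfo is 8
+  // bytes (two ints), so a header holding 3 infos occupies 16 + 3 * 8 = 40
+  // bytes of the block.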
+
+ // Writes the header to fd. Returns true on success.
+ bool Write(int fd) {
+ return filesystem_->PWrite(fd, 0, header_buffer_.get(), block_size_);
+ }
+
+ private:
+ explicit HeaderBlock(const Filesystem* filesystem,
+ std::unique_ptr<uint8_t[]> buffer, int block_size)
+ : filesystem_(filesystem),
+ header_buffer_(std::move(buffer)),
+ block_size_(block_size) {}
+
+ const Filesystem* filesystem_; // does NOT own!
+ std::unique_ptr<uint8_t[]> header_buffer_;
+ int block_size_;
+};
+static_assert(16 == sizeof(HeaderBlock::Header),
+ "Header has changed size. Consider how this change might affect "
+ "pre-existing indices.");
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_HEADER_H_
diff --git a/icing/file/posting_list/flash-index-storage.cc b/icing/file/posting_list/flash-index-storage.cc
new file mode 100644
index 0000000..2198d2c
--- /dev/null
+++ b/icing/file/posting_list/flash-index-storage.cc
@@ -0,0 +1,661 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/posting_list/flash-index-storage.h"
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <cerrno>
+#include <cinttypes>
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/posting_list/index-block.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/math-util.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<FlashIndexStorage> FlashIndexStorage::Create(
+ std::string index_filename, const Filesystem* filesystem,
+ PostingListSerializer* serializer, bool in_memory) {
+ ICING_RETURN_ERROR_IF_NULL(filesystem);
+ ICING_RETURN_ERROR_IF_NULL(serializer);
+
+ FlashIndexStorage storage(filesystem, std::move(index_filename), serializer,
+ in_memory);
+ if (!storage.Init()) {
+ return absl_ports::InternalError(
+ "Unable to successfully read header block!");
+ }
+ return storage;
+}
+
+/* static */ libtextclassifier3::StatusOr<int>
+FlashIndexStorage::ReadHeaderMagic(const Filesystem* filesystem,
+ const std::string& index_filename) {
+ ICING_RETURN_ERROR_IF_NULL(filesystem);
+
+ if (!filesystem->FileExists(index_filename.c_str())) {
+ return absl_ports::NotFoundError("Flash index file doesn't exist");
+ }
+
+ ScopedFd sfd(filesystem->OpenForRead(index_filename.c_str()));
+ if (!sfd.is_valid()) {
+ return absl_ports::InternalError("Fail to open flash index file");
+ }
+
+ uint32_t block_size = SelectBlockSize();
+ // Read and validate header.
+ ICING_ASSIGN_OR_RETURN(HeaderBlock header_block,
+ HeaderBlock::Read(filesystem, sfd.get(), block_size));
+ return header_block.header()->magic;
+}
+
+FlashIndexStorage::~FlashIndexStorage() {
+ if (header_block_ != nullptr) {
+ libtextclassifier3::Status status = FlushInMemoryFreeList();
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << "Cannot flush in memory free list: "
+ << status.error_message();
+ }
+ PersistToDisk();
+ }
+}
+
+/* static */ uint32_t FlashIndexStorage::SelectBlockSize() {
+ // This should be close to the flash page size.
+ static constexpr uint32_t kMinBlockSize = 4096;
+
+ // Determine a good block size.
+ uint32_t page_size = getpagesize();
+ uint32_t block_size = std::max(kMinBlockSize, page_size);
+
+ // Align up to the nearest page size.
+ return math_util::RoundUpTo(block_size, page_size);
+}
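+
+// Worked example for SelectBlockSize (illustrative values): with a common
+// 4 KiB page size, block_size = max(4096, 4096) = 4096 and rounding up to the
+// page size leaves it at 4096; on a 16 KiB-page system the result is 16384.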
+
+bool FlashIndexStorage::Init() {
+ storage_sfd_ = ScopedFd(filesystem_->OpenForWrite(index_filename_.c_str()));
+ if (!storage_sfd_.is_valid()) {
+ return false;
+ }
+
+ // Read in or create the header.
+ return InitHeader();
+}
+
+bool FlashIndexStorage::InitHeader() {
+ // Look for an existing file size.
+ int64_t file_size = filesystem_->GetFileSize(storage_sfd_.get());
+ if (file_size == Filesystem::kBadFileSize) {
+ ICING_LOG(ERROR) << "Could not initialize main index. Bad file size.";
+ return false;
+ }
+
+ if (file_size == 0) {
+ if (!CreateHeader()) {
+ ICING_LOG(ERROR)
+ << "Could not initialize main index. Unable to create header.";
+ return false;
+ }
+ } else {
+ if (!OpenHeader(file_size)) {
+ ICING_LOG(ERROR)
+ << "Could not initialize main index. Unable to open header.";
+ return false;
+ }
+ }
+ in_memory_freelists_.resize(header_block_->header()->num_index_block_infos);
+
+ return true;
+}
+
+bool FlashIndexStorage::CreateHeader() {
+ uint32_t block_size = SelectBlockSize();
+ header_block_ = std::make_unique<HeaderBlock>(filesystem_, block_size);
+ // Initialize.
+ header_block_->header()->magic = HeaderBlock::Header::kMagic;
+ header_block_->header()->block_size = block_size;
+ header_block_->header()->last_indexed_docid = kInvalidDocumentId;
+
+ // Work down from the largest posting list that fits in
+ // block_size. We don't care about locality of blocks because this
+ // is a flash index.
+ for (uint32_t posting_list_bytes = max_posting_list_bytes();
+ posting_list_bytes >= serializer_->GetMinPostingListSize();
+ posting_list_bytes /= 2) {
+ uint32_t aligned_posting_list_bytes =
+ (posting_list_bytes / serializer_->GetDataTypeBytes()) *
+ serializer_->GetDataTypeBytes();
+ ICING_VLOG(1) << "Block size "
+ << header_block_->header()->num_index_block_infos << ": "
+ << aligned_posting_list_bytes;
+
+ // Initialize free list to empty.
+ HeaderBlock::Header::IndexBlockInfo* block_info =
+ header_block_->AddIndexBlockInfo();
+ if (block_info == nullptr) {
+      // This should never happen anyway. The min block size is 4k, so adding
+      // these IndexBlockInfos should never exceed the block size.
+ return false;
+ }
+ block_info->posting_list_bytes = aligned_posting_list_bytes;
+ block_info->free_list_block_index = kInvalidBlockIndex;
+ }
+
+ // Write the header.
+ if (!header_block_->Write(storage_sfd_.get())) {
+ filesystem_->Truncate(storage_sfd_.get(), 0);
+ return false;
+ }
+ num_blocks_ = 1;
+ return true;
+}
+
+bool FlashIndexStorage::OpenHeader(int64_t file_size) {
+ uint32_t block_size = SelectBlockSize();
+ // Read and validate header.
+ ICING_ASSIGN_OR_RETURN(
+ HeaderBlock read_header,
+ HeaderBlock::Read(filesystem_, storage_sfd_.get(), block_size), false);
+ if (read_header.header()->magic != HeaderBlock::Header::kMagic) {
+ ICING_LOG(ERROR) << "Index header block wrong magic";
+ return false;
+ }
+ if (file_size % read_header.header()->block_size != 0) {
+ ICING_LOG(ERROR) << "Index size " << file_size
+ << " not a multiple of block size "
+ << read_header.header()->block_size;
+ return false;
+ }
+
+ if (file_size < static_cast<int64_t>(read_header.header()->block_size)) {
+ ICING_LOG(ERROR) << "Index size " << file_size
+ << " shorter than block size "
+ << read_header.header()->block_size;
+ return false;
+ }
+
+ if (read_header.header()->block_size % getpagesize() != 0) {
+ ICING_LOG(ERROR) << "Block size " << read_header.header()->block_size
+ << " is not a multiple of page size " << getpagesize();
+ return false;
+ }
+ num_blocks_ = file_size / read_header.header()->block_size;
+ if (block_size != read_header.header()->block_size) {
+ // The block_size changed? That's weird. But the old block_size is still
+ // valid (it must be some multiple of the new block_size). So reinitialize
+ // with that old block size. Using the old block size means that we can
+ // still use the main index, but reads/writes won't be as efficient in terms
+ // of flash IO because the 'blocks' that we're reading are actually multiple
+ // pages long.
+ ICING_LOG(ERROR) << "Block size of existing header ("
+ << read_header.header()->block_size
+ << ") does not match the requested block size ("
+ << block_size << "). Defaulting to existing block size "
+ << read_header.header()->block_size;
+    ICING_ASSIGN_OR_RETURN(read_header,
+                           HeaderBlock::Read(filesystem_, storage_sfd_.get(),
+                                             read_header.header()->block_size),
+                           false);
+ }
+ header_block_ = std::make_unique<HeaderBlock>(std::move(read_header));
+
+ // Check for memory alignment on posting_list_bytes. See b/29983315.
+ // The issue of potential corruption to the header could also be handled by
+ // checksumming the header block.
+ for (int i = 0; i < header_block_->header()->num_index_block_infos; ++i) {
+ int posting_list_bytes =
+ header_block_->header()->index_block_infos[i].posting_list_bytes;
+ if (posting_list_bytes % serializer_->GetDataTypeBytes() != 0) {
+ ICING_LOG(ERROR)
+ << "Posting list size misaligned, index " << i << ", size "
+ << header_block_->header()->index_block_infos[i].posting_list_bytes
+ << ", data_type_bytes " << serializer_->GetDataTypeBytes()
+ << ", file_size " << file_size;
+ return false;
+ }
+ }
+ return true;
+}
+
+bool FlashIndexStorage::PersistToDisk() {
+ // First, write header.
+ if (!header_block_->Write(storage_sfd_.get())) {
+ ICING_LOG(ERROR) << "Write index header failed: " << strerror(errno);
+ return false;
+ }
+
+ // Then sync.
+ return filesystem_->DataSync(storage_sfd_.get());
+}
+
+libtextclassifier3::Status FlashIndexStorage::Reset() {
+ // Reset in-memory members to default values.
+ num_blocks_ = 0;
+ header_block_.reset();
+ storage_sfd_.reset();
+ in_memory_freelists_.clear();
+
+ // Delete the underlying file.
+ if (!filesystem_->DeleteFile(index_filename_.c_str())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Unable to delete file: ", index_filename_));
+ }
+
+ // Re-initialize.
+ if (!Init()) {
+ return absl_ports::InternalError(
+ "Unable to successfully read header block!");
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::GetPostingList(PostingListIdentifier id) const {
+ ICING_ASSIGN_OR_RETURN(IndexBlock block, GetIndexBlock(id.block_index()));
+ ICING_ASSIGN_OR_RETURN(
+ IndexBlock::PostingListAndBlockInfo pl_block_info,
+ block.GetAllocatedPostingList(id.posting_list_index()));
+ return PostingListHolder(std::move(pl_block_info.posting_list_used), id,
+ pl_block_info.next_block_index);
+}
+
+libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::GetIndexBlock(
+ uint32_t block_index) const {
+ if (block_index >= num_blocks_) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Unable to create an index block at index %" PRIu32
+ " when only %d blocks have been allocated.",
+ block_index, num_blocks_));
+ }
+ off_t offset = static_cast<off_t>(block_index) * block_size();
+ return IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ filesystem_, serializer_, storage_sfd_.get(), offset, block_size());
+}
+
+libtextclassifier3::StatusOr<IndexBlock> FlashIndexStorage::CreateIndexBlock(
+ uint32_t block_index, uint32_t posting_list_size) const {
+ if (block_index >= num_blocks_) {
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Unable to create an index block at index %" PRIu32
+ " when only %d blocks have been allocated.",
+ block_index, num_blocks_));
+ }
+ off_t offset = static_cast<off_t>(block_index) * block_size();
+ return IndexBlock::CreateFromUninitializedRegion(
+ filesystem_, serializer_, storage_sfd_.get(), offset, block_size(),
+ posting_list_size);
+}
+
+int FlashIndexStorage::FindBestIndexBlockInfo(
+ uint32_t posting_list_bytes) const {
+ int i = header_block_->header()->num_index_block_infos - 1;
+ for (; i >= 0; i--) {
+ if (header_block_->header()->index_block_infos[i].posting_list_bytes >=
+ posting_list_bytes) {
+ return i;
+ }
+ }
+ return i;
+}
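+
+// Example (illustrative sizes): if index_block_infos holds posting list sizes
+// {2048, 1024, 512} at indices {0, 1, 2} (CreateHeader() fills the array in
+// descending order), a request for 600 bytes scans from the back: 512 is too
+// small, 1024 fits, so index 1 is returned. A request larger than index 0's
+// size falls off the front of the loop and returns -1.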
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::GetPostingListFromInMemoryFreeList(int block_info_index) {
+ // Get something from in memory free list.
+ ICING_ASSIGN_OR_RETURN(PostingListIdentifier posting_list_id,
+ in_memory_freelists_[block_info_index].TryPop());
+ // Remember, posting lists stored on the in-memory free list were never
+ // actually freed. So it will still contain a valid PostingListUsed. First, we
+ // need to free this posting list.
+ ICING_ASSIGN_OR_RETURN(IndexBlock block,
+ GetIndexBlock(posting_list_id.block_index()));
+ ICING_RETURN_IF_ERROR(
+ block.FreePostingList(posting_list_id.posting_list_index()));
+
+ // Now, we can allocate a posting list from the same index block. It may not
+ // be the same posting list that was just freed, but that's okay.
+ ICING_ASSIGN_OR_RETURN(IndexBlock::PostingListAndBlockInfo pl_block_info,
+ block.AllocatePostingList());
+ posting_list_id = PostingListIdentifier(
+ posting_list_id.block_index(), pl_block_info.posting_list_index,
+ posting_list_id.posting_list_index_bits());
+
+ return PostingListHolder(std::move(pl_block_info.posting_list_used),
+ posting_list_id, pl_block_info.next_block_index);
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::GetPostingListFromOnDiskFreeList(int block_info_index) {
+ // Get something from the free list.
+ uint32_t block_index = header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index;
+ if (block_index == kInvalidBlockIndex) {
+ return absl_ports::NotFoundError("No available entry in free list.");
+ }
+
+ // Get the index block
+ ICING_ASSIGN_OR_RETURN(IndexBlock block, GetIndexBlock(block_index));
+ ICING_ASSIGN_OR_RETURN(IndexBlock::PostingListAndBlockInfo pl_block_info,
+ block.AllocatePostingList());
+ PostingListIdentifier posting_list_id =
+ PostingListIdentifier(block_index, pl_block_info.posting_list_index,
+ block.posting_list_index_bits());
+ if (!pl_block_info.has_free_posting_lists) {
+ ICING_RETURN_IF_ERROR(
+ RemoveFromOnDiskFreeList(block_index, block_info_index, &block));
+ }
+
+ return PostingListHolder(std::move(pl_block_info.posting_list_used),
+ posting_list_id, pl_block_info.next_block_index);
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::AllocateNewPostingList(int block_info_index) {
+ uint32_t block_index = GrowIndex();
+ if (block_index == kInvalidBlockIndex) {
+ return absl_ports::ResourceExhaustedError(
+ "Unable to grow the index further!");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ IndexBlock block,
+ CreateIndexBlock(block_index, header_block_->header()
+ ->index_block_infos[block_info_index]
+ .posting_list_bytes));
+ ICING_ASSIGN_OR_RETURN(IndexBlock::PostingListAndBlockInfo pl_block_info,
+ block.AllocatePostingList());
+ PostingListIdentifier posting_list_id =
+ PostingListIdentifier(block_index, pl_block_info.posting_list_index,
+ block.posting_list_index_bits());
+ if (pl_block_info.has_free_posting_lists) {
+ AddToOnDiskFreeList(block_index, block_info_index, &block);
+ }
+
+ return PostingListHolder(std::move(pl_block_info.posting_list_used),
+ posting_list_id, pl_block_info.next_block_index);
+}
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::AllocatePostingList(uint32_t min_posting_list_bytes) {
+ int max_pl_size = max_posting_list_bytes();
+ if (min_posting_list_bytes > max_pl_size) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Requested posting list size %d exceeds max posting list size %d",
+ min_posting_list_bytes, max_pl_size));
+ }
+ int best_block_info_index = FindBestIndexBlockInfo(min_posting_list_bytes);
+
+ auto holder_or = GetPostingListFromInMemoryFreeList(best_block_info_index);
+ if (holder_or.ok()) {
+ return std::move(holder_or).ValueOrDie();
+ }
+
+ // Nothing in memory. Look for something in the block file.
+ holder_or = GetPostingListFromOnDiskFreeList(best_block_info_index);
+ if (holder_or.ok()) {
+ return std::move(holder_or).ValueOrDie();
+ }
+
+ return AllocateNewPostingList(best_block_info_index);
+}
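+
+// A minimal caller sketch (illustrative only; `storage` is assumed to be an
+// initialized FlashIndexStorage and 64 an acceptable minimum size):
+//
+//   ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+//                          storage.AllocatePostingList(
+//                              /*min_posting_list_bytes=*/64));
+//   // ... mutate holder.posting_list via the serializer ...
+//   ICING_RETURN_IF_ERROR(storage.WritePostingListToDisk(holder));
+//   ICING_RETURN_IF_ERROR(storage.FreePostingList(std::move(holder)));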
+
+libtextclassifier3::StatusOr<PostingListHolder>
+FlashIndexStorage::AllocateAndChainMaxSizePostingList(
+ uint32_t prev_block_index) {
+ uint32_t max_pl_size = max_posting_list_bytes();
+ int best_block_info_index = FindBestIndexBlockInfo(max_pl_size);
+
+ auto holder_or = GetPostingListFromInMemoryFreeList(best_block_info_index);
+ if (!holder_or.ok()) {
+ // Nothing in memory. Look for something in the block file.
+ holder_or = GetPostingListFromOnDiskFreeList(best_block_info_index);
+ }
+
+ if (!holder_or.ok()) {
+ // Nothing in memory or block file. Allocate new block and posting list.
+ holder_or = AllocateNewPostingList(best_block_info_index);
+ }
+
+ if (!holder_or.ok()) {
+ return holder_or;
+ }
+
+ PostingListHolder holder = std::move(holder_or).ValueOrDie();
+ ICING_ASSIGN_OR_RETURN(IndexBlock block,
+ GetIndexBlock(holder.id.block_index()));
+ ICING_RETURN_IF_ERROR(block.SetNextBlockIndex(prev_block_index));
+ holder.next_block_index = prev_block_index;
+ return holder;
+}
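+
+// Chaining example (illustrative block numbers): if block 7 currently heads a
+// chain of max-sized posting lists, AllocateAndChainMaxSizePostingList(7)
+// allocates a new block (say, block 9), sets block 9's next_block_index to 7,
+// and returns a holder whose next_block_index is 7; block 9 becomes the new
+// chain head.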
+
+void FlashIndexStorage::AddToOnDiskFreeList(uint32_t block_index,
+ int block_info_index,
+ IndexBlock* index_block) {
+ libtextclassifier3::Status status =
+ index_block->SetNextBlockIndex(header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index);
+ if (!status.ok()) {
+ // If an error occurs, then simply skip this block. It just prevents us from
+ // allocating posting lists from this free block in the future and thus
+ // wastes at most one block, but the entire storage (including the
+ // FlashIndexStorage header) is still valid. Therefore, we can swallow
+ // errors here.
+ ICING_VLOG(1) << "Fail to set next block index to chain blocks with free "
+ "lists on disk: "
+ << status.error_message();
+ return;
+ }
+
+ header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index = block_index;
+}
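+
+// Free list chaining example (illustrative block numbers): suppose the header
+// free list for this size currently starts at block 3. Adding block 9 first
+// sets block 9's next block index to 3, then points the header's
+// free_list_block_index at 9, so the chain becomes 9 -> 3 -> ...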
+
+libtextclassifier3::Status FlashIndexStorage::RemoveFromOnDiskFreeList(
+ uint32_t block_index, int block_info_index, IndexBlock* index_block) {
+ // Cannot be used anymore. Move free ptr to the next block.
+ ICING_ASSIGN_OR_RETURN(uint32_t next_block_index,
+ index_block->GetNextBlockIndex());
+ ICING_RETURN_IF_ERROR(index_block->SetNextBlockIndex(kInvalidBlockIndex));
+ header_block_->header()
+ ->index_block_infos[block_info_index]
+ .free_list_block_index = next_block_index;
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status FlashIndexStorage::FreePostingList(
+ PostingListHolder&& holder) {
+ ICING_ASSIGN_OR_RETURN(IndexBlock block,
+ GetIndexBlock(holder.id.block_index()));
+ if (block.posting_list_bytes() == max_posting_list_bytes()) {
+ ICING_RETURN_IF_ERROR(block.SetNextBlockIndex(kInvalidBlockIndex));
+ }
+
+ uint32_t posting_list_bytes = block.posting_list_bytes();
+ int best_block_info_index = FindBestIndexBlockInfo(posting_list_bytes);
+
+ // It *should* be guaranteed elsewhere that FindBestIndexBlockInfo will not
+  // return a value >= in_memory_freelists_.size(), but check regardless. If
+  // it doesn't fit for some reason, then put it in the Header free list
+  // instead.
+ if (has_in_memory_freelists_ &&
+ best_block_info_index < in_memory_freelists_.size()) {
+ in_memory_freelists_[best_block_info_index].Push(holder.id);
+ } else {
+ ICING_ASSIGN_OR_RETURN(bool was_not_full, block.HasFreePostingLists());
+ ICING_RETURN_IF_ERROR(
+ block.FreePostingList(holder.id.posting_list_index()));
+ // If this block was not already full, then it is already in the free list.
+ if (!was_not_full) {
+ AddToOnDiskFreeList(holder.id.block_index(), best_block_info_index,
+ &block);
+ }
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status FlashIndexStorage::WritePostingListToDisk(
+ const PostingListHolder& holder) {
+ ICING_ASSIGN_OR_RETURN(IndexBlock block,
+ GetIndexBlock(holder.id.block_index()));
+ return block.WritePostingListToDisk(holder.posting_list,
+ holder.id.posting_list_index());
+}
+
+int FlashIndexStorage::GrowIndex() {
+ if (num_blocks_ >= kMaxBlockIndex) {
+ ICING_VLOG(1) << "Reached max block index " << kMaxBlockIndex;
+ return kInvalidBlockIndex;
+ }
+
+ // Grow the index file.
+ if (!filesystem_->Grow(
+ storage_sfd_.get(),
+ static_cast<uint64_t>(num_blocks_ + 1) * block_size())) {
+ ICING_VLOG(1) << "Error growing index file: " << strerror(errno);
+ return kInvalidBlockIndex;
+ }
+
+ return num_blocks_++;
+}
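+
+// Example (illustrative): with a 4096-byte block size and num_blocks_ == 3,
+// GrowIndex() extends the file to 4 * 4096 = 16384 bytes and returns 3, the
+// index of the newly added block.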
+
+libtextclassifier3::Status FlashIndexStorage::FlushInMemoryFreeList() {
+ for (int i = 0; i < in_memory_freelists_.size(); ++i) {
+ FreeList& freelist = in_memory_freelists_.at(i);
+ auto freelist_elt_or = freelist.TryPop();
+ while (freelist_elt_or.ok()) {
+ PostingListIdentifier freelist_elt = freelist_elt_or.ValueOrDie();
+ // Remember, posting lists stored on the in-memory free list were never
+ // actually freed. So it will still contain a valid PostingListUsed.
+ // First, we need to free this posting list.
+ auto block_or = GetIndexBlock(freelist_elt.block_index());
+ if (!block_or.ok()) {
+ // Can't read the block. Nothing to do here. This posting list will have
+ // to leak. Just proceed to the next freelist element.
+ freelist_elt_or = freelist.TryPop();
+ continue;
+ }
+ IndexBlock block = std::move(block_or).ValueOrDie();
+ ICING_ASSIGN_OR_RETURN(bool was_not_full, block.HasFreePostingLists());
+ ICING_RETURN_IF_ERROR(
+ block.FreePostingList(freelist_elt.posting_list_index()));
+ // If this block was not already full, then it is already in the free
+ // list.
+ if (!was_not_full) {
+ AddToOnDiskFreeList(freelist_elt.block_index(), /*block_info_index=*/i,
+ &block);
+ }
+ freelist_elt_or = freelist.TryPop();
+ }
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+void FlashIndexStorage::GetDebugInfo(DebugInfoVerbosity::Code verbosity,
+ std::string* out) const {
+ // Dump and check integrity of the index block free lists.
+ out->append("Free lists:\n");
+ for (size_t i = 0; i < header_block_->header()->num_index_block_infos; ++i) {
+ // TODO(tjbarron) Port over StringAppendFormat to migrate off of this legacy
+ // util.
+ IcingStringUtil::SStringAppendF(
+ out, 100, "Posting list bytes %u: ",
+ header_block_->header()->index_block_infos[i].posting_list_bytes);
+ uint32_t block_index =
+ header_block_->header()->index_block_infos[i].free_list_block_index;
+ int count = 0;
+ while (block_index != kInvalidBlockIndex) {
+ auto block_or = GetIndexBlock(block_index);
+ IcingStringUtil::SStringAppendF(out, 100, "%u ", block_index);
+ ++count;
+
+ block_index = kInvalidBlockIndex;
+ if (block_or.ok()) {
+ auto block_index_or = block_or.ValueOrDie().GetNextBlockIndex();
+ if (block_index_or.ok()) {
+ block_index = block_index_or.ValueOrDie();
+ }
+ }
+ }
+ IcingStringUtil::SStringAppendF(out, 100, "(count=%d)\n", count);
+ }
+
+ out->append("In memory free lists:\n");
+ if (in_memory_freelists_.size() ==
+ header_block_->header()->num_index_block_infos) {
+ for (size_t i = 0; i < in_memory_freelists_.size(); ++i) {
+ IcingStringUtil::SStringAppendF(
+ out, 100, "Posting list bytes %u %s\n",
+ header_block_->header()->index_block_infos[i].posting_list_bytes,
+ in_memory_freelists_.at(i).DebugString().c_str());
+ }
+ } else {
+ IcingStringUtil::SStringAppendF(
+ out, 100,
+ "In memory free list size %zu doesn't match index block infos size "
+ "%d\n",
+ in_memory_freelists_.size(),
+ header_block_->header()->num_index_block_infos);
+ }
+}
+
+// FreeList.
+void FlashIndexStorage::FreeList::Push(PostingListIdentifier id) {
+ if (free_list_.size() >= kMaxSize) {
+ ICING_LOG(WARNING)
+ << "Freelist for posting lists of size (block_size / "
+ << (1u << id.posting_list_index_bits())
+ << ") has reached max size. Dropping freed posting list [block_index:"
+ << id.block_index()
+ << ", posting_list_index:" << id.posting_list_index() << "]";
+ ++num_dropped_free_list_entries_;
+ return;
+ }
+
+ free_list_.push_back(id);
+ free_list_size_high_watermark_ = std::max(
+ free_list_size_high_watermark_, static_cast<int>(free_list_.size()));
+}
+
+libtextclassifier3::StatusOr<PostingListIdentifier>
+FlashIndexStorage::FreeList::TryPop() {
+ if (free_list_.empty()) {
+ return absl_ports::NotFoundError("No available entry in free list.");
+ }
+
+ PostingListIdentifier id = free_list_.back();
+ free_list_.pop_back();
+ return id;
+}
+
+std::string FlashIndexStorage::FreeList::DebugString() const {
+ return IcingStringUtil::StringPrintf(
+ "size %zu max %d dropped %d", free_list_.size(),
+ free_list_size_high_watermark_, num_dropped_free_list_entries_);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/posting_list/flash-index-storage.h b/icing/file/posting_list/flash-index-storage.h
new file mode 100644
index 0000000..378b2dc
--- /dev/null
+++ b/icing/file/posting_list/flash-index-storage.h
@@ -0,0 +1,381 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_
+#define ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/posting_list/flash-index-storage-header.h"
+#include "icing/file/posting_list/index-block.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// PostingListHolder: group PostingListUsed, id, and some other useful info for
+// callers.
+struct PostingListHolder {
+ // PostingListUsed owns an in-memory posting list data buffer. The data being
+ // interpreted is initialized via PRead from the storage. As such, we should
+ // sync it to disk after modifying it.
+ PostingListUsed posting_list;
+
+ // The PostingListIdentifier, which identifies both the block index and the
+ // posting list index on that block, is also returned for convenience.
+ PostingListIdentifier id;
+
+ // Next block index is also returned for convenience. If PostingListUsed is a
+ // max-sized posting list, then the caller has to use this value to handle
+ // chained max-sized posting list blocks.
+ uint32_t next_block_index;
+
+ explicit PostingListHolder(PostingListUsed&& posting_list_in,
+ PostingListIdentifier id_in,
+ uint32_t next_block_index_in)
+ : posting_list(std::move(posting_list_in)),
+ id(id_in),
+ next_block_index(next_block_index_in) {}
+};
+
+// The FlashIndexStorage class manages the actual file that makes up blocks for
+// posting lists. It allocates IndexBlocks as needed and maintains freelists to
+// prevent excessive block fragmentation.
+//
+// It maintains two types of free lists:
+// 1. On-disk, Header free list - This free list is stored in the Header
+// block. There is a free list for every possible posting list size. Each
+// entry for a posting list size contains the block_index of the
+// IndexBlock that starts the free list chain. Each IndexBlock in the free
+// list chain stores the index of the next IndexBlock in the chain.
+// 2. In-memory free list - Like the Header free list, there is a free list of
+// every possible posting list size. This free list contains not just the
+// block_index of the available IndexBlock, but also the posting_list_index
+// of the available PostingListUsed within the IndexBlock. This is because,
+// unlike the Header free list, PostingListUseds are not actually freed
+// when added to this free list.
+//
+// Whether or not the in-memory free list is used can be chosen via the
+// in_memory param to the Create factory function.
+//
+// The advantage of using the in-memory free list is that it reduces the amount
+// of flash writes made while editing the index (because actually freeing the
+// PostingLists would require writing to that flash block). The disadvantage is
+// that it introduces code complexity and potentially leaks blocks if power is
+// lost or if FlashIndexStorage is destroyed before emptying the free list.
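+//
+// A minimal creation sketch (illustrative only; the file path is an
+// assumption, and PostingListHitSerializer is one concrete serializer used in
+// the tests):
+//
+//   PostingListHitSerializer serializer;
+//   ICING_ASSIGN_OR_RETURN(
+//       FlashIndexStorage storage,
+//       FlashIndexStorage::Create("/tmp/flash_index", &filesystem,
+//                                 &serializer, /*in_memory=*/true));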
+class FlashIndexStorage {
+ public:
+ // Creates a FlashIndexStorage at index_filename. in_memory determines whether
+ // or not the FlashIndexStorage maintains an in-memory freelist in order to
+ // avoid writes to the on-disk freelist.
+ //
+ // RETURNS:
+ // - On success, a valid instance of FlashIndexStorage
+ // - FAILED_PRECONDITION_ERROR if filesystem or serializer is null
+ // - INTERNAL_ERROR if unable to create a new header or read the existing
+ // one from disk.
+ static libtextclassifier3::StatusOr<FlashIndexStorage> Create(
+ std::string index_filename, const Filesystem* filesystem,
+ PostingListSerializer* serializer, bool in_memory = true);
+
+ // Reads magic from existing file header. We need this during Icing
+ // initialization phase to determine the version.
+ //
+ // RETURNS:
+ // - On success, a valid magic
+ // - FAILED_PRECONDITION_ERROR if filesystem is null
+ // - NOT_FOUND_ERROR if the flash index file doesn't exist
+ // - INTERNAL_ERROR on I/O error
+ static libtextclassifier3::StatusOr<int> ReadHeaderMagic(
+ const Filesystem* filesystem, const std::string& index_filename);
+
+ FlashIndexStorage(FlashIndexStorage&&) = default;
+ FlashIndexStorage(const FlashIndexStorage&) = delete;
+ FlashIndexStorage& operator=(FlashIndexStorage&&) = default;
+ FlashIndexStorage& operator=(const FlashIndexStorage&) = delete;
+
+ ~FlashIndexStorage();
+
+ // Selects block size to use.
+ static uint32_t SelectBlockSize();
+
+ // Retrieves the PostingList referred to by PostingListIdentifier. This
+ // posting list must have been previously allocated by a prior call to
+ // AllocatePostingList.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListHolder containing the
+ // requested PostingListUsed.
+ // - Any IndexBlock errors
+ libtextclassifier3::StatusOr<PostingListHolder> GetPostingList(
+ PostingListIdentifier id) const;
+
+ // Allocates and returns a PostingListHolder containing a PostingListUsed that
+ // can fit min_posting_list_bytes.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListHolder containing the
+ // requested PostingListUsed.
+ // - INVALID_ARGUMENT_ERROR if min_posting_list_bytes >
+ // max_posting_list_bytes()
+ // - RESOURCE_EXHAUSTED_ERROR if unable to grow the index to create a
+ // PostingListUsed of the requested size.
+ // - Any IndexBlock errors
+ libtextclassifier3::StatusOr<PostingListHolder> AllocatePostingList(
+ uint32_t min_posting_list_bytes);
+
+ // Allocates a new IndexBlock with a single max-sized PostingListUsed. This
+ // chains index blocks by setting the next_block_index field of this new
+ // block's header to be prev_block_index and returns a PostingListHolder
+ // containing a max-sized PostingListUsed.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListHolder containing the
+ // requested PostingListUsed.
+ // - RESOURCE_EXHAUSTED_ERROR if unable to grow the index to create a
+ // PostingListUsed of max size
+ // - Any IndexBlock errors
+ libtextclassifier3::StatusOr<PostingListHolder>
+ AllocateAndChainMaxSizePostingList(uint32_t prev_block_index);
+
+ // Frees the PostingListUsed that this holder holds.
+ //
+ // RETURNS:
+ // - OK on success
+ // - Any IndexBlock errors
+ libtextclassifier3::Status FreePostingList(PostingListHolder&& holder);
+
+ // Writes back the PostingListUsed that this holder holds to disk.
+ //
+ // RETURNS:
+ // - OK on success
+ // - Any IndexBlock errors
+ libtextclassifier3::Status WritePostingListToDisk(
+ const PostingListHolder& holder);
+
+ // Discards all existing data by deleting the existing file and
+ // re-initializing a new one.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR if unable to delete existing files or initialize a new
+ // file with header
+ libtextclassifier3::Status Reset();
+
+ // Used to track the largest docid indexed in the index.
+ DocumentId get_last_indexed_docid() const {
+ return header_block_->header()->last_indexed_docid;
+ }
+ void set_last_indexed_docid(DocumentId docid) {
+ header_block_->header()->last_indexed_docid = docid;
+ }
+
+ // Updates the header and persists all changes to the index to disk. Returns
+ // true on success.
+ bool PersistToDisk();
+
+ // Returns the size of the index file in bytes.
+ int64_t GetDiskUsage() const {
+ return filesystem_->GetDiskUsage(storage_sfd_.get());
+ }
+
+  // Returns the size of the index file used to contain data.
+ uint64_t GetElementsSize() const {
+ // Element size is the same as disk size excluding the header block.
+ return GetDiskUsage() - block_size();
+ }
+
+ int num_blocks() const { return num_blocks_; }
+
+ // Gets the byte size of max sized posting list.
+ uint32_t max_posting_list_bytes() const {
+ return IndexBlock::CalculateMaxPostingListBytes(
+ block_size(), serializer_->GetDataTypeBytes());
+ }
+
+ // Info about the index based on the block size.
+ int block_size() const { return header_block_->header()->block_size; }
+
+ // Num blocks starts at 1 since the first block is the header.
+ bool empty() const { return num_blocks_ <= 1; }
+
+ // The percentage of the maximum index size that is free. Allocated blocks are
+ // treated as fully used, even if they are only partially used. In this way,
+ // min_free_fraction is a lower bound of available space.
+ double min_free_fraction() const {
+ return 1.0 - static_cast<double>(num_blocks_) / kMaxBlockIndex;
+ }
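+  // For example (illustrative): if num_blocks_ were one quarter of
+  // kMaxBlockIndex, this would return 1.0 - 0.25 = 0.75.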
+
+ const PostingListSerializer* serializer() const { return serializer_; }
+ PostingListSerializer* serializer() { return serializer_; }
+
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
+ void GetDebugInfo(DebugInfoVerbosity::Code verbosity, std::string* out) const;
+
+ private:
+ explicit FlashIndexStorage(const Filesystem* filesystem,
+ std::string&& index_filename,
+ PostingListSerializer* serializer,
+ bool has_in_memory_freelists)
+ : filesystem_(filesystem),
+ index_filename_(std::move(index_filename)),
+ serializer_(serializer),
+ num_blocks_(0),
+ has_in_memory_freelists_(has_in_memory_freelists) {}
+
+ // Init the index from persistence. Create if file does not exist. We do not
+ // erase corrupt files.
+ //
+ // Returns false if unable to create a new header or if the existing one is
+ // corrupt.
+ bool Init();
+
+ // Create or open the header block. Returns true on success.
+ bool InitHeader();
+
+ // Create a new header block for an empty index file.
+ bool CreateHeader();
+
+ // Loads the header stored at the beginning of the index file and validates
+ // the values stored in it.
+ bool OpenHeader(int64_t file_size);
+
+ // Adds the IndexBlock referred to by block_index in the on-disk free list
+ // with index block_info_index.
+ void AddToOnDiskFreeList(uint32_t block_index, int block_info_index,
+ IndexBlock* index_block);
+
+ // Removes the IndexBlock referred to by block_index from the Header free list
+ // with index block_info_index.
+ //
+ // RETURNS:
+ // - OK on success
+ // - Any IndexBlock errors
+ libtextclassifier3::Status RemoveFromOnDiskFreeList(uint32_t block_index,
+ int block_info_index,
+ IndexBlock* index_block);
+
+ // RETURNS:
+ // - On success, a valid PostingListHolder created from the first entry of
+ // the in-memory freelist at block_info_index
+ // - OUT_OF_RANGE_ERROR if in_memory_freelists_ contains
+ // PostingListIdentifier with block_index >= num_blocks_
+ // - NOT_FOUND_ERROR if there was no entry in the freelist
+ // - Any IndexBlock errors
+ libtextclassifier3::StatusOr<PostingListHolder>
+ GetPostingListFromInMemoryFreeList(int block_info_index);
+
+ // RETURNS:
+ // - On success, a valid PostingListHolder created from the first entry of
+ // the on-disk freelist at block_info_index
+ // - OUT_OF_RANGE_ERROR if header()->index_block_infos[block_info_index]
+ // contains block_index >= num_blocks_
+ // - NOT_FOUND_ERROR if there was no entry in the freelist
+ // - Any IndexBlock errors
+ libtextclassifier3::StatusOr<PostingListHolder>
+ GetPostingListFromOnDiskFreeList(int block_info_index);
+
+ // Returns:
+ // - On success, a valid PostingListHolder created from a newly allocated
+ // IndexBlock.
+ // - RESOURCE_EXHAUSTED if the index couldn't be grown to fit a new
+ // IndexBlock.
+ // - Any IndexBlock errors
+ libtextclassifier3::StatusOr<PostingListHolder> AllocateNewPostingList(
+ int block_info_index);
+
+ // Returns:
+ // - On success, a newly created IndexBlock at block_index with posting
+ // lists of size posting_list_size
+ // - OUT_OF_RANGE_ERROR if block_index >= num_blocks_
+ // - Any IndexBlock errors
+ libtextclassifier3::StatusOr<IndexBlock> CreateIndexBlock(
+ uint32_t block_index, uint32_t posting_list_size) const;
+
+ // Returns:
+ // - On success, the IndexBlock that exists at block_index
+ // - OUT_OF_RANGE_ERROR if block_index >= num_blocks_
+ // - Any IndexBlock errors
+ libtextclassifier3::StatusOr<IndexBlock> GetIndexBlock(
+ uint32_t block_index) const;
+
+ // Add a new block to the end of the file and return its block
+ // index. Returns kInvalidBlockIndex if unable to grow the index file.
+ int GrowIndex();
+
+ // Return the index into index_block_infos of the smallest posting_list free
+ // list that can fit posting_list_bytes or -1 if posting_list_bytes exceeds
+ // the max-sized posting list.
+ int FindBestIndexBlockInfo(uint32_t posting_list_bytes) const;
+
+ // Flushes the in-memory free list to disk.
+ //
+ // RETURNS:
+ // - OK on success
+ // - Any IndexBlock errors
+ libtextclassifier3::Status FlushInMemoryFreeList();
+
+ const Filesystem* filesystem_; // not owned; can't be null
+ std::string index_filename_;
+
+ PostingListSerializer* serializer_; // not owned; can't be null
+
+ // We open the index file into this fd.
+ ScopedFd storage_sfd_;
+
+ int num_blocks_; // can be inferred from index file size
+
+ std::unique_ptr<HeaderBlock> header_block_;
+
+ // In-memory cache of free posting lists.
+ struct FreeList {
+    // Experimentally determined that the high watermark for the largest
+    // freelist was ~3500.
+ static constexpr size_t kMaxSize = 4096;
+
+ // Push a new PostingListIdentifier if there is space.
+ void Push(PostingListIdentifier id);
+
+ // Attempt to pop a PostingListIdentifier.
+ //
+ // RETURNS:
+ // - identifier of a free posting list, on success
+ // - NOT_FOUND if there are no free posting lists on this free list.
+ libtextclassifier3::StatusOr<PostingListIdentifier> TryPop();
+
+ std::string DebugString() const;
+
+ private:
+ std::vector<PostingListIdentifier> free_list_;
+ int free_list_size_high_watermark_ = 0;
+ int num_dropped_free_list_entries_ = 0;
+ };
+ std::vector<FreeList> in_memory_freelists_;
+
+ bool has_in_memory_freelists_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_POSTING_LIST_FLASH_INDEX_STORAGE_H_
diff --git a/icing/file/posting_list/flash-index-storage_test.cc b/icing/file/posting_list/flash-index-storage_test.cc
new file mode 100644
index 0000000..ef60037
--- /dev/null
+++ b/icing/file/posting_list/flash-index-storage_test.cc
@@ -0,0 +1,610 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/posting_list/flash-index-storage.h"
+
+#include <unistd.h>
+
+#include <algorithm>
+#include <cstdlib>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/posting_list/flash-index-storage-header.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/posting-list-hit-serializer.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+using ::testing::Ne;
+using ::testing::Not;
+
+class FlashIndexStorageTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/test_dir";
+ file_name_ = test_dir_ + "/test_file.idx.index";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
+
+ // TODO(b/249829533): test different serializers
+ serializer_ = std::make_unique<PostingListHitSerializer>();
+ }
+
+ void TearDown() override {
+ serializer_.reset();
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ protected:
+ std::string test_dir_;
+ std::string file_name_;
+ Filesystem filesystem_;
+ std::unique_ptr<PostingListHitSerializer> serializer_;
+};
+
+TEST_F(FlashIndexStorageTest, ReadHeaderMagic) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ }
+ EXPECT_THAT(FlashIndexStorage::ReadHeaderMagic(&filesystem_, file_name_),
+ IsOkAndHolds(HeaderBlock::Header::kMagic));
+}
+
+TEST_F(FlashIndexStorageTest, ReadHeaderMagicOldVersion) {
+ int block_size;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ block_size = flash_index_storage.block_size();
+ }
+
+ int old_magic = 0x6dfba6ae;
+ ASSERT_THAT(old_magic, Ne(HeaderBlock::Header::kMagic));
+ {
+ // Manually modify the header magic.
+ ScopedFd sfd(filesystem_.OpenForWrite(file_name_.c_str()));
+ ASSERT_THAT(sfd.is_valid(), IsTrue());
+
+ // Read and validate header.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ HeaderBlock header_block,
+ HeaderBlock::Read(&filesystem_, sfd.get(), block_size));
+ header_block.header()->magic = old_magic;
+ ASSERT_THAT(header_block.Write(sfd.get()), IsTrue());
+ }
+
+ EXPECT_THAT(FlashIndexStorage::ReadHeaderMagic(&filesystem_, file_name_),
+ IsOkAndHolds(old_magic));
+}
+
+TEST_F(FlashIndexStorageTest,
+ ReadHeaderMagicNonExistingFileShouldGetNotFoundError) {
+ EXPECT_THAT(FlashIndexStorage::ReadHeaderMagic(&filesystem_, file_name_),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(FlashIndexStorageTest, CorruptHeader) {
+ {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ }
+ {
+ // Read the valid header - should pass
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ }
+ {
+ // Corrupt the header file by changing pl_bytes
+ ScopedFd sfd(filesystem_.OpenForWrite(file_name_.c_str()));
+ off_t offset = 16;
+ uint32_t pl_bytes = sizeof(Hit) - 1; // This is intentionally invalid
+ filesystem_.PWrite(sfd.get(), offset, &pl_bytes, sizeof(uint32_t));
+ }
+ {
+    // Read the header file - should fail because pl_bytes is not divisible
+    // by sizeof(Hit), which is 5 as of this writing
+ ASSERT_THAT(
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ }
+ {
+ // Correct the pl_bytes header alignment
+ ScopedFd sfd(filesystem_.OpenForWrite(file_name_.c_str()));
+ off_t offset = 16;
+ uint32_t pl_bytes = 2 * sizeof(Hit); // Should be valid
+ filesystem_.PWrite(sfd.get(), offset, &pl_bytes, sizeof(uint32_t));
+ }
+ {
+ // Read the valid header - should pass
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ }
+
+ // Delete the file
+ filesystem_.DeleteFile(file_name_.c_str());
+}
+
+TEST_F(FlashIndexStorageTest, EmptyStorage) {
+ {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ // An 'empty' FlashIndexStorage should have:
+ // 1. One block allocated for the header
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1));
+ EXPECT_THAT(flash_index_storage.empty(), IsTrue());
+ // 2. The invalid DocumentId stored in its header
+ EXPECT_THAT(flash_index_storage.get_last_indexed_docid(),
+ Eq(kInvalidDocumentId));
+    // 3. Its disk usage should be the equivalent of one block.
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(flash_index_storage.block_size()));
+ }
+ {
+ // Read the valid header. All functions should return the same values.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(1));
+ EXPECT_THAT(flash_index_storage.empty(), IsTrue());
+ EXPECT_THAT(flash_index_storage.get_last_indexed_docid(),
+ Eq(kInvalidDocumentId));
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(flash_index_storage.block_size()));
+ }
+}
+
+TEST_F(FlashIndexStorageTest, FreeListInMemory) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ const int kHalfBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ PostingListIdentifier id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder1.posting_list, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder1.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get another PL. This should be on the same flash block. There should
+ // be no allocation.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder2.posting_list, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder2.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+ // 3. Now, free the first posting list. This should add it to the free list
+ ICING_ASSERT_OK(
+ flash_index_storage.FreePostingList(std::move(posting_list_holder1)));
+
+ // 4. Request another posting list. This should NOT grow the index because
+ // the first posting list is free.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ // 3. The returned posting list holder should have the same id as the
+ // first posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.posting_list_index(),
+ Eq(id1.posting_list_index()));
+ EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
+ // Make sure this pl is empty. The hits that used to be there should be
+ // gone.
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
+ IsOkAndHolds(IsEmpty()));
+ std::vector<Hit> hits3 = {
+ Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)};
+ for (const Hit& hit : hits3) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder3.posting_list, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+}
+
+TEST_F(FlashIndexStorageTest, FreeListNotInMemory) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get(),
+ /*in_memory=*/false));
+
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ const int kHalfBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ PostingListIdentifier id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder1.posting_list, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder1.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get another PL. This should be on the same flash block. There should
+ // be no allocation.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder2.posting_list, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder2.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+ // 3. Now, free the first posting list. This should add it to the free list
+ ICING_ASSERT_OK(
+ flash_index_storage.FreePostingList(std::move(posting_list_holder1)));
+
+ // 4. Request another posting list. This should NOT grow the index because
+ // the first posting list is free.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ // 3. The returned posting list holder should have the same id as the
+ // first posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.posting_list_index(),
+ Eq(id1.posting_list_index()));
+ EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
+ // Make sure this pl is empty. The hits that used to be there should be
+ // gone.
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
+ IsOkAndHolds(IsEmpty()));
+ std::vector<Hit> hits3 = {
+ Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)};
+ for (const Hit& hit : hits3) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder3.posting_list, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+}
+
+TEST_F(FlashIndexStorageTest, FreeListInMemoryPersistence) {
+ PostingListIdentifier id1 = PostingListIdentifier::kInvalid;
+ int half_block_posting_list_size = 0;
+ {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+      // half_block_posting_list_size to whatever the correct size is.
+ half_block_posting_list_size =
+ (flash_index_storage.block_size() - 100) / 2;
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(
+ half_block_posting_list_size));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder1.posting_list, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder1.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get another PL. This should be on the same flash block. There should
+ // be no allocation.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(
+ half_block_posting_list_size));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder2.posting_list, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder2.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+ // 3. Now, free the first posting list. This should add it to the free
+ // list
+ ICING_ASSERT_OK(
+ flash_index_storage.FreePostingList(std::move(posting_list_holder1)));
+ }
+
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+    // 4. The FlashIndexStorage should go out of scope and flush the in-memory
+    // free list to disk.
+ }
+
+ {
+ // Recreate the flash index.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+
+ {
+ // 5. Request another posting list. This should NOT grow the index because
+ // the first posting list is free.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(
+ half_block_posting_list_size));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should not have grown.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ // 3. The returned posting list holder should have the same id as the
+ // first posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.posting_list_index(),
+ Eq(id1.posting_list_index()));
+ EXPECT_THAT(posting_list_holder3.id.block_index(), Eq(id1.block_index()));
+ // Make sure this pl is empty. The hits that used to be there should be
+ // gone.
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
+ IsOkAndHolds(IsEmpty()));
+ std::vector<Hit> hits3 = {
+ Hit(/*section_id=*/7, /*document_id=*/1, /*term_frequency=*/62),
+ Hit(/*section_id=*/12, /*document_id=*/3, /*term_frequency=*/45),
+ Hit(/*section_id=*/11, /*document_id=*/18, /*term_frequency=*/12),
+ Hit(/*section_id=*/7, /*document_id=*/100, /*term_frequency=*/74)};
+ for (const Hit& hit : hits3) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder3.posting_list, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder3.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits3.rbegin(), hits3.rend())));
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(2 * flash_index_storage.block_size()));
+ }
+}
+
+TEST_F(FlashIndexStorageTest, DifferentSizedPostingLists) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ {
+ // 1. Request a PL that is 1/2 block size. Remember that block size also
+ // includes the BlockHeader. The BlockHeader isn't publicly visible, so we
+ // subtract 100 bytes to be sure. AllocatePostingList will round up from
+ // kHalfBlockPostingListSize to whatever the correct size is.
+ const int kHalfBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 2;
+ const int kQuarterBlockPostingListSize =
+ (flash_index_storage.block_size() - 100) / 4;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder1,
+ flash_index_storage.AllocatePostingList(kHalfBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ PostingListIdentifier id1 = posting_list_holder1.id;
+ EXPECT_THAT(id1.is_valid(), IsTrue());
+ // 2. The index file should have grown by exactly one flash block.
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(2));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits1 = {
+ Hit(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/12),
+ Hit(/*section_id=*/6, /*document_id=*/2, /*term_frequency=*/19),
+ Hit(/*section_id=*/5, /*document_id=*/2, /*term_frequency=*/100),
+ Hit(/*section_id=*/8, /*document_id=*/5, /*term_frequency=*/197)};
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder1.posting_list, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder1.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+
+ // 2. Get a PL that is 1/4 block size. Even though a 1/4 block PL could
+ // theoretically fit in the same block, we'll allocate a new one because PLs
+ // on a block are required to be the same size.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder2,
+ flash_index_storage.AllocatePostingList(kQuarterBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder2.id.is_valid(), IsTrue());
+ // 2. The index file should have grown by one block.
+ EXPECT_THAT(posting_list_holder2.id.block_index(),
+ Not(Eq(id1.block_index())));
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(3));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+
+ std::vector<Hit> hits2 = {
+ Hit(/*section_id=*/4, /*document_id=*/0, /*term_frequency=*/12),
+ Hit(/*section_id=*/8, /*document_id=*/4, /*term_frequency=*/19),
+ Hit(/*section_id=*/9, /*document_id=*/7, /*term_frequency=*/100),
+ Hit(/*section_id=*/6, /*document_id=*/7, /*term_frequency=*/197)};
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&posting_list_holder2.posting_list, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&posting_list_holder2.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+
+ // 3. Request another 1/4 block-size posting list. This should NOT grow the
+ // index because there should be three free posting lists on block2.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder posting_list_holder3,
+ flash_index_storage.AllocatePostingList(kQuarterBlockPostingListSize));
+ // We expect:
+ // 1. FlashIndexStorage will return a valid id.
+ EXPECT_THAT(posting_list_holder3.id.is_valid(), IsTrue());
+ // 2. The index file should have remained the same size as before and the
+ // third posting list holder should use the same block as the second
+ // posting list holder.
+ EXPECT_THAT(posting_list_holder3.id.block_index(),
+ Eq(posting_list_holder2.id.block_index()));
+ EXPECT_THAT(flash_index_storage.num_blocks(), Eq(3));
+ EXPECT_THAT(flash_index_storage.empty(), IsFalse());
+ }
+ EXPECT_THAT(flash_index_storage.GetDiskUsage(),
+ Eq(3 * flash_index_storage.block_size()));
+}
+
+TEST_F(FlashIndexStorageTest, AllocateTooLargePostingList) {
+ // Create the header file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+
+ // Request a PL that is 2x block size.
+ const int kDoubleBlockSize = flash_index_storage.block_size() * 2;
+ EXPECT_THAT(flash_index_storage.AllocatePostingList(kDoubleBlockSize),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/posting_list/index-block.cc b/icing/file/posting_list/index-block.cc
new file mode 100644
index 0000000..3fa397c
--- /dev/null
+++ b/icing/file/posting_list/index-block.cc
@@ -0,0 +1,333 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/posting_list/index-block.h"
+
+#include <sys/types.h>
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-free.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/file/posting_list/posting-list-utils.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+libtextclassifier3::Status ValidatePostingListBytes(
+ PostingListSerializer* serializer, uint32_t posting_list_bytes,
+ uint32_t block_size) {
+ if (posting_list_bytes > IndexBlock::CalculateMaxPostingListBytes(
+ block_size, serializer->GetDataTypeBytes()) ||
+ !posting_list_utils::IsValidPostingListSize(
+ posting_list_bytes, serializer->GetDataTypeBytes(),
+ serializer->GetMinPostingListSize())) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Requested posting list size %d is illegal for a flash block with max "
+ "posting list size of %d",
+ posting_list_bytes,
+ IndexBlock::CalculateMaxPostingListBytes(
+ block_size, serializer->GetDataTypeBytes())));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace
+
+/* static */ libtextclassifier3::StatusOr<IndexBlock>
+IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ const Filesystem* filesystem, PostingListSerializer* serializer, int fd,
+ off_t block_file_offset, uint32_t block_size) {
+ if (block_size < sizeof(BlockHeader)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Provided block_size %d is too small to fit even the BlockHeader!",
+ block_size));
+ }
+
+ BlockHeader header;
+ if (!filesystem->PRead(fd, &header, sizeof(BlockHeader), block_file_offset)) {
+ return absl_ports::InternalError("PRead block header error");
+ }
+
+ ICING_RETURN_IF_ERROR(ValidatePostingListBytes(
+ serializer, header.posting_list_bytes, block_size));
+
+ return IndexBlock(filesystem, serializer, fd, block_file_offset, block_size,
+ header.posting_list_bytes);
+}
+
+/* static */ libtextclassifier3::StatusOr<IndexBlock>
+IndexBlock::CreateFromUninitializedRegion(const Filesystem* filesystem,
+ PostingListSerializer* serializer,
+ int fd, off_t block_file_offset,
+ uint32_t block_size,
+ uint32_t posting_list_bytes) {
+ if (block_size < sizeof(BlockHeader)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Provided block_size %d is too small to fit even the BlockHeader!",
+ block_size));
+ }
+
+ ICING_RETURN_IF_ERROR(
+ ValidatePostingListBytes(serializer, posting_list_bytes, block_size));
+ IndexBlock block(filesystem, serializer, fd, block_file_offset, block_size,
+ posting_list_bytes);
+ ICING_RETURN_IF_ERROR(block.Reset());
+
+ return block;
+}
+
+libtextclassifier3::StatusOr<IndexBlock::PostingListAndBlockInfo>
+IndexBlock::GetAllocatedPostingList(PostingListIndex posting_list_index) {
+ if (posting_list_index >= max_num_posting_lists() || posting_list_index < 0) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Cannot get posting list with index %d in IndexBlock with only %d "
+ "posting lists.",
+ posting_list_index, max_num_posting_lists()));
+ }
+
+ // Read out the header from disk.
+ ICING_ASSIGN_OR_RETURN(BlockHeader header, ReadHeader());
+
+ // Read out the allocated posting list from disk.
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<uint8_t[]> posting_list_buffer,
+ ReadPostingList(posting_list_index));
+
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromPreexistingPostingListUsedRegion(
+ serializer_, std::move(posting_list_buffer), posting_list_bytes_));
+ return PostingListAndBlockInfo(
+ std::move(pl_used), posting_list_index, header.next_block_index,
+ /*has_free_posting_lists_in=*/header.free_list_posting_list_index !=
+ kInvalidPostingListIndex);
+}
+
+libtextclassifier3::StatusOr<IndexBlock::PostingListAndBlockInfo>
+IndexBlock::AllocatePostingList() {
+ // Read out the header from disk.
+ ICING_ASSIGN_OR_RETURN(BlockHeader header, ReadHeader());
+
+ if (header.free_list_posting_list_index == kInvalidPostingListIndex) {
+ return absl_ports::ResourceExhaustedError(
+ "No available posting lists to allocate.");
+ }
+
+ // Pull one off the free list.
+ PostingListIndex posting_list_index = header.free_list_posting_list_index;
+
+ // Read out the posting list from disk.
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<uint8_t[]> posting_list_buffer,
+ ReadPostingList(posting_list_index));
+  // Step 1: get the next (chained) free posting list index and store it in the
+  // block header.
+ ICING_ASSIGN_OR_RETURN(
+ PostingListFree pl_free,
+ PostingListFree::CreateFromPreexistingPostingListFreeRegion(
+ posting_list_buffer.get(), posting_list_bytes_,
+ serializer_->GetDataTypeBytes(),
+ serializer_->GetMinPostingListSize()));
+ header.free_list_posting_list_index = pl_free.get_next_posting_list_index();
+ if (header.free_list_posting_list_index != kInvalidPostingListIndex &&
+ header.free_list_posting_list_index >= max_num_posting_lists()) {
+ ICING_LOG(ERROR)
+ << "Free Posting List points to an invalid posting list index!";
+ header.free_list_posting_list_index = kInvalidPostingListIndex;
+ }
+
+ // Step 2: create PostingListUsed instance. The original content in the above
+ // posting_list_buffer is not important now because
+ // PostingListUsed::CreateFromUnitializedRegion will wipe it out, and
+ // we only need to sync it to disk after initializing.
+ ICING_ASSIGN_OR_RETURN(PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ serializer_, posting_list_bytes_));
+
+ // Sync the initialized posting list (overwrite the original content of
+ // PostingListFree) and header to disk.
+ ICING_RETURN_IF_ERROR(
+ WritePostingList(posting_list_index, pl_used.posting_list_buffer()));
+ ICING_RETURN_IF_ERROR(WriteHeader(header));
+
+ return PostingListAndBlockInfo(
+ std::move(pl_used), posting_list_index, header.next_block_index,
+ /*has_free_posting_lists_in=*/header.free_list_posting_list_index !=
+ kInvalidPostingListIndex);
+}
+
+libtextclassifier3::Status IndexBlock::FreePostingList(
+ PostingListIndex posting_list_index) {
+ if (posting_list_index >= max_num_posting_lists() || posting_list_index < 0) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Cannot free posting list with index %d in IndexBlock with only %d "
+ "posting lists.",
+ posting_list_index, max_num_posting_lists()));
+ }
+
+ ICING_ASSIGN_OR_RETURN(BlockHeader header, ReadHeader());
+ ICING_RETURN_IF_ERROR(FreePostingListImpl(header, posting_list_index));
+ ICING_RETURN_IF_ERROR(WriteHeader(header));
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status IndexBlock::WritePostingListToDisk(
+ const PostingListUsed& posting_list_used,
+ PostingListIndex posting_list_index) {
+ if (posting_list_index >= max_num_posting_lists() || posting_list_index < 0) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Cannot write posting list with index %d in IndexBlock with only %d "
+ "posting lists.",
+ posting_list_index, max_num_posting_lists()));
+ }
+
+ if (posting_list_used.size_in_bytes() != posting_list_bytes_) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot write posting list into a block with different posting list "
+ "bytes");
+ }
+
+ if (!posting_list_used.is_dirty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ // Write the allocated posting list to disk.
+ return WritePostingList(posting_list_index,
+ posting_list_used.posting_list_buffer());
+}
+
+libtextclassifier3::StatusOr<uint32_t> IndexBlock::GetNextBlockIndex() const {
+ ICING_ASSIGN_OR_RETURN(BlockHeader header, ReadHeader());
+ return header.next_block_index;
+}
+
+libtextclassifier3::Status IndexBlock::SetNextBlockIndex(
+ uint32_t next_block_index) {
+ ICING_ASSIGN_OR_RETURN(BlockHeader header, ReadHeader());
+ header.next_block_index = next_block_index;
+ ICING_RETURN_IF_ERROR(WriteHeader(header));
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<bool> IndexBlock::HasFreePostingLists() const {
+ ICING_ASSIGN_OR_RETURN(BlockHeader header, ReadHeader());
+ return header.free_list_posting_list_index != kInvalidPostingListIndex;
+}
+
+libtextclassifier3::Status IndexBlock::Reset() {
+ BlockHeader header;
+ header.free_list_posting_list_index = kInvalidPostingListIndex;
+ header.next_block_index = kInvalidBlockIndex;
+ header.posting_list_bytes = posting_list_bytes_;
+
+ // Starting with the last posting list, prepend each posting list to the free
+ // list. At the end, the beginning of the free list should be the first
+ // posting list.
+ for (PostingListIndex posting_list_index = max_num_posting_lists() - 1;
+ posting_list_index >= 0; --posting_list_index) {
+ // Adding the posting list at posting_list_index to the free list will
+ // modify both the posting list and also
+ // header.free_list_posting_list_index.
+ ICING_RETURN_IF_ERROR(FreePostingListImpl(header, posting_list_index));
+ }
+
+ // Sync the header to disk.
+ ICING_RETURN_IF_ERROR(WriteHeader(header));
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status IndexBlock::FreePostingListImpl(
+ BlockHeader& header, PostingListIndex posting_list_index) {
+ // Read out the posting list from disk.
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<uint8_t[]> posting_list_buffer,
+ ReadPostingList(posting_list_index));
+
+ ICING_ASSIGN_OR_RETURN(PostingListFree plfree,
+ PostingListFree::CreateFromUnitializedRegion(
+ posting_list_buffer.get(), posting_list_bytes(),
+ serializer_->GetDataTypeBytes(),
+ serializer_->GetMinPostingListSize()));
+
+ // Put at the head of the list.
+ plfree.set_next_posting_list_index(header.free_list_posting_list_index);
+ header.free_list_posting_list_index = posting_list_index;
+
+ // Sync the posting list to disk.
+ ICING_RETURN_IF_ERROR(
+ WritePostingList(posting_list_index, posting_list_buffer.get()));
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<IndexBlock::BlockHeader> IndexBlock::ReadHeader()
+ const {
+ BlockHeader header;
+ if (!filesystem_->PRead(fd_, &header, sizeof(BlockHeader),
+ block_file_offset_)) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("PRead block header error: ", strerror(errno)));
+ }
+ if (header.posting_list_bytes != posting_list_bytes_) {
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Inconsistent posting list bytes between block header (%d) and class "
+ "instance (%d)",
+ header.posting_list_bytes, posting_list_bytes_));
+ }
+ return header;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<uint8_t[]>>
+IndexBlock::ReadPostingList(PostingListIndex posting_list_index) const {
+ auto posting_list_buffer = std::make_unique<uint8_t[]>(posting_list_bytes_);
+ if (!filesystem_->PRead(fd_, posting_list_buffer.get(), posting_list_bytes_,
+ get_posting_list_file_offset(posting_list_index))) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("PRead posting list error: ", strerror(errno)));
+ }
+ return posting_list_buffer;
+}
+
+libtextclassifier3::Status IndexBlock::WriteHeader(const BlockHeader& header) {
+ if (!filesystem_->PWrite(fd_, block_file_offset_, &header,
+ sizeof(BlockHeader))) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("PWrite block header error: ", strerror(errno)));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status IndexBlock::WritePostingList(
+ PostingListIndex posting_list_index, const uint8_t* posting_list_buffer) {
+ if (!filesystem_->PWrite(fd_,
+ get_posting_list_file_offset(posting_list_index),
+ posting_list_buffer, posting_list_bytes_)) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("PWrite posting list error: ", strerror(errno)));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/posting_list/index-block.h b/icing/file/posting_list/index-block.h
new file mode 100644
index 0000000..21ad13f
--- /dev/null
+++ b/icing/file/posting_list/index-block.h
@@ -0,0 +1,369 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_POSTING_LIST_INDEX_BLOCK_H_
+#define ICING_FILE_POSTING_LIST_INDEX_BLOCK_H_
+
+#include <sys/types.h>
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/legacy/index/icing-bit-util.h"
+
+namespace icing {
+namespace lib {
+
+// This class is used to manage I/O to a single flash block and to manage the
+// division of that flash block into PostingLists. It provides an interface to
+// allocate, free and read posting lists. Note that IndexBlock is stateless:
+// - Any changes to block header will be synced to disk before the method
+// returns.
+// - Any posting list allocation/freeing will be synced to disk before the
+// method returns.
+// - When getting an allocated posting list, it PReads the contents from disk
+//   into a buffer and transfers ownership of the buffer to PostingListUsed.
+//   Any changes to PostingListUsed will not be visible to other instances
+//   until WritePostingListToDisk is called.
+//
+// An IndexBlock contains a small header and an array of fixed-size posting list
+// buffers. Initially, all posting lists are chained in a singly-linked free
+// list.
+//
+// When we want to get a new PostingList from an IndexBlock, we just pull one
+// off the free list. When the user wants to return the PostingList to the free
+// pool, we prepend it to the free list.
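+//
+// An illustrative layout (sizes and indices are examples only):
+//
+//   [BlockHeader][PL 0][PL 1][PL 2] ... [PL n-1]
+//
+// where BlockHeader.free_list_posting_list_index holds the head of the free
+// list (say PL 2), and each free posting list stores the index of the next
+// free one, interpreted via PostingListFree.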
+//
+// Reading/writing the same block is NOT thread safe. If we read/write the
+// same block at the same time (whether via the same or different IndexBlock
+// instances), it causes a race condition and the behavior is undefined.
+class IndexBlock {
+ public:
+  // Returns the maximum posting list size in bytes that can be stored in this
+  // block.
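+  //
+  // E.g. (illustrative numbers): with a 4096-byte block, a 12-byte
+  // BlockHeader, and 5-byte data (sizeof(Hit) as of this writing), this yields
+  // (4096 - 12) / 5 * 5 = 4080 bytes, i.e. the largest multiple of the data
+  // size that fits after the header.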
+ static uint32_t CalculateMaxPostingListBytes(uint32_t block_size_in_bytes,
+ uint32_t data_type_bytes) {
+ return (block_size_in_bytes - sizeof(BlockHeader)) / data_type_bytes *
+ data_type_bytes;
+ }
+
+ // Creates an IndexBlock to reference the previously used region of the file
+ // descriptor starting at block_file_offset with size block_size.
+ //
+  // - serializer: used for reading/writing posting lists. It also provides
+  //               some additional information (e.g. data size).
+ // - fd: a valid file descriptor opened for write by the caller.
+  // - block_file_offset: absolute offset within the file (fd).
+ // - block_size: byte size of this block.
+ //
+  // Unlike CreateFromUninitializedRegion, a pre-existing index block has
+  // already determined and written its posting list size into the block
+  // header, so it will be read from the block header and the caller doesn't
+  // have to provide it.
+ //
+ // RETURNS:
+ // - A valid IndexBlock instance on success
+ // - INVALID_ARGUMENT_ERROR
+ // - If block_size is too small for even just the BlockHeader
+  //       - If the posting list size stored in the region is not a valid
+  //         posting list size (e.g. it exceeds CalculateMaxPostingListBytes())
+ // - INTERNAL_ERROR on I/O error
+ static libtextclassifier3::StatusOr<IndexBlock>
+ CreateFromPreexistingIndexBlockRegion(const Filesystem* filesystem,
+ PostingListSerializer* serializer,
+ int fd, off_t block_file_offset,
+ uint32_t block_size);
+
+ // Creates an IndexBlock to reference an uninitialized region of the file
+ // descriptor starting at block_file_offset with size block_size. The
+ // IndexBlock will initialize the region to be an empty IndexBlock with
+ // posting lists of size posting_list_bytes.
+ //
+  // - serializer: used for reading/writing posting lists. It also provides
+  //               some additional information (e.g. data size).
+ // - fd: a valid file descriptor opened for write by the caller.
+  // - block_file_offset: absolute offset within the file (fd).
+ // - block_size: byte size of this block.
+ // - posting_list_bytes: byte size of all posting lists in this block. This
+ // information will be written into block header.
+ //
+ // RETURNS:
+ // - A valid IndexBlock instance on success
+ // - INVALID_ARGUMENT_ERROR
+ // - If block_size is too small for even just the BlockHeader
+  //       - If posting_list_bytes is not a valid posting list size (e.g. it
+  //         exceeds CalculateMaxPostingListBytes())
+ // - INTERNAL_ERROR on I/O error
+ static libtextclassifier3::StatusOr<IndexBlock> CreateFromUninitializedRegion(
+ const Filesystem* filesystem, PostingListSerializer* serializer, int fd,
+ off_t block_file_offset, uint32_t block_size,
+ uint32_t posting_list_bytes);
+
+ IndexBlock(const IndexBlock&) = delete;
+ IndexBlock& operator=(const IndexBlock&) = delete;
+ IndexBlock(IndexBlock&&) = default;
+ IndexBlock& operator=(IndexBlock&&) = default;
+
+ ~IndexBlock() = default;
+
+ struct PostingListAndBlockInfo {
+ PostingListUsed posting_list_used;
+ PostingListIndex posting_list_index;
+
+ uint32_t next_block_index;
+
+ // Flag indicating if there are any free posting lists available after this
+ // allocation request.
+ bool has_free_posting_lists;
+
+ explicit PostingListAndBlockInfo(PostingListUsed&& posting_list_used_in,
+ PostingListIndex posting_list_index_in,
+ uint32_t next_block_index_in,
+ bool has_free_posting_lists_in)
+ : posting_list_used(std::move(posting_list_used_in)),
+ posting_list_index(posting_list_index_in),
+ next_block_index(next_block_index_in),
+ has_free_posting_lists(has_free_posting_lists_in) {}
+ };
+
+ // PReads existing posting list content at posting_list_index, instantiates a
+ // PostingListUsed, and returns it with some additional index block info.
+ //
+ // RETURNS:
+ // - A valid PostingListAndBlockInfo on success
+ // - INVALID_ARGUMENT_ERROR if posting_list_index < 0 or posting_list_index
+ // >= max_num_posting_lists()
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::StatusOr<PostingListAndBlockInfo> GetAllocatedPostingList(
+ PostingListIndex posting_list_index);
+
+ // Allocates a PostingListUsed in the IndexBlock, initializes the content
+ // (by serializer), and returns the initialized PostingListUsed instance,
+ // PostingListIndex, and some additional index block info.
+ //
+ // RETURNS:
+ // - A valid PostingListAndBlockInfo instance on success
+ // - RESOURCE_EXHAUSTED_ERROR if there is already no free posting list
+ // available, i.e. !HasFreePostingLists()
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::StatusOr<PostingListAndBlockInfo> AllocatePostingList();
+
+ // Frees a posting list at posting_list_index, adds it into the free list
+ // chain and updates block header. Both changes on posting list free and
+ // header will be synced to disk.
+ //
+ // It is considered an error to "double-free" a posting list. You should never
+ // call FreePostingList(index) with the same index twice, unless that index
+ // was returned by an intervening AllocatePostingList() call.
+ //
+ // Ex.
+ // PostingListIndex index = block.AllocatePostingList();
+ // DoSomething(block.GetAllocatedPostingList(index));
+ // block.FreePostingList(index);
+ // block.FreePostingList(index); // Argh! What are you doing?!
+ // ...
+ // PostingListIndex index = block.AllocatePostingList();
+ // DoSomething(block.GetAllocatedPostingList(index));
+ // block.FreePostingList(index);
+ // index = block.AllocatePostingList();
+ // DoSomethingElse(block.GetAllocatedPostingList(index));
+ // // A-Ok! We called AllocatePostingList() since the last FreePostingList()
+ // // call.
+ // block.FreePostingList(index);
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT_ERROR if posting_list_index < 0 or posting_list_index
+ // >= max_num_posting_lists()
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status FreePostingList(
+ PostingListIndex posting_list_index);
+
+ // Writes back an allocated posting list (PostingListUsed) at
+ // posting_list_index to disk.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT_ERROR
+ // - If posting_list_index < 0 or posting_list_index >=
+ // max_num_posting_lists()
+ // - If posting_list_used.size_in_bytes() != posting_list_bytes_
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status WritePostingListToDisk(
+ const PostingListUsed& posting_list_used,
+ PostingListIndex posting_list_index);
+
+ // PReads to get the index of next block from block header. Blocks can be
+ // chained, and the interpretation of the chaining is up to the caller.
+ //
+ // RETURNS:
+ // - Next block index on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::StatusOr<uint32_t> GetNextBlockIndex() const;
+
+ // PWrites block header to set the index of next block.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status SetNextBlockIndex(uint32_t next_block_index);
+
+ // PReads to get whether or not there are available posting lists in the free
+ // list.
+ //
+ // RETURNS:
+ // - A bool value on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::StatusOr<bool> HasFreePostingLists() const;
+
+ // Retrieves the size (in bytes) of the posting lists in this IndexBlock.
+ uint32_t posting_list_bytes() const { return posting_list_bytes_; }
+
+  // Retrieves the maximum number of posting lists in the block.
+ uint32_t max_num_posting_lists() const {
+ return total_posting_lists_bytes() / posting_list_bytes_;
+ }
+
+  // Retrieves the number of bits required to store the largest
+  // PostingListIndex in this block.
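+  //
+  // E.g. (numbers from the unit test): a block holding 204 posting lists
+  // needs 8 bits to address indices 0..203; one holding 20 needs only 5 bits.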
+ int posting_list_index_bits() const {
+ return BitsToStore(max_num_posting_lists());
+ }
+
+ private:
+ struct BlockHeader {
+ // Index of the next block if this block is being chained or part of a free
+ // list.
+ uint32_t next_block_index;
+
+ // Index to the first PostingListFree in the IndexBlock. This is the start
+ // of the free list.
+ PostingListIndex free_list_posting_list_index;
+
+ // The size of each posting list in the IndexBlock. This value will be
+ // initialized when calling CreateFromUninitializedRegion once and remain
+ // unchanged.
+ uint32_t posting_list_bytes;
+ };
+
+ // Assumes that fd has been opened for write.
+ explicit IndexBlock(const Filesystem* filesystem,
+ PostingListSerializer* serializer, int fd,
+ off_t block_file_offset, uint32_t block_size_in_bytes,
+ uint32_t posting_list_bytes)
+ : filesystem_(filesystem),
+ serializer_(serializer),
+ fd_(fd),
+ block_file_offset_(block_file_offset),
+ block_size_in_bytes_(block_size_in_bytes),
+ posting_list_bytes_(posting_list_bytes) {}
+
+ // Resets IndexBlock to hold posting lists of posting_list_bytes size and adds
+ // all posting lists to the free list.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status Reset();
+
+ // Frees a posting list at posting_list_index, adds it into the free list
+ // chain and updates (sets) the given block header instance.
+ //
+  // - This function exists to avoid redundant block header PWrites when
+  //   freeing multiple posting lists.
+ // - The caller should provide a BlockHeader instance for updating the free
+ // list chain, and finally sync it to disk.
+ //
+  // REQUIRES: 0 <= posting_list_index < max_num_posting_lists()
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status FreePostingListImpl(
+ BlockHeader& header, PostingListIndex posting_list_index);
+
+ // PReads block header from the file.
+ //
+ // RETURNS:
+ // - A BlockHeader instance on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::StatusOr<BlockHeader> ReadHeader() const;
+
+ // PReads posting list content at posting_list_index. Note that it can be a
+ // freed or allocated posting list.
+ //
+  // REQUIRES: 0 <= posting_list_index < max_num_posting_lists()
+ //
+ // RETURNS:
+ // - A data buffer with size = posting_list_bytes_ on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::StatusOr<std::unique_ptr<uint8_t[]>> ReadPostingList(
+ PostingListIndex posting_list_index) const;
+
+ // PWrites block header to the file.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status WriteHeader(const BlockHeader& header);
+
+ // PWrites posting list content at posting_list_index. Note that it can be a
+ // freed or allocated posting list.
+ //
+  // REQUIRES: 0 <= posting_list_index < max_num_posting_lists() and size of
+  //           posting_list_buffer is posting_list_bytes_.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status WritePostingList(
+ PostingListIndex posting_list_index, const uint8_t* posting_list_buffer);
+
+ // Retrieves the absolute file (fd) offset of a posting list at
+ // posting_list_index.
+ //
+  // REQUIRES: 0 <= posting_list_index < max_num_posting_lists()
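+  //
+  // E.g. (illustrative numbers): with block_file_offset_ = 4096, a 12-byte
+  // BlockHeader, and 20-byte posting lists, posting list 3 starts at
+  // 4096 + 12 + 20 * 3 = 4168.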
+ off_t get_posting_list_file_offset(
+ PostingListIndex posting_list_index) const {
+ return block_file_offset_ + sizeof(BlockHeader) +
+ posting_list_bytes_ * posting_list_index;
+ }
+
+  // Retrieves the byte size available in the block for posting lists
+  // (excluding the size of the block header).
+ uint32_t total_posting_lists_bytes() const {
+ return block_size_in_bytes_ - sizeof(BlockHeader);
+ }
+
+ const Filesystem* filesystem_; // Does not own.
+
+ PostingListSerializer* serializer_; // Does not own.
+
+ int fd_; // Does not own.
+
+ off_t block_file_offset_;
+ uint32_t block_size_in_bytes_;
+ uint32_t posting_list_bytes_;
+};
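+
+// Ex. overall usage (an illustrative sketch based on the unit tests; error
+// handling elided, and `filesystem`, `serializer` and `fd` are assumed to be
+// set up by the caller, with the file already grown by one 4096-byte block):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       IndexBlock block,
+//       IndexBlock::CreateFromUninitializedRegion(
+//           &filesystem, serializer, fd, /*block_file_offset=*/0,
+//           /*block_size=*/4096, /*posting_list_bytes=*/20));
+//   ICING_ASSIGN_OR_RETURN(IndexBlock::PostingListAndBlockInfo info,
+//                          block.AllocatePostingList());
+//   // ... write data into info.posting_list_used via the serializer ...
+//   ICING_RETURN_IF_ERROR(block.WritePostingListToDisk(
+//       info.posting_list_used, info.posting_list_index));
+//   ICING_RETURN_IF_ERROR(block.FreePostingList(info.posting_list_index));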
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_POSTING_LIST_INDEX_BLOCK_H_
diff --git a/icing/file/posting_list/index-block_test.cc b/icing/file/posting_list/index-block_test.cc
new file mode 100644
index 0000000..ebc9ba4
--- /dev/null
+++ b/icing/file/posting_list/index-block_test.cc
@@ -0,0 +1,357 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/posting_list/index-block.h"
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/main/posting-list-hit-serializer.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+
+static constexpr int kBlockSize = 4096;
+
+class IndexBlockTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/flash";
+ flash_file_ = test_dir_ + "/0";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
+
+ sfd_ = std::make_unique<ScopedFd>(
+ filesystem_.OpenForWrite(flash_file_.c_str()));
+ ASSERT_TRUE(sfd_->is_valid());
+
+ // Grow the file by one block for the IndexBlock to use.
+ ASSERT_TRUE(filesystem_.Grow(sfd_->get(), kBlockSize));
+
+ // TODO: test different serializers
+ serializer_ = std::make_unique<PostingListHitSerializer>();
+ }
+
+ void TearDown() override {
+ serializer_.reset();
+ sfd_.reset();
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ Filesystem filesystem_;
+ std::string test_dir_;
+ std::string flash_file_;
+ std::unique_ptr<ScopedFd> sfd_;
+ std::unique_ptr<PostingListHitSerializer> serializer_;
+};
+
+TEST_F(IndexBlockTest, CreateFromUninitializedRegionProducesEmptyBlock) {
+ constexpr int kPostingListBytes = 20;
+
+ {
+ // Create an IndexBlock from this newly allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
+ &filesystem_, serializer_.get(), sfd_->get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes));
+ EXPECT_THAT(block.HasFreePostingLists(), IsOkAndHolds(IsTrue()));
+ }
+}
+
+TEST_F(IndexBlockTest, SizeAccessorsWorkCorrectly) {
+ constexpr int kPostingListBytes1 = 20;
+
+ // Create an IndexBlock from this newly allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(IndexBlock block,
+ IndexBlock::CreateFromUninitializedRegion(
+ &filesystem_, serializer_.get(), sfd_->get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes1));
+ EXPECT_THAT(block.posting_list_bytes(), Eq(kPostingListBytes1));
+ // There should be (4096 - 12) / 20 = 204 posting lists
+ // (sizeof(BlockHeader)==12). We can store a PostingListIndex of 203 in only 8
+ // bits.
+ EXPECT_THAT(block.max_num_posting_lists(), Eq(204));
+ EXPECT_THAT(block.posting_list_index_bits(), Eq(8));
+
+ constexpr int kPostingListBytes2 = 200;
+
+ // Create an IndexBlock from this newly allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ block, IndexBlock::CreateFromUninitializedRegion(
+ &filesystem_, serializer_.get(), sfd_->get(), /*offset=*/0,
+ kBlockSize, kPostingListBytes2));
+ EXPECT_THAT(block.posting_list_bytes(), Eq(kPostingListBytes2));
+ // There should be (4096 - 12) / 200 = 20 posting lists
+ // (sizeof(BlockHeader)==12). We can store a PostingListIndex of 19 in only 5
+ // bits.
+ EXPECT_THAT(block.max_num_posting_lists(), Eq(20));
+ EXPECT_THAT(block.posting_list_index_bits(), Eq(5));
+}
+
+TEST_F(IndexBlockTest, IndexBlockChangesPersistAcrossInstances) {
+ constexpr int kPostingListBytes = 2000;
+
+ std::vector<Hit> test_hits{
+ Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99),
+ Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17),
+ Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency),
+ };
+ PostingListIndex allocated_index;
+ {
+ // Create an IndexBlock from this newly allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
+ &filesystem_, serializer_.get(), sfd_->get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes));
+ // Add hits to the first posting list.
+ ICING_ASSERT_OK_AND_ASSIGN(IndexBlock::PostingListAndBlockInfo alloc_info,
+ block.AllocatePostingList());
+ for (const Hit& hit : test_hits) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&alloc_info.posting_list_used, hit));
+ }
+ EXPECT_THAT(
+ serializer_->GetHits(&alloc_info.posting_list_used),
+ IsOkAndHolds(ElementsAreArray(test_hits.rbegin(), test_hits.rend())));
+
+ ICING_ASSERT_OK(block.WritePostingListToDisk(
+ alloc_info.posting_list_used, alloc_info.posting_list_index));
+ allocated_index = alloc_info.posting_list_index;
+ }
+ {
+ // Create an IndexBlock from the previously allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ &filesystem_, serializer_.get(), sfd_->get(),
+ /*offset=*/0, kBlockSize));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock::PostingListAndBlockInfo pl_block_info,
+ block.GetAllocatedPostingList(allocated_index));
+ EXPECT_THAT(
+ serializer_->GetHits(&pl_block_info.posting_list_used),
+ IsOkAndHolds(ElementsAreArray(test_hits.rbegin(), test_hits.rend())));
+ EXPECT_THAT(block.HasFreePostingLists(), IsOkAndHolds(IsTrue()));
+ }
+}
+
+TEST_F(IndexBlockTest, IndexBlockMultiplePostingLists) {
+ constexpr int kPostingListBytes = 2000;
+
+ std::vector<Hit> hits_in_posting_list1{
+ Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99),
+ Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17),
+ Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency),
+ };
+ std::vector<Hit> hits_in_posting_list2{
+ Hit(/*section_id=*/12, /*document_id=*/220, /*term_frequency=*/88),
+ Hit(/*section_id=*/17, /*document_id=*/265, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/0, /*document_id=*/287, /*term_frequency=*/2),
+ Hit(/*section_id=*/11, /*document_id=*/306, /*term_frequency=*/12),
+ Hit(/*section_id=*/10, /*document_id=*/306, Hit::kDefaultTermFrequency),
+ };
+ PostingListIndex allocated_index_1;
+ PostingListIndex allocated_index_2;
+ {
+ // Create an IndexBlock from this newly allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
+ &filesystem_, serializer_.get(), sfd_->get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes));
+
+ // Add hits to the first posting list.
+ ICING_ASSERT_OK_AND_ASSIGN(IndexBlock::PostingListAndBlockInfo alloc_info_1,
+ block.AllocatePostingList());
+ for (const Hit& hit : hits_in_posting_list1) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&alloc_info_1.posting_list_used, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&alloc_info_1.posting_list_used),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list1.rbegin(),
+ hits_in_posting_list1.rend())));
+
+ // Add hits to the second posting list.
+ ICING_ASSERT_OK_AND_ASSIGN(IndexBlock::PostingListAndBlockInfo alloc_info_2,
+ block.AllocatePostingList());
+ for (const Hit& hit : hits_in_posting_list2) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&alloc_info_2.posting_list_used, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&alloc_info_2.posting_list_used),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list2.rbegin(),
+ hits_in_posting_list2.rend())));
+
+ EXPECT_THAT(block.AllocatePostingList(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(block.HasFreePostingLists(), IsOkAndHolds(IsFalse()));
+
+ // Write both posting lists to disk.
+ ICING_ASSERT_OK(block.WritePostingListToDisk(
+ alloc_info_1.posting_list_used, alloc_info_1.posting_list_index));
+ ICING_ASSERT_OK(block.WritePostingListToDisk(
+ alloc_info_2.posting_list_used, alloc_info_2.posting_list_index));
+ allocated_index_1 = alloc_info_1.posting_list_index;
+ allocated_index_2 = alloc_info_2.posting_list_index;
+ }
+ {
+ // Create an IndexBlock from the previously allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ &filesystem_, serializer_.get(), sfd_->get(),
+ /*offset=*/0, kBlockSize));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock::PostingListAndBlockInfo pl_block_info_1,
+ block.GetAllocatedPostingList(allocated_index_1));
+ EXPECT_THAT(serializer_->GetHits(&pl_block_info_1.posting_list_used),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list1.rbegin(),
+ hits_in_posting_list1.rend())));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock::PostingListAndBlockInfo pl_block_info_2,
+ block.GetAllocatedPostingList(allocated_index_2));
+ EXPECT_THAT(serializer_->GetHits(&pl_block_info_2.posting_list_used),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list2.rbegin(),
+ hits_in_posting_list2.rend())));
+ EXPECT_THAT(block.AllocatePostingList(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(block.HasFreePostingLists(), IsOkAndHolds(IsFalse()));
+ }
+}
+
+TEST_F(IndexBlockTest, IndexBlockReallocatingPostingLists) {
+ constexpr int kPostingListBytes = 2000;
+
+ // Create an IndexBlock from this newly allocated file block.
+ ICING_ASSERT_OK_AND_ASSIGN(IndexBlock block,
+ IndexBlock::CreateFromUninitializedRegion(
+ &filesystem_, serializer_.get(), sfd_->get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes));
+
+ // Add hits to the first posting list.
+ std::vector<Hit> hits_in_posting_list1{
+ Hit(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/5, /*document_id=*/1, /*term_frequency=*/99),
+ Hit(/*section_id=*/3, /*document_id=*/3, /*term_frequency=*/17),
+ Hit(/*section_id=*/10, /*document_id=*/10, Hit::kDefaultTermFrequency),
+ };
+ ICING_ASSERT_OK_AND_ASSIGN(IndexBlock::PostingListAndBlockInfo alloc_info_1,
+ block.AllocatePostingList());
+ for (const Hit& hit : hits_in_posting_list1) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&alloc_info_1.posting_list_used, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&alloc_info_1.posting_list_used),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list1.rbegin(),
+ hits_in_posting_list1.rend())));
+
+ // Add hits to the second posting list.
+ std::vector<Hit> hits_in_posting_list2{
+ Hit(/*section_id=*/12, /*document_id=*/220, /*term_frequency=*/88),
+ Hit(/*section_id=*/17, /*document_id=*/265, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/0, /*document_id=*/287, /*term_frequency=*/2),
+ Hit(/*section_id=*/11, /*document_id=*/306, /*term_frequency=*/12),
+ Hit(/*section_id=*/10, /*document_id=*/306, Hit::kDefaultTermFrequency),
+ };
+ ICING_ASSERT_OK_AND_ASSIGN(IndexBlock::PostingListAndBlockInfo alloc_info_2,
+ block.AllocatePostingList());
+ for (const Hit& hit : hits_in_posting_list2) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&alloc_info_2.posting_list_used, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&alloc_info_2.posting_list_used),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list2.rbegin(),
+ hits_in_posting_list2.rend())));
+
+ EXPECT_THAT(block.AllocatePostingList(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(block.HasFreePostingLists(), IsOkAndHolds(IsFalse()));
+
+ // Now free the first posting list. Then, reallocate it and fill it with a
+ // different set of hits.
+ ICING_ASSERT_OK(block.FreePostingList(alloc_info_1.posting_list_index));
+ EXPECT_THAT(block.HasFreePostingLists(), IsOkAndHolds(IsTrue()));
+
+ std::vector<Hit> hits_in_posting_list3{
+ Hit(/*section_id=*/12, /*document_id=*/0, /*term_frequency=*/88),
+ Hit(/*section_id=*/17, /*document_id=*/1, Hit::kDefaultTermFrequency),
+ Hit(/*section_id=*/0, /*document_id=*/2, /*term_frequency=*/2),
+ };
+ ICING_ASSERT_OK_AND_ASSIGN(IndexBlock::PostingListAndBlockInfo alloc_info_3,
+ block.AllocatePostingList());
+  // The reallocated posting list should reuse the index freed above.
+  EXPECT_THAT(alloc_info_3.posting_list_index,
+              Eq(alloc_info_1.posting_list_index));
+ for (const Hit& hit : hits_in_posting_list3) {
+ ICING_ASSERT_OK(
+ serializer_->PrependHit(&alloc_info_3.posting_list_used, hit));
+ }
+ EXPECT_THAT(serializer_->GetHits(&alloc_info_3.posting_list_used),
+ IsOkAndHolds(ElementsAreArray(hits_in_posting_list3.rbegin(),
+ hits_in_posting_list3.rend())));
+ EXPECT_THAT(block.AllocatePostingList(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(block.HasFreePostingLists(), IsOkAndHolds(IsFalse()));
+}
+
+TEST_F(IndexBlockTest, IndexBlockNextBlockIndex) {
+ constexpr int kPostingListBytes = 2000;
+ constexpr int kSomeBlockIndex = 22;
+
+ {
+ // Create an IndexBlock from this newly allocated file block and set the
+ // next block index.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
+ &filesystem_, serializer_.get(), sfd_->get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes));
+ EXPECT_THAT(block.GetNextBlockIndex(), IsOkAndHolds(kInvalidBlockIndex));
+ EXPECT_THAT(block.SetNextBlockIndex(kSomeBlockIndex), IsOk());
+ EXPECT_THAT(block.GetNextBlockIndex(), IsOkAndHolds(kSomeBlockIndex));
+ }
+ {
+ // Create an IndexBlock from this previously allocated file block and make
+ // sure that next_block_index is still set properly.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromPreexistingIndexBlockRegion(
+ &filesystem_, serializer_.get(), sfd_->get(),
+ /*offset=*/0, kBlockSize));
+ EXPECT_THAT(block.GetNextBlockIndex(), IsOkAndHolds(kSomeBlockIndex));
+ }
+ {
+    // Create an IndexBlock, treating this file block as uninitialized. This
+    // resets the next_block_index to kInvalidBlockIndex.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ IndexBlock block, IndexBlock::CreateFromUninitializedRegion(
+ &filesystem_, serializer_.get(), sfd_->get(),
+ /*offset=*/0, kBlockSize, kPostingListBytes));
+ EXPECT_THAT(block.GetNextBlockIndex(), IsOkAndHolds(kInvalidBlockIndex));
+ }
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/posting_list/posting-list-accessor.cc b/icing/file/posting_list/posting-list-accessor.cc
new file mode 100644
index 0000000..a7cdb17
--- /dev/null
+++ b/icing/file/posting_list/posting-list-accessor.cc
@@ -0,0 +1,136 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/posting_list/posting-list-accessor.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::Status PostingListAccessor::FlushPreexistingPostingList() {
+ if (preexisting_posting_list_->posting_list.size_in_bytes() ==
+ storage_->max_posting_list_bytes()) {
+ // If this is a max-sized posting list, then sync to disk and keep track of
+ // the id.
+ ICING_RETURN_IF_ERROR(
+ storage_->WritePostingListToDisk(*preexisting_posting_list_));
+ prev_block_identifier_ = preexisting_posting_list_->id;
+ } else {
+ // If this is NOT a max-sized posting list, then our data have outgrown this
+ // particular posting list. Move the data into the in-memory posting list
+ // and free this posting list.
+ //
+ // Move will always succeed since in_memory_posting_list_ is max_pl_bytes.
+ ICING_RETURN_IF_ERROR(GetSerializer()->MoveFrom(
+ /*dst=*/&in_memory_posting_list_,
+ /*src=*/&preexisting_posting_list_->posting_list));
+
+ // Now that all the contents of this posting list have been copied, there's
+ // no more use for it. Make it available to be used for another posting
+ // list.
+ ICING_RETURN_IF_ERROR(
+ storage_->FreePostingList(std::move(*preexisting_posting_list_)));
+ }
+ preexisting_posting_list_.reset();
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status PostingListAccessor::FlushInMemoryPostingList() {
+ // We exceeded max_pl_bytes(). Need to flush in_memory_posting_list_ and
+ // update the chain.
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage_->AllocateAndChainMaxSizePostingList(
+ prev_block_identifier_.block_index()));
+ ICING_RETURN_IF_ERROR(
+ GetSerializer()->MoveFrom(/*dst=*/&holder.posting_list,
+ /*src=*/&in_memory_posting_list_));
+ ICING_RETURN_IF_ERROR(storage_->WritePostingListToDisk(holder));
+
+ // Set prev block id only if persist to disk succeeded.
+ prev_block_identifier_ = holder.id;
+ return libtextclassifier3::Status::OK;
+}
+
+PostingListAccessor::FinalizeResult PostingListAccessor::Finalize() && {
+ if (preexisting_posting_list_ != nullptr) {
+ // Sync to disk.
+ return FinalizeResult(
+ storage_->WritePostingListToDisk(*preexisting_posting_list_),
+ preexisting_posting_list_->id);
+ }
+
+ if (GetSerializer()->GetBytesUsed(&in_memory_posting_list_) <= 0) {
+ return FinalizeResult(absl_ports::InvalidArgumentError(
+ "Can't finalize an empty PostingListAccessor. "
+ "There's nothing to Finalize!"),
+ PostingListIdentifier::kInvalid);
+ }
+
+ libtextclassifier3::StatusOr<PostingListHolder> holder_or;
+ if (prev_block_identifier_.is_valid()) {
+ // If prev_block_identifier_ is valid, then it means there was a max-sized
+ // posting list, so we have to allocate another new max size posting list
+ // and chain them together.
+ holder_or = storage_->AllocateAndChainMaxSizePostingList(
+ prev_block_identifier_.block_index());
+ } else {
+ // Otherwise, it is the first posting list, and we can use smaller size pl.
+ // Note that even if it needs a max-sized posting list here, it is ok to
+ // call AllocatePostingList without setting next block index since we don't
+ // have any previous posting list to chain and AllocatePostingList will set
+ // next block index to kInvalidBlockIndex.
+ uint32_t posting_list_bytes =
+ GetSerializer()->GetMinPostingListSizeToFit(&in_memory_posting_list_);
+ holder_or = storage_->AllocatePostingList(posting_list_bytes);
+ }
+
+ if (!holder_or.ok()) {
+ return FinalizeResult(std::move(holder_or).status(),
+ prev_block_identifier_);
+ }
+ PostingListHolder holder = std::move(holder_or).ValueOrDie();
+
+  // Move to the allocated area. This should never actually return an error.
+  // We know that holder.posting_list is valid because it wouldn't have been
+  // returned successfully by AllocatePostingList if it weren't. We know
+  // in_memory_posting_list_ is valid because we created it in-memory. And
+  // finally, we know that the data from in_memory_posting_list_ will fit in
+  // holder.posting_list because we requested that it be at least
+  // posting_list_bytes large.
+ auto status = GetSerializer()->MoveFrom(/*dst=*/&holder.posting_list,
+ /*src=*/&in_memory_posting_list_);
+ if (!status.ok()) {
+ return FinalizeResult(std::move(status), prev_block_identifier_);
+ }
+
+ status = storage_->WritePostingListToDisk(holder);
+ if (!status.ok()) {
+ return FinalizeResult(std::move(status), prev_block_identifier_);
+ }
+ return FinalizeResult(libtextclassifier3::Status::OK, holder.id);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/posting_list/posting-list-accessor.h b/icing/file/posting_list/posting-list-accessor.h
new file mode 100644
index 0000000..91f1f2d
--- /dev/null
+++ b/icing/file/posting_list/posting-list-accessor.h
@@ -0,0 +1,118 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_ACCESSOR_H_
+#define ICING_FILE_POSTING_LIST_POSTING_LIST_ACCESSOR_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
+
+namespace icing {
+namespace lib {
+
+// This class serves to:
+// 1. Expose PostingListUseds to clients of FlashIndexStorage
+// 2. Handle flushing posting lists properly, including choosing the most
+//    efficient PL size, chaining max-sized PLs correctly, etc.
+// 3. Ensure that PostingListUseds can only be freed by calling methods which
+// will also properly maintain the FlashIndexStorage free list and prevent
+// callers from modifying the Posting List after freeing.
+class PostingListAccessor {
+ public:
+ virtual ~PostingListAccessor() = default;
+
+ struct FinalizeResult {
+ // - OK on success
+ // - INVALID_ARGUMENT if there was no pre-existing posting list and no
+ // data were added
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a
+ // new posting list.
+ libtextclassifier3::Status status;
+ // Id of the posting list chain that was finalized. Guaranteed to be valid
+ // if status is OK. May be valid if status is non-OK, but previous blocks
+ // were written.
+ PostingListIdentifier id;
+
+ explicit FinalizeResult(libtextclassifier3::Status status_in,
+ PostingListIdentifier id_in)
+ : status(std::move(status_in)), id(std::move(id_in)) {}
+ };
+ // Write all accumulated data to storage.
+ //
+  // If the accessor points to a posting list chain with multiple posting
+  // lists and is unable to write the last posting list in the chain, Finalize
+  // will return the error and also populate id with the id of the
+  // second-to-last posting list.
+ FinalizeResult Finalize() &&;
+
+ virtual PostingListSerializer* GetSerializer() = 0;
+
+ protected:
+ explicit PostingListAccessor(FlashIndexStorage* storage,
+ PostingListUsed in_memory_posting_list)
+ : storage_(storage),
+ prev_block_identifier_(PostingListIdentifier::kInvalid),
+ in_memory_posting_list_(std::move(in_memory_posting_list)),
+ has_reached_posting_list_chain_end_(false) {}
+
+ // Flushes preexisting_posting_list_ to disk if it's a max-sized posting list
+ // and populates prev_block_identifier.
+ // If it's not a max-sized posting list, moves the contents of
+ // preexisting_posting_list_ to in_memory_posting_list_ and frees
+ // preexisting_posting_list_.
+ // Sets preexisting_posting_list_ to nullptr.
+ libtextclassifier3::Status FlushPreexistingPostingList();
+
+ // Flushes in_memory_posting_list_ to a max-sized posting list on disk, chains
+ // the newly allocated max-size posting list block by setting its next pointer
+ // to prev_block_identifier_, and updates prev_block_identifier_ to point to
+ // the newly allocated posting list.
+ libtextclassifier3::Status FlushInMemoryPostingList();
+
+ // Frees all posting lists in the posting list chain starting at
+ // prev_block_identifier_.
+ libtextclassifier3::Status FreePostingListChain();
+
+ FlashIndexStorage* storage_; // Does not own.
+
+ // The PostingListIdentifier of the first max-sized posting list in the
+ // posting list chain or PostingListIdentifier::kInvalid if there is no
+ // posting list chain.
+ PostingListIdentifier prev_block_identifier_;
+
+ // An editor to an existing posting list on disk. If available (non-NULL),
+ // we'll try to add all data to this posting list. Once this posting list
+ // fills up, we'll either 1) chain it (if a max-sized posting list) and put
+ // future data in in_memory_posting_list_ or 2) copy all of its data into
+ // in_memory_posting_list_ and free this pl (if not a max-sized posting list).
+ // TODO(tjbarron) provide a benchmark to demonstrate the effects that re-using
+ // existing posting lists has on latency.
+ std::unique_ptr<PostingListHolder> preexisting_posting_list_;
+
+ // In-memory posting list used to buffer data before writing them to the
+ // smallest on-disk posting list that will fit them.
+ PostingListUsed in_memory_posting_list_;
+
+ bool has_reached_posting_list_chain_end_;
+};
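+
+// A minimal sketch of how a concrete accessor might look and be used. The
+// subclass name, its member, and its data-prepending API are hypothetical and
+// shown for illustration only; only GetSerializer() is required here:
+//
+//   class HitListAccessor : public PostingListAccessor {
+//    public:
+//     PostingListSerializer* GetSerializer() override { return serializer_; }
+//     ...
+//   };
+//
+//   HitListAccessor accessor = ...;
+//   ...prepend data via the subclass's own API...
+//   PostingListAccessor::FinalizeResult result =
+//       std::move(accessor).Finalize();  // Finalize() requires an rvalue.
+//   if (result.status.ok()) {
+//     PostingListIdentifier id = result.id;  // persist this id
+//   }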
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_ACCESSOR_H_
diff --git a/icing/file/posting_list/posting-list-common.h b/icing/file/posting_list/posting-list-common.h
new file mode 100644
index 0000000..44c6dd2
--- /dev/null
+++ b/icing/file/posting_list/posting-list-common.h
@@ -0,0 +1,33 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_COMMON_H_
+#define ICING_FILE_POSTING_LIST_POSTING_LIST_COMMON_H_
+
+#include <cstdint>
+
+namespace icing {
+namespace lib {
+
+// A FlashIndexBlock can contain multiple posting lists. This specifies which
+// PostingList in the FlashIndexBlock we want to refer to.
+using PostingListIndex = int32_t;
+inline constexpr PostingListIndex kInvalidPostingListIndex = ~0U;
+
+inline constexpr uint32_t kInvalidBlockIndex = 0;
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_COMMON_H_
diff --git a/icing/index/posting-list-free.h b/icing/file/posting_list/posting-list-free.h
index a2eba82..073e344 100644
--- a/icing/index/posting-list-free.h
+++ b/icing/file/posting_list/posting-list-free.h
@@ -12,30 +12,22 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_POSTING_LIST_FREE_H_
-#define ICING_INDEX_POSTING_LIST_FREE_H_
-
-#include <string.h>
-#include <sys/mman.h>
+#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_FREE_H_
+#define ICING_FILE_POSTING_LIST_POSTING_LIST_FREE_H_
#include <cstdint>
+#include <cstring>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
-#include "icing/index/hit/hit.h"
-#include "icing/index/posting-list-utils.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-utils.h"
#include "icing/legacy/core/icing-string-util.h"
-#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
-// A FlashIndexBlock can contain multiple posting lists. This specifies which
-// PostingList in the FlashIndexBlock we want to refer to.
-using PostingListIndex = uint32_t;
-inline constexpr PostingListIndex kInvalidPostingListIndex = ~0U;
-
// A posting list in the index block's free list.
//
// We re-use the first sizeof(PostingListIndex) bytes of the posting list
@@ -51,14 +43,17 @@ class PostingListFree {
//
// RETURNS:
// - A valid PostingListFree on success
- // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
- // || size_in_bytes % sizeof(Hit) != 0.
+ // - INVALID_ARGUMENT if posting_list_utils::IsValidPostingListSize check
+ // fails
// - FAILED_PRECONDITION if posting_list_buffer is null
static libtextclassifier3::StatusOr<PostingListFree>
- CreateFromPreexistingPostingListFreeRegion(void *posting_list_buffer,
- uint32_t size_in_bytes) {
+ CreateFromPreexistingPostingListFreeRegion(void* posting_list_buffer,
+ uint32_t size_in_bytes,
+ uint32_t data_type_bytes,
+ uint32_t min_posting_list_size) {
ICING_RETURN_ERROR_IF_NULL(posting_list_buffer);
- if (!posting_list_utils::IsValidPostingListSize(size_in_bytes)) {
+ if (!posting_list_utils::IsValidPostingListSize(
+ size_in_bytes, data_type_bytes, min_posting_list_size)) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Requested posting list size %d is invalid!", size_in_bytes));
}
@@ -74,15 +69,17 @@ class PostingListFree {
//
// RETURNS:
// - A valid PostingListFree on success
- // - INVALID_ARGUMENT if size_in_bytes < min_size() || size_in_bytes %
- // sizeof(Hit) != 0.
+ // - INVALID_ARGUMENT if posting_list_utils::IsValidPostingListSize check
+ // fails
// - FAILED_PRECONDITION if posting_list_buffer is null
static libtextclassifier3::StatusOr<PostingListFree>
- CreateFromUnitializedRegion(void *posting_list_buffer,
- uint32_t size_in_bytes) {
+ CreateFromUnitializedRegion(void* posting_list_buffer, uint32_t size_in_bytes,
+ uint32_t data_type_bytes,
+ uint32_t min_posting_list_size) {
ICING_ASSIGN_OR_RETURN(PostingListFree posting_list_free,
CreateFromPreexistingPostingListFreeRegion(
- posting_list_buffer, size_in_bytes));
+ posting_list_buffer, size_in_bytes,
+ data_type_bytes, min_posting_list_size));
posting_list_free.Clear();
return posting_list_free;
}
@@ -101,8 +98,8 @@ class PostingListFree {
}
private:
- PostingListFree(void *posting_list_buffer, uint32_t size_in_bytes)
- : posting_list_buffer_(static_cast<uint8_t *>(posting_list_buffer)),
+ explicit PostingListFree(void* posting_list_buffer, uint32_t size_in_bytes)
+ : posting_list_buffer_(static_cast<uint8_t*>(posting_list_buffer)),
size_in_bytes_(size_in_bytes) {}
// Reset the current free posting list as unchained free posting list so that
@@ -114,16 +111,11 @@ class PostingListFree {
// A byte array of size size_in_bytes_. The first sizeof(PostingListIndex)
// bytes which will store the next posting list index, the rest are unused and
// can be anything.
- uint8_t *posting_list_buffer_;
- uint32_t size_in_bytes_;
-
- static_assert(sizeof(PostingListIndex) <=
- posting_list_utils::min_posting_list_size(),
- "PostingListIndex must be small enough to fit in a "
- "minimum-sized Posting List.");
+ uint8_t* posting_list_buffer_;
+ [[maybe_unused]] uint32_t size_in_bytes_;
};
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_POSTING_LIST_FREE_H_
+#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_FREE_H_
diff --git a/icing/index/posting-list-free_test.cc b/icing/file/posting_list/posting-list-free_test.cc
index 80b8957..503012d 100644
--- a/icing/index/posting-list-free_test.cc
+++ b/icing/file/posting_list/posting-list-free_test.cc
@@ -12,14 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/posting-list-free.h"
+#include "icing/file/posting_list/posting-list-free.h"
#include <cstdint>
#include <memory>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gtest/gtest.h"
-#include "icing/index/posting-list-utils.h"
+#include "icing/index/main/posting-list-hit-serializer.h"
#include "icing/testing/common-matchers.h"
namespace icing {
@@ -27,55 +27,76 @@ namespace lib {
namespace {
+// TODO(b/249829533): test different serializers
+
TEST(PostingListTest, PostingListFree) {
+ PostingListHitSerializer serializer;
static const size_t kHitsSize = 2551 * sizeof(Hit);
std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
ICING_ASSERT_OK_AND_ASSIGN(
PostingListFree pl_free,
PostingListFree::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ static_cast<void *>(hits_buf.get()), kHitsSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()));
EXPECT_EQ(pl_free.get_next_posting_list_index(), kInvalidPostingListIndex);
}
TEST(PostingListTest, PostingListTooSmallInvalidArgument) {
- static const size_t kHitSizeTooSmall =
- posting_list_utils::min_posting_list_size() - sizeof(Hit);
+ PostingListHitSerializer serializer;
+ const size_t kHitSizeTooSmall =
+ serializer.GetMinPostingListSize() - sizeof(Hit);
std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitSizeTooSmall);
- EXPECT_THAT(PostingListFree::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitSizeTooSmall),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(PostingListFree::CreateFromPreexistingPostingListFreeRegion(
- static_cast<void *>(hits_buf.get()), kHitSizeTooSmall),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ PostingListFree::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitSizeTooSmall,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ PostingListFree::CreateFromPreexistingPostingListFreeRegion(
+ static_cast<void *>(hits_buf.get()), kHitSizeTooSmall,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
TEST(PostingListTest, PostingListNotAlignedInvalidArgument) {
- static const size_t kHitSizeNotAligned =
- posting_list_utils::min_posting_list_size() + 1;
+ PostingListHitSerializer serializer;
+ const size_t kHitSizeNotAligned = serializer.GetMinPostingListSize() + 1;
std::unique_ptr<char[]> hits_buf =
std::make_unique<char[]>(kHitSizeNotAligned);
- EXPECT_THAT(PostingListFree::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitSizeNotAligned),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(PostingListFree::CreateFromPreexistingPostingListFreeRegion(
- static_cast<void *>(hits_buf.get()), kHitSizeNotAligned),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ PostingListFree::CreateFromUnitializedRegion(
+ static_cast<void *>(hits_buf.get()), kHitSizeNotAligned,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ PostingListFree::CreateFromPreexistingPostingListFreeRegion(
+ static_cast<void *>(hits_buf.get()), kHitSizeNotAligned,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
TEST(PostingListTest, PostingListNullBufferFailedPrecondition) {
- static const size_t kHitSize = posting_list_utils::min_posting_list_size();
- EXPECT_THAT(PostingListFree::CreateFromUnitializedRegion(
- /*posting_list_buffer=*/nullptr, kHitSize),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
- EXPECT_THAT(PostingListFree::CreateFromPreexistingPostingListFreeRegion(
- /*posting_list_buffer=*/nullptr, kHitSize),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ PostingListHitSerializer serializer;
+ const size_t kHitSize = serializer.GetMinPostingListSize();
+
+ // nullptr posting_list_buffer
+ EXPECT_THAT(
+ PostingListFree::CreateFromUnitializedRegion(
+ /*posting_list_buffer=*/nullptr, kHitSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ PostingListFree::CreateFromPreexistingPostingListFreeRegion(
+ /*posting_list_buffer=*/nullptr, kHitSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
TEST(PostingListTest, PostingListFreePreexistingRegion) {
+ PostingListHitSerializer serializer;
constexpr PostingListIndex kOtherPostingListIndex = 12;
static const size_t kHitsSize = 2551 * sizeof(Hit);
@@ -85,7 +106,8 @@ TEST(PostingListTest, PostingListFreePreexistingRegion) {
ICING_ASSERT_OK_AND_ASSIGN(
PostingListFree pl_free,
PostingListFree::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ static_cast<void *>(hits_buf.get()), kHitsSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()));
pl_free.set_next_posting_list_index(kOtherPostingListIndex);
EXPECT_EQ(pl_free.get_next_posting_list_index(), kOtherPostingListIndex);
}
@@ -95,12 +117,14 @@ TEST(PostingListTest, PostingListFreePreexistingRegion) {
ICING_ASSERT_OK_AND_ASSIGN(
PostingListFree pl_free,
PostingListFree::CreateFromPreexistingPostingListFreeRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ static_cast<void *>(hits_buf.get()), kHitsSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()));
EXPECT_EQ(pl_free.get_next_posting_list_index(), kOtherPostingListIndex);
}
}
TEST(PostingListTest, PostingListFreeUninitializedRegion) {
+ PostingListHitSerializer serializer;
constexpr PostingListIndex kOtherPostingListIndex = 12;
static const size_t kHitsSize = 2551 * sizeof(Hit);
@@ -110,7 +134,8 @@ TEST(PostingListTest, PostingListFreeUninitializedRegion) {
ICING_ASSERT_OK_AND_ASSIGN(
PostingListFree pl_free,
PostingListFree::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ static_cast<void *>(hits_buf.get()), kHitsSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()));
pl_free.set_next_posting_list_index(kOtherPostingListIndex);
EXPECT_EQ(pl_free.get_next_posting_list_index(), kOtherPostingListIndex);
}
@@ -120,7 +145,8 @@ TEST(PostingListTest, PostingListFreeUninitializedRegion) {
ICING_ASSERT_OK_AND_ASSIGN(
PostingListFree pl_free,
PostingListFree::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
+ static_cast<void *>(hits_buf.get()), kHitsSize,
+ serializer.GetDataTypeBytes(), serializer.GetMinPostingListSize()));
EXPECT_EQ(pl_free.get_next_posting_list_index(), kInvalidPostingListIndex);
}
}
diff --git a/icing/file/posting_list/posting-list-identifier.cc b/icing/file/posting_list/posting-list-identifier.cc
new file mode 100644
index 0000000..4491c38
--- /dev/null
+++ b/icing/file/posting_list/posting-list-identifier.cc
@@ -0,0 +1,27 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/posting_list/posting-list-identifier.h"
+
+#include "icing/file/posting_list/posting-list-common.h"
+
+namespace icing {
+namespace lib {
+
+PostingListIdentifier PostingListIdentifier::kInvalid(
+ kInvalidBlockIndex, /*posting_list_index=*/0,
+ PostingListIdentifier::kEncodedPostingListIndexBits - 1);
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/posting_list/posting-list-identifier.h b/icing/file/posting_list/posting-list-identifier.h
new file mode 100644
index 0000000..8a0229b
--- /dev/null
+++ b/icing/file/posting_list/posting-list-identifier.h
@@ -0,0 +1,120 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_IDENTIFIER_H_
+#define ICING_FILE_POSTING_LIST_POSTING_LIST_IDENTIFIER_H_
+
+#include <cstdint>
+
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/legacy/index/icing-bit-util.h"
+
+namespace icing {
+namespace lib {
+
+// 1M blocks * 4K page size = 4GB index
+inline constexpr int kBlockIndexBits = 20;
+inline constexpr int kMaxBlockIndex = (1u << kBlockIndexBits) - 1;
+
+// Class used to store information necessary to identify any posting list within
+// the index.
+//
+// The 20 leftmost bits in this identifier encode the block index. The 12
+// rightmost bits encode both the posting list index and the maximum number of
+// bits required to encode a posting list index on that block.
+//
+// Ex. An index block containing a max of 68 posting lists each of size 60
+// bytes (and thus 7 posting list bits), with a block index of 13 and a posting
+// list index of 5.
+// 0000 0000 0000 0000 1101 1111 0000 0101
+// |__________block-index_______|__pad__|_pl-index_|
+//
+// "pad" is some region starting at kEncodedPostingListIndexBits (12) bit and
+// continuing rightward until reaching a terminating "0". This padding encodes
+// the posting list bits value - posting list bits value is the number of bits
+// after the terminating '0' of the "pad" region.
+//
+// This value will eventually be stored in the Main Lexicon.
+class PostingListIdentifier {
+ // 1 bit is wasted to encode max pl index bits so there can be at most 2^11
+ // posting lists per block. Block size would have to be >=40020 bytes for
+  // there to be more than 2K posting lists in a block.
+ static constexpr int kEncodedPostingListIndexBits = 12;
+ static_assert(kEncodedPostingListIndexBits + kBlockIndexBits <=
+ 8 * sizeof(uint32_t),
+ "Not enough room in PostingListIdentifier value to encode "
+ "block index and posting list index.");
+
+ public:
+ static PostingListIdentifier kInvalid;
+
+ explicit PostingListIdentifier() { *this = kInvalid; }
+
+ // 1. block_index - the index of this block within the FlashIndexStorage file
+ // 2. posting_list_index - the index of this posting list within the block
+ // 3. posting_list_index_bits - the number of bits needed to encode the
+ // largest posting_list_index that this block can have.
+ explicit PostingListIdentifier(uint32_t block_index,
+ PostingListIndex posting_list_index,
+ int posting_list_index_bits) {
+ val_ = 0;
+ BITFIELD_OR(val_, /*offset=*/0, /*len=*/posting_list_index_bits,
+ /*val=*/static_cast<uint64_t>(posting_list_index));
+ BITFIELD_OR(
+ val_, /*offset=*/posting_list_index_bits + 1,
+ /*len=*/kEncodedPostingListIndexBits - posting_list_index_bits - 1,
+ /*val=*/~0u);
+ BITFIELD_OR(val_, /*offset=*/kEncodedPostingListIndexBits,
+ /*len=*/kBlockIndexBits,
+ /*val=*/block_index);
+ }
+
+ uint32_t block_index() const {
+ return BITFIELD_GET(val_, kEncodedPostingListIndexBits, kBlockIndexBits);
+ }
+
+ PostingListIndex posting_list_index() const {
+ return BITFIELD_GET(val_, 0, posting_list_index_bits());
+ }
+
+ // Returns the maximum number of bits that a posting list index on the block
+ // referred to by block_index could use.
+ int posting_list_index_bits() const {
+ for (int bits = kEncodedPostingListIndexBits - 1; bits >= 0; --bits) {
+ if (((1u << bits) & val_) == 0) {
+ // Got to the zero bit. This is the start of pl index.
+ return bits;
+ }
+ }
+ return -1;
+ }
+
+ bool is_valid() const { return *this != kInvalid; }
+
+ bool operator==(const PostingListIdentifier& rhs) const {
+ return val_ == rhs.val_;
+ }
+ bool operator!=(const PostingListIdentifier& rhs) const {
+ return !(*this == rhs);
+ }
+
+ private:
+ uint32_t val_;
+} __attribute__((packed));
+static_assert(sizeof(PostingListIdentifier) == 4, "");
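+
+// Worked example (values derived from the layout comment above, not from any
+// additional API):
+//
+//   PostingListIdentifier id(/*block_index=*/13, /*posting_list_index=*/5,
+//                            /*posting_list_index_bits=*/7);
+//   // Raw value: 13 << 12 | 0b1111'0000'0101 == 0x0000DF05.
+//   // id.block_index() == 13, id.posting_list_index() == 5, and
+//   // id.posting_list_index_bits() == 7 (bit 7 is the terminating '0').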
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_IDENTIFIER_H_
diff --git a/icing/file/posting_list/posting-list-used.cc b/icing/file/posting_list/posting-list-used.cc
new file mode 100644
index 0000000..d049b13
--- /dev/null
+++ b/icing/file/posting_list/posting-list-used.cc
@@ -0,0 +1,58 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/posting_list/posting-list-used.h"
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/posting_list/posting-list-utils.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<PostingListUsed>
+PostingListUsed::CreateFromPreexistingPostingListUsedRegion(
+ PostingListSerializer* serializer,
+ std::unique_ptr<uint8_t[]> posting_list_buffer, uint32_t size_in_bytes) {
+ ICING_RETURN_ERROR_IF_NULL(serializer);
+ ICING_RETURN_ERROR_IF_NULL(posting_list_buffer);
+
+ if (!posting_list_utils::IsValidPostingListSize(
+ size_in_bytes, serializer->GetDataTypeBytes(),
+ serializer->GetMinPostingListSize())) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Requested posting list size %d is invalid!", size_in_bytes));
+ }
+ return PostingListUsed(std::move(posting_list_buffer), size_in_bytes);
+}
+
+libtextclassifier3::StatusOr<PostingListUsed>
+PostingListUsed::CreateFromUnitializedRegion(PostingListSerializer* serializer,
+ uint32_t size_in_bytes) {
+ ICING_ASSIGN_OR_RETURN(
+ PostingListUsed posting_list_used,
+ CreateFromPreexistingPostingListUsedRegion(
+ serializer, std::make_unique<uint8_t[]>(size_in_bytes),
+ size_in_bytes));
+ serializer->Clear(&posting_list_used);
+ return posting_list_used;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/posting_list/posting-list-used.h b/icing/file/posting_list/posting-list-used.h
new file mode 100644
index 0000000..980d210
--- /dev/null
+++ b/icing/file/posting_list/posting-list-used.h
@@ -0,0 +1,174 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_USED_H_
+#define ICING_FILE_POSTING_LIST_POSTING_LIST_USED_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+
+namespace icing {
+namespace lib {
+
+class PostingListUsed;
+
+// Interface for PostingListUsed data serialization and deserialization.
+// - It contains several common methods used by lower level of posting list
+// management related classes (e.g. FlashIndexStorage, IndexBlock,
+// PostingListUsed, etc).
+// - Higher level classes (e.g. MainIndex) create their desired serializers
+// according to the data type they're dealing with, and pass the instance down
+// to all posting list management related classes.
+// - Data specific methods can also be implemented in each serializer. They
+// won't be used by posting list management related classes, but higher level
+// classes are able to call it and deal with the specific data type.
+//
+// E.g. main index stores 'Hit' data into posting lists.
+// - MainIndex creates PostingListUsedHitSerializer instance and uses hit data
+// related methods to serialize/deserialize Hit data to/from posting lists.
+// - FlashIndexStorage, IndexBlock, PostingListUsed use the serializer created
+// by MainIndex, but hold the reference/pointer in the interface format
+// (PostingListSerializer) and only use common interface methods to manage
+// posting list.
+class PostingListSerializer {
+ public:
+ // Special data is either a DataType instance or data_start_offset.
+ template <typename DataType>
+ union SpecialData {
+ explicit SpecialData(const DataType& data) : data_(data) {}
+
+ explicit SpecialData(uint32_t data_start_offset)
+ : data_start_offset_(data_start_offset) {}
+
+ const DataType& data() const { return data_; }
+
+ uint32_t data_start_offset() const { return data_start_offset_; }
+ void set_data_start_offset(uint32_t data_start_offset) {
+ data_start_offset_ = data_start_offset;
+ }
+
+ private:
+ DataType data_;
+ uint32_t data_start_offset_;
+ } __attribute__((packed));
+
+ static constexpr uint32_t kNumSpecialData = 2;
+
+ virtual ~PostingListSerializer() = default;
+
+ // Returns byte size of the data type.
+ virtual uint32_t GetDataTypeBytes() const = 0;
+
+ // Returns minimum posting list size allowed.
+ //
+ // Note that min posting list size should also be large enough to store a
+ // single PostingListIndex (for posting list management usage), so we have to
+ // add static_assert in each serializer implementation.
+ // E.g.
+ // static constexpr uint32_t kMinPostingListSize = kSpecialHitsSize;
+ // static_assert(sizeof(PostingListIndex) <= kMinPostingListSize, "");
+ virtual uint32_t GetMinPostingListSize() const = 0;
+
+ // Returns minimum size of posting list that can fit these used bytes
+ // (see MoveFrom).
+ virtual uint32_t GetMinPostingListSizeToFit(
+ const PostingListUsed* posting_list_used) const = 0;
+
+ // Returns bytes used by actual data.
+ virtual uint32_t GetBytesUsed(
+ const PostingListUsed* posting_list_used) const = 0;
+
+ // Clears the posting list. It is usually used for initializing a newly
+ // allocated (or reclaimed from free posting list chain) posting list.
+ virtual void Clear(PostingListUsed* posting_list_used) const = 0;
+
+ // Moves contents from posting list 'src' to 'dst'. Clears 'src'.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT if 'src' is not valid or 'src' is too large to fit in
+ // 'dst'.
+ // - FAILED_PRECONDITION if 'dst' posting list is in a corrupted state.
+ virtual libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
+ PostingListUsed* src) const = 0;
+};
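+
+// A hypothetical fixed-width serializer sketch (the class name and data type
+// are assumed for illustration, not part of this file), showing the minimum
+// an implementation must provide:
+//
+//   class Int64Serializer : public PostingListSerializer {
+//    public:
+//     uint32_t GetDataTypeBytes() const override { return sizeof(int64_t); }
+//     uint32_t GetMinPostingListSize() const override {
+//       // Large enough for the two special data slots (and therefore for a
+//       // PostingListIndex, as required above).
+//       return kNumSpecialData * sizeof(SpecialData<int64_t>);
+//     }
+//     ...  // GetMinPostingListSizeToFit, GetBytesUsed, Clear, MoveFrom
+//   };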
+
+// A posting list with in-memory data. The caller should sync it to disk via
+// FlashIndexStorage. Layout depends on the serializer.
+class PostingListUsed {
+ public:
+ // Creates a PostingListUsed that takes over the ownership of
+ // posting_list_buffer with size_in_bytes bytes. 'Preexisting' means that
+ // the data in posting_list_buffer was previously modified by another instance
+ // of PostingListUsed, and the caller should read the data from disk to
+ // posting_list_buffer.
+ //
+ // RETURNS:
+ // - A valid PostingListUsed if successful
+ // - INVALID_ARGUMENT if posting_list_utils::IsValidPostingListSize check
+ // fails
+ // - FAILED_PRECONDITION if serializer or posting_list_buffer is null
+ static libtextclassifier3::StatusOr<PostingListUsed>
+ CreateFromPreexistingPostingListUsedRegion(
+ PostingListSerializer* serializer,
+ std::unique_ptr<uint8_t[]> posting_list_buffer, uint32_t size_in_bytes);
+
+ // Creates a PostingListUsed that owns a buffer of size_in_bytes bytes and
+ // initializes the content of the buffer so that the returned PostingListUsed
+ // is empty.
+ //
+ // RETURNS:
+ // - A valid PostingListUsed if successful
+ // - INVALID_ARGUMENT if posting_list_utils::IsValidPostingListSize check
+ // fails
+ // - FAILED_PRECONDITION if serializer is null
+ static libtextclassifier3::StatusOr<PostingListUsed>
+ CreateFromUnitializedRegion(PostingListSerializer* serializer,
+ uint32_t size_in_bytes);
+
+ uint8_t* posting_list_buffer() {
+ is_dirty_ = true;
+ return posting_list_buffer_.get();
+ }
+
+ const uint8_t* posting_list_buffer() const {
+ return posting_list_buffer_.get();
+ }
+
+ uint32_t size_in_bytes() const { return size_in_bytes_; }
+
+ bool is_dirty() const { return is_dirty_; }
+
+ private:
+ explicit PostingListUsed(std::unique_ptr<uint8_t[]> posting_list_buffer,
+ uint32_t size_in_bytes)
+ : posting_list_buffer_(std::move(posting_list_buffer)),
+ size_in_bytes_(size_in_bytes),
+ is_dirty_(false) {}
+
+ // A byte array of size size_in_bytes_ containing encoded data for this
+ // posting list.
+ std::unique_ptr<uint8_t[]> posting_list_buffer_;
+ uint32_t size_in_bytes_;
+
+ bool is_dirty_;
+};
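+
+// Sketch of typical in-memory use, assuming the hit serializer (see
+// icing/index/main/posting-list-hit-serializer.h) and a 60-byte posting list;
+// any size accepted by posting_list_utils::IsValidPostingListSize works:
+//
+//   PostingListHitSerializer serializer;
+//   ICING_ASSIGN_OR_RETURN(
+//       PostingListUsed pl_used,
+//       PostingListUsed::CreateFromUnitializedRegion(&serializer,
+//                                                    /*size_in_bytes=*/60));
+//   ICING_RETURN_IF_ERROR(serializer.PrependHit(&pl_used, hit));
+//   // ...later, write pl_used.posting_list_buffer() to disk via
+//   // FlashIndexStorage.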
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_USED_H_
diff --git a/icing/index/posting-list-utils.cc b/icing/file/posting_list/posting-list-utils.cc
index b0e2929..2adbc26 100644
--- a/icing/index/posting-list-utils.cc
+++ b/icing/file/posting_list/posting-list-utils.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/index/posting-list-utils.h"
+#include "icing/file/posting_list/posting-list-utils.h"
#include "icing/legacy/index/icing-bit-util.h"
#include "icing/util/logging.h"
@@ -22,27 +22,28 @@ namespace lib {
namespace posting_list_utils {
-bool IsValidPostingListSize(uint32_t size_in_bytes) {
- // size must be sizeof(Hit) aligned. Otherwise, we can have serious
+bool IsValidPostingListSize(uint32_t size_in_bytes, uint32_t data_type_bytes,
+ uint32_t min_posting_list_size) {
+ // size must be data_type_bytes aligned. Otherwise, we can have serious
// wasted space in the worst case.
- if (size_in_bytes % sizeof(Hit) != 0) {
- ICING_LOG(ERROR) << "Size " << size_in_bytes << " hit " << sizeof(Hit);
+ if (size_in_bytes % data_type_bytes != 0) {
+ ICING_LOG(ERROR) << "Size " << size_in_bytes << " data " << data_type_bytes;
return false;
}
// Must be able to store the min information.
- if (size_in_bytes < min_posting_list_size()) {
+ if (size_in_bytes < min_posting_list_size) {
ICING_LOG(ERROR) << "Size " << size_in_bytes << " is less than min size "
- << min_posting_list_size();
+ << min_posting_list_size;
return false;
}
- // We re-use the first two hits as pointers into the posting list
- // so the posting list size must fit in sizeof(Hit).
- if (BitsToStore(size_in_bytes) > sizeof(Hit::Value) * 8) {
+  // We re-use the first two data items as pointers into the posting list,
+  // so the posting list size must fit in data_type_bytes.
+ if (BitsToStore(size_in_bytes) > data_type_bytes * 8) {
ICING_LOG(ERROR)
<< "Posting list size must be small enough to store the offset in "
- << sizeof(Hit::Value) * 8 << " bytes.";
+ << data_type_bytes << " bytes.";
return false;
}
diff --git a/icing/index/posting-list-utils.h b/icing/file/posting_list/posting-list-utils.h
index fc90d64..6a1e28c 100644
--- a/icing/index/posting-list-utils.h
+++ b/icing/file/posting_list/posting-list-utils.h
@@ -12,33 +12,26 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_INDEX_POSTING_LIST_UTILS_H_
-#define ICING_INDEX_POSTING_LIST_UTILS_H_
+#ifndef ICING_FILE_POSTING_LIST_POSTING_LIST_UTILS_H_
+#define ICING_FILE_POSTING_LIST_POSTING_LIST_UTILS_H_
#include <cstdint>
-#include "icing/index/hit/hit.h"
-
namespace icing {
namespace lib {
namespace posting_list_utils {
-// Represents the byte length of the two special hits described
-// in the private section of posting-list-used.h.
-static constexpr uint32_t kSpecialHitsSize = sizeof(Hit) * 2;
-
-constexpr uint32_t min_posting_list_size() { return kSpecialHitsSize; }
-
// For a posting list size to be valid, it must:
-// 1) be sizeof(Hit) aligned
+// 1) be data_type_bytes aligned
// 2) be equal to or larger than min_posting_list_size
-// 3) be small enough to be encoded within a single Hit (5 bytes)
-bool IsValidPostingListSize(uint32_t size_in_bytes);
+// 3) be small enough to be encoded within a single data item (data_type_bytes)
+bool IsValidPostingListSize(uint32_t size_in_bytes, uint32_t data_type_bytes,
+ uint32_t min_posting_list_size);
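+//
+// E.g. for hits (data_type_bytes == sizeof(Hit) == 5 and
+// min_posting_list_size == 10, matching the removed Hit-specific constants):
+// size 60 is valid, 61 violates 1), and 5 violates 2).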
} // namespace posting_list_utils
} // namespace lib
} // namespace icing
-#endif // ICING_INDEX_POSTING_LIST_UTILS_H_
+#endif // ICING_FILE_POSTING_LIST_POSTING_LIST_UTILS_H_
diff --git a/icing/file/version-util.cc b/icing/file/version-util.cc
new file mode 100644
index 0000000..dd233e0
--- /dev/null
+++ b/icing/file/version-util.cc
@@ -0,0 +1,150 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/version-util.h"
+
+#include <cstdint>
+#include <string>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/index.h"
+
+namespace icing {
+namespace lib {
+
+namespace version_util {
+
+libtextclassifier3::StatusOr<VersionInfo> ReadVersion(
+ const Filesystem& filesystem, const std::string& version_file_path,
+ const std::string& index_base_dir) {
+ // 1. Read the version info.
+ VersionInfo existing_version_info(-1, -1);
+ if (filesystem.FileExists(version_file_path.c_str()) &&
+ !filesystem.PRead(version_file_path.c_str(), &existing_version_info,
+ sizeof(VersionInfo), /*offset=*/0)) {
+    return absl_ports::InternalError("Failed to read version");
+ }
+
+ // 2. Check the Index magic to see if we're actually on version 0.
+ libtextclassifier3::StatusOr<int> existing_flash_index_magic_or =
+ Index::ReadFlashIndexMagic(&filesystem, index_base_dir);
+ if (!existing_flash_index_magic_or.ok()) {
+ if (absl_ports::IsNotFound(existing_flash_index_magic_or.status())) {
+ // Flash index magic doesn't exist. In this case, we're unable to
+ // determine the version change state correctly (regardless of the
+ // existence of the version file), so invalidate VersionInfo by setting
+ // version to -1, but still keep the max_version value read in step 1.
+ existing_version_info.version = -1;
+ return existing_version_info;
+ }
+ // Real error.
+ return std::move(existing_flash_index_magic_or).status();
+ }
+ if (existing_flash_index_magic_or.ValueOrDie() ==
+ kVersionZeroFlashIndexMagic) {
+ existing_version_info.version = 0;
+ if (existing_version_info.max_version == -1) {
+ existing_version_info.max_version = 0;
+ }
+ }
+
+ return existing_version_info;
+}
+
+libtextclassifier3::Status WriteVersion(const Filesystem& filesystem,
+ const std::string& version_file_path,
+ const VersionInfo& version_info) {
+ ScopedFd scoped_fd(filesystem.OpenForWrite(version_file_path.c_str()));
+ if (!scoped_fd.is_valid() ||
+ !filesystem.PWrite(scoped_fd.get(), /*offset=*/0, &version_info,
+ sizeof(VersionInfo)) ||
+ !filesystem.DataSync(scoped_fd.get())) {
+    return absl_ports::InternalError("Failed to write version");
+ }
+ return libtextclassifier3::Status::OK;
+}
+
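+// Maps the persisted version info against the current code version. E.g. an
+// existing {version=2, max_version=3} with curr_version == 3 returns
+// kRollForward: the instance had been on v3 before, rolled back to v2, and is
+// now returning to v3.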
+StateChange GetVersionStateChange(const VersionInfo& existing_version_info,
+ int32_t curr_version) {
+ if (!existing_version_info.IsValid()) {
+ return StateChange::kUndetermined;
+ }
+
+ if (existing_version_info.version == 0) {
+ return (existing_version_info.max_version == existing_version_info.version)
+ ? StateChange::kVersionZeroUpgrade
+ : StateChange::kVersionZeroRollForward;
+ }
+
+ if (existing_version_info.version == curr_version) {
+ return StateChange::kCompatible;
+ } else if (existing_version_info.version > curr_version) {
+ return StateChange::kRollBack;
+ } else { // existing_version_info.version < curr_version
+ return (existing_version_info.max_version == existing_version_info.version)
+ ? StateChange::kUpgrade
+ : StateChange::kRollForward;
+ }
+}
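+// Examples (illustrative only, matching version-util_test.cc):
+//   GetVersionStateChange(VersionInfo(1, 1), /*curr_version=*/2)
+//       => StateChange::kUpgrade      (no newer version has ever run)
+//   GetVersionStateChange(VersionInfo(1, 2), /*curr_version=*/2)
+//       => StateChange::kRollForward  (previously rolled back from 2)
+//   GetVersionStateChange(VersionInfo(2, 2), /*curr_version=*/1)
+//       => StateChange::kRollBack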
+
+bool ShouldRebuildDerivedFiles(const VersionInfo& existing_version_info,
+ int32_t curr_version) {
+ StateChange state_change =
+ GetVersionStateChange(existing_version_info, curr_version);
+ switch (state_change) {
+ case StateChange::kCompatible:
+ return false;
+ case StateChange::kUndetermined:
+ [[fallthrough]];
+ case StateChange::kRollBack:
+ [[fallthrough]];
+ case StateChange::kRollForward:
+ [[fallthrough]];
+ case StateChange::kVersionZeroRollForward:
+ [[fallthrough]];
+ case StateChange::kVersionZeroUpgrade:
+ return true;
+ case StateChange::kUpgrade:
+ break;
+ }
+
+ bool should_rebuild = false;
+ int32_t existing_version = existing_version_info.version;
+ while (existing_version < curr_version) {
+ switch (existing_version) {
+ case 1: {
+ // version 1 -> version 2 upgrade, no need to rebuild
+ break;
+ }
+ case 2: {
+ // version 2 -> version 3 upgrade, no need to rebuild
+ break;
+ }
+ default:
+ // This should not happen. Rebuild anyway if unsure.
+ should_rebuild |= true;
+ }
+ ++existing_version;
+ }
+ return should_rebuild;
+}
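+// Example (illustrative): upgrading from version 1 to version 3 walks the
+// encoded path 1 -> 2 -> 3; neither step requires a rebuild, so the function
+// returns false. Any step not covered by the switch above conservatively
+// returns true.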
+
+} // namespace version_util
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/file/version-util.h b/icing/file/version-util.h
new file mode 100644
index 0000000..b2d51df
--- /dev/null
+++ b/icing/file/version-util.h
@@ -0,0 +1,115 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_FILE_VERSION_UTIL_H_
+#define ICING_FILE_VERSION_UTIL_H_
+
+#include <cstdint>
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/filesystem.h"
+
+namespace icing {
+namespace lib {
+
+namespace version_util {
+
+// - Version 0: Android T base. Can be identified only by flash index magic.
+// - Version 1: Android U base and M-2023-08.
+// - Version 2: M-2023-09, M-2023-11, M-2024-01. Schema is compatible with v1.
+// (There were no M-2023-10, M-2023-12).
+// - Version 3: M-2024-02. Schema is compatible with v1 and v2.
+//
+// LINT.IfChange(kVersion)
+inline static constexpr int32_t kVersion = 3;
+// LINT.ThenChange(//depot/google3/icing/schema/schema-store.cc:min_overlay_version_compatibility)
+inline static constexpr int32_t kVersionOne = 1;
+inline static constexpr int32_t kVersionTwo = 2;
+inline static constexpr int32_t kVersionThree = 3;
+
+inline static constexpr int kVersionZeroFlashIndexMagic = 0x6dfba6ae;
+
+struct VersionInfo {
+ int32_t version;
+ int32_t max_version;
+
+ explicit VersionInfo(int32_t version_in, int32_t max_version_in)
+ : version(version_in), max_version(max_version_in) {}
+
+ bool IsValid() const { return version >= 0 && max_version >= 0; }
+
+ bool operator==(const VersionInfo& other) const {
+ return version == other.version && max_version == other.max_version;
+ }
+} __attribute__((packed));
+static_assert(sizeof(VersionInfo) == 8, "");
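+// Note: VersionInfo is read and written as raw bytes (see ReadVersion and
+// WriteVersion), so the struct is packed and pinned to 8 bytes to keep the
+// on-disk layout stable.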
+
+enum class StateChange {
+ kUndetermined,
+ kCompatible,
+ kRollForward,
+ kRollBack,
+ kUpgrade,
+ kVersionZeroUpgrade,
+ kVersionZeroRollForward,
+};
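+// Semantics (as implemented in GetVersionStateChange):
+// - kUndetermined: the existing VersionInfo is invalid.
+// - kCompatible: existing version == current version.
+// - kUpgrade: existing version < current version and no newer version has
+//   ever run (max_version == version).
+// - kRollForward: existing version < current version and a newer version has
+//   run before (max_version > version).
+// - kRollBack: existing version > current version.
+// - kVersionZeroUpgrade / kVersionZeroRollForward: as above, but the existing
+//   data is version 0 (identified by the flash index magic).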
+
+// Helper method to read version info (using version file and flash index header
+// magic) from the existing data. If the state is invalid (e.g. flash index
+// header file is missing), then return an invalid VersionInfo.
+//
+// RETURNS:
+// - Existing data's VersionInfo on success
+// - INTERNAL_ERROR on I/O errors
+libtextclassifier3::StatusOr<VersionInfo> ReadVersion(
+ const Filesystem& filesystem, const std::string& version_file_path,
+ const std::string& index_base_dir);
+
+// Helper method to write version file.
+//
+// RETURNS:
+// - OK on success
+// - INTERNAL_ERROR on I/O errors
+libtextclassifier3::Status WriteVersion(const Filesystem& filesystem,
+ const std::string& version_file_path,
+ const VersionInfo& version_info);
+
+// Helper method to determine the change state between the existing data version
+// and the current code version.
+//
+// REQUIRES: curr_version > 0. Version checking was introduced in version 1,
+// so callers (except unit tests) always pass a version number greater than 0.
+//
+// RETURNS: StateChange
+StateChange GetVersionStateChange(const VersionInfo& existing_version_info,
+ int32_t curr_version = kVersion);
+
+// Helper method to determine whether Icing should rebuild all derived files.
+// Rebuilding derived files is not always required when rolling forward or
+// upgrading. This function "encodes" the known upgrade paths and checks
+// whether the given roll-forward/upgrade requires derived files to be
+// rebuilt.
+//
+// REQUIRES: curr_version > 0. Version checking was introduced in version 1,
+// so callers (except unit tests) always pass a version number greater than 0.
+bool ShouldRebuildDerivedFiles(const VersionInfo& existing_version_info,
+ int32_t curr_version = kVersion);
+
+} // namespace version_util
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_FILE_VERSION_UTIL_H_
diff --git a/icing/file/version-util_test.cc b/icing/file/version-util_test.cc
new file mode 100644
index 0000000..9dedb1d
--- /dev/null
+++ b/icing/file/version-util_test.cc
@@ -0,0 +1,484 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/file/version-util.h"
+
+#include <optional>
+#include <string>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/posting_list/flash-index-storage-header.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+namespace version_util {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+
+struct VersionUtilReadVersionTestParam {
+ std::optional<VersionInfo> existing_version_info;
+ std::optional<int> existing_flash_index_magic;
+ VersionInfo expected_version_info;
+
+ explicit VersionUtilReadVersionTestParam(
+ std::optional<VersionInfo> existing_version_info_in,
+ std::optional<int> existing_flash_index_magic_in,
+ VersionInfo expected_version_info_in)
+ : existing_version_info(std::move(existing_version_info_in)),
+ existing_flash_index_magic(std::move(existing_flash_index_magic_in)),
+ expected_version_info(std::move(expected_version_info_in)) {}
+};
+
+class VersionUtilReadVersionTest
+ : public ::testing::TestWithParam<VersionUtilReadVersionTestParam> {
+ protected:
+ void SetUp() override {
+ base_dir_ = GetTestTempDir() + "/version_util_test";
+ version_file_path_ = base_dir_ + "/version";
+ index_path_ = base_dir_ + "/index";
+
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()));
+ }
+
+ void TearDown() override {
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(base_dir_.c_str()));
+ }
+
+ const Filesystem& filesystem() const { return filesystem_; }
+
+ Filesystem filesystem_;
+ std::string base_dir_;
+ std::string version_file_path_;
+ std::string index_path_;
+};
+
+TEST_P(VersionUtilReadVersionTest, ReadVersion) {
+ const VersionUtilReadVersionTestParam& param = GetParam();
+
+ // Prepare version file and flash index file.
+ if (param.existing_version_info.has_value()) {
+ ICING_ASSERT_OK(WriteVersion(filesystem_, version_file_path_,
+ param.existing_version_info.value()));
+ }
+
+ if (param.existing_flash_index_magic.has_value()) {
+ HeaderBlock header_block(&filesystem_, /*block_size=*/4096);
+ header_block.header()->magic = param.existing_flash_index_magic.value();
+
+ std::string main_index_dir = index_path_ + "/idx/main";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(main_index_dir.c_str()));
+ std::string flash_index_file_path = main_index_dir + "/main_index";
+
+ ScopedFd sfd(filesystem_.OpenForWrite(flash_index_file_path.c_str()));
+ ASSERT_TRUE(sfd.is_valid());
+ ASSERT_TRUE(header_block.Write(sfd.get()));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ VersionInfo version_info,
+ ReadVersion(filesystem_, version_file_path_, index_path_));
+ EXPECT_THAT(version_info, Eq(param.expected_version_info));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ VersionUtilReadVersionTest, VersionUtilReadVersionTest,
+ testing::Values(
+ // - Version file doesn't exist
+ // - Flash index doesn't exist
+ // - Result: version -1, max_version -1 (invalid)
+ VersionUtilReadVersionTestParam(
+ /*existing_version_info_in=*/std::nullopt,
+ /*existing_flash_index_magic_in=*/std::nullopt,
+ /*expected_version_info_in=*/
+ VersionInfo(/*version_in=*/-1, /*max_version=*/-1)),
+
+ // - Version file doesn't exist
+ // - Flash index exists with version 0 magic
+ // - Result: version 0, max_version 0
+ VersionUtilReadVersionTestParam(
+ /*existing_version_info_in=*/std::nullopt,
+ /*existing_flash_index_magic_in=*/
+ std::make_optional<int>(kVersionZeroFlashIndexMagic),
+ /*expected_version_info_in=*/
+ VersionInfo(/*version_in=*/0, /*max_version=*/0)),
+
+ // - Version file doesn't exist
+ // - Flash index exists with non version 0 magic
+ // - Result: version -1, max_version -1 (invalid)
+ VersionUtilReadVersionTestParam(
+ /*existing_version_info_in=*/std::nullopt,
+ /*existing_flash_index_magic_in=*/
+ std::make_optional<int>(kVersionZeroFlashIndexMagic + 1),
+ /*expected_version_info_in=*/
+ VersionInfo(/*version_in=*/-1, /*max_version=*/-1)),
+
+ // - Version file exists
+ // - Flash index doesn't exist
+ // - Result: version -1, max_version 1 (invalid)
+ VersionUtilReadVersionTestParam(
+ /*existing_version_info_in=*/std::make_optional<VersionInfo>(
+ /*version_in=*/1, /*max_version=*/1),
+ /*existing_flash_index_magic_in=*/std::nullopt,
+ /*expected_version_info_in=*/
+ VersionInfo(/*version_in=*/-1, /*max_version=*/1)),
+
+ // - Version file exists: version 1, max_version 1
+ // - Flash index exists with version 0 magic
+ // - Result: version 0, max_version 1
+ VersionUtilReadVersionTestParam(
+ /*existing_version_info_in=*/std::make_optional<VersionInfo>(
+ /*version_in=*/1, /*max_version=*/1),
+ /*existing_flash_index_magic_in=*/
+ std::make_optional<int>(kVersionZeroFlashIndexMagic),
+ /*expected_version_info_in=*/
+ VersionInfo(/*version_in=*/0, /*max_version=*/1)),
+
+ // - Version file exists: version 2, max_version 3
+ // - Flash index exists with version 0 magic
+ // - Result: version 0, max_version 3
+ VersionUtilReadVersionTestParam(
+ /*existing_version_info_in=*/std::make_optional<VersionInfo>(
+ /*version_in=*/2, /*max_version=*/3),
+ /*existing_flash_index_magic_in=*/
+ std::make_optional<int>(kVersionZeroFlashIndexMagic),
+ /*expected_version_info_in=*/
+ VersionInfo(/*version_in=*/0, /*max_version=*/3)),
+
+ // - Version file exists: version 1, max_version 1
+ // - Flash index exists with non version 0 magic
+ // - Result: version 1, max_version 1
+ VersionUtilReadVersionTestParam(
+ /*existing_version_info_in=*/std::make_optional<VersionInfo>(
+ /*version_in=*/1, /*max_version=*/1),
+ /*existing_flash_index_magic_in=*/
+ std::make_optional<int>(kVersionZeroFlashIndexMagic + 1),
+ /*expected_version_info_in=*/
+ VersionInfo(/*version_in=*/1, /*max_version=*/1)),
+
+ // - Version file exists: version 2, max_version 3
+ // - Flash index exists with non version 0 magic
+ // - Result: version 2, max_version 3
+ VersionUtilReadVersionTestParam(
+ /*existing_version_info_in=*/std::make_optional<VersionInfo>(
+ /*version_in=*/2, /*max_version=*/3),
+ /*existing_flash_index_magic_in=*/
+ std::make_optional<int>(kVersionZeroFlashIndexMagic + 1),
+ /*expected_version_info_in=*/
+ VersionInfo(/*version_in=*/2, /*max_version=*/3))));
+
+struct VersionUtilStateChangeTestParam {
+ VersionInfo existing_version_info;
+ int32_t curr_version;
+ StateChange expected_state_change;
+
+ explicit VersionUtilStateChangeTestParam(VersionInfo existing_version_info_in,
+ int32_t curr_version_in,
+ StateChange expected_state_change_in)
+ : existing_version_info(std::move(existing_version_info_in)),
+ curr_version(curr_version_in),
+ expected_state_change(expected_state_change_in) {}
+};
+
+class VersionUtilStateChangeTest
+ : public ::testing::TestWithParam<VersionUtilStateChangeTestParam> {};
+
+TEST_P(VersionUtilStateChangeTest, GetVersionStateChange) {
+ const VersionUtilStateChangeTestParam& param = GetParam();
+
+ EXPECT_THAT(
+ GetVersionStateChange(param.existing_version_info, param.curr_version),
+ Eq(param.expected_state_change));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ VersionUtilStateChangeTest, VersionUtilStateChangeTest,
+ testing::Values(
+ // - version -1, max_version -1 (invalid)
+ // - Current version = 1
+ // - Result: undetermined
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(-1, -1),
+ /*curr_version_in=*/1,
+ /*expected_state_change_in=*/StateChange::kUndetermined),
+
+ // - version -1, max_version 1 (invalid)
+ // - Current version = 1
+ // - Result: undetermined
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(-1, 1),
+ /*curr_version_in=*/1,
+ /*expected_state_change_in=*/StateChange::kUndetermined),
+
+ // - version -1, max_version -1 (invalid)
+ // - Current version = 2
+ // - Result: undetermined
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(-1, -1),
+ /*curr_version_in=*/2,
+ /*expected_state_change_in=*/StateChange::kUndetermined),
+
+ // - version -1, max_version 1 (invalid)
+ // - Current version = 2
+ // - Result: undetermined
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(-1, 1),
+ /*curr_version_in=*/2,
+ /*expected_state_change_in=*/StateChange::kUndetermined),
+
+ // - version 0, max_version 0
+ // - Current version = 1
+ // - Result: version 0 upgrade
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(0, 0),
+ /*curr_version_in=*/1,
+ /*expected_state_change_in=*/StateChange::kVersionZeroUpgrade),
+
+ // - version 0, max_version 1
+ // - Current version = 1
+ // - Result: version 0 roll forward
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(0, 1),
+ /*curr_version_in=*/1,
+ /*expected_state_change_in=*/StateChange::kVersionZeroRollForward),
+
+ // - version 0, max_version 2
+ // - Current version = 1
+ // - Result: version 0 roll forward
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(0, 2),
+ /*curr_version_in=*/1,
+ /*expected_state_change_in=*/StateChange::kVersionZeroRollForward),
+
+ // - version 0, max_version 0
+ // - Current version = 2
+ // - Result: version 0 upgrade
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(0, 0),
+ /*curr_version_in=*/2,
+ /*expected_state_change_in=*/StateChange::kVersionZeroUpgrade),
+
+ // - version 0, max_version 1
+ // - Current version = 2
+ // - Result: version 0 roll forward
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(0, 1),
+ /*curr_version_in=*/2,
+ /*expected_state_change_in=*/StateChange::kVersionZeroRollForward),
+
+ // - version 0, max_version 2
+ // - Current version = 2
+ // - Result: version 0 roll forward
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(0, 2),
+ /*curr_version_in=*/2,
+ /*expected_state_change_in=*/StateChange::kVersionZeroRollForward),
+
+ // - version 1, max_version 1
+ // - Current version = 1
+ // - Result: compatible
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(1, 1),
+ /*curr_version_in=*/1,
+ /*expected_state_change_in=*/StateChange::kCompatible),
+
+ // - version 1, max_version 2
+ // - Current version = 1
+ // - Result: compatible
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(1, 2),
+ /*curr_version_in=*/1,
+ /*expected_state_change_in=*/StateChange::kCompatible),
+
+ // - version 2, max_version 2
+ // - Current version = 1
+ // - Result: roll back
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(2, 2),
+ /*curr_version_in=*/1,
+ /*expected_state_change_in=*/StateChange::kRollBack),
+
+ // - version 2, max_version 3
+ // - Current version = 1
+ // - Result: roll back
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(2, 3),
+ /*curr_version_in=*/1,
+ /*expected_state_change_in=*/StateChange::kRollBack),
+
+ // - version 1, max_version 1
+ // - Current version = 2
+ // - Result: upgrade
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(1, 1),
+ /*curr_version_in=*/2,
+ /*expected_state_change_in=*/StateChange::kUpgrade),
+
+ // - version 1, max_version 2
+ // - Current version = 2
+ // - Result: roll forward
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(1, 2),
+ /*curr_version_in=*/2,
+ /*expected_state_change_in=*/StateChange::kRollForward),
+
+ // - version 1, max_version 2
+ // - Current version = 3
+ // - Result: roll forward
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(1, 2),
+ /*curr_version_in=*/3,
+ /*expected_state_change_in=*/StateChange::kRollForward),
+
+ // - version 1, max_version 3
+ // - Current version = 2
+ // - Result: roll forward
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(1, 3),
+ /*curr_version_in=*/2,
+ /*expected_state_change_in=*/StateChange::kRollForward),
+
+ // - version 2, max_version 2
+ // - Current version = 2
+ // - Result: compatible
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(2, 2),
+ /*curr_version_in=*/2,
+ /*expected_state_change_in=*/StateChange::kCompatible),
+
+ // - version 2, max_version 3
+ // - Current version = 2
+ // - Result: compatible
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(2, 3),
+ /*curr_version_in=*/2,
+ /*expected_state_change_in=*/StateChange::kCompatible),
+
+ // - version 3, max_version 3
+ // - Current version = 2
+ // - Result: roll back
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(3, 3),
+ /*curr_version_in=*/2,
+ /*expected_state_change_in=*/StateChange::kRollBack),
+
+ // - version 3, max_version 4
+ // - Current version = 2
+ // - Result: roll back
+ VersionUtilStateChangeTestParam(
+ /*existing_version_info_in=*/VersionInfo(3, 4),
+ /*curr_version_in=*/2,
+ /*expected_state_change_in=*/StateChange::kRollBack)));
+
+TEST(VersionUtilTest, ShouldRebuildDerivedFilesUndeterminedVersion) {
+ EXPECT_THAT(
+ ShouldRebuildDerivedFiles(VersionInfo(-1, -1), /*curr_version=*/1),
+ IsTrue());
+ EXPECT_THAT(
+ ShouldRebuildDerivedFiles(VersionInfo(-1, -1), /*curr_version=*/2),
+ IsTrue());
+}
+
+TEST(VersionUtilTest, ShouldRebuildDerivedFilesVersionZeroUpgrade) {
+ // 0 -> 1
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 0), /*curr_version=*/1),
+ IsTrue());
+
+ // 0 -> 2
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 0), /*curr_version=*/2),
+ IsTrue());
+}
+
+TEST(VersionUtilTest, ShouldRebuildDerivedFilesVersionZeroRollForward) {
+ // (1 -> 0), 0 -> 1
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 1), /*curr_version=*/1),
+ IsTrue());
+
+ // (1 -> 0), 0 -> 2
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 1), /*curr_version=*/2),
+ IsTrue());
+
+ // (2 -> 0), 0 -> 1
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(0, 2), /*curr_version=*/1),
+ IsTrue());
+}
+
+TEST(VersionUtilTest, ShouldRebuildDerivedFilesRollBack) {
+ // 2 -> 1
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(2, 2), /*curr_version=*/1),
+ IsTrue());
+
+ // 3 -> 1
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(3, 3), /*curr_version=*/1),
+ IsTrue());
+
+ // (3 -> 2), 2 -> 1
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(2, 3), /*curr_version=*/1),
+ IsTrue());
+}
+
+TEST(VersionUtilTest, ShouldRebuildDerivedFilesRollForward) {
+ // (2 -> 1), 1 -> 2
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(1, 2), /*curr_version=*/2),
+ IsTrue());
+
+ // (2 -> 1), 1 -> 3
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(1, 2), /*curr_version=*/3),
+ IsTrue());
+
+ // (3 -> 1), 1 -> 2
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(1, 3), /*curr_version=*/2),
+ IsTrue());
+}
+
+TEST(VersionUtilTest, ShouldRebuildDerivedFilesCompatible) {
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(2, 2), /*curr_version=*/2),
+ IsFalse());
+
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(2, 3), /*curr_version=*/2),
+ IsFalse());
+}
+
+TEST(VersionUtilTest, Upgrade) {
+ // Unlike other state changes, upgrade depends on the actual "encoded path".
+
+ // kVersionOne -> kVersionTwo
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(kVersionOne, kVersionOne),
+ /*curr_version=*/kVersionTwo),
+ IsFalse());
+
+ // kVersionTwo -> kVersionThree
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(kVersionTwo, kVersionTwo),
+ /*curr_version=*/kVersionThree),
+ IsFalse());
+
+ // kVersionOne -> kVersionThree.
+ EXPECT_THAT(ShouldRebuildDerivedFiles(VersionInfo(kVersionOne, kVersionOne),
+ /*curr_version=*/kVersionThree),
+ IsFalse());
+}
+
+} // namespace
+
+} // namespace version_util
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc b/icing/icing-search-engine-test-jni-layer.cc
index 8392363..6acc99b 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni.cc
+++ b/icing/icing-search-engine-test-jni-layer.cc
@@ -21,12 +21,11 @@
JNIEnv* g_jenv = nullptr;
extern "C" JNIEXPORT jboolean JNICALL
-Java_icing_tokenization_reverse_1jni_ReverseJniLanguageSegmenterTest_testsMain(
- JNIEnv* env, jclass ignored) {
+Java_icing_jni_IcingSearchEngineJniTest_testsMain(JNIEnv* env, jclass ignored) {
g_jenv = env;
std::vector<char*> my_argv;
- char arg[] = "reverse-jni-language-segmenter-test-lib";
+ char arg[] = "jni-test-lib";
my_argv.push_back(arg);
int argc = 1;
char** argv = &(my_argv[0]);
diff --git a/icing/icing-search-engine-with-icu-file_test.cc b/icing/icing-search-engine-with-icu-file_test.cc
index 32ac9e6..39f9df0 100644
--- a/icing/icing-search-engine-with-icu-file_test.cc
+++ b/icing/icing-search-engine-with-icu-file_test.cc
@@ -27,11 +27,14 @@
#include "icing/proto/search.pb.h"
#include "icing/proto/status.pb.h"
#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
namespace icing {
namespace lib {
namespace {
+
using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::Eq;
@@ -54,22 +57,6 @@ DocumentProto CreateMessageDocument(std::string name_space, std::string uri) {
.Build();
}
-SchemaProto CreateMessageSchema() {
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("Message");
-
- auto body = type->add_properties();
- body->set_property_name("body");
- body->set_data_type(PropertyConfigProto::DataType::STRING);
- body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- return schema;
-}
-
ScoringSpecProto GetDefaultScoringSpec() {
ScoringSpecProto scoring_spec;
scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
@@ -79,15 +66,31 @@ ScoringSpecProto GetDefaultScoringSpec() {
TEST(IcingSearchEngineWithIcuFileTest, ShouldInitialize) {
IcingSearchEngine icing(GetDefaultIcingOptions());
EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
}
TEST(IcingSearchEngineWithIcuFileTest, ShouldIndexAndSearch) {
IcingSearchEngine icing(GetDefaultIcingOptions());
ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
DocumentProto document_one = CreateMessageDocument("namespace", "uri1");
ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK));
@@ -113,7 +116,8 @@ TEST(IcingSearchEngineWithIcuFileTest, ShouldIndexAndSearch) {
// The token is a random number so we don't verify it.
expected_search_result_proto.set_next_page_token(
search_result_proto.next_page_token());
- EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
}
} // namespace
diff --git a/icing/icing-search-engine.cc b/icing/icing-search-engine.cc
index 75ccc41..72be4e9 100644
--- a/icing/icing-search-engine.cc
+++ b/icing/icing-search-engine.cc
@@ -18,6 +18,8 @@
#include <memory>
#include <string>
#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
#include <utility>
#include <vector>
@@ -27,27 +29,57 @@
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/mutex.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/file/destructible-file.h"
+#include "icing/file/file-backed-proto.h"
#include "icing/file/filesystem.h"
+#include "icing/file/version-util.h"
+#include "icing/index/data-indexing-handler.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index-processor.h"
#include "icing/index/index.h"
+#include "icing/index/integer-section-indexing-handler.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/integer-index.h"
+#include "icing/index/term-indexing-handler.h"
+#include "icing/join/join-processor.h"
+#include "icing/join/qualified-id-join-index-impl-v1.h"
+#include "icing/join/qualified-id-join-index-impl-v2.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/join/qualified-id-join-indexing-handler.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/portable/endian.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
+#include "icing/proto/internal/optimize.pb.h"
+#include "icing/proto/logging.pb.h"
#include "icing/proto/optimize.pb.h"
#include "icing/proto/persist.pb.h"
#include "icing/proto/reset.pb.h"
#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/status.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/query/advanced_query_parser/lexer.h"
+#include "icing/query/query-features.h"
#include "icing/query/query-processor.h"
-#include "icing/result/result-retriever.h"
+#include "icing/query/query-results.h"
+#include "icing/query/suggestion-processor.h"
+#include "icing/result/page-result.h"
+#include "icing/result/projection-tree.h"
+#include "icing/result/projector.h"
+#include "icing/result/result-adjustment-info.h"
+#include "icing/result/result-retriever-v2.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/schema-util.h"
#include "icing/schema/section.h"
-#include "icing/scoring/ranker.h"
+#include "icing/scoring/advanced_scoring/score-expression.h"
+#include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
#include "icing/scoring/scored-document-hit.h"
+#include "icing/scoring/scored-document-hits-ranker.h"
#include "icing/scoring/scoring-processor.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
@@ -57,38 +89,97 @@
#include "icing/transform/normalizer.h"
#include "icing/util/clock.h"
#include "icing/util/crc32.h"
+#include "icing/util/data-loss.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
namespace {
+constexpr std::string_view kVersionFilename = "version";
constexpr std::string_view kDocumentSubfolderName = "document_dir";
constexpr std::string_view kIndexSubfolderName = "index_dir";
+constexpr std::string_view kIntegerIndexSubfolderName = "integer_index_dir";
+constexpr std::string_view kQualifiedIdJoinIndexSubfolderName =
+ "qualified_id_join_index_dir";
constexpr std::string_view kSchemaSubfolderName = "schema_dir";
-constexpr std::string_view kIcingSearchEngineHeaderFilename =
- "icing_search_engine_header";
-
-libtextclassifier3::Status ValidateOptions(
- const IcingSearchEngineOptions& options) {
- // These options are only used in IndexProcessor, which won't be created
- // until the first Put call. So they must be checked here, so that any
- // errors can be surfaced in Initialize.
- if (options.max_tokens_per_doc() <= 0) {
- return absl_ports::InvalidArgumentError(
- "Options::max_tokens_per_doc must be greater than zero.");
+constexpr std::string_view kSetSchemaMarkerFilename = "set_schema_marker";
+constexpr std::string_view kInitMarkerFilename = "init_marker";
+constexpr std::string_view kOptimizeStatusFilename = "optimize_status";
+
+// The maximum number of unsuccessful initialization attempts from the current
+// state that we will tolerate before deleting all data and starting from a
+// fresh state.
+constexpr int kMaxUnsuccessfulInitAttempts = 5;
+
+// A pair that holds namespace and type.
+struct NamespaceTypePair {
+ std::string namespace_;
+ std::string type;
+
+ bool operator==(const NamespaceTypePair& other) const {
+ return namespace_ == other.namespace_ && type == other.type;
}
- return libtextclassifier3::Status::OK;
-}
+};
+
+struct NamespaceTypePairHasher {
+ std::size_t operator()(const NamespaceTypePair& pair) const {
+ return std::hash<std::string>()(pair.namespace_) ^
+ std::hash<std::string>()(pair.type);
+ }
+};
libtextclassifier3::Status ValidateResultSpec(
- const ResultSpecProto& result_spec) {
+ const DocumentStore* document_store, const ResultSpecProto& result_spec) {
if (result_spec.num_per_page() < 0) {
return absl_ports::InvalidArgumentError(
"ResultSpecProto.num_per_page cannot be negative.");
}
+ if (result_spec.num_total_bytes_per_page_threshold() <= 0) {
+ return absl_ports::InvalidArgumentError(
+ "ResultSpecProto.num_total_bytes_per_page_threshold cannot be "
+ "non-positive.");
+ }
+ if (result_spec.max_joined_children_per_parent_to_return() < 0) {
+ return absl_ports::InvalidArgumentError(
+ "ResultSpecProto.max_joined_children_per_parent_to_return cannot be "
+ "negative.");
+ }
+ if (result_spec.num_to_score() <= 0) {
+ return absl_ports::InvalidArgumentError(
+ "ResultSpecProto.num_to_score cannot be non-positive.");
+ }
+ // Validate ResultGroupings.
+ std::unordered_set<int32_t> unique_entry_ids;
+ ResultSpecProto::ResultGroupingType result_grouping_type =
+ result_spec.result_group_type();
+ for (const ResultSpecProto::ResultGrouping& result_grouping :
+ result_spec.result_groupings()) {
+ if (result_grouping.max_results() <= 0) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot specify a result grouping with max results <= 0.");
+ }
+ for (const ResultSpecProto::ResultGrouping::Entry& entry :
+ result_grouping.entry_groupings()) {
+ const std::string& name_space = entry.namespace_();
+ const std::string& schema = entry.schema();
+ auto entry_id_or = document_store->GetResultGroupingEntryId(
+ result_grouping_type, name_space, schema);
+ if (!entry_id_or.ok()) {
+ continue;
+ }
+ int32_t entry_id = entry_id_or.ValueOrDie();
+ if (unique_entry_ids.find(entry_id) != unique_entry_ids.end()) {
+ return absl_ports::InvalidArgumentError(
+ "Entry Ids must be unique across result groups.");
+ }
+ unique_entry_ids.insert(entry_id);
+ }
+ }
return libtextclassifier3::Status::OK;
}
@@ -101,21 +192,71 @@ libtextclassifier3::Status ValidateSearchSpec(
"allowed query length: ",
std::to_string(configuration.max_query_length)));
}
+ // Check that no unknown features have been enabled in the search spec.
+ std::unordered_set<Feature> query_features_set = GetQueryFeaturesSet();
+ for (const Feature feature : search_spec.enabled_features()) {
+ if (query_features_set.find(feature) == query_features_set.end()) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("Unknown feature in "
+ "SearchSpecProto.enabled_features: ",
+ feature));
+ }
+ }
return libtextclassifier3::Status::OK;
}
-IndexProcessor::Options CreateIndexProcessorOptions(
- const IcingSearchEngineOptions& options) {
- IndexProcessor::Options index_processor_options;
- index_processor_options.max_tokens_per_document =
- options.max_tokens_per_doc();
- index_processor_options.token_limit_behavior =
- IndexProcessor::Options::TokenLimitBehavior::kSuppressError;
- return index_processor_options;
+libtextclassifier3::Status ValidateSuggestionSpec(
+ const SuggestionSpecProto& suggestion_spec,
+ const PerformanceConfiguration& configuration) {
+ if (suggestion_spec.prefix().empty()) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("SuggestionSpecProto.prefix is empty!"));
+ }
+ if (suggestion_spec.scoring_spec().scoring_match_type() ==
+ TermMatchType::UNKNOWN) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("SuggestionSpecProto.term_match_type is unknown!"));
+ }
+ if (suggestion_spec.num_to_return() <= 0) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "SuggestionSpecProto.num_to_return must be positive."));
+ }
+ if (suggestion_spec.prefix().size() > configuration.max_query_length) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("SuggestionSpecProto.prefix is longer than the "
+ "maximum allowed prefix length: ",
+ std::to_string(configuration.max_query_length)));
+ }
+ return libtextclassifier3::Status::OK;
}
-std::string MakeHeaderFilename(const std::string& base_dir) {
- return absl_ports::StrCat(base_dir, "/", kIcingSearchEngineHeaderFilename);
+bool IsV2QualifiedIdJoinIndexEnabled(const IcingSearchEngineOptions& options) {
+ return options.use_new_qualified_id_join_index() &&
+ options.document_store_namespace_id_fingerprint();
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>>
+CreateQualifiedIdJoinIndex(const Filesystem& filesystem,
+ std::string qualified_id_join_index_dir,
+ const IcingSearchEngineOptions& options) {
+ if (IsV2QualifiedIdJoinIndexEnabled(options)) {
+ // V2
+ return QualifiedIdJoinIndexImplV2::Create(
+ filesystem, std::move(qualified_id_join_index_dir),
+ options.pre_mapping_fbv());
+ } else {
+ // V1
+ // TODO(b/275121148): deprecate this part after rollout v2.
+ return QualifiedIdJoinIndexImplV1::Create(
+ filesystem, std::move(qualified_id_join_index_dir),
+ options.pre_mapping_fbv(), options.use_persistent_hash_map());
+ }
+}
+
+// Version file is a single file under base_dir containing version info of the
+// existing data.
+std::string MakeVersionFilePath(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", kVersionFilename);
}
// Document store files are in a standalone subfolder for easier file
@@ -139,6 +280,22 @@ std::string MakeIndexDirectoryPath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kIndexSubfolderName);
}
+// Working path for integer index. Integer index is derived from
+// PersistentStorage and it will take full ownership of this working path,
+// including creation/deletion. See PersistentStorage for more details about
+// working path.
+std::string MakeIntegerIndexWorkingPath(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", kIntegerIndexSubfolderName);
+}
+
+// Working path for qualified id join index. It is derived from
+// PersistentStorage and it will take full ownership of this working path,
+// including creation/deletion. See PersistentStorage for more details about
+// working path.
+std::string MakeQualifiedIdJoinIndexWorkingPath(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", kQualifiedIdJoinIndexSubfolderName);
+}
+
// SchemaStore files are in a standalone subfolder for easier file management.
// We can delete and recreate the subfolder and not touch/affect anything
// else.
@@ -146,32 +303,45 @@ std::string MakeSchemaDirectoryPath(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kSchemaSubfolderName);
}
+std::string MakeSetSchemaMarkerFilePath(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", kSetSchemaMarkerFilename);
+}
+
+std::string MakeInitMarkerFilePath(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", kInitMarkerFilename);
+}
+
void TransformStatus(const libtextclassifier3::Status& internal_status,
StatusProto* status_proto) {
+ StatusProto::Code code;
+ if (!internal_status.ok()) {
+ ICING_LOG(WARNING) << "Error: " << internal_status.error_code()
+ << ", Message: " << internal_status.error_message();
+ }
switch (internal_status.CanonicalCode()) {
case libtextclassifier3::StatusCode::OK:
- status_proto->set_code(StatusProto::OK);
+ code = StatusProto::OK;
break;
case libtextclassifier3::StatusCode::DATA_LOSS:
- status_proto->set_code(StatusProto::WARNING_DATA_LOSS);
+ code = StatusProto::WARNING_DATA_LOSS;
break;
case libtextclassifier3::StatusCode::INVALID_ARGUMENT:
- status_proto->set_code(StatusProto::INVALID_ARGUMENT);
+ code = StatusProto::INVALID_ARGUMENT;
break;
case libtextclassifier3::StatusCode::NOT_FOUND:
- status_proto->set_code(StatusProto::NOT_FOUND);
+ code = StatusProto::NOT_FOUND;
break;
case libtextclassifier3::StatusCode::FAILED_PRECONDITION:
- status_proto->set_code(StatusProto::FAILED_PRECONDITION);
+ code = StatusProto::FAILED_PRECONDITION;
break;
case libtextclassifier3::StatusCode::ABORTED:
- status_proto->set_code(StatusProto::ABORTED);
+ code = StatusProto::ABORTED;
break;
case libtextclassifier3::StatusCode::INTERNAL:
// TODO(b/147699081): Cleanup our internal use of INTERNAL since it
// doesn't match with what it *should* indicate as described in
// go/icing-library-apis.
- status_proto->set_code(StatusProto::INTERNAL);
+ code = StatusProto::INTERNAL;
break;
case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED:
// TODO(b/147699081): Note that we don't detect all cases of OUT_OF_SPACE
@@ -179,43 +349,117 @@ void TransformStatus(const libtextclassifier3::Status& internal_status,
// internally to indicate other resources are exhausted (e.g.
// DocHitInfos) - although none of these are exposed through the API.
// Consider separating the two cases out more clearly.
- status_proto->set_code(StatusProto::OUT_OF_SPACE);
+ code = StatusProto::OUT_OF_SPACE;
break;
- default:
+ case libtextclassifier3::StatusCode::ALREADY_EXISTS:
+ code = StatusProto::ALREADY_EXISTS;
+ break;
+ case libtextclassifier3::StatusCode::CANCELLED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNKNOWN:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::PERMISSION_DENIED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::OUT_OF_RANGE:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNIMPLEMENTED:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNAVAILABLE:
+ [[fallthrough]];
+ case libtextclassifier3::StatusCode::UNAUTHENTICATED:
// Other internal status codes aren't supported externally yet. If it
// should be supported, add another switch-case above.
- ICING_LOG(FATAL)
- << "Internal status code not supported in the external API";
+ ICING_LOG(ERROR) << "Internal status code "
+ << internal_status.error_code()
+ << " not supported in the external API";
+ code = StatusProto::UNKNOWN;
break;
}
-
+ status_proto->set_code(code);
status_proto->set_message(internal_status.error_message());
}
+libtextclassifier3::Status RetrieveAndAddDocumentInfo(
+ const DocumentStore* document_store, DeleteByQueryResultProto& result_proto,
+ std::unordered_map<NamespaceTypePair,
+ DeleteByQueryResultProto::DocumentGroupInfo*,
+ NamespaceTypePairHasher>& info_map,
+ DocumentId document_id) {
+ ICING_ASSIGN_OR_RETURN(DocumentProto document,
+ document_store->Get(document_id));
+ NamespaceTypePair key = {document.namespace_(), document.schema()};
+ auto iter = info_map.find(key);
+ if (iter == info_map.end()) {
+ auto entry = result_proto.add_deleted_documents();
+ entry->set_namespace_(std::move(document.namespace_()));
+ entry->set_schema(std::move(document.schema()));
+ entry->add_uris(std::move(document.uri()));
+ info_map[key] = entry;
+ } else {
+ iter->second->add_uris(std::move(document.uri()));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+bool ShouldRebuildIndex(const OptimizeStatsProto& optimize_stats,
+ float optimize_rebuild_index_threshold) {
+ int num_invalid_documents = optimize_stats.num_deleted_documents() +
+ optimize_stats.num_expired_documents();
+ return num_invalid_documents >= optimize_stats.num_original_documents() *
+ optimize_rebuild_index_threshold;
+}
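+// Example (illustrative): with 100 original documents, 10 deleted and 5
+// expired, and optimize_rebuild_index_threshold = 0.1, 15 >= 100 * 0.1
+// holds, so ShouldRebuildIndex returns true.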
+
+// Helper method to get the RankingStrategy when advanced scoring is enabled.
+// When the "RelevanceScore" function is used in the advanced scoring
+// expression, RankingStrategy is treated as RELEVANCE_SCORE in order to
+// prepare the information needed for calculating relevance scores.
+libtextclassifier3::StatusOr<ScoringSpecProto::RankingStrategy::Code>
+GetRankingStrategyFromScoringSpec(const ScoringSpecProto& scoring_spec) {
+ if (scoring_spec.advanced_scoring_expression().empty()) {
+ return scoring_spec.rank_by();
+ }
+ // TODO(b/261474063) The Lexer will be called again when creating the
+ // AdvancedScorer instance. Consider refactoring the code to allow the Lexer
+ // to be called only once.
+ Lexer lexer(scoring_spec.advanced_scoring_expression(),
+ Lexer::Language::SCORING);
+ ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ for (const Lexer::LexerToken& token : lexer_tokens) {
+ if (token.type == Lexer::TokenType::FUNCTION_NAME &&
+ token.text == RelevanceScoreFunctionScoreExpression::kFunctionName) {
+ return ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
+ }
+ }
+ return ScoringSpecProto::RankingStrategy::NONE;
+}
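+// Example (illustrative): an advanced scoring expression that calls the
+// relevance-score function (RelevanceScoreFunctionScoreExpression::
+// kFunctionName) resolves to RELEVANCE_SCORE, so the information needed to
+// calculate relevance scores is prepared; any other non-empty expression
+// resolves to NONE, since ranking is then driven entirely by the expression.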
+
} // namespace
IcingSearchEngine::IcingSearchEngine(const IcingSearchEngineOptions& options,
std::unique_ptr<const JniCache> jni_cache)
: IcingSearchEngine(options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
std::make_unique<Clock>(), std::move(jni_cache)) {}
IcingSearchEngine::IcingSearchEngine(
IcingSearchEngineOptions options,
- std::unique_ptr<const Filesystem> filesystem, std::unique_ptr<Clock> clock,
- std::unique_ptr<const JniCache> jni_cache)
+ std::unique_ptr<const Filesystem> filesystem,
+ std::unique_ptr<const IcingFilesystem> icing_filesystem,
+ std::unique_ptr<Clock> clock, std::unique_ptr<const JniCache> jni_cache)
: options_(std::move(options)),
filesystem_(std::move(filesystem)),
- icing_filesystem_(std::make_unique<IcingFilesystem>()),
+ icing_filesystem_(std::move(icing_filesystem)),
clock_(std::move(clock)),
- result_state_manager_(performance_configuration_.max_num_hits_per_query,
- performance_configuration_.max_num_cache_results),
jni_cache_(std::move(jni_cache)) {
ICING_VLOG(1) << "Creating IcingSearchEngine in dir: " << options_.base_dir();
}
IcingSearchEngine::~IcingSearchEngine() {
if (initialized_) {
- if (PersistToDisk().status().code() != StatusProto::OK) {
+ if (PersistToDisk(PersistType::FULL).status().code() != StatusProto::OK) {
ICING_LOG(ERROR)
<< "Error persisting to disk in IcingSearchEngine destructor";
}
@@ -230,49 +474,164 @@ InitializeResultProto IcingSearchEngine::Initialize() {
return InternalInitialize();
}
+void IcingSearchEngine::ResetMembers() {
+ schema_store_.reset();
+ document_store_.reset();
+ language_segmenter_.reset();
+ normalizer_.reset();
+ index_.reset();
+ integer_index_.reset();
+ qualified_id_join_index_.reset();
+}
+
+libtextclassifier3::Status IcingSearchEngine::CheckInitMarkerFile(
+ InitializeStatsProto* initialize_stats) {
+ // Check to see if the marker file exists and if we've already passed our max
+ // number of init attempts.
+ std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
+ bool file_exists = filesystem_->FileExists(marker_filepath.c_str());
+ int network_init_attempts = 0;
+ int host_init_attempts = 0;
+
+ // Read the number of previous failed init attempts from the file. If the
+ // read fails, just assume the value is zero (the most likely reason for
+ // failure is that the file doesn't exist because the last init was
+ // successful).
+ std::unique_ptr<ScopedFd> marker_file_fd = std::make_unique<ScopedFd>(
+ filesystem_->OpenForWrite(marker_filepath.c_str()));
+ libtextclassifier3::Status status;
+ if (file_exists &&
+ filesystem_->PRead(marker_file_fd->get(), &network_init_attempts,
+ sizeof(network_init_attempts), /*offset=*/0)) {
+ host_init_attempts = GNetworkToHostL(network_init_attempts);
+ if (host_init_attempts > kMaxUnsuccessfulInitAttempts) {
+ // We've tried and failed to init too many times. We need to throw
+ // everything out and start from scratch.
+ ResetMembers();
+ marker_file_fd.reset();
+
+ // Delete the entire base directory.
+ if (!filesystem_->DeleteDirectoryRecursively(
+ options_.base_dir().c_str())) {
+ return absl_ports::InternalError("Failed to delete icing base dir!");
+ }
+
+ // Create the base directory again and reopen marker file.
+ if (!filesystem_->CreateDirectoryRecursively(
+ options_.base_dir().c_str())) {
+ return absl_ports::InternalError("Failed to create icing base dir!");
+ }
+
+ marker_file_fd = std::make_unique<ScopedFd>(
+ filesystem_->OpenForWrite(marker_filepath.c_str()));
+
+ status = absl_ports::DataLossError(
+ "Encountered failed initialization limit. Cleared all data.");
+ host_init_attempts = 0;
+ }
+ }
+
+ // Use network_init_attempts here because we might have set host_init_attempts
+ // to 0 if it exceeded the max threshold.
+ initialize_stats->set_num_previous_init_failures(
+ GNetworkToHostL(network_init_attempts));
+
+ ++host_init_attempts;
+ network_init_attempts = GHostToNetworkL(host_init_attempts);
+ // Write the updated number of attempts before we get started.
+ if (!filesystem_->PWrite(marker_file_fd->get(), /*offset=*/0,
+ &network_init_attempts,
+ sizeof(network_init_attempts)) ||
+ !filesystem_->DataSync(marker_file_fd->get())) {
+ return absl_ports::InternalError(
+ "Failed to write and sync init marker file");
+ }
+
+ return status;
+}
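+// Marker-file lifecycle note: the attempt counter is stored in network byte
+// order, incremented and synced before initialization proceeds, and the file
+// is deleted after a successful init (see InternalInitialize); a marker file
+// that survives therefore counts consecutive failed attempts.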
+
InitializeResultProto IcingSearchEngine::InternalInitialize() {
ICING_VLOG(1) << "Initializing IcingSearchEngine in dir: "
<< options_.base_dir();
+ // Measure the latency of the initialization process.
+ std::unique_ptr<Timer> initialize_timer = clock_->GetNewTimer();
+
InitializeResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
+ InitializeStatsProto* initialize_stats =
+ result_proto.mutable_initialize_stats();
if (initialized_) {
// Already initialized.
result_status->set_code(StatusProto::OK);
+ initialize_stats->set_latency_ms(
+ initialize_timer->GetElapsedMilliseconds());
+ initialize_stats->set_num_documents(document_store_->num_documents());
return result_proto;
}
- // Releases result / query cache if any
- result_state_manager_.InvalidateAllResultStates();
+ // Now go ahead and try to initialize.
+ libtextclassifier3::Status status = InitializeMembers(initialize_stats);
+ if (status.ok() || absl_ports::IsDataLoss(status)) {
+ // We successfully initialized. We should delete the init marker file to
+ // indicate a successful init.
+ std::string marker_filepath = MakeInitMarkerFilePath(options_.base_dir());
+ if (!filesystem_->DeleteFile(marker_filepath.c_str())) {
+ status = absl_ports::InternalError("Failed to delete init marker file!");
+ } else {
+ initialized_ = true;
+ }
+ }
+ TransformStatus(status, result_status);
+ initialize_stats->set_latency_ms(initialize_timer->GetElapsedMilliseconds());
+ return result_proto;
+}
+
+libtextclassifier3::Status IcingSearchEngine::InitializeMembers(
+ InitializeStatsProto* initialize_stats) {
+ ICING_RETURN_ERROR_IF_NULL(initialize_stats);
- libtextclassifier3::Status status = InitializeMembers();
- if (!status.ok()) {
- TransformStatus(status, result_status);
- return result_proto;
+ // Make sure the base directory exists
+ if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Could not create directory: ", options_.base_dir()));
}
- // Even if each subcomponent initialized fine independently, we need to
- // check if they're consistent with each other.
- if (!CheckConsistency().ok()) {
- ICING_VLOG(1)
- << "IcingSearchEngine in inconsistent state, regenerating all "
- "derived data";
- status = RegenerateDerivedFiles();
- if (!status.ok()) {
- TransformStatus(status, result_status);
- return result_proto;
- }
+ // Check to see if the marker file exists and if we've already passed our max
+ // number of init attempts.
+ libtextclassifier3::Status status = CheckInitMarkerFile(initialize_stats);
+ if (!status.ok() && !absl_ports::IsDataLoss(status)) {
+ return status;
}
- initialized_ = true;
- result_status->set_code(StatusProto::OK);
- return result_proto;
-}
+ // Read version file and determine the state change.
+ const std::string version_filepath = MakeVersionFilePath(options_.base_dir());
+ const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
+ ICING_ASSIGN_OR_RETURN(
+ version_util::VersionInfo version_info,
+ version_util::ReadVersion(*filesystem_, version_filepath, index_dir));
+ version_util::StateChange version_state_change =
+ version_util::GetVersionStateChange(version_info);
+ if (version_state_change != version_util::StateChange::kCompatible) {
+ // Step 1: migrate schema according to the version state change.
+ ICING_RETURN_IF_ERROR(SchemaStore::MigrateSchema(
+ filesystem_.get(), MakeSchemaDirectoryPath(options_.base_dir()),
+ version_state_change, version_util::kVersion));
+
+ // Step 2: discard all derived data if needed rebuild.
+ if (version_util::ShouldRebuildDerivedFiles(version_info)) {
+ ICING_RETURN_IF_ERROR(DiscardDerivedFiles());
+ }
+
+ // Step 3: update version file
+ version_util::VersionInfo new_version_info(
+ version_util::kVersion,
+ std::max(version_info.max_version, version_util::kVersion));
+ ICING_RETURN_IF_ERROR(version_util::WriteVersion(
+ *filesystem_, version_filepath, new_version_info));
+ }
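+ // Note: max_version is monotonically non-decreasing; it records the newest
+ // version that has ever touched this data, which lets a later init
+ // distinguish a roll-forward from a plain upgrade.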
-libtextclassifier3::Status IcingSearchEngine::InitializeMembers() {
- ICING_RETURN_IF_ERROR(InitializeOptions());
- ICING_RETURN_IF_ERROR(InitializeSchemaStore());
- ICING_RETURN_IF_ERROR(InitializeDocumentStore());
+ ICING_RETURN_IF_ERROR(InitializeSchemaStore(initialize_stats));
// TODO(b/156383798) : Resolve how to specify the locale.
language_segmenter_factory::SegmenterOptions segmenter_options(
@@ -283,24 +642,156 @@ libtextclassifier3::Status IcingSearchEngine::InitializeMembers() {
TC3_ASSIGN_OR_RETURN(normalizer_,
normalizer_factory::Create(options_.max_token_length()));
- ICING_RETURN_IF_ERROR(InitializeIndex());
+ std::string marker_filepath =
+ MakeSetSchemaMarkerFilePath(options_.base_dir());
+
+ libtextclassifier3::Status index_init_status;
+ if (absl_ports::IsNotFound(schema_store_->GetSchema().status())) {
+ // The schema was either lost or never set before. Wipe out the doc store
+ // and index directories and initialize them from scratch.
+ const std::string doc_store_dir =
+ MakeDocumentDirectoryPath(options_.base_dir());
+ const std::string integer_index_dir =
+ MakeIntegerIndexWorkingPath(options_.base_dir());
+ const std::string qualified_id_join_index_dir =
+ MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
+ if (!filesystem_->DeleteDirectoryRecursively(doc_store_dir.c_str()) ||
+ !filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
+ !IntegerIndex::Discard(*filesystem_, integer_index_dir).ok() ||
+ !QualifiedIdJoinIndex::Discard(*filesystem_,
+ qualified_id_join_index_dir)
+ .ok()) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Could not delete directories: ", index_dir, ", ", integer_index_dir,
+ ", ", qualified_id_join_index_dir, " and ", doc_store_dir));
+ }
+ ICING_ASSIGN_OR_RETURN(
+ bool document_store_derived_files_regenerated,
+ InitializeDocumentStore(
+ /*force_recovery_and_revalidate_documents=*/false,
+ initialize_stats));
+ index_init_status = InitializeIndex(
+ document_store_derived_files_regenerated, initialize_stats);
+ if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
+ return index_init_status;
+ }
+ } else if (filesystem_->FileExists(marker_filepath.c_str())) {
+ // If the marker file is still around then something wonky happened when we
+ // last tried to set the schema.
+ //
+ // Since we're going to rebuild all indices in this case, the return value
+ // of InitializeDocumentStore (document_store_derived_files_regenerated) is
+ // unused.
+ ICING_RETURN_IF_ERROR(InitializeDocumentStore(
+ /*force_recovery_and_revalidate_documents=*/true, initialize_stats));
+
+ // We're going to need to rebuild the index from scratch, so discard the
+ // index directory and instantiate a new one.
+ Index::Options index_options(
+ index_dir, options_.index_merge_size(),
+ options_.lite_index_sort_at_indexing(), options_.lite_index_sort_size(),
+ options_.build_property_existence_metadata_hits());
+ if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
+ !filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Could not recreate directory: ", index_dir));
+ }
+ ICING_ASSIGN_OR_RETURN(index_,
+ Index::Create(index_options, filesystem_.get(),
+ icing_filesystem_.get()));
- return libtextclassifier3::Status::OK;
-}
+ // Discard integer index directory and instantiate a new one.
+ std::string integer_index_dir =
+ MakeIntegerIndexWorkingPath(options_.base_dir());
+ ICING_RETURN_IF_ERROR(
+ IntegerIndex::Discard(*filesystem_, integer_index_dir));
+ ICING_ASSIGN_OR_RETURN(
+ integer_index_,
+ IntegerIndex::Create(*filesystem_, std::move(integer_index_dir),
+ options_.integer_index_bucket_split_threshold(),
+ options_.pre_mapping_fbv()));
+
+ // Discard qualified id join index directory and instantiate a new one.
+ std::string qualified_id_join_index_dir =
+ MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
+ ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
+ *filesystem_, qualified_id_join_index_dir));
+ ICING_ASSIGN_OR_RETURN(
+ qualified_id_join_index_,
+ CreateQualifiedIdJoinIndex(
+ *filesystem_, std::move(qualified_id_join_index_dir), options_));
+
+ std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
+ IndexRestorationResult restore_result = RestoreIndexIfNeeded();
+ index_init_status = std::move(restore_result.status);
+ // DATA_LOSS means that we have successfully initialized and re-added
+ // content to the index. Some indexed content was lost, but otherwise the
+ // index is in a valid state and can be queried.
+ if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
+ return index_init_status;
+ }
-libtextclassifier3::Status IcingSearchEngine::InitializeOptions() {
- ICING_RETURN_IF_ERROR(ValidateOptions(options_));
+ // Delete the marker file to indicate that everything is now in sync with
+ // whatever changes were made to the schema.
+ filesystem_->DeleteFile(marker_filepath.c_str());
+
+ initialize_stats->set_index_restoration_latency_ms(
+ restore_timer->GetElapsedMilliseconds());
+ initialize_stats->set_index_restoration_cause(
+ InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
+ initialize_stats->set_integer_index_restoration_cause(
+ InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
+ initialize_stats->set_qualified_id_join_index_restoration_cause(
+ InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC);
+ } else if (version_state_change != version_util::StateChange::kCompatible) {
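+    // The on-disk data was written by a different, non-compatible version of
+    // Icing, so force document revalidation and attribute every recovery to
+    // the version change.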
+ ICING_ASSIGN_OR_RETURN(bool document_store_derived_files_regenerated,
+ InitializeDocumentStore(
+ /*force_recovery_and_revalidate_documents=*/true,
+ initialize_stats));
+ index_init_status = InitializeIndex(
+ document_store_derived_files_regenerated, initialize_stats);
+ if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
+ return index_init_status;
+ }
- // Make sure the base directory exists
- if (!filesystem_->CreateDirectoryRecursively(options_.base_dir().c_str())) {
- return absl_ports::InternalError(absl_ports::StrCat(
- "Could not create directory: ", options_.base_dir()));
+ initialize_stats->set_schema_store_recovery_cause(
+ InitializeStatsProto::VERSION_CHANGED);
+ initialize_stats->set_document_store_recovery_cause(
+ InitializeStatsProto::VERSION_CHANGED);
+ initialize_stats->set_index_restoration_cause(
+ InitializeStatsProto::VERSION_CHANGED);
+ initialize_stats->set_integer_index_restoration_cause(
+ InitializeStatsProto::VERSION_CHANGED);
+ initialize_stats->set_qualified_id_join_index_restoration_cause(
+ InitializeStatsProto::VERSION_CHANGED);
+ } else {
+ ICING_ASSIGN_OR_RETURN(
+ bool document_store_derived_files_regenerated,
+ InitializeDocumentStore(
+ /*force_recovery_and_revalidate_documents=*/false,
+ initialize_stats));
+ index_init_status = InitializeIndex(
+ document_store_derived_files_regenerated, initialize_stats);
+ if (!index_init_status.ok() && !absl_ports::IsDataLoss(index_init_status)) {
+ return index_init_status;
+ }
}
- return libtextclassifier3::Status::OK;
+ if (status.ok()) {
+ status = index_init_status;
+ }
+
+ result_state_manager_ = std::make_unique<ResultStateManager>(
+ performance_configuration_.max_num_total_hits, *document_store_);
+
+ return status;
}
-libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore() {
+libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore(
+ InitializeStatsProto* initialize_stats) {
+ ICING_RETURN_ERROR_IF_NULL(initialize_stats);
+
const std::string schema_store_dir =
MakeSchemaDirectoryPath(options_.base_dir());
// Make sure the sub-directory exists
@@ -309,12 +800,17 @@ libtextclassifier3::Status IcingSearchEngine::InitializeSchemaStore() {
absl_ports::StrCat("Could not create directory: ", schema_store_dir));
}
ICING_ASSIGN_OR_RETURN(
- schema_store_, SchemaStore::Create(filesystem_.get(), schema_store_dir));
+ schema_store_, SchemaStore::Create(filesystem_.get(), schema_store_dir,
+ clock_.get(), initialize_stats));
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore() {
+libtextclassifier3::StatusOr<bool> IcingSearchEngine::InitializeDocumentStore(
+ bool force_recovery_and_revalidate_documents,
+ InitializeStatsProto* initialize_stats) {
+ ICING_RETURN_ERROR_IF_NULL(initialize_stats);
+
const std::string document_dir =
MakeDocumentDirectoryPath(options_.base_dir());
// Make sure the sub-directory exists
@@ -323,23 +819,38 @@ libtextclassifier3::Status IcingSearchEngine::InitializeDocumentStore() {
absl_ports::StrCat("Could not create directory: ", document_dir));
}
ICING_ASSIGN_OR_RETURN(
- document_store_,
- DocumentStore::Create(filesystem_.get(), document_dir, clock_.get(),
- schema_store_.get()));
-
- return libtextclassifier3::Status::OK;
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ filesystem_.get(), document_dir, clock_.get(), schema_store_.get(),
+ force_recovery_and_revalidate_documents,
+ options_.document_store_namespace_id_fingerprint(),
+ options_.pre_mapping_fbv(), options_.use_persistent_hash_map(),
+ options_.compression_level(), initialize_stats));
+ document_store_ = std::move(create_result.document_store);
+
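+  // Returning this flag lets InitializeIndex know whether document store
+  // derived files were regenerated; indices that depend on them (e.g. the v2
+  // qualified id join index) must then be rebuilt from scratch.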
+ return create_result.derived_files_regenerated;
}
-libtextclassifier3::Status IcingSearchEngine::InitializeIndex() {
+libtextclassifier3::Status IcingSearchEngine::InitializeIndex(
+ bool document_store_derived_files_regenerated,
+ InitializeStatsProto* initialize_stats) {
+ ICING_RETURN_ERROR_IF_NULL(initialize_stats);
+
const std::string index_dir = MakeIndexDirectoryPath(options_.base_dir());
// Make sure the sub-directory exists
if (!filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
return absl_ports::InternalError(
absl_ports::StrCat("Could not create directory: ", index_dir));
}
- Index::Options index_options(index_dir, options_.index_merge_size());
+ Index::Options index_options(
+ index_dir, options_.index_merge_size(),
+ options_.lite_index_sort_at_indexing(), options_.lite_index_sort_size(),
+ options_.build_property_existence_metadata_hits());
- auto index_or = Index::Create(index_options, icing_filesystem_.get());
+ // Term index
+ InitializeStatsProto::RecoveryCause index_recovery_cause;
+ auto index_or =
+ Index::Create(index_options, filesystem_.get(), icing_filesystem_.get());
if (!index_or.ok()) {
if (!filesystem_->DeleteDirectoryRecursively(index_dir.c_str()) ||
!filesystem_->CreateDirectoryRecursively(index_dir.c_str())) {
@@ -347,66 +858,114 @@ libtextclassifier3::Status IcingSearchEngine::InitializeIndex() {
absl_ports::StrCat("Could not recreate directory: ", index_dir));
}
+ index_recovery_cause = InitializeStatsProto::IO_ERROR;
+
// Try recreating it from scratch and re-indexing everything.
- ICING_ASSIGN_OR_RETURN(
- index_, Index::Create(index_options, icing_filesystem_.get()));
- ICING_RETURN_IF_ERROR(RestoreIndex());
+ ICING_ASSIGN_OR_RETURN(index_,
+ Index::Create(index_options, filesystem_.get(),
+ icing_filesystem_.get()));
} else {
// Index was created fine.
index_ = std::move(index_or).ValueOrDie();
- }
-
- return libtextclassifier3::Status::OK;
-} // namespace lib
+    // If a recovery does have to happen, then it must be because the index is
+    // out of sync with the document store.
+ index_recovery_cause = InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
+ }
+
+ // Integer index
+ std::string integer_index_dir =
+ MakeIntegerIndexWorkingPath(options_.base_dir());
+ InitializeStatsProto::RecoveryCause integer_index_recovery_cause;
+ auto integer_index_or =
+ IntegerIndex::Create(*filesystem_, integer_index_dir,
+ options_.integer_index_bucket_split_threshold(),
+ options_.pre_mapping_fbv());
+ if (!integer_index_or.ok()) {
+ ICING_RETURN_IF_ERROR(
+ IntegerIndex::Discard(*filesystem_, integer_index_dir));
-libtextclassifier3::Status IcingSearchEngine::CheckConsistency() {
- if (!HeaderExists()) {
- // Without a header file, we have no checksum and can't even detect
- // inconsistencies
- return absl_ports::NotFoundError("No header file found.");
- }
+ integer_index_recovery_cause = InitializeStatsProto::IO_ERROR;
- // Header does exist, verify that the header looks fine.
- IcingSearchEngine::Header header;
- if (!filesystem_->Read(MakeHeaderFilename(options_.base_dir()).c_str(),
- &header, sizeof(header))) {
- return absl_ports::InternalError(absl_ports::StrCat(
- "Couldn't read: ", MakeHeaderFilename(options_.base_dir())));
- }
+ // Try recreating it from scratch and re-indexing everything.
+ ICING_ASSIGN_OR_RETURN(
+ integer_index_,
+ IntegerIndex::Create(*filesystem_, std::move(integer_index_dir),
+ options_.integer_index_bucket_split_threshold(),
+ options_.pre_mapping_fbv()));
+ } else {
+ // Integer index was created fine.
+ integer_index_ = std::move(integer_index_or).ValueOrDie();
+    // If a recovery does have to happen, then it must be because the index is
+    // out of sync with the document store.
+ integer_index_recovery_cause =
+ InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
+ }
+
+ // Qualified id join index
+ std::string qualified_id_join_index_dir =
+ MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir());
+ InitializeStatsProto::RecoveryCause qualified_id_join_index_recovery_cause;
+ if (document_store_derived_files_regenerated &&
+ IsV2QualifiedIdJoinIndexEnabled(options_)) {
+ // V2 qualified id join index depends on document store derived files, so we
+ // have to rebuild it from scratch if
+ // document_store_derived_files_regenerated is true.
+ ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
+ *filesystem_, qualified_id_join_index_dir));
- if (header.magic != IcingSearchEngine::Header::kMagic) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Invalid header kMagic for file: ",
- MakeHeaderFilename(options_.base_dir())));
- }
+ ICING_ASSIGN_OR_RETURN(
+ qualified_id_join_index_,
+ CreateQualifiedIdJoinIndex(
+ *filesystem_, std::move(qualified_id_join_index_dir), options_));
- ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
- if (checksum.Get() != header.checksum) {
- return absl_ports::InternalError(
- "IcingSearchEngine checksum doesn't match");
+ qualified_id_join_index_recovery_cause =
+ InitializeStatsProto::DEPENDENCIES_CHANGED;
+ } else {
+ auto qualified_id_join_index_or = CreateQualifiedIdJoinIndex(
+ *filesystem_, qualified_id_join_index_dir, options_);
+ if (!qualified_id_join_index_or.ok()) {
+ ICING_RETURN_IF_ERROR(QualifiedIdJoinIndex::Discard(
+ *filesystem_, qualified_id_join_index_dir));
+
+ qualified_id_join_index_recovery_cause = InitializeStatsProto::IO_ERROR;
+
+      // Try recreating it from scratch and rebuilding everything.
+ ICING_ASSIGN_OR_RETURN(
+ qualified_id_join_index_,
+ CreateQualifiedIdJoinIndex(
+ *filesystem_, std::move(qualified_id_join_index_dir), options_));
+ } else {
+ // Qualified id join index was created fine.
+ qualified_id_join_index_ =
+ std::move(qualified_id_join_index_or).ValueOrDie();
+      // If a recovery does have to happen, then it must be because the index
+      // is out of sync with the document store.
+ qualified_id_join_index_recovery_cause =
+ InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH;
+ }
}
- return libtextclassifier3::Status::OK;
-}
-
-libtextclassifier3::Status IcingSearchEngine::RegenerateDerivedFiles() {
- ICING_RETURN_IF_ERROR(
- document_store_->UpdateSchemaStore(schema_store_.get()));
- ICING_RETURN_IF_ERROR(index_->Reset());
- ICING_RETURN_IF_ERROR(RestoreIndex());
+ std::unique_ptr<Timer> restore_timer = clock_->GetNewTimer();
+ IndexRestorationResult restore_result = RestoreIndexIfNeeded();
+ if (restore_result.index_needed_restoration ||
+ restore_result.integer_index_needed_restoration ||
+ restore_result.qualified_id_join_index_needed_restoration) {
+ initialize_stats->set_index_restoration_latency_ms(
+ restore_timer->GetElapsedMilliseconds());
- const std::string header_file =
- MakeHeaderFilename(options_.base_dir().c_str());
- if (HeaderExists()) {
- if (!filesystem_->DeleteFile(header_file.c_str())) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Unable to delete file: ", header_file));
+ if (restore_result.index_needed_restoration) {
+ initialize_stats->set_index_restoration_cause(index_recovery_cause);
+ }
+ if (restore_result.integer_index_needed_restoration) {
+ initialize_stats->set_integer_index_restoration_cause(
+ integer_index_recovery_cause);
+ }
+ if (restore_result.qualified_id_join_index_needed_restoration) {
+ initialize_stats->set_qualified_id_join_index_restoration_cause(
+ qualified_id_join_index_recovery_cause);
}
}
- ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
- ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
-
- return libtextclassifier3::Status::OK;
+ return restore_result.status;
}
SetSchemaResultProto IcingSearchEngine::SetSchema(
@@ -422,18 +981,15 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
StatusProto* result_status = result_proto.mutable_status();
absl_ports::unique_lock l(&mutex_);
+ ScopedTimer timer(clock_->GetNewTimer(), [&result_proto](int64_t t) {
+ result_proto.set_latency_ms(t);
+ });
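+  // ScopedTimer invokes the callback with the elapsed milliseconds when it
+  // goes out of scope, so latency_ms is recorded on every return path.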
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
- libtextclassifier3::Status status = SchemaUtil::Validate(new_schema);
- if (!status.ok()) {
- TransformStatus(status, result_status);
- return result_proto;
- }
-
auto lost_previous_schema_or = LostPreviousSchema();
if (!lost_previous_schema_or.ok()) {
TransformStatus(lost_previous_schema_or.status(), result_status);
@@ -441,14 +997,24 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
}
bool lost_previous_schema = lost_previous_schema_or.ValueOrDie();
+ std::string marker_filepath =
+ MakeSetSchemaMarkerFilePath(options_.base_dir());
+  // Create the marker file indicating that we are going to apply a schema
+  // change. No need to write anything to the marker file - its existence is
+  // the only thing that matters. The marker file signals that a crash or power
+  // loss occurred while the schema and derived files were being updated, so it
+  // is set up to be deleted whenever we return from this function.
+ DestructibleFile marker_file(marker_filepath, filesystem_.get());
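+  // If the process dies before marker_file's destructor runs, the leftover
+  // file is detected by the FileExists(marker_filepath) check during
+  // initialization, which forces a full index rebuild.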
+
auto set_schema_result_or = schema_store_->SetSchema(
- std::move(new_schema), ignore_errors_and_delete_documents);
+ std::move(new_schema), ignore_errors_and_delete_documents,
+ options_.allow_circular_schema_definitions());
if (!set_schema_result_or.ok()) {
TransformStatus(set_schema_result_or.status(), result_status);
return result_proto;
}
- const SchemaStore::SetSchemaResult set_schema_result =
- set_schema_result_or.ValueOrDie();
+ SchemaStore::SetSchemaResult set_schema_result =
+ std::move(set_schema_result_or).ValueOrDie();
for (const std::string& deleted_type :
set_schema_result.schema_types_deleted_by_name) {
@@ -460,6 +1026,34 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
result_proto.add_incompatible_schema_types(incompatible_type);
}
+ for (const std::string& new_type :
+ set_schema_result.schema_types_new_by_name) {
+ result_proto.add_new_schema_types(std::move(new_type));
+ }
+
+ for (const std::string& compatible_type :
+ set_schema_result.schema_types_changed_fully_compatible_by_name) {
+ result_proto.add_fully_compatible_changed_schema_types(
+ std::move(compatible_type));
+ }
+
+ bool index_incompatible =
+ !set_schema_result.schema_types_index_incompatible_by_name.empty();
+ for (const std::string& index_incompatible_type :
+ set_schema_result.schema_types_index_incompatible_by_name) {
+ result_proto.add_index_incompatible_changed_schema_types(
+ std::move(index_incompatible_type));
+ }
+
+ bool join_incompatible =
+ !set_schema_result.schema_types_join_incompatible_by_name.empty();
+ for (const std::string& join_incompatible_type :
+ set_schema_result.schema_types_join_incompatible_by_name) {
+ result_proto.add_join_incompatible_changed_schema_types(
+ std::move(join_incompatible_type));
+ }
+
+ libtextclassifier3::Status status;
if (set_schema_result.success) {
if (lost_previous_schema) {
// No previous schema to calculate a diff against. We have to go through
@@ -480,26 +1074,42 @@ SetSchemaResultProto IcingSearchEngine::SetSchema(
}
}
- if (lost_previous_schema || set_schema_result.index_incompatible) {
- // Clears all index files
- status = index_->Reset();
+ if (lost_previous_schema || index_incompatible) {
+ // Clears search indices
+ status = ClearSearchIndices();
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
+ }
- status = RestoreIndex();
+ if (lost_previous_schema || join_incompatible) {
+ // Clears join indices
+ status = ClearJoinIndices();
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
}
+ if (lost_previous_schema || index_incompatible || join_incompatible) {
+ IndexRestorationResult restore_result = RestoreIndexIfNeeded();
+ // DATA_LOSS means that we have successfully re-added content to the
+ // index. Some indexed content was lost, but otherwise the index is in a
+ // valid state and can be queried.
+ if (!restore_result.status.ok() &&
+ !absl_ports::IsDataLoss(restore_result.status)) {
+      TransformStatus(restore_result.status, result_status);
+ return result_proto;
+ }
+ }
+
result_status->set_code(StatusProto::OK);
} else {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("Schema is incompatible.");
}
+
return result_proto;
}
@@ -557,6 +1167,11 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
PutResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
+ PutDocumentStatsProto* put_document_stats =
+ result_proto.mutable_put_document_stats();
+ ScopedTimer put_timer(clock_->GetNewTimer(), [put_document_stats](int64_t t) {
+ put_document_stats->set_latency_ms(t);
+ });
// Lock must be acquired before validation because the DocumentStore uses
// the schema file to validate, and the schema could be changed in
@@ -568,35 +1183,75 @@ PutResultProto IcingSearchEngine::Put(DocumentProto&& document) {
return result_proto;
}
- auto document_id_or = document_store_->Put(document);
+ auto tokenized_document_or = TokenizedDocument::Create(
+ schema_store_.get(), language_segmenter_.get(), std::move(document));
+ if (!tokenized_document_or.ok()) {
+ TransformStatus(tokenized_document_or.status(), result_status);
+ return result_proto;
+ }
+ TokenizedDocument tokenized_document(
+ std::move(tokenized_document_or).ValueOrDie());
+
+ auto document_id_or = document_store_->Put(
+ tokenized_document.document(), tokenized_document.num_string_tokens(),
+ put_document_stats);
if (!document_id_or.ok()) {
TransformStatus(document_id_or.status(), result_status);
return result_proto;
}
DocumentId document_id = document_id_or.ValueOrDie();
- auto index_processor_or = IndexProcessor::Create(
- schema_store_.get(), language_segmenter_.get(), normalizer_.get(),
- index_.get(), CreateIndexProcessorOptions(options_));
- if (!index_processor_or.ok()) {
- TransformStatus(index_processor_or.status(), result_status);
+ auto data_indexing_handlers_or = CreateDataIndexingHandlers();
+ if (!data_indexing_handlers_or.ok()) {
+ TransformStatus(data_indexing_handlers_or.status(), result_status);
return result_proto;
}
- std::unique_ptr<IndexProcessor> index_processor =
- std::move(index_processor_or).ValueOrDie();
+ IndexProcessor index_processor(
+ std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get());
+
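+  // The handlers are expected to cover each data index (term, integer,
+  // qualified id join); IndexProcessor::IndexDocument runs the tokenized
+  // document through all of them.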
+ auto index_status = index_processor.IndexDocument(
+ tokenized_document, document_id, put_document_stats);
+  // Getting an internal error from the index could mean that the index is
+  // broken. Try to rebuild the indices to recover.
+ if (absl_ports::IsInternal(index_status)) {
+ ICING_LOG(ERROR) << "Got an internal error from the index. Trying to "
+ "rebuild the index!\n"
+ << index_status.error_message();
+ index_status = ClearAllIndices();
+ if (index_status.ok()) {
+ index_status = RestoreIndexIfNeeded().status;
+ if (!index_status.ok()) {
+ ICING_LOG(ERROR) << "Failed to reindex documents after a failure of "
+ "indexing a document.";
+ }
+ } else {
+ ICING_LOG(ERROR)
+ << "Failed to clear indices after a failure of indexing a document.";
+ }
+ }
- auto status = index_processor->IndexDocument(document, document_id);
- if (!status.ok()) {
- TransformStatus(status, result_status);
- return result_proto;
+ if (!index_status.ok()) {
+    // If indexing this document failed, or the internal error above could not
+    // be resolved, mark the document as deleted.
+ int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
+ libtextclassifier3::Status delete_status =
+ document_store_->Delete(document_id, current_time_ms);
+ if (!delete_status.ok()) {
+      // This is pretty dire (and, hopefully, unlikely). We can't roll back the
+      // document that we just added. Wipe out the whole index.
+      ICING_LOG(ERROR) << "Cannot delete the document that failed to index. "
+                          "Wiping out the whole Icing search engine.";
+ ResetInternal();
+ }
}
- result_status->set_code(StatusProto::OK);
+ TransformStatus(index_status, result_status);
return result_proto;
}
GetResultProto IcingSearchEngine::Get(const std::string_view name_space,
- const std::string_view uri) {
+ const std::string_view uri,
+ const GetResultSpecProto& result_spec) {
GetResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
@@ -613,8 +1268,48 @@ GetResultProto IcingSearchEngine::Get(const std::string_view name_space,
return result_proto;
}
+ DocumentProto document = std::move(document_or).ValueOrDie();
+ std::unique_ptr<ProjectionTree> type_projection_tree;
+ std::unique_ptr<ProjectionTree> wildcard_projection_tree;
+ for (const SchemaStore::ExpandedTypePropertyMask& type_field_mask :
+ schema_store_->ExpandTypePropertyMasks(
+ result_spec.type_property_masks())) {
+ if (type_field_mask.schema_type == document.schema()) {
+ type_projection_tree = std::make_unique<ProjectionTree>(type_field_mask);
+ } else if (type_field_mask.schema_type ==
+ SchemaStore::kSchemaTypeWildcard) {
+ wildcard_projection_tree =
+ std::make_unique<ProjectionTree>(type_field_mask);
+ }
+ }
+
+ // Apply projection
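+  // A mask that names the document's exact schema type takes precedence over
+  // the wildcard mask.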
+ if (type_projection_tree != nullptr) {
+ projector::Project(type_projection_tree->root().children, &document);
+ } else if (wildcard_projection_tree != nullptr) {
+ projector::Project(wildcard_projection_tree->root().children, &document);
+ }
+
result_status->set_code(StatusProto::OK);
- *result_proto.mutable_document() = std::move(document_or).ValueOrDie();
+ *result_proto.mutable_document() = std::move(document);
+ return result_proto;
+}
+
+ReportUsageResultProto IcingSearchEngine::ReportUsage(
+ const UsageReport& usage_report) {
+ ReportUsageResultProto result_proto;
+ StatusProto* result_status = result_proto.mutable_status();
+
+ absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
+
+ libtextclassifier3::Status status =
+ document_store_->ReportUsage(usage_report);
+ TransformStatus(status, result_status);
return result_proto;
}
@@ -623,6 +1318,11 @@ GetAllNamespacesResultProto IcingSearchEngine::GetAllNamespaces() {
StatusProto* result_status = result_proto.mutable_status();
absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
std::vector<std::string> namespaces = document_store_->GetAllNamespaces();
@@ -648,18 +1348,30 @@ DeleteResultProto IcingSearchEngine::Delete(const std::string_view name_space,
return result_proto;
}
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ DeleteStatsProto* delete_stats = result_proto.mutable_delete_stats();
+ delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SINGLE);
+
+ std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
- libtextclassifier3::Status status = document_store_->Delete(name_space, uri);
+ int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
+ libtextclassifier3::Status status =
+ document_store_->Delete(name_space, uri, current_time_ms);
if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete Document. namespace: " << name_space
- << ", uri: " << uri;
+ LogSeverity::Code severity = ERROR;
+ if (absl_ports::IsNotFound(status)) {
+ severity = DBG;
+ }
+ ICING_LOG(severity) << status.error_message()
+ << "Failed to delete Document. namespace: "
+ << name_space << ", uri: " << uri;
TransformStatus(status, result_status);
return result_proto;
}
result_status->set_code(StatusProto::OK);
+ delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
+ delete_stats->set_num_documents_deleted(1);
return result_proto;
}
@@ -676,16 +1388,24 @@ DeleteByNamespaceResultProto IcingSearchEngine::DeleteByNamespace(
return delete_result;
}
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats();
+ delete_stats->set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE);
+
+ std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
- libtextclassifier3::Status status =
+ DocumentStore::DeleteByGroupResult doc_store_result =
document_store_->DeleteByNamespace(name_space);
- TransformStatus(status, result_status);
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
+ if (!doc_store_result.status.ok()) {
+ ICING_LOG(ERROR) << doc_store_result.status.error_message()
<< "Failed to delete Namespace: " << name_space;
+ TransformStatus(doc_store_result.status, result_status);
return delete_result;
}
+
+ result_status->set_code(StatusProto::OK);
+ delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
+ delete_stats->set_num_documents_deleted(doc_store_result.num_docs_deleted);
return delete_result;
}
@@ -702,20 +1422,143 @@ DeleteBySchemaTypeResultProto IcingSearchEngine::DeleteBySchemaType(
return delete_result;
}
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ DeleteStatsProto* delete_stats = delete_result.mutable_delete_stats();
+ delete_stats->set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE);
+
+ std::unique_ptr<Timer> delete_timer = clock_->GetNewTimer();
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
- libtextclassifier3::Status status =
+ DocumentStore::DeleteByGroupResult doc_store_result =
document_store_->DeleteBySchemaType(schema_type);
- TransformStatus(status, result_status);
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
+ if (!doc_store_result.status.ok()) {
+ ICING_LOG(ERROR) << doc_store_result.status.error_message()
<< "Failed to delete SchemaType: " << schema_type;
+ TransformStatus(doc_store_result.status, result_status);
return delete_result;
}
+
+ result_status->set_code(StatusProto::OK);
+ delete_stats->set_latency_ms(delete_timer->GetElapsedMilliseconds());
+ delete_stats->set_num_documents_deleted(doc_store_result.num_docs_deleted);
return delete_result;
}
-PersistToDiskResultProto IcingSearchEngine::PersistToDisk() {
+DeleteByQueryResultProto IcingSearchEngine::DeleteByQuery(
+ const SearchSpecProto& search_spec, bool return_deleted_document_info) {
+ ICING_VLOG(1) << "Deleting documents for query " << search_spec.query()
+ << " from doc store";
+
+ DeleteByQueryResultProto result_proto;
+ StatusProto* result_status = result_proto.mutable_status();
+
+ absl_ports::unique_lock l(&mutex_);
+ if (!initialized_) {
+ result_status->set_code(StatusProto::FAILED_PRECONDITION);
+ result_status->set_message("IcingSearchEngine has not been initialized!");
+ return result_proto;
+ }
+
+ DeleteByQueryStatsProto* delete_stats =
+ result_proto.mutable_delete_by_query_stats();
+ delete_stats->set_query_length(search_spec.query().length());
+ delete_stats->set_num_namespaces_filtered(
+ search_spec.namespace_filters_size());
+ delete_stats->set_num_schema_types_filtered(
+ search_spec.schema_type_filters_size());
+
+ ScopedTimer delete_timer(clock_->GetNewTimer(), [delete_stats](int64_t t) {
+ delete_stats->set_latency_ms(t);
+ });
+ libtextclassifier3::Status status =
+ ValidateSearchSpec(search_spec, performance_configuration_);
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ return result_proto;
+ }
+
+ std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
+ // Gets unordered results from query processor
+ auto query_processor_or = QueryProcessor::Create(
+ index_.get(), integer_index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(), schema_store_.get());
+ if (!query_processor_or.ok()) {
+ TransformStatus(query_processor_or.status(), result_status);
+ delete_stats->set_parse_query_latency_ms(
+ component_timer->GetElapsedMilliseconds());
+ return result_proto;
+ }
+ std::unique_ptr<QueryProcessor> query_processor =
+ std::move(query_processor_or).ValueOrDie();
+
+ int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
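+  // Deleted documents don't need ranking, so parse with
+  // ScoringSpecProto::RankingStrategy::NONE and skip any scoring work.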
+ auto query_results_or = query_processor->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::NONE, current_time_ms);
+ if (!query_results_or.ok()) {
+ TransformStatus(query_results_or.status(), result_status);
+ delete_stats->set_parse_query_latency_ms(
+ component_timer->GetElapsedMilliseconds());
+ return result_proto;
+ }
+ QueryResults query_results = std::move(query_results_or).ValueOrDie();
+ delete_stats->set_parse_query_latency_ms(
+ component_timer->GetElapsedMilliseconds());
+
+ ICING_VLOG(2) << "Deleting the docs that matched the query.";
+ int num_deleted = 0;
+ // A map used to group deleted documents.
+ // From the (namespace, type) pair to a list of uris.
+ std::unordered_map<NamespaceTypePair,
+ DeleteByQueryResultProto::DocumentGroupInfo*,
+ NamespaceTypePairHasher>
+ deleted_info_map;
+
+ component_timer = clock_->GetNewTimer();
+ while (query_results.root_iterator->Advance().ok()) {
+ ICING_VLOG(3) << "Deleting doc "
+ << query_results.root_iterator->doc_hit_info().document_id();
+ ++num_deleted;
+ if (return_deleted_document_info) {
+ status = RetrieveAndAddDocumentInfo(
+ document_store_.get(), result_proto, deleted_info_map,
+ query_results.root_iterator->doc_hit_info().document_id());
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ delete_stats->set_document_removal_latency_ms(
+ component_timer->GetElapsedMilliseconds());
+ return result_proto;
+ }
+ }
+ status = document_store_->Delete(
+ query_results.root_iterator->doc_hit_info().document_id(),
+ current_time_ms);
+ if (!status.ok()) {
+ TransformStatus(status, result_status);
+ delete_stats->set_document_removal_latency_ms(
+ component_timer->GetElapsedMilliseconds());
+ return result_proto;
+ }
+ }
+ delete_stats->set_document_removal_latency_ms(
+ component_timer->GetElapsedMilliseconds());
+ int term_count = 0;
+ for (const auto& section_and_terms : query_results.query_terms) {
+ term_count += section_and_terms.second.size();
+ }
+ delete_stats->set_num_terms(term_count);
+
+ if (num_deleted > 0) {
+ result_proto.mutable_status()->set_code(StatusProto::OK);
+ } else {
+ result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ result_proto.mutable_status()->set_message(
+ "No documents matched the query to delete by!");
+ }
+ delete_stats->set_num_documents_deleted(num_deleted);
+ return result_proto;
+}
+
+PersistToDiskResultProto IcingSearchEngine::PersistToDisk(
+ PersistType::Code persist_type) {
ICING_VLOG(1) << "Persisting data to disk";
PersistToDiskResultProto result_proto;
@@ -728,7 +1571,7 @@ PersistToDiskResultProto IcingSearchEngine::PersistToDisk() {
return result_proto;
}
- auto status = InternalPersistToDisk();
+ auto status = InternalPersistToDisk(persist_type);
TransformStatus(status, result_status);
return result_proto;
}
@@ -752,53 +1595,168 @@ OptimizeResultProto IcingSearchEngine::Optimize() {
return result_proto;
}
- // Releases result / query cache if any
- result_state_manager_.InvalidateAllResultStates();
+ OptimizeStatsProto* optimize_stats = result_proto.mutable_optimize_stats();
+ ScopedTimer optimize_timer(
+ clock_->GetNewTimer(),
+ [optimize_stats](int64_t t) { optimize_stats->set_latency_ms(t); });
// Flushes data to disk before doing optimization
- auto status = InternalPersistToDisk();
+ auto status = InternalPersistToDisk(PersistType::FULL);
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
+ int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
+ optimize_stats->set_storage_size_before(
+ Filesystem::SanitizeFileSize(before_size));
+
// TODO(b/143646633): figure out if we need to optimize index and doc store
// at the same time.
- libtextclassifier3::Status optimization_status = OptimizeDocumentStore();
-
- if (!optimization_status.ok() &&
- !absl_ports::IsDataLoss(optimization_status)) {
+ std::unique_ptr<Timer> optimize_doc_store_timer = clock_->GetNewTimer();
+ libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
+ optimize_result_or = OptimizeDocumentStore(optimize_stats);
+ optimize_stats->set_document_store_optimize_latency_ms(
+ optimize_doc_store_timer->GetElapsedMilliseconds());
+
+ if (!optimize_result_or.ok() &&
+ !absl_ports::IsDataLoss(optimize_result_or.status())) {
// The status now is either ABORTED_ERROR or INTERNAL_ERROR.
// If ABORTED_ERROR, Icing should still be working.
// If INTERNAL_ERROR, we're having IO errors or other errors that we can't
// recover from.
- TransformStatus(optimization_status, result_status);
+ TransformStatus(optimize_result_or.status(), result_status);
return result_proto;
}
   // The status is either OK or DATA_LOSS. The optimized document store is
   // guaranteed to work, so we update the indices according to the new
   // document store.
- libtextclassifier3::Status index_reset_status = index_->Reset();
- if (!index_reset_status.ok()) {
- status = absl_ports::Annotate(
- absl_ports::InternalError("Failed to reset index after optimization."),
- index_reset_status.error_message());
- TransformStatus(status, result_status);
- return result_proto;
+ std::unique_ptr<Timer> optimize_index_timer = clock_->GetNewTimer();
+ auto doc_store_optimize_result_status = optimize_result_or.status();
+ bool should_rebuild_index =
+ !optimize_result_or.ok() ||
+ optimize_result_or.ValueOrDie().should_rebuild_index ||
+ ShouldRebuildIndex(*optimize_stats,
+ options_.optimize_rebuild_index_threshold());
+ if (!should_rebuild_index) {
+ // At this point should_rebuild_index is false, so it means
+ // optimize_result_or.ok() is true and therefore it is safe to call
+ // ValueOrDie.
+ DocumentStore::OptimizeResult optimize_result =
+ std::move(optimize_result_or).ValueOrDie();
+
+ optimize_stats->set_index_restoration_mode(
+ OptimizeStatsProto::INDEX_TRANSLATION);
+ libtextclassifier3::Status index_optimize_status =
+ index_->Optimize(optimize_result.document_id_old_to_new,
+ document_store_->last_added_document_id());
+ if (!index_optimize_status.ok()) {
+ ICING_LOG(WARNING) << "Failed to optimize index. Error: "
+ << index_optimize_status.error_message();
+ should_rebuild_index = true;
+ }
+
+ libtextclassifier3::Status integer_index_optimize_status =
+ integer_index_->Optimize(optimize_result.document_id_old_to_new,
+ document_store_->last_added_document_id());
+ if (!integer_index_optimize_status.ok()) {
+ ICING_LOG(WARNING) << "Failed to optimize integer index. Error: "
+ << integer_index_optimize_status.error_message();
+ should_rebuild_index = true;
+ }
+
+ libtextclassifier3::Status qualified_id_join_index_optimize_status =
+ qualified_id_join_index_->Optimize(
+ optimize_result.document_id_old_to_new,
+ optimize_result.namespace_id_old_to_new,
+ document_store_->last_added_document_id());
+ if (!qualified_id_join_index_optimize_status.ok()) {
+ ICING_LOG(WARNING)
+ << "Failed to optimize qualified id join index. Error: "
+ << qualified_id_join_index_optimize_status.error_message();
+ should_rebuild_index = true;
+ }
}
+ // If we received a DATA_LOSS error from OptimizeDocumentStore, we have a
+ // valid document store, but it might be the old one or the new one. So throw
+ // out the index data and rebuild from scratch.
+  // Also rebuild the index if DocumentStore::OptimizeInto suggests doing so.
+ // Likewise, if Index::Optimize failed, then attempt to recover the index by
+ // rebuilding from scratch.
+ // If ShouldRebuildIndex() returns true, we will also rebuild the index for
+ // better performance.
+ if (should_rebuild_index) {
+ optimize_stats->set_index_restoration_mode(
+ OptimizeStatsProto::FULL_INDEX_REBUILD);
+ ICING_LOG(WARNING) << "Clearing the entire index!";
+
+ libtextclassifier3::Status index_clear_status = ClearAllIndices();
+ if (!index_clear_status.ok()) {
+ status = absl_ports::Annotate(
+ absl_ports::InternalError("Failed to clear index."),
+ index_clear_status.error_message());
+ TransformStatus(status, result_status);
+ optimize_stats->set_index_restoration_latency_ms(
+ optimize_index_timer->GetElapsedMilliseconds());
+ return result_proto;
+ }
- libtextclassifier3::Status index_restoration_status = RestoreIndex();
- if (!index_restoration_status.ok()) {
- status = absl_ports::Annotate(
- absl_ports::InternalError(
- "Failed to reindex documents after optimization."),
- index_restoration_status.error_message());
+ IndexRestorationResult index_restoration_status = RestoreIndexIfNeeded();
+ // DATA_LOSS means that we have successfully re-added content to the index.
+ // Some indexed content was lost, but otherwise the index is in a valid
+ // state and can be queried.
+ if (!index_restoration_status.status.ok() &&
+ !absl_ports::IsDataLoss(index_restoration_status.status)) {
+ status = absl_ports::Annotate(
+ absl_ports::InternalError(
+ "Failed to reindex documents after optimization."),
+ index_restoration_status.status.error_message());
+ TransformStatus(status, result_status);
+ optimize_stats->set_index_restoration_latency_ms(
+ optimize_index_timer->GetElapsedMilliseconds());
+ return result_proto;
+ }
+ }
+ optimize_stats->set_index_restoration_latency_ms(
+ optimize_index_timer->GetElapsedMilliseconds());
+
+ // Read the optimize status to get the time that we last ran.
+ std::string optimize_status_filename =
+ absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename);
+ FileBackedProto<OptimizeStatusProto> optimize_status_file(
+ *filesystem_, optimize_status_filename);
+ auto optimize_status_or = optimize_status_file.Read();
+ int64_t current_time = clock_->GetSystemTimeMilliseconds();
+ if (optimize_status_or.ok()) {
+    // Only set this field when the status file could be read; if reading
+    // failed or this is the first run ever, there is no previous run to
+    // report.
+ optimize_stats->set_time_since_last_optimize_ms(
+ current_time - optimize_status_or.ValueOrDie()
+ ->last_successful_optimize_run_time_ms());
+ }
+
+ // Update the status for this run and write it.
+ auto optimize_status = std::make_unique<OptimizeStatusProto>();
+ optimize_status->set_last_successful_optimize_run_time_ms(current_time);
+ auto write_status = optimize_status_file.Write(std::move(optimize_status));
+ if (!write_status.ok()) {
+ ICING_LOG(ERROR) << "Failed to write optimize status:\n"
+ << write_status.error_message();
+ }
+
+ // Flushes data to disk after doing optimization
+ status = InternalPersistToDisk(PersistType::FULL);
+ if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
}
- TransformStatus(optimization_status, result_status);
+ int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
+ optimize_stats->set_storage_size_after(
+ Filesystem::SanitizeFileSize(after_size));
+
+ TransformStatus(doc_store_optimize_result_status, result_status);
return result_proto;
}
@@ -815,6 +1773,22 @@ GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() {
return result_proto;
}
+ // Read the optimize status to get the time that we last ran.
+ std::string optimize_status_filename =
+ absl_ports::StrCat(options_.base_dir(), "/", kOptimizeStatusFilename);
+ FileBackedProto<OptimizeStatusProto> optimize_status_file(
+ *filesystem_, optimize_status_filename);
+ auto optimize_status_or = optimize_status_file.Read();
+ int64_t current_time = clock_->GetSystemTimeMilliseconds();
+
+ if (optimize_status_or.ok()) {
+    // Only set this field when the status file could be read; if reading
+    // failed or this is the first run ever, there is no previous run to
+    // report.
+ result_proto.set_time_since_last_optimize_ms(
+ current_time - optimize_status_or.ValueOrDie()
+ ->last_successful_optimize_run_time_ms());
+ }
+
// Get stats from DocumentStore
auto doc_store_optimize_info_or = document_store_->GetOptimizeInfo();
if (!doc_store_optimize_info_or.ok()) {
@@ -840,6 +1814,8 @@ GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() {
}
int64_t index_elements_size = index_elements_size_or.ValueOrDie();
+ // TODO(b/259744228): add stats for integer index
+
// Sum up the optimizable sizes from DocumentStore and Index
result_proto.set_estimated_optimizable_bytes(
index_elements_size * doc_store_optimize_info.optimizable_docs /
@@ -850,94 +1826,162 @@ GetOptimizeInfoResultProto IcingSearchEngine::GetOptimizeInfo() {
return result_proto;
}
-libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk() {
- ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk());
- ICING_RETURN_IF_ERROR(document_store_->PersistToDisk());
- ICING_RETURN_IF_ERROR(index_->PersistToDisk());
-
- // Update the combined checksum and write to header file.
- ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
- ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
-
- return libtextclassifier3::Status::OK;
+StorageInfoResultProto IcingSearchEngine::GetStorageInfo() {
+ StorageInfoResultProto result;
+ absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
+ result.mutable_status()->set_message(
+ "IcingSearchEngine has not been initialized!");
+ return result;
+ }
+
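+  // GetDiskUsage over the base directory covers all of Icing's storage
+  // (document store, schema store, and all indices), not just the term index.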
+ int64_t index_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
+ result.mutable_storage_info()->set_total_storage_size(
+ Filesystem::SanitizeFileSize(index_size));
+ *result.mutable_storage_info()->mutable_document_storage_info() =
+ document_store_->GetStorageInfo();
+ *result.mutable_storage_info()->mutable_schema_store_storage_info() =
+ schema_store_->GetStorageInfo();
+ *result.mutable_storage_info()->mutable_index_storage_info() =
+ index_->GetStorageInfo();
+ // TODO(b/259744228): add stats for integer index
+ result.mutable_status()->set_code(StatusProto::OK);
+ return result;
}
-libtextclassifier3::StatusOr<Crc32> IcingSearchEngine::ComputeChecksum() {
- Crc32 total_checksum;
- // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
- // that can support error logging.
- auto checksum_or = schema_store_->ComputeChecksum();
- if (!checksum_or.ok()) {
- ICING_LOG(ERROR) << checksum_or.status().error_message()
- << "Failed to compute checksum of SchemaStore";
- return checksum_or.status();
+DebugInfoResultProto IcingSearchEngine::GetDebugInfo(
+ DebugInfoVerbosity::Code verbosity) {
+ DebugInfoResultProto debug_info;
+ StatusProto* result_status = debug_info.mutable_status();
+ absl_ports::shared_lock l(&mutex_);
+ if (!initialized_) {
+ debug_info.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
+ debug_info.mutable_status()->set_message(
+ "IcingSearchEngine has not been initialized!");
+ return debug_info;
}
- Crc32 schema_store_checksum = std::move(checksum_or).ValueOrDie();
+ // Index
+ *debug_info.mutable_debug_info()->mutable_index_info() =
+ index_->GetDebugInfo(verbosity);
- // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
- // that can support error logging.
- checksum_or = document_store_->ComputeChecksum();
- if (!checksum_or.ok()) {
- ICING_LOG(ERROR) << checksum_or.status().error_message()
- << "Failed to compute checksum of DocumentStore";
- return checksum_or.status();
+ // TODO(b/259744228): add debug info for integer index
+
+ // Document Store
+ libtextclassifier3::StatusOr<DocumentDebugInfoProto> document_debug_info =
+ document_store_->GetDebugInfo(verbosity);
+ if (!document_debug_info.ok()) {
+ TransformStatus(document_debug_info.status(), result_status);
+ return debug_info;
}
- Crc32 document_store_checksum = std::move(checksum_or).ValueOrDie();
+ *debug_info.mutable_debug_info()->mutable_document_info() =
+ std::move(document_debug_info).ValueOrDie();
- Crc32 index_checksum = index_->ComputeChecksum();
+ // Schema Store
+ libtextclassifier3::StatusOr<SchemaDebugInfoProto> schema_debug_info =
+ schema_store_->GetDebugInfo();
+ if (!schema_debug_info.ok()) {
+ TransformStatus(schema_debug_info.status(), result_status);
+ return debug_info;
+ }
+ *debug_info.mutable_debug_info()->mutable_schema_info() =
+ std::move(schema_debug_info).ValueOrDie();
+
+ result_status->set_code(StatusProto::OK);
+ return debug_info;
+}
- total_checksum.Append(std::to_string(document_store_checksum.Get()));
- total_checksum.Append(std::to_string(schema_store_checksum.Get()));
- total_checksum.Append(std::to_string(index_checksum.Get()));
+libtextclassifier3::Status IcingSearchEngine::InternalPersistToDisk(
+ PersistType::Code persist_type) {
+ if (persist_type == PersistType::LITE) {
+ return document_store_->PersistToDisk(persist_type);
+ }
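+  // LITE persistence only flushes the document store; FULL also flushes the
+  // schema store and every index, and is what Optimize() performs before and
+  // after compaction.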
+ ICING_RETURN_IF_ERROR(schema_store_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(document_store_->PersistToDisk(PersistType::FULL));
+ ICING_RETURN_IF_ERROR(index_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(integer_index_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(qualified_id_join_index_->PersistToDisk());
- return total_checksum;
+ return libtextclassifier3::Status::OK;
}
-bool IcingSearchEngine::HeaderExists() {
- if (!filesystem_->FileExists(
- MakeHeaderFilename(options_.base_dir()).c_str())) {
- return false;
+SearchResultProto IcingSearchEngine::Search(
+ const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec) {
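+  // The read-only path acquires a shared lock so that concurrent queries can
+  // proceed in parallel; the exclusive path serializes the search with
+  // writers.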
+ if (search_spec.use_read_only_search()) {
+ return SearchLockedShared(search_spec, scoring_spec, result_spec);
+ } else {
+ return SearchLockedExclusive(search_spec, scoring_spec, result_spec);
}
+}
+
+SearchResultProto IcingSearchEngine::SearchLockedShared(
+ const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec) {
+ std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
+
+ // Only acquire an overall read-lock for this implementation. Finer-grained
+ // locks are implemented around code paths that write changes to Icing's data
+ // members.
+ absl_ports::shared_lock l(&mutex_);
+ int64_t lock_acquisition_latency = overall_timer->GetElapsedMilliseconds();
- int64_t file_size =
- filesystem_->GetFileSize(MakeHeaderFilename(options_.base_dir()).c_str());
+ SearchResultProto result_proto =
+ InternalSearch(search_spec, scoring_spec, result_spec);
- // If it's been truncated to size 0 before, we consider it to be a new file
- return file_size != 0 && file_size != Filesystem::kBadFileSize;
+ result_proto.mutable_query_stats()->set_lock_acquisition_latency_ms(
+ lock_acquisition_latency);
+ result_proto.mutable_query_stats()->set_latency_ms(
+ overall_timer->GetElapsedMilliseconds());
+ return result_proto;
}
-libtextclassifier3::Status IcingSearchEngine::UpdateHeader(
- const Crc32& checksum) {
- // Write the header
- IcingSearchEngine::Header header;
- header.magic = IcingSearchEngine::Header::kMagic;
- header.checksum = checksum.Get();
+SearchResultProto IcingSearchEngine::SearchLockedExclusive(
+ const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec) {
+ std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
- // This should overwrite the header.
- if (!filesystem_->Write(MakeHeaderFilename(options_.base_dir()).c_str(),
- &header, sizeof(header))) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Failed to write IcingSearchEngine header: ",
- MakeHeaderFilename(options_.base_dir())));
- }
- return libtextclassifier3::Status::OK;
+ // Acquire the overall write-lock for this locked implementation.
+ absl_ports::unique_lock l(&mutex_);
+ int64_t lock_acquisition_latency = overall_timer->GetElapsedMilliseconds();
+
+ SearchResultProto result_proto =
+ InternalSearch(search_spec, scoring_spec, result_spec);
+
+ result_proto.mutable_query_stats()->set_lock_acquisition_latency_ms(
+ lock_acquisition_latency);
+ result_proto.mutable_query_stats()->set_latency_ms(
+ overall_timer->GetElapsedMilliseconds());
+ return result_proto;
}
-SearchResultProto IcingSearchEngine::Search(
+SearchResultProto IcingSearchEngine::InternalSearch(
const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
const ResultSpecProto& result_spec) {
SearchResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
- // TODO(b/146008613) Explore ideas to make this function read-only.
- absl_ports::unique_lock l(&mutex_);
+
+ QueryStatsProto* query_stats = result_proto.mutable_query_stats();
+ query_stats->set_is_first_page(true);
+ query_stats->set_requested_page_size(result_spec.num_per_page());
+
+ // TODO(b/305098009): deprecate search-related flat fields in query_stats.
+ query_stats->set_num_namespaces_filtered(
+ search_spec.namespace_filters_size());
+ query_stats->set_num_schema_types_filtered(
+ search_spec.schema_type_filters_size());
+ query_stats->set_query_length(search_spec.query().length());
+ query_stats->set_ranking_strategy(scoring_spec.rank_by());
+
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
- libtextclassifier3::Status status = ValidateResultSpec(result_spec);
+ libtextclassifier3::Status status =
+ ValidateResultSpec(document_store_.get(), result_spec);
if (!status.ok()) {
TransformStatus(status, result_status);
return result_proto;
@@ -948,153 +1992,337 @@ SearchResultProto IcingSearchEngine::Search(
return result_proto;
}
- // Gets unordered results from query processor
- auto query_processor_or = QueryProcessor::Create(
- index_.get(), language_segmenter_.get(), normalizer_.get(),
- document_store_.get(), schema_store_.get(), clock_.get());
- if (!query_processor_or.ok()) {
- TransformStatus(query_processor_or.status(), result_status);
- return result_proto;
- }
- std::unique_ptr<QueryProcessor> query_processor =
- std::move(query_processor_or).ValueOrDie();
-
- auto query_results_or = query_processor->ParseSearch(search_spec);
- if (!query_results_or.ok()) {
- TransformStatus(query_results_or.status(), result_status);
- return result_proto;
- }
- QueryProcessor::QueryResults query_results =
- std::move(query_results_or).ValueOrDie();
+ const JoinSpecProto& join_spec = search_spec.join_spec();
+ std::unique_ptr<JoinChildrenFetcher> join_children_fetcher;
+ std::unique_ptr<ResultAdjustmentInfo> child_result_adjustment_info;
+ int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
+ if (!join_spec.parent_property_expression().empty() &&
+ !join_spec.child_property_expression().empty()) {
+ query_stats->set_is_join_query(true);
+ QueryStatsProto::SearchStats* child_search_stats =
+ query_stats->mutable_child_search_stats();
+
+ // Process child query
+ QueryScoringResults nested_query_scoring_results = ProcessQueryAndScore(
+ join_spec.nested_spec().search_spec(),
+ join_spec.nested_spec().scoring_spec(),
+ join_spec.nested_spec().result_spec(),
+ /*join_children_fetcher=*/nullptr, current_time_ms, child_search_stats);
+ if (!nested_query_scoring_results.status.ok()) {
+ TransformStatus(nested_query_scoring_results.status, result_status);
+ return result_proto;
+ }
- // Scores but does not rank the results.
- libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>>
- scoring_processor_or =
- ScoringProcessor::Create(scoring_spec, document_store_.get());
- if (!scoring_processor_or.ok()) {
- TransformStatus(scoring_processor_or.status(), result_status);
+ JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
+ qualified_id_join_index_.get(),
+ current_time_ms);
+    // Build a JoinChildrenFetcher that groups child documents by their
+    // joinable values.
+ libtextclassifier3::StatusOr<JoinChildrenFetcher> join_children_fetcher_or =
+ join_processor.GetChildrenFetcher(
+ search_spec.join_spec(),
+ std::move(nested_query_scoring_results.scored_document_hits));
+ if (!join_children_fetcher_or.ok()) {
+ TransformStatus(join_children_fetcher_or.status(), result_status);
+ return result_proto;
+ }
+ join_children_fetcher = std::make_unique<JoinChildrenFetcher>(
+ std::move(join_children_fetcher_or).ValueOrDie());
+
+ // Assign child's ResultAdjustmentInfo.
+ child_result_adjustment_info = std::make_unique<ResultAdjustmentInfo>(
+ join_spec.nested_spec().search_spec(),
+ join_spec.nested_spec().scoring_spec(),
+ join_spec.nested_spec().result_spec(), schema_store_.get(),
+ std::move(nested_query_scoring_results.query_terms));
+ }
+
+ // Process parent query
+ QueryStatsProto::SearchStats* parent_search_stats =
+ query_stats->mutable_parent_search_stats();
+ QueryScoringResults query_scoring_results = ProcessQueryAndScore(
+ search_spec, scoring_spec, result_spec, join_children_fetcher.get(),
+ current_time_ms, parent_search_stats);
+ // TODO(b/305098009): deprecate search-related flat fields in query_stats.
+ query_stats->set_num_terms(parent_search_stats->num_terms());
+ query_stats->set_parse_query_latency_ms(
+ parent_search_stats->parse_query_latency_ms());
+ query_stats->set_scoring_latency_ms(
+ parent_search_stats->scoring_latency_ms());
+ query_stats->set_num_documents_scored(
+ parent_search_stats->num_documents_scored());
+ if (!query_scoring_results.status.ok()) {
+ TransformStatus(query_scoring_results.status, result_status);
return result_proto;
}
- std::unique_ptr<ScoringProcessor> scoring_processor =
- std::move(scoring_processor_or).ValueOrDie();
- std::vector<ScoredDocumentHit> result_document_hits =
- scoring_processor->Score(std::move(query_results.root_iterator),
- performance_configuration_.num_to_score);
// Returns early for empty result
- if (result_document_hits.empty()) {
+ if (query_scoring_results.scored_document_hits.empty()) {
result_status->set_code(StatusProto::OK);
return result_proto;
}
- // Ranks and paginates results
- libtextclassifier3::StatusOr<PageResultState> page_result_state_or =
- result_state_manager_.RankAndPaginate(ResultState(
- std::move(result_document_hits), std::move(query_results.query_terms),
- search_spec, scoring_spec, result_spec));
- if (!page_result_state_or.ok()) {
- TransformStatus(page_result_state_or.status(), result_status);
- return result_proto;
- }
- PageResultState page_result_state =
- std::move(page_result_state_or).ValueOrDie();
-
- // Retrieves the document protos and snippets if requested
+ // Construct parent's result adjustment info.
+ auto parent_result_adjustment_info = std::make_unique<ResultAdjustmentInfo>(
+ search_spec, scoring_spec, result_spec, schema_store_.get(),
+ std::move(query_scoring_results.query_terms));
+
+ std::unique_ptr<ScoredDocumentHitsRanker> ranker;
+ if (join_children_fetcher != nullptr) {
+ std::unique_ptr<Timer> join_timer = clock_->GetNewTimer();
+ // Join 2 scored document hits
+ JoinProcessor join_processor(document_store_.get(), schema_store_.get(),
+ qualified_id_join_index_.get(),
+ current_time_ms);
+ libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>>
+ joined_result_document_hits_or = join_processor.Join(
+ join_spec, std::move(query_scoring_results.scored_document_hits),
+ *join_children_fetcher);
+ if (!joined_result_document_hits_or.ok()) {
+ TransformStatus(joined_result_document_hits_or.status(), result_status);
+ return result_proto;
+ }
+ std::vector<JoinedScoredDocumentHit> joined_result_document_hits =
+ std::move(joined_result_document_hits_or).ValueOrDie();
+
+ query_stats->set_join_latency_ms(join_timer->GetElapsedMilliseconds());
+
+ std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
+ // Ranks results
+ ranker = std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<JoinedScoredDocumentHit>>(
+ std::move(joined_result_document_hits),
+ /*is_descending=*/scoring_spec.order_by() ==
+ ScoringSpecProto::Order::DESC);
+ query_stats->set_ranking_latency_ms(
+ component_timer->GetElapsedMilliseconds());
+ } else {
+ // Non-join query
+ std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
+ // Ranks results
+ ranker = std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(query_scoring_results.scored_document_hits),
+ /*is_descending=*/scoring_spec.order_by() ==
+ ScoringSpecProto::Order::DESC);
+ query_stats->set_ranking_latency_ms(
+ component_timer->GetElapsedMilliseconds());
+ }
+
+ std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
+ // Caches the ranked results and retrieves the first page of document protos
+ // and snippets if requested.
auto result_retriever_or =
- ResultRetriever::Create(document_store_.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get());
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get());
if (!result_retriever_or.ok()) {
- result_state_manager_.InvalidateResultState(
- page_result_state.next_page_token);
TransformStatus(result_retriever_or.status(), result_status);
+ query_stats->set_document_retrieval_latency_ms(
+ component_timer->GetElapsedMilliseconds());
return result_proto;
}
- std::unique_ptr<ResultRetriever> result_retriever =
+ std::unique_ptr<ResultRetrieverV2> result_retriever =
std::move(result_retriever_or).ValueOrDie();
- libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>>
- results_or = result_retriever->RetrieveResults(page_result_state);
- if (!results_or.ok()) {
- result_state_manager_.InvalidateResultState(
- page_result_state.next_page_token);
- TransformStatus(results_or.status(), result_status);
+ libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
+ page_result_info_or = result_state_manager_->CacheAndRetrieveFirstPage(
+ std::move(ranker), std::move(parent_result_adjustment_info),
+ std::move(child_result_adjustment_info), result_spec,
+ *document_store_, *result_retriever, current_time_ms);
+ if (!page_result_info_or.ok()) {
+ TransformStatus(page_result_info_or.status(), result_status);
+ query_stats->set_document_retrieval_latency_ms(
+ component_timer->GetElapsedMilliseconds());
return result_proto;
}
- std::vector<SearchResultProto::ResultProto> results =
- std::move(results_or).ValueOrDie();
+ std::pair<uint64_t, PageResult> page_result_info =
+ std::move(page_result_info_or).ValueOrDie();
// Assembles the final search result proto
- result_proto.mutable_results()->Reserve(results.size());
- for (SearchResultProto::ResultProto& result : results) {
+ result_proto.mutable_results()->Reserve(
+ page_result_info.second.results.size());
+
+ int32_t child_count = 0;
+ for (SearchResultProto::ResultProto& result :
+ page_result_info.second.results) {
+ child_count += result.joined_results_size();
result_proto.mutable_results()->Add(std::move(result));
}
+
result_status->set_code(StatusProto::OK);
- if (page_result_state.next_page_token != kInvalidNextPageToken) {
- result_proto.set_next_page_token(page_result_state.next_page_token);
+ if (page_result_info.first != kInvalidNextPageToken) {
+ result_proto.set_next_page_token(page_result_info.first);
}
+
+ query_stats->set_document_retrieval_latency_ms(
+ component_timer->GetElapsedMilliseconds());
+ query_stats->set_num_results_returned_current_page(
+ result_proto.results_size());
+
+ query_stats->set_num_joined_results_returned_current_page(child_count);
+
+ query_stats->set_num_results_with_snippets(
+ page_result_info.second.num_results_with_snippets);
return result_proto;
}
+IcingSearchEngine::QueryScoringResults IcingSearchEngine::ProcessQueryAndScore(
+ const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec,
+ const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms,
+ QueryStatsProto::SearchStats* search_stats) {
+ search_stats->set_num_namespaces_filtered(
+ search_spec.namespace_filters_size());
+ search_stats->set_num_schema_types_filtered(
+ search_spec.schema_type_filters_size());
+ search_stats->set_query_length(search_spec.query().length());
+ search_stats->set_ranking_strategy(scoring_spec.rank_by());
+
+ std::unique_ptr<Timer> component_timer = clock_->GetNewTimer();
+
+ // Gets unordered results from query processor
+ auto query_processor_or = QueryProcessor::Create(
+ index_.get(), integer_index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(), schema_store_.get());
+ if (!query_processor_or.ok()) {
+ search_stats->set_parse_query_latency_ms(
+ component_timer->GetElapsedMilliseconds());
+ return QueryScoringResults(std::move(query_processor_or).status(),
+ /*query_terms_in=*/{},
+ /*scored_document_hits_in=*/{});
+ }
+ std::unique_ptr<QueryProcessor> query_processor =
+ std::move(query_processor_or).ValueOrDie();
+
+ auto ranking_strategy_or = GetRankingStrategyFromScoringSpec(scoring_spec);
+ libtextclassifier3::StatusOr<QueryResults> query_results_or;
+ if (ranking_strategy_or.ok()) {
+ query_results_or = query_processor->ParseSearch(
+ search_spec, ranking_strategy_or.ValueOrDie(), current_time_ms);
+ } else {
+ query_results_or = ranking_strategy_or.status();
+ }
+ search_stats->set_parse_query_latency_ms(
+ component_timer->GetElapsedMilliseconds());
+ if (!query_results_or.ok()) {
+ return QueryScoringResults(std::move(query_results_or).status(),
+ /*query_terms_in=*/{},
+ /*scored_document_hits_in=*/{});
+ }
+ QueryResults query_results = std::move(query_results_or).ValueOrDie();
+
+ // Set SearchStats related to QueryResults.
+ int term_count = 0;
+ for (const auto& section_and_terms : query_results.query_terms) {
+ term_count += section_and_terms.second.size();
+ }
+ search_stats->set_num_terms(term_count);
+
+ if (query_results.features_in_use.count(kNumericSearchFeature)) {
+ search_stats->set_is_numeric_query(true);
+ }
+
+ component_timer = clock_->GetNewTimer();
+ // Scores but does not rank the results.
+ libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>>
+ scoring_processor_or = ScoringProcessor::Create(
+ scoring_spec, document_store_.get(), schema_store_.get(),
+ current_time_ms, join_children_fetcher);
+ if (!scoring_processor_or.ok()) {
+ return QueryScoringResults(std::move(scoring_processor_or).status(),
+ std::move(query_results.query_terms),
+ /*scored_document_hits_in=*/{});
+ }
+ std::unique_ptr<ScoringProcessor> scoring_processor =
+ std::move(scoring_processor_or).ValueOrDie();
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ scoring_processor->Score(
+ std::move(query_results.root_iterator), result_spec.num_to_score(),
+ &query_results.query_term_iterators, search_stats);
+ search_stats->set_scoring_latency_ms(
+ component_timer->GetElapsedMilliseconds());
+
+ return QueryScoringResults(libtextclassifier3::Status::OK,
+ std::move(query_results.query_terms),
+ std::move(scored_document_hits));
+}
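+
+// A minimal usage sketch of the helper above, assuming the caller already
+// holds the shared lock on mutex_ (Search is the real caller):
+//
+//   QueryScoringResults results = ProcessQueryAndScore(
+//       search_spec, scoring_spec, result_spec,
+//       /*join_children_fetcher=*/nullptr, current_time_ms, search_stats);
+//   if (results.status.ok()) {
+//     // results.scored_document_hits is scored but unranked; rank it and
+//     // hand it to result_state_manager_->CacheAndRetrieveFirstPage(...).
+//   }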
+
SearchResultProto IcingSearchEngine::GetNextPage(uint64_t next_page_token) {
SearchResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
+ QueryStatsProto* query_stats = result_proto.mutable_query_stats();
+ query_stats->set_is_first_page(false);
+ std::unique_ptr<Timer> overall_timer = clock_->GetNewTimer();
// ResultStateManager has its own writer lock, so here we only need a reader
// lock for other components.
absl_ports::shared_lock l(&mutex_);
+ query_stats->set_lock_acquisition_latency_ms(
+ overall_timer->GetElapsedMilliseconds());
if (!initialized_) {
result_status->set_code(StatusProto::FAILED_PRECONDITION);
result_status->set_message("IcingSearchEngine has not been initialized!");
return result_proto;
}
- libtextclassifier3::StatusOr<PageResultState> page_result_state_or =
- result_state_manager_.GetNextPage(next_page_token);
-
- if (!page_result_state_or.ok()) {
- if (absl_ports::IsNotFound(page_result_state_or.status())) {
- // NOT_FOUND means an empty result.
- result_status->set_code(StatusProto::OK);
- } else {
- // Real error, pass up.
- TransformStatus(page_result_state_or.status(), result_status);
- }
- return result_proto;
- }
-
- PageResultState page_result_state =
- std::move(page_result_state_or).ValueOrDie();
-
- // Retrieves the document protos.
auto result_retriever_or =
- ResultRetriever::Create(document_store_.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get());
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get());
if (!result_retriever_or.ok()) {
TransformStatus(result_retriever_or.status(), result_status);
return result_proto;
}
- std::unique_ptr<ResultRetriever> result_retriever =
+ std::unique_ptr<ResultRetrieverV2> result_retriever =
std::move(result_retriever_or).ValueOrDie();
- libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>>
- results_or = result_retriever->RetrieveResults(page_result_state);
- if (!results_or.ok()) {
- TransformStatus(results_or.status(), result_status);
+ int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
+ libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
+ page_result_info_or = result_state_manager_->GetNextPage(
+ next_page_token, *result_retriever, current_time_ms);
+ if (!page_result_info_or.ok()) {
+ if (absl_ports::IsNotFound(page_result_info_or.status())) {
+ // NOT_FOUND means an empty result.
+ result_status->set_code(StatusProto::OK);
+ } else {
+ // Real error, pass up.
+ TransformStatus(page_result_info_or.status(), result_status);
+ }
return result_proto;
}
- std::vector<SearchResultProto::ResultProto> results =
- std::move(results_or).ValueOrDie();
+
+ std::pair<uint64_t, PageResult> page_result_info =
+ std::move(page_result_info_or).ValueOrDie();
+ query_stats->set_requested_page_size(
+ page_result_info.second.requested_page_size);
// Assembles the final search result proto
- result_proto.mutable_results()->Reserve(results.size());
- for (SearchResultProto::ResultProto& result : results) {
+ result_proto.mutable_results()->Reserve(
+ page_result_info.second.results.size());
+
+ int32_t child_count = 0;
+ for (SearchResultProto::ResultProto& result :
+ page_result_info.second.results) {
+ child_count += result.joined_results_size();
result_proto.mutable_results()->Add(std::move(result));
}
result_status->set_code(StatusProto::OK);
- if (result_proto.results_size() > 0) {
- result_proto.set_next_page_token(next_page_token);
- }
+ if (page_result_info.first != kInvalidNextPageToken) {
+ result_proto.set_next_page_token(page_result_info.first);
+ }
+
+ // The only thing that we're doing is document retrieval. So document
+ // retrieval latency and overall latency are the same and can use the same
+ // timer.
+ query_stats->set_document_retrieval_latency_ms(
+ overall_timer->GetElapsedMilliseconds());
+ query_stats->set_latency_ms(overall_timer->GetElapsedMilliseconds());
+ query_stats->set_num_results_returned_current_page(
+ result_proto.results_size());
+ query_stats->set_num_results_with_snippets(
+ page_result_info.second.num_results_with_snippets);
+ query_stats->set_num_joined_results_returned_current_page(child_count);
+
return result_proto;
}
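
// A hypothetical paging loop over this API; a zero/invalid next-page token
// marks the end of results, per the comments above:
//
//   SearchResultProto page = icing.Search(search_spec, scoring_spec,
//                                         result_spec);
//   while (page.status().code() == StatusProto::OK &&
//          page.next_page_token() != kInvalidNextPageToken) {
//     page = icing.GetNextPage(page.next_page_token());
//   }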
@@ -1104,10 +2332,11 @@ void IcingSearchEngine::InvalidateNextPageToken(uint64_t next_page_token) {
ICING_LOG(ERROR) << "IcingSearchEngine has not been initialized!";
return;
}
- result_state_manager_.InvalidateResultState(next_page_token);
+ result_state_manager_->InvalidateResultState(next_page_token);
}
-libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() {
+libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
+IcingSearchEngine::OptimizeDocumentStore(OptimizeStatsProto* optimize_stats) {
// Gets the current directory path and an empty tmp directory path for
// document store optimization.
const std::string current_document_dir =
@@ -1123,17 +2352,21 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() {
}
// Copies valid document data to tmp directory
- auto optimize_status = document_store_->OptimizeInto(temporary_document_dir);
+ libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
+ optimize_result_or = document_store_->OptimizeInto(
+ temporary_document_dir, language_segmenter_.get(), optimize_stats);
// Handles error if any
- if (!optimize_status.ok()) {
+ if (!optimize_result_or.ok()) {
filesystem_->DeleteDirectoryRecursively(temporary_document_dir.c_str());
return absl_ports::Annotate(
absl_ports::AbortedError("Failed to optimize document store"),
- optimize_status.error_message());
+ optimize_result_or.status().error_message());
}
- // Resets before swapping
+ // result_state_manager_ depends on document_store_. So we need to reset it at
+ // the same time that we reset the document_store_.
+ result_state_manager_.reset();
document_store_.reset();
// When swapping files, always put the current working directory at the
@@ -1146,24 +2379,35 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() {
// Ensures that current directory is still present.
if (!filesystem_->CreateDirectoryRecursively(
current_document_dir.c_str())) {
+ // Can't even create the old directory. Mark as uninitialized and return
+ // INTERNAL.
+ initialized_ = false;
return absl_ports::InternalError(
"Failed to create file directory for document store");
}
// Tries to rebuild document store if swapping fails, to avoid leaving the
// system in the broken state for future operations.
- auto document_store_or =
- DocumentStore::Create(filesystem_.get(), current_document_dir,
- clock_.get(), schema_store_.get());
+ auto create_result_or = DocumentStore::Create(
+ filesystem_.get(), current_document_dir, clock_.get(),
+ schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false,
+ options_.document_store_namespace_id_fingerprint(),
+ options_.pre_mapping_fbv(), options_.use_persistent_hash_map(),
+ options_.compression_level(), /*initialize_stats=*/nullptr);
// TODO(b/144458732): Implement a more robust version of
// TC_ASSIGN_OR_RETURN that can support error logging.
- if (!document_store_or.ok()) {
+ if (!create_result_or.ok()) {
+ // Unable to create DocumentStore from the old file. Mark as uninitialized
+ // and return INTERNAL.
+ initialized_ = false;
ICING_LOG(ERROR) << "Failed to create document store instance";
return absl_ports::Annotate(
absl_ports::InternalError("Failed to create document store instance"),
- document_store_or.status().error_message());
+ create_result_or.status().error_message());
}
- document_store_ = std::move(document_store_or).ValueOrDie();
+ document_store_ = std::move(create_result_or.ValueOrDie().document_store);
+ result_state_manager_ = std::make_unique<ResultStateManager>(
+ performance_configuration_.max_num_total_hits, *document_store_);
// Potential data loss
// TODO(b/147373249): Find a way to detect true data loss error
@@ -1172,13 +2416,25 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() {
}
// Recreates the doc store instance
- ICING_ASSIGN_OR_RETURN(
- document_store_,
- DocumentStore::Create(filesystem_.get(), current_document_dir,
- clock_.get(), schema_store_.get()),
- absl_ports::InternalError(
- "Document store has been optimized, but a valid document store "
- "instance can't be created"));
+ auto create_result_or = DocumentStore::Create(
+ filesystem_.get(), current_document_dir, clock_.get(),
+ schema_store_.get(), /*force_recovery_and_revalidate_documents=*/false,
+ options_.document_store_namespace_id_fingerprint(),
+ options_.pre_mapping_fbv(), options_.use_persistent_hash_map(),
+ options_.compression_level(), /*initialize_stats=*/nullptr);
+ if (!create_result_or.ok()) {
+ // Unable to create DocumentStore from the new file. Mark as uninitialized
+ // and return INTERNAL.
+ initialized_ = false;
+ return absl_ports::InternalError(
+ "Document store has been optimized, but a valid document store "
+ "instance can't be created");
+ }
+ DocumentStore::CreateResult create_result =
+ std::move(create_result_or).ValueOrDie();
+ document_store_ = std::move(create_result.document_store);
+ result_state_manager_ = std::make_unique<ResultStateManager>(
+ performance_configuration_.max_num_total_hits, *document_store_);
// Deletes tmp directory
if (!filesystem_->DeleteDirectoryRecursively(
@@ -1187,26 +2443,70 @@ libtextclassifier3::Status IcingSearchEngine::OptimizeDocumentStore() {
"delete temporary file directory";
}
- return libtextclassifier3::Status::OK;
+ // Since we created the new (optimized) document store with a correct
+ // PersistToDisk call, we shouldn't see data loss or need to regenerate
+ // derived files. Therefore, if we really encounter any of these situations,
+ // return DataLossError to let the caller rebuild the index.
+ if (create_result.data_loss != DataLoss::NONE ||
+ create_result.derived_files_regenerated) {
+ return absl_ports::DataLossError(
+ "Unexpected data loss or derived files regenerated for new document "
+ "store");
+ }
+
+ return optimize_result_or;
}
-libtextclassifier3::Status IcingSearchEngine::RestoreIndex() {
+IcingSearchEngine::IndexRestorationResult
+IcingSearchEngine::RestoreIndexIfNeeded() {
DocumentId last_stored_document_id =
document_store_->last_added_document_id();
-
- if (last_stored_document_id == kInvalidDocumentId) {
- // Nothing to index
- return libtextclassifier3::Status::OK;
+ if (last_stored_document_id == index_->last_added_document_id() &&
+ last_stored_document_id == integer_index_->last_added_document_id() &&
+ last_stored_document_id ==
+ qualified_id_join_index_->last_added_document_id()) {
+ // No need to recover.
+ return {libtextclassifier3::Status::OK, false, false, false};
}
- ICING_ASSIGN_OR_RETURN(
- std::unique_ptr<IndexProcessor> index_processor,
- IndexProcessor::Create(schema_store_.get(), language_segmenter_.get(),
- normalizer_.get(), index_.get(),
- CreateIndexProcessorOptions(options_)));
-
- for (DocumentId document_id = kMinDocumentId;
- document_id <= last_stored_document_id; document_id++) {
+ if (last_stored_document_id == kInvalidDocumentId) {
+ // Document store is empty but index is not. Clear the index.
+ return {ClearAllIndices(), false, false, false};
+ }
+
+ // Truncate indices first.
+ auto truncate_result_or = TruncateIndicesTo(last_stored_document_id);
+ if (!truncate_result_or.ok()) {
+ return {std::move(truncate_result_or).status(), false, false, false};
+ }
+ TruncateIndexResult truncate_result =
+ std::move(truncate_result_or).ValueOrDie();
+
+ if (truncate_result.first_document_to_reindex > last_stored_document_id) {
+ // Nothing to restore. Just return.
+ return {libtextclassifier3::Status::OK, false, false, false};
+ }
+
+ auto data_indexing_handlers_or = CreateDataIndexingHandlers();
+ if (!data_indexing_handlers_or.ok()) {
+ return {data_indexing_handlers_or.status(),
+ truncate_result.index_needed_restoration,
+ truncate_result.integer_index_needed_restoration,
+ truncate_result.qualified_id_join_index_needed_restoration};
+ }
+ // By using recovery_mode for IndexProcessor, we're able to replay documents
+ // from a smaller document id, and it will skip documents that have already
+ // been indexed.
+ IndexProcessor index_processor(
+ std::move(data_indexing_handlers_or).ValueOrDie(), clock_.get(),
+ /*recovery_mode=*/true);
+
+ ICING_VLOG(1) << "Restoring index by replaying documents from document id "
+ << truncate_result.first_document_to_reindex
+ << " to document id " << last_stored_document_id;
+ libtextclassifier3::Status overall_status;
+ for (DocumentId document_id = truncate_result.first_document_to_reindex;
+ document_id <= last_stored_document_id; ++document_id) {
libtextclassifier3::StatusOr<DocumentProto> document_or =
document_store_->Get(document_id);
@@ -1217,15 +2517,45 @@ libtextclassifier3::Status IcingSearchEngine::RestoreIndex() {
continue;
} else {
// Returns other errors
- return document_or.status();
+ return {document_or.status(), truncate_result.index_needed_restoration,
+ truncate_result.integer_index_needed_restoration,
+ truncate_result.qualified_id_join_index_needed_restoration};
}
}
+ DocumentProto document(std::move(document_or).ValueOrDie());
+
+ libtextclassifier3::StatusOr<TokenizedDocument> tokenized_document_or =
+ TokenizedDocument::Create(schema_store_.get(),
+ language_segmenter_.get(),
+ std::move(document));
+ if (!tokenized_document_or.ok()) {
+ return {tokenized_document_or.status(),
+ truncate_result.index_needed_restoration,
+ truncate_result.integer_index_needed_restoration,
+ truncate_result.qualified_id_join_index_needed_restoration};
+ }
+ TokenizedDocument tokenized_document(
+ std::move(tokenized_document_or).ValueOrDie());
- ICING_RETURN_IF_ERROR(
- index_processor->IndexDocument(document_or.ValueOrDie(), document_id));
+ libtextclassifier3::Status status =
+ index_processor.IndexDocument(tokenized_document, document_id);
+ if (!status.ok()) {
+ if (!absl_ports::IsDataLoss(status)) {
+ // Real error. Stop recovering and pass it up.
+ return {status, truncate_result.index_needed_restoration,
+ truncate_result.integer_index_needed_restoration,
+ truncate_result.qualified_id_join_index_needed_restoration};
+ }
+ // Just a data loss. Keep trying to add the remaining docs, but report the
+ // data loss when we're done.
+ overall_status = status;
+ }
}
- return libtextclassifier3::Status::OK;
+ return {overall_status, truncate_result.index_needed_restoration,
+ truncate_result.integer_index_needed_restoration,
+ truncate_result.qualified_id_join_index_needed_restoration};
}
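
// A worked example of the replay above, with illustrative values: if the
// document store's last added document id is 10 while the term index stops at
// 7 and the integer index at 9, TruncateIndicesTo reports
// first_document_to_reindex == 8, and documents 8 through 10 are re-fed
// through the IndexProcessor in recovery mode, which skips hits that already
// exist.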
libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() {
@@ -1251,30 +2581,199 @@ libtextclassifier3::StatusOr<bool> IcingSearchEngine::LostPreviousSchema() {
return document_store_->last_added_document_id() != kInvalidDocumentId;
}
+libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DataIndexingHandler>>>
+IcingSearchEngine::CreateDataIndexingHandlers() {
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+
+ // Term index handler
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<TermIndexingHandler> term_indexing_handler,
+ TermIndexingHandler::Create(
+ clock_.get(), normalizer_.get(), index_.get(),
+ options_.build_property_existence_metadata_hits()));
+ handlers.push_back(std::move(term_indexing_handler));
+
+ // Integer index handler
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<IntegerSectionIndexingHandler>
+ integer_section_indexing_handler,
+ IntegerSectionIndexingHandler::Create(
+ clock_.get(), integer_index_.get()));
+ handlers.push_back(std::move(integer_section_indexing_handler));
+
+ // Qualified id join index handler
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler>
+ qualified_id_join_indexing_handler,
+ QualifiedIdJoinIndexingHandler::Create(
+ clock_.get(), document_store_.get(), qualified_id_join_index_.get()));
+ handlers.push_back(std::move(qualified_id_join_indexing_handler));
+
+ return handlers;
+}
+
+libtextclassifier3::StatusOr<IcingSearchEngine::TruncateIndexResult>
+IcingSearchEngine::TruncateIndicesTo(DocumentId last_stored_document_id) {
+ // Attempt to truncate term index.
+ // TruncateTo ensures that the index does not hold any data that is not
+ // present in the ground truth. If the document store lost some documents,
+ // TruncateTo will ensure that the index does not contain any hits from those
+ // lost documents. If the index does not contain any hits for documents with
+ // document id greater than last_stored_document_id, then TruncateTo will have
+ // no effect.
+ ICING_RETURN_IF_ERROR(index_->TruncateTo(last_stored_document_id));
+
+ // Get last indexed document id for term index after truncating.
+ DocumentId term_index_last_added_document_id =
+ index_->last_added_document_id();
+ DocumentId first_document_to_reindex =
+ (term_index_last_added_document_id != kInvalidDocumentId)
+ ? term_index_last_added_document_id + 1
+ : kMinDocumentId;
+ bool index_needed_restoration =
+ (last_stored_document_id != term_index_last_added_document_id);
+
+ // Attempt to truncate integer index.
+ bool integer_index_needed_restoration = false;
+ DocumentId integer_index_last_added_document_id =
+ integer_index_->last_added_document_id();
+ if (integer_index_last_added_document_id == kInvalidDocumentId ||
+ last_stored_document_id > integer_index_last_added_document_id) {
+ // If last_stored_document_id is greater than
+ // integer_index_last_added_document_id, then we only have to replay docs
+ // starting from integer_index_last_added_document_id + 1. Also use std::min
+ // since we might need to replay even smaller doc ids for term index.
+ integer_index_needed_restoration = true;
+ if (integer_index_last_added_document_id != kInvalidDocumentId) {
+ first_document_to_reindex = std::min(
+ first_document_to_reindex, integer_index_last_added_document_id + 1);
+ } else {
+ first_document_to_reindex = kMinDocumentId;
+ }
+ } else if (last_stored_document_id < integer_index_last_added_document_id) {
+ // Clear the entire integer index if last_stored_document_id is smaller than
+ // integer_index_last_added_document_id, because there is no way to remove
+ // data with doc_id > last_stored_document_id from integer index and we have
+ // to rebuild.
+ ICING_RETURN_IF_ERROR(integer_index_->Clear());
+
+ // Since the entire integer index is discarded, we start to rebuild it by
+ // setting first_document_to_reindex to kMinDocumentId.
+ integer_index_needed_restoration = true;
+ first_document_to_reindex = kMinDocumentId;
+ }
+
+ // Attempt to truncate qualified id join index
+ bool qualified_id_join_index_needed_restoration = false;
+ DocumentId qualified_id_join_index_last_added_document_id =
+ qualified_id_join_index_->last_added_document_id();
+ if (qualified_id_join_index_last_added_document_id == kInvalidDocumentId ||
+ last_stored_document_id >
+ qualified_id_join_index_last_added_document_id) {
+ // If last_stored_document_id is greater than
+ // qualified_id_join_index_last_added_document_id, then we only have to
+ // replay docs starting from (qualified_id_join_index_last_added_document_id
+ // + 1). Also use std::min since we might need to replay even smaller doc
+ // ids for other components.
+ qualified_id_join_index_needed_restoration = true;
+ if (qualified_id_join_index_last_added_document_id != kInvalidDocumentId) {
+ first_document_to_reindex =
+ std::min(first_document_to_reindex,
+ qualified_id_join_index_last_added_document_id + 1);
+ } else {
+ first_document_to_reindex = kMinDocumentId;
+ }
+ } else if (last_stored_document_id <
+ qualified_id_join_index_last_added_document_id) {
+ // Clear the entire qualified id join index if last_stored_document_id is
+ // smaller than qualified_id_join_index_last_added_document_id, because
+ // there is no way to remove data with doc_id > last_stored_document_id from
+ // join index efficiently and we have to rebuild.
+ ICING_RETURN_IF_ERROR(qualified_id_join_index_->Clear());
+
+ // Since the entire qualified id join index is discarded, we start to
+ // rebuild it by setting first_document_to_reindex to kMinDocumentId.
+ qualified_id_join_index_needed_restoration = true;
+ first_document_to_reindex = kMinDocumentId;
+ }
+
+ return TruncateIndexResult(first_document_to_reindex,
+ index_needed_restoration,
+ integer_index_needed_restoration,
+ qualified_id_join_index_needed_restoration);
+}
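+
+// A worked example of the truncation above, with illustrative values: given
+// last_stored_document_id == 20, a term index whose last added id is 18, an
+// integer index at 25, and a join index at 20:
+//   - term index: 18 < 20, so nothing is truncated; first_document_to_reindex
+//     becomes 19 and index_needed_restoration is true;
+//   - integer index: 25 > 20 and cannot be truncated in place, so it is
+//     cleared and first_document_to_reindex falls back to kMinDocumentId;
+//   - join index: already at 20, so it needs no restoration.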
+
+libtextclassifier3::Status IcingSearchEngine::DiscardDerivedFiles() {
+ if (schema_store_ != nullptr || document_store_ != nullptr ||
+ index_ != nullptr || integer_index_ != nullptr ||
+ qualified_id_join_index_ != nullptr) {
+ return absl_ports::FailedPreconditionError(
+ "Cannot discard derived files while having valid instances");
+ }
+
+ // Schema store
+ ICING_RETURN_IF_ERROR(
+ SchemaStore::DiscardDerivedFiles(filesystem_.get(), options_.base_dir()));
+
+ // Document store
+ ICING_RETURN_IF_ERROR(DocumentStore::DiscardDerivedFiles(
+ filesystem_.get(), options_.base_dir()));
+
+ // Term index
+ if (!filesystem_->DeleteDirectoryRecursively(
+ MakeIndexDirectoryPath(options_.base_dir()).c_str())) {
+ return absl_ports::InternalError("Failed to discard index");
+ }
+
+ // Integer index
+ if (!filesystem_->DeleteDirectoryRecursively(
+ MakeIntegerIndexWorkingPath(options_.base_dir()).c_str())) {
+ return absl_ports::InternalError("Failed to discard integer index");
+ }
+
+ // Qualified id join index
+ if (!filesystem_->DeleteDirectoryRecursively(
+ MakeQualifiedIdJoinIndexWorkingPath(options_.base_dir()).c_str())) {
+ return absl_ports::InternalError(
+ "Failed to discard qualified id join index");
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status IcingSearchEngine::ClearSearchIndices() {
+ ICING_RETURN_IF_ERROR(index_->Reset());
+ ICING_RETURN_IF_ERROR(integer_index_->Clear());
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status IcingSearchEngine::ClearJoinIndices() {
+ return qualified_id_join_index_->Clear();
+}
+
+libtextclassifier3::Status IcingSearchEngine::ClearAllIndices() {
+ ICING_RETURN_IF_ERROR(ClearSearchIndices());
+ ICING_RETURN_IF_ERROR(ClearJoinIndices());
+ return libtextclassifier3::Status::OK;
+}
+
ResetResultProto IcingSearchEngine::Reset() {
+ absl_ports::unique_lock l(&mutex_);
+ return ResetInternal();
+}
+
+ResetResultProto IcingSearchEngine::ResetInternal() {
ICING_VLOG(1) << "Resetting IcingSearchEngine";
ResetResultProto result_proto;
StatusProto* result_status = result_proto.mutable_status();
- int64_t before_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
-
+ initialized_ = false;
+ ResetMembers();
if (!filesystem_->DeleteDirectoryRecursively(options_.base_dir().c_str())) {
- int64_t after_size = filesystem_->GetDiskUsage(options_.base_dir().c_str());
- if (after_size != before_size) {
- // Our filesystem doesn't atomically delete. If we have a discrepancy in
- // size, then that means we may have deleted some files, but not others.
- // So our data is in an invalid state now.
- result_status->set_code(StatusProto::INTERNAL);
- return result_proto;
- }
-
- result_status->set_code(StatusProto::ABORTED);
+ result_status->set_code(StatusProto::INTERNAL);
return result_proto;
}
- absl_ports::unique_lock l(&mutex_);
- initialized_ = false;
if (InternalInitialize().status().code() != StatusProto::OK) {
// We shouldn't hit the following Initialize errors:
// NOT_FOUND: all data was cleared, we aren't expecting anything
@@ -1295,5 +2794,54 @@ ResetResultProto IcingSearchEngine::Reset() {
return result_proto;
}
+SuggestionResponse IcingSearchEngine::SearchSuggestions(
+ const SuggestionSpecProto& suggestion_spec) {
+ // TODO(b/146008613) Explore ideas to make this function read-only.
+ absl_ports::unique_lock l(&mutex_);
+ SuggestionResponse response;
+ StatusProto* response_status = response.mutable_status();
+ if (!initialized_) {
+ response_status->set_code(StatusProto::FAILED_PRECONDITION);
+ response_status->set_message("IcingSearchEngine has not been initialized!");
+ return response;
+ }
+
+ libtextclassifier3::Status status =
+ ValidateSuggestionSpec(suggestion_spec, performance_configuration_);
+ if (!status.ok()) {
+ TransformStatus(status, response_status);
+ return response;
+ }
+
+ // Create the suggestion processor.
+ auto suggestion_processor_or = SuggestionProcessor::Create(
+ index_.get(), integer_index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(), schema_store_.get());
+ if (!suggestion_processor_or.ok()) {
+ TransformStatus(suggestion_processor_or.status(), response_status);
+ return response;
+ }
+ std::unique_ptr<SuggestionProcessor> suggestion_processor =
+ std::move(suggestion_processor_or).ValueOrDie();
+
+ // Run suggestion based on given SuggestionSpec.
+ int64_t current_time_ms = clock_->GetSystemTimeMilliseconds();
+ libtextclassifier3::StatusOr<std::vector<TermMetadata>> terms_or =
+ suggestion_processor->QuerySuggestions(suggestion_spec, current_time_ms);
+ if (!terms_or.ok()) {
+ TransformStatus(terms_or.status(), response_status);
+ return response;
+ }
+
+ // Converts vector<TermMetadata> into the final SuggestionResponse proto.
+ for (TermMetadata& term : terms_or.ValueOrDie()) {
+ SuggestionResponse::Suggestion suggestion;
+ suggestion.set_query(std::move(term.content));
+ response.mutable_suggestions()->Add(std::move(suggestion));
+ }
+ response_status->set_code(StatusProto::OK);
+ return response;
+}
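+
+// A hedged caller sketch for the API above; the SuggestionSpecProto field
+// names used here (prefix, num_to_return) are assumptions:
+//
+//   SuggestionSpecProto spec;
+//   spec.set_prefix("fo");
+//   spec.set_num_to_return(10);
+//   SuggestionResponse response = icing.SearchSuggestions(spec);
+//   for (const SuggestionResponse::Suggestion& s : response.suggestions()) {
+//     // s.query() holds one suggested query string, e.g. "foo".
+//   }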
+
} // namespace lib
} // namespace icing
diff --git a/icing/icing-search-engine.h b/icing/icing-search-engine.h
index 6ae76d7..d316350 100644
--- a/icing/icing-search-engine.h
+++ b/icing/icing-search-engine.h
@@ -19,26 +19,38 @@
#include <memory>
#include <string>
#include <string_view>
+#include <utility>
+#include <vector>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/mutex.h"
#include "icing/absl_ports/thread_annotations.h"
#include "icing/file/filesystem.h"
+#include "icing/index/data-indexing-handler.h"
#include "icing/index/index.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/jni/jni-cache.h"
+#include "icing/join/join-children-fetcher.h"
+#include "icing/join/qualified-id-join-index.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/performance-configuration.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
+#include "icing/proto/logging.pb.h"
#include "icing/proto/optimize.pb.h"
#include "icing/proto/persist.pb.h"
#include "icing/proto/reset.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/query/query-terms.h"
#include "icing/result/result-state-manager.h"
#include "icing/schema/schema-store.h"
+#include "icing/scoring/scored-document-hit.h"
#include "icing/store/document-store.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer.h"
@@ -51,16 +63,6 @@ namespace lib {
// TODO(cassiewang) Top-level comments and links to design-doc.
class IcingSearchEngine {
public:
- struct Header {
- static constexpr int32_t kMagic = 0x6e650d0a;
-
- // Holds the magic as a quick sanity check against file corruption.
- int32_t magic;
-
- // Checksum of the IcingSearchEngine's sub-component's checksums.
- uint32_t checksum;
- };
-
// Note: It is only required to provide a pointer to a valid instance of
// JniCache if this instance needs to perform reverse-jni calls. Users on
// Linux and iOS should always provide a nullptr.
@@ -128,12 +130,18 @@ class IcingSearchEngine {
//
// Returns:
// OK on success
+ // ALREADY_EXISTS if 'new_schema' contains multiple definitions of the same
+ // type or contains a type that has multiple properties with the same
+ // name.
// INVALID_ARGUMENT if 'new_schema' is invalid
// FAILED_PRECONDITION if 'new_schema' is incompatible, or IcingSearchEngine
// has not been initialized yet.
// INTERNAL_ERROR if Icing failed to store the new schema or upgrade
// existing data based on the new schema. Using Icing beyond this error is
// undefined and may cause crashes.
+ // DATA_LOSS_ERROR if 'new_schema' requires the index to be rebuilt and an
+ // IO error leads to some documents being excluded from the index. These
+ // documents will still be retrievable via Get, but won't match queries.
//
// TODO(cassiewang) Figure out, document (and maybe even enforce) the best
// way ordering of calls between Initialize() and SetSchema(), both when
@@ -180,10 +188,14 @@ class IcingSearchEngine {
//
// Returns:
// OK on success
+ // OUT_OF_SPACE if exceeds maximum number of allowed documents
// FAILED_PRECONDITION if a schema has not been set yet, IcingSearchEngine
// has not been initialized yet.
// NOT_FOUND if there is no SchemaTypeConfig in the SchemaProto that matches
// the document's schema
+ // DATA_LOSS if an IO error occurs while merging document into the index and
+ // the index is lost. These documents will still be retrievable via Get,
+ // but won't match queries.
// INTERNAL_ERROR on IO error
PutResultProto Put(DocumentProto&& document) ICING_LOCKS_EXCLUDED(mutex_);
@@ -203,7 +215,17 @@ class IcingSearchEngine {
// NOT_FOUND if the key doesn't exist or doc has been deleted
// FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
// INTERNAL_ERROR on IO error
- GetResultProto Get(std::string_view name_space, std::string_view uri);
+ GetResultProto Get(std::string_view name_space, std::string_view uri,
+ const GetResultSpecProto& result_spec);
+
+ // Reports usage. The corresponding usage scores of the specified document in
+ // the report will be updated.
+ //
+ // Returns:
+ // OK on success
+ //   NOT_FOUND if the [namespace + uri] key in the report doesn't exist
+ // INTERNAL_ERROR on I/O errors.
+ ReportUsageResultProto ReportUsage(const UsageReport& usage_report);
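+
+ // A sketch of a hypothetical report; the UsageReport field names below are
+ // assumptions:
+ //
+ //   UsageReport report;
+ //   report.set_document_namespace("email");
+ //   report.set_document_uri("uri1");
+ //   report.set_usage_type(UsageReport::USAGE_TYPE1);
+ //   ReportUsageResultProto result = icing.ReportUsage(report);
+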
// Returns all the namespaces that have at least one valid document in it.
//
@@ -256,10 +278,26 @@ class IcingSearchEngine {
DeleteBySchemaTypeResultProto DeleteBySchemaType(std::string_view schema_type)
ICING_LOCKS_EXCLUDED(mutex_);
+ // Deletes all Documents that match the query specified in search_spec.
+ // Delete changes are automatically applied to disk; callers can also call
+ // PersistToDisk() to flush changes immediately.
+ //
+ // NOTE: Space is not reclaimed for deleted documents until Optimize() is
+ // called.
+ //
+ // Returns:
+ // OK on success
+ // NOT_FOUND if the query doesn't match any documents
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
+ // INTERNAL_ERROR on IO error
+ DeleteByQueryResultProto DeleteByQuery(
+ const SearchSpecProto& search_spec,
+ bool return_deleted_document_info = false) ICING_LOCKS_EXCLUDED(mutex_);
+
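+ // For example, a hypothetical caller deleting all documents that match a
+ // term query (the query string is illustrative):
+ //
+ //   SearchSpecProto spec;
+ //   spec.set_query("spam");
+ //   spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ //   DeleteByQueryResultProto result = icing.DeleteByQuery(spec);
+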
// Retrieves, scores, ranks, and returns the results according to the specs.
// Results can be empty. If there are multiple pages of results,
- // SearchResultProto.next_page_token will be populated and that can be used to
- // fetch more pages via GetNextPage() method. Clients should call
+ // SearchResultProto.next_page_token will be set to a non-zero token and can
+ // be used to fetch more pages via GetNextPage() method. Clients should call
// InvalidateNextPageToken() after they get the pages they need to release
// result cache in memory. Please refer to each proto file for spec
// definitions.
@@ -275,8 +313,24 @@ class IcingSearchEngine {
const ResultSpecProto& result_spec)
ICING_LOCKS_EXCLUDED(mutex_);
+ // Retrieves, scores, ranks and returns the suggested query string according
+ // to the specs. Results can be empty.
+ //
+ // Returns a SuggestionResponse with status:
+ // OK with results on success
+ // INVALID_ARGUMENT if any of specs is invalid
+ // FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
+ // INTERNAL_ERROR on any other errors
+ SuggestionResponse SearchSuggestions(
+ const SuggestionSpecProto& suggestion_spec) ICING_LOCKS_EXCLUDED(mutex_);
+
// Fetches the next page of results of a previously executed query. Results
- // can be empty if next-page token is invalid or all pages have been returned.
+ // can be empty if next-page token is invalid. Invalid next page tokens are
+ // tokens that are either zero or were previously passed to
+ // InvalidateNextPageToken. If there are pages of results remaining after the
+ // one retrieved by this call, SearchResultProto.next_page_token will be
+ // set to a non-zero token and can be used to fetch more pages via
+ // GetNextPage() method.
//
// Returns a SearchResultProto with status:
// OK with results on success
@@ -288,12 +342,26 @@ class IcingSearchEngine {
// Invalidates the next-page token so that no more results of the related
// query can be returned.
- void InvalidateNextPageToken(uint64_t next_page_token);
+ void InvalidateNextPageToken(uint64_t next_page_token)
+ ICING_LOCKS_EXCLUDED(mutex_);
// Makes sure that every update/delete received till this point is flushed
// to disk. If the app crashes after a call to PersistToDisk(), Icing
// would be able to fully recover all data written up to this point.
//
+ // If persist_type is PersistType::LITE, then only the ground truth will be
+ // synced. This should be relatively lightweight to do (order of microseconds)
+ // and ensures that there will be no data loss. At worst, Icing may need to
+ // recover internal data structures by replaying the document log upon the
+ // next startup. Clients should call PersistToDisk(LITE) after each batch of
+ // mutations.
+ //
+ // If persist_type is PersistType::FULL, then all internal data structures in
+ // Icing will be synced. This is a heavier operation (order of milliseconds).
+ // It ensures that Icing will not need to recover internal data structures
+ // upon the next startup. Clients should call PersistToDisk(FULL) before their
+ // process dies.
+ //
// NOTE: It is not necessary to call PersistToDisk() to read back data
// that was recently written. All read APIs will include the most recent
// updates/deletes regardless of the data being flushed to disk.
@@ -302,7 +370,8 @@ class IcingSearchEngine {
// OK on success
// FAILED_PRECONDITION IcingSearchEngine has not been initialized yet
// INTERNAL on I/O error
- PersistToDiskResultProto PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_);
+ PersistToDiskResultProto PersistToDisk(PersistType::Code persist_type)
+ ICING_LOCKS_EXCLUDED(mutex_);
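+
+ // For instance, following the LITE/FULL guidance above, a client might do:
+ //
+ //   icing.Put(std::move(document));
+ //   icing.PersistToDisk(PersistType::LITE);  // cheap, after each batch
+ //   ...
+ //   icing.PersistToDisk(PersistType::FULL);  // heavier, before process exit
+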
// Allows Icing to run tasks that are too expensive and/or unnecessary to be
// executed in real-time, but are useful to keep it fast and be
@@ -338,6 +407,16 @@ class IcingSearchEngine {
// INTERNAL_ERROR on IO error
GetOptimizeInfoResultProto GetOptimizeInfo() ICING_LOCKS_EXCLUDED(mutex_);
+ // Calculates the StorageInfo for Icing.
+ //
+ // If an IO error occurs while trying to calculate the value for a field, then
+ // that field will be set to -1.
+ StorageInfoResultProto GetStorageInfo() ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Get debug information for Icing.
+ DebugInfoResultProto GetDebugInfo(DebugInfoVerbosity::Code verbosity)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
// Clears all data from Icing and re-initializes. Clients DO NOT need to call
// Initialize again.
//
@@ -354,6 +433,7 @@ class IcingSearchEngine {
protected:
IcingSearchEngine(IcingSearchEngineOptions options,
std::unique_ptr<const Filesystem> filesystem,
+ std::unique_ptr<const IcingFilesystem> icing_filesystem,
std::unique_ptr<Clock> clock,
std::unique_ptr<const JniCache> jni_cache = nullptr);
@@ -364,15 +444,19 @@ class IcingSearchEngine {
bool initialized_ ICING_GUARDED_BY(mutex_) = false;
// Abstraction for accessing time values.
- std::unique_ptr<Clock> clock_;
+ const std::unique_ptr<const Clock> clock_;
// Provides key thresholds that affects the running time and memory of major
// components in Icing search engine.
- PerformanceConfiguration performance_configuration_;
-
- // Used to manage pagination state of query results. A lock is not needed here
- // because ResultStateManager has its own reader-writer lock.
- ResultStateManager result_state_manager_;
+ const PerformanceConfiguration performance_configuration_;
+
+ // Used to manage pagination state of query results. Even though
+ // ResultStateManager has its own reader-writer lock, mutex_ must still be
+ // acquired first in order to adhere to the global lock ordering:
+ // 1. mutex_
+ // 2. result_state_manager_.lock_
+ std::unique_ptr<ResultStateManager> result_state_manager_
+ ICING_GUARDED_BY(mutex_);
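+
+ // For example, a reader honoring that ordering (as GetNextPage does):
+ //
+ //   absl_ports::shared_lock l(&mutex_);       // 1. acquire mutex_ first
+ //   result_state_manager_->GetNextPage(...);  // 2. then its internal lock
+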
// Used to provide reader and writer locks
absl_ports::shared_mutex mutex_;
@@ -388,18 +472,45 @@ class IcingSearchEngine {
std::unique_ptr<const Normalizer> normalizer_ ICING_GUARDED_BY(mutex_);
- // Storage for all hits of content from the document store.
+ // Storage for all hits of string contents from the document store.
std::unique_ptr<Index> index_ ICING_GUARDED_BY(mutex_);
+ // Storage for all hits of numeric contents from the document store.
+ std::unique_ptr<NumericIndex<int64_t>> integer_index_
+ ICING_GUARDED_BY(mutex_);
+
+ // Storage for all join qualified ids from the document store.
+ std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_
+ ICING_GUARDED_BY(mutex_);
+
// Pointer to JNI class references
const std::unique_ptr<const JniCache> jni_cache_;
+ // Resets all members that are created during Initialize.
+ void ResetMembers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Resets all members that are created during Initialize, deletes all
+ // underlying files and initializes a fresh index.
+ ResetResultProto ResetInternal() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Checks for the existence of the init marker file. If the failed init count
+ // exceeds kMaxUnsuccessfulInitAttempts, all data is deleted and the index is
+ // initialized from scratch. The updated count (the original failed init
+ // count + 1) is written to the marker file.
+ //
+ // RETURNS
+ // OK on success
+ // INTERNAL if an IO error occurs while trying to update the marker file.
+ libtextclassifier3::Status CheckInitMarkerFile(
+ InitializeStatsProto* initialize_stats)
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
// Helper method to do the actual work to persist data to disk. We need this
// separate method so that other public methods don't need to call
// PersistToDisk(). Public methods calling each other may cause deadlock
// issues.
- libtextclassifier3::Status InternalPersistToDisk()
- ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ libtextclassifier3::Status InternalPersistToDisk(
+ PersistType::Code persist_type) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Helper method to the actual work to Initialize. We need this separate
// method so that other public methods don't need to call Initialize(). Public
@@ -411,49 +522,110 @@ class IcingSearchEngine {
//
// Returns:
// OK on success
+ // FAILED_PRECONDITION if initialize_stats is null
// RESOURCE_EXHAUSTED if the index runs out of storage
// NOT_FOUND if some Document's schema type is not in the SchemaStore
// INTERNAL on any I/O errors
- libtextclassifier3::Status InitializeMembers()
- ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
- // Do any validation/setup required for the given IcingSearchEngineOptions
- //
- // Returns:
- // OK on success
- // INVALID_ARGUMENT if options has invalid values
- // INTERNAL on I/O error
- libtextclassifier3::Status InitializeOptions()
+ libtextclassifier3::Status InitializeMembers(
+ InitializeStatsProto* initialize_stats)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Do any initialization/recovery necessary to create a SchemaStore instance.
//
// Returns:
// OK on success
+ // FAILED_PRECONDITION if initialize_stats is null
// INTERNAL on I/O error
- libtextclassifier3::Status InitializeSchemaStore()
+ libtextclassifier3::Status InitializeSchemaStore(
+ InitializeStatsProto* initialize_stats)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Do any initialization/recovery necessary to create a DocumentStore
// instance.
//
+ // See comments on DocumentStore::Create for explanation of
+ // force_recovery_and_revalidate_documents.
+ //
// Returns:
- // OK on success
+ //   On success, a boolean flag indicating whether derived files of the
+ //   document store have been regenerated. If true, any other components
+ //   depending on them should also be rebuilt.
+ // FAILED_PRECONDITION if initialize_stats is null
// INTERNAL on I/O error
- libtextclassifier3::Status InitializeDocumentStore()
+ libtextclassifier3::StatusOr<bool> InitializeDocumentStore(
+ bool force_recovery_and_revalidate_documents,
+ InitializeStatsProto* initialize_stats)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
- // Do any initialization/recovery necessary to create a DocumentStore
- // instance.
+ // Do any initialization/recovery necessary to create term index, integer
+ // index, and qualified id join index instances.
+ //
+ // If document_store_derived_files_regenerated is true, then we have to
+ // rebuild qualified id join index since NamespaceIds were reassigned.
//
// Returns:
// OK on success
+ // FAILED_PRECONDITION if initialize_stats is null
// RESOURCE_EXHAUSTED if the index runs out of storage
// NOT_FOUND if some Document's schema type is not in the SchemaStore
// INTERNAL on I/O error
- libtextclassifier3::Status InitializeIndex()
+ libtextclassifier3::Status InitializeIndex(
+ bool document_store_derived_files_regenerated,
+ InitializeStatsProto* initialize_stats)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ // Implementation of IcingSearchEngine::Search that only grabs the overall
+ // read-lock, allowing for parallel non-exclusive operations.
+ // This implementation is used if search_spec.use_read_only_search is true.
+ SearchResultProto SearchLockedShared(const SearchSpecProto& search_spec,
+ const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Implementation of IcingSearchEngine::Search that requires the overall
+ // write lock. No other operations of any kind can be executed in parallel if
+ // this version is used.
+ // This implementation is used if search_spec.use_read_only_search is false.
+ SearchResultProto SearchLockedExclusive(const SearchSpecProto& search_spec,
+ const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Helper method for the actual work to Search. We need this separate
+ // method to manage locking for Search.
+ SearchResultProto InternalSearch(const SearchSpecProto& search_spec,
+ const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec)
+ ICING_SHARED_LOCKS_REQUIRED(mutex_);
+
+ // Processes the query and scores results according to the specs. It is a
+ // helper function (called by Search) to process and score the normal query
+ // and the nested child query for join search.
+ //
+ // Returns a QueryScoringResults with:
+ //   OK on success, along with a vector of ScoredDocumentHits, a
+ //   SectionRestrictQueryTermsMap, and other stats fields for logging.
+ //   Any other errors encountered while processing the query or scoring.
+ struct QueryScoringResults {
+ libtextclassifier3::Status status;
+ SectionRestrictQueryTermsMap query_terms;
+ std::vector<ScoredDocumentHit> scored_document_hits;
+
+ explicit QueryScoringResults(
+ libtextclassifier3::Status status_in,
+ SectionRestrictQueryTermsMap&& query_terms_in,
+ std::vector<ScoredDocumentHit>&& scored_document_hits_in)
+ : status(std::move(status_in)),
+ query_terms(std::move(query_terms_in)),
+ scored_document_hits(std::move(scored_document_hits_in)) {}
+ };
+ QueryScoringResults ProcessQueryAndScore(
+ const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec,
+ const JoinChildrenFetcher* join_children_fetcher, int64_t current_time_ms,
+ QueryStatsProto::SearchStats* search_stats)
+ ICING_SHARED_LOCKS_REQUIRED(mutex_);
+
// Many of the internal components rely on other components' derived data.
// Check that everything is consistent with each other so that we're not
// using outdated derived data in some parts of our system.
@@ -469,12 +641,23 @@ class IcingSearchEngine {
libtextclassifier3::Status CheckConsistency()
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ // Discards all derived data.
+ //
+ // Returns:
+ // OK on success
+ // FAILED_PRECONDITION_ERROR if those instances are valid (non nullptr)
+ // INTERNAL_ERROR on any I/O errors
+ libtextclassifier3::Status DiscardDerivedFiles()
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
// Repopulates derived data off our ground truths.
//
// Returns:
// OK on success
// INTERNAL_ERROR on any IO errors
- libtextclassifier3::Status RegenerateDerivedFiles()
+ libtextclassifier3::Status RegenerateDerivedFiles(
+ InitializeStatsProto* initialize_stats = nullptr,
+ bool log_document_store_stats = false)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Optimizes the DocumentStore by removing any unneeded documents (i.e.
@@ -484,44 +667,41 @@ class IcingSearchEngine {
// would need to call Initialize() to reinitialize everything into a valid state.
//
// Returns:
- // OK on success
+ // On success, OptimizeResult which contains a vector mapping from old
+ // document id to new document id and another vector mapping from old
+ // namespace id to new namespace id. A value of kInvalidDocumentId indicates
+ // that the old document id has been deleted.
// ABORTED_ERROR if any error happens before the actual optimization, the
// original document store should be still available
// DATA_LOSS_ERROR on errors that could potentially cause data loss,
// document store is still available
// INTERNAL_ERROR on any IO errors or other errors that we can't recover
// from
- libtextclassifier3::Status OptimizeDocumentStore()
+ libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
+ OptimizeDocumentStore(OptimizeStatsProto* optimize_stats)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
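
// For example, an OptimizeResult whose old-to-new document id mapping reads
// {0, kInvalidDocumentId, 1} (the exact field name is an assumption) would
// mean document 0 kept its id, document 1 was deleted, and document 2 was
// compacted to id 1.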
- // Helper method to restore missing document data in index_. All documents
- // will be reindexed. This does not clear the index, so it is recommended to
- // call Index::Reset first.
+ // Helper method to restore missing document data in index_, integer_index_,
+ // and qualified_id_join_index_. All documents will be reindexed. This does
+ // not clear the index, so it is recommended to call ClearAllIndices,
+ // ClearSearchIndices, or ClearJoinIndices first if needed.
//
// Returns:
- // OK on success
+ // On success, OK and a bool indicating whether or not restoration was
+ // needed.
+ // DATA_LOSS, if an error during index merging caused us to lose indexed
+ // data in the main index. Despite the data loss, this is still considered
+ // a successful run and needed_restoration will be set to true.
// RESOURCE_EXHAUSTED if the index fills up before finishing indexing
// NOT_FOUND if some Document's schema type is not in the SchemaStore
// INTERNAL_ERROR on any IO errors
- libtextclassifier3::Status RestoreIndex()
- ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
- // Computes the combined checksum of the IcingSearchEngine - includes all its
- // subcomponents
- //
- // Returns:
- // Combined checksum on success
- // INTERNAL_ERROR on compute error
- libtextclassifier3::StatusOr<Crc32> ComputeChecksum()
- ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
- // Checks if the header exists already. This does not create the header file
- // if it doesn't exist.
- bool HeaderExists() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
- // Update and replace the header file. Creates the header file if it doesn't
- // exist.
- libtextclassifier3::Status UpdateHeader(const Crc32& checksum)
+ struct IndexRestorationResult {
+ libtextclassifier3::Status status;
+ bool index_needed_restoration;
+ bool integer_index_needed_restoration;
+ bool qualified_id_join_index_needed_restoration;
+ };
+ IndexRestorationResult RestoreIndexIfNeeded()
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// If we lost the schema during a previous failure, it may "look" the same as
@@ -535,6 +715,70 @@ class IcingSearchEngine {
// INTERNAL_ERROR on I/O error
libtextclassifier3::StatusOr<bool> LostPreviousSchema()
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Helper method to create all types of data indexing handlers to index term,
+ // integer, and join qualified ids.
+ libtextclassifier3::StatusOr<
+ std::vector<std::unique_ptr<DataIndexingHandler>>>
+ CreateDataIndexingHandlers() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Helper method to discard parts of (term, integer, qualified id join)
+ // indices if they contain data for document ids greater than
+ // last_stored_document_id.
+ //
+ // REQUIRES: last_stored_document_id is valid (!= kInvalidDocumentId). Note:
+ // if we want to truncate everything in the index, then please call
+ // ClearSearchIndices/ClearJoinIndices/ClearAllIndices instead.
+ //
+ // Returns:
+ //   On success, a DocumentId indicating the first document to reindex and
+ //   three bool flags indicating whether the term, integer, or qualified id
+ //   join index needs restoration.
+ // INTERNAL on any I/O errors
+ struct TruncateIndexResult {
+ DocumentId first_document_to_reindex;
+ bool index_needed_restoration;
+ bool integer_index_needed_restoration;
+ bool qualified_id_join_index_needed_restoration;
+
+ explicit TruncateIndexResult(
+ DocumentId first_document_to_reindex_in,
+ bool index_needed_restoration_in,
+ bool integer_index_needed_restoration_in,
+ bool qualified_id_join_index_needed_restoration_in)
+ : first_document_to_reindex(first_document_to_reindex_in),
+ index_needed_restoration(index_needed_restoration_in),
+ integer_index_needed_restoration(integer_index_needed_restoration_in),
+ qualified_id_join_index_needed_restoration(
+ qualified_id_join_index_needed_restoration_in) {}
+ };
+ libtextclassifier3::StatusOr<TruncateIndexResult> TruncateIndicesTo(
+ DocumentId last_stored_document_id)
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Helper method to discard search (term, integer) indices.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on any I/O errors
+ libtextclassifier3::Status ClearSearchIndices()
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Helper method to discard join (qualified id) indices.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on any I/O errors
+ libtextclassifier3::Status ClearJoinIndices()
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Helper method to discard all search and join indices.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on any I/O errors
+ libtextclassifier3::Status ClearAllIndices()
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
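+
+  // Illustration only: assuming ClearAllIndices() simply composes the two
+  // helpers above (an assumption, not confirmed by this change), its expected
+  // shape would be:
+  //
+  //   libtextclassifier3::Status ClearAllIndices() {
+  //     ICING_RETURN_IF_ERROR(ClearSearchIndices());
+  //     return ClearJoinIndices();
+  //   }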
};
} // namespace lib
diff --git a/icing/icing-search-engine_backwards_compatibility_test.cc b/icing/icing-search-engine_backwards_compatibility_test.cc
new file mode 100644
index 0000000..178e923
--- /dev/null
+++ b/icing/icing-search-engine_backwards_compatibility_test.cc
@@ -0,0 +1,569 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/icing-search-engine.h"
+#include "icing/portable/endian.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/schema-builder.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::Eq;
+
+// For mocking purpose, we allow tests to provide a custom Filesystem.
+class TestIcingSearchEngine : public IcingSearchEngine {
+ public:
+ TestIcingSearchEngine(const IcingSearchEngineOptions& options,
+ std::unique_ptr<const Filesystem> filesystem,
+ std::unique_ptr<const IcingFilesystem> icing_filesystem,
+ std::unique_ptr<Clock> clock,
+ std::unique_ptr<JniCache> jni_cache)
+ : IcingSearchEngine(options, std::move(filesystem),
+ std::move(icing_filesystem), std::move(clock),
+ std::move(jni_cache)) {}
+};
+
+std::string GetTestBaseDir() { return GetTestTempDir() + "/icing"; }
+
+class IcingSearchEngineBackwardsCompatibilityTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ filesystem_.CreateDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ const Filesystem* filesystem() const { return &filesystem_; }
+
+ private:
+ Filesystem filesystem_;
+};
+
+ScoringSpecProto GetDefaultScoringSpec() {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ return scoring_spec;
+}
+
+std::string GetTestDataDir(std::string_view test_subdir) {
+ if (IsAndroidX86()) {
+ return GetTestFilePath(
+ absl_ports::StrCat("icing/testdata/", test_subdir,
+ "/icing_search_engine_android_x86"));
+ } else if (IsAndroidArm()) {
+ return GetTestFilePath(
+ absl_ports::StrCat("icing/testdata/", test_subdir,
+ "/icing_search_engine_android_arm"));
+ } else if (IsIosPlatform()) {
+ return GetTestFilePath(absl_ports::StrCat("icing/testdata/",
+ test_subdir,
+ "/icing_search_engine_ios"));
+ } else {
+ return GetTestFilePath(absl_ports::StrCat("icing/testdata/",
+ test_subdir,
+ "/icing_search_engine_linux"));
+ }
+}
+
+TEST_F(IcingSearchEngineBackwardsCompatibilityTest,
+ MigrateToPortableFileBackedProtoLog) {
+ // Copy the testdata files into our IcingSearchEngine directory
+ std::string dir_without_portable_log = GetTestDataDir("not_portable_log");
+
+ // Create dst directory that we'll initialize the IcingSearchEngine over.
+ std::string base_dir = GetTestBaseDir() + "_migrate";
+ ASSERT_THAT(filesystem()->DeleteDirectoryRecursively(base_dir.c_str()), true);
+ ASSERT_THAT(filesystem()->CreateDirectoryRecursively(base_dir.c_str()), true);
+
+ ASSERT_TRUE(filesystem()->CopyDirectory(dir_without_portable_log.c_str(),
+ base_dir.c_str(),
+ /*recursive=*/true));
+
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_base_dir(base_dir);
+
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ EXPECT_THAT(init_result.status(), ProtoIsOk());
+
+  // Since there will be a version change, the recovery cause will be
+ // VERSION_CHANGED.
+ EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+ EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+ EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+
+  // Set up the schema; this is the one used to validate documents in the
+  // testdata files. Do not change unless you're also updating the testdata
+  // files.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Make sure our schema is still the same as we expect. If not, there's
+ // definitely no way we're getting the documents back that we expect.
+ GetSchemaResultProto expected_get_schema_result_proto;
+ expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_schema_result_proto.mutable_schema() = schema;
+ ASSERT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto));
+
+ // These are the documents that are stored in the testdata files. Do not
+ // change unless you're also updating the testdata files.
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .AddStringProperty("body", "bar")
+ .Build();
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("email")
+ .SetCreationTimestampMs(20)
+ .SetScore(321)
+ .AddStringProperty("body", "baz bat")
+ .Build();
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace2", "uri1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(30)
+ .SetScore(123)
+ .AddStringProperty("subject", "phoo")
+ .Build();
+
+  // Documents 1 and 3 were put normally, and document 2 was deleted in our
+ // testdata files.
+ EXPECT_THAT(icing
+ .Get(document1.namespace_(), document1.uri(),
+ GetResultSpecProto::default_instance())
+ .document(),
+ EqualsProto(document1));
+ EXPECT_THAT(icing
+ .Get(document2.namespace_(), document2.uri(),
+ GetResultSpecProto::default_instance())
+ .status(),
+ ProtoStatusIs(StatusProto::NOT_FOUND));
+ EXPECT_THAT(icing
+ .Get(document3.namespace_(), document3.uri(),
+ GetResultSpecProto::default_instance())
+ .document(),
+ EqualsProto(document3));
+
+ // Searching for "foo" should get us document1.
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("foo");
+
+ SearchResultProto expected_document1;
+ expected_document1.mutable_status()->set_code(StatusProto::OK);
+ *expected_document1.mutable_results()->Add()->mutable_document() = document1;
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(expected_document1));
+
+ // Searching for "baz" would've gotten us document2, except it got deleted.
+ // Make sure that it's cleared from our index too.
+ search_spec.set_query("baz");
+
+ SearchResultProto expected_no_documents;
+ expected_no_documents.mutable_status()->set_code(StatusProto::OK);
+
+ actual_results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(expected_no_documents));
+
+ // Searching for "phoo" should get us document3.
+ search_spec.set_query("phoo");
+
+ SearchResultProto expected_document3;
+ expected_document3.mutable_status()->set_code(StatusProto::OK);
+ *expected_document3.mutable_results()->Add()->mutable_document() = document3;
+
+ actual_results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(expected_document3));
+}
+
+TEST_F(IcingSearchEngineBackwardsCompatibilityTest, MigrateToLargerScale) {
+ // Copy the testdata files into our IcingSearchEngine directory
+ std::string test_data_dir = GetTestDataDir("icing_scale_migration");
+
+ // Create dst directory that we'll initialize the IcingSearchEngine over.
+ std::string base_dir = GetTestBaseDir() + "_migrate";
+ ASSERT_THAT(filesystem()->DeleteDirectoryRecursively(base_dir.c_str()), true);
+ ASSERT_THAT(filesystem()->CreateDirectoryRecursively(base_dir.c_str()), true);
+
+ ASSERT_TRUE(filesystem()->CopyDirectory(test_data_dir.c_str(),
+ base_dir.c_str(),
+ /*recursive=*/true));
+
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_base_dir(base_dir);
+
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ EXPECT_THAT(init_result.status(), ProtoIsOk());
+
+  // Since there will be a version change, the recovery cause will be
+ // VERSION_CHANGED.
+ EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+ EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+ EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+
+ // Verify that the schema stored in the index matches the one that we expect.
+ // Do not change unless you're also updating the testdata files.
+ SchemaProto expected_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Make sure our schema is still the same as we expect. If not, there's
+ // definitely no way we're getting the documents back that we expect.
+ GetSchemaResultProto expected_get_schema_result_proto;
+ expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_schema_result_proto.mutable_schema() = expected_schema;
+ ASSERT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto));
+
+ // These are the documents that are stored in the testdata files. Do not
+ // change unless you're also updating the testdata files.
+ DocumentProto expected_document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .AddStringProperty("body", "bar")
+ .Build();
+
+ DocumentProto expected_deleted_document2 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("email")
+ .SetCreationTimestampMs(20)
+ .SetScore(321)
+ .AddStringProperty("body", "baz bat")
+ .Build();
+
+ DocumentProto expected_document3 = DocumentBuilder()
+ .SetKey("namespace2", "uri1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(30)
+ .SetScore(123)
+ .AddStringProperty("subject", "phoo")
+ .Build();
+
+  // Documents 1 and 3 were put normally, and document 2 was deleted in our
+ // testdata files.
+ EXPECT_THAT(
+ icing
+ .Get(expected_document1.namespace_(), expected_document1.uri(),
+ GetResultSpecProto::default_instance())
+ .document(),
+ EqualsProto(expected_document1));
+ EXPECT_THAT(icing
+ .Get(expected_deleted_document2.namespace_(),
+ expected_deleted_document2.uri(),
+ GetResultSpecProto::default_instance())
+ .status(),
+ ProtoStatusIs(StatusProto::NOT_FOUND));
+ EXPECT_THAT(
+ icing
+ .Get(expected_document3.namespace_(), expected_document3.uri(),
+ GetResultSpecProto::default_instance())
+ .document(),
+ EqualsProto(expected_document3));
+
+ // Searching for "foo" should get us document1.
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("foo");
+
+ SearchResultProto expected_document1_search;
+ expected_document1_search.mutable_status()->set_code(StatusProto::OK);
+ *expected_document1_search.mutable_results()->Add()->mutable_document() =
+ expected_document1;
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_document1_search));
+
+ // Searching for "baz" would've gotten us document2, except it got deleted.
+ // Make sure that it's cleared from our index too.
+ search_spec.set_query("baz");
+
+ SearchResultProto expected_no_documents;
+ expected_no_documents.mutable_status()->set_code(StatusProto::OK);
+
+ actual_results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(expected_no_documents));
+
+ // Searching for "phoo" should get us document3.
+ search_spec.set_query("phoo");
+
+ SearchResultProto expected_document3_search;
+ expected_document3_search.mutable_status()->set_code(StatusProto::OK);
+ *expected_document3_search.mutable_results()->Add()->mutable_document() =
+ expected_document3;
+
+ actual_results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_document3_search));
+}
+
+TEST_F(IcingSearchEngineBackwardsCompatibilityTest,
+ MigrateToAppendOnlySchemaStorage) {
+ // Copy the testdata files into our IcingSearchEngine directory
+ std::string test_data_dir = GetTestDataDir("blob_schema_store");
+
+ // Create dst directory that we'll initialize the IcingSearchEngine over.
+ std::string base_dir = GetTestBaseDir() + "_migrate";
+ ASSERT_THAT(filesystem()->DeleteDirectoryRecursively(base_dir.c_str()), true);
+ ASSERT_THAT(filesystem()->CreateDirectoryRecursively(base_dir.c_str()), true);
+
+ ASSERT_TRUE(filesystem()->CopyDirectory(test_data_dir.c_str(),
+ base_dir.c_str(),
+ /*recursive=*/true));
+
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_base_dir(base_dir);
+
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ EXPECT_THAT(init_result.status(), ProtoIsOk());
+
+  // Since there will be a version change, the recovery cause will be
+ // VERSION_CHANGED.
+ EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+ // TODO: create enum code for legacy schema store recovery after schema store
+ // change is made.
+ EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+ EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+
+ // Verify that the schema stored in the index matches the one that we expect.
+ // Do not change unless you're also updating the testdata files.
+ SchemaProto expected_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("transaction")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("unindexedStringProperty")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("unindexedIntegerProperty")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableIntegerProperty")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("stringExactProperty")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("stringPrefixProperty")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ GetSchemaResultProto expected_get_schema_result_proto;
+ expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_schema_result_proto.mutable_schema() = expected_schema;
+ ASSERT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto));
+
+ // These are the documents that are stored in the testdata files. Do not
+ // change unless you're also updating the testdata files.
+ DocumentProto expected_document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .AddStringProperty("body", "bar")
+ .Build();
+
+ DocumentProto expected_document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(20)
+ .SetScore(123)
+ .AddStringProperty("subject", "phoo")
+ .Build();
+
+ DocumentProto expected_document3 =
+ DocumentBuilder()
+ .SetKey("namespace3", "uri3")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(30)
+ .SetScore(123)
+ .AddStringProperty("stringExactProperty", "foo")
+ .AddInt64Property("indexableIntegerProperty", 10)
+ .Build();
+
+ EXPECT_THAT(
+ icing
+ .Get(expected_document1.namespace_(), expected_document1.uri(),
+ GetResultSpecProto::default_instance())
+ .document(),
+ EqualsProto(expected_document1));
+ EXPECT_THAT(
+ icing
+ .Get(expected_document2.namespace_(), expected_document2.uri(),
+ GetResultSpecProto::default_instance())
+ .document(),
+ EqualsProto(expected_document2));
+ EXPECT_THAT(
+ icing
+ .Get(expected_document3.namespace_(), expected_document3.uri(),
+ GetResultSpecProto::default_instance())
+ .document(),
+ EqualsProto(expected_document3));
+
+ // Searching for "foo" should get us document1 and not document3 due to the
+ // schema type filter.
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("foo");
+ search_spec.add_schema_type_filters("email");
+
+ SearchResultProto expected_document1_search;
+ expected_document1_search.mutable_status()->set_code(StatusProto::OK);
+ *expected_document1_search.mutable_results()->Add()->mutable_document() =
+ expected_document1;
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_document1_search));
+
+ // Searching for "phoo" should get us document2.
+ search_spec.set_query("phoo");
+
+ SearchResultProto expected_document2_search;
+ expected_document2_search.mutable_status()->set_code(StatusProto::OK);
+ *expected_document2_search.mutable_results()->Add()->mutable_document() =
+ expected_document2;
+
+ actual_results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_document2_search));
+
+ // Searching for "foo" should get us both document 1 and document3 now that
+ // schema type 'transaction' has been added to the schema filter.
+ search_spec.set_query("foo");
+ search_spec.add_schema_type_filters("transaction");
+
+ SearchResultProto expected_document_1_and_3_search;
+ expected_document_1_and_3_search.mutable_status()->set_code(StatusProto::OK);
+ *expected_document_1_and_3_search.mutable_results()
+ ->Add()
+ ->mutable_document() = expected_document3;
+ *expected_document_1_and_3_search.mutable_results()
+ ->Add()
+ ->mutable_document() = expected_document1;
+
+ actual_results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_document_1_and_3_search));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine_benchmark.cc b/icing/icing-search-engine_benchmark.cc
index a6d96e0..18c6bb9 100644
--- a/icing/icing-search-engine_benchmark.cc
+++ b/icing/icing-search-engine_benchmark.cc
@@ -16,7 +16,9 @@
#include <fstream>
#include <iostream>
+#include <limits>
#include <memory>
+#include <numeric>
#include <ostream>
#include <random>
#include <sstream>
@@ -32,17 +34,23 @@
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/icing-search-engine.h"
+#include "icing/join/join-processor.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/reset.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/status.pb.h"
#include "icing/proto/term.pb.h"
+#include "icing/query/query-features.h"
+#include "icing/schema-builder.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/document-generator.h"
+#include "icing/testing/numeric/number-generator.h"
+#include "icing/testing/numeric/uniform-distribution-integer-generator.h"
#include "icing/testing/random-string.h"
-#include "icing/testing/recorder-test-utils.h"
#include "icing/testing/schema-generator.h"
#include "icing/testing/tmp-directory.h"
@@ -51,7 +59,7 @@
// //icing:icing-search-engine_benchmark
//
// $ blaze-bin/icing/icing-search-engine_benchmark
-// --benchmarks=all --benchmark_memory_usage
+// --benchmark_filter=all --benchmark_memory_usage
//
// Run on an Android device:
// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
@@ -61,7 +69,8 @@
// $ adb push blaze-bin/icing/icing-search-engine_benchmark
// /data/local/tmp/
//
-// $ adb shell /data/local/tmp/icing-search-engine_benchmark --benchmarks=all
+// $ adb shell /data/local/tmp/icing-search-engine_benchmark
+// --benchmark_filter=all
namespace icing {
namespace lib {
@@ -69,6 +78,7 @@ namespace lib {
namespace {
using ::testing::Eq;
+using ::testing::HasSubstr;
// Icing GMSCore has, on average, 17 corpora on a device and 30 corpora at the
// 95th pct. Most clients use a single type. This is a function of Icing's
@@ -87,14 +97,6 @@ constexpr int kAvgDocumentSize = 300;
// ASSUME: ~70% of the document's size comes from its content.
constexpr float kContentSizePct = 0.7;
-// Average length of word in English is 4.7 characters.
-constexpr int kAvgTokenLen = 5;
-// Made up value. This results in a fairly reasonable language - the majority of
-// generated words are 3-9 characters, ~3% of words are >=20 chars, and the
-// longest ones are 27 chars, (roughly consistent with the longest,
-// non-contrived English words
-// https://en.wikipedia.org/wiki/Longest_word_in_English)
-constexpr int kTokenStdDev = 7;
constexpr int kLanguageSize = 1000;
// Lite Index size required to fit 128k docs, each doc requires ~64 bytes of
@@ -114,22 +116,6 @@ std::vector<std::string> CreateNamespaces(int num_namespaces) {
return namespaces;
}
-// Creates a vector containing num_words randomly-generated words for use by
-// documents.
-template <typename Rand>
-std::vector<std::string> CreateLanguage(int num_words, Rand* r) {
- std::vector<std::string> language;
- std::normal_distribution<> norm_dist(kAvgTokenLen, kTokenStdDev);
- while (--num_words >= 0) {
- int word_length = 0;
- while (word_length < 1) {
- word_length = std::round(norm_dist(*r));
- }
- language.push_back(RandomString(kAlNumAlphabet, word_length, r));
- }
- return language;
-}
-
SearchSpecProto CreateSearchSpec(const std::string& query,
const std::vector<std::string>& namespaces,
TermMatchType::Code match_type) {
@@ -175,6 +161,202 @@ class DestructibleDirectory {
std::string dir_;
};
+std::vector<DocumentProto> GenerateRandomDocuments(
+ EvenDistributionTypeSelector* type_selector, int num_docs,
+ const std::vector<std::string>& language) {
+ std::vector<std::string> namespaces = CreateNamespaces(kAvgNumNamespaces);
+ EvenDistributionNamespaceSelector namespace_selector(namespaces);
+
+ std::default_random_engine random;
+ UniformDistributionLanguageTokenGenerator<std::default_random_engine>
+ token_generator(language, &random);
+
+ DocumentGenerator<
+ EvenDistributionNamespaceSelector, EvenDistributionTypeSelector,
+ UniformDistributionLanguageTokenGenerator<std::default_random_engine>>
+ generator(&namespace_selector, type_selector, &token_generator,
+ kAvgDocumentSize * kContentSizePct);
+
+ std::vector<DocumentProto> random_docs;
+ random_docs.reserve(num_docs);
+ for (int i = 0; i < num_docs; i++) {
+ random_docs.push_back(generator.generateDoc());
+ }
+ return random_docs;
+}
+
+std::unique_ptr<NumberGenerator<int64_t>> CreateIntegerGenerator(
+ size_t num_documents) {
+ // Since the collision # follows poisson distribution with lambda =
+ // (num_keys / range), we set the range 10x (lambda = 0.1) to avoid too many
+ // collisions.
+ //
+ // Distribution:
+ // - keys in range being picked for 0 times: 90.5%
+ // - keys in range being picked for 1 time: 9%
+ // - keys in range being picked for 2 times: 0.45%
+ // - keys in range being picked for 3 times: 0.015%
+ //
+  // For example, num_keys = 1M, range = 10M. Then ~904837 keys will be picked
+  // exactly once, ~45242 keys twice, ~1508 keys three times, and so on.
+ return std::make_unique<UniformDistributionIntegerGenerator<int64_t>>(
+ /*seed=*/12345, /*range_lower=*/0,
+ /*range_upper=*/static_cast<int64_t>(num_documents) * 10 - 1);
+}
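+
+// Illustration only: the percentages quoted above follow from the Poisson pmf
+// P(k) = e^-lambda * lambda^k / k! with lambda = num_keys / range = 0.1. A
+// quick standalone check (hypothetical helper, not used by any benchmark):
+//
+//   #include <cmath>
+//   double PoissonPmf(double lambda, int k) {
+//     double p = std::exp(-lambda);  // P(0) = e^-lambda
+//     for (int i = 1; i <= k; ++i) p *= lambda / i;  // P(k) = P(k-1)*lambda/k
+//     return p;
+//   }
+//   // PoissonPmf(0.1, 0) ~= 0.9048   -> ~90.5% of keys never picked
+//   // PoissonPmf(0.1, 1) ~= 0.0905   -> ~9% picked once (10M * P(1) ~= 904837)
+//   // PoissonPmf(0.1, 2) ~= 0.00452  -> ~0.45% picked twice (~45242 keys)
+//   // PoissonPmf(0.1, 3) ~= 0.000151 -> ~0.015% picked thrice (~1508 keys)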
+
+void BM_IndexLatency(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ std::default_random_engine random;
+ int num_types = kAvgNumNamespaces * kAvgNumTypes;
+ ExactStringPropertyGenerator property_generator;
+ SchemaGenerator<ExactStringPropertyGenerator> schema_generator(
+ /*num_properties=*/state.range(1), &property_generator);
+ SchemaProto schema = schema_generator.GenerateSchema(num_types);
+ EvenDistributionTypeSelector type_selector(schema);
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ int num_docs = state.range(0);
+ std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
+ const std::vector<DocumentProto> random_docs =
+ GenerateRandomDocuments(&type_selector, num_docs, language);
+ for (auto _ : state) {
+ state.PauseTiming();
+ ASSERT_THAT(icing->Reset().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+ state.ResumeTiming();
+ for (const DocumentProto& doc : random_docs) {
+ ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
+ }
+ }
+}
+BENCHMARK(BM_IndexLatency)
+ // Arguments: num_indexed_documents, num_sections
+ ->ArgPair(1000000, 5);
+
+void BM_QueryLatency(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ std::default_random_engine random;
+ int num_types = kAvgNumNamespaces * kAvgNumTypes;
+ ExactStringPropertyGenerator property_generator;
+ SchemaGenerator<ExactStringPropertyGenerator> schema_generator(
+ /*num_properties=*/state.range(1), &property_generator);
+ SchemaProto schema = schema_generator.GenerateSchema(num_types);
+ EvenDistributionTypeSelector type_selector(schema);
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ int num_docs = state.range(0);
+ std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
+ const std::vector<DocumentProto> random_docs =
+ GenerateRandomDocuments(&type_selector, num_docs, language);
+ for (const DocumentProto& doc : random_docs) {
+ ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
+ }
+
+ SearchSpecProto search_spec = CreateSearchSpec(
+ language.at(0), std::vector<std::string>(), TermMatchType::PREFIX);
+ ResultSpecProto result_spec = CreateResultSpec(1, 1000000, 1000000);
+ ScoringSpecProto scoring_spec =
+ CreateScoringSpec(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+ for (auto _ : state) {
+    SearchResultProto results =
+        icing->Search(search_spec, scoring_spec, result_spec);
+ }
+}
+BENCHMARK(BM_QueryLatency)
+ // Arguments: num_indexed_documents, num_sections
+ ->ArgPair(1000000, 2);
+
+void BM_IndexThroughput(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ std::default_random_engine random;
+ int num_types = kAvgNumNamespaces * kAvgNumTypes;
+ ExactStringPropertyGenerator property_generator;
+ SchemaGenerator<ExactStringPropertyGenerator> schema_generator(
+ /*num_properties=*/state.range(1), &property_generator);
+ SchemaProto schema = schema_generator.GenerateSchema(num_types);
+ EvenDistributionTypeSelector type_selector(schema);
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ int num_docs = state.range(0);
+ std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
+ const std::vector<DocumentProto> random_docs =
+ GenerateRandomDocuments(&type_selector, num_docs, language);
+ for (auto s : state) {
+ for (const DocumentProto& doc : random_docs) {
+ ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
+ }
+ }
+ state.SetItemsProcessed(state.iterations() * num_docs);
+}
+BENCHMARK(BM_IndexThroughput)
+ // Arguments: num_indexed_documents, num_sections
+ ->ArgPair(1, 1)
+ ->ArgPair(2, 1)
+ ->ArgPair(8, 1)
+ ->ArgPair(32, 1)
+ ->ArgPair(128, 1)
+ ->ArgPair(1 << 10, 1)
+ ->ArgPair(1 << 13, 1)
+ ->ArgPair(1 << 15, 1)
+ ->ArgPair(1 << 17, 1)
+ ->ArgPair(1, 5)
+ ->ArgPair(2, 5)
+ ->ArgPair(8, 5)
+ ->ArgPair(32, 5)
+ ->ArgPair(128, 5)
+ ->ArgPair(1 << 10, 5)
+ ->ArgPair(1 << 13, 5)
+ ->ArgPair(1 << 15, 5)
+ ->ArgPair(1 << 17, 5)
+ ->ArgPair(1, 10)
+ ->ArgPair(2, 10)
+ ->ArgPair(8, 10)
+ ->ArgPair(32, 10)
+ ->ArgPair(128, 10)
+ ->ArgPair(1 << 10, 10)
+ ->ArgPair(1 << 13, 10)
+ ->ArgPair(1 << 15, 10)
+ ->ArgPair(1 << 17, 10);
+
void BM_MutlipleIndices(benchmark::State& state) {
// Initialize the filesystem
std::string test_dir = GetTestTempDir() + "/icing/benchmark";
@@ -202,11 +384,8 @@ void BM_MutlipleIndices(benchmark::State& state) {
options.set_index_merge_size(kIcingFullIndexSize / num_indices);
auto icing = std::make_unique<IcingSearchEngine>(options);
- InitializeResultProto init_result = icing->Initialize();
- ASSERT_THAT(init_result.status().code(), Eq(StatusProto::OK));
-
- SetSchemaResultProto schema_result = icing->SetSchema(schema);
- ASSERT_THAT(schema_result.status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
icings.push_back(std::move(icing));
}
@@ -214,7 +393,7 @@ void BM_MutlipleIndices(benchmark::State& state) {
std::vector<std::string> namespaces = CreateNamespaces(kAvgNumNamespaces);
EvenDistributionNamespaceSelector namespace_selector(namespaces);
- std::vector<std::string> language = CreateLanguage(kLanguageSize, &random);
+ std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
UniformDistributionLanguageTokenGenerator<std::default_random_engine>
token_generator(language, &random);
@@ -231,8 +410,7 @@ void BM_MutlipleIndices(benchmark::State& state) {
ASSERT_THAT(put_result.status().code(), Eq(StatusProto::UNKNOWN));
continue;
}
- put_result = icings.at(i % icings.size())->Put(doc);
- ASSERT_THAT(put_result.status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icings.at(i % icings.size())->Put(doc).status(), ProtoIsOk());
}
// QUERY!
@@ -255,13 +433,13 @@ void BM_MutlipleIndices(benchmark::State& state) {
continue;
}
result = icings.at(0)->Search(search_spec, scoring_spec, result_spec);
- ASSERT_THAT(result.status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(result.status(), ProtoIsOk());
while (!result.results().empty()) {
num_results += result.results_size();
if (!icings.empty()) {
result = icings.at(0)->GetNextPage(result.next_page_token());
}
- ASSERT_THAT(result.status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(result.status(), ProtoIsOk());
}
}
@@ -307,6 +485,781 @@ BENCHMARK(BM_MutlipleIndices)
->ArgPair(10, 32768)
->ArgPair(10, 131072);
+void BM_SearchNoStackOverflow(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL)))
+ .Build();
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ // Create a document that has the term "foo"
+ DocumentProto base_document = DocumentBuilder()
+ .SetSchema("Message")
+ .SetNamespace("namespace")
+ .AddStringProperty("body", "foo")
+ .Build();
+
+ // Insert a lot of documents with the term "foo"
+ int64_t num_docs = state.range(0);
+ for (int64_t i = 0; i < num_docs; ++i) {
+ DocumentProto document =
+ DocumentBuilder(base_document).SetUri(std::to_string(i)).Build();
+ ASSERT_THAT(icing->Put(document).status(), ProtoIsOk());
+ }
+
+ // Do a query and exclude documents with the term "foo". The way this is
+ // currently implemented is that we'll iterate over all the documents in the
+ // index, then apply the exclusion check. Since all our documents have "foo",
+ // we'll consider it a "miss". Previously with recursion, we would have
+// recursed until we got a success, which would never happen, causing us to
+ // recurse through all the documents and trigger a stack overflow. With
+ // the iterative implementation, we should avoid this.
+ SearchSpecProto search_spec;
+ search_spec.set_query("-foo");
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+
+ ResultSpecProto result_spec;
+ ScoringSpecProto scoring_spec;
+ for (auto s : state) {
+ icing->Search(search_spec, scoring_spec, result_spec);
+ }
+}
+// For other reasons, we hit a limit when inserting the ~350,000th document. So
+// cap the limit to 1 << 18.
+BENCHMARK(BM_SearchNoStackOverflow)
+ ->Range(/*start=*/1 << 10, /*limit=*/1 << 18);
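+
+// Illustration only; DocIterator and its methods are hypothetical names, not
+// Icing's actual iterator API. This sketches the difference between the old
+// recursive advance and the iterative one that BM_SearchNoStackOverflow
+// exercises.
+//
+//   // Recursive: one stack frame per excluded document. When every document
+//   // is a miss (as with the "-foo" query above), recursion depth equals the
+//   // corpus size and can overflow the stack.
+//   bool AdvanceRecursive(DocIterator* it) {
+//     if (!it->Next()) return false;            // exhausted
+//     if (it->IsExcluded()) return AdvanceRecursive(it);
+//     return true;                              // found a hit
+//   }
+//
+//   // Iterative: constant stack usage no matter how many misses occur.
+//   bool AdvanceIterative(DocIterator* it) {
+//     while (it->Next()) {
+//       if (!it->IsExcluded()) return true;     // found a hit
+//     }
+//     return false;                             // exhausted
+//   }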
+
+// Added for b/184373205. Ensure that we can repeatedly put documents even if
+// the underlying mmapped areas grow past a few page sizes.
+void BM_RepeatedPut(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL)))
+ .Build();
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ // Create a document that has the term "foo"
+ DocumentProto base_document = DocumentBuilder()
+ .SetSchema("Message")
+ .SetNamespace("namespace")
+ .AddStringProperty("body", "foo")
+ .Build();
+
+ // Insert a lot of documents with the term "foo"
+ int64_t num_docs = state.range(0);
+ for (auto s : state) {
+ for (int64_t i = 0; i < num_docs; ++i) {
+ DocumentProto document =
+ DocumentBuilder(base_document).SetUri("uri").Build();
+ ASSERT_THAT(icing->Put(document).status(), ProtoIsOk());
+ }
+ }
+}
+// For other reasons, we hit a limit when inserting the ~350,000th document. So
+// cap the limit to 1 << 18.
+BENCHMARK(BM_RepeatedPut)->Range(/*start=*/100, /*limit=*/1 << 18);
+
+// This is different from BM_RepeatedPut since we're just trying to benchmark
+// one Put call, not thousands of them at once.
+void BM_Put(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message"))
+ .Build();
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ // Create a document
+ DocumentProto document = DocumentBuilder()
+ .SetSchema("Message")
+ .SetNamespace("namespace")
+ .SetUri("uri")
+ .Build();
+
+ for (auto s : state) {
+ benchmark::DoNotOptimize(icing->Put(document));
+ }
+}
+BENCHMARK(BM_Put);
+
+void BM_Get(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message"))
+ .Build();
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ // Create a document
+ DocumentProto document = DocumentBuilder()
+ .SetSchema("Message")
+ .SetNamespace("namespace")
+ .SetUri("uri")
+ .Build();
+
+ ASSERT_THAT(icing->Put(document).status(), ProtoIsOk());
+ for (auto s : state) {
+ benchmark::DoNotOptimize(
+ icing->Get("namespace", "uri", GetResultSpecProto::default_instance()));
+ }
+}
+BENCHMARK(BM_Get);
+
+void BM_Delete(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message"))
+ .Build();
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ // Create a document
+ DocumentProto document = DocumentBuilder()
+ .SetSchema("Message")
+ .SetNamespace("namespace")
+ .SetUri("uri")
+ .Build();
+
+ ASSERT_THAT(icing->Put(document).status(), ProtoIsOk());
+ for (auto s : state) {
+ state.PauseTiming();
+ icing->Put(document);
+ state.ResumeTiming();
+
+ benchmark::DoNotOptimize(icing->Delete("namespace", "uri"));
+ }
+}
+BENCHMARK(BM_Delete);
+
+void BM_PutMaxAllowedDocuments(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL)))
+ .Build();
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ // Create a document that has the term "foo"
+ DocumentProto base_document = DocumentBuilder()
+ .SetSchema("Message")
+ .SetNamespace("namespace")
+ .AddStringProperty("body", "foo")
+ .Build();
+
+ // Insert a lot of documents with the term "foo"
+ for (auto s : state) {
+ for (int64_t i = 0; i <= kMaxDocumentId; ++i) {
+ DocumentProto document =
+ DocumentBuilder(base_document).SetUri(std::to_string(i)).Build();
+ EXPECT_THAT(icing->Put(document).status(), ProtoIsOk());
+ }
+ }
+
+ DocumentProto document =
+ DocumentBuilder(base_document).SetUri("out_of_space_uri").Build();
+ PutResultProto put_result_proto = icing->Put(document);
+ EXPECT_THAT(put_result_proto.status(),
+ ProtoStatusIs(StatusProto::OUT_OF_SPACE));
+ EXPECT_THAT(put_result_proto.status().message(),
+ HasSubstr("Exceeded maximum number of documents"));
+}
+BENCHMARK(BM_PutMaxAllowedDocuments);
+
+void BM_QueryWithSnippet(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ std::string body = "message body";
+ for (int i = 0; i < 100; i++) {
+ body = body +
+ " invent invention inventory invest investigate investigation "
+ "investigator investment nvestor invisible invitation invite "
+ "involve involved involvement IraqiI rish island";
+ }
+ for (int i = 0; i < 50; i++) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri" + std::to_string(i))
+ .SetSchema("Message")
+ .AddStringProperty("body", body)
+ .Build();
+ ASSERT_THAT(icing->Put(std::move(document)).status(), ProtoIsOk());
+ }
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("i");
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(10000);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(10000);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(10000);
+
+ for (auto s : state) {
+ SearchResultProto results = icing->Search(
+ search_spec, ScoringSpecProto::default_instance(), result_spec);
+ }
+}
+BENCHMARK(BM_QueryWithSnippet);
+
+void BM_NumericIndexing(benchmark::State& state) {
+ int num_documents = state.range(0);
+ int num_integers_per_doc = state.range(1);
+
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+
+ // Create the schema.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("integer")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ std::unique_ptr<NumberGenerator<int64_t>> integer_generator =
+ CreateIntegerGenerator(num_documents);
+ std::vector<DocumentProto> documents;
+ documents.reserve(num_documents);
+ for (int i = 0; i < num_documents; ++i) {
+ std::vector<int64_t> integers;
+ integers.reserve(num_integers_per_doc);
+ for (int j = 0; j < num_integers_per_doc; ++j) {
+ integers.push_back(integer_generator->Generate());
+ }
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri" + std::to_string(i))
+ .SetSchema("Message")
+ .AddStringProperty("body", "body hello world")
+ .AddInt64Property("integer", integers.begin(), integers.end())
+ .Build();
+ documents.push_back(std::move(document));
+ }
+
+ for (auto s : state) {
+ state.PauseTiming();
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+ state.ResumeTiming();
+
+ for (const DocumentProto& document : documents) {
+ ASSERT_THAT(icing->Put(document).status(), ProtoIsOk());
+ }
+
+ state.PauseTiming();
+ icing.reset();
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ state.ResumeTiming();
+ }
+}
+
+BENCHMARK(BM_NumericIndexing)
+ // Arguments: num_documents, num_integers_per_doc
+ ->ArgPair(1000000, 5);
+
+void BM_NumericExactQuery(benchmark::State& state) {
+ int num_documents = state.range(0);
+ int num_integers_per_doc = state.range(1);
+
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("integer")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ std::unique_ptr<NumberGenerator<int64_t>> integer_generator =
+ CreateIntegerGenerator(num_documents);
+ std::unordered_set<int64_t> chosen_integer_set;
+ for (int i = 0; i < num_documents; ++i) {
+ std::vector<int64_t> integers;
+ integers.reserve(num_integers_per_doc);
+ for (int j = 0; j < num_integers_per_doc; ++j) {
+ int64_t chosen_int = integer_generator->Generate();
+ integers.push_back(chosen_int);
+ chosen_integer_set.insert(chosen_int);
+ }
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri" + std::to_string(i))
+ .SetSchema("Message")
+ .AddStringProperty("body", "body hello world")
+ .AddInt64Property("integer", integers.begin(), integers.end())
+ .Build();
+ ASSERT_THAT(icing->Put(std::move(document)).status(), ProtoIsOk());
+ }
+
+ SearchSpecProto search_spec;
+ search_spec.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec.add_enabled_features(std::string(kNumericSearchFeature));
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+
+ std::vector<int64_t> chosen_integers(chosen_integer_set.begin(),
+ chosen_integer_set.end());
+ std::uniform_int_distribution<> distrib(0, chosen_integers.size() - 1);
+ std::default_random_engine e(/*seed=*/12345);
+ for (auto s : state) {
+ int64_t exact = chosen_integers[distrib(e)];
+ search_spec.set_query("integer == " + std::to_string(exact));
+
+ SearchResultProto results =
+ icing->Search(search_spec, scoring_spec, result_spec);
+ ASSERT_THAT(results.status(), ProtoIsOk());
+ ASSERT_GT(results.results_size(), 0);
+ if (results.next_page_token() != kInvalidNextPageToken) {
+ icing->InvalidateNextPageToken(results.next_page_token());
+ }
+ }
+}
+BENCHMARK(BM_NumericExactQuery)
+ // Arguments: num_documents, num_integers_per_doc
+ ->ArgPair(1000000, 5);
+
+void BM_NumericRangeQueryAll(benchmark::State& state) {
+ int num_documents = state.range(0);
+ int num_integers_per_doc = state.range(1);
+
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("integer")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ std::unique_ptr<NumberGenerator<int64_t>> integer_generator =
+ CreateIntegerGenerator(num_documents);
+ for (int i = 0; i < num_documents; ++i) {
+ std::vector<int64_t> integers;
+ integers.reserve(num_integers_per_doc);
+ for (int j = 0; j < num_integers_per_doc; ++j) {
+ integers.push_back(integer_generator->Generate());
+ }
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri" + std::to_string(i))
+ .SetSchema("Message")
+ .AddStringProperty("body", "body hello world")
+ .AddInt64Property("integer", integers.begin(), integers.end())
+ .Build();
+ ASSERT_THAT(icing->Put(std::move(document)).status(), ProtoIsOk());
+ }
+
+ SearchSpecProto search_spec;
+ search_spec.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec.add_enabled_features(std::string(kNumericSearchFeature));
+ search_spec.set_query("integer >= " +
+ std::to_string(std::numeric_limits<int64_t>::min()));
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+
+ for (auto s : state) {
+ SearchResultProto results =
+ icing->Search(search_spec, scoring_spec, result_spec);
+ ASSERT_THAT(results.status(), ProtoIsOk());
+ ASSERT_GT(results.results_size(), 0);
+ if (results.next_page_token() != kInvalidNextPageToken) {
+ icing->InvalidateNextPageToken(results.next_page_token());
+ }
+ }
+}
+BENCHMARK(BM_NumericRangeQueryAll)
+ // Arguments: num_documents, num_integers_per_doc
+ ->ArgPair(1000000, 5);
+
+void BM_JoinQueryQualifiedId(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("firstName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("lastName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("personQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ options.set_document_store_namespace_id_fingerprint(true);
+ options.set_use_new_qualified_id_join_index(true);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ // Create Person documents (parent)
+ static constexpr int kNumPersonDocuments = 1000;
+ for (int i = 0; i < kNumPersonDocuments; ++i) {
+ std::string person_id = std::to_string(i);
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person" + person_id)
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first" + person_id)
+ .AddStringProperty("lastName", "last" + person_id)
+ .AddStringProperty("emailAddress",
+ "person" + person_id + "@gmail.com")
+ .Build();
+ ASSERT_THAT(icing->Put(std::move(person)).status(), ProtoIsOk());
+ }
+
+ // Create Email documents (child)
+ static constexpr int kNumEmailDocuments = 1000;
+ std::uniform_int_distribution<> distrib(0, kNumPersonDocuments - 1);
+ std::default_random_engine e(/*seed=*/12345);
+ for (int i = 0; i < kNumEmailDocuments; ++i) {
+ std::string email_id = std::to_string(i);
+ std::string person_id = std::to_string(distrib(e));
+ DocumentProto email =
+ DocumentBuilder()
+ .SetKey("namespace", "email" + email_id)
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject " + email_id)
+ .AddStringProperty("body", "message body")
+ .AddStringProperty("personQualifiedId",
+ "pkg$db/namespace#person" + person_id)
+ .Build();
+ ASSERT_THAT(icing->Put(std::move(email)).status(), ProtoIsOk());
+ }
+
+ // Parent SearchSpec
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("firstName:first");
+
+ // JoinSpec
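+  // Joins Email (child) documents to the Person (parent) documents they
+  // reference: each Email's "personQualifiedId" is matched against the
+  // parent's qualified id, and child scores are aggregated with MAX.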
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("personQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::MAX);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::PREFIX);
+ nested_search_spec->set_query("subject:test");
+ *nested_spec->mutable_scoring_spec() = ScoringSpecProto::default_instance();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ static constexpr int kNumPerPage = 10;
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(kNumPerPage);
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ ScoringSpecProto score_spec = ScoringSpecProto::default_instance();
+
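+  // Left fold that sums the number of joined child documents across the
+  // parent results on a page. Used with std::accumulate below, whose strict
+  // left-to-right fold matches the lambda's (accumulator, result) signature.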
+ const auto child_count_reduce_func =
+ [](int child_count, const SearchResultProto::ResultProto& result) -> int {
+ return child_count + result.joined_results_size();
+ };
+ for (auto s : state) {
+ int total_parent_count = 0;
+ int total_child_count = 0;
+ SearchResultProto results =
+ icing->Search(search_spec, score_spec, result_spec);
+ total_parent_count += results.results_size();
+    total_child_count +=
+        std::accumulate(results.results().begin(), results.results().end(), 0,
+                        child_count_reduce_func);
+
+ ASSERT_THAT(total_parent_count, Eq(kNumPerPage));
+ ASSERT_THAT(total_child_count, ::testing::Ge(0));
+ }
+}
+BENCHMARK(BM_JoinQueryQualifiedId);
+
+void BM_PersistToDisk(benchmark::State& state) {
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ std::default_random_engine random;
+ int num_types = kAvgNumNamespaces * kAvgNumTypes;
+ ExactStringPropertyGenerator property_generator;
+ SchemaGenerator<ExactStringPropertyGenerator> schema_generator(
+ /*num_properties=*/state.range(1), &property_generator);
+ SchemaProto schema = schema_generator.GenerateSchema(num_types);
+ EvenDistributionTypeSelector type_selector(schema);
+
+ // Generate documents.
+ int num_docs = state.range(0);
+ std::vector<std::string> language = CreateLanguages(kLanguageSize, &random);
+ const std::vector<DocumentProto> random_docs =
+ GenerateRandomDocuments(&type_selector, num_docs, language);
+
+ for (auto _ : state) {
+ state.PauseTiming();
+ // Create the index.
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ options.set_index_merge_size(kIcingFullIndexSize);
+ options.set_use_persistent_hash_map(true);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Reset().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ for (const DocumentProto& doc : random_docs) {
+ ASSERT_THAT(icing->Put(doc).status(), ProtoIsOk());
+ }
+
+ state.ResumeTiming();
+
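+    // Only the FULL PersistToDisk call below is timed; engine setup above and
+    // teardown below are excluded via PauseTiming()/ResumeTiming().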
+ ASSERT_THAT(icing->PersistToDisk(PersistType::FULL).status(), ProtoIsOk());
+
+ state.PauseTiming();
+ icing.reset();
+ ASSERT_TRUE(filesystem.DeleteDirectoryRecursively(test_dir.c_str()));
+ state.ResumeTiming();
+ }
+}
+BENCHMARK(BM_PersistToDisk)
+ // Arguments: num_indexed_documents, num_sections
+ ->ArgPair(1024, 5);
+
} // namespace
} // namespace lib
diff --git a/icing/icing-search-engine_delete_test.cc b/icing/icing-search-engine_delete_test.cc
new file mode 100644
index 0000000..c3b1ccd
--- /dev/null
+++ b/icing/icing-search-engine_delete_test.cc
@@ -0,0 +1,768 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/icing-search-engine.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/jni/jni-cache.h"
+#include "icing/portable/endian.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/initialize.pb.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/proto/optimize.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/reset.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/status.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::Eq;
+using ::testing::Ge;
+using ::testing::Gt;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::Return;
+using ::testing::SizeIs;
+using ::testing::StrEq;
+using ::testing::UnorderedElementsAre;
+
+// For mocking purpose, we allow tests to provide a custom Filesystem.
+class TestIcingSearchEngine : public IcingSearchEngine {
+ public:
+ TestIcingSearchEngine(const IcingSearchEngineOptions& options,
+ std::unique_ptr<const Filesystem> filesystem,
+ std::unique_ptr<const IcingFilesystem> icing_filesystem,
+ std::unique_ptr<Clock> clock,
+ std::unique_ptr<JniCache> jni_cache)
+ : IcingSearchEngine(options, std::move(filesystem),
+ std::move(icing_filesystem), std::move(clock),
+ std::move(jni_cache)) {}
+};
+
+std::string GetTestBaseDir() { return GetTestTempDir() + "/icing"; }
+
+// This test fixture covers all tests relating to IcingSearchEngine::Delete*.
+class IcingSearchEngineDeleteTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ // If we've specified using the reverse-JNI method for segmentation (i.e.
+ // not ICU), then we won't have the ICU data file included to set up.
+ // Technically, we could choose to use reverse-JNI for segmentation AND
+ // include an ICU data file, but that seems unlikely and our current BUILD
+ // setup doesn't do this.
+ // File generated via icu_data_file rule in //icing/BUILD.
+ std::string icu_data_file_path =
+ GetTestFilePath("icing/icu.dat");
+ ICING_ASSERT_OK(
+ icu_data_file_helper::SetUpICUDataFile(icu_data_file_path));
+ }
+ filesystem_.CreateDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ const Filesystem* filesystem() const { return &filesystem_; }
+
+ private:
+ Filesystem filesystem_;
+};
+
+// Non-zero value so we don't override it to be the current time
+constexpr int64_t kDefaultCreationTimestampMs = 1575492852000;
+
+IcingSearchEngineOptions GetDefaultIcingOptions() {
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_base_dir(GetTestBaseDir());
+ return icing_options;
+}
+
+SchemaProto CreateMessageSchema() {
+ return SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+}
+
+SchemaProto CreateEmailSchema() {
+ return SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+}
+
+ScoringSpecProto GetDefaultScoringSpec() {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ return scoring_spec;
+}
+
+TEST_F(IcingSearchEngineDeleteTest, DeleteBySchemaType) {
+ SchemaProto schema;
+ // Add an email type
+ auto type = schema.add_types();
+ type->set_schema_type("email");
+ auto property = type->add_properties();
+ property->set_property_name("subject");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+  // Add a message type
+ type = schema.add_types();
+ type->set_schema_type("message");
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("email")
+ .AddStringProperty("subject", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(7);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete the first type. The first doc should be irretrievable. The
+ // second should still be present.
+ DeleteBySchemaTypeResultProto result_proto =
+ icing.DeleteBySchemaType("message");
+ EXPECT_THAT(result_proto.status(), ProtoIsOk());
+ DeleteStatsProto exp_stats;
+ exp_stats.set_delete_type(DeleteStatsProto::DeleteType::SCHEMA_TYPE);
+ exp_stats.set_latency_ms(7);
+ exp_stats.set_num_documents_deleted(1);
+ EXPECT_THAT(result_proto.delete_stats(), EqualsProto(exp_stats));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Search for "message", only document2 should show up.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("message");
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineDeleteTest, DeleteSchemaTypeByQuery) {
+ SchemaProto schema = CreateMessageSchema();
+ // Add an email type
+ SchemaProto tmp = CreateEmailSchema();
+ *schema.add_types() = tmp.types(0);
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema(schema.types(0).schema_type())
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema(schema.types(1).schema_type())
+ .AddStringProperty("subject", "subject subject2")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete the first type. The first doc should be irretrievable. The
+ // second should still be present.
+ SearchSpecProto search_spec;
+ search_spec.add_schema_type_filters(schema.types(0).schema_type());
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status(), ProtoIsOk());
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineDeleteTest, DeleteByNamespace) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace3", "uri3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(7);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(
+ icing.Get("namespace3", "uri3", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete namespace1. Document1 and document2 should be irretrievable.
+ // Document3 should still be present.
+ DeleteByNamespaceResultProto result_proto =
+ icing.DeleteByNamespace("namespace1");
+ EXPECT_THAT(result_proto.status(), ProtoIsOk());
+ DeleteStatsProto exp_stats;
+ exp_stats.set_delete_type(DeleteStatsProto::DeleteType::NAMESPACE);
+ exp_stats.set_latency_ms(7);
+ exp_stats.set_num_documents_deleted(2);
+ EXPECT_THAT(result_proto.delete_stats(), EqualsProto(exp_stats));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri2) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(
+ icing.Get("namespace3", "uri3", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Search for "message", only document3 should show up.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("message");
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineDeleteTest, DeleteNamespaceByQuery) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete the first namespace. The first doc should be irretrievable. The
+ // second should still be present.
+ SearchSpecProto search_spec;
+ search_spec.add_namespace_filters("namespace1");
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status(), ProtoIsOk());
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineDeleteTest, DeleteByQuery) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(7);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete all docs containing 'body1'. The first doc should be irretrievable.
+ // The second should still be present.
+ SearchSpecProto search_spec;
+ search_spec.set_query("body1");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ DeleteByQueryResultProto result_proto = icing.DeleteByQuery(search_spec);
+ EXPECT_THAT(result_proto.status(), ProtoIsOk());
+ DeleteByQueryStatsProto exp_stats;
+ exp_stats.set_latency_ms(7);
+ exp_stats.set_num_documents_deleted(1);
+ exp_stats.set_query_length(search_spec.query().length());
+ exp_stats.set_num_terms(1);
+ exp_stats.set_num_namespaces_filtered(0);
+ exp_stats.set_num_schema_types_filtered(0);
+ exp_stats.set_parse_query_latency_ms(7);
+ exp_stats.set_document_removal_latency_ms(7);
+ EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace1, uri1) not found.");
+ expected_get_result_proto.clear_document();
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineDeleteTest, DeleteByQueryReturnInfo) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body3")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(7);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete all docs to test the information is correctly grouped.
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
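+  // Passing true asks DeleteByQuery to also report which documents were
+  // deleted, grouped by namespace and schema type.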
+ DeleteByQueryResultProto result_proto =
+ icing.DeleteByQuery(search_spec, true);
+ EXPECT_THAT(result_proto.status(), ProtoIsOk());
+ DeleteByQueryStatsProto exp_stats;
+ exp_stats.set_latency_ms(7);
+ exp_stats.set_num_documents_deleted(3);
+ exp_stats.set_query_length(search_spec.query().length());
+ exp_stats.set_num_terms(1);
+ exp_stats.set_num_namespaces_filtered(0);
+ exp_stats.set_num_schema_types_filtered(0);
+ exp_stats.set_parse_query_latency_ms(7);
+ exp_stats.set_document_removal_latency_ms(7);
+ EXPECT_THAT(result_proto.delete_by_query_stats(), EqualsProto(exp_stats));
+
+ // Check that DeleteByQuery can return information for deleted documents.
+ DeleteByQueryResultProto::DocumentGroupInfo info1, info2;
+ info1.set_namespace_("namespace1");
+ info1.set_schema("Message");
+ info1.add_uris("uri1");
+ info2.set_namespace_("namespace2");
+ info2.set_schema("Message");
+ info2.add_uris("uri3");
+ info2.add_uris("uri2");
+ EXPECT_THAT(result_proto.deleted_documents(),
+ UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2)));
+
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri3", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+}
+
+TEST_F(IcingSearchEngineDeleteTest, DeleteByQueryNotFound) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Delete all docs containing 'foo', which should be none of them. Both docs
+ // should still be present.
+ SearchSpecProto search_spec;
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ EXPECT_THAT(icing.DeleteByQuery(search_spec).status(),
+ ProtoStatusIs(StatusProto::NOT_FOUND));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace2", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ search_spec = SearchSpecProto::default_instance();
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine_flush_benchmark.cc b/icing/icing-search-engine_flush_benchmark.cc
new file mode 100644
index 0000000..3196ef6
--- /dev/null
+++ b/icing/icing-search-engine_flush_benchmark.cc
@@ -0,0 +1,199 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <vector>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/icing-search-engine.h"
+#include "icing/proto/initialize.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/status.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/document-generator.h"
+#include "icing/testing/random-string.h"
+#include "icing/testing/schema-generator.h"
+#include "icing/testing/tmp-directory.h"
+
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing:icing-search-engine_flush_benchmark
+//
+// $ blaze-bin/icing/icing-search-engine_flush_benchmark
+// --benchmark_filter=all --benchmark_memory_usage
+//
+// Run on an Android device:
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing:icing-search-engine_flush_benchmark
+//
+// $ adb push blaze-bin/icing/icing-search-engine_flush_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/icing-search-engine_flush_benchmark
+// --benchmark_filter=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Assume that there will be roughly 10 packages, each using 3 of its own types.
+constexpr int kAvgNumNamespaces = 10;
+constexpr int kAvgNumTypes = 3;
+
+// ASSUME: Types will have at most ten properties. Types will be created with
+// [1, 10] properties.
+constexpr int kMaxNumProperties = 10;
+
+// Based on logs from Icing GMSCore.
+constexpr int kAvgDocumentSize = 300;
+
+// ASSUME: ~70% of the document's size comes from its content.
+constexpr float kContentSizePct = 0.7;
+
+// Average length of word in English is 4.7 characters.
+constexpr int kAvgTokenLen = 5;
+// Made up value. This results in a fairly reasonable language - the majority of
+// generated words are 3-9 characters, ~3% of words are >=20 chars, and the
+// longest ones are 27 chars (roughly consistent with the longest,
+// non-contrived English words:
+// https://en.wikipedia.org/wiki/Longest_word_in_English).
+constexpr int kTokenStdDev = 7;
+constexpr int kLanguageSize = 1000;
+
+// The number of documents to index.
+constexpr int kNumDocuments = 1024;
+
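+// Generates num_namespaces package-style namespace names of the form
+// "comgooglepackage<N>".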
+std::vector<std::string> CreateNamespaces(int num_namespaces) {
+ std::vector<std::string> namespaces;
+ while (--num_namespaces >= 0) {
+ namespaces.push_back("comgooglepackage" + std::to_string(num_namespaces));
+ }
+ return namespaces;
+}
+
+// Creates a vector containing num_words randomly-generated words for use by
+// documents.
+template <typename Rand>
+std::vector<std::string> CreateLanguage(int num_words, Rand* r) {
+ std::vector<std::string> language;
+ std::normal_distribution<> norm_dist(kAvgTokenLen, kTokenStdDev);
+ while (--num_words >= 0) {
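+    // Draw a word length from the normal distribution, redrawing until it is
+    // at least one character.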
+ int word_length = 0;
+ while (word_length < 1) {
+ word_length = std::round(norm_dist(*r));
+ }
+ language.push_back(RandomString(kAlNumAlphabet, word_length, r));
+ }
+ return language;
+}
+
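+// RAII helper that recursively creates the directory on construction and
+// recursively deletes it on destruction.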
+class DestructibleDirectory {
+ public:
+ explicit DestructibleDirectory(const Filesystem& filesystem,
+ const std::string& dir)
+ : filesystem_(filesystem), dir_(dir) {
+ filesystem_.CreateDirectoryRecursively(dir_.c_str());
+ }
+ ~DestructibleDirectory() {
+ filesystem_.DeleteDirectoryRecursively(dir_.c_str());
+ }
+
+ private:
+ Filesystem filesystem_;
+ std::string dir_;
+};
+
+void BM_FlushBenchmark(benchmark::State& state) {
+ PersistType::Code persist_type =
+ (state.range(0)) ? PersistType::LITE : PersistType::FULL;
+ int num_documents_per_persist = state.range(1);
+
+ // Initialize the filesystem
+ std::string test_dir = GetTestTempDir() + "/icing/benchmark/flush";
+ Filesystem filesystem;
+ DestructibleDirectory ddir(filesystem, test_dir);
+
+ // Create the schema.
+ std::default_random_engine random;
+ int num_types = kAvgNumNamespaces * kAvgNumTypes;
+ ExactStringPropertyGenerator property_generator;
+ RandomSchemaGenerator<std::default_random_engine,
+ ExactStringPropertyGenerator>
+ schema_generator(&random, &property_generator);
+ SchemaProto schema =
+ schema_generator.GenerateSchema(num_types, kMaxNumProperties);
+ EvenDistributionTypeSelector type_selector(schema);
+
+ std::vector<std::string> namespaces = CreateNamespaces(kAvgNumNamespaces);
+ EvenDistributionNamespaceSelector namespace_selector(namespaces);
+
+ std::vector<std::string> language = CreateLanguage(kLanguageSize, &random);
+ UniformDistributionLanguageTokenGenerator<std::default_random_engine>
+ token_generator(language, &random);
+
+ DocumentGenerator<
+ EvenDistributionNamespaceSelector, EvenDistributionTypeSelector,
+ UniformDistributionLanguageTokenGenerator<std::default_random_engine>>
+ generator(&namespace_selector, &type_selector, &token_generator,
+ kAvgDocumentSize * kContentSizePct);
+
+ IcingSearchEngineOptions options;
+ options.set_base_dir(test_dir);
+ std::unique_ptr<IcingSearchEngine> icing =
+ std::make_unique<IcingSearchEngine>(options);
+
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
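+  // Index kNumDocuments documents per iteration, persisting after every
+  // num_documents_per_persist Puts with either a LITE or FULL flush.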
+ for (auto s : state) {
+ for (int i = 0; i < kNumDocuments; ++i) {
+ icing->Put(generator.generateDoc());
+
+ if (i % num_documents_per_persist == num_documents_per_persist - 1) {
+ icing->PersistToDisk(persist_type);
+ }
+ }
+ }
+}
+BENCHMARK(BM_FlushBenchmark)
+    // First argument: whether to persist with PersistType::LITE (vs. FULL).
+    // Second argument: num_documents_per_persist.
+ ->ArgPair(true, 1)
+ ->ArgPair(false, 1)
+ ->ArgPair(true, 32)
+ ->ArgPair(false, 32)
+ ->ArgPair(true, 1024)
+ ->ArgPair(false, 1024);
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine_fuzz_test.cc b/icing/icing-search-engine_fuzz_test.cc
index d31f836..2cf19ad 100644
--- a/icing/icing-search-engine_fuzz_test.cc
+++ b/icing/icing-search-engine_fuzz_test.cc
@@ -18,11 +18,15 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/document-builder.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/icing-search-engine.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
+#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -36,20 +40,6 @@ IcingSearchEngineOptions Setup() {
return icing_options;
}
-SchemaProto SetTypes() {
- SchemaProto schema;
- SchemaTypeConfigProto* type = schema.add_types();
- type->set_schema_type("Message");
- PropertyConfigProto* body = type->add_properties();
- body->set_property_name("body");
- body->set_data_type(PropertyConfigProto::DataType::STRING);
- body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- return schema;
-}
-
DocumentProto MakeDocument(const uint8_t* data, size_t size) {
 // TODO (sidchhabra): Add more optimized fuzzing techniques.
DocumentProto document;
@@ -82,7 +72,15 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
// TODO (b/145758378): Deleting directory should not be required.
filesystem_.DeleteDirectoryRecursively(icing_options.base_dir().c_str());
icing.Initialize();
- SchemaProto schema_proto = SetTypes();
+
+ SchemaProto schema_proto =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
icing.SetSchema(schema_proto);
// Index
diff --git a/icing/icing-search-engine_initialization_test.cc b/icing/icing-search-engine_initialization_test.cc
new file mode 100644
index 0000000..122e4af
--- /dev/null
+++ b/icing/icing-search-engine_initialization_test.cc
@@ -0,0 +1,6030 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/document-builder.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/file/version-util.h"
+#include "icing/icing-search-engine.h"
+#include "icing/index/data-indexing-handler.h"
+#include "icing/index/index-processor.h"
+#include "icing/index/index.h"
+#include "icing/index/integer-section-indexing-handler.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/integer-index.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/index/term-indexing-handler.h"
+#include "icing/jni/jni-cache.h"
+#include "icing/join/join-processor.h"
+#include "icing/join/qualified-id-join-index-impl-v2.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/join/qualified-id-join-indexing-handler.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/portable/endian.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/initialize.pb.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/proto/optimize.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/reset.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/status.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/query/query-features.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-associated-score-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-log-creator.h"
+#include "icing/store/document-store.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/clock.h"
+#include "icing/util/tokenized-document.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::_;
+using ::testing::AtLeast;
+using ::testing::DoDefault;
+using ::testing::EndsWith;
+using ::testing::Eq;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::Matcher;
+using ::testing::Ne;
+using ::testing::Return;
+using ::testing::SizeIs;
+
+constexpr std::string_view kIpsumText =
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla convallis "
+ "scelerisque orci quis hendrerit. Sed augue turpis, sodales eu gravida "
+ "nec, scelerisque nec leo. Maecenas accumsan interdum commodo. Aliquam "
+ "mattis sapien est, sit amet interdum risus dapibus sed. Maecenas leo "
+ "erat, fringilla in nisl a, venenatis gravida metus. Phasellus venenatis, "
+ "orci in aliquet mattis, lectus sapien volutpat arcu, sed hendrerit ligula "
+ "arcu nec mauris. Integer dolor mi, rhoncus eget gravida et, pulvinar et "
+ "nunc. Aliquam ac sollicitudin nisi. Vivamus sit amet urna vestibulum, "
+ "tincidunt eros sed, efficitur nisl. Fusce non neque accumsan, sagittis "
+ "nisi eget, sagittis turpis. Ut pulvinar nibh eu purus feugiat faucibus. "
+ "Donec tellus nulla, tincidunt vel lacus id, bibendum fermentum turpis. "
+ "Nullam ultrices sed nibh vitae aliquet. Ut risus neque, consectetur "
+ "vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh "
+ "placerat semper.";
+
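+// Test helpers that read and write the document log's on-disk header
+// directly, so tests can inspect or modify it.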
+PortableFileBackedProtoLog<DocumentWrapper>::Header ReadDocumentLogHeader(
+ Filesystem filesystem, const std::string& file_path) {
+ PortableFileBackedProtoLog<DocumentWrapper>::Header header;
+ filesystem.PRead(file_path.c_str(), &header,
+ sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header),
+ /*offset=*/0);
+ return header;
+}
+
+void WriteDocumentLogHeader(
+ Filesystem filesystem, const std::string& file_path,
+ PortableFileBackedProtoLog<DocumentWrapper>::Header& header) {
+ filesystem.Write(file_path.c_str(), &header,
+ sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header));
+}
+
+// For mocking purpose, we allow tests to provide a custom Filesystem.
+class TestIcingSearchEngine : public IcingSearchEngine {
+ public:
+ TestIcingSearchEngine(const IcingSearchEngineOptions& options,
+ std::unique_ptr<const Filesystem> filesystem,
+ std::unique_ptr<const IcingFilesystem> icing_filesystem,
+ std::unique_ptr<Clock> clock,
+ std::unique_ptr<JniCache> jni_cache)
+ : IcingSearchEngine(options, std::move(filesystem),
+ std::move(icing_filesystem), std::move(clock),
+ std::move(jni_cache)) {}
+};
+
+std::string GetTestBaseDir() { return GetTestTempDir() + "/icing"; }
+
+// This test fixture covers all tests relating to
+// IcingSearchEngine::Initialize.
+class IcingSearchEngineInitializationTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ // If we've specified using the reverse-JNI method for segmentation (i.e.
+ // not ICU), then we won't have the ICU data file included to set up.
+ // Technically, we could choose to use reverse-JNI for segmentation AND
+ // include an ICU data file, but that seems unlikely and our current BUILD
+ // setup doesn't do this.
+ // File generated via icu_data_file rule in //icing/BUILD.
+ std::string icu_data_file_path =
+ GetTestFilePath("icing/icu.dat");
+ ICING_ASSERT_OK(
+ icu_data_file_helper::SetUpICUDataFile(icu_data_file_path));
+ }
+ filesystem_.CreateDirectoryRecursively(GetTestBaseDir().c_str());
+
+ language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ normalizer_,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int32_t>::max()));
+ }
+
+ void TearDown() override {
+ normalizer_.reset();
+ lang_segmenter_.reset();
+ filesystem_.DeleteDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ const Filesystem* filesystem() const { return &filesystem_; }
+
+ const IcingFilesystem* icing_filesystem() const { return &icing_filesystem_; }
+
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+ std::unique_ptr<Normalizer> normalizer_;
+};
+
+// Non-zero value so we don't override it to be the current time
+constexpr int64_t kDefaultCreationTimestampMs = 1575492852000;
+
+std::string GetVersionFilename() { return GetTestBaseDir() + "/version"; }
+
+std::string GetDocumentDir() { return GetTestBaseDir() + "/document_dir"; }
+
+std::string GetIndexDir() { return GetTestBaseDir() + "/index_dir"; }
+
+std::string GetIntegerIndexDir() {
+ return GetTestBaseDir() + "/integer_index_dir";
+}
+
+std::string GetQualifiedIdJoinIndexDir() {
+ return GetTestBaseDir() + "/qualified_id_join_index_dir";
+}
+
+std::string GetSchemaDir() { return GetTestBaseDir() + "/schema_dir"; }
+
+std::string GetHeaderFilename() {
+ return GetTestBaseDir() + "/icing_search_engine_header";
+}
+
+IcingSearchEngineOptions GetDefaultIcingOptions() {
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_base_dir(GetTestBaseDir());
+ icing_options.set_document_store_namespace_id_fingerprint(true);
+ icing_options.set_use_new_qualified_id_join_index(true);
+ return icing_options;
+}
+
+DocumentProto CreateMessageDocument(std::string name_space, std::string uri) {
+ return DocumentBuilder()
+ .SetKey(std::move(name_space), std::move(uri))
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+}
+
+DocumentProto CreateEmailDocument(const std::string& name_space,
+ const std::string& uri, int score,
+ const std::string& subject_content,
+ const std::string& body_content) {
+ return DocumentBuilder()
+ .SetKey(name_space, uri)
+ .SetSchema("Email")
+ .SetScore(score)
+ .AddStringProperty("subject", subject_content)
+ .AddStringProperty("body", body_content)
+ .Build();
+}
+
+SchemaTypeConfigProto CreateMessageSchemaTypeConfig() {
+ return SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+}
+
+SchemaTypeConfigProto CreateEmailSchemaTypeConfig() {
+ return SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+}
+
+SchemaProto CreateMessageSchema() {
+ return SchemaBuilder().AddType(CreateMessageSchemaTypeConfig()).Build();
+}
+
+SchemaProto CreateEmailSchema() {
+ return SchemaBuilder().AddType(CreateEmailSchemaTypeConfig()).Build();
+}
+
+ScoringSpecProto GetDefaultScoringSpec() {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ return scoring_spec;
+}
+
+// TODO(b/272145329): create SearchSpecBuilder, JoinSpecBuilder,
+// SearchResultProtoBuilder and ResultProtoBuilder for unit tests and build all
+// instances by them.
+
+TEST_F(IcingSearchEngineInitializationTest, UninitializedInstanceFailsSafely) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+
+ SchemaProto email_schema = CreateMessageSchema();
+ EXPECT_THAT(icing.SetSchema(email_schema).status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.GetSchema().status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.GetSchemaType(email_schema.types(0).schema_type()).status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+
+ DocumentProto doc = CreateMessageDocument("namespace", "uri");
+ EXPECT_THAT(icing.Put(doc).status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing
+ .Get(doc.namespace_(), doc.uri(),
+ GetResultSpecProto::default_instance())
+ .status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Delete(doc.namespace_(), doc.uri()).status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.DeleteByNamespace(doc.namespace_()).status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.DeleteBySchemaType(email_schema.types(0).schema_type())
+ .status()
+ .code(),
+ Eq(StatusProto::FAILED_PRECONDITION));
+
+ SearchSpecProto search_spec = SearchSpecProto::default_instance();
+ ScoringSpecProto scoring_spec = ScoringSpecProto::default_instance();
+ ResultSpecProto result_spec = ResultSpecProto::default_instance();
+ EXPECT_THAT(icing.Search(search_spec, scoring_spec, result_spec).status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ constexpr int kSomePageToken = 12;
+ EXPECT_THAT(icing.GetNextPage(kSomePageToken).status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ icing.InvalidateNextPageToken(kSomePageToken); // Verify this doesn't crash.
+
+ EXPECT_THAT(icing.PersistToDisk(PersistType::FULL).status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Optimize().status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+}
+
+TEST_F(IcingSearchEngineInitializationTest, SimpleInitialization) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(DocumentProto(document)).status(), ProtoIsOk());
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializingAgainSavesNonPersistedData) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document;
+
+ ASSERT_THAT(
+ icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(
+ icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ MaxIndexMergeSizeReturnsInvalidArgument) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_index_merge_size(std::numeric_limits<int32_t>::max());
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ NegativeMergeSizeReturnsInvalidArgument) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_index_merge_size(-1);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ ZeroMergeSizeReturnsInvalidArgument) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_index_merge_size(0);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineInitializationTest, GoodIndexMergeSizeReturnsOk) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ // One is fine, if a bit weird. It just means that the lite index will be
+ // smaller and will request a merge any time content is added to it.
+ options.set_index_merge_size(1);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+}
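+
+// Taken together, the tests above bound index_merge_size: zero and negative
+// values are rejected, as is INT32_MAX, while small positive values work.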
+
+TEST_F(IcingSearchEngineInitializationTest,
+ NegativeMaxTokenLenReturnsInvalidArgument) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_max_token_length(-1);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ ZeroMaxTokenLenReturnsInvalidArgument) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_max_token_length(0);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ NegativeCompressionLevelReturnsInvalidArgument) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_compression_level(-1);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ GreaterThanMaxCompressionLevelReturnsInvalidArgument) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_compression_level(10);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineInitializationTest, GoodCompressionLevelReturnsOk) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_compression_level(0);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+}
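+
+// The accepted compression levels appear to follow zlib's 0-9 convention:
+// -1 and 10 are rejected above, while 0, 3, and 9 initialize fine (see the
+// reinitialization test below).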
+
+TEST_F(IcingSearchEngineInitializationTest,
+ ReinitializingWithDifferentCompressionLevelReturnsOk) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_compression_level(3);
+ {
+ IcingSearchEngine icing(options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ ASSERT_THAT(icing.PersistToDisk(PersistType::FULL).status(), ProtoIsOk());
+ }
+ options.set_compression_level(9);
+ {
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ }
+ options.set_compression_level(0);
+ {
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest, FailToCreateDocStore) {
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ // This fails DocumentStore::Create()
+ ON_CALL(*mock_filesystem, CreateDirectoryRecursively(_))
+ .WillByDefault(Return(false));
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(),
+ ProtoStatusIs(StatusProto::INTERNAL));
+ EXPECT_THAT(initialize_result_proto.status().message(),
+ HasSubstr("Could not create directory"));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitMarkerFilePreviousFailuresAtThreshold) {
+ Filesystem filesystem;
+ DocumentProto email1 =
+ CreateEmailDocument("namespace", "uri1", 100, "subject1", "body1");
+ email1.set_creation_timestamp_ms(10000);
+ DocumentProto email2 =
+ CreateEmailDocument("namespace", "uri2", 50, "subject2", "body2");
+ email2.set_creation_timestamp_ms(10000);
+
+ {
+ // Create an index with a few documents.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ ASSERT_THAT(init_result.status(), ProtoIsOk());
+ ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
+ Eq(0));
+ ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk());
+ }
+
+ // Write an init marker file with 5 previously failed attempts.
+ std::string marker_filepath = GetTestBaseDir() + "/init_marker";
+
+ {
+ ScopedFd marker_file_fd(filesystem.OpenForWrite(marker_filepath.c_str()));
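+    // The marker file stores the failure count as a 32-bit integer in
+    // network byte order (GHostToNetworkL is icing's htonl-style helper).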
+ int network_init_attempts = GHostToNetworkL(5);
+ // Write the updated number of attempts before we get started.
+ ASSERT_TRUE(filesystem.PWrite(marker_file_fd.get(), 0,
+ &network_init_attempts,
+ sizeof(network_init_attempts)));
+ ASSERT_TRUE(filesystem.DataSync(marker_file_fd.get()));
+ }
+
+ {
+ // Create the index again and verify that initialization succeeds and no
+ // data is thrown out.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ ASSERT_THAT(init_result.status(), ProtoIsOk());
+ ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
+ Eq(5));
+ EXPECT_THAT(
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance())
+ .document(),
+ EqualsProto(email1));
+ EXPECT_THAT(
+ icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
+ .document(),
+ EqualsProto(email2));
+ }
+
+ // The successful init should have thrown out the marker file.
+ ASSERT_FALSE(filesystem.FileExists(marker_filepath.c_str()));
+}
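+
+// Note: together with InitMarkerFilePreviousFailuresBeyondThreshold below,
+// the test above pins the retry threshold: five previous init failures keep
+// the data, while six trigger recovery with WARNING_DATA_LOSS.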
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitMarkerFilePreviousFailuresBeyondThreshold) {
+ Filesystem filesystem;
+ DocumentProto email1 =
+ CreateEmailDocument("namespace", "uri1", 100, "subject1", "body1");
+ DocumentProto email2 =
+ CreateEmailDocument("namespace", "uri2", 50, "subject2", "body2");
+
+ {
+ // Create an index with a few documents.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ ASSERT_THAT(init_result.status(), ProtoIsOk());
+ ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
+ Eq(0));
+ ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk());
+ }
+
+ // Write an init marker file with 6 previously failed attempts.
+ std::string marker_filepath = GetTestBaseDir() + "/init_marker";
+
+ {
+ ScopedFd marker_file_fd(filesystem.OpenForWrite(marker_filepath.c_str()));
+ int network_init_attempts = GHostToNetworkL(6);
+ // Write the updated number of attempts before we get started.
+ ASSERT_TRUE(filesystem.PWrite(marker_file_fd.get(), 0,
+ &network_init_attempts,
+ sizeof(network_init_attempts)));
+ ASSERT_TRUE(filesystem.DataSync(marker_file_fd.get()));
+ }
+
+ {
+ // Create the index again and verify that initialization succeeds and all
+ // data is thrown out.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ ASSERT_THAT(init_result.status(),
+ ProtoStatusIs(StatusProto::WARNING_DATA_LOSS));
+ ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
+ Eq(6));
+ EXPECT_THAT(
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance())
+ .status(),
+ ProtoStatusIs(StatusProto::NOT_FOUND));
+ EXPECT_THAT(
+ icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
+ .status(),
+ ProtoStatusIs(StatusProto::NOT_FOUND));
+ }
+
+ // The successful init should have thrown out the marker file.
+ ASSERT_FALSE(filesystem.FileExists(marker_filepath.c_str()));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ SuccessiveInitFailuresIncrementsInitMarker) {
+ Filesystem filesystem;
+ DocumentProto email1 =
+ CreateEmailDocument("namespace", "uri1", 100, "subject1", "body1");
+ DocumentProto email2 =
+ CreateEmailDocument("namespace", "uri2", 50, "subject2", "body2");
+
+ {
+ // 1. Create an index with a few documents.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ ASSERT_THAT(init_result.status(), ProtoIsOk());
+ ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
+ Eq(0));
+ ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk());
+ }
+
+ {
+ // 2. Create an index that will encounter an IO failure when trying to
+ // create the document log.
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ std::string document_log_filepath =
+ icing_options.base_dir() + "/document_dir/document_log_v1";
+ ON_CALL(*mock_filesystem,
+ GetFileSize(Matcher<const char*>(Eq(document_log_filepath))))
+ .WillByDefault(Return(Filesystem::kBadFileSize));
+
+ TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ // Fail to initialize six times in a row.
+ InitializeResultProto init_result = icing.Initialize();
+ ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL));
+ ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
+ Eq(0));
+
+ init_result = icing.Initialize();
+ ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL));
+ ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
+ Eq(1));
+
+ init_result = icing.Initialize();
+ ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL));
+ ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
+ Eq(2));
+
+ init_result = icing.Initialize();
+ ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL));
+ ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
+ Eq(3));
+
+ init_result = icing.Initialize();
+ ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL));
+ ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
+ Eq(4));
+
+ init_result = icing.Initialize();
+ ASSERT_THAT(init_result.status(), ProtoStatusIs(StatusProto::INTERNAL));
+ ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
+ Eq(5));
+ }
+
+ {
+ // 3. Create the index again and verify that initialization succeeds and all
+ // data is thrown out.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ ASSERT_THAT(init_result.status(),
+ ProtoStatusIs(StatusProto::WARNING_DATA_LOSS));
+ ASSERT_THAT(init_result.initialize_stats().num_previous_init_failures(),
+ Eq(6));
+
+ EXPECT_THAT(
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance())
+ .status(),
+ ProtoStatusIs(StatusProto::NOT_FOUND));
+ EXPECT_THAT(
+ icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
+ .status(),
+ ProtoStatusIs(StatusProto::NOT_FOUND));
+ }
+
+ // The successful init should have thrown out the marker file.
+ std::string marker_filepath = GetTestBaseDir() + "/init_marker";
+ ASSERT_FALSE(filesystem.FileExists(marker_filepath.c_str()));
+}
+
+TEST_F(IcingSearchEngineInitializationTest, RecoverFromMissingHeaderFile) {
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ CreateMessageDocument("namespace", "uri");
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() =
+ CreateMessageDocument("namespace", "uri");
+
+ {
+ // Basic initialization/setup
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(),
+ ProtoIsOk());
+ EXPECT_THAT(
+ icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ EXPECT_TRUE(filesystem()->DeleteFile(GetHeaderFilename().c_str()));
+
+ // We should be able to recover from this and access all our previous data
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Checks that DocumentLog is still ok
+ EXPECT_THAT(
+ icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Checks that the term index is still ok so we can search over it
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Checks that the integer index is still ok so we can search over it
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+  SearchResultProto search_result_proto2 =
+      icing.Search(search_spec2, ScoringSpecProto::default_instance(),
+                   ResultSpecProto::default_instance());
+  EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores(
+                                        expected_search_result_proto));
+
+  // Checks that the schema is still ok since it'll be needed to validate the
+  // document
+ EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(),
+ ProtoIsOk());
+}
+
+TEST_F(IcingSearchEngineInitializationTest, UnableToRecoverFromCorruptSchema) {
+ {
+ // Basic initialization/setup
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(),
+ ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() =
+ CreateMessageDocument("namespace", "uri");
+
+ EXPECT_THAT(
+ icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ const std::string schema_file =
+ absl_ports::StrCat(GetSchemaDir(), "/schema.pb");
+ const std::string corrupt_data = "1234";
+ EXPECT_TRUE(filesystem()->Write(schema_file.c_str(), corrupt_data.data(),
+ corrupt_data.size()));
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(),
+ ProtoStatusIs(StatusProto::INTERNAL));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ UnableToRecoverFromCorruptDocumentLog) {
+ {
+ // Basic initialization/setup
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(),
+ ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() =
+ CreateMessageDocument("namespace", "uri");
+
+ EXPECT_THAT(
+ icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ const std::string document_log_file = absl_ports::StrCat(
+ GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename());
+ const std::string corrupt_data = "1234";
+ EXPECT_TRUE(filesystem()->Write(document_log_file.c_str(),
+ corrupt_data.data(), corrupt_data.size()));
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(),
+ ProtoStatusIs(StatusProto::INTERNAL));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RecoverFromInconsistentSchemaStore) {
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2_with_additional_property =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("additional", "content")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ {
+ // Initializes folder and schema
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder(CreateMessageSchemaTypeConfig())
+ // Add non-indexable property "additional"
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("additional")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document2_with_additional_property).status(),
+ ProtoIsOk());
+
+ // Won't get us anything because "additional" isn't marked as an indexed
+ // property in the schema
+ SearchSpecProto search_spec;
+ search_spec.set_query("additional:content");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ {
+ // This schema will change the SchemaTypeIds from the previous schema_
+ // (since SchemaTypeIds are assigned based on order of the types, and this
+ // new schema changes the ordering of previous types)
+ SchemaProto new_schema;
+ auto type = new_schema.add_types();
+ type->set_schema_type("Email");
+
+    // Switching a non-indexable property to indexable changes the SectionIds
+    // (since SectionIds are assigned based on alphabetical order of indexed
+    // sections, marking "additional" as an indexed property will push the
+    // "body" and "indexableInteger" properties to different SectionIds)
+ *new_schema.add_types() =
+ SchemaTypeConfigBuilder(CreateMessageSchemaTypeConfig())
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("additional")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+ // Write the marker file
+ std::string marker_filepath =
+ absl_ports::StrCat(options.base_dir(), "/set_schema_marker");
+ ScopedFd sfd(filesystem()->OpenForWrite(marker_filepath.c_str()));
+ ASSERT_TRUE(sfd.is_valid());
+
+ // Write the new schema
+ FakeClock fake_clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(filesystem(), GetSchemaDir(), &fake_clock));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ new_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+ } // Will persist new schema
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+  // We can insert an Email document since we kept the new schema
+ DocumentProto email_document =
+ DocumentBuilder()
+ .SetKey("namespace", "email_uri")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ EXPECT_THAT(icing.Put(email_document).status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = email_document;
+
+ EXPECT_THAT(icing.Get("namespace", "email_uri",
+ GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Verify term search
+ SearchSpecProto search_spec1;
+
+ // The section restrict will ensure we are using the correct, updated
+ // SectionId in the Index
+ search_spec1.set_query("additional:content");
+
+ // Schema type filter will ensure we're using the correct, updated
+ // SchemaTypeId in the DocumentStore
+ search_spec1.add_schema_type_filters("Message");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto1;
+ expected_search_result_proto1.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto1.mutable_results()->Add()->mutable_document() =
+ document2_with_additional_property;
+
+ SearchResultProto search_result_proto1 =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto1, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto1));
+
+ // Verify numeric (integer) search
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+  search_spec2.add_schema_type_filters("Message");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+  SearchResultProto expected_search_result_proto2;
+  expected_search_result_proto2.mutable_status()->set_code(StatusProto::OK);
+  *expected_search_result_proto2.mutable_results()->Add()->mutable_document() =
+      document2_with_additional_property;
+  *expected_search_result_proto2.mutable_results()->Add()->mutable_document() =
+      document1;
+
+  SearchResultProto search_result_proto2 =
+      icing.Search(search_spec2, ScoringSpecProto::default_instance(),
+                   ResultSpecProto::default_instance());
+  EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores(
+                                        expected_search_result_proto2));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RecoverFromInconsistentDocumentStore) {
+ // Test the following scenario: document store is ahead of term, integer and
+ // qualified id join index. IcingSearchEngine should be able to recover all
+ // indices. Several additional behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should be unaffected.
+ // - Qualified id join index directory should be unaffected.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index shouldn't take effect.
+ // - "Clear()" shouldn't be called for integer index, i.e. no integer index
+ // storage sub directories (path_expr = "*/integer_index_dir/*") should be
+ // discarded.
+ // - "Clear()" shouldn't be called for qualified id join index, i.e. no
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded.
+ // - Still, we need to replay and reindex documents.
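+  // (Illustrative assumption: this kind of drift is detected by comparing
+  // each index's last-added DocumentId against the document store's; only
+  // documents past that watermark need to be replayed.)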
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message1 =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message2 =
+ DocumentBuilder()
+ .SetKey("namespace", "message/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body two")
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ {
+ // Initializes folder and schema, index one document
+ TestIcingSearchEngine icing(icing_options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message1).status(), ProtoIsOk());
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ {
+ FakeClock fake_clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(filesystem(), GetSchemaDir(), &fake_clock));
+
+ // Puts message2 into DocumentStore but doesn't index it.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ filesystem(), GetDocumentDir(), &fake_clock, schema_store.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/
+ icing_options.document_store_namespace_id_fingerprint(),
+ /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ ICING_EXPECT_OK(document_store->Put(message2));
+ }
+
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure term index directory should never be discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure integer index directory should never be discarded, and Clear()
+ // should never be called (i.e. storage sub directory
+ // "*/integer_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure qualified id join index directory should never be discarded, and
+ // Clear() should never be called (i.e. storage sub directory
+ // "*/qualified_id_join_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ EXPECT_THAT(initialize_result.status(), ProtoIsOk());
+  // Index restoration should be triggered here and message2 should be
+  // indexed.
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = message1;
+
+ // DocumentStore kept the additional document
+ EXPECT_THAT(icing.Get("namespace", "message/1",
+ GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ *expected_get_result_proto.mutable_document() = message2;
+ EXPECT_THAT(icing.Get("namespace", "message/2",
+ GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ message2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ message1;
+
+ // We indexed the additional document in all indices.
+ // Verify term search
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("message");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+ SearchResultProto search_result_proto1 =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto1, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify numeric (integer) search
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+  SearchResultProto search_result_proto2 =
+      icing.Search(search_spec2, ScoringSpecProto::default_instance(),
+                   ResultSpecProto::default_instance());
+  EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores(
+                                        expected_search_result_proto));
+
+ // Verify join search: join a query for `name:person` with a child query for
+ // `body:message` based on the child's `senderQualifiedId` field.
+ SearchSpecProto search_spec3;
+ search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec3.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec3.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:message");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec3 = ResultSpecProto::default_instance();
+ result_spec3.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto expected_join_search_result_proto;
+ expected_join_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto =
+ expected_join_search_result_proto.mutable_results()->Add();
+ *result_proto->mutable_document() = person;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() = message2;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() = message1;
+
+ SearchResultProto search_result_proto3 = icing.Search(
+ search_spec3, ScoringSpecProto::default_instance(), result_spec3);
+ EXPECT_THAT(search_result_proto3, EqualsSearchResultIgnoreStatsAndScores(
+ expected_join_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptedDocumentStore) {
+ // Test the following scenario: some document store derived files are
+ // corrupted. IcingSearchEngine should be able to recover the document store,
+ // and since NamespaceIds were reassigned, we should rebuild qualified id join
+ // index as well. Several additional behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should be unaffected.
+ // - Should discard the entire qualified id join index directory and start
+ // it from scratch.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index shouldn't take effect.
+ // - "Clear()" shouldn't be called for integer index, i.e. no integer index
+ // storage sub directories (path_expr = "*/integer_index_dir/*") should be
+ // discarded.
+ // - "Clear()" shouldn't be called for qualified id join index, i.e. no
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded.
+ // - Still, we need to replay and reindex documents (for qualified id join
+ // index).
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto personDummy =
+ DocumentBuilder()
+ .SetKey("namespace2", "personDummy")
+ .SetSchema("Person")
+ .AddStringProperty("name", "personDummy")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto person1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto person2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace2", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace2#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ {
+ // Initializes folder and schema, index one document
+ TestIcingSearchEngine icing(icing_options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ // "namespace2" (in personDummy) will be assigned NamespaceId = 0.
+ EXPECT_THAT(icing.Put(personDummy).status(), ProtoIsOk());
+ // "namespace1" (in person1) will be assigned NamespaceId = 1.
+ EXPECT_THAT(icing.Put(person1).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(person2).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+
+ // Now delete personDummy.
+ EXPECT_THAT(
+ icing.Delete(personDummy.namespace_(), personDummy.uri()).status(),
+ ProtoIsOk());
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ {
+ FakeClock fake_clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(filesystem(), GetSchemaDir(), &fake_clock));
+
+ // Manually corrupt one of the derived files of DocumentStore without
+ // updating checksum in DocumentStore header.
+ std::string score_cache_filename = GetDocumentDir() + "/score_cache";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>>
+ score_cache,
+ FileBackedVector<DocumentAssociatedScoreData>::Create(
+ *filesystem(), std::move(score_cache_filename),
+ MemoryMappedFile::READ_WRITE_AUTO_SYNC));
+ ICING_ASSERT_OK_AND_ASSIGN(const DocumentAssociatedScoreData* score_data,
+ score_cache->Get(/*idx=*/0));
+ ICING_ASSERT_OK(score_cache->Set(
+ /*idx=*/0,
+ DocumentAssociatedScoreData(score_data->corpus_id(),
+ score_data->document_score() + 1,
+ score_data->creation_timestamp_ms(),
+ score_data->length_in_tokens())));
+ ICING_ASSERT_OK(score_cache->PersistToDisk());
+ }
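+
+  // On the next initialization, the checksum recorded in the DocumentStore
+  // header no longer matches the tampered score cache, which surfaces below
+  // as document_store_recovery_cause = IO_ERROR.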
+
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure term index directory should never be discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure integer index directory should never be discarded, and Clear()
+ // should never be called (i.e. storage sub directory
+ // "*/integer_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure qualified id join index directory should be discarded once, and
+ // Clear() should never be called (i.e. storage sub directory
+ // "*/qualified_id_join_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(1);
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ EXPECT_THAT(initialize_result.status(), ProtoIsOk());
+ // DocumentStore should be recovered. When reassigning NamespaceId, the order
+ // will be the document traversal order: [person1, person2, message].
+ // Therefore, "namespace1" will have id = 0 and "namespace2" will have id = 1.
+ EXPECT_THAT(
+ initialize_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+ // Term, integer index should be unaffected.
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ // Qualified id join index should be rebuilt.
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::DEPENDENCIES_CHANGED));
+
+  // Verify join search: join a query for `name:person` with a child query for
+  // `body:message` based on the child's `senderQualifiedId` field. message
+  // should be joined to person2 correctly.
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:message");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec = ResultSpecProto::default_instance();
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
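+  // With COUNT aggregation scoring, person2 (one joined child) should rank
+  // above person1 (no children), matching the expected ordering below.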
+ SearchResultProto expected_join_search_result_proto;
+ expected_join_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto =
+ expected_join_search_result_proto.mutable_results()->Add();
+ *result_proto->mutable_document() = person2;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() = message;
+
+ *expected_join_search_result_proto.mutable_results()
+ ->Add()
+ ->mutable_document() = person1;
+
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, ScoringSpecProto::default_instance(), result_spec);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_join_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIndex) {
+ // Test the following scenario: term index is corrupted (e.g. checksum doesn't
+ // match). IcingSearchEngine should be able to recover term index. Several
+ // additional behaviors are also tested:
+ // - Index directory handling:
+ // - Should discard the entire term index directory and start it from
+ // scratch.
+ // - Integer index directory should be unaffected.
+ // - Qualified id join index directory should be unaffected.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index shouldn't take effect since we start it
+ // from scratch.
+ // - "Clear()" shouldn't be called for integer index, i.e. no integer index
+ // storage sub directories (path_expr = "*/integer_index_dir/*") should be
+ // discarded.
+ // - "Clear()" shouldn't be called for qualified id join index, i.e. no
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded.
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("body:message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ message;
+
+ {
+ // Initializes folder and schema, index one document
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ // Manually corrupt term index
+ {
+ const std::string index_hit_buffer_file = GetIndexDir() + "/idx/lite.hb";
+ ScopedFd fd(filesystem()->OpenForWrite(index_hit_buffer_file.c_str()));
+ ASSERT_TRUE(fd.is_valid());
+ ASSERT_TRUE(filesystem()->Write(fd.get(), "1234", 4));
+ }
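+
+  // Overwriting the first bytes of the lite index hit buffer presumably
+  // invalidates its stored checksum, so the next initialization detects the
+  // corruption and rebuilds the term index (index_restoration_cause =
+  // IO_ERROR below).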
+
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure term index directory should be discarded once.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(1);
+ // Ensure integer index directory should never be discarded, and Clear()
+ // should never be called (i.e. storage sub directory "*/integer_index_dir/*"
+ // should never be discarded).
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure qualified id join index directory should never be discarded, and
+ // Clear() should never be called (i.e. storage sub directory
+ // "*/qualified_id_join_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ EXPECT_THAT(initialize_result.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Check that our index is ok by searching over the restored index
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineInitializationTest, RecoverFromCorruptIntegerIndex) {
+ // Test the following scenario: integer index is corrupted (e.g. checksum
+ // doesn't match). IcingSearchEngine should be able to recover integer index.
+ // Several additional behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Should discard the entire integer index directory and start it from
+ // scratch.
+ // - Qualified id join index directory should be unaffected.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index shouldn't take effect.
+ // - "Clear()" shouldn't be called for integer index, i.e. no integer index
+ // storage sub directories (path_expr = "*/integer_index_dir/*") should be
+ // discarded, since we start it from scratch.
+ // - "Clear()" shouldn't be called for qualified id join index, i.e. no
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded.
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("indexableInteger == 123");
+ search_spec.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ message;
+
+ {
+ // Initializes folder and schema, index one document
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ // Manually corrupt integer index
+ {
+ const std::string integer_index_metadata_file =
+ GetIntegerIndexDir() + "/integer_index.m";
+ ScopedFd fd(
+ filesystem()->OpenForWrite(integer_index_metadata_file.c_str()));
+ ASSERT_TRUE(fd.is_valid());
+ ASSERT_TRUE(filesystem()->Write(fd.get(), "1234", 4));
+ }
+
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure term index directory should never be discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure integer index directory should be discarded once, and Clear()
+ // should never be called (i.e. storage sub directory "*/integer_index_dir/*"
+ // should never be discarded) since we start it from scratch.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(1);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure qualified id join index directory should never be discarded, and
+ // Clear() should never be called (i.e. storage sub directory
+ // "*/qualified_id_join_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ EXPECT_THAT(initialize_result.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Check that our index is ok by searching over the restored index
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RecoverFromIntegerIndexBucketSplitThresholdChange) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddInt64Property("indexableInteger", 123)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // 1. Create an index with a message document.
+ {
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ }
+
+ // 2. Create the index again with different
+ // integer_index_bucket_split_threshold. This should trigger index
+ // restoration.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure term index directory should never be discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure integer index directory should be discarded once, and Clear()
+ // should never be called (i.e. storage sub directory
+ // "*/integer_index_dir/*" should never be discarded) since we start it from
+ // scratch.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(1);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure qualified id join index directory should never be discarded, and
+ // Clear() should never be called (i.e. storage sub directory
+ // "*/qualified_id_join_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
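+    // A different bucket split threshold changes the integer index's on-disk
+    // layout, so initialization is expected to discard and rebuild the index
+    // (reported as an IO_ERROR restoration cause below).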
+ static constexpr int32_t kNewIntegerIndexBucketSplitThreshold = 1000;
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ ASSERT_THAT(kNewIntegerIndexBucketSplitThreshold,
+ Ne(options.integer_index_bucket_split_threshold()));
+ options.set_integer_index_bucket_split_threshold(
+ kNewIntegerIndexBucketSplitThreshold);
+
+ TestIcingSearchEngine icing(options, std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Verify integer index works normally
+ SearchSpecProto search_spec;
+ search_spec.set_query("indexableInteger == 123");
+ search_spec.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results =
+ icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document().uri(), Eq("message/1"));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RecoverFromCorruptQualifiedIdJoinIndex) {
+ // Test the following scenario: qualified id join index is corrupted (e.g.
+ // checksum doesn't match). IcingSearchEngine should be able to recover
+ // qualified id join index. Several additional behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should be unaffected.
+ // - Should discard the entire qualified id join index directory and start
+ // it from scratch.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index shouldn't take effect.
+ // - "Clear()" shouldn't be called for integer index, i.e. no integer index
+ // storage sub directories (path_expr = "*/integer_index_dir/*") should be
+ // discarded.
+ // - "Clear()" shouldn't be called for qualified id join index, i.e. no
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded, since we start
+ // it from scratch.
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // Prepare join search spec to join a query for `name:person` with a child
+ // query for `body:message` based on the child's `senderQualifiedId` field.
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:message");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
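+  // Return all joined children per parent so the expected result below can
+  // include every joined message.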
+ ResultSpecProto result_spec = ResultSpecProto::default_instance();
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto =
+ expected_search_result_proto.mutable_results()->Add();
+ *result_proto->mutable_document() = person;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() = message;
+
+ {
+    // Initializes folder and schema; indexes the documents
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ // Manually corrupt qualified id join index
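+  // As in the corrupt-integer-index case, clobbering the metadata file
+  // breaks its checksum, so the next initialization should detect the
+  // corruption and rebuild the qualified id join index (surfacing as
+  // IO_ERROR below).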
+ {
+ const std::string qualified_id_join_index_metadata_file =
+ GetQualifiedIdJoinIndexDir() + "/metadata";
+ ScopedFd fd(filesystem()->OpenForWrite(
+ qualified_id_join_index_metadata_file.c_str()));
+ ASSERT_TRUE(fd.is_valid());
+ ASSERT_TRUE(filesystem()->Write(fd.get(), "1234", 4));
+ }
+
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure term index directory should never be discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure integer index directory should never be discarded, and Clear()
+ // should never be called (i.e. storage sub directory "*/integer_index_dir/*"
+ // should never be discarded).
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure qualified id join index directory should be discarded once, and
+ // Clear() should never be called (i.e. storage sub directory
+ // "*/qualified_id_join_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(1);
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ EXPECT_THAT(initialize_result.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+
+ // Check that our index is ok by searching over the restored index
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseTermIndex) {
+  // Test the following scenario: losing the entire term index. Since we need
+  // the flash index magic to determine the version, this test throws out the
+  // entire term index and re-initializes an empty one, to bypass the
+  // undetermined-version state change and correctly trigger the "lose term
+  // index" scenario.
+ // IcingSearchEngine should be able to recover term index. Several additional
+ // behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should not be discarded (but instead just being
+ // rebuilt by replaying all docs).
+ // - Integer index directory should be unaffected.
+ // - Qualified id join index directory should be unaffected.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index shouldn't take effect since it is empty.
+ // - "Clear()" shouldn't be called for integer index, i.e. no integer index
+ // storage sub directories (path_expr = "*/integer_index_dir/*") should be
+ // discarded.
+ // - "Clear()" shouldn't be called for qualified id join index, i.e. no
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded.
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // 1. Create an index with 3 message documents.
+ {
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/2").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/3").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ }
+
+  // 2. Delete the term index data and re-initialize an empty term index to
+  // trigger RestoreIndexIfNeeded.
+ {
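+    // Deleting the "/idx" subdirectory throws away all term index data while
+    // keeping the directory itself. Re-creating the Index then writes valid,
+    // empty headers, so initialization should treat the term index as
+    // inconsistent with the ground truth and replay all documents.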
+ std::string idx_subdir = GetIndexDir() + "/idx";
+ ASSERT_TRUE(filesystem()->DeleteDirectoryRecursively(idx_subdir.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(Index::Options(GetIndexDir(),
+ /*index_merge_size=*/100,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/50),
+ filesystem(), icing_filesystem()));
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ // 3. Create the index again. This should trigger index restoration.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure term index directory should never be discarded since we've already
+ // lost it.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure integer index directory should never be discarded, and Clear()
+ // should never be called (i.e. storage sub directory
+ // "*/integer_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure qualified id join index directory should never be discarded, and
+ // Clear() should never be called (i.e. storage sub directory
+ // "*/qualified_id_join_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Verify term index works normally
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("body:consectetur");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+ SearchResultProto results1 =
+ icing.Search(search_spec1, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results1.status(), ProtoIsOk());
+ EXPECT_THAT(results1.next_page_token(), Eq(0));
+ // All documents should be retrievable.
+ ASSERT_THAT(results1.results(), SizeIs(3));
+ EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1"));
+
+ // Verify integer index works normally
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results2 =
+ icing.Search(search_spec2, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results2.results(), SizeIs(3));
+ EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1"));
+
+ // Verify qualified id join index works normally: join a query for
+ // `name:person` with a child query for `body:consectetur` based on the
+ // child's `senderQualifiedId` field.
+ SearchSpecProto search_spec3;
+ search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec3.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec3.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:consectetur");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec3 = ResultSpecProto::default_instance();
+ result_spec3.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto results3 = icing.Search(
+ search_spec3, ScoringSpecProto::default_instance(), result_spec3);
+ ASSERT_THAT(results3.results(), SizeIs(1));
+ EXPECT_THAT(results3.results(0).document().uri(), Eq("person"));
+ EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3));
+ EXPECT_THAT(results3.results(0).joined_results(0).document().uri(),
+ Eq("message/3"));
+ EXPECT_THAT(results3.results(0).joined_results(1).document().uri(),
+ Eq("message/2"));
+ EXPECT_THAT(results3.results(0).joined_results(2).document().uri(),
+ Eq("message/1"));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest, RestoreIndexLoseIntegerIndex) {
+ // Test the following scenario: losing the entire integer index directory.
+ // IcingSearchEngine should be able to recover integer index. Several
+ // additional behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should not be discarded since we've already
+ // lost it. Start it from scratch.
+ // - Qualified id join index directory should be unaffected.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index shouldn't take effect.
+ // - "Clear()" shouldn't be called for integer index, i.e. no integer index
+ // storage sub directories (path_expr = "*/integer_index_dir/*") should be
+ // discarded, since we start it from scratch.
+ // - "Clear()" shouldn't be called for qualified id join index, i.e. no
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded.
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // 1. Create an index with 3 message documents.
+ {
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/2").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/3").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ }
+
+  // 2. Delete the integer index directory to trigger RestoreIndexIfNeeded.
+ std::string integer_index_dir = GetIntegerIndexDir();
+ filesystem()->DeleteDirectoryRecursively(integer_index_dir.c_str());
+
+ // 3. Create the index again. This should trigger index restoration.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure term index directory should never be discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure integer index directory should never be discarded since we've
+ // already lost it, and Clear() should never be called (i.e. storage sub
+ // directory "*/integer_index_dir/*" should never be discarded) since we
+ // start it from scratch.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure qualified id join index directory should never be discarded, and
+ // Clear() should never be called (i.e. storage sub directory
+ // "*/qualified_id_join_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Verify term index works normally
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("body:consectetur");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+ SearchResultProto results1 =
+ icing.Search(search_spec1, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results1.status(), ProtoIsOk());
+ EXPECT_THAT(results1.next_page_token(), Eq(0));
+ // All documents should be retrievable.
+ ASSERT_THAT(results1.results(), SizeIs(3));
+ EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1"));
+
+ // Verify integer index works normally
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results2 =
+ icing.Search(search_spec2, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results2.results(), SizeIs(3));
+ EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1"));
+
+ // Verify qualified id join index works normally: join a query for
+ // `name:person` with a child query for `body:consectetur` based on the
+ // child's `senderQualifiedId` field.
+ SearchSpecProto search_spec3;
+ search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec3.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec3.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:consectetur");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec3 = ResultSpecProto::default_instance();
+ result_spec3.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto results3 = icing.Search(
+ search_spec3, ScoringSpecProto::default_instance(), result_spec3);
+ ASSERT_THAT(results3.results(), SizeIs(1));
+ EXPECT_THAT(results3.results(0).document().uri(), Eq("person"));
+ EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3));
+ EXPECT_THAT(results3.results(0).joined_results(0).document().uri(),
+ Eq("message/3"));
+ EXPECT_THAT(results3.results(0).joined_results(1).document().uri(),
+ Eq("message/2"));
+ EXPECT_THAT(results3.results(0).joined_results(2).document().uri(),
+ Eq("message/1"));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RestoreIndexLoseQualifiedIdJoinIndex) {
+ // Test the following scenario: losing the entire qualified id join index
+ // directory. IcingSearchEngine should be able to recover qualified id join
+ // index. Several additional behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should be unaffected.
+ // - Qualified id join index directory should not be discarded since we've
+ // already lost it. Start it from scratch.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index shouldn't take effect.
+ // - "Clear()" shouldn't be called for integer index, i.e. no integer index
+ // storage sub directories (path_expr = "*/integer_index_dir/*") should be
+ // discarded.
+ // - "Clear()" shouldn't be called for qualified id join index, i.e. no
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded, since we start
+ // it from scratch.
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // 1. Create an index with 3 message documents.
+ {
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/2").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/3").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ }
+
+  // 2. Delete the qualified id join index directory to trigger
+  // RestoreIndexIfNeeded.
+ std::string qualified_id_join_index_dir = GetQualifiedIdJoinIndexDir();
+ filesystem()->DeleteDirectoryRecursively(qualified_id_join_index_dir.c_str());
+
+ // 3. Create the index again. This should trigger index restoration.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure term index directory should never be discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+    // Ensure integer index directory should never be discarded, and Clear()
+    // should never be called (i.e. storage sub directory
+    // "*/integer_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+    // Ensure qualified id join index directory should never be discarded
+    // since we've already lost it, and Clear() should never be called (i.e.
+    // storage sub directory "*/qualified_id_join_index_dir/*" should never
+    // be discarded) since we start it from scratch.
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+
+ // Verify term index works normally
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("body:consectetur");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+ SearchResultProto results1 =
+ icing.Search(search_spec1, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results1.status(), ProtoIsOk());
+ EXPECT_THAT(results1.next_page_token(), Eq(0));
+ // All documents should be retrievable.
+ ASSERT_THAT(results1.results(), SizeIs(3));
+ EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1"));
+
+ // Verify integer index works normally
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results2 =
+ icing.Search(search_spec2, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results2.results(), SizeIs(3));
+ EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1"));
+
+ // Verify qualified id join index works normally: join a query for
+ // `name:person` with a child query for `body:consectetur` based on the
+ // child's `senderQualifiedId` field.
+ SearchSpecProto search_spec3;
+ search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec3.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec3.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:consectetur");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec3 = ResultSpecProto::default_instance();
+ result_spec3.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto results3 = icing.Search(
+ search_spec3, ScoringSpecProto::default_instance(), result_spec3);
+ ASSERT_THAT(results3.results(), SizeIs(1));
+ EXPECT_THAT(results3.results(0).document().uri(), Eq("person"));
+ EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3));
+ EXPECT_THAT(results3.results(0).joined_results(0).document().uri(),
+ Eq("message/3"));
+ EXPECT_THAT(results3.results(0).joined_results(1).document().uri(),
+ Eq("message/2"));
+ EXPECT_THAT(results3.results(0).joined_results(2).document().uri(),
+ Eq("message/1"));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RestoreIndexTruncateLiteIndexWithoutReindexing) {
+ // Test the following scenario: term lite index is *completely* ahead of
+ // document store. IcingSearchEngine should be able to recover term index.
+ // Several additional behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should be unaffected.
+ // - Qualified id join index directory should be unaffected.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index should take effect and throw out the
+ // entire lite index. This should be sufficient to make term index
+ // consistent with document store, so reindexing should not take place.
+ // - "Clear()" shouldn't be called for integer index, i.e. no integer index
+ // storage sub directories (path_expr = "*/integer_index_dir/*") should be
+ // discarded.
+ // - "Clear()" shouldn't be called for qualified id join index, i.e. no
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded.
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // 1. Create an index with a LiteIndex that will only allow a person and a
+ // message document before needing a merge.
+ {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_index_merge_size(person.ByteSizeLong() +
+ message.ByteSizeLong());
+ TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ // Add two message documents. These should get merged into the main index.
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/2").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ }
+
+  // 2. Manually add some data into the term lite index and increment
+  // last_added_document_id, but don't merge into the main index. This leaves
+  // the term index's last_added_document_id ahead of the document store's:
+ // - Document store: [0, 1, 2]
+ // - Term index
+ // - Main index: [0, 1, 2]
+ // - Lite index: [3]
+ // - Integer index: [0, 1, 2]
+ // - Qualified id join index: [0, 1, 2]
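+  // The Edit() call below buffers a hit for document id 3, which was never
+  // added to the document store, leaving the lite index strictly ahead of
+  // the ground truth.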
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(
+ Index::Options(GetIndexDir(),
+ /*index_merge_size=*/message.ByteSizeLong(),
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/8),
+ filesystem(), icing_filesystem()));
+ DocumentId original_last_added_doc_id = index->last_added_document_id();
+ index->set_last_added_document_id(original_last_added_doc_id + 1);
+ Index::Editor editor =
+ index->Edit(original_last_added_doc_id + 1, /*section_id=*/0,
+ TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ }
+
+ // 3. Create the index again.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+    // Ensure term index directory should never be discarded, since we only
+    // call TruncateTo() for the term index.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure integer index directory should never be discarded, and Clear()
+ // should never be called (i.e. storage sub directory
+ // "*/integer_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure qualified id join index directory should never be discarded, and
+ // Clear() should never be called (i.e. storage sub directory
+ // "*/qualified_id_join_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_index_merge_size(message.ByteSizeLong());
+ TestIcingSearchEngine icing(options, std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+    // Since truncating the lite index is sufficient to make the term index
+    // consistent with the document store, replaying documents or reindexing
+    // shouldn't take place.
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Verify term index works normally
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("body:consectetur");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+ SearchResultProto results1 =
+ icing.Search(search_spec1, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results1.status(), ProtoIsOk());
+ EXPECT_THAT(results1.next_page_token(), Eq(0));
+ // Only the documents that were in the main index should be retrievable.
+ ASSERT_THAT(results1.results(), SizeIs(2));
+ EXPECT_THAT(results1.results(0).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results1.results(1).document().uri(), Eq("message/1"));
+
+ // Verify integer index works normally
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results2 =
+ icing.Search(search_spec2, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results2.results(), SizeIs(2));
+ EXPECT_THAT(results2.results(0).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results2.results(1).document().uri(), Eq("message/1"));
+
+ // Verify qualified id join index works normally: join a query for
+ // `name:person` with a child query for `body:consectetur` based on the
+ // child's `senderQualifiedId` field.
+ SearchSpecProto search_spec3;
+ search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec3.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec3.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:consectetur");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec3 = ResultSpecProto::default_instance();
+ result_spec3.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto results3 = icing.Search(
+ search_spec3, ScoringSpecProto::default_instance(), result_spec3);
+ ASSERT_THAT(results3.results(), SizeIs(1));
+ EXPECT_THAT(results3.results(0).document().uri(), Eq("person"));
+ EXPECT_THAT(results3.results(0).joined_results(), SizeIs(2));
+ EXPECT_THAT(results3.results(0).joined_results(0).document().uri(),
+ Eq("message/2"));
+ EXPECT_THAT(results3.results(0).joined_results(1).document().uri(),
+ Eq("message/1"));
+ }
+
+  // 4. Since document 3 doesn't exist, querying for "foo" through the engine
+  // is not enough to verify the correctness of term index restoration.
+  // Instead, we have to check that no hits for "foo" remain in the term
+  // index.
+ {
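+    // Advance() on an iterator with no matching hits is expected to return
+    // RESOURCE_EXHAUSTED, confirming that the truncated "foo" hit was not
+    // reintroduced during restoration.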
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(
+ Index::Options(GetIndexDir(),
+ /*index_merge_size=*/message.ByteSizeLong(),
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/8),
+ filesystem(), icing_filesystem()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iter,
+ index->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(doc_hit_info_iter->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RestoreIndexTruncateLiteIndexWithReindexing) {
+ // Test the following scenario: term lite index is *partially* ahead of
+ // document store. IcingSearchEngine should be able to recover term index.
+ // Several additional behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should be unaffected.
+ // - Qualified id join index directory should be unaffected.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index should take effect and throw out the
+ // entire lite index. However, some valid data in term lite index were
+ // discarded together, so reindexing should still take place to recover
+ // them after truncating.
+ // - "Clear()" shouldn't be called for integer index, i.e. no integer index
+ // storage sub directories (path_expr = "*/integer_index_dir/*") should be
+ // discarded.
+ // - "Clear()" shouldn't be called for qualified id join index, i.e. no
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded.
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // 1. Create an index with a LiteIndex that will only allow a person and a
+ // message document before needing a merge.
+ {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_index_merge_size(message.ByteSizeLong());
+ TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ // Add two message documents. These should get merged into the main index.
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/2").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+    // Add one more document. This one should remain in the lite index.
+ message = DocumentBuilder(message).SetUri("message/3").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ }
+
+  // 2. Manually add some data into the term lite index and increment
+  // last_added_document_id, but don't merge into the main index. This leaves
+  // the term index's last_added_document_id ahead of the document store's:
+ // - Document store: [0, 1, 2, 3]
+ // - Term index
+ // - Main index: [0, 1, 2]
+ // - Lite index: [3, 4]
+ // - Integer index: [0, 1, 2, 3]
+ // - Qualified id join index: [0, 1, 2, 3]
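+  // The Edit() call below buffers a hit for document id 4, which was never
+  // added to the document store. Unlike the previous test, document id 3's
+  // hits live only in the lite index, so truncating the lite index also
+  // destroys valid data that reindexing must replay.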
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(
+ Index::Options(GetIndexDir(),
+ /*index_merge_size=*/message.ByteSizeLong(),
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/8),
+ filesystem(), icing_filesystem()));
+ DocumentId original_last_added_doc_id = index->last_added_document_id();
+ index->set_last_added_document_id(original_last_added_doc_id + 1);
+ Index::Editor editor =
+ index->Edit(original_last_added_doc_id + 1, /*section_id=*/0,
+ TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ }
+
+ // 3. Create the index again.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+    // Ensure term index directory should never be discarded, since we only
+    // call TruncateTo() for the term index.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure integer index directory should never be discarded, and Clear()
+ // should never be called (i.e. storage sub directory
+ // "*/integer_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure qualified id join index directory should never be discarded, and
+ // Clear() should never be called (i.e. storage sub directory
+ // "*/qualified_id_join_index_dir/*" should never be discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_index_merge_size(message.ByteSizeLong());
+ TestIcingSearchEngine icing(options, std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+    // Truncating the lite index not only deletes data that is ahead of the
+    // document store, but also deletes valid data. Therefore, we still have
+    // to replay documents and reindex.
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Verify term index works normally
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("body:consectetur");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+ SearchResultProto results1 =
+ icing.Search(search_spec1, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results1.status(), ProtoIsOk());
+ EXPECT_THAT(results1.next_page_token(), Eq(0));
+    // After reindexing, all documents should be retrievable.
+ ASSERT_THAT(results1.results(), SizeIs(3));
+ EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1"));
+
+ // Verify integer index works normally
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results2 =
+ icing.Search(search_spec2, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results2.results(), SizeIs(3));
+ EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1"));
+
+ // Verify qualified id join index works normally: join a query for
+ // `name:person` with a child query for `body:consectetur` based on the
+ // child's `senderQualifiedId` field.
+ SearchSpecProto search_spec3;
+ search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec3.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec3.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
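+ // Note: with COUNT aggregation, each parent result is scored by the
+ // number of joined child results it has.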
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:consectetur");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec3 = ResultSpecProto::default_instance();
+ result_spec3.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
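+ // Use the maximum value so that no joined children are truncated from the
+ // results.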
+
+ SearchResultProto results3 = icing.Search(
+ search_spec3, ScoringSpecProto::default_instance(), result_spec3);
+ ASSERT_THAT(results3.results(), SizeIs(1));
+ EXPECT_THAT(results3.results(0).document().uri(), Eq("person"));
+ EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3));
+ EXPECT_THAT(results3.results(0).joined_results(0).document().uri(),
+ Eq("message/3"));
+ EXPECT_THAT(results3.results(0).joined_results(1).document().uri(),
+ Eq("message/2"));
+ EXPECT_THAT(results3.results(0).joined_results(2).document().uri(),
+ Eq("message/1"));
+ }
+
+ // 4. Since document 4 doesn't exist, testing the query "foo" is not enough
+ // to verify the correctness of term index restoration. Instead, we have to
+ // check that hits for "foo" are not found in the term index.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(
+ Index::Options(GetIndexDir(),
+ /*index_merge_size=*/message.ByteSizeLong(),
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/8),
+ filesystem(), icing_filesystem()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iter,
+ index->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
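+ // An iterator with no hits at all fails Advance() with RESOURCE_EXHAUSTED,
+ // which confirms the hit for "foo" was purged by the truncation.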
+ EXPECT_THAT(doc_hit_info_iter->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RestoreIndexTruncateMainIndexWithoutReindexing) {
+ // Test the following scenario: the term main index is *completely* ahead
+ // of the document store. IcingSearchEngine should be able to recover the
+ // term index.
+ // Several additional behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should be unaffected.
+ // - Qualified id join index directory should be unaffected.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index should take effect and throw out the
+ // entire lite and main index. This should be sufficient to make term
+ // index consistent with document store (in this case, document store is
+ // empty as well), so reindexing should not take place.
+ // - "Clear()" should be called for integer index. It is a special case when
+ // document store has no document. Since there is no integer index storage
+ // sub directories (path_expr = "*/integer_index_dir/*"), nothing will be
+ // discarded.
+ // - "Clear()" should be called for qualified id join index. It is a special
+ // case when document store has no document.
+
+ // 1. Create an index with no document.
+ {
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ }
+
+ // 2. Manually add some data into the term lite index and increment
+ // last_added_document_id. Merge some of them into the main index and keep
+ // others in the lite index. This will cause mismatched document ids with
+ // the document store.
+ // - Document store: []
+ // - Term index
+ // - Main index: [0]
+ // - Lite index: [1]
+ // - Integer index: []
+ // - Qualified id join index: []
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(
+ // index merge size is not important here because we will manually
+ // invoke merge below.
+ Index::Options(GetIndexDir(), /*index_merge_size=*/100,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/50),
+ filesystem(), icing_filesystem()));
+ // Add hits for document 0 and merge.
+ ASSERT_THAT(index->last_added_document_id(), kInvalidDocumentId);
+ index->set_last_added_document_id(0);
+ Index::Editor editor =
+ index->Edit(/*document_id=*/0, /*section_id=*/0,
+ TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ ICING_ASSERT_OK(index->Merge());
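+ // Merge() moves the buffered lite index hits into the main index, so the
+ // hit for document 0 now lives only in the main index.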
+
+ // Add hits for document 1 and don't merge.
+ index->set_last_added_document_id(1);
+ editor = index->Edit(/*document_id=*/1, /*section_id=*/0,
+ TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ }
+
+ // 3. Create the index again. This should throw out the lite and main index.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure the term index directory is never discarded, since we only call
+ // TruncateTo() for the term index.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure the integer index directory is never discarded. Even though
+ // Clear() was called, it shouldn't take effect since there is no storage
+ // sub directory ("*/integer_index_dir/*") and nothing will be discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure the qualified id join index directory itself is never discarded.
+ // Clear() was called and should discard and reinitialize the underlying
+ // mapper.
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(AtLeast(1));
+
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+ // Since truncating main index is sufficient to make term index consistent
+ // with document store, replaying documents or reindexing shouldn't take
+ // place.
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ }
+
+ // 4. Since documents 0 and 1 don't exist, testing the queries "foo" and
+ // "bar" is not enough to verify the correctness of term index restoration.
+ // Instead, we have to check that hits for "foo" and "bar" are not found in
+ // the term index.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(Index::Options(GetIndexDir(), /*index_merge_size=*/100,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/50),
+ filesystem(), icing_filesystem()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iter,
+ index->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(doc_hit_info_iter->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ doc_hit_info_iter,
+ index->GetIterator("bar", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(doc_hit_info_iter->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RestoreIndexTruncateMainIndexWithReindexing) {
+ // Test the following scenario: the term main index is *partially* ahead
+ // of the document store. IcingSearchEngine should be able to recover the
+ // term index.
+ // Several additional behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should be unaffected.
+ // - Qualified id join index directory should be unaffected.
+ // - In RestoreIndexIfNecessary():
+ // - "TruncateTo()" for term index should take effect and throw out the
+ // entire lite and main index. However, some valid data in term main index
+ // were discarded together, so reindexing should still take place to
+ // recover them after truncating.
+ // - "Clear()" shouldn't be called for integer index, i.e. no integer index
+ // storage sub directories (path_expr = "*/integer_index_dir/*") should be
+ // discarded.
+ // - "Clear()" shouldn't be called for qualified id join index, i.e. no
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded.
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // 1. Create an index with 3 message documents.
+ {
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/2").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/3").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ }
+
+ // 2. Manually add some data into the term lite index and increment
+ // last_added_document_id. Merge some of them into the main index and keep
+ // others in the lite index. This will cause mismatched document ids with
+ // the document store.
+ // - Document store: [0, 1, 2, 3]
+ // - Term index
+ // - Main index: [0, 1, 2, 3, 4]
+ // - Lite index: [5]
+ // - Integer index: [0, 1, 2, 3]
+ // - Qualified id join index: [0, 1, 2, 3]
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(
+ Index::Options(GetIndexDir(),
+ /*index_merge_size=*/message.ByteSizeLong(),
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/8),
+ filesystem(), icing_filesystem()));
+ // Add hits for document 4 and merge.
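+ // After step 1, last_added_document_id() is 3 (the person and the three
+ // messages take document ids 0-3), so document 4 is the next id.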
+ DocumentId original_last_added_doc_id = index->last_added_document_id();
+ index->set_last_added_document_id(original_last_added_doc_id + 1);
+ Index::Editor editor =
+ index->Edit(original_last_added_doc_id + 1, /*section_id=*/0,
+ TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ ICING_ASSERT_OK(index->Merge());
+
+ // Add hits for document 5 and don't merge.
+ index->set_last_added_document_id(original_last_added_doc_id + 2);
+ editor = index->Edit(original_last_added_doc_id + 2, /*section_id=*/0,
+ TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ }
+
+ // 3. Create the index again. This should throw out the lite and main index
+ // and trigger index restoration.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure the term index directory is never discarded, since we only call
+ // TruncateTo() for the term index.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure the integer index directory is never discarded, and that Clear()
+ // is never called (i.e. the storage sub directory "*/integer_index_dir/*"
+ // is never discarded).
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure the qualified id join index directory is never discarded, and
+ // that Clear() is never called (i.e. the storage sub directory
+ // "*/qualified_id_join_index_dir/*" is never discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+ // Truncating the main index not only deletes data that is ahead of the
+ // document store, but also deletes valid data. Therefore, we still have to
+ // replay documents and reindex.
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Verify term index works normally
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("body:consectetur");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+ SearchResultProto results1 =
+ icing.Search(search_spec1, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results1.status(), ProtoIsOk());
+ EXPECT_THAT(results1.next_page_token(), Eq(0));
+ // All documents should be retrievable.
+ ASSERT_THAT(results1.results(), SizeIs(3));
+ EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1"));
+
+ // Verify integer index works normally
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results2 =
+ icing.Search(search_spec2, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results2.results(), SizeIs(3));
+ EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1"));
+
+ // Verify qualified id join index works normally: join a query for
+ // `name:person` with a child query for `body:consectetur` based on the
+ // child's `senderQualifiedId` field.
+ SearchSpecProto search_spec3;
+ search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec3.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec3.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:consectetur");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec3 = ResultSpecProto::default_instance();
+ result_spec3.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto results3 = icing.Search(
+ search_spec3, ScoringSpecProto::default_instance(), result_spec3);
+ ASSERT_THAT(results3.results(), SizeIs(1));
+ EXPECT_THAT(results3.results(0).document().uri(), Eq("person"));
+ EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3));
+ EXPECT_THAT(results3.results(0).joined_results(0).document().uri(),
+ Eq("message/3"));
+ EXPECT_THAT(results3.results(0).joined_results(1).document().uri(),
+ Eq("message/2"));
+ EXPECT_THAT(results3.results(0).joined_results(2).document().uri(),
+ Eq("message/1"));
+ }
+
+ // 4. Since documents 4 and 5 don't exist, testing the queries "foo" and
+ // "bar" is not enough to verify the correctness of term index restoration.
+ // Instead, we have to check that hits for "foo" and "bar" are not found in
+ // the term index.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(Index::Options(GetIndexDir(), /*index_merge_size=*/100,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/50),
+ filesystem(), icing_filesystem()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iter,
+ index->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(doc_hit_info_iter->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ doc_hit_info_iter,
+ index->GetIterator("bar", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(doc_hit_info_iter->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RestoreIndexTruncateIntegerIndexWithoutReindexing) {
+ // Test the following scenario: the integer index is *completely* ahead of
+ // the document store. IcingSearchEngine should be able to recover the
+ // integer index.
+ // Several additional behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should be unaffected.
+ // - Qualified id join index directory should be unaffected.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index shouldn't take effect.
+ // - "Clear()" should be called for integer index and throw out all integer
+ // index storages, i.e. all storage sub directories (path_expr =
+ // "*/integer_index_dir/*") should be discarded. This should be sufficient
+ // to make integer index consistent with document store (in this case,
+ // document store is empty as well), so reindexing should not take place.
+ // - "Clear()" should be called for qualified id join index. It is a special
+ // case when document store has no document.
+
+ // 1. Create an index with no document.
+ {
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ }
+
+ // 2. Manually add some data into the integer index and increment
+ // last_added_document_id. This will cause mismatched document ids with
+ // the document store.
+ // - Document store: []
+ // - Term index: []
+ // - Integer index: [0]
+ // - Qualified id join index: []
+ {
+ Filesystem filesystem;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem, GetIntegerIndexDir(),
+ /*num_data_threshold_for_bucket_split=*/65536,
+ /*pre_mapping_fbv=*/false));
+ // Add hits for document 0.
+ ASSERT_THAT(integer_index->last_added_document_id(), kInvalidDocumentId);
+ integer_index->set_last_added_document_id(0);
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor = integer_index->Edit(
+ /*property_path=*/"indexableInteger", /*document_id=*/0,
+ /*section_id=*/0);
+ ICING_ASSERT_OK(editor->BufferKey(123));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
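+ // The integer index now reports last_added_document_id == 0 while the
+ // document store is empty, which is exactly the "completely ahead"
+ // inconsistency this test exercises.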
+ }
+
+ // 3. Create the index again. This should trigger index restoration.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure the term index directory is never discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure the integer index directory is never discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ // Clear() should be called to truncate integer index and thus storage sub
+ // directory (path_expr = "*/integer_index_dir/*") should be discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(1);
+ // Ensure the qualified id join index directory itself is never discarded.
+ // Clear() was called and should discard and reinitialize the underlying
+ // mapper.
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(AtLeast(1));
+
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ // Since truncating integer index is sufficient to make it consistent with
+ // document store, replaying documents or reindexing shouldn't take place.
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Verify that Clear() safely wiped out the pre-existing hit for
+ // 'indexableInteger' == 123. Add a new document without that value for
+ // 'indexableInteger' that will take docid=0. If the integer index was not
+ // cleared correctly, then it will still have the previously added hit for
+ // 'indexableInteger' == 123 for docid 0 and incorrectly return this new
+ // doc in a query.
+ DocumentProto another_message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .AddInt64Property("indexableInteger", 456)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ EXPECT_THAT(icing.Put(another_message).status(), ProtoIsOk());
+ // Verify integer index works normally
+ SearchSpecProto search_spec;
+ search_spec.set_query("indexableInteger == 123");
+ search_spec.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results =
+ icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.results(), IsEmpty());
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RestoreIndexTruncateIntegerIndexWithReindexing) {
+ // Test the following scenario: the integer index is *partially* ahead of
+ // the document store. IcingSearchEngine should be able to recover the
+ // integer index. Several additional behaviors are also tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should be unaffected.
+ // - Qualified id join index directory should be unaffected.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index shouldn't take effect.
+ // - "Clear()" should be called for integer index and throw out all integer
+ // index storages, i.e. all storage sub directories (path_expr =
+ // "*/integer_index_dir/*") should be discarded. However, some valid data
+ // in integer index were discarded together, so reindexing should still
+ // take place to recover them after clearing.
+ // - "Clear()" shouldn't be called for qualified id join index, i.e. no
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded.
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // 1. Create an index with 3 message documents.
+ {
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/2").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/3").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ }
+
+ // 2. Manually add some data into the integer index and increment
+ // last_added_document_id. This will cause mismatched document ids with
+ // the document store.
+ // - Document store: [0, 1, 2, 3]
+ // - Term index: [0, 1, 2, 3]
+ // - Integer index: [0, 1, 2, 3, 4]
+ // - Qualified id join index: [0, 1, 2, 3]
+ {
+ Filesystem filesystem;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem, GetIntegerIndexDir(),
+ /*num_data_threshold_for_bucket_split=*/65536,
+ /*pre_mapping_fbv=*/false));
+ // Add hits for document 4.
+ DocumentId original_last_added_doc_id =
+ integer_index->last_added_document_id();
+ integer_index->set_last_added_document_id(original_last_added_doc_id + 1);
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor = integer_index->Edit(
+ /*property_path=*/"indexableInteger",
+ /*document_id=*/original_last_added_doc_id + 1, /*section_id=*/0);
+ ICING_ASSERT_OK(editor->BufferKey(456));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+ }
+
+ // 3. Create the index again. This should trigger index restoration.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure the term index directory is never discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure the integer index directory is never discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ // Clear() should be called to truncate integer index and thus storage sub
+ // directory (path_expr = "*/integer_index_dir/*") should be discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(1);
+ // Ensure the qualified id join index directory is never discarded, and
+ // that Clear() is never called (i.e. the storage sub directory
+ // "*/qualified_id_join_index_dir/*" is never discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Verify term index works normally
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("body:consectetur");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+ SearchResultProto results1 =
+ icing.Search(search_spec1, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results1.status(), ProtoIsOk());
+ EXPECT_THAT(results1.next_page_token(), Eq(0));
+ // All documents should be retrievable.
+ ASSERT_THAT(results1.results(), SizeIs(3));
+ EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1"));
+
+ // Verify integer index works normally
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results2 =
+ icing.Search(search_spec2, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results2.results(), SizeIs(3));
+ EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1"));
+
+ // Verify qualified id join index works normally: join a query for
+ // `name:person` with a child query for `body:consectetur` based on the
+ // child's `senderQualifiedId` field.
+ SearchSpecProto search_spec3;
+ search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec3.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec3.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:consectetur");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec3 = ResultSpecProto::default_instance();
+ result_spec3.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto results3 = icing.Search(
+ search_spec3, ScoringSpecProto::default_instance(), result_spec3);
+ ASSERT_THAT(results3.results(), SizeIs(1));
+ EXPECT_THAT(results3.results(0).document().uri(), Eq("person"));
+ EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3));
+ EXPECT_THAT(results3.results(0).joined_results(0).document().uri(),
+ Eq("message/3"));
+ EXPECT_THAT(results3.results(0).joined_results(1).document().uri(),
+ Eq("message/2"));
+ EXPECT_THAT(results3.results(0).joined_results(2).document().uri(),
+ Eq("message/1"));
+
+ // Verify that the rebuild safely wiped out the pre-existing hit for
+ // 'indexableInteger' == 456. Add a new document without a value for
+ // 'indexableInteger' that will take docid=4. If the integer index was not
+ // rebuilt correctly, then it will still have the previously added hit for
+ // 'indexableInteger' == 456 for docid 4 and incorrectly return this new
+ // doc in a query.
+ DocumentProto another_message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/4")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ EXPECT_THAT(icing.Put(another_message).status(), ProtoIsOk());
+ // Verify integer index works normally
+ SearchSpecProto search_spec;
+ search_spec.set_query("indexableInteger == 456");
+ search_spec.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results =
+ icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.results(), IsEmpty());
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RestoreIndexTruncateQualifiedIdJoinIndexWithoutReindexing) {
+ // Test the following scenario: the qualified id join index is *completely*
+ // ahead of the document store. IcingSearchEngine should be able to recover
+ // the qualified id join index. Several additional behaviors are also
+ // tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should be unaffected.
+ // - Qualified id join index directory should be unaffected.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index shouldn't take effect.
+ // - "Clear()" should be called for integer index. It is a special case when
+ // document store has no document. Since there is no integer index storage
+ // sub directories (path_expr = "*/integer_index_dir/*"), nothing will be
+ // discarded.
+ // - "Clear()" should be called for qualified id join index and throw out
+ // all data, i.e. discarding the underlying mapper (path_expr =
+ // "*/qualified_id_join_index_dir/*") and reinitialize. This should be
+ // sufficient to make qualified id join index consistent with document
+ // store (in this case, document store is empty as well), so reindexing
+ // should not take place.
+
+ // 1. Create an index with no document.
+ {
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ }
+
+ // 2. Manually add some data into the qualified id join index and increment
+ // last_added_document_id. This will cause mismatched document ids with
+ // the document store.
+ // - Document store: []
+ // - Term index: []
+ // - Integer index: []
+ // - Qualified id join index: [0]
+ {
+ Filesystem filesystem;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem,
+ GetQualifiedIdJoinIndexDir(),
+ /*pre_mapping_fbv=*/false));
+ // Add data for document 0.
+ ASSERT_THAT(qualified_id_join_index->last_added_document_id(),
+ kInvalidDocumentId);
+ qualified_id_join_index->set_last_added_document_id(0);
+ ICING_ASSERT_OK(qualified_id_join_index->Put(
+ /*schema_type_id=*/0, /*joinable_property_id=*/0, /*document_id=*/0,
+ /*ref_namespace_fingerprint_ids=*/
+ {NamespaceFingerprintIdentifier(/*namespace_id=*/0,
+ /*target_str=*/"uri")}));
+ }
+
+ // 3. Create the index again. This should trigger index restoration.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure the term index directory is never discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure the integer index directory is never discarded. Even though
+ // Clear() was called, it shouldn't take effect since there is no storage
+ // sub directory ("*/integer_index_dir/*") and nothing will be discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure the qualified id join index directory itself is never discarded.
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ // Clear() should be called to truncate qualified id join index and thus
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded.
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(AtLeast(1));
+
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ // Since truncating qualified id join index is sufficient to make it
+ // consistent with document store, replaying documents or reindexing
+ // shouldn't take place.
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ }
+
+ // 4. Since document 0 doesn't exist, testing a join query is not enough to
+ // verify the correctness of qualified id join index restoration. Instead,
+ // we have to check that the previously added data is not found in the
+ // qualified id join index.
+ {
+ Filesystem filesystem;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem,
+ GetQualifiedIdJoinIndexDir(),
+ /*pre_mapping_fbv=*/false));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto iterator, qualified_id_join_index->GetIterator(
+ /*schema_type_id=*/0, /*joinable_property_id=*/0));
+ EXPECT_THAT(iterator->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ RestoreIndexTruncateQualifiedIdJoinIndexWithReindexing) {
+ // Test the following scenario: the qualified id join index is *partially*
+ // ahead of the document store. IcingSearchEngine should be able to recover
+ // the qualified id join index. Several additional behaviors are also
+ // tested:
+ // - Index directory handling:
+ // - Term index directory should be unaffected.
+ // - Integer index directory should be unaffected.
+ // - Qualified id join index directory should be unaffected.
+ // - Truncate indices:
+ // - "TruncateTo()" for term index shouldn't take effect.
+ // - "Clear()" shouldn't be called for integer index, i.e. no integer index
+ // storage sub directories (path_expr = "*/integer_index_dir/*") should be
+ // discarded.
+ // - "Clear()" should be called for qualified id join index and throw out
+ // all data, i.e. discarding the underlying mapper (path_expr =
+ // "*/qualified_id_join_index_dir/*") and reinitialize. However, some
+ // valid data in qualified id join index were discarded together, so
+ // reindexing should still take place to recover them after clearing.
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // 1. Create an index with 3 message documents.
+ {
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/2").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/3").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ }
+
+ // 2. Manually add some data into the qualified id join index and increment
+ // last_added_document_id. This will cause mismatched document ids with
+ // the document store.
+ // - Document store: [0, 1, 2, 3]
+ // - Term index: [0, 1, 2, 3]
+ // - Integer index: [0, 1, 2, 3]
+ // - Qualified id join index: [0, 1, 2, 3, 4]
+ {
+ Filesystem filesystem;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem,
+ GetQualifiedIdJoinIndexDir(),
+ /*pre_mapping_fbv=*/false));
+ // Add data for document 4.
+ DocumentId original_last_added_doc_id =
+ qualified_id_join_index->last_added_document_id();
+ qualified_id_join_index->set_last_added_document_id(
+ original_last_added_doc_id + 1);
+ ICING_ASSERT_OK(qualified_id_join_index->Put(
+ /*schema_type_id=*/1, /*joinable_property_id=*/0,
+ /*document_id=*/original_last_added_doc_id + 1,
+ /*ref_namespace_fingerprint_ids=*/
+ {NamespaceFingerprintIdentifier(/*namespace_id=*/0,
+ /*target_str=*/"person")}));
+ }
+
+ // 3. Create the index again. This should trigger index restoration.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure the term index directory is never discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+ // Ensure the integer index directory is never discarded, and that Clear()
+ // is never called (i.e. the storage sub directory "*/integer_index_dir/*"
+ // is never discarded).
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+ // Ensure the qualified id join index directory itself is never discarded.
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(0);
+ // Clear() should be called to truncate qualified id join index and thus
+ // underlying storage sub directory (path_expr =
+ // "*/qualified_id_join_index_dir/*") should be discarded.
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(AtLeast(1));
+
+ TestIcingSearchEngine icing(
+ GetDefaultIcingOptions(), std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(), std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+
+ // Verify term index works normally
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("body:consectetur");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+ SearchResultProto results1 =
+ icing.Search(search_spec1, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results1.status(), ProtoIsOk());
+ EXPECT_THAT(results1.next_page_token(), Eq(0));
+ // All documents should be retrievable.
+ ASSERT_THAT(results1.results(), SizeIs(3));
+ EXPECT_THAT(results1.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results1.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results1.results(2).document().uri(), Eq("message/1"));
+
+ // Verify integer index works normally
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results2 =
+ icing.Search(search_spec2, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results2.results(), SizeIs(3));
+ EXPECT_THAT(results2.results(0).document().uri(), Eq("message/3"));
+ EXPECT_THAT(results2.results(1).document().uri(), Eq("message/2"));
+ EXPECT_THAT(results2.results(2).document().uri(), Eq("message/1"));
+
+ // Verify qualified id join index works normally: join a query for
+ // `name:person` with a child query for `body:consectetur` based on the
+ // child's `senderQualifiedId` field.
+
+ // Add document 4 without "senderQualifiedId". If the join index is not
+ // rebuilt correctly, then it will still have the previously added
+ // senderQualifiedId for document 4 and incorrectly include document 4 on
+ // the child side of the join.
+ DocumentProto another_message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/4")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .AddInt64Property("indexableInteger", 123)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ EXPECT_THAT(icing.Put(another_message).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec3;
+ search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec3.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec3.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:consectetur");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec3 = ResultSpecProto::default_instance();
+ result_spec3.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto results3 = icing.Search(
+ search_spec3, ScoringSpecProto::default_instance(), result_spec3);
+ ASSERT_THAT(results3.results(), SizeIs(1));
+ EXPECT_THAT(results3.results(0).document().uri(), Eq("person"));
+ EXPECT_THAT(results3.results(0).joined_results(), SizeIs(3));
+ EXPECT_THAT(results3.results(0).joined_results(0).document().uri(),
+ Eq("message/3"));
+ EXPECT_THAT(results3.results(0).joined_results(1).document().uri(),
+ Eq("message/2"));
+ EXPECT_THAT(results3.results(0).joined_results(2).document().uri(),
+ Eq("message/1"));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ DocumentWithNoIndexedPropertyDoesntCauseRestoreIndex) {
+ // 1. Create an index with a single document in it that has no indexed
+ // content.
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Set a schema for a single type that has no indexed properties.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("unindexedField")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN,
+ TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("unindexedInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ // Add a document that contains no indexed properties.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("unindexedField",
+ "Don't you dare search over this!")
+ .AddInt64Property("unindexedInteger", -123)
+ .Build();
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ // 2. Create the index again. This should NOT trigger a recovery of any kind.
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ EXPECT_THAT(init_result.status(), ProtoIsOk());
+ EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ init_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ DocumentWithNoValidIndexedContentDoesntCauseRestoreIndex) {
+ // 1. Create an index with a single document in it that has no valid indexed
+ // tokens in its content.
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ // The schema above has term, integer, and join indexable properties, but
+ // the document added below contains no valid indexable content for any of
+ // them.
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ // Add a document that contains:
+ // - No valid indexed string content - just punctuation
+ // - No integer content - since it is an optional property
+ // - No qualified id content - since it is an optional property
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "?...!")
+ .Build();
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ // 2. Create the index again. This should NOT trigger a recovery of any kind.
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto init_result = icing.Initialize();
+ EXPECT_THAT(init_result.status(), ProtoIsOk());
+ EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ init_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldLogFunctionLatency) {
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
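+  // Every timer created by the fake clock should report an elapsed time of
+  // 10ms, which is what the latency assertion below expects.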
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.initialize_stats().latency_ms(), Eq(10));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldLogNumberOfDocuments) {
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("icing", "fake_type/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 456)
+ .Build();
+
+ {
+ // Initialize and put a document.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.initialize_stats().num_documents(),
+ Eq(0));
+
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.initialize_stats().num_documents(),
+ Eq(1));
+
+ // Put another document.
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.initialize_stats().num_documents(),
+ Eq(2));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldNotLogRecoveryCauseForFirstTimeInitialize) {
+ // Even though the fake timer will return 10, all the latency numbers related
+ // to recovery / restoration should be 0 during the first-time initialization.
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldLogRecoveryCausePartialDataLoss) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .Build();
+
+ {
+ // Initialize and put a document.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ {
+ // Append a non-checksummed document. This will mess up the checksum of the
+ // proto log, forcing it to rewind and later return a DATA_LOSS error.
+ const std::string serialized_document = document.SerializeAsString();
+ const std::string document_log_file = absl_ports::StrCat(
+ GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename());
+
+ int64_t file_size = filesystem()->GetFileSize(document_log_file.c_str());
+ filesystem()->PWrite(document_log_file.c_str(), file_size,
+ serialized_document.data(),
+ serialized_document.size());
+ }
+
+ {
+ // Document store will rewind to previous checkpoint. The cause should be
+ // DATA_LOSS and the data status should be PARTIAL_LOSS.
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::PARTIAL_LOSS));
+    // Document store rewinds to the previous checkpoint and all derived files
+    // are regenerated.
+    // - The last stored doc id stays consistent with the last added document
+    //   ids in the term/integer indices, so there is no index restoration.
+    // - The qualified id join index depends on document store derived files,
+    //   and since those were regenerated, it must be rebuilt.
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::DEPENDENCIES_CHANGED));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .index_restoration_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldLogRecoveryCauseCompleteDataLoss) {
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .Build();
+
+ const std::string document_log_file = absl_ports::StrCat(
+ GetDocumentDir(), "/", DocumentLogCreator::GetDocumentLogFilename());
+ int64_t corruptible_offset;
+
+ {
+ // Initialize and put a document.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+    // There's some space at the beginning of the file (e.g. the header,
+    // kMagic, etc.) that is necessary to initialize the
+    // PortableFileBackedProtoLog. We can't corrupt that region, so we need to
+    // figure out the offset at which documents will be written, which is the
+    // file size right after initialization.
+ corruptible_offset = filesystem()->GetFileSize(document_log_file.c_str());
+
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ }
+
+ {
+    // "Corrupt" the content written in the log. Make the corrupt document
+    // smaller than the original one so we don't accidentally write past the
+    // end of the file.
+ DocumentProto document =
+ DocumentBuilder().SetKey("invalid_namespace", "invalid_uri").Build();
+ std::string serialized_document = document.SerializeAsString();
+ ASSERT_TRUE(filesystem()->PWrite(
+ document_log_file.c_str(), corruptible_offset,
+ serialized_document.data(), serialized_document.size()));
+
+ PortableFileBackedProtoLog<DocumentWrapper>::Header header =
+ ReadDocumentLogHeader(*filesystem(), document_log_file);
+
+ // Set dirty bit to true to reflect that something changed in the log.
+ header.SetDirtyFlag(true);
+ header.SetHeaderChecksum(header.CalculateHeaderChecksum());
+
+ WriteDocumentLogHeader(*filesystem(), document_log_file, header);
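+    // With the dirty flag set and a freshly computed header checksum, the
+    // header itself still looks valid, so initialization should get past the
+    // header check and detect the corruption while verifying the log's
+    // contents, forcing the complete rewind below.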
+ }
+
+ {
+ // Document store will completely rewind. The cause should be DATA_LOSS and
+ // the data status should be COMPLETE_LOSS.
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::COMPLETE_LOSS));
+ // The complete rewind of ground truth causes us to clear the index, but
+ // that's not considered a restoration.
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .index_restoration_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldLogRecoveryCauseIndexInconsistentWithGroundTruth) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .Build();
+ {
+ // Initialize and put a document.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ {
+    // Delete the index's storage and then re-initialize an empty index to
+    // trigger RestoreIndexIfNeeded.
+ std::string idx_subdir = GetIndexDir() + "/idx";
+ ASSERT_TRUE(filesystem()->DeleteDirectoryRecursively(idx_subdir.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(Index::Options(GetIndexDir(),
+ /*index_merge_size=*/100,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/50),
+ filesystem(), icing_filesystem()));
+ ICING_ASSERT_OK(index->PersistToDisk());
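+    // Persisting an empty index leaves its last-added document id behind the
+    // document store's last-stored document id, which initialization should
+    // detect as an inconsistency with the ground truth.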
+ }
+
+ {
+ // Index is empty but ground truth is not. Index should be restored due to
+ // the inconsistency.
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .index_restoration_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+ }
+}
+
+TEST_F(
+ IcingSearchEngineInitializationTest,
+ InitializeShouldLogRecoveryCauseIntegerIndexInconsistentWithGroundTruth) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .Build();
+ {
+ // Initialize and put a document.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ {
+ // Delete the integer index file to trigger RestoreIndexIfNeeded.
+ std::string integer_index_dir = GetIntegerIndexDir();
+ filesystem()->DeleteDirectoryRecursively(integer_index_dir.c_str());
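+    // With its directory gone, an empty integer index should be recreated on
+    // the next initialization, and its missing last-added document id should
+    // force a restoration.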
+ }
+
+ {
+    // Integer index is empty but ground truth is not. The integer index
+    // should be restored due to the inconsistency.
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .index_restoration_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+ }
+}
+
+TEST_F(
+ IcingSearchEngineInitializationTest,
+ InitializeShouldLogRecoveryCauseQualifiedIdJoinIndexInconsistentWithGroundTruth) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ {
+ // Initialize and put documents.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ }
+
+ {
+ // Delete the qualified id join index file to trigger RestoreIndexIfNeeded.
+ std::string qualified_id_join_index_dir = GetQualifiedIdJoinIndexDir();
+ filesystem()->DeleteDirectoryRecursively(
+ qualified_id_join_index_dir.c_str());
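+    // As with the other indices, an empty qualified id join index should be
+    // recreated on the next initialization and then restored from the ground
+    // truth.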
+ }
+
+ {
+    // Qualified id join index is empty but ground truth is not. It should be
+    // restored due to the inconsistency.
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .index_restoration_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldLogRecoveryCauseSchemaChangesOutOfSync) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .Build();
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ {
+ // Initialize and put one document.
+ IcingSearchEngine icing(options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ {
+ // Simulate a schema change where power is lost after the schema is written.
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder(CreateMessageSchemaTypeConfig())
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ // Write the marker file
+ std::string marker_filepath =
+ absl_ports::StrCat(options.base_dir(), "/set_schema_marker");
+ ScopedFd sfd(filesystem()->OpenForWrite(marker_filepath.c_str()));
+ ASSERT_TRUE(sfd.is_valid());
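+    // A marker file that is still present at the next initialization should
+    // signal that a SetSchema was interrupted midway, i.e. that schema
+    // changes may be out of sync with the document store and indices.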
+
+ // Write the new schema
+ FakeClock fake_clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(filesystem(), GetSchemaDir(), &fake_clock));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ new_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+ }
+
+ {
+    // Both document store and index should be recovered because the leftover
+    // set-schema marker indicates that schema changes are out of sync.
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .index_restoration_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+ }
+
+ {
+ // No recovery should be needed.
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .index_restoration_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldLogRecoveryCauseIndexIOError) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .Build();
+ {
+ // Initialize and put one document.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ std::string lite_index_buffer_file_path =
+ absl_ports::StrCat(GetIndexDir(), "/idx/lite.hb");
+ auto mock_icing_filesystem = std::make_unique<IcingMockFilesystem>();
+ EXPECT_CALL(*mock_icing_filesystem, OpenForWrite(_))
+ .WillRepeatedly(DoDefault());
+ // This fails Index::Create() once.
+ EXPECT_CALL(*mock_icing_filesystem,
+ OpenForWrite(Eq(lite_index_buffer_file_path)))
+ .WillOnce(Return(-1))
+ .WillRepeatedly(DoDefault());
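+  // After the one-time failure, Icing should recover by recreating the term
+  // index and restoring it from the ground truth, logging IO_ERROR as the
+  // restoration cause.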
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::move(mock_icing_filesystem),
+ std::move(fake_clock), GetTestJniCache());
+
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldLogRecoveryCauseIntegerIndexIOError) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .Build();
+ {
+ // Initialize and put one document.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ std::string integer_index_metadata_file =
+ absl_ports::StrCat(GetIntegerIndexDir(), "/integer_index.m");
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, OpenForWrite(_)).WillRepeatedly(DoDefault());
+ // This fails IntegerIndex::Create() once.
+ EXPECT_CALL(*mock_filesystem, OpenForWrite(Eq(integer_index_metadata_file)))
+ .WillOnce(Return(-1))
+ .WillRepeatedly(DoDefault());
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldLogRecoveryCauseQualifiedIdJoinIndexIOError) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ {
+ // Initialize and put documents.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(message).status(), ProtoIsOk());
+ }
+
+ std::string qualified_id_join_index_metadata_file =
+ absl_ports::StrCat(GetQualifiedIdJoinIndexDir(), "/metadata");
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, PRead(A<const char*>(), _, _, _))
+ .WillRepeatedly(DoDefault());
+ // This fails QualifiedIdJoinIndexImplV2::Create() once.
+ EXPECT_CALL(
+ *mock_filesystem,
+ PRead(Matcher<const char*>(Eq(qualified_id_join_index_metadata_file)), _,
+ _, _))
+ .WillOnce(Return(false))
+ .WillRepeatedly(DoDefault());
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldLogRecoveryCauseDocStoreIOError) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .Build();
+ {
+ // Initialize and put one document.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+ std::string document_store_header_file_path =
+ absl_ports::StrCat(GetDocumentDir(), "/document_store_header");
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, Read(A<const char*>(), _, _))
+ .WillRepeatedly(DoDefault());
+ // This fails DocumentStore::InitializeDerivedFiles() once.
+ EXPECT_CALL(
+ *mock_filesystem,
+ Read(Matcher<const char*>(Eq(document_store_header_file_path)), _, _))
+ .WillOnce(Return(false))
+ .WillRepeatedly(DoDefault());
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::DEPENDENCIES_CHANGED));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(0));
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldLogRecoveryCauseSchemaStoreIOError) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ }
+
+ {
+ // Delete the schema store type mapper to trigger an I/O error.
+    std::string schema_type_mapper_dir =
+        GetSchemaDir() + "/schema_type_mapper";
+    ASSERT_TRUE(filesystem()->DeleteDirectoryRecursively(
+        schema_type_mapper_dir.c_str()));
+ }
+
+ {
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .schema_store_recovery_latency_ms(),
+ Eq(10));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .document_store_recovery_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(
+ initialize_result_proto.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result_proto.initialize_stats()
+ .index_restoration_latency_ms(),
+ Eq(0));
+ }
+}
+
+TEST_F(IcingSearchEngineInitializationTest,
+ InitializeShouldLogNumberOfSchemaTypes) {
+ {
+ // Initialize an empty storage.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ // There should be 0 schema types.
+ EXPECT_THAT(initialize_result_proto.initialize_stats().num_schema_types(),
+ Eq(0));
+
+ // Set a schema with one type config.
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ // There should be 1 schema type.
+ EXPECT_THAT(initialize_result_proto.initialize_stats().num_schema_types(),
+ Eq(1));
+
+ // Create and set a schema with two type configs: Email and Message.
+ SchemaProto schema = CreateEmailSchema();
+ *schema.add_types() = CreateMessageSchemaTypeConfig();
+
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto initialize_result_proto = icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result_proto.initialize_stats().num_schema_types(),
+ Eq(2));
+ }
+}
+
+// TODO(b/275121148): deprecate this test after rolling out join index v2.
+class IcingSearchEngineInitializationSwitchJoinIndexTest
+ : public IcingSearchEngineInitializationTest,
+ public ::testing::WithParamInterface<bool> {};
+TEST_P(IcingSearchEngineInitializationSwitchJoinIndexTest, SwitchJoinIndex) {
+ bool use_join_index_v2 = GetParam();
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+  // 1. Create an index with 3 message documents.
+ {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_document_store_namespace_id_fingerprint(true);
+ options.set_use_new_qualified_id_join_index(use_join_index_v2);
+
+ TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/2").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ message = DocumentBuilder(message).SetUri("message/3").Build();
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+ }
+
+  // 2. Create the index again, changing the join index version. This should
+  // trigger join index restoration.
+ {
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+    // Ensure the term index directory is never discarded.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(0);
+    // Ensure the integer index directory is never discarded and Clear() is
+    // never called (i.e. the storage subdirectory "*/integer_index_dir/*" is
+    // never discarded).
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/integer_index_dir")))
+ .Times(0);
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/integer_index_dir/")))
+ .Times(0);
+    // Ensure the qualified id join index directory is discarded exactly once,
+    // and Clear() is never called (i.e. the storage subdirectory
+    // "*/qualified_id_join_index_dir/*" is never discarded).
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(
+ EndsWith("/qualified_id_join_index_dir")))
+ .Times(1);
+ EXPECT_CALL(
+ *mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("/qualified_id_join_index_dir/")))
+ .Times(0);
+
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_document_store_namespace_id_fingerprint(true);
+ options.set_use_new_qualified_id_join_index(!use_join_index_v2);
+
+ TestIcingSearchEngine icing(options, std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::INCONSISTENT_WITH_GROUND_TRUTH));
+
+ // Verify qualified id join index works normally: join a query for
+ // `name:person` with a child query for `body:consectetur` based on the
+ // child's `senderQualifiedId` field.
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:consectetur");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec = ResultSpecProto::default_instance();
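+    // Return every joined child so the assertions below can check all three
+    // messages.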
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto results = icing.Search(
+ search_spec, ScoringSpecProto::default_instance(), result_spec);
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document().uri(), Eq("person"));
+ EXPECT_THAT(results.results(0).joined_results(), SizeIs(3));
+ EXPECT_THAT(results.results(0).joined_results(0).document().uri(),
+ Eq("message/3"));
+ EXPECT_THAT(results.results(0).joined_results(1).document().uri(),
+ Eq("message/2"));
+ EXPECT_THAT(results.results(0).joined_results(2).document().uri(),
+ Eq("message/1"));
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(IcingSearchEngineInitializationSwitchJoinIndexTest,
+ IcingSearchEngineInitializationSwitchJoinIndexTest,
+ testing::Values(true, false));
+
+class IcingSearchEngineInitializationVersionChangeTest
+ : public IcingSearchEngineInitializationTest,
+ public ::testing::WithParamInterface<version_util::VersionInfo> {};
+
+TEST_P(IcingSearchEngineInitializationVersionChangeTest,
+ RecoverFromVersionChange) {
+ // TODO(b/280697513): test backup schema migration
+  // Test the following scenario: version change. All derived data should be
+  // rebuilt. We test this by manually adding some invalid derived data and
+  // verifying that it is removed by the rebuild.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person1 =
+ DocumentBuilder()
+ .SetKey("namespace", "person/1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto person2 =
+ DocumentBuilder()
+ .SetKey("namespace", "person/2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message")
+ .SetSchema("Message")
+ .AddStringProperty("body", "correct message")
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person/1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ {
+    // Initialize the directory and schema, and index person1 and person2.
+ TestIcingSearchEngine icing(icing_options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(person1).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(person2).status(), ProtoIsOk());
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ {
+    // Manually:
+    // - Put message into the DocumentStore
+    // - But add some incorrect data for message into the 3 indices
+    // - Change the version file
+    //
+    // This keeps last_added_document_id consistent with
+    // last_stored_document_id, so if Icing doesn't handle the version change
+    // correctly, the index won't be rebuilt.
+ FakeClock fake_clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(filesystem(), GetSchemaDir(), &fake_clock));
+
+ // Put message into DocumentStore
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ filesystem(), GetDocumentDir(), &fake_clock, schema_store.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/
+ icing_options.document_store_namespace_id_fingerprint(),
+ /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, document_store->Put(message));
+
+ // Index doc_id with incorrect data
+ Index::Options options(GetIndexDir(), /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(options, filesystem(), icing_filesystem()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(*filesystem(), GetIntegerIndexDir(),
+ /*num_data_threshold_for_bucket_split=*/65536,
+ /*pre_mapping_fbv=*/false));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index,
+ QualifiedIdJoinIndexImplV2::Create(*filesystem(),
+ GetQualifiedIdJoinIndexDir(),
+ /*pre_mapping_fbv=*/false));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIndexingHandler> term_indexing_handler,
+ TermIndexingHandler::Create(
+ &fake_clock, normalizer_.get(), index.get(),
+ /*build_property_existence_metadata_hits=*/true));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler>
+ integer_section_indexing_handler,
+ IntegerSectionIndexingHandler::Create(
+ &fake_clock, integer_index.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler>
+ qualified_id_join_indexing_handler,
+ QualifiedIdJoinIndexingHandler::Create(
+ &fake_clock, document_store.get(), qualified_id_join_index.get()));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(term_indexing_handler));
+ handlers.push_back(std::move(integer_section_indexing_handler));
+ handlers.push_back(std::move(qualified_id_join_indexing_handler));
+ IndexProcessor index_processor(std::move(handlers), &fake_clock);
+
+ DocumentProto incorrect_message =
+ DocumentBuilder()
+ .SetKey("namespace", "message")
+ .SetSchema("Message")
+ .AddStringProperty("body", "wrong message")
+ .AddInt64Property("indexableInteger", 456)
+ .AddStringProperty("senderQualifiedId", "namespace#person/2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store.get(), lang_segmenter_.get(),
+ std::move(incorrect_message)));
+ ICING_ASSERT_OK(index_processor.IndexDocument(tokenized_document, doc_id));
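+    // At this point the ground truth holds "correct message" while all three
+    // indices hold hits for "wrong message"; only a version-triggered rebuild
+    // should reconcile them, which the queries below verify.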
+
+ // Change existing data's version file
+ const version_util::VersionInfo& existing_version_info = GetParam();
+ ICING_ASSERT_OK(version_util::WriteVersion(
+ *filesystem(), GetVersionFilename(), existing_version_info));
+ }
+
+ // Mock filesystem to observe and check the behavior of all indices.
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ EXPECT_THAT(initialize_result.status(), ProtoIsOk());
+  // Index restoration should be triggered here. The incorrect data should be
+  // deleted and the correct data for message should be indexed.
+ EXPECT_THAT(
+ initialize_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::VERSION_CHANGED));
+
+ // Manually check version file
+ ICING_ASSERT_OK_AND_ASSIGN(
+ version_util::VersionInfo version_info_after_init,
+ version_util::ReadVersion(*filesystem(), GetVersionFilename(),
+ GetIndexDir()));
+ EXPECT_THAT(version_info_after_init.version, Eq(version_util::kVersion));
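+  // max_version is expected to be monotonic: it should record the highest
+  // version that has ever handled this data set, hence the std::max below.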
+ EXPECT_THAT(version_info_after_init.max_version,
+ Eq(std::max(version_util::kVersion, GetParam().max_version)));
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ message;
+
+ // Verify term search
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("body:correct");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+ SearchResultProto search_result_proto1 =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto1, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify numeric (integer) search
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+  SearchResultProto search_result_proto2 =
+      icing.Search(search_spec2, ScoringSpecProto::default_instance(),
+                   ResultSpecProto::default_instance());
+  EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores(
+                                        expected_search_result_proto));
+
+ // Verify join search: join a query for `name:person` with a child query for
+ // `body:message` based on the child's `senderQualifiedId` field.
+ SearchSpecProto search_spec3;
+ search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec3.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec3.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:message");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec3 = ResultSpecProto::default_instance();
+ result_spec3.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto expected_join_search_result_proto;
+ expected_join_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ // Person 1 with message
+ SearchResultProto::ResultProto* result_proto =
+ expected_join_search_result_proto.mutable_results()->Add();
+ *result_proto->mutable_document() = person1;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() = message;
+ // Person 2 without children
+ *expected_join_search_result_proto.mutable_results()
+ ->Add()
+ ->mutable_document() = person2;
+
+ SearchResultProto search_result_proto3 = icing.Search(
+ search_spec3, ScoringSpecProto::default_instance(), result_spec3);
+ EXPECT_THAT(search_result_proto3, EqualsSearchResultIgnoreStatsAndScores(
+ expected_join_search_result_proto));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ IcingSearchEngineInitializationVersionChangeTest,
+ IcingSearchEngineInitializationVersionChangeTest,
+ testing::Values(
+ // Manually change existing data set's version to kVersion + 1. When
+ // initializing, it will detect "rollback".
+ version_util::VersionInfo(
+ /*version_in=*/version_util::kVersion + 1,
+ /*max_version_in=*/version_util::kVersion + 1),
+
+      // Currently we don't have any "upgrade" that requires rebuilding
+      // derived files, so skip this case until we have one.
+
+ // Manually change existing data set's version to kVersion - 1 and
+ // max_version to kVersion. When initializing, it will detect "roll
+ // forward".
+ version_util::VersionInfo(
+ /*version_in=*/version_util::kVersion - 1,
+ /*max_version_in=*/version_util::kVersion),
+
+ // Manually change existing data set's version to 0 and max_version to
+ // 0. When initializing, it will detect "version 0 upgrade".
+ //
+      // Note: in reality, version 0 won't be written into the version file,
+      // but it is ok here since it is a hack to simulate the version 0
+      // situation.
+ version_util::VersionInfo(
+ /*version_in=*/0,
+ /*max_version_in=*/0),
+
+ // Manually change existing data set's version to 0 and max_version to
+ // kVersion. When initializing, it will detect "version 0 roll forward".
+ //
+      // Note: in reality, version 0 won't be written into the version file,
+      // but it is ok here since it is a hack to simulate the version 0
+      // situation.
+ version_util::VersionInfo(
+ /*version_in=*/0,
+ /*max_version_in=*/version_util::kVersion)));
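+
+// For reference, a minimal sketch of how a version check like the one these
+// cases exercise could classify an existing data set. This mirrors the case
+// comments above and is not icing's actual version_util implementation; the
+// ChangeKind enum and ClassifyVersionChange() are hypothetical names.
+//
+//   enum class ChangeKind { kCompatible, kRollback, kRollForward, kUpgrade };
+//
+//   ChangeKind ClassifyVersionChange(const version_util::VersionInfo& info,
+//                                    int current_version) {
+//     if (info.version > current_version) return ChangeKind::kRollback;
+//     if (info.version < current_version) {
+//       return info.max_version >= current_version ? ChangeKind::kRollForward
+//                                                  : ChangeKind::kUpgrade;
+//     }
+//     return ChangeKind::kCompatible;
+//   }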
+
+class IcingSearchEngineInitializationChangePropertyExistenceHitsFlagTest
+ : public IcingSearchEngineInitializationTest,
+ public ::testing::WithParamInterface<std::tuple<bool, bool>> {};
+TEST_P(IcingSearchEngineInitializationChangePropertyExistenceHitsFlagTest,
+ ChangePropertyExistenceHitsFlagTest) {
+ bool before_build_property_existence_metadata_hits = std::get<0>(GetParam());
+ bool after_build_property_existence_metadata_hits = std::get<1>(GetParam());
+ bool flag_changed = before_build_property_existence_metadata_hits !=
+ after_build_property_existence_metadata_hits;
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Value")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("score")
+ .SetDataType(TYPE_DOUBLE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Create a document with every property.
+ DocumentProto document0 = DocumentBuilder()
+ .SetKey("icing", "uri0")
+ .SetSchema("Value")
+ .SetCreationTimestampMs(1)
+ .AddStringProperty("body", "foo")
+ .AddInt64Property("timestamp", 123)
+ .AddDoubleProperty("score", 456.789)
+ .Build();
+ // Create a document with missing body.
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema("Value")
+ .SetCreationTimestampMs(1)
+ .AddInt64Property("timestamp", 123)
+ .AddDoubleProperty("score", 456.789)
+ .Build();
+ // Create a document with missing timestamp.
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema("Value")
+ .SetCreationTimestampMs(1)
+ .AddStringProperty("body", "foo")
+ .AddDoubleProperty("score", 456.789)
+ .Build();
+
+ // 1. Create an index with the 3 documents.
+ {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_build_property_existence_metadata_hits(
+ before_build_property_existence_metadata_hits);
+ TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document0).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ }
+
+ // 2. Create the index again with
+ // after_build_property_existence_metadata_hits.
+ //
+ // Mock filesystem to observe and check the behavior of all indices.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ EXPECT_CALL(*mock_filesystem, DeleteDirectoryRecursively(_))
+ .WillRepeatedly(DoDefault());
+ // Ensure that the term index is rebuilt if the flag is changed.
+ EXPECT_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(EndsWith("/index_dir")))
+ .Times(flag_changed ? 1 : 0);
+
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_build_property_existence_metadata_hits(
+ after_build_property_existence_metadata_hits);
+ TestIcingSearchEngine icing(options, std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+ InitializeResultProto initialize_result = icing.Initialize();
+ ASSERT_THAT(initialize_result.status(), ProtoIsOk());
+  // Ensure that the term index is rebuilt if the flag is changed. Discarding
+  // the index directory surfaces as an IO_ERROR restoration cause.
+ EXPECT_THAT(initialize_result.initialize_stats().index_restoration_cause(),
+ Eq(flag_changed ? InitializeStatsProto::IO_ERROR
+ : InitializeStatsProto::NONE));
+ EXPECT_THAT(
+ initialize_result.initialize_stats().integer_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(initialize_result.initialize_stats()
+ .qualified_id_join_index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Get all documents that have "body".
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec.add_enabled_features(std::string(kHasPropertyFunctionFeature));
+ search_spec.add_enabled_features(
+ std::string(kListFilterQueryLanguageFeature));
+ search_spec.set_query("hasProperty(\"body\")");
+ SearchResultProto results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ if (after_build_property_existence_metadata_hits) {
+ EXPECT_THAT(results.results(), SizeIs(2));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document2));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document0));
+ } else {
+ EXPECT_THAT(results.results(), IsEmpty());
+ }
+
+ // Get all documents that have "timestamp".
+ search_spec.set_query("hasProperty(\"timestamp\")");
+ results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ if (after_build_property_existence_metadata_hits) {
+ EXPECT_THAT(results.results(), SizeIs(2));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document1));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document0));
+ } else {
+ EXPECT_THAT(results.results(), IsEmpty());
+ }
+
+ // Get all documents that have "score".
+ search_spec.set_query("hasProperty(\"score\")");
+ results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ if (after_build_property_existence_metadata_hits) {
+ EXPECT_THAT(results.results(), SizeIs(3));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document2));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document1));
+ EXPECT_THAT(results.results(2).document(), EqualsProto(document0));
+ } else {
+ EXPECT_THAT(results.results(), IsEmpty());
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ IcingSearchEngineInitializationChangePropertyExistenceHitsFlagTest,
+ IcingSearchEngineInitializationChangePropertyExistenceHitsFlagTest,
+ testing::Values(std::make_tuple(false, false), std::make_tuple(false, true),
+ std::make_tuple(true, false), std::make_tuple(true, true)));
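+
+// Note: the four tuples above spell out every (before, after) flag
+// combination by hand. An equivalent instantiation could use gtest's
+// testing::Combine, which expands the cross product automatically:
+//
+//   INSTANTIATE_TEST_SUITE_P(
+//       IcingSearchEngineInitializationChangePropertyExistenceHitsFlagTest,
+//       IcingSearchEngineInitializationChangePropertyExistenceHitsFlagTest,
+//       testing::Combine(testing::Bool(), testing::Bool()));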
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine_optimize_test.cc b/icing/icing-search-engine_optimize_test.cc
new file mode 100644
index 0000000..61b594c
--- /dev/null
+++ b/icing/icing-search-engine_optimize_test.cc
@@ -0,0 +1,1855 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/icing-search-engine.h"
+#include "icing/jni/jni-cache.h"
+#include "icing/join/join-processor.h"
+#include "icing/portable/endian.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/initialize.pb.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/proto/optimize.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/reset.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/status.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/query/query-features.h"
+#include "icing/schema-builder.h"
+#include "icing/store/document-log-creator.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::Eq;
+using ::testing::Ge;
+using ::testing::Gt;
+using ::testing::HasSubstr;
+using ::testing::Lt;
+using ::testing::Return;
+
+// For mocking purposes, we allow tests to provide a custom Filesystem.
+class TestIcingSearchEngine : public IcingSearchEngine {
+ public:
+ TestIcingSearchEngine(const IcingSearchEngineOptions& options,
+ std::unique_ptr<const Filesystem> filesystem,
+ std::unique_ptr<const IcingFilesystem> icing_filesystem,
+ std::unique_ptr<Clock> clock,
+ std::unique_ptr<JniCache> jni_cache)
+ : IcingSearchEngine(options, std::move(filesystem),
+ std::move(icing_filesystem), std::move(clock),
+ std::move(jni_cache)) {}
+};
+
+std::string GetTestBaseDir() { return GetTestTempDir() + "/icing"; }
+
+// This test suite is meant to cover all tests relating to
+// IcingSearchEngine::Optimize.
+class IcingSearchEngineOptimizeTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ // If we've specified using the reverse-JNI method for segmentation (i.e.
+ // not ICU), then we won't have the ICU data file included to set up.
+ // Technically, we could choose to use reverse-JNI for segmentation AND
+ // include an ICU data file, but that seems unlikely and our current BUILD
+ // setup doesn't do this.
+ // File generated via icu_data_file rule in //icing/BUILD.
+ std::string icu_data_file_path =
+ GetTestFilePath("icing/icu.dat");
+ ICING_ASSERT_OK(
+ icu_data_file_helper::SetUpICUDataFile(icu_data_file_path));
+ }
+ filesystem_.CreateDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ const Filesystem* filesystem() const { return &filesystem_; }
+
+ private:
+ Filesystem filesystem_;
+};
+
+// Non-zero value so we don't override it to be the current time
+constexpr int64_t kDefaultCreationTimestampMs = 1575492852000;
+
+IcingSearchEngineOptions GetDefaultIcingOptions() {
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_base_dir(GetTestBaseDir());
+ return icing_options;
+}
+
+ScoringSpecProto GetDefaultScoringSpec() {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ return scoring_spec;
+}
+
+// TODO(b/272145329): create SearchSpecBuilder, JoinSpecBuilder,
+// SearchResultProtoBuilder and ResultProtoBuilder for unit tests and build all
+// instances by them.
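+//
+// A hypothetical shape for one such builder, to illustrate the intent of the
+// TODO; the SearchSpecBuilder class below is not part of icing:
+//
+//   class SearchSpecBuilder {
+//    public:
+//     SearchSpecBuilder& SetQuery(std::string query) {
+//       spec_.set_query(std::move(query));
+//       return *this;
+//     }
+//     SearchSpecBuilder& SetTermMatchType(TermMatchType::Code match_type) {
+//       spec_.set_term_match_type(match_type);
+//       return *this;
+//     }
+//     SearchSpecProto Build() && { return std::move(spec_); }
+//
+//    private:
+//     SearchSpecProto spec_;
+//   };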
+
+TEST_F(IcingSearchEngineOptimizeTest,
+ AllPageTokensShouldBeInvalidatedAfterOptimization) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body two")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+
+ // Searches and gets the first page, 1 result
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto.next_page_token(), Gt(kInvalidNextPageToken));
+ uint64_t next_page_token = search_result_proto.next_page_token();
+  // Since the token is a random number, we don't verify its exact value.
+ expected_search_result_proto.set_next_page_token(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+  // At this point, document1 has not yet been fetched.
+
+ OptimizeResultProto optimize_result_proto;
+ optimize_result_proto.mutable_status()->set_code(StatusProto::OK);
+ optimize_result_proto.mutable_status()->set_message("");
+ OptimizeResultProto actual_result = icing.Optimize();
+ actual_result.clear_optimize_stats();
+ ASSERT_THAT(actual_result, EqualsProto(optimize_result_proto));
+
+  // Tries to fetch the second page; there are no results since all tokens
+  // have been invalidated during Optimize().
+ expected_search_result_proto.clear_results();
+ expected_search_result_proto.clear_next_page_token();
+ search_result_proto = icing.GetNextPage(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
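+
+// The test above fetches only one extra page. More generally, a caller can
+// drain all pages by looping until the returned token is
+// kInvalidNextPageToken; a sketch using the same Search()/GetNextPage() APIs:
+//
+//   SearchResultProto page =
+//       icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+//   while (true) {
+//     // ... consume page.results() ...
+//     if (page.next_page_token() == kInvalidNextPageToken) break;
+//     page = icing.GetNextPage(page.next_page_token());
+//   }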
+
+TEST_F(IcingSearchEngineOptimizeTest, OptimizationShouldRemoveDeletedDocs) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace, uri1) not found.");
+ {
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+ // Deletes document1
+ ASSERT_THAT(icing.Delete("namespace", "uri1").status(), ProtoIsOk());
+ const std::string document_log_path =
+ icing_options.base_dir() + "/document_dir/" +
+ DocumentLogCreator::GetDocumentLogFilename();
+ int64_t document_log_size_before =
+ filesystem()->GetFileSize(document_log_path.c_str());
+ ASSERT_THAT(icing.Optimize().status(), ProtoIsOk());
+ int64_t document_log_size_after =
+ filesystem()->GetFileSize(document_log_path.c_str());
+
+ // Validates that document can't be found right after Optimize()
+ EXPECT_THAT(
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ // Validates that document is actually removed from document log
+ EXPECT_THAT(document_log_size_after, Lt(document_log_size_before));
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+}
+
+TEST_F(IcingSearchEngineOptimizeTest,
+ OptimizationShouldDeleteTemporaryDirectory) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+  // Create a tmp dir that will be used in Optimize() to swap files. This
+  // validates that any pre-existing tmp dirs are deleted before use.
+ const std::string tmp_dir =
+ icing_options.base_dir() + "/document_dir_optimize_tmp";
+
+ const std::string tmp_file = tmp_dir + "/file";
+ ASSERT_TRUE(filesystem()->CreateDirectory(tmp_dir.c_str()));
+ ScopedFd fd(filesystem()->OpenForWrite(tmp_file.c_str()));
+ ASSERT_TRUE(fd.is_valid());
+ ASSERT_TRUE(filesystem()->Write(fd.get(), "1234", 4));
+ fd.reset();
+
+ EXPECT_THAT(icing.Optimize().status(), ProtoIsOk());
+
+ EXPECT_FALSE(filesystem()->DirectoryExists(tmp_dir.c_str()));
+ EXPECT_FALSE(filesystem()->FileExists(tmp_file.c_str()));
+}
+
+TEST_F(IcingSearchEngineOptimizeTest, GetOptimizeInfoHasCorrectStats) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body two")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(500)
+ .Build();
+
+ {
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSystemTimeMilliseconds(1000);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Just initialized, nothing is optimizable yet.
+ GetOptimizeInfoResultProto optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status(), ProtoIsOk());
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0));
+ EXPECT_THAT(optimize_info.time_since_last_optimize_ms(), Eq(0));
+
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+ // Only have active documents, nothing is optimizable yet.
+ optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status(), ProtoIsOk());
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0));
+ EXPECT_THAT(optimize_info.time_since_last_optimize_ms(), Eq(0));
+
+ // Deletes document1
+ ASSERT_THAT(icing.Delete("namespace", "uri1").status(), ProtoIsOk());
+
+ optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status(), ProtoIsOk());
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(1));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Gt(0));
+ EXPECT_THAT(optimize_info.time_since_last_optimize_ms(), Eq(0));
+ int64_t first_estimated_optimizable_bytes =
+ optimize_info.estimated_optimizable_bytes();
+
+ // Add a second document, but it'll be expired since the time (1000) is
+ // greater than the document's creation timestamp (100) + the document's ttl
+ // (500)
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status(), ProtoIsOk());
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(2));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(),
+ Gt(first_estimated_optimizable_bytes));
+ EXPECT_THAT(optimize_info.time_since_last_optimize_ms(), Eq(0));
+
+ // Optimize
+ ASSERT_THAT(icing.Optimize().status(), ProtoIsOk());
+ }
+
+ {
+ // Recreate with new time
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSystemTimeMilliseconds(5000);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Nothing is optimizable now that everything has been optimized away.
+ GetOptimizeInfoResultProto optimize_info = icing.GetOptimizeInfo();
+ EXPECT_THAT(optimize_info.status(), ProtoIsOk());
+ EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0));
+ EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0));
+ EXPECT_THAT(optimize_info.time_since_last_optimize_ms(), Eq(4000));
+ }
+}
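+
+// The expiry logic exercised above is plain arithmetic: with the fake clock
+// at 1000ms, document2 (creation 100ms, ttl 500ms) is expired because
+// 100 + 500 < 1000. A sketch of that check; IsExpired() is a hypothetical
+// helper (assuming a ttl of 0 means "never expires"), not icing's code:
+//
+//   bool IsExpired(int64_t creation_timestamp_ms, int64_t ttl_ms,
+//                  int64_t now_ms) {
+//     return ttl_ms > 0 && creation_timestamp_ms + ttl_ms < now_ms;
+//   }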
+
+TEST_F(IcingSearchEngineOptimizeTest, GetAndPutShouldWorkAfterOptimization) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body two")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body three")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document4 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri4")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body four")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document5 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri5")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body five")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Delete("namespace", "uri2").status(), ProtoIsOk());
+ ASSERT_THAT(icing.Optimize().status(), ProtoIsOk());
+
+ // Validates that Get() and Put() are good right after Optimize()
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ EXPECT_THAT(
+ icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(
+ icing.Get("namespace", "uri3", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ EXPECT_THAT(icing.Put(document4).status(), ProtoIsOk());
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ *expected_get_result_proto.mutable_document() = document1;
+ EXPECT_THAT(
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ EXPECT_THAT(
+ icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+ *expected_get_result_proto.mutable_document() = document3;
+ EXPECT_THAT(
+ icing.Get("namespace", "uri3", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ *expected_get_result_proto.mutable_document() = document4;
+ EXPECT_THAT(
+ icing.Get("namespace", "uri4", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ EXPECT_THAT(icing.Put(document5).status(), ProtoIsOk());
+}
+
+TEST_F(IcingSearchEngineOptimizeTest,
+ GetAndPutShouldWorkAfterOptimizationWithEmptyDocuments) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto empty_document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto empty_document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto empty_document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(std::move(schema)).status(), ProtoIsOk());
+
+ ASSERT_THAT(icing.Put(empty_document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(empty_document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Delete("namespace", "uri2").status(), ProtoIsOk());
+ ASSERT_THAT(icing.Optimize().status(), ProtoIsOk());
+
+ // Validates that Get() and Put() are good right after Optimize()
+ *expected_get_result_proto.mutable_document() = empty_document1;
+ EXPECT_THAT(
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ EXPECT_THAT(
+ icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
+ .status()
+ .code(),
+ Eq(StatusProto::NOT_FOUND));
+ EXPECT_THAT(icing.Put(empty_document3).status(), ProtoIsOk());
+}
+
+TEST_F(IcingSearchEngineOptimizeTest, DeleteShouldWorkAfterOptimization) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body two")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Optimize().status(), ProtoIsOk());
+
+ // Validates that Delete() works right after Optimize()
+ EXPECT_THAT(icing.Delete("namespace", "uri1").status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(
+ StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace, uri1) not found.");
+ EXPECT_THAT(
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->clear_message();
+ *expected_get_result_proto.mutable_document() = document2;
+ EXPECT_THAT(
+ icing.Get("namespace", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.Delete("namespace", "uri2").status(), ProtoIsOk());
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace, uri1) not found.");
+ EXPECT_THAT(
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace, uri2) not found.");
+ EXPECT_THAT(
+ icing.Get("namespace", "uri2", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+}
+
+TEST_F(IcingSearchEngineOptimizeTest, OptimizationFailureUninitializesIcing) {
+  // Set up the filesystem to fail.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ bool just_swapped_files = false;
+ auto create_dir_lambda = [this, &just_swapped_files](const char* dir_name) {
+ if (just_swapped_files) {
+ // We should fail the first call immediately after swapping files.
+ just_swapped_files = false;
+ return false;
+ }
+ return filesystem()->CreateDirectoryRecursively(dir_name);
+ };
+ ON_CALL(*mock_filesystem, CreateDirectoryRecursively)
+ .WillByDefault(create_dir_lambda);
+
+ auto swap_lambda = [&just_swapped_files](const char* first_dir,
+ const char* second_dir) {
+ just_swapped_files = true;
+ return false;
+ };
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ ON_CALL(*mock_filesystem, SwapFiles(HasSubstr("document_dir_optimize_tmp"),
+ HasSubstr("document_dir")))
+ .WillByDefault(swap_lambda);
+ TestIcingSearchEngine icing(options, std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // The mocks should cause an unrecoverable error during Optimize - returning
+ // INTERNAL.
+ ASSERT_THAT(icing.Optimize().status(), ProtoStatusIs(StatusProto::INTERNAL));
+
+ // Ordinary operations should fail safely.
+ SchemaProto simple_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type0").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop0")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ DocumentProto simple_doc = DocumentBuilder()
+ .SetKey("namespace0", "uri0")
+ .SetSchema("type0")
+ .AddStringProperty("prop0", "foo")
+ .Build();
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ ResultSpecProto result_spec;
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+
+ EXPECT_THAT(icing.SetSchema(simple_schema).status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Put(simple_doc).status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing
+ .Get(simple_doc.namespace_(), simple_doc.uri(),
+ GetResultSpecProto::default_instance())
+ .status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing.Search(search_spec, scoring_spec, result_spec).status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+
+ // Reset should get icing back to a safe (empty) and working state.
+ EXPECT_THAT(icing.Reset().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(simple_schema).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(simple_doc).status(), ProtoIsOk());
+ EXPECT_THAT(icing
+ .Get(simple_doc.namespace_(), simple_doc.uri(),
+ GetResultSpecProto::default_instance())
+ .status(),
+ ProtoIsOk());
+ EXPECT_THAT(icing.Search(search_spec, scoring_spec, result_spec).status(),
+ ProtoIsOk());
+}
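+
+// A note on the mock pattern above: ON_CALL(...).WillByDefault(...) installs
+// fallback behavior without adding call-count expectations, so the test only
+// constrains what the filesystem does, not how often it is called. The same
+// gmock idiom in isolation:
+//
+//   auto fs = std::make_unique<MockFilesystem>();
+//   ON_CALL(*fs, CreateDirectoryRecursively)
+//       .WillByDefault([](const char* dir_name) { return true; });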
+
+TEST_F(IcingSearchEngineOptimizeTest, SetSchemaShouldWorkAfterOptimization) {
+ // Creates 3 test schemas
+ SchemaProto schema1 =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ SchemaProto schema2 = SchemaProto(schema1);
+ *schema2.mutable_types(0)->add_properties() =
+ PropertyConfigBuilder()
+ .SetName("property2")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+
+ SchemaProto schema3 = SchemaProto(schema2);
+ *schema3.mutable_types(0)->add_properties() =
+ PropertyConfigBuilder()
+ .SetName("property3")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Optimize().status(), ProtoIsOk());
+
+ // Validates that SetSchema() works right after Optimize()
+ EXPECT_THAT(icing.SetSchema(schema2).status(), ProtoIsOk());
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(schema3).status(), ProtoIsOk());
+}
+
+TEST_F(IcingSearchEngineOptimizeTest, SearchShouldWorkAfterOptimization) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ SearchSpecProto search_spec1;
+ search_spec1.set_term_match_type(TermMatchType::PREFIX);
+ search_spec1.set_query("m");
+
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document;
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Optimize().status(), ProtoIsOk());
+
+ // Validates that Search() works right after Optimize()
+ // Term search
+ SearchResultProto search_result_proto1 =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto1, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Numeric (integer) search
+    SearchResultProto search_result_proto2 =
+        icing.Search(search_spec2, GetDefaultScoringSpec(),
+                     ResultSpecProto::default_instance());
+    EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores(
+                                          expected_search_result_proto));
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Verify term search
+ SearchResultProto search_result_proto1 =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto1, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify numeric (integer) search
+  SearchResultProto search_result_proto2 =
+      icing.Search(search_spec2, GetDefaultScoringSpec(),
+                   ResultSpecProto::default_instance());
+  EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores(
+                                        expected_search_result_proto));
+}
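+
+// Numeric search in the test above needs two opt-ins on the SearchSpecProto:
+// the experimental advanced-query parser and the numeric-search feature. The
+// minimal spec for an integer range query therefore looks like:
+//
+//   SearchSpecProto spec;
+//   spec.set_query("indexableInteger == 123");
+//   spec.set_search_type(
+//       SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+//   spec.add_enabled_features(std::string(kNumericSearchFeature));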
+
+TEST_F(IcingSearchEngineOptimizeTest,
+ JoinShouldWorkAfterOptimizationDeleteParent) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person1 =
+ DocumentBuilder()
+ .SetKey("namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person one")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto person2 =
+ DocumentBuilder()
+ .SetKey("namespace", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person two")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ DocumentProto message1 =
+ DocumentBuilder()
+ .SetKey("namespace", "message1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .AddStringProperty("senderQualifiedId", "namespace#person1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message2 =
+ DocumentBuilder()
+ .SetKey("namespace", "message2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body two")
+ .AddStringProperty("senderQualifiedId", "namespace#person1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message3 =
+ DocumentBuilder()
+ .SetKey("namespace", "message3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body three")
+ .AddStringProperty("senderQualifiedId", "namespace#person2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // Prepare join search spec to join a query for `name:person` with a child
+ // query for `body:message` based on the child's `senderQualifiedId` field.
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:message");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec = ResultSpecProto::default_instance();
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+  // Person1 is going to be deleted below. Only person2, which is joined with
+  // message3, should match the query.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto =
+ expected_search_result_proto.mutable_results()->Add();
+ *result_proto->mutable_document() = person2;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() = message3;
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(message1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(message2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(message3).status(), ProtoIsOk());
+ // Delete parent document: person1
+ ASSERT_THAT(icing.Delete("namespace", "person1").status(), ProtoIsOk());
+ ASSERT_THAT(icing.Optimize().status(), ProtoIsOk());
+
+ // Validates that join search query works right after Optimize()
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineOptimizeTest,
+ JoinShouldWorkAfterOptimizationDeleteChild) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person1 =
+ DocumentBuilder()
+ .SetKey("namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person one")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto person2 =
+ DocumentBuilder()
+ .SetKey("namespace", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person two")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ DocumentProto message1 =
+ DocumentBuilder()
+ .SetKey("namespace", "message1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .AddStringProperty("senderQualifiedId", "namespace#person1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message2 =
+ DocumentBuilder()
+ .SetKey("namespace", "message2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body two")
+ .AddStringProperty("senderQualifiedId", "namespace#person1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message3 =
+ DocumentBuilder()
+ .SetKey("namespace", "message3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body three")
+ .AddStringProperty("senderQualifiedId", "namespace#person2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // Prepare join search spec to join a query for `name:person` with a child
+ // query for `body:message` based on the child's `senderQualifiedId` field.
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:message");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec = ResultSpecProto::default_instance();
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+  // Message1 and message3 are going to be deleted below. Both person1 and
+  // person2 should be included even though person2 has no child (since we're
+  // doing a left join).
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto1 =
+ expected_search_result_proto.mutable_results()->Add();
+ *result_proto1->mutable_document() = person1;
+ *result_proto1->mutable_joined_results()->Add()->mutable_document() =
+ message2;
+  SearchResultProto::ResultProto* result_proto2 =
+      expected_search_result_proto.mutable_results()->Add();
+  *result_proto2->mutable_document() = person2;
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(message1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(message2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(message3).status(), ProtoIsOk());
+ // Delete child documents: message1 and message3
+ ASSERT_THAT(icing.Delete("namespace", "message1").status(), ProtoIsOk());
+ ASSERT_THAT(icing.Delete("namespace", "message3").status(), ProtoIsOk());
+ ASSERT_THAT(icing.Optimize().status(), ProtoIsOk());
+
+ // Validates that join search query works right after Optimize()
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
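+
+// In both join tests above, the join key is the child's "senderQualifiedId"
+// string property, which encodes the parent's key as "<namespace>#<uri>"
+// (e.g. "namespace#person1"). A hypothetical helper showing how these tests
+// assemble such a qualified id (ignoring any escaping a real qualified id
+// may require):
+//
+//   std::string MakeQualifiedId(const std::string& ns,
+//                               const std::string& uri) {
+//     return ns + "#" + uri;
+//   }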
+
+TEST_F(IcingSearchEngineOptimizeTest,
+ IcingShouldWorkFineIfOptimizationIsAborted) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ DocumentProto message1 =
+ DocumentBuilder()
+ .SetKey("namespace", "message1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ {
+ // Initializes a normal icing to create files needed
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(message1).status(), ProtoIsOk());
+ }
+
+  // Creates a mock filesystem in which DeleteDirectoryRecursively() always
+  // fails. This will fail IcingSearchEngine::OptimizeDocumentStore() and make
+  // it return ABORTED_ERROR.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ ON_CALL(*mock_filesystem,
+ DeleteDirectoryRecursively(HasSubstr("_optimize_tmp")))
+ .WillByDefault(Return(false));
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.Optimize().status(), ProtoStatusIs(StatusProto::ABORTED));
+
+  // Now that optimization has been aborted, verify that document-related
+  // functions still work as expected.
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = message1;
+ EXPECT_THAT(icing.Get("namespace", "message1",
+ GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ DocumentProto message2 =
+ DocumentBuilder()
+ .SetKey("namespace", "message2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body two")
+ .AddInt64Property("indexableInteger", 123)
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ EXPECT_THAT(icing.Put(message2).status(), ProtoIsOk());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ message2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ message1;
+
+ // Verify term search
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("body:m");
+ search_spec1.set_term_match_type(TermMatchType::PREFIX);
+
+ SearchResultProto search_result_proto1 =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto1, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify numeric (integer) search
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+  SearchResultProto search_result_proto2 =
+      icing.Search(search_spec2, GetDefaultScoringSpec(),
+                   ResultSpecProto::default_instance());
+  EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores(
+                                        expected_search_result_proto));
+
+ // Verify join search: join a query for `name:person` with a child query for
+ // `body:message` based on the child's `senderQualifiedId` field.
+ SearchSpecProto search_spec3;
+ search_spec3.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec3.set_query("name:person");
+ JoinSpecProto* join_spec = search_spec3.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("body:message");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec3 = ResultSpecProto::default_instance();
+ result_spec3.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto expected_join_search_result_proto;
+ expected_join_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto =
+ expected_join_search_result_proto.mutable_results()->Add();
+ *result_proto->mutable_document() = person;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() = message2;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() = message1;
+
+ SearchResultProto search_result_proto3 =
+ icing.Search(search_spec3, GetDefaultScoringSpec(), result_spec3);
+ EXPECT_THAT(search_result_proto3, EqualsSearchResultIgnoreStatsAndScores(
+ expected_join_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineOptimizeTest,
+ OptimizationShouldRecoverIfFileDirectoriesAreMissing) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+  // Create a mock filesystem in which SwapFiles() always fails and deletes
+  // both directories. This causes IcingSearchEngine::OptimizeDocumentStore()
+  // to fail.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ ON_CALL(*mock_filesystem, SwapFiles(HasSubstr("document_dir_optimize_tmp"),
+ HasSubstr("document_dir")))
+ .WillByDefault([this](const char* one, const char* two) {
+ filesystem()->DeleteDirectoryRecursively(one);
+ filesystem()->DeleteDirectoryRecursively(two);
+ return false;
+ });
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ // Optimize() fails due to filesystem error
+ OptimizeResultProto result = icing.Optimize();
+ EXPECT_THAT(result.status(), ProtoStatusIs(StatusProto::WARNING_DATA_LOSS));
+  // The index should be fully rebuilt because of the data loss.
+ EXPECT_THAT(result.optimize_stats().index_restoration_mode(),
+ Eq(OptimizeStatsProto::FULL_INDEX_REBUILD));
+
+  // The document is not found because the original file directory is missing.
+  GetResultProto expected_get_result_proto;
+  expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+  expected_get_result_proto.mutable_status()->set_message(
+      "Document (namespace, uri1) not found.");
+  EXPECT_THAT(
+      icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()),
+      EqualsProto(expected_get_result_proto));
+
+ DocumentProto new_document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "new body")
+ .AddInt64Property("indexableInteger", 456)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ EXPECT_THAT(icing.Put(new_document).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("m");
+ search_spec1.set_term_match_type(TermMatchType::PREFIX);
+
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+
+  // Searching old content returns nothing because the original file directory
+  // is missing.
+ // Term search
+ SearchResultProto search_result_proto1 =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto1, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Numeric (integer) search
+  SearchResultProto search_result_proto2 =
+      icing.Search(search_spec2, GetDefaultScoringSpec(),
+                   ResultSpecProto::default_instance());
+  EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores(
+                                        expected_search_result_proto));
+
+ // Searching new content returns the new document
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ new_document;
+ // Term search
+ search_spec1.set_query("n");
+ search_result_proto1 = icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto1, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Numeric (integer) search
+ search_spec2.set_query("indexableInteger == 456");
+  search_result_proto2 = icing.Search(search_spec2, GetDefaultScoringSpec(),
+                                      ResultSpecProto::default_instance());
+  EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores(
+                                        expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineOptimizeTest,
+ OptimizationShouldRecoverIfDataFilesAreMissing) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+  // Create a mock filesystem in which SwapFiles() always fails and replaces
+  // both directories with empty ones. This causes
+  // IcingSearchEngine::OptimizeDocumentStore() to fail.
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ ON_CALL(*mock_filesystem, SwapFiles(HasSubstr("document_dir_optimize_tmp"),
+ HasSubstr("document_dir")))
+ .WillByDefault([this](const char* one, const char* two) {
+ filesystem()->DeleteDirectoryRecursively(one);
+ filesystem()->CreateDirectoryRecursively(one);
+ filesystem()->DeleteDirectoryRecursively(two);
+ filesystem()->CreateDirectoryRecursively(two);
+ return false;
+ });
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ // Optimize() fails due to filesystem error
+ OptimizeResultProto result = icing.Optimize();
+ EXPECT_THAT(result.status(), ProtoStatusIs(StatusProto::WARNING_DATA_LOSS));
+  // The index should be fully rebuilt because of the data loss.
+ EXPECT_THAT(result.optimize_stats().index_restoration_mode(),
+ Eq(OptimizeStatsProto::FULL_INDEX_REBUILD));
+
+  // The document is not found because the original files are missing.
+  GetResultProto expected_get_result_proto;
+  expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+  expected_get_result_proto.mutable_status()->set_message(
+      "Document (namespace, uri1) not found.");
+  EXPECT_THAT(
+      icing.Get("namespace", "uri1", GetResultSpecProto::default_instance()),
+      EqualsProto(expected_get_result_proto));
+
+ DocumentProto new_document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "new body")
+ .AddInt64Property("indexableInteger", 456)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ EXPECT_THAT(icing.Put(new_document).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("m");
+ search_spec1.set_term_match_type(TermMatchType::PREFIX);
+
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("indexableInteger == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+
+  // Searching old content returns nothing because the original files are
+  // missing.
+ // Term search
+ SearchResultProto search_result_proto1 =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto1, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Numeric (integer) search
+  SearchResultProto search_result_proto2 =
+      icing.Search(search_spec2, GetDefaultScoringSpec(),
+                   ResultSpecProto::default_instance());
+  EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores(
+                                        expected_search_result_proto));
+
+ // Searching new content returns the new document
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ new_document;
+ // Term search
+ search_spec1.set_query("n");
+ search_result_proto1 = icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto1, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Numeric (integer) search
+ search_spec2.set_query("indexableInteger == 456");
+  search_result_proto2 = icing.Search(search_spec2, GetDefaultScoringSpec(),
+                                      ResultSpecProto::default_instance());
+  EXPECT_THAT(search_result_proto2, EqualsSearchResultIgnoreStatsAndScores(
+                                        expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineOptimizeTest, OptimizeThresholdTest) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .AddInt64Property("indexableInteger", 1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body two")
+ .AddInt64Property("indexableInteger", 2)
+ .SetCreationTimestampMs(9000)
+ .SetTtlMs(500)
+ .Build();
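+  // With a creation time of 9000 ms and a TTL of 500 ms, document2 expires at
+  // 9500 ms, before the fake system time of 10000 ms set below, so Optimize()
+  // should count it as expired.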
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body three")
+ .AddInt64Property("indexableInteger", 3)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(5);
+ fake_clock->SetSystemTimeMilliseconds(10000);
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+  // Set the rebuild-index threshold to 0.9 to verify that the threshold
+  // controls whether Optimize() rebuilds or translates the index.
+ options.set_optimize_rebuild_index_threshold(0.9);
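+  // A rough sketch of the decision this threshold drives (an assumption, not
+  // the exact implementation): rebuild the index only when
+  //   (num_deleted + num_expired) / num_original >= threshold.
+  // In this test, (1 + 1) / 3 ~= 0.67 < 0.9, so the first Optimize() below is
+  // expected to translate the index rather than rebuild it.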
+ auto icing = std::make_unique<TestIcingSearchEngine>(
+ options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::move(fake_clock),
+ GetTestJniCache());
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ // Add three documents.
+ ASSERT_THAT(icing->Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing->Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing->Put(document3).status(), ProtoIsOk());
+
+ // Delete the first document.
+ ASSERT_THAT(icing->Delete(document1.namespace_(), document1.uri()).status(),
+ ProtoIsOk());
+ ASSERT_THAT(icing->PersistToDisk(PersistType::FULL).status(), ProtoIsOk());
+
+ OptimizeStatsProto expected;
+ expected.set_latency_ms(5);
+ expected.set_document_store_optimize_latency_ms(5);
+ expected.set_index_restoration_latency_ms(5);
+ expected.set_num_original_documents(3);
+ expected.set_num_deleted_documents(1);
+ expected.set_num_expired_documents(1);
+ expected.set_num_original_namespaces(1);
+ expected.set_num_deleted_namespaces(0);
+ expected.set_index_restoration_mode(OptimizeStatsProto::INDEX_TRANSLATION);
+
+ // Run Optimize
+ OptimizeResultProto result = icing->Optimize();
+ // Depending on how many blocks the documents end up spread across, it's
+ // possible that Optimize can remove documents without shrinking storage. The
+ // first Optimize call will also write the OptimizeStatusProto for the first
+ // time which will take up 1 block. So make sure that before_size is no less
+ // than after_size - 1 block.
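+  // For example, with a 4096-byte page, before_size = 8192 still passes
+  // against after_size = 12288, since 8192 >= 12288 - 4096.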
+ uint32_t page_size = getpagesize();
+ EXPECT_THAT(result.optimize_stats().storage_size_before(),
+ Ge(result.optimize_stats().storage_size_after() - page_size));
+ result.mutable_optimize_stats()->clear_storage_size_before();
+ result.mutable_optimize_stats()->clear_storage_size_after();
+ EXPECT_THAT(result.optimize_stats(), EqualsProto(expected));
+
+ fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(5);
+ fake_clock->SetSystemTimeMilliseconds(20000);
+ icing = std::make_unique<TestIcingSearchEngine>(
+ options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::move(fake_clock),
+ GetTestJniCache());
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+
+ expected = OptimizeStatsProto();
+ expected.set_latency_ms(5);
+ expected.set_document_store_optimize_latency_ms(5);
+ expected.set_index_restoration_latency_ms(5);
+ expected.set_num_original_documents(1);
+ expected.set_num_deleted_documents(0);
+ expected.set_num_expired_documents(0);
+ expected.set_num_original_namespaces(1);
+ expected.set_num_deleted_namespaces(0);
+ expected.set_time_since_last_optimize_ms(10000);
+ expected.set_index_restoration_mode(OptimizeStatsProto::INDEX_TRANSLATION);
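+  // Nothing was removed this round (0 of 1 documents), so the removal ratio
+  // stays below the threshold and translation is expected again.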
+
+ // Run Optimize
+ result = icing->Optimize();
+ EXPECT_THAT(result.optimize_stats().storage_size_before(),
+ Eq(result.optimize_stats().storage_size_after()));
+ result.mutable_optimize_stats()->clear_storage_size_before();
+ result.mutable_optimize_stats()->clear_storage_size_after();
+ EXPECT_THAT(result.optimize_stats(), EqualsProto(expected));
+
+ // Delete the last document.
+ ASSERT_THAT(icing->Delete(document3.namespace_(), document3.uri()).status(),
+ ProtoIsOk());
+
+ expected = OptimizeStatsProto();
+ expected.set_latency_ms(5);
+ expected.set_document_store_optimize_latency_ms(5);
+ expected.set_index_restoration_latency_ms(5);
+ expected.set_num_original_documents(1);
+ expected.set_num_deleted_documents(1);
+ expected.set_num_expired_documents(0);
+ expected.set_num_original_namespaces(1);
+ expected.set_num_deleted_namespaces(1);
+ expected.set_time_since_last_optimize_ms(0);
+ // Should rebuild the index since all documents are removed.
+ expected.set_index_restoration_mode(OptimizeStatsProto::FULL_INDEX_REBUILD);
+
+ // Run Optimize
+ result = icing->Optimize();
+ EXPECT_THAT(result.optimize_stats().storage_size_before(),
+ Ge(result.optimize_stats().storage_size_after()));
+ result.mutable_optimize_stats()->clear_storage_size_before();
+ result.mutable_optimize_stats()->clear_storage_size_after();
+ EXPECT_THAT(result.optimize_stats(), EqualsProto(expected));
+}
+
+TEST_F(IcingSearchEngineOptimizeTest, OptimizeStatsProtoTest) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .AddInt64Property("indexableInteger", 1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body two")
+ .AddInt64Property("indexableInteger", 2)
+ .SetCreationTimestampMs(9000)
+ .SetTtlMs(500)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body three")
+ .AddInt64Property("indexableInteger", 3)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(5);
+ fake_clock->SetSystemTimeMilliseconds(10000);
+ // Use the default Icing options, so that a change to the default value will
+ // require updating this test.
+ auto icing = std::make_unique<TestIcingSearchEngine>(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::move(fake_clock),
+ GetTestJniCache());
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing->SetSchema(schema).status(), ProtoIsOk());
+
+ // Add three documents.
+ ASSERT_THAT(icing->Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing->Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing->Put(document3).status(), ProtoIsOk());
+
+ // Delete the first document.
+ ASSERT_THAT(icing->Delete(document1.namespace_(), document1.uri()).status(),
+ ProtoIsOk());
+ ASSERT_THAT(icing->PersistToDisk(PersistType::FULL).status(), ProtoIsOk());
+
+ OptimizeStatsProto expected;
+ expected.set_latency_ms(5);
+ expected.set_document_store_optimize_latency_ms(5);
+ expected.set_index_restoration_latency_ms(5);
+ expected.set_num_original_documents(3);
+ expected.set_num_deleted_documents(1);
+ expected.set_num_expired_documents(1);
+ expected.set_num_original_namespaces(1);
+ expected.set_num_deleted_namespaces(0);
+ expected.set_index_restoration_mode(OptimizeStatsProto::FULL_INDEX_REBUILD);
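+  // With the default options, removing 2 of 3 documents presumably exceeds
+  // the default rebuild threshold, so a full index rebuild is expected.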
+
+ // Run Optimize
+ OptimizeResultProto result = icing->Optimize();
+ // Depending on how many blocks the documents end up spread across, it's
+ // possible that Optimize can remove documents without shrinking storage. The
+ // first Optimize call will also write the OptimizeStatusProto for the first
+ // time which will take up 1 block. So make sure that before_size is no less
+ // than after_size - 1 block.
+ uint32_t page_size = getpagesize();
+ EXPECT_THAT(result.optimize_stats().storage_size_before(),
+ Ge(result.optimize_stats().storage_size_after() - page_size));
+ result.mutable_optimize_stats()->clear_storage_size_before();
+ result.mutable_optimize_stats()->clear_storage_size_after();
+ EXPECT_THAT(result.optimize_stats(), EqualsProto(expected));
+
+ fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(5);
+ fake_clock->SetSystemTimeMilliseconds(20000);
+ // Use the default Icing options, so that a change to the default value will
+ // require updating this test.
+ icing = std::make_unique<TestIcingSearchEngine>(
+ GetDefaultIcingOptions(), std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(), std::move(fake_clock),
+ GetTestJniCache());
+ ASSERT_THAT(icing->Initialize().status(), ProtoIsOk());
+
+ expected = OptimizeStatsProto();
+ expected.set_latency_ms(5);
+ expected.set_document_store_optimize_latency_ms(5);
+ expected.set_index_restoration_latency_ms(5);
+ expected.set_num_original_documents(1);
+ expected.set_num_deleted_documents(0);
+ expected.set_num_expired_documents(0);
+ expected.set_num_original_namespaces(1);
+ expected.set_num_deleted_namespaces(0);
+ expected.set_time_since_last_optimize_ms(10000);
+ expected.set_index_restoration_mode(OptimizeStatsProto::FULL_INDEX_REBUILD);
+
+ // Run Optimize
+ result = icing->Optimize();
+ EXPECT_THAT(result.optimize_stats().storage_size_before(),
+ Eq(result.optimize_stats().storage_size_after()));
+ result.mutable_optimize_stats()->clear_storage_size_before();
+ result.mutable_optimize_stats()->clear_storage_size_after();
+ EXPECT_THAT(result.optimize_stats(), EqualsProto(expected));
+
+ // Delete the last document.
+ ASSERT_THAT(icing->Delete(document3.namespace_(), document3.uri()).status(),
+ ProtoIsOk());
+
+ expected = OptimizeStatsProto();
+ expected.set_latency_ms(5);
+ expected.set_document_store_optimize_latency_ms(5);
+ expected.set_index_restoration_latency_ms(5);
+ expected.set_num_original_documents(1);
+ expected.set_num_deleted_documents(1);
+ expected.set_num_expired_documents(0);
+ expected.set_num_original_namespaces(1);
+ expected.set_num_deleted_namespaces(1);
+ expected.set_time_since_last_optimize_ms(0);
+ expected.set_index_restoration_mode(OptimizeStatsProto::FULL_INDEX_REBUILD);
+
+ // Run Optimize
+ result = icing->Optimize();
+ EXPECT_THAT(result.optimize_stats().storage_size_before(),
+ Ge(result.optimize_stats().storage_size_after()));
+ result.mutable_optimize_stats()->clear_storage_size_before();
+ result.mutable_optimize_stats()->clear_storage_size_after();
+ EXPECT_THAT(result.optimize_stats(), EqualsProto(expected));
+}
+
+TEST_F(IcingSearchEngineOptimizeTest,
+ OptimizationRewritesDocsWithNewCompressionLevel) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body one")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body two")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+ icing_options.set_compression_level(3);
+ int64_t document_log_size_compression_3;
+ int64_t document_log_size_after_opti_no_compression;
+ int64_t document_log_size_after_opti_compression_3;
+ const std::string document_log_path =
+ icing_options.base_dir() + "/document_dir/" +
+ DocumentLogCreator::GetDocumentLogFilename();
+ {
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.PersistToDisk(PersistType::FULL).status(), ProtoIsOk());
+ document_log_size_compression_3 =
+ filesystem()->GetFileSize(document_log_path.c_str());
+ } // Destroys IcingSearchEngine to make sure nothing is cached.
+
+ // Turn off compression
+ icing_options.set_compression_level(0);
+
+ {
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ // Document log size is the same even after reopening with a different
+ // compression level
+ ASSERT_EQ(document_log_size_compression_3,
+ filesystem()->GetFileSize(document_log_path.c_str()));
+ ASSERT_THAT(icing.Optimize().status(), ProtoIsOk());
+ document_log_size_after_opti_no_compression =
+ filesystem()->GetFileSize(document_log_path.c_str());
+    // Document log size is larger after optimizing, since Optimize() rewrites
+    // the log with the new compression level (0, i.e. no compression)
+ ASSERT_GT(document_log_size_after_opti_no_compression,
+ document_log_size_compression_3);
+ }
+
+ // Restore the original compression level
+ icing_options.set_compression_level(3);
+
+ {
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ // Document log size is the same even after reopening with a different
+ // compression level
+ ASSERT_EQ(document_log_size_after_opti_no_compression,
+ filesystem()->GetFileSize(document_log_path.c_str()));
+ ASSERT_THAT(icing.Optimize().status(), ProtoIsOk());
+ document_log_size_after_opti_compression_3 =
+ filesystem()->GetFileSize(document_log_path.c_str());
+ // Document log size should be the same as it was originally
+ ASSERT_EQ(document_log_size_after_opti_compression_3,
+ document_log_size_compression_3);
+ }
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine_put_test.cc b/icing/icing-search-engine_put_test.cc
new file mode 100644
index 0000000..ed72f17
--- /dev/null
+++ b/icing/icing-search-engine_put_test.cc
@@ -0,0 +1,481 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/icing-search-engine.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/jni/jni-cache.h"
+#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/portable/endian.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/initialize.pb.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/proto/optimize.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/reset.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/status.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/random-string.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::Ge;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::Le;
+using ::testing::SizeIs;
+
+constexpr std::string_view kIpsumText =
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla convallis "
+ "scelerisque orci quis hendrerit. Sed augue turpis, sodales eu gravida "
+ "nec, scelerisque nec leo. Maecenas accumsan interdum commodo. Aliquam "
+ "mattis sapien est, sit amet interdum risus dapibus sed. Maecenas leo "
+ "erat, fringilla in nisl a, venenatis gravida metus. Phasellus venenatis, "
+ "orci in aliquet mattis, lectus sapien volutpat arcu, sed hendrerit ligula "
+ "arcu nec mauris. Integer dolor mi, rhoncus eget gravida et, pulvinar et "
+ "nunc. Aliquam ac sollicitudin nisi. Vivamus sit amet urna vestibulum, "
+ "tincidunt eros sed, efficitur nisl. Fusce non neque accumsan, sagittis "
+ "nisi eget, sagittis turpis. Ut pulvinar nibh eu purus feugiat faucibus. "
+ "Donec tellus nulla, tincidunt vel lacus id, bibendum fermentum turpis. "
+ "Nullam ultrices sed nibh vitae aliquet. Ut risus neque, consectetur "
+ "vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh "
+ "placerat semper.";
+
+// For mocking purposes, we allow tests to provide a custom Filesystem.
+class TestIcingSearchEngine : public IcingSearchEngine {
+ public:
+ TestIcingSearchEngine(const IcingSearchEngineOptions& options,
+ std::unique_ptr<const Filesystem> filesystem,
+ std::unique_ptr<const IcingFilesystem> icing_filesystem,
+ std::unique_ptr<Clock> clock,
+ std::unique_ptr<JniCache> jni_cache)
+ : IcingSearchEngine(options, std::move(filesystem),
+ std::move(icing_filesystem), std::move(clock),
+ std::move(jni_cache)) {}
+};
+
+std::string GetTestBaseDir() { return GetTestTempDir() + "/icing"; }
+
+// This test fixture covers all tests relating to IcingSearchEngine::Put.
+class IcingSearchEnginePutTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ // If we've specified using the reverse-JNI method for segmentation (i.e.
+ // not ICU), then we won't have the ICU data file included to set up.
+ // Technically, we could choose to use reverse-JNI for segmentation AND
+ // include an ICU data file, but that seems unlikely and our current BUILD
+ // setup doesn't do this.
+ // File generated via icu_data_file rule in //icing/BUILD.
+ std::string icu_data_file_path =
+ GetTestFilePath("icing/icu.dat");
+ ICING_ASSERT_OK(
+ icu_data_file_helper::SetUpICUDataFile(icu_data_file_path));
+ }
+ filesystem_.CreateDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ const Filesystem* filesystem() const { return &filesystem_; }
+
+ private:
+ Filesystem filesystem_;
+};
+
+constexpr int kMaxSupportedDocumentSize = (1u << 24) - 1;
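+// This is 2^24 - 1 = 16,777,215 bytes, i.e. just under 16 MiB.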
+
+// Non-zero value so we don't override it to be the current time
+constexpr int64_t kDefaultCreationTimestampMs = 1575492852000;
+
+std::string GetIndexDir() { return GetTestBaseDir() + "/index_dir"; }
+
+IcingSearchEngineOptions GetDefaultIcingOptions() {
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_base_dir(GetTestBaseDir());
+ return icing_options;
+}
+
+DocumentProto CreateMessageDocument(std::string name_space, std::string uri) {
+ return DocumentBuilder()
+ .SetKey(std::move(name_space), std::move(uri))
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+}
+
+SchemaProto CreateMessageSchema() {
+ return SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+}
+
+ScoringSpecProto GetDefaultScoringSpec() {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ return scoring_spec;
+}
+
+TEST_F(IcingSearchEnginePutTest, MaxTokenLenReturnsOkAndTruncatesTokens) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ // A length of 1 is allowed - even though it would be strange to want
+ // this.
+ options.set_max_token_length(1);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ // "message" should have been truncated to "m"
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ // The indexed tokens were truncated to length of 1, so "m" will match
+ search_spec.set_query("m");
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document;
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // The query token is also truncated to length of 1, so "me"->"m" matches "m"
+ search_spec.set_query("me");
+ actual_results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // The query token is still truncated to length of 1, so "massage"->"m"
+ // matches "m"
+ search_spec.set_query("massage");
+ actual_results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEnginePutTest,
+ MaxIntMaxTokenLenReturnsOkTooLargeTokenReturnsResourceExhausted) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ // Set token length to max. This is allowed (it just means never to
+ // truncate tokens). However, this does mean that tokens that exceed the
+ // size of the lexicon will cause indexing to fail.
+ options.set_max_token_length(std::numeric_limits<int32_t>::max());
+ IcingSearchEngine icing(options, GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Add a document that just barely fits under the max document limit.
+ // This will still fail to index because we won't actually have enough
+ // room in the lexicon to fit this content.
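+  // (The 256-byte margin presumably leaves headroom for the document key,
+  // schema type, and proto overhead, keeping the serialized document under
+  // the limit.)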
+ std::string enormous_string(kMaxSupportedDocumentSize - 256, 'p');
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Message")
+ .AddStringProperty("body", std::move(enormous_string))
+ .Build();
+ EXPECT_THAT(icing.Put(document).status(),
+ ProtoStatusIs(StatusProto::OUT_OF_SPACE));
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("p");
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEnginePutTest, PutWithoutSchemaFailedPrecondition) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
+ PutResultProto put_result_proto = icing.Put(document);
+ EXPECT_THAT(put_result_proto.status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(put_result_proto.status().message(), HasSubstr("Schema not set"));
+}
+
+TEST_F(IcingSearchEnginePutTest, IndexingDocMergeFailureResets) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .Build();
+ // 1. Create an index with a LiteIndex that will only allow one document
+ // before needing a merge.
+ {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_index_merge_size(document.ByteSizeLong());
+ IcingSearchEngine icing(options, GetTestJniCache());
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Add two documents. These should get merged into the main index.
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = DocumentBuilder(document).SetUri("fake_type/1").Build();
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+    // Add one document. This one should remain in the lite index.
+ document = DocumentBuilder(document).SetUri("fake_type/2").Build();
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ }
+
+  // 2. Delete the index's idx subdirectory to trigger RestoreIndexIfNeeded.
+ std::string idx_subdir = GetIndexDir() + "/idx";
+ filesystem()->DeleteDirectoryRecursively(idx_subdir.c_str());
+
+  // 3. Set up a mock filesystem that fails to grow the main index once.
+ bool has_failed_already = false;
+ auto open_write_lambda = [this, &has_failed_already](const char* filename) {
+ std::string main_lexicon_suffix = "/main-lexicon.prop.2";
+ std::string filename_string(filename);
+ if (!has_failed_already &&
+ filename_string.length() >= main_lexicon_suffix.length() &&
+ filename_string.substr(
+ filename_string.length() - main_lexicon_suffix.length(),
+ main_lexicon_suffix.length()) == main_lexicon_suffix) {
+ has_failed_already = true;
+ return -1;
+ }
+ return this->filesystem()->OpenForWrite(filename);
+ };
+ auto mock_icing_filesystem = std::make_unique<IcingMockFilesystem>();
+ ON_CALL(*mock_icing_filesystem, OpenForWrite)
+ .WillByDefault(open_write_lambda);
+
+ // 4. Create the index again. This should trigger index restoration.
+ {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_index_merge_size(document.ByteSizeLong());
+ TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(),
+ std::move(mock_icing_filesystem),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(),
+ ProtoStatusIs(StatusProto::WARNING_DATA_LOSS));
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("consectetur");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ SearchResultProto results =
+ icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.next_page_token(), Eq(0));
+ // Only the last document that was added should still be retrievable.
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document().uri(), Eq("fake_type/2"));
+ }
+}
+
+TEST_F(IcingSearchEnginePutTest, PutDocumentShouldLogFunctionLatency) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ PutResultProto put_result_proto = icing.Put(document);
+ EXPECT_THAT(put_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(put_result_proto.put_document_stats().latency_ms(), Eq(10));
+}
+
+TEST_F(IcingSearchEnginePutTest, PutDocumentShouldLogDocumentStoreStats) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .AddStringProperty("body", "message body")
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ PutResultProto put_result_proto = icing.Put(document);
+ EXPECT_THAT(put_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(put_result_proto.put_document_stats().document_store_latency_ms(),
+ Eq(10));
+ size_t document_size = put_result_proto.put_document_stats().document_size();
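+  // The logged size may exceed the raw proto size because the stored document
+  // can include Icing-internal fields, hence the InternalFields upper bound
+  // below.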
+ EXPECT_THAT(document_size, Ge(document.ByteSizeLong()));
+ EXPECT_THAT(document_size, Le(document.ByteSizeLong() +
+ sizeof(DocumentProto::InternalFields)));
+}
+
+TEST_F(IcingSearchEnginePutTest, PutDocumentShouldLogIndexingStats) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .Build();
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ PutResultProto put_result_proto = icing.Put(document);
+ EXPECT_THAT(put_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(put_result_proto.put_document_stats().index_latency_ms(), Eq(10));
+ // No merge should happen.
+ EXPECT_THAT(put_result_proto.put_document_stats().index_merge_latency_ms(),
+ Eq(0));
+  // The input document's body has 2 tokens: "message" and "body".
+ EXPECT_THAT(put_result_proto.put_document_stats()
+ .tokenization_stats()
+ .num_tokens_indexed(),
+ Eq(2));
+}
+
+TEST_F(IcingSearchEnginePutTest, PutDocumentShouldLogIndexMergeLatency) {
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("icing", "fake_type/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", kIpsumText)
+ .Build();
+
+ // Create an icing instance with index_merge_size = document1's size.
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+ icing_options.set_index_merge_size(document1.ByteSizeLong());
+
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(10);
+ TestIcingSearchEngine icing(icing_options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+ // Putting document2 should trigger an index merge.
+ PutResultProto put_result_proto = icing.Put(document2);
+ EXPECT_THAT(put_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(put_result_proto.put_document_stats().index_merge_latency_ms(),
+ Eq(10));
+}
+
+TEST_F(IcingSearchEnginePutTest, PutDocumentIndexFailureDeletion) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Testing has shown that adding ~600,000 terms generated this way will
+ // fill up the hit buffer.
+ std::vector<std::string> terms = GenerateUniqueTerms(600000);
+ std::string content = absl_ports::StrJoin(terms, " ");
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "foo " + content)
+ .Build();
+  // Adding the document to the index fails partway through, so the document
+  // should be rejected from Icing entirely.
+ ASSERT_THAT(icing.Put(document).status(),
+ ProtoStatusIs(StatusProto::OUT_OF_SPACE));
+
+ // Make sure that the document isn't searchable.
+ SearchSpecProto search_spec;
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TERM_MATCH_PREFIX);
+
+ SearchResultProto search_results =
+ icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(search_results.status(), ProtoIsOk());
+ ASSERT_THAT(search_results.results(), IsEmpty());
+
+ // Make sure that the document isn't retrievable.
+ GetResultProto get_result =
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance());
+ ASSERT_THAT(get_result.status(), ProtoStatusIs(StatusProto::NOT_FOUND));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine_schema_test.cc b/icing/icing-search-engine_schema_test.cc
new file mode 100644
index 0000000..49c024e
--- /dev/null
+++ b/icing/icing-search-engine_schema_test.cc
@@ -0,0 +1,3159 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/icing-search-engine.h"
+#include "icing/jni/jni-cache.h"
+#include "icing/join/join-processor.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/initialize.pb.h"
+#include "icing/proto/optimize.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/reset.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/status.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/query/query-features.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/section.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::Eq;
+using ::testing::HasSubstr;
+using ::testing::Return;
+
+// For mocking purposes, we allow tests to provide a custom Filesystem.
+class TestIcingSearchEngine : public IcingSearchEngine {
+ public:
+ TestIcingSearchEngine(const IcingSearchEngineOptions& options,
+ std::unique_ptr<const Filesystem> filesystem,
+ std::unique_ptr<const IcingFilesystem> icing_filesystem,
+ std::unique_ptr<Clock> clock,
+ std::unique_ptr<JniCache> jni_cache)
+ : IcingSearchEngine(options, std::move(filesystem),
+ std::move(icing_filesystem), std::move(clock),
+ std::move(jni_cache)) {}
+};
+
+std::string GetTestBaseDir() { return GetTestTempDir() + "/icing"; }
+
+// This test fixture covers all tests relating to IcingSearchEngine::GetSchema
+// and IcingSearchEngine::SetSchema.
+class IcingSearchEngineSchemaTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ // If we've specified using the reverse-JNI method for segmentation (i.e.
+ // not ICU), then we won't have the ICU data file included to set up.
+ // Technically, we could choose to use reverse-JNI for segmentation AND
+ // include an ICU data file, but that seems unlikely and our current BUILD
+ // setup doesn't do this.
+ // File generated via icu_data_file rule in //icing/BUILD.
+ std::string icu_data_file_path =
+ GetTestFilePath("icing/icu.dat");
+ ICING_ASSERT_OK(
+ icu_data_file_helper::SetUpICUDataFile(icu_data_file_path));
+ }
+ filesystem_.CreateDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ const Filesystem* filesystem() const { return &filesystem_; }
+
+ private:
+ Filesystem filesystem_;
+};
+
+// Non-zero value so we don't override it to be the current time
+constexpr int64_t kDefaultCreationTimestampMs = 1575492852000;
+
+std::string GetSchemaDir() { return GetTestBaseDir() + "/schema_dir"; }
+
+IcingSearchEngineOptions GetDefaultIcingOptions() {
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_base_dir(GetTestBaseDir());
+ icing_options.set_document_store_namespace_id_fingerprint(true);
+ icing_options.set_use_new_qualified_id_join_index(true);
+ return icing_options;
+}
+
+DocumentProto CreateMessageDocument(std::string name_space, std::string uri) {
+ return DocumentBuilder()
+ .SetKey(std::move(name_space), std::move(uri))
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .AddInt64Property("indexableInteger", 123)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+}
+
+SchemaTypeConfigProto CreateMessageSchemaTypeConfig() {
+ return SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("indexableInteger")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+}
+
+SchemaProto CreateMessageSchema() {
+ return SchemaBuilder().AddType(CreateMessageSchemaTypeConfig()).Build();
+}
+
+ScoringSpecProto GetDefaultScoringSpec() {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ return scoring_spec;
+}
+
+// TODO(b/272145329): create SearchSpecBuilder, JoinSpecBuilder,
+// SearchResultProtoBuilder and ResultProtoBuilder for unit tests and build all
+// instances with them.
+
+TEST_F(IcingSearchEngineSchemaTest,
+ CircularReferenceCreateSectionManagerReturnsInvalidArgument) {
+ // Create a type config with a circular reference.
+ SchemaProto schema;
+ auto* type = schema.add_types();
+ type->set_schema_type("Message");
+
+ auto* body = type->add_properties();
+ body->set_property_name("recipient");
+ body->set_schema_type("Person");
+ body->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ body->mutable_document_indexing_config()->set_index_nested_properties(true);
+
+ type = schema.add_types();
+ type->set_schema_type("Person");
+
+ body = type->add_properties();
+ body->set_property_name("recipient");
+ body->set_schema_type("Message");
+ body->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ body->mutable_document_indexing_config()->set_index_nested_properties(true);
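+  // The cycle: Message.recipient is a Person, Person.recipient is a Message,
+  // and nested-property indexing is enabled on both sides.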
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(schema).status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineSchemaTest, FailToReadSchema) {
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ {
+ // Successfully initialize and set a schema
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ }
+
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+
+ // This fails FileBackedProto::Read() when we try to check the schema we
+ // had previously set
+ ON_CALL(*mock_filesystem,
+ OpenForRead(Eq(icing_options.base_dir() + "/schema_dir/schema.pb")))
+ .WillByDefault(Return(-1));
+
+ TestIcingSearchEngine test_icing(icing_options, std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(),
+ GetTestJniCache());
+
+ InitializeResultProto initialize_result_proto = test_icing.Initialize();
+ EXPECT_THAT(initialize_result_proto.status(),
+ ProtoStatusIs(StatusProto::INTERNAL));
+ EXPECT_THAT(initialize_result_proto.status().message(),
+ HasSubstr("Unable to open file for read"));
+}
+
+TEST_F(IcingSearchEngineSchemaTest, FailToWriteSchema) {
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ // This fails FileBackedProto::Write()
+ ON_CALL(*mock_filesystem, OpenForWrite(HasSubstr("schema.pb")))
+ .WillByDefault(Return(-1));
+
+ TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem),
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SetSchemaResultProto set_schema_result_proto =
+ icing.SetSchema(CreateMessageSchema());
+ EXPECT_THAT(set_schema_result_proto.status(),
+ ProtoStatusIs(StatusProto::INTERNAL));
+ EXPECT_THAT(set_schema_result_proto.status().message(),
+ HasSubstr("Unable to open file for write"));
+}
+
+TEST_F(IcingSearchEngineSchemaTest, SetSchemaIncompatibleFails) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // 1. Create a schema with an Email type with properties { "title", "body"}
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ // 2. Add an email document
+ DocumentProto doc = DocumentBuilder()
+ .SetKey("emails", "email#1")
+ .SetSchema("Email")
+ .AddStringProperty("title", "Hello world.")
+ .AddStringProperty("body", "Goodnight Moon.")
+ .Build();
+ EXPECT_THAT(icing.Put(std::move(doc)).status(), ProtoIsOk());
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // 3. Set a schema that deletes email. This should fail.
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Message");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(
+ icing.SetSchema(schema, /*ignore_errors_and_delete_documents=*/false)
+ .status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+
+ // 4. Try to delete by email type. This should succeed because email wasn't
+ // deleted in step 3.
+ EXPECT_THAT(icing.DeleteBySchemaType("Email").status(), ProtoIsOk());
+ }
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ SetSchemaIncompatibleForceOverrideSucceeds) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // 1. Create a schema with an Email type with properties { "title", "body"}
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ // 2. Add an email document
+ DocumentProto doc = DocumentBuilder()
+ .SetKey("emails", "email#1")
+ .SetSchema("Email")
+ .AddStringProperty("title", "Hello world.")
+ .AddStringProperty("body", "Goodnight Moon.")
+ .Build();
+ EXPECT_THAT(icing.Put(std::move(doc)).status(), ProtoIsOk());
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+    // 3. Set a schema that removes the Email type with force override. This
+    // should succeed and delete the Email type.
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Message");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema, true).status(), ProtoIsOk());
+
+    // 4. Try to delete by the Email type. This should fail because Email was
+    // already deleted.
+ EXPECT_THAT(icing.DeleteBySchemaType("Email").status(),
+ ProtoStatusIs(StatusProto::NOT_FOUND));
+ }
+}
+
+TEST_F(IcingSearchEngineSchemaTest, SetSchemaUnsetVersionIsZero) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+  // 1. Create a schema with an Email type without setting a version.
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(0));
+}
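+
+// A small sketch of the proto default-value behavior the test above relies
+// on: `version` is a plain integer field, so a type that never calls
+// set_version() reads back the default of 0. The helper is illustrative only.
+namespace {
+[[maybe_unused]] void VersionDefaultsToZeroSketch() {
+  SchemaTypeConfigProto type_config;
+  // No set_version() call was made, so the getter returns the proto default.
+  EXPECT_THAT(type_config.version(), Eq(0));
+}
+}  // namespace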
+
+TEST_F(IcingSearchEngineSchemaTest, SetSchemaCompatibleVersionUpdateSucceeds) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // 1. Create a schema with an Email type with version 1
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_version(1);
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ SetSchemaResultProto set_schema_result = icing.SetSchema(schema);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result;
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_new_schema_types()->Add("Email");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(1));
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+    // 2. Create a schema that adds a new optional property and updates the
+    // version.
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_version(2);
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ // 3. SetSchema should succeed and the version number should be updated.
+ SetSchemaResultProto set_schema_result = icing.SetSchema(schema, true);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result;
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_fully_compatible_changed_schema_types()
+ ->Add("Email");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(2));
+ }
+}
+
+TEST_F(IcingSearchEngineSchemaTest, SetSchemaIncompatibleVersionUpdateFails) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // 1. Create a schema with an Email type with version 1
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_version(1);
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(1));
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+    // 2. Create a schema with an incompatible change (OPTIONAL -> REQUIRED).
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_version(2);
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+ // 3. SetSchema should fail and the version number should NOT be updated.
+ EXPECT_THAT(icing.SetSchema(schema).status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+
+ EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(1));
+ }
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ SetSchemaIncompatibleVersionUpdateForceOverrideSucceeds) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // 1. Create a schema with an Email type with version 1
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_version(1);
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(1));
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+    // 2. Create a schema with an incompatible change (OPTIONAL -> REQUIRED)
+    // and force override set to true.
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_version(2);
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+ // 3. SetSchema should succeed and the version number should be updated.
+ EXPECT_THAT(icing.SetSchema(schema, true).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(2));
+ }
+}
+
+TEST_F(IcingSearchEngineSchemaTest, SetSchemaNoChangeVersionUpdateSucceeds) {
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // 1. Create a schema with an Email type with version 1
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_version(1);
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(1));
+ }
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+    // 2. Create a schema that only changes the version.
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_version(2);
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ // 3. SetSchema should succeed and the version number should be updated.
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.GetSchema().schema().types(0).version(), Eq(2));
+ }
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ SetSchemaDuplicateTypesReturnsAlreadyExists) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create a schema with types { "Email", "Message" and "Email" }
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ type = schema.add_types();
+ type->set_schema_type("Message");
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ *schema.add_types() = schema.types(0);
+
+ EXPECT_THAT(icing.SetSchema(schema).status(),
+ ProtoStatusIs(StatusProto::ALREADY_EXISTS));
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ SetSchemaDuplicatePropertiesReturnsAlreadyExists) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create a schema with an Email type with properties { "title", "body" and
+ // "title" }
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema).status(),
+ ProtoStatusIs(StatusProto::ALREADY_EXISTS));
+}
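+
+// A minimal sketch (hypothetical client-side check, not part of the Icing
+// API) of the validation these two tests motivate: detecting duplicate type
+// names before calling SetSchema, which would otherwise fail with
+// ALREADY_EXISTS. Assumes <unordered_set> is included.
+namespace {
+[[maybe_unused]] bool HasDuplicateTypeNames(const SchemaProto& schema) {
+  std::unordered_set<std::string> seen;
+  for (const SchemaTypeConfigProto& type : schema.types()) {
+    if (!seen.insert(type.schema_type()).second) {
+      return true;  // This schema_type name appeared earlier in the schema.
+    }
+  }
+  return false;
+}
+}  // namespace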
+
+TEST_F(IcingSearchEngineSchemaTest, SetSchema) {
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(1000);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ auto message_document = CreateMessageDocument("namespace", "uri");
+
+ auto schema_with_message = CreateMessageSchema();
+
+ SchemaProto schema_with_email;
+ SchemaTypeConfigProto* type = schema_with_email.add_types();
+ type->set_schema_type("Email");
+ PropertyConfigProto* property = type->add_properties();
+ property->set_property_name("title");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ SchemaProto schema_with_email_and_message = schema_with_email;
+ *schema_with_email_and_message.add_types() = CreateMessageSchemaTypeConfig();
+
+ // Create an arbitrary invalid schema
+ SchemaProto invalid_schema;
+ SchemaTypeConfigProto* empty_type = invalid_schema.add_types();
+ empty_type->set_schema_type("");
+
+ // Make sure we can't set invalid schemas
+ SetSchemaResultProto set_schema_result = icing.SetSchema(invalid_schema);
+ EXPECT_THAT(set_schema_result.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+ EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000));
+
+  // Can add a document of a type in the set schema
+ set_schema_result = icing.SetSchema(schema_with_message);
+ EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::OK));
+ EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000));
+ EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk());
+
+  // The Email-only schema doesn't include Message, so setting it would leave
+  // incompatible data behind
+ set_schema_result = icing.SetSchema(schema_with_email);
+ EXPECT_THAT(set_schema_result.status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000));
+
+  // Can expand the set of schema types and add a document of a new
+  // schema type
+ set_schema_result = icing.SetSchema(schema_with_email_and_message);
+ EXPECT_THAT(set_schema_result.status(), ProtoStatusIs(StatusProto::OK));
+ EXPECT_THAT(set_schema_result.latency_ms(), Eq(1000));
+
+ EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk());
+  // Can't add a document whose schema type isn't in the set schema
+ auto photo_document = DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Photo")
+ .AddStringProperty("creator", "icing")
+ .Build();
+ PutResultProto put_result_proto = icing.Put(photo_document);
+ EXPECT_THAT(put_result_proto.status(), ProtoStatusIs(StatusProto::NOT_FOUND));
+ EXPECT_THAT(put_result_proto.status().message(),
+ HasSubstr("'Photo' not found"));
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ SetSchemaNewIndexedStringPropertyTriggersIndexRestorationAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create a schema with 2 properties:
+ // - 'a': string type, unindexed. No section id assigned.
+ // - 'b': int64 type, indexed. Section id = 0.
+ SchemaProto schema_one =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Schema")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN,
+ TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ SetSchemaResultProto set_schema_result = icing.SetSchema(schema_one);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result;
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_new_schema_types()->Add("Schema");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Schema")
+ .AddStringProperty("a", "message body")
+ .AddInt64Property("b", 123)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ // Only 'b' will be indexed.
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document;
+
+ SearchResultProto empty_result;
+ empty_result.mutable_status()->set_code(StatusProto::OK);
+
+ // Verify term search: won't get anything.
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("a:message");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ // Verify numeric (integer) search: will get document.
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("b == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Change the schema to:
+ // - 'a': string type, indexed. Section id = 0.
+ // - 'b': int64 type, indexed. Section id = 1.
+ SchemaProto schema_two = schema_one;
+ schema_two.mutable_types(0)
+ ->mutable_properties(0)
+ ->mutable_string_indexing_config()
+ ->set_term_match_type(TERM_MATCH_PREFIX);
+ schema_two.mutable_types(0)
+ ->mutable_properties(0)
+ ->mutable_string_indexing_config()
+ ->set_tokenizer_type(TOKENIZER_PLAIN);
+  // Index restoration should be triggered here because the new schema requires
+  // more properties to be indexed. New section ids will also be reassigned, and
+  // index restoration should rebuild using them.
+ set_schema_result = icing.SetSchema(schema_two);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ expected_set_schema_result = SetSchemaResultProto();
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
+ ->Add("Schema");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Verify term search: will get document now.
+ actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify numeric (integer) search: will still get document.
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ SetSchemaNewIndexedIntegerPropertyTriggersIndexRestorationAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create a schema with 2 properties:
+ // - 'a': int64 type, unindexed. No section id assigned.
+ // - 'b': string type, indexed. Section id = 0.
+ SchemaProto schema_one =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Schema")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ SetSchemaResultProto set_schema_result = icing.SetSchema(schema_one);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result;
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_new_schema_types()->Add("Schema");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Schema")
+ .AddInt64Property("a", 123)
+ .AddStringProperty("b", "message body")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ // Only 'b' will be indexed.
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document;
+
+ SearchResultProto empty_result;
+ empty_result.mutable_status()->set_code(StatusProto::OK);
+
+ // Verify term search: will get document.
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("b:message");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify numeric (integer) search: won't get anything.
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("a == 123");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ // Change the schema to:
+ // - 'a': int64 type, indexed. Section id = 0.
+ // - 'b': string type, indexed. Section id = 1.
+ SchemaProto schema_two = schema_one;
+ schema_two.mutable_types(0)
+ ->mutable_properties(0)
+ ->mutable_integer_indexing_config()
+ ->set_numeric_match_type(NUMERIC_MATCH_RANGE);
+  // Index restoration should be triggered here because the new schema requires
+  // more properties to be indexed. New section ids will also be reassigned, and
+  // index restoration should rebuild using them.
+ set_schema_result = icing.SetSchema(schema_two);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ expected_set_schema_result = SetSchemaResultProto();
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
+ ->Add("Schema");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Verify term search: will still get document.
+ actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify numeric (integer) search: will get document now.
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
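+
+// A minimal sketch (hypothetical helper) of the in-place mutation pattern the
+// two tests above use to flip a property from unindexed to indexed: mutate
+// the indexing config on a copy of the schema, then re-apply it with
+// SetSchema to trigger index restoration. The integer analogue mutates
+// integer_indexing_config with NUMERIC_MATCH_RANGE instead.
+namespace {
+[[maybe_unused]] SchemaProto WithIndexedStringProperty(SchemaProto schema,
+                                                       int type_index,
+                                                       int property_index) {
+  auto* config = schema.mutable_types(type_index)
+                     ->mutable_properties(property_index)
+                     ->mutable_string_indexing_config();
+  config->set_term_match_type(TERM_MATCH_PREFIX);
+  config->set_tokenizer_type(TOKENIZER_PLAIN);
+  return schema;
+}
+}  // namespace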
+
+TEST_F(
+ IcingSearchEngineSchemaTest,
+ SetSchemaNewIndexedDocumentPropertyTriggersIndexRestorationAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create a schema with a nested document type:
+ //
+ // Section id assignment for 'Person':
+ // - "age": integer type, indexed. Section id = 0
+ // - "name": string type, indexed. Section id = 1.
+ // - "worksFor.name": string type, (nested) indexed. Section id = 2.
+ //
+ // Joinable property id assignment for 'Person':
+ // - "worksFor.listRef": string type, Qualified Id type joinable. Joinable
+ // property id = 0.
+ SchemaProto schema_one =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("List").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("title")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("age")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("worksFor")
+ .SetDataTypeDocument(
+ "Organization",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Organization")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("listRef")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema_one).status(), ProtoIsOk());
+
+ DocumentProto list_document = DocumentBuilder()
+ .SetKey("namespace", "list/1")
+ .SetSchema("List")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("title", "title")
+ .Build();
+ DocumentProto person_document =
+ DocumentBuilder()
+ .SetKey("namespace", "person/2")
+ .SetSchema("Person")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("name", "John")
+ .AddInt64Property("age", 20)
+ .AddDocumentProperty(
+ "worksFor", DocumentBuilder()
+ .SetKey("namespace", "org/1")
+ .SetSchema("Organization")
+ .AddStringProperty("name", "Google")
+ .AddStringProperty("listRef", "namespace#list/1")
+ .Build())
+ .Build();
+ EXPECT_THAT(icing.Put(list_document).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(person_document).status(), ProtoIsOk());
+
+ ResultSpecProto result_spec = ResultSpecProto::default_instance();
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ person_document;
+
+ SearchResultProto empty_result;
+ empty_result.mutable_status()->set_code(StatusProto::OK);
+
+ // Verify term search
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("worksFor.name:Google");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec1, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify numeric (integer) search
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("age == 20");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ actual_results =
+ icing.Search(search_spec2, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify join search: join a query for `title:title` (which will get
+ // list_document) with a child query for `name:John` (which will get
+ // person_document) based on the child's `worksFor.listRef` field.
+ SearchSpecProto search_spec_with_join;
+ search_spec_with_join.set_query("title:title");
+ search_spec_with_join.set_term_match_type(TermMatchType::EXACT_ONLY);
+ JoinSpecProto* join_spec = search_spec_with_join.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("worksFor.listRef");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("name:John");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = result_spec;
+
+ SearchResultProto expected_join_search_result_proto;
+ expected_join_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto =
+ expected_join_search_result_proto.mutable_results()->Add();
+ *result_proto->mutable_document() = list_document;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() =
+ person_document;
+
+ actual_results =
+ icing.Search(search_spec_with_join, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_join_search_result_proto));
+
+ // Change the schema to add another nested document property to 'Person'
+ //
+ // New section id assignment for 'Person':
+ // - "age": integer type, indexed. Section id = 0
+ // - "almaMater.name", string type, indexed. Section id = 1
+ // - "name": string type, indexed. Section id = 2
+ // - "worksFor.name": string type, (nested) indexed. Section id = 3
+ //
+ // New joinable property id assignment for 'Person':
+ // - "almaMater.listRef": string type, Qualified Id type joinable. Joinable
+ // property id = 0.
+ // - "worksFor.listRef": string type, Qualified Id type joinable. Joinable
+ // property id = 1.
+ SchemaProto schema_two =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("List").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("title")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("age")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("worksFor")
+ .SetDataTypeDocument(
+ "Organization",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("almaMater")
+ .SetDataTypeDocument(
+ "Organization",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Organization")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("listRef")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ // This schema change is compatible since the added 'almaMater' property has
+ // CARDINALITY_OPTIONAL.
+ //
+  // Index restoration should be triggered here because the new schema requires
+  // more properties to be indexed. New section ids will also be reassigned, and
+  // index restoration should rebuild using them.
+ SetSchemaResultProto set_schema_result = icing.SetSchema(schema_two);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result = SetSchemaResultProto();
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
+ ->Add("Person");
+ expected_set_schema_result.mutable_join_incompatible_changed_schema_types()
+ ->Add("Person");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Verify term search:
+ // Searching for "worksFor.name:Google" should still match document
+ actual_results =
+ icing.Search(search_spec1, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+  // In schema_two the 'name' property is now indexed at section id 2. If
+  // searching for "name:Google" matched the document, it would mean that the
+  // index rebuild was not triggered and Icing is still searching the old
+  // index, where 'worksFor.name' was indexed at section id 2.
+ search_spec1.set_query("name:Google");
+ actual_results =
+ icing.Search(search_spec1, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ // Verify numeric (integer) search: should still match document
+ actual_results =
+ icing.Search(search_spec2, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+  // Verify join search: should still be able to join by `worksFor.listRef`
+ actual_results =
+ icing.Search(search_spec_with_join, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_join_search_result_proto));
+}
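+
+// A minimal sketch (hypothetical helper) of the qualified-id format the join
+// tests above rely on: a Qualified Id joinable string property stores
+// "<namespace>#<uri>" of the referenced document, e.g. "namespace#list/1".
+// This ignores any escaping of '#' within the namespace or uri.
+namespace {
+[[maybe_unused]] std::string MakeQualifiedId(const std::string& name_space,
+                                             const std::string& uri) {
+  return name_space + "#" + uri;
+}
+}  // namespace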
+
+TEST_F(IcingSearchEngineSchemaTest,
+ SetSchemaChangeNestedPropertiesTriggersIndexRestorationAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SchemaTypeConfigProto person_proto =
+ SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("age")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ // Create a schema with nested properties:
+ // - "sender.age": int64 type, (nested) indexed. Section id = 0.
+ // - "sender.name": string type, (nested) indexed. Section id = 1.
+ // - "subject": string type, indexed. Section id = 2.
+ // - "timestamp": int64 type, indexed. Section id = 3.
+ SchemaProto nested_schema =
+ SchemaBuilder()
+ .AddType(person_proto)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result;
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_new_schema_types()->Add("Email");
+ expected_set_schema_result.mutable_new_schema_types()->Add("Person");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("subject",
+ "Did you get the memo about TPS reports?")
+ .AddDocumentProperty("sender",
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Bill Lundbergh")
+ .AddInt64Property("age", 20)
+ .Build())
+ .AddInt64Property("timestamp", 1234)
+ .Build();
+
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document;
+
+ SearchResultProto empty_result;
+ empty_result.mutable_status()->set_code(StatusProto::OK);
+
+ // Verify term search
+ // document should match a query for 'Bill' in 'sender.name', but not in
+ // 'subject'
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("sender.name:Bill");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ search_spec1.set_query("subject:Bill");
+ actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ // Verify numeric (integer) search
+ // document should match a query for 20 in 'sender.age', but not in
+ // 'timestamp'
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("sender.age == 20");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ search_spec2.set_query("timestamp == 20");
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ // Now update the schema with index_nested_properties=false. This should
+  // reassign section ids, lead to an index rebuild, and ensure that nothing
+  // matches a query for "Bill" or 20.
+ // - "sender.age": int64 type, (nested) unindexed. No section id assigned.
+ // - "sender.name": string type, (nested) unindexed. No section id assigned.
+ // - "subject": string type, indexed. Section id = 0.
+ // - "timestamp": int64 type, indexed. Section id = 1.
+ SchemaProto no_nested_schema =
+ SchemaBuilder()
+ .AddType(person_proto)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ set_schema_result = icing.SetSchema(no_nested_schema);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ expected_set_schema_result = SetSchemaResultProto();
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
+ ->Add("Email");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Verify term search
+ // document shouldn't match a query for 'Bill' in either 'sender.name' or
+ // 'subject'
+ search_spec1.set_query("sender.name:Bill");
+ actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ search_spec1.set_query("subject:Bill");
+ actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ // Verify numeric (integer) search
+ // document shouldn't match a query for 20 in either 'sender.age' or
+ // 'timestamp'
+ search_spec2.set_query("sender.age == 20");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ search_spec2.set_query("timestamp == 20");
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+}
+
+TEST_F(
+ IcingSearchEngineSchemaTest,
+ SetSchemaChangeNestedPropertiesListTriggersIndexRestorationAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SchemaTypeConfigProto person_proto =
+ SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("lastName")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("address")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("age")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("birthday")
+ .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ // Create a schema with nested properties:
+ // - "sender.address": string type, (nested) non-indexable. Section id = 0.
+ // - "sender.age": int64 type, (nested) indexed. Section id = 1.
+ // - "sender.birthday": int64 type, (nested) non-indexable. Section id = 2.
+ // - "sender.lastName": int64 type, (nested) indexed. Section id = 3.
+ // - "sender.name": string type, (nested) indexed. Section id = 4.
+ // - "subject": string type, indexed. Section id = 5.
+ // - "timestamp": int64 type, indexed. Section id = 6.
+ // - "sender.foo": unknown type, (nested) non-indexable. Section id = 7.
+ //
+ // "sender.address" and "sender.birthday" are assigned a section id because
+ // they are listed in the indexable_nested_properties_list for 'Email.sender'.
+  // They are assigned a section id but are not indexed since their indexing
+ // configs are non-indexable.
+ //
+ // "sender.foo" is also assigned a section id, but is also not undefined by
+ // the schema definition. Trying to index a document with this nested property
+ // should fail.
+ SchemaProto nested_schema =
+ SchemaBuilder()
+ .AddType(person_proto)
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person", /*indexable_nested_properties_list=*/
+ {"age", "lastName", "address", "name", "birthday",
+ "foo"})
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result;
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_new_schema_types()->Add("Email");
+ expected_set_schema_result.mutable_new_schema_types()->Add("Person");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("subject",
+ "Did you get the memo about TPS reports?")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Bill")
+ .AddStringProperty("lastName", "Lundbergh")
+ .AddStringProperty("address", "1600 Amphitheatre Pkwy")
+ .AddInt64Property("age", 20)
+ .AddInt64Property("birthday", 20)
+ .Build())
+ .AddInt64Property("timestamp", 1234)
+ .Build();
+
+  // Indexing this doc should fail, since the 'sender.foo' property is not
+  // defined in the schema.
+ DocumentProto invalid_document =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("subject",
+ "Did you get the memo about TPS reports?")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Bill")
+ .AddStringProperty("lastName", "Lundbergh")
+ .AddStringProperty("address", "1600 Amphitheatre Pkwy")
+ .AddInt64Property("age", 20)
+ .AddInt64Property("birthday", 20)
+ .AddBytesProperty("foo", "bar bytes")
+ .Build())
+ .AddInt64Property("timestamp", 1234)
+ .Build();
+
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(invalid_document).status(),
+ ProtoStatusIs(StatusProto::NOT_FOUND));
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document;
+
+ SearchResultProto empty_result;
+ empty_result.mutable_status()->set_code(StatusProto::OK);
+
+ // Verify term search
+ // document should match a query for 'Bill' in 'sender.name', but not in
+ // 'sender.lastName'
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("sender.name:Bill");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ search_spec1.set_query("sender.lastName:Bill");
+ actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ // document should match a query for 'Lundber' in 'sender.lastName', but not
+ // in 'sender.name'.
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("sender.lastName:Lundber");
+ search_spec2.set_term_match_type(TermMatchType::PREFIX);
+
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ search_spec2.set_query("sender.name:Lundber");
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ // document should not match a query for 'Amphitheatre' because the
+ // 'sender.address' field is not indexed.
+ search_spec2.set_query("Amphitheatre");
+ search_spec2.set_term_match_type(TermMatchType::PREFIX);
+
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ // Verify numeric (integer) search
+ // document should match a query for 20 in 'sender.age', but not in
+ // 'timestamp' or 'sender.birthday'
+ SearchSpecProto search_spec3;
+ search_spec3.set_query("sender.age == 20");
+ search_spec3.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec3.add_enabled_features(std::string(kNumericSearchFeature));
+
+ actual_results = icing.Search(search_spec3, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ search_spec3.set_query("timestamp == 20");
+ actual_results = icing.Search(search_spec3, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ search_spec3.set_query("birthday == 20");
+ actual_results = icing.Search(search_spec3, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ // Now update the schema and don't index "sender.name", "sender.birthday" and
+ // "sender.foo".
+ // This should reassign section ids, lead to an index rebuild and ensure that
+ // nothing match a query for "Bill".
+ //
+ // Section id assignment:
+ // - "sender.address": string type, (nested) non-indexable. Section id = 0.
+ // - "sender.age": int64 type, (nested) indexed. Section id = 1.
+ // - "sender.birthday": int64 type, (nested) unindexed. No section id.
+ // - "sender.lastName": int64 type, (nested) indexed. Section id = 2.
+ // - "sender.name": string type, (nested) unindexed. No section id.
+ // - "subject": string type, indexed. Section id = 3.
+ // - "timestamp": int64 type, indexed. Section id = 4.
+ // - "sender.foo": unknown type, invalid. No section id.
+ SchemaProto nested_schema_with_less_props =
+ SchemaBuilder()
+ .AddType(person_proto)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person", /*indexable_nested_properties=*/
+ {"age", "lastName", "address"})
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ set_schema_result = icing.SetSchema(nested_schema_with_less_props);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ expected_set_schema_result = SetSchemaResultProto();
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
+ ->Add("Email");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Verify term search
+ // document shouldn't match a query for 'Bill' in either 'sender.name' or
+ // 'subject'
+ search_spec1.set_query("sender.name:Bill");
+ actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+
+ search_spec1.set_query("subject:Bill");
+ actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ SetSchemaNewJoinablePropertyTriggersIndexRestorationAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create "Message" schema with 3 properties:
+ // - "subject": string type, non-joinable. No joinable property id assigned.
+ // It is indexed and used for searching only.
+ // - "receiverQualifiedId": string type, non-joinable. No joinable property id
+ // assigned.
+ // - "senderQualifiedId": string type, Qualified Id type joinable. Joinable
+ // property id = 0.
+ SchemaProto schema_one =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("receiverQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_NONE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ SetSchemaResultProto set_schema_result = icing.SetSchema(schema_one);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result;
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_new_schema_types()->Add("Message");
+ expected_set_schema_result.mutable_new_schema_types()->Add("Person");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ DocumentProto person1 =
+ DocumentBuilder()
+ .SetKey("namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person one")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto person2 =
+ DocumentBuilder()
+ .SetKey("namespace", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "person two")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ DocumentProto message =
+ DocumentBuilder()
+ .SetKey("namespace", "message1")
+ .SetSchema("Message")
+ .AddStringProperty("subject", "message")
+ .AddStringProperty("receiverQualifiedId", "namespace#person1")
+ .AddStringProperty("senderQualifiedId", "namespace#person2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ EXPECT_THAT(icing.Put(person1).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(person2).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message).status(), ProtoIsOk());
+
+ ResultSpecProto result_spec = ResultSpecProto::default_instance();
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ // Verify join search: join a query for `name:person` with a child query for
+ // `subject:message` based on the child's `receiverQualifiedId` field.
+ // Since "receiverQualifiedId" is not JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ // joining on that property should only return the "left-side" (`name:person`)
+ // of the join.
+ SearchSpecProto search_spec_join_by_receiver;
+ search_spec_join_by_receiver.set_query("name:person");
+ search_spec_join_by_receiver.set_term_match_type(TermMatchType::EXACT_ONLY);
+ JoinSpecProto* join_spec = search_spec_join_by_receiver.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("receiverQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("subject:message");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ SearchResultProto expected_empty_child_search_result_proto;
+ expected_empty_child_search_result_proto.mutable_status()->set_code(
+ StatusProto::OK);
+ *expected_empty_child_search_result_proto.mutable_results()
+ ->Add()
+ ->mutable_document() = person2;
+ *expected_empty_child_search_result_proto.mutable_results()
+ ->Add()
+ ->mutable_document() = person1;
+ SearchResultProto actual_results = icing.Search(
+ search_spec_join_by_receiver, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_empty_child_search_result_proto));
+
+ // Verify join search: join a query for `name:person` with a child query for
+ // `subject:message` based on the child's `senderQualifiedId` field.
+ // Since "senderQualifiedId" is JOINABLE_VALUE_TYPE_QUALIFIED_ID, joining on
+ // that property should return both "left-side" (`name:person`) and
+ // "right-side" (`subject:message`) of the join.
+ SearchSpecProto search_spec_join_by_sender = search_spec_join_by_receiver;
+ join_spec = search_spec_join_by_sender.mutable_join_spec();
+ join_spec->set_child_property_expression("senderQualifiedId");
+
+ SearchResultProto expected_join_by_sender_search_result_proto;
+ expected_join_by_sender_search_result_proto.mutable_status()->set_code(
+ StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto =
+ expected_join_by_sender_search_result_proto.mutable_results()->Add();
+ *result_proto->mutable_document() = person2;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() = message;
+ *expected_join_by_sender_search_result_proto.mutable_results()
+ ->Add()
+ ->mutable_document() = person1;
+ actual_results = icing.Search(search_spec_join_by_sender,
+ GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_join_by_sender_search_result_proto));
+
+ // Change "Message" schema to:
+ // - "subject": string type, non-joinable. No joinable property id assigned.
+ // - "receiverQualifiedId": string type, Qualified Id joinable. Joinable
+ // property id = 0.
+ // - "senderQualifiedId": string type, Qualified Id joinable. Joinable
+ // property id = 1.
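+ // (Judging by the expected ids above, joinable property ids appear to be
+ // assigned in alphabetical order of the joinable properties' names.)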
+ SchemaProto schema_two = schema_one;
+ schema_two.mutable_types(1)
+ ->mutable_properties(1)
+ ->mutable_joinable_config()
+ ->set_value_type(JOINABLE_VALUE_TYPE_QUALIFIED_ID);
+ // Index restoration should be triggered here because the new schema requires
+ // more joinable properties. New joinable property ids will also be
+ // reassigned, and index restoration should use them to rebuild.
+ set_schema_result = icing.SetSchema(schema_two);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ expected_set_schema_result = SetSchemaResultProto();
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result.mutable_join_incompatible_changed_schema_types()
+ ->Add("Message");
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Verify join search: join a query for `name:person` with a child query for
+ // `subject:message` based on the child's `receiverQualifiedId` field.
+ // Since we've changed "receiverQualifiedId" to be
+ // JOINABLE_VALUE_TYPE_QUALIFIED_ID, joining on that property should now
+ // return both the "left-side" (`name:person`) and "right-side"
+ // (`subject:message`) of the join.
+ SearchResultProto expected_join_by_receiver_search_result_proto;
+ expected_join_by_receiver_search_result_proto.mutable_status()->set_code(
+ StatusProto::OK);
+ result_proto =
+ expected_join_by_receiver_search_result_proto.mutable_results()->Add();
+ *result_proto->mutable_document() = person1;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() = message;
+ *expected_join_by_receiver_search_result_proto.mutable_results()
+ ->Add()
+ ->mutable_document() = person2;
+ actual_results = icing.Search(search_spec_join_by_receiver,
+ GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(
+ expected_join_by_receiver_search_result_proto));
+
+ // Verify join search: join a query for `name:person` with a child query for
+ // `subject:message` based on the child's `senderQualifiedId` field. We should
+ // get the same set of results since `senderQualifiedId` is unchanged.
+ actual_results = icing.Search(search_spec_join_by_sender,
+ GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_join_by_sender_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ SetSchemaWithValidCycle_circularSchemaDefinitionNotAllowedFails) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_allow_circular_schema_definitions(false);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create schema with circular type definitions: A <-> B
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true)))
+ .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/false)))
+ .Build();
+
+ EXPECT_THAT(
+ icing.SetSchema(schema, /*ignore_errors_and_delete_documents=*/false)
+ .status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ SetSchemaWithValidCycle_allowCircularSchemaDefinitionsOK) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_allow_circular_schema_definitions(true);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create schema with valid circular type definitions: A <-> B, B->A sets
+ // index_nested_properties=false
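+ // (Together with the neighboring tests, the rule appears to be that a cycle
+ // is valid only if at least one edge in it sets
+ // index_nested_properties=false, so nested-property indexing stays bounded.)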
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true)))
+ .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/false)))
+ .Build();
+
+ EXPECT_THAT(
+ icing.SetSchema(schema, /*ignore_errors_and_delete_documents=*/false)
+ .status(),
+ ProtoStatusIs(StatusProto::OK));
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ SetSchemaWithInvalidCycle_allowCircularSchemaDefinitionsFails) {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_allow_circular_schema_definitions(true);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create schema with invalid circular type definitions: A <-> B, all edges
+ // set index_nested_properties=true
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true)))
+ .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/true)))
+ .Build();
+
+ EXPECT_THAT(
+ icing.SetSchema(schema, /*ignore_errors_and_delete_documents=*/false)
+ .status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(
+ IcingSearchEngineSchemaTest,
+ ForceSetSchemaIndexedPropertyDeletionTriggersIndexRestorationAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create a schema with 4 properties:
+ // - "body": string type, indexed. Section id = 0.
+ // - "subject": string type, indexed. Section id = 1.
+ // - "timestamp1": int64 type, indexed. Section id = 2.
+ // - "timestamp2": int64 type, indexed. Section id = 3.
+ SchemaProto email_with_body_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp1")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp2")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SetSchemaResultProto set_schema_result =
+ icing.SetSchema(email_with_body_schema);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result;
+ expected_set_schema_result.mutable_new_schema_types()->Add("Email");
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Create a document with only the "subject" and "timestamp2" properties.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("subject",
+ "Did you get the memo about TPS reports?")
+ .AddInt64Property("timestamp2", 1234)
+ .Build();
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document;
+
+ // Verify term search
+ // We should be able to retrieve the document by searching for 'tps' in
+ // 'subject'.
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("subject:tps");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify numeric (integer) search
+ // We should be able to retrieve the document by searching for 1234 in
+ // 'timestamp2'.
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("timestamp2 == 1234");
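+ // Numeric comparison queries require the advanced query parser and the
+ // numeric search feature to be enabled.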
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Now update the schema to remove the 'body' and 'timestamp1' fields. This is
+ // backwards incompatible, but the document should be preserved because it
+ // doesn't contain a 'body' or 'timestamp1' field.
+ // - "subject": string type, indexed. Section id = 0.
+ // - "timestamp2": int64 type, indexed. Section id = 1.
+ //
+ // If the index is not correctly rebuilt, then the hits of 'subject' and
+ // 'timestamp2' in the index will still have old section ids of 1, 3 and
+ // therefore they won't be found.
+ SchemaProto email_no_body_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp2")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ set_schema_result = icing.SetSchema(
+ email_no_body_schema, /*ignore_errors_and_delete_documents=*/true);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ expected_set_schema_result = SetSchemaResultProto();
+ expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email");
+ expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
+ ->Add("Email");
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Verify term search
+ // We should be able to retrieve the document by searching for 'tps' in
+ // 'subject'.
+ search_spec1.set_query("subject:tps");
+ actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify numeric (integer) search
+ // We should be able to retrieve the document by searching for 1234 in
+ // 'timestamp2'.
+ search_spec2.set_query("timestamp2 == 1234");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ ForceSetSchemaJoinablePropertyDeletionTriggersIndexRestoration) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create "Email" schema with 2 joinable properties:
+ // - "receiverQualifiedId": qualified id joinable. Joinable property id = 0.
+ // - "senderQualifiedId": qualified id joinable. Joinable property id = 1.
+ SchemaProto email_with_receiver_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("receiverQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SetSchemaResultProto set_schema_result =
+ icing.SetSchema(email_with_receiver_schema);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result;
+ expected_set_schema_result.mutable_new_schema_types()->Add("Email");
+ expected_set_schema_result.mutable_new_schema_types()->Add("Person");
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ DocumentProto person = DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("name", "person")
+ .Build();
+ // Create an email document with only the "senderQualifiedId" joinable
+ // property.
+ DocumentProto email =
+ DocumentBuilder()
+ .SetKey("namespace", "email")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("subject",
+ "Did you get the memo about TPS reports?")
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .Build();
+
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(email).status(), ProtoIsOk());
+
+ // Verify join search: join a query for `name:person` with a child query for
+ // `subject:tps` based on the child's `senderQualifiedId` field. We should be
+ // able to join person and email documents by this property.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto =
+ expected_search_result_proto.mutable_results()->Add();
+ *result_proto->mutable_document() = person;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() = email;
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("name:person");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("subject:tps");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec = ResultSpecProto::default_instance();
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Now update the schema to remove the "receiverQualifiedId" field. This is
+ // backwards incompatible, but the document should be preserved because it
+ // doesn't contain a "receiverQualifiedId" field. Also, since the change is
+ // join incompatible, we have to rebuild the join index.
+ // - "senderQualifiedId": qualified id joinable. Joinable property id = 0.
+ //
+ // If the index is not correctly rebuilt, then the joinable data of
+ // "senderQualifiedId" in the join index will still have old joinable property
+ // id of 1 and therefore won't take effect for join search query.
+ SchemaProto email_without_receiver_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Although we've just deleted the existing property "receiverQualifiedId"
+ // from the "Email" schema, the email document will still be preserved because
+ // it doesn't have a "receiverQualifiedId" property.
+ set_schema_result =
+ icing.SetSchema(email_without_receiver_schema,
+ /*ignore_errors_and_delete_documents=*/true);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ expected_set_schema_result = SetSchemaResultProto();
+ expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email");
+ expected_set_schema_result.mutable_join_incompatible_changed_schema_types()
+ ->Add("Email");
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Verify join search: join a query for `name:person` with a child query for
+ // `subject:tps` based on the child's `senderQualifiedId` field. We should
+ // still be able to join person and email documents by this property.
+ actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(
+ IcingSearchEngineSchemaTest,
+ ForceSetSchemaIndexedPropertyDeletionAndAdditionTriggersIndexRestorationAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create a schema with 3 properties:
+ // - "body": string type, indexed. Section id = 0.
+ // - "subject": string type, indexed. Section id = 1.
+ // - "timestamp": int64 type, indexed. Section id = 2.
+ SchemaProto email_with_body_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SetSchemaResultProto set_schema_result =
+ icing.SetSchema(email_with_body_schema);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result;
+ expected_set_schema_result.mutable_new_schema_types()->Add("Email");
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Create a document with only the "subject" and "timestamp" properties.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("subject",
+ "Did you get the memo about TPS reports?")
+ .AddInt64Property("timestamp", 1234)
+ .Build();
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document;
+
+ // Verify term search
+ // We should be able to retrieve the document by searching for 'tps' in
+ // 'subject'.
+ SearchSpecProto search_spec1;
+ search_spec1.set_query("subject:tps");
+ search_spec1.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify numeric (integer) search
+ // We should be able to retrieve the document by searching for 1234 in
+ // 'timestamp'.
+ SearchSpecProto search_spec2;
+ search_spec2.set_query("timestamp == 1234");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Now update the schema to remove the 'body' field and add a 'to' field. This
+ // is backwards incompatible, but the document should be preserved because it
+ // doesn't contain a 'body' field and 'to' is OPTIONAL.
+ // - "subject": string type, indexed. Section id = 0.
+ // - "timestamp": int64 type, indexed. Section id = 1.
+ // - "to": string type, indexed. Section id = 2.
+ //
+ // If the index is not correctly rebuilt, then the hits of 'subject' and
+ // 'timestamp' in the index will still have old section ids of 1, 2 and
+ // therefore they won't be found.
+ SchemaProto email_no_body_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("to")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ set_schema_result = icing.SetSchema(
+ email_no_body_schema, /*ignore_errors_and_delete_documents=*/true);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ expected_set_schema_result = SetSchemaResultProto();
+ expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email");
+ expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
+ ->Add("Email");
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Verify term search
+ // We should be able to retrieve the document by searching for 'tps' in
+ // 'subject'.
+ search_spec1.set_query("subject:tps");
+ actual_results = icing.Search(search_spec1, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Verify numeric (integer) search
+ // We should be able to retrieve the document by searching for 1234 in
+ // 'timestamp'.
+ search_spec2.set_query("timestamp == 1234");
+ search_spec2.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec2.add_enabled_features(std::string(kNumericSearchFeature));
+
+ actual_results = icing.Search(search_spec2, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(
+ IcingSearchEngineSchemaTest,
+ ForceSetSchemaJoinablePropertyDeletionAndAdditionTriggersIndexRestorationAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create "Email" schema with 2 joinable properties:
+ // - "receiverQualifiedId": qualified id joinable. Joinable property id = 0.
+ // - "senderQualifiedId": qualified id joinable. Joinable property id = 1.
+ SchemaProto email_with_receiver_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("receiverQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SetSchemaResultProto set_schema_result =
+ icing.SetSchema(email_with_receiver_schema);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result;
+ expected_set_schema_result.mutable_new_schema_types()->Add("Email");
+ expected_set_schema_result.mutable_new_schema_types()->Add("Person");
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ DocumentProto person = DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("Person")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("name", "person")
+ .Build();
+ // Create an email document with only the "subject" and "senderQualifiedId"
+ // properties.
+ DocumentProto email =
+ DocumentBuilder()
+ .SetKey("namespace", "email")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("subject",
+ "Did you get the memo about TPS reports?")
+ .AddStringProperty("senderQualifiedId", "namespace#person")
+ .Build();
+
+ EXPECT_THAT(icing.Put(person).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(email).status(), ProtoIsOk());
+
+ // Verify join search: join a query for `name:person` with a child query for
+ // `subject:tps` based on the child's `senderQualifiedId` field. We should be
+ // able to join person and email documents by this property.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto =
+ expected_search_result_proto.mutable_results()->Add();
+ *result_proto->mutable_document() = person;
+ *result_proto->mutable_joined_results()->Add()->mutable_document() = email;
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("name:person");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("senderQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::EXACT_ONLY);
+ nested_search_spec->set_query("subject:tps");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ ResultSpecProto result_spec = ResultSpecProto::default_instance();
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Now update the schema to remove the "receiverQualifiedId" field and add
+ // "zQualifiedId". This is backwards incompatible, but the document should
+ // be preserved because it doesn't contain a "receiverQualifiedId" field and
+ // "zQualifiedId" is OPTIONAL.
+ // - "senderQualifiedId": qualified id joinable. Joinable property id = 0.
+ // - "zQualifiedId": qualified id joinable. Joinable property id = 1.
+ //
+ // If the index is not correctly rebuilt, then the joinable data of
+ // "senderQualifiedId" in the join index will still have old joinable property
+ // id of 1 and therefore won't take effect for join search query.
+ SchemaProto email_without_receiver_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("zQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ set_schema_result = icing.SetSchema(
+ email_without_receiver_schema, /*ignore_errors_and_delete_documents=*/true);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ expected_set_schema_result = SetSchemaResultProto();
+ expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email");
+ expected_set_schema_result.mutable_join_incompatible_changed_schema_types()
+ ->Add("Email");
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Verify join search: join a query for `name:person` with a child query for
+ // `subject:tps` based on the child's `senderQualifiedId` field. We should
+ // still be able to join person and email documents by this property.
+ actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ ForceSetSchemaIncompatibleNestedDocsAreDeleted) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SchemaTypeConfigProto email_schema_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument("Person",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto nested_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("company")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(email_schema_type)
+ .Build();
+
+ SetSchemaResultProto set_schema_result = icing.SetSchema(nested_schema);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result;
+ expected_set_schema_result.mutable_new_schema_types()->Add("Email");
+ expected_set_schema_result.mutable_new_schema_types()->Add("Person");
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Create two documents, a person document and an email document. Both should
+ // be deleted when we remove the 'company' field from the person type.
+ DocumentProto person_document =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Person")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("name", "Bill Lundbergh")
+ .AddStringProperty("company", "Initech Corp.")
+ .Build();
+ EXPECT_THAT(icing.Put(person_document).status(), ProtoIsOk());
+
+ DocumentProto email_document =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(1000)
+ .AddStringProperty("subject",
+ "Did you get the memo about TPS reports?")
+ .AddDocumentProperty("sender", person_document)
+ .Build();
+ EXPECT_THAT(icing.Put(email_document).status(), ProtoIsOk());
+
+ // We should be able to retrieve both documents.
+ GetResultProto get_result =
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance());
+ EXPECT_THAT(get_result.status(), ProtoIsOk());
+ EXPECT_THAT(get_result.document(), EqualsProto(person_document));
+
+ get_result =
+ icing.Get("namespace1", "uri2", GetResultSpecProto::default_instance());
+ EXPECT_THAT(get_result.status(), ProtoIsOk());
+ EXPECT_THAT(get_result.document(), EqualsProto(email_document));
+
+ // Now update the schema to remove the 'company' field. This is backwards
+ // incompatible, and *both* documents should be deleted because both fail
+ // validation (each contains 'Person' data with a property that no longer
+ // exists in the schema).
+ nested_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(email_schema_type)
+ .Build();
+
+ set_schema_result = icing.SetSchema(
+ nested_schema, /*ignore_errors_and_delete_documents=*/true);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ expected_set_schema_result = SetSchemaResultProto();
+ expected_set_schema_result.mutable_incompatible_schema_types()->Add("Person");
+ expected_set_schema_result.mutable_incompatible_schema_types()->Add("Email");
+ expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
+ ->Add("Email");
+ expected_set_schema_result.mutable_index_incompatible_changed_schema_types()
+ ->Add("Person");
+ expected_set_schema_result.mutable_status()->set_code(StatusProto::OK);
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result));
+
+ // Both documents should be deleted now.
+ get_result =
+ icing.Get("namespace1", "uri1", GetResultSpecProto::default_instance());
+ EXPECT_THAT(get_result.status(), ProtoStatusIs(StatusProto::NOT_FOUND));
+
+ get_result =
+ icing.Get("namespace1", "uri2", GetResultSpecProto::default_instance());
+ EXPECT_THAT(get_result.status(), ProtoStatusIs(StatusProto::NOT_FOUND));
+}
+
+TEST_F(IcingSearchEngineSchemaTest, SetSchemaRevalidatesDocumentsAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SchemaProto schema_with_optional_subject;
+ auto type = schema_with_optional_subject.add_types();
+ type->set_schema_type("email");
+
+ // Add an OPTIONAL property
+ auto property = type->add_properties();
+ property->set_property_name("subject");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ EXPECT_THAT(icing.SetSchema(schema_with_optional_subject).status(),
+ ProtoIsOk());
+
+ DocumentProto email_document_without_subject =
+ DocumentBuilder()
+ .SetKey("namespace", "without_subject")
+ .SetSchema("email")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto email_document_with_subject =
+ DocumentBuilder()
+ .SetKey("namespace", "with_subject")
+ .SetSchema("email")
+ .AddStringProperty("subject", "foo")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ EXPECT_THAT(icing.Put(email_document_without_subject).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(email_document_with_subject).status(), ProtoIsOk());
+
+ SchemaProto schema_with_required_subject;
+ type = schema_with_required_subject.add_types();
+ type->set_schema_type("email");
+
+ // Add a REQUIRED property
+ property = type->add_properties();
+ property->set_property_name("subject");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+
+ // Can't set the schema since it's incompatible
+ SetSchemaResultProto set_schema_result =
+ icing.SetSchema(schema_with_required_subject);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_set_schema_result_proto;
+ expected_set_schema_result_proto.mutable_status()->set_code(
+ StatusProto::FAILED_PRECONDITION);
+ expected_set_schema_result_proto.mutable_status()->set_message(
+ "Schema is incompatible.");
+ expected_set_schema_result_proto.add_incompatible_schema_types("email");
+
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result_proto));
+
+ // Force set it
+ set_schema_result =
+ icing.SetSchema(schema_with_required_subject,
+ /*ignore_errors_and_delete_documents=*/true);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ expected_set_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_set_schema_result_proto.mutable_status()->clear_message();
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_set_schema_result_proto));
+
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = email_document_with_subject;
+
+ EXPECT_THAT(icing.Get("namespace", "with_subject",
+ GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // The document without a subject got deleted because it failed validation
+ // against the new schema
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace, without_subject) not found.");
+ expected_get_result_proto.clear_document();
+
+ EXPECT_THAT(icing.Get("namespace", "without_subject",
+ GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+}
+
+TEST_F(IcingSearchEngineSchemaTest, SetSchemaDeletesDocumentsAndReturnsOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SchemaProto schema;
+ auto type = schema.add_types();
+ type->set_schema_type("email");
+ type = schema.add_types();
+ type->set_schema_type("message");
+
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto email_document =
+ DocumentBuilder()
+ .SetKey("namespace", "email_uri")
+ .SetSchema("email")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto message_document =
+ DocumentBuilder()
+ .SetKey("namespace", "message_uri")
+ .SetSchema("message")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ EXPECT_THAT(icing.Put(email_document).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(message_document).status(), ProtoIsOk());
+
+ // Clear the schema and only add the "email" type, essentially deleting the
+ // "message" type
+ SchemaProto new_schema;
+ type = new_schema.add_types();
+ type->set_schema_type("email");
+
+ // Can't set the schema since it's incompatible
+ SetSchemaResultProto set_schema_result = icing.SetSchema(new_schema);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ SetSchemaResultProto expected_result;
+ expected_result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
+ expected_result.mutable_status()->set_message("Schema is incompatible.");
+ expected_result.add_deleted_schema_types("message");
+
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_result));
+
+ // Force set it
+ set_schema_result =
+ icing.SetSchema(new_schema,
+ /*ignore_errors_and_delete_documents=*/true);
+ // Ignore latency numbers. They're covered elsewhere.
+ set_schema_result.clear_latency_ms();
+ expected_result.mutable_status()->set_code(StatusProto::OK);
+ expected_result.mutable_status()->clear_message();
+ EXPECT_THAT(set_schema_result, EqualsProto(expected_result));
+
+ // "email" document is still there
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = email_document;
+
+ EXPECT_THAT(icing.Get("namespace", "email_uri",
+ GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // "message" document got deleted
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace, message_uri) not found.");
+ expected_get_result_proto.clear_document();
+
+ EXPECT_THAT(icing.Get("namespace", "message_uri",
+ GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+}
+
+TEST_F(IcingSearchEngineSchemaTest, GetSchemaNotFound) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.GetSchema().status(),
+ ProtoStatusIs(StatusProto::NOT_FOUND));
+}
+
+TEST_F(IcingSearchEngineSchemaTest, GetSchemaOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ GetSchemaResultProto expected_get_schema_result_proto;
+ expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema();
+ EXPECT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto));
+}
+
+TEST_F(IcingSearchEngineSchemaTest, GetSchemaTypeFailedPrecondition) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ GetSchemaTypeResultProto get_schema_type_result_proto =
+ icing.GetSchemaType("nonexistent_schema");
+ EXPECT_THAT(get_schema_type_result_proto.status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(get_schema_type_result_proto.status().message(),
+ HasSubstr("Schema not set"));
+}
+
+TEST_F(IcingSearchEngineSchemaTest, GetSchemaTypeOk) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ GetSchemaTypeResultProto expected_get_schema_type_result_proto;
+ expected_get_schema_type_result_proto.mutable_status()->set_code(
+ StatusProto::OK);
+ *expected_get_schema_type_result_proto.mutable_schema_type_config() =
+ CreateMessageSchema().types(0);
+ EXPECT_THAT(icing.GetSchemaType(CreateMessageSchema().types(0).schema_type()),
+ EqualsProto(expected_get_schema_type_result_proto));
+}
+
+TEST_F(IcingSearchEngineSchemaTest,
+ SetSchemaCanNotDetectPreviousSchemaWasLostWithoutDocuments) {
+ SchemaProto schema;
+ auto type = schema.add_types();
+ type->set_schema_type("Message");
+
+ auto body = type->add_properties();
+ body->set_property_name("body");
+ body->set_data_type(PropertyConfigProto::DataType::STRING);
+ body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ // Make an incompatible schema: a previously OPTIONAL field is now REQUIRED
+ SchemaProto incompatible_schema = schema;
+ incompatible_schema.mutable_types(0)->mutable_properties(0)->set_cardinality(
+ PropertyConfigProto::Cardinality::REQUIRED);
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ ASSERT_TRUE(filesystem()->DeleteDirectoryRecursively(GetSchemaDir().c_str()));
+
+ // Since we don't have any documents yet, we can't detect this edge case. But
+ // it should be fine since there aren't any documents to be invalidated.
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(incompatible_schema).status(), ProtoIsOk());
+}
+
+TEST_F(IcingSearchEngineSchemaTest, SetSchemaCanDetectPreviousSchemaWasLost) {
+ SchemaTypeConfigProto message_schema_type_config =
+ CreateMessageSchemaTypeConfig();
+ message_schema_type_config.mutable_properties(0)->set_cardinality(
+ CARDINALITY_OPTIONAL);
+
+ SchemaProto schema;
+ *schema.add_types() = message_schema_type_config;
+
+ // Make an incompatible schema: a previously OPTIONAL field is now REQUIRED
+ SchemaProto incompatible_schema = schema;
+ incompatible_schema.mutable_types(0)->mutable_properties(0)->set_cardinality(
+ PropertyConfigProto::Cardinality::REQUIRED);
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ // Can retrieve by namespace/uri
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document;
+
+ ASSERT_THAT(
+ icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Can search for it
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ CreateMessageDocument("namespace", "uri");
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+ } // This should shut down IcingSearchEngine and persist anything it needs to
+
+ ASSERT_TRUE(filesystem()->DeleteDirectoryRecursively(GetSchemaDir().c_str()));
+
+ // Setting the new, different schema will remove incompatible documents
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(incompatible_schema).status(), ProtoIsOk());
+
+ // Can't retrieve by namespace/uri
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
+ expected_get_result_proto.mutable_status()->set_message(
+ "Document (namespace, uri) not found.");
+
+ EXPECT_THAT(
+ icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Can't search for it
+ SearchResultProto empty_result;
+ empty_result.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+}
+
+TEST_F(IcingSearchEngineSchemaTest, IcingShouldWorkFor64Sections) {
+ // Create a schema with 64 sections
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ // Person has 4 sections.
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("firstName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("lastName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("phoneNumber")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ // Email has 16 sections: 4 string properties plus 3
+ // Person-typed properties x 4 nested sections each.
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("date")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("time")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("receiver")
+ .SetDataTypeDocument(
+ "Person", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("cc")
+ .SetDataTypeDocument(
+ "Person", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(SchemaTypeConfigBuilder()
+ // EmailCollection has 64 sections: 4 Email-typed
+ // properties x 16 nested sections each.
+ .SetType("EmailCollection")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("email1")
+ .SetDataTypeDocument(
+ "Email", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("email2")
+ .SetDataTypeDocument(
+ "Email", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("email3")
+ .SetDataTypeDocument(
+ "Email", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("email4")
+ .SetDataTypeDocument(
+ "Email", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ DocumentProto person1 =
+ DocumentBuilder()
+ .SetKey("namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first1")
+ .AddStringProperty("lastName", "last1")
+ .AddStringProperty("emailAddress", "email1@gmail.com")
+ .AddStringProperty("phoneNumber", "000-000-001")
+ .Build();
+ DocumentProto person2 =
+ DocumentBuilder()
+ .SetKey("namespace", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first2")
+ .AddStringProperty("lastName", "last2")
+ .AddStringProperty("emailAddress", "email2@gmail.com")
+ .AddStringProperty("phoneNumber", "000-000-002")
+ .Build();
+ DocumentProto person3 =
+ DocumentBuilder()
+ .SetKey("namespace", "person3")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first3")
+ .AddStringProperty("lastName", "last3")
+ .AddStringProperty("emailAddress", "email3@gmail.com")
+ .AddStringProperty("phoneNumber", "000-000-003")
+ .Build();
+ DocumentProto email1 = DocumentBuilder()
+ .SetKey("namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("body", "test body")
+ .AddStringProperty("subject", "test subject")
+ .AddStringProperty("date", "2022-08-01")
+ .AddStringProperty("time", "1:00 PM")
+ .AddDocumentProperty("sender", person1)
+ .AddDocumentProperty("receiver", person2)
+ .AddDocumentProperty("cc", person3)
+ .Build();
+ DocumentProto email2 = DocumentBuilder()
+ .SetKey("namespace", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("body", "test body")
+ .AddStringProperty("subject", "test subject")
+ .AddStringProperty("date", "2022-08-02")
+ .AddStringProperty("time", "2:00 PM")
+ .AddDocumentProperty("sender", person2)
+ .AddDocumentProperty("receiver", person1)
+ .AddDocumentProperty("cc", person3)
+ .Build();
+ DocumentProto email3 = DocumentBuilder()
+ .SetKey("namespace", "email3")
+ .SetSchema("Email")
+ .AddStringProperty("body", "test body")
+ .AddStringProperty("subject", "test subject")
+ .AddStringProperty("date", "2022-08-03")
+ .AddStringProperty("time", "3:00 PM")
+ .AddDocumentProperty("sender", person3)
+ .AddDocumentProperty("receiver", person1)
+ .AddDocumentProperty("cc", person2)
+ .Build();
+ DocumentProto email4 = DocumentBuilder()
+ .SetKey("namespace", "email4")
+ .SetSchema("Email")
+ .AddStringProperty("body", "test body")
+ .AddStringProperty("subject", "test subject")
+ .AddStringProperty("date", "2022-08-04")
+ .AddStringProperty("time", "4:00 PM")
+ .AddDocumentProperty("sender", person3)
+ .AddDocumentProperty("receiver", person2)
+ .AddDocumentProperty("cc", person1)
+ .Build();
+ DocumentProto email_collection =
+ DocumentBuilder()
+ .SetKey("namespace", "email_collection")
+ .SetSchema("EmailCollection")
+ .AddDocumentProperty("email1", email1)
+ .AddDocumentProperty("email2", email2)
+ .AddDocumentProperty("email3", email3)
+ .AddDocumentProperty("email4", email4)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email_collection).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ const std::vector<std::string> query_terms = {
+ "first1", "last2", "email3@gmail.com", "000-000-001",
+ "body", "subject", "2022-08-02", "3\\:00"};
+ SearchResultProto expected_document;
+ expected_document.mutable_status()->set_code(StatusProto::OK);
+ *expected_document.mutable_results()->Add()->mutable_document() =
+ email_collection;
+ for (const std::string& query_term : query_terms) {
+ search_spec.set_query(query_term);
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(expected_document));
+ }
+
+ search_spec.set_query("foo");
+ SearchResultProto expected_no_documents;
+ expected_no_documents.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(expected_no_documents));
+}
+
+TEST_F(IcingSearchEngineSchemaTest, IcingShouldReturnErrorForExtraSections) {
+ // Create a schema with more sections than allowed.
+ SchemaTypeConfigBuilder schema_type_config_builder =
+ SchemaTypeConfigBuilder().SetType("type");
+ for (int i = 0; i <= kMaxSectionId + 1; ++i) {
+ schema_type_config_builder.AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop" + std::to_string(i))
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL));
+ }
+ SchemaProto schema =
+ SchemaBuilder().AddType(schema_type_config_builder).Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status().message(),
+ HasSubstr("Too many properties to be indexed"));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine_search_test.cc b/icing/icing-search-engine_search_test.cc
new file mode 100644
index 0000000..21512c6
--- /dev/null
+++ b/icing/icing-search-engine_search_test.cc
@@ -0,0 +1,7173 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/icing-search-engine.h"
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/jni/jni-cache.h"
+#include "icing/join/join-processor.h"
+#include "icing/portable/endian.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/initialize.pb.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/proto/optimize.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/reset.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/status.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/query/query-features.h"
+#include "icing/result/result-state-manager.h"
+#include "icing/schema-builder.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/snippet-helpers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::DoubleEq;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::IsEmpty;
+using ::testing::Lt;
+using ::testing::Ne;
+using ::testing::SizeIs;
+
+// For mocking purposes, we allow tests to provide a custom Filesystem and
+// Clock.
+class TestIcingSearchEngine : public IcingSearchEngine {
+ public:
+ TestIcingSearchEngine(const IcingSearchEngineOptions& options,
+ std::unique_ptr<const Filesystem> filesystem,
+ std::unique_ptr<const IcingFilesystem> icing_filesystem,
+ std::unique_ptr<Clock> clock,
+ std::unique_ptr<JniCache> jni_cache)
+ : IcingSearchEngine(options, std::move(filesystem),
+ std::move(icing_filesystem), std::move(clock),
+ std::move(jni_cache)) {}
+};
+
+std::string GetTestBaseDir() { return GetTestTempDir() + "/icing"; }
+
+// This test fixture is meant to cover all tests relating to
+// IcingSearchEngine::Search and IcingSearchEngine::GetNextPage.
+class IcingSearchEngineSearchTest
+ : public ::testing::TestWithParam<SearchSpecProto::SearchType::Code> {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+      // If we've specified the reverse-JNI or CFString method for
+      // segmentation (i.e. not ICU), then no ICU data file is bundled and
+      // there is nothing to set up. Technically, we could use reverse-JNI
+      // for segmentation AND include an ICU data file, but that seems
+      // unlikely and our current BUILD setup doesn't do this.
+ // File generated via icu_data_file rule in //icing/BUILD.
+ std::string icu_data_file_path =
+ GetTestFilePath("icing/icu.dat");
+ ICING_ASSERT_OK(
+ icu_data_file_helper::SetUpICUDataFile(icu_data_file_path));
+ }
+ filesystem_.CreateDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ const Filesystem* filesystem() const { return &filesystem_; }
+
+ private:
+ Filesystem filesystem_;
+};
+
+// Non-zero value so that the creation timestamp isn't overridden with the
+// current time.
+constexpr int64_t kDefaultCreationTimestampMs = 1575492852000;
+
+IcingSearchEngineOptions GetDefaultIcingOptions() {
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_base_dir(GetTestBaseDir());
+ icing_options.set_document_store_namespace_id_fingerprint(true);
+ icing_options.set_use_new_qualified_id_join_index(true);
+ return icing_options;
+}
+
+DocumentProto CreateMessageDocument(std::string name_space, std::string uri) {
+ return DocumentBuilder()
+ .SetKey(std::move(name_space), std::move(uri))
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+}
+
+DocumentProto CreateEmailDocument(const std::string& name_space,
+ const std::string& uri, int score,
+ const std::string& subject_content,
+ const std::string& body_content) {
+ return DocumentBuilder()
+ .SetKey(name_space, uri)
+ .SetSchema("Email")
+ .SetScore(score)
+ .AddStringProperty("subject", subject_content)
+ .AddStringProperty("body", body_content)
+ .Build();
+}
+
+SchemaProto CreateMessageSchema() {
+ return SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+}
+
+SchemaProto CreateEmailSchema() {
+ return SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+}
+
+SchemaProto CreatePersonAndEmailSchema() {
+ return SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+}
+
+ScoringSpecProto GetDefaultScoringSpec() {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ return scoring_spec;
+}
+
+UsageReport CreateUsageReport(std::string name_space, std::string uri,
+ int64_t timestamp_ms,
+ UsageReport::UsageType usage_type) {
+ UsageReport usage_report;
+ usage_report.set_document_namespace(name_space);
+ usage_report.set_document_uri(uri);
+ usage_report.set_usage_timestamp_ms(timestamp_ms);
+ usage_report.set_usage_type(usage_type);
+ return usage_report;
+}
+
+std::vector<std::string> GetUrisFromSearchResults(
+    const SearchResultProto& search_result_proto) {
+  std::vector<std::string> result_uris;
+  result_uris.reserve(search_result_proto.results_size());
+  for (int i = 0; i < search_result_proto.results_size(); i++) {
+    result_uris.push_back(search_result_proto.results(i).document().uri());
+  }
+  return result_uris;
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchReturnsValidResults) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document_one = CreateMessageDocument("namespace", "uri1");
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two = CreateMessageDocument("namespace", "uri2");
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(1);
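+  // With num_to_snippet = 1, only the first result will be snippeted; the
+  // second result's snippet entries should be empty.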
+
+ SearchResultProto results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(2));
+
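+  // Both documents have the same default score and creation timestamp, so
+  // results are returned in descending document id order: document_two, the
+  // more recently inserted, comes first.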
+ const DocumentProto& document = results.results(0).document();
+ EXPECT_THAT(document, EqualsProto(document_two));
+
+ const SnippetProto& snippet = results.results(0).snippet();
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("message body"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("message"));
+
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document_one));
+ EXPECT_THAT(results.results(1).snippet().entries(), IsEmpty());
+
+ search_spec.set_query("foo");
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchReturnsScoresDocumentScore) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document_one = CreateMessageDocument("namespace", "uri1");
+ document_one.set_score(93);
+ document_one.set_creation_timestamp_ms(10000);
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two = CreateMessageDocument("namespace", "uri2");
+ document_two.set_score(15);
+ document_two.set_creation_timestamp_ms(12000);
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+
+  // Rank by DOCUMENT_SCORE and ensure that the score field is populated with
+  // the document score.
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ SearchResultProto results = icing.Search(search_spec, scoring_spec,
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(2));
+
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_one));
+ EXPECT_THAT(results.results(0).score(), 93);
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document_two));
+ EXPECT_THAT(results.results(1).score(), 15);
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchReturnsScoresCreationTimestamp) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document_one = CreateMessageDocument("namespace", "uri1");
+ document_one.set_score(93);
+ document_one.set_creation_timestamp_ms(10000);
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two = CreateMessageDocument("namespace", "uri2");
+ document_two.set_score(15);
+ document_two.set_creation_timestamp_ms(12000);
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+
+  // Rank by CREATION_TIMESTAMP and ensure that the score field is populated
+  // with the creation timestamp.
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+
+ SearchResultProto results = icing.Search(search_spec, scoring_spec,
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(2));
+
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_two));
+ EXPECT_THAT(results.results(0).score(), 12000);
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document_one));
+ EXPECT_THAT(results.results(1).score(), 10000);
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchReturnsOneResult) {
+ auto fake_clock = std::make_unique<FakeClock>();
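+  // Every timer created by the fake clock reports 1000ms of elapsed time,
+  // which is what the latency fields in query_stats are verified against
+  // below.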
+ fake_clock->SetTimerElapsedMilliseconds(1000);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document_one = CreateMessageDocument("namespace", "uri1");
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two = CreateMessageDocument("namespace", "uri2");
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document_two;
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+
+ EXPECT_THAT(search_result_proto.query_stats().latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().document_retrieval_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().lock_acquisition_latency_ms(),
+ Eq(1000));
+ // TODO(b/305098009): deprecate search-related flat fields in query_stats.
+ EXPECT_THAT(search_result_proto.query_stats().parse_query_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().scoring_latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().ranking_latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .parse_query_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .scoring_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .num_documents_scored(),
+ Eq(2));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .num_fetched_hits_lite_index(),
+ Eq(2));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .num_fetched_hits_main_index(),
+ Eq(0));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .num_fetched_hits_integer_index(),
+ Eq(0));
+
+ // The token is a random number so we don't verify it.
+ expected_search_result_proto.set_next_page_token(
+ search_result_proto.next_page_token());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchReturnsOneResult_readOnlyFalse) {
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(1000);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document_one = CreateMessageDocument("namespace", "uri1");
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two = CreateMessageDocument("namespace", "uri2");
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
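+  // Disable the read-only search path; the results and stats should be
+  // identical to the read-only case above.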
+ search_spec.set_use_read_only_search(false);
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document_two;
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+
+ EXPECT_THAT(search_result_proto.query_stats().latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().document_retrieval_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().lock_acquisition_latency_ms(),
+ Eq(1000));
+ // TODO(b/305098009): deprecate search-related flat fields in query_stats.
+ EXPECT_THAT(search_result_proto.query_stats().parse_query_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().scoring_latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().ranking_latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .parse_query_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .scoring_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .num_documents_scored(),
+ Eq(2));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .num_fetched_hits_lite_index(),
+ Eq(2));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .num_fetched_hits_main_index(),
+ Eq(0));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .num_fetched_hits_integer_index(),
+ Eq(0));
+
+ // The token is a random number so we don't verify it.
+ expected_search_result_proto.set_next_page_token(
+ search_result_proto.next_page_token());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchZeroResultLimitReturnsEmptyResults) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(0);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchZeroResultLimitReturnsEmptyResults_readOnlyFalse) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("");
+ search_spec.set_search_type(GetParam());
+ search_spec.set_use_read_only_search(false);
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(0);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchWithNumToScore) {
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(1000);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document_one = CreateMessageDocument("namespace", "uri1");
+ document_one.set_score(10);
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two = CreateMessageDocument("namespace", "uri2");
+ document_two.set_score(5);
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(10);
+ result_spec.set_num_to_score(10);
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+
+ SearchResultProto expected_search_result_proto1;
+ expected_search_result_proto1.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto1.mutable_results()->Add()->mutable_document() =
+ document_one;
+ *expected_search_result_proto1.mutable_results()->Add()->mutable_document() =
+ document_two;
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto1));
+
+ result_spec.set_num_to_score(1);
+  // By setting num_to_score = 1, only document_two will be scored, ranked,
+  // and returned.
+  // - The num_to_score cutoff depends only on the reading order from the
+  //   posting list. In other words, since posting lists are read in
+  //   descending doc id order, ScoringProcessor scores documents with higher
+  //   doc ids first and cuts off once num_to_score is reached.
+  // - Therefore, even though document_one has the higher score,
+  //   ScoringProcessor still skips it: the posting list yields document_two
+  //   first, and scoring stops after document_two because the number of
+  //   scored documents has already reached num_to_score.
+  SearchResultProto expected_search_result_proto2;
+  expected_search_result_proto2.mutable_status()->set_code(StatusProto::OK);
+  *expected_search_result_proto2.mutable_results()->Add()->mutable_document() =
+      document_two;
+
+  search_result_proto =
+      icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+  EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+  EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+                                       expected_search_result_proto2));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchNegativeResultLimitReturnsInvalidArgument) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(-5);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(
+ StatusProto::INVALID_ARGUMENT);
+ expected_search_result_proto.mutable_status()->set_message(
+ "ResultSpecProto.num_per_page cannot be negative.");
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchNegativeResultLimitReturnsInvalidArgument_readOnlyFalse) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("");
+ search_spec.set_search_type(GetParam());
+ search_spec.set_use_read_only_search(false);
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(-5);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(
+ StatusProto::INVALID_ARGUMENT);
+ expected_search_result_proto.mutable_status()->set_message(
+ "ResultSpecProto.num_per_page cannot be negative.");
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchNonPositivePageTotalBytesLimitReturnsInvalidArgument) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
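+  // The threshold must be positive, so both -1 and 0 are rejected below.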
+ result_spec.set_num_total_bytes_per_page_threshold(-1);
+
+ SearchResultProto actual_results1 =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results1.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+
+ result_spec.set_num_total_bytes_per_page_threshold(0);
+ SearchResultProto actual_results2 =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results2.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchNegativeMaxJoinedChildrenPerParentReturnsInvalidArgument) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.set_max_joined_children_per_parent_to_return(-1);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(
+ StatusProto::INVALID_ARGUMENT);
+ expected_search_result_proto.mutable_status()->set_message(
+ "ResultSpecProto.max_joined_children_per_parent_to_return cannot be "
+ "negative.");
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchNonPositiveNumToScoreReturnsInvalidArgument) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_to_score(-1);
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(
+ StatusProto::INVALID_ARGUMENT);
+ expected_search_result_proto.mutable_status()->set_message(
+ "ResultSpecProto.num_to_score cannot be non-positive.");
+
+ SearchResultProto actual_results1 =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results1, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ result_spec.set_num_to_score(0);
+ SearchResultProto actual_results2 =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(actual_results2, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchWithPersistenceReturnsValidResults) {
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ {
+ // Set the schema up beforehand.
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ // Schema will be persisted to disk when icing goes out of scope.
+ }
+
+ {
+ // Ensure that icing initializes the schema and section_manager
+ // properly from the pre-existing file.
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ EXPECT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(),
+ ProtoIsOk());
+ // The index and document store will be persisted to disk when icing goes
+ // out of scope.
+ }
+
+ {
+ // Ensure that the index is brought back up without problems and we
+ // can query for the content that we expect.
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ CreateMessageDocument("namespace", "uri");
+
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ search_spec.set_query("foo");
+
+ SearchResultProto empty_result;
+ empty_result.mutable_status()->set_code(StatusProto::OK);
+ actual_results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results,
+ EqualsSearchResultIgnoreStatsAndScores(empty_result));
+ }
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchShouldReturnEmpty) {
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(1000);
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+
+ // Empty result, no next-page token
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+
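+  // Timer-based latencies report the fake elapsed time of 1000ms, except for
+  // document retrieval and ranking, which never ran because nothing matched.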
+ EXPECT_THAT(search_result_proto.query_stats().latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().document_retrieval_latency_ms(),
+ Eq(0));
+ EXPECT_THAT(search_result_proto.query_stats().lock_acquisition_latency_ms(),
+ Eq(1000));
+ // TODO(b/305098009): deprecate search-related flat fields in query_stats.
+ EXPECT_THAT(search_result_proto.query_stats().parse_query_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().scoring_latency_ms(), Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats().ranking_latency_ms(), Eq(0));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .parse_query_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .scoring_latency_ms(),
+ Eq(1000));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .num_documents_scored(),
+ Eq(0));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .num_fetched_hits_lite_index(),
+ Eq(0));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .num_fetched_hits_main_index(),
+ Eq(0));
+ EXPECT_THAT(search_result_proto.query_stats()
+ .parent_search_stats()
+ .num_fetched_hits_integer_index(),
+ Eq(0));
+
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchShouldReturnMultiplePages) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates and inserts 5 documents
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
+ DocumentProto document4 = CreateMessageDocument("namespace", "uri4");
+ DocumentProto document5 = CreateMessageDocument("namespace", "uri5");
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document5).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(2);
+
+ // Searches and gets the first page, 2 results
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document5;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document4;
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto.next_page_token(), Gt(kInvalidNextPageToken));
+ uint64_t next_page_token = search_result_proto.next_page_token();
+  // Since the token is a random number, we don't need to verify it.
+ expected_search_result_proto.set_next_page_token(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Second page, 2 results
+ expected_search_result_proto.clear_results();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ search_result_proto = icing.GetNextPage(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Third page, 1 result
+ expected_search_result_proto.clear_results();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+ // Because there are no more results, we should not return the next page
+ // token.
+ expected_search_result_proto.clear_next_page_token();
+ search_result_proto = icing.GetNextPage(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // No more results
+ expected_search_result_proto.clear_results();
+ search_result_proto = icing.GetNextPage(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchWithNoScoringShouldReturnMultiplePages) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates and inserts 5 documents
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
+ DocumentProto document4 = CreateMessageDocument("namespace", "uri4");
+ DocumentProto document5 = CreateMessageDocument("namespace", "uri5");
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document5).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::NONE);
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(2);
+
+ // Searches and gets the first page, 2 results
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document5;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document4;
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_result_proto.next_page_token(), Gt(kInvalidNextPageToken));
+ uint64_t next_page_token = search_result_proto.next_page_token();
+  // Since the token is a random number, we don't need to verify it.
+ expected_search_result_proto.set_next_page_token(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Second page, 2 results
+ expected_search_result_proto.clear_results();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ search_result_proto = icing.GetNextPage(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // Third page, 1 result
+ expected_search_result_proto.clear_results();
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+ // Because there are no more results, we should not return the next page
+ // token.
+ expected_search_result_proto.clear_next_page_token();
+ search_result_proto = icing.GetNextPage(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // No more results
+ expected_search_result_proto.clear_results();
+ search_result_proto = icing.GetNextPage(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchWithUnknownEnabledFeatureShouldReturnError) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+ search_spec.add_enabled_features("BAD_FEATURE");
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_P(IcingSearchEngineSearchTest, ShouldReturnMultiplePagesWithSnippets) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates and inserts 5 documents
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
+ DocumentProto document4 = CreateMessageDocument("namespace", "uri4");
+ DocumentProto document5 = CreateMessageDocument("namespace", "uri5");
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document5).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(2);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(3);
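+  // With num_to_snippet = 3 and num_per_page = 2, both results on the first
+  // page are snippeted, only the first result on the second page is
+  // snippeted, and nothing on the third page is.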
+
+ // Searches and gets the first page, 2 results with 2 snippets
+ SearchResultProto search_result =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ ASSERT_THAT(search_result.status(), ProtoIsOk());
+ ASSERT_THAT(search_result.results(), SizeIs(2));
+ ASSERT_THAT(search_result.next_page_token(), Gt(kInvalidNextPageToken));
+
+ const DocumentProto& document_result_1 = search_result.results(0).document();
+ EXPECT_THAT(document_result_1, EqualsProto(document5));
+ const SnippetProto& snippet_result_1 = search_result.results(0).snippet();
+ EXPECT_THAT(snippet_result_1.entries(), SizeIs(1));
+ EXPECT_THAT(snippet_result_1.entries(0).property_name(), Eq("body"));
+ std::string_view content = GetString(
+ &document_result_1, snippet_result_1.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet_result_1.entries(0)),
+ ElementsAre("message body"));
+ EXPECT_THAT(GetMatches(content, snippet_result_1.entries(0)),
+ ElementsAre("message"));
+
+ const DocumentProto& document_result_2 = search_result.results(1).document();
+ EXPECT_THAT(document_result_2, EqualsProto(document4));
+ const SnippetProto& snippet_result_2 = search_result.results(1).snippet();
+ EXPECT_THAT(snippet_result_2.entries(0).property_name(), Eq("body"));
+ content = GetString(&document_result_2,
+ snippet_result_2.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet_result_2.entries(0)),
+ ElementsAre("message body"));
+ EXPECT_THAT(GetMatches(content, snippet_result_2.entries(0)),
+ ElementsAre("message"));
+
+  // Second page, 2 results with 1 snippet
+ search_result = icing.GetNextPage(search_result.next_page_token());
+ ASSERT_THAT(search_result.status(), ProtoIsOk());
+ ASSERT_THAT(search_result.results(), SizeIs(2));
+ ASSERT_THAT(search_result.next_page_token(), Gt(kInvalidNextPageToken));
+
+ const DocumentProto& document_result_3 = search_result.results(0).document();
+ EXPECT_THAT(document_result_3, EqualsProto(document3));
+ const SnippetProto& snippet_result_3 = search_result.results(0).snippet();
+ EXPECT_THAT(snippet_result_3.entries(0).property_name(), Eq("body"));
+ content = GetString(&document_result_3,
+ snippet_result_3.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet_result_3.entries(0)),
+ ElementsAre("message body"));
+ EXPECT_THAT(GetMatches(content, snippet_result_3.entries(0)),
+ ElementsAre("message"));
+
+ EXPECT_THAT(search_result.results(1).document(), EqualsProto(document2));
+ EXPECT_THAT(search_result.results(1).snippet().entries(), IsEmpty());
+
+ // Third page, 1 result with 0 snippets
+ search_result = icing.GetNextPage(search_result.next_page_token());
+ ASSERT_THAT(search_result.status(), ProtoIsOk());
+ ASSERT_THAT(search_result.results(), SizeIs(1));
+ ASSERT_THAT(search_result.next_page_token(), Eq(kInvalidNextPageToken));
+
+ EXPECT_THAT(search_result.results(0).document(), EqualsProto(document1));
+ EXPECT_THAT(search_result.results(0).snippet().entries(), IsEmpty());
+}
+
+TEST_P(IcingSearchEngineSearchTest, ShouldInvalidateNextPageToken) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+
+ // Searches and gets the first page, 1 result
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(search_result_proto.next_page_token(), Gt(kInvalidNextPageToken));
+ uint64_t next_page_token = search_result_proto.next_page_token();
+  // Since the token is a random number, we don't need to verify it.
+ expected_search_result_proto.set_next_page_token(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+  // At this point, document1 remains to be fetched.
+
+ // Invalidates token
+ icing.InvalidateNextPageToken(next_page_token);
+
+ // Tries to fetch the second page, no result since it's invalidated
+ expected_search_result_proto.clear_results();
+ expected_search_result_proto.clear_next_page_token();
+ search_result_proto = icing.GetNextPage(next_page_token);
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchIncludesDocumentsBeforeTtl) {
+ SchemaProto schema;
+ auto type = schema.add_types();
+ type->set_schema_type("Message");
+
+ auto body = type->add_properties();
+ body->set_property_name("body");
+ body->set_data_type(PropertyConfigProto::DataType::STRING);
+ body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ body->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ body->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(500)
+ .Build();
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_search_type(GetParam());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document;
+
+ // Time just has to be less than the document's creation timestamp (100) + the
+ // document's ttl (500)
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSystemTimeMilliseconds(400);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ // Check that the document is returned as part of search results
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchDoesntIncludeDocumentsPastTtl) {
+ SchemaProto schema;
+ auto type = schema.add_types();
+ type->set_schema_type("Message");
+
+ auto body = type->add_properties();
+ body->set_property_name("body");
+ body->set_data_type(PropertyConfigProto::DataType::STRING);
+ body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ body->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ body->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message body")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(500)
+ .Build();
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("message");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_search_type(GetParam());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+
+ // Time just has to be greater than the document's creation timestamp (100) +
+ // the document's ttl (500)
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSystemTimeMilliseconds(700);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ // Check that the document is not returned as part of search results
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchWorksAfterSchemaTypesCompatiblyModified) {
+ SchemaProto schema;
+ auto type_config = schema.add_types();
+ type_config->set_schema_type("message");
+
+ auto property = type_config->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+
+ DocumentProto message_document =
+ DocumentBuilder()
+ .SetKey("namespace", "message_uri")
+ .SetSchema("message")
+ .AddStringProperty("body", "foo")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(message_document).status(), ProtoIsOk());
+
+ // Make sure we can search for message document
+ SearchSpecProto search_spec;
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_search_type(GetParam());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+
+ // The message isn't indexed, so we get nothing
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+ // With just the schema type filter, we can search for the message
+ search_spec.Clear();
+ search_spec.add_schema_type_filters("message");
+
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ message_document;
+
+ search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+
+  // Since SchemaTypeIds are assigned based on order in the SchemaProto, this
+  // will force a change in the DocumentStore's cached SchemaTypeIds.
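+  // (Previously "message" was the only type and had SchemaTypeId 0; now
+  // "email" is added first, so "message" becomes SchemaTypeId 1.)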
+ schema.clear_types();
+ type_config = schema.add_types();
+ type_config->set_schema_type("email");
+
+ // Adding a new indexed property will require reindexing
+ type_config = schema.add_types();
+ type_config->set_schema_type("message");
+
+ property = type_config->add_properties();
+ property->set_property_name("body");
+ property->set_data_type(PropertyConfigProto::DataType::STRING);
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::PREFIX);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+
+ EXPECT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ search_spec.Clear();
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.add_schema_type_filters("message");
+
+ // We can still search for the message document
+ search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchResultShouldBeRankedByDocumentScore) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 3 documents and ensures the relationship in terms of document
+ // score is: document1 < document2 < document3
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetScore(3)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+  // Intentionally inserts the documents in an order different from their
+  // score order.
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+ // "m" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ // Result should be in descending score order
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchWorksForNestedSubtypeDocument) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Artist")
+ .AddParentType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("Company").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("employee")
+ .SetDataTypeDocument("Person",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ // Create a company with a person and an artist.
+ DocumentProto document_company =
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Company")
+ .AddDocumentProperty("employee",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "name_person")
+ .Build(),
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Artist")
+ .AddStringProperty("name", "name_artist")
+ .AddStringProperty("emailAddress", "email")
+ .Build())
+ .Build();
+ ASSERT_THAT(icing.Put(document_company).status(), ProtoIsOk());
+
+ SearchResultProto company_search_result_proto;
+ company_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *company_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document_company;
+
+ SearchResultProto empty_search_result_proto;
+ empty_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_search_type(GetParam());
+
+ // "name_person" should match the company.
+ search_spec.set_query("name_person");
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ company_search_result_proto));
+
+ // "name_artist" should match the company.
+ search_spec.set_query("name_artist");
+ search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ company_search_result_proto));
+
+ // "email" should not match the company even though the artist has a matched
+ // property. This is because the "employee" property is defined as Person
+ // type, and indexing on document properties should be based on defined types,
+ // instead of subtypes.
+ search_spec.set_query("email");
+ search_result_proto = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ empty_search_result_proto));
+}
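+
+// A worked view of the restriction above, assuming indexable sections are
+// derived from the declared property type (the paths are illustrative):
+//
+//   "employee" is declared as Person, so Company's indexable sections are
+//   computed from Person's definition:
+//     employee.name          -> indexed (declared on Person)
+//     employee.emailAddress  -> never indexed (exists only on Artist)
+//
+// The Artist value still stores "email"; it simply never reaches the index.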
+
+TEST_P(IcingSearchEngineSearchTest, SearchShouldAllowNoScoring) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+  // Creates 3 documents and ensures that their relationship is:
+  // document1 < document2 < document3
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampMs(1571111111111)
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampMs(1572222222222)
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetScore(3)
+ .SetCreationTimestampMs(1573333333333)
+ .Build();
+
+  // Intentionally inserts the documents in an order different from their
+  // score order.
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ // "m" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+
+ // Results should not be ranked by score but returned in reverse insertion
+ // order.
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::NONE);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
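+
+// One way to read "reverse insertion order": with RankingStrategy::NONE,
+// results come back by descending DocumentId, and DocumentIds grow with each
+// Put (an assumption about Icing internals, not asserted by this test):
+//
+//   put order:  document3 -> id 0, document1 -> id 1, document2 -> id 2
+//   descending: document2, document1, document3  // the expected order above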
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchResultShouldBeRankedByCreationTimestamp) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+  // Creates 3 documents and ensures the relationship in terms of creation
+  // timestamp is: document1 < document2 < document3
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetCreationTimestampMs(1571111111111)
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetCreationTimestampMs(1572222222222)
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetCreationTimestampMs(1573333333333)
+ .Build();
+
+  // Intentionally inserts the documents in an order different from their
+  // score order.
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ // "m" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ // Result should be in descending timestamp order
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchResultShouldBeRankedByUsageCount) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 3 test documents
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // Intentionally inserts the documents in a different order to eliminate the
+ // possibility that the following results are sorted in the default reverse
+ // insertion order.
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ // Report usage for doc3 twice and doc2 once. The order will be doc3 > doc2 >
+ // doc1 when ranked by USAGE_TYPE1_COUNT.
+ UsageReport usage_report_doc3 = CreateUsageReport(
+ /*name_space=*/"namespace", /*uri=*/"uri/3", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ UsageReport usage_report_doc2 = CreateUsageReport(
+ /*name_space=*/"namespace", /*uri=*/"uri/2", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ ASSERT_THAT(icing.ReportUsage(usage_report_doc3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.ReportUsage(usage_report_doc3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.ReportUsage(usage_report_doc2).status(), ProtoIsOk());
+
+ // "m" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ // Result should be in descending USAGE_TYPE1_COUNT order
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchResultShouldHaveDefaultOrderWithoutUsageCounts) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 3 test documents
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ // "m" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ // None of the documents have usage reports. Result should be in the default
+ // reverse insertion order.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchResultShouldBeRankedByUsageTimestamp) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 3 test documents
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ // Intentionally inserts the documents in a different order to eliminate the
+ // possibility that the following results are sorted in the default reverse
+ // insertion order.
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ // Report usage for doc2 and doc3. The order will be doc3 > doc2 > doc1 when
+ // ranked by USAGE_TYPE1_LAST_USED_TIMESTAMP.
+ UsageReport usage_report_doc2 = CreateUsageReport(
+ /*name_space=*/"namespace", /*uri=*/"uri/2", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE1);
+ UsageReport usage_report_doc3 = CreateUsageReport(
+ /*name_space=*/"namespace", /*uri=*/"uri/3", /*timestamp_ms=*/5000,
+ UsageReport::USAGE_TYPE1);
+ ASSERT_THAT(icing.ReportUsage(usage_report_doc2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.ReportUsage(usage_report_doc3).status(), ProtoIsOk());
+
+ // "m" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ // Result should be in descending USAGE_TYPE1_LAST_USED_TIMESTAMP order
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, Bm25fRelevanceScoringOneNamespace) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
+
+ // Create and index documents in namespace "namespace1".
+ DocumentProto document = CreateEmailDocument(
+ "namespace1", "namespace1/uri0", /*score=*/10, "sushi belmont",
+ "fresh fish. inexpensive. good sushi.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri1", /*score=*/13, "peacock koriander",
+ "indian food. buffet. spicy food. kadai chicken.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri2", /*score=*/4,
+ "panda express",
+ "chinese food. cheap. inexpensive. kung pao.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri3", /*score=*/23,
+ "speederia pizza",
+ "thin-crust pizza. good and fast.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri4", /*score=*/8,
+ "whole foods",
+ "salads. pizza. organic food. expensive.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri5", /*score=*/18, "peets coffee",
+ "espresso. decaf. brewed coffee. whole beans. excellent coffee.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri6", /*score=*/4, "costco",
+ "bulk. cheap whole beans. frozen fish. food samples.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri7", /*score=*/4,
+ "starbucks coffee",
+ "habit. birthday rewards. good coffee");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("coffee OR food");
+ search_spec.set_search_type(GetParam());
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+
+ // Result should be in descending score order
+ EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+ // Both doc5 and doc7 have "coffee" in name and text sections.
+ // However, doc5 has more matches in the text section.
+ // Documents with "food" are ranked lower as the term "food" is commonly
+ // present in this corpus, and thus, has a lower IDF.
+ EXPECT_THAT(GetUrisFromSearchResults(search_result_proto),
+ ElementsAre("namespace1/uri5", // 'coffee' 3 times
+ "namespace1/uri7", // 'coffee' 2 times
+ "namespace1/uri1", // 'food' 2 times
+ "namespace1/uri4", // 'food' 2 times
+ "namespace1/uri2", // 'food' 1 time
+ "namespace1/uri6")); // 'food' 1 time
+}
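+
+// The IDF gap above can be made concrete. Using the textbook BM25 IDF
+// idf(t) = ln(1 + (N - n_t + 0.5) / (n_t + 0.5)) (an assumption made here
+// for intuition only, not a claim about Icing's exact formula), with N = 8
+// documents:
+//
+//   "coffee": n_t = 2  ->  ln(1 + 6.5 / 2.5) = ln(3.6) ~= 1.28
+//   "food":   n_t = 4  ->  ln(1 + 4.5 / 4.5) = ln(2.0) ~= 0.69
+//
+// so a "coffee" hit is worth roughly twice a "food" hit, which is why the
+// coffee documents lead the ranking above.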
+
+TEST_P(IcingSearchEngineSearchTest, Bm25fRelevanceScoringOneNamespaceAdvanced) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
+
+ // Create and index documents in namespace "namespace1".
+ DocumentProto document = CreateEmailDocument(
+ "namespace1", "namespace1/uri0", /*score=*/10, "sushi belmont",
+ "fresh fish. inexpensive. good sushi.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri1", /*score=*/13, "peacock koriander",
+ "indian food. buffet. spicy food. kadai chicken.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri2", /*score=*/4,
+ "panda express",
+ "chinese food. cheap. inexpensive. kung pao.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri3", /*score=*/23,
+ "speederia pizza",
+ "thin-crust pizza. good and fast.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri4", /*score=*/8,
+ "whole foods",
+ "salads. pizza. organic food. expensive.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri5", /*score=*/18, "peets coffee",
+ "espresso. decaf. brewed coffee. whole beans. excellent coffee.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri6", /*score=*/4, "costco",
+ "bulk. cheap whole beans. frozen fish. food samples.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri7", /*score=*/4,
+ "starbucks coffee",
+ "habit. birthday rewards. good coffee");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("coffee OR food");
+ search_spec.set_search_type(GetParam());
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_advanced_scoring_expression("this.relevanceScore() * 2 + 1");
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::ADVANCED_SCORING_EXPRESSION);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+
+ // Result should be in descending score order
+ EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+ // Both doc5 and doc7 have "coffee" in name and text sections.
+ // However, doc5 has more matches in the text section.
+ // Documents with "food" are ranked lower as the term "food" is commonly
+ // present in this corpus, and thus, has a lower IDF.
+ EXPECT_THAT(GetUrisFromSearchResults(search_result_proto),
+ ElementsAre("namespace1/uri5", // 'coffee' 3 times
+ "namespace1/uri7", // 'coffee' 2 times
+ "namespace1/uri1", // 'food' 2 times
+ "namespace1/uri4", // 'food' 2 times
+ "namespace1/uri2", // 'food' 1 time
+ "namespace1/uri6")); // 'food' 1 time
+}
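+
+// Note the expected order is identical to the plain RELEVANCE_SCORE test
+// above: f(x) = 2x + 1 is strictly increasing, so it rescales every score
+// without changing any pairwise comparison:
+//
+//   for any scores a, b:  a > b  =>  2a + 1 > 2b + 1
+//
+// Any strictly increasing advanced scoring expression over
+// this.relevanceScore() would rank these documents the same way.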
+
+TEST_P(IcingSearchEngineSearchTest,
+ Bm25fRelevanceScoringOneNamespaceNotOperator) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
+
+ // Create and index documents in namespace "namespace1".
+ DocumentProto document = CreateEmailDocument(
+ "namespace1", "namespace1/uri0", /*score=*/10, "sushi belmont",
+ "fresh fish. inexpensive. good sushi.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri1", /*score=*/13, "peacock koriander",
+ "indian food. buffet. spicy food. kadai chicken.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri2", /*score=*/4,
+ "panda express",
+ "chinese food. cheap. inexpensive. kung pao.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri3", /*score=*/23, "speederia pizza",
+ "thin-crust pizza. good and fast. nice coffee");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri4", /*score=*/8,
+ "whole foods",
+ "salads. pizza. organic food. expensive.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri5", /*score=*/18, "peets coffee",
+ "espresso. decaf. brewed coffee. whole beans. excellent coffee.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri6", /*score=*/4, "costco",
+ "bulk. cheap whole beans. frozen fish. food samples.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri7", /*score=*/4,
+ "starbucks coffee",
+ "habit. birthday rewards. good coffee");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("coffee -starbucks");
+ search_spec.set_search_type(GetParam());
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+
+ // Result should be in descending score order
+ EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+ EXPECT_THAT(
+ GetUrisFromSearchResults(search_result_proto),
+      ElementsAre("namespace1/uri5",   // 'coffee' 3 times, 'starbucks' 0 times
+                  "namespace1/uri3"));  // 'coffee' 1 time, 'starbucks' 0 times
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ Bm25fRelevanceScoringOneNamespaceSectionRestrict) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
+
+ // Create and index documents in namespace "namespace1".
+ DocumentProto document = CreateEmailDocument(
+ "namespace1", "namespace1/uri0", /*score=*/10, "sushi belmont",
+ "fresh fish. inexpensive. good sushi.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri1", /*score=*/13, "peacock koriander",
+ "indian food. buffet. spicy food. kadai chicken.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri2", /*score=*/4,
+ "panda express",
+ "chinese food. cheap. inexpensive. kung pao.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri3", /*score=*/23,
+ "speederia pizza",
+ "thin-crust pizza. good and fast.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri4", /*score=*/8,
+ "whole foods",
+ "salads. pizza. organic food. expensive.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document =
+ CreateEmailDocument("namespace1", "namespace1/uri5", /*score=*/18,
+ "peets coffee, best coffee",
+ "espresso. decaf. whole beans. excellent coffee.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri6", /*score=*/4, "costco",
+ "bulk. cheap whole beans. frozen fish. food samples.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri7", /*score=*/4, "starbucks",
+ "habit. birthday rewards. good coffee. brewed coffee");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("subject:coffee OR body:food");
+ search_spec.set_search_type(GetParam());
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+
+ // Result should be in descending score order
+ EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+ // The term frequencies of "coffee" and "food" are calculated respectively
+ // from the subject section and the body section.
+ // Documents with "food" are ranked lower as the term "food" is commonly
+ // present in this corpus, and thus, has a lower IDF.
+ EXPECT_THAT(
+ GetUrisFromSearchResults(search_result_proto),
+ ElementsAre("namespace1/uri5", // 'coffee' 2 times in section subject
+ "namespace1/uri1", // 'food' 2 times in section body
+ "namespace1/uri4", // 'food' 2 times in section body
+ "namespace1/uri2", // 'food' 1 time in section body
+ "namespace1/uri6")); // 'food' 1 time in section body
+}
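+
+// A compact view of the section restricts above, assuming term frequency is
+// tallied only inside the named section (consistent with the comment in the
+// test):
+//
+//   query: "subject:coffee OR body:food"
+//   uri5:  subject "peets coffee, best coffee" -> tf("coffee") = 2, matches
+//          body "... excellent coffee."        -> ignored by subject:
+//   uri7:  subject "starbucks"                 -> tf("coffee") = 0
+//          body "good coffee. brewed coffee"   -> ignored as well
+//
+// so uri7 drops out entirely despite two body hits, and uri5 ranks first.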
+
+TEST_P(IcingSearchEngineSearchTest, Bm25fRelevanceScoringTwoNamespaces) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
+
+ // Create and index documents in namespace "namespace1".
+ DocumentProto document = CreateEmailDocument(
+ "namespace1", "namespace1/uri0", /*score=*/10, "sushi belmont",
+ "fresh fish. inexpensive. good sushi.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri1", /*score=*/13, "peacock koriander",
+ "indian food. buffet. spicy food. kadai chicken.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri2", /*score=*/4,
+ "panda express",
+ "chinese food. cheap. inexpensive. kung pao.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri3", /*score=*/23,
+ "speederia pizza",
+ "thin-crust pizza. good and fast.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri4", /*score=*/8,
+ "whole foods",
+ "salads. pizza. organic food. expensive.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri5", /*score=*/18, "peets coffee",
+ "espresso. decaf. brewed coffee. whole beans. excellent coffee.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri6", /*score=*/4, "costco",
+ "bulk. cheap whole beans. frozen fish. food samples.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri7", /*score=*/4,
+ "starbucks coffee",
+ "habit. birthday rewards. good coffee");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ // Create and index documents in namespace "namespace2".
+ document = CreateEmailDocument("namespace2", "namespace2/uri0", /*score=*/10,
+ "sushi belmont",
+ "fresh fish. inexpensive. good sushi.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace2", "namespace2/uri1", /*score=*/13, "peacock koriander",
+ "indian food. buffet. spicy food. kadai chicken.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace2", "namespace2/uri2", /*score=*/4,
+ "panda express",
+ "chinese food. cheap. inexpensive. kung pao.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace2", "namespace2/uri3", /*score=*/23,
+ "speederia pizza",
+ "thin-crust pizza. good and fast.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace2", "namespace2/uri4", /*score=*/8,
+ "whole foods",
+ "salads. pizza. organic food. expensive.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace2", "namespace2/uri5", /*score=*/18, "peets coffee",
+ "espresso. decaf. brewed coffee. whole beans. excellent coffee.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace2", "namespace2/uri6", /*score=*/4, "costco",
+ "bulk. cheap whole beans. frozen fish. food samples.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace2", "namespace2/uri7", /*score=*/4,
+ "starbucks coffee", "good coffee");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("coffee OR food");
+ search_spec.set_search_type(GetParam());
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
+ ResultSpecProto result_spec_proto;
+ result_spec_proto.set_num_per_page(16);
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec_proto);
+
+ // Result should be in descending score order
+ EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+  // The two corpora have the same documents except for document 7, which in
+  // "namespace2" is much shorter than the average document length, so it is
+  // boosted.
+ EXPECT_THAT(GetUrisFromSearchResults(search_result_proto),
+ ElementsAre("namespace2/uri7", // 'coffee' 2 times, short doc
+ "namespace1/uri5", // 'coffee' 3 times
+ "namespace2/uri5", // 'coffee' 3 times
+ "namespace1/uri7", // 'coffee' 2 times
+ "namespace1/uri1", // 'food' 2 times
+ "namespace2/uri1", // 'food' 2 times
+ "namespace1/uri4", // 'food' 2 times
+ "namespace2/uri4", // 'food' 2 times
+ "namespace1/uri2", // 'food' 1 time
+ "namespace2/uri2", // 'food' 1 time
+ "namespace1/uri6", // 'food' 1 time
+ "namespace2/uri6")); // 'food' 1 time
+}
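+
+// The short-document boost is BM25's length normalization at work. A sketch
+// of the per-term saturation factor, assuming the textbook parameters
+// k1 = 1.2 and b = 0.75 (this test does not assert Icing's actual constants):
+//
+//   factor(tf, dl) = tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl))
+//
+// Holding tf fixed, dl < avgdl shrinks the denominator and raises the
+// factor, so namespace2/uri7 ("good coffee", tf = 2, far below average
+// length) can outrank the uri5 documents (tf = 3, average length).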
+
+TEST_P(IcingSearchEngineSearchTest, Bm25fRelevanceScoringWithNamespaceFilter) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
+
+ // Create and index documents in namespace "namespace1".
+ DocumentProto document = CreateEmailDocument(
+ "namespace1", "namespace1/uri0", /*score=*/10, "sushi belmont",
+ "fresh fish. inexpensive. good sushi.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri1", /*score=*/13, "peacock koriander",
+ "indian food. buffet. spicy food. kadai chicken.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri2", /*score=*/4,
+ "panda express",
+ "chinese food. cheap. inexpensive. kung pao.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri3", /*score=*/23,
+ "speederia pizza",
+ "thin-crust pizza. good and fast.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri4", /*score=*/8,
+ "whole foods",
+ "salads. pizza. organic food. expensive.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri5", /*score=*/18, "peets coffee",
+ "espresso. decaf. brewed coffee. whole beans. excellent coffee.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace1", "namespace1/uri6", /*score=*/4, "costco",
+ "bulk. cheap whole beans. frozen fish. food samples.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace1", "namespace1/uri7", /*score=*/4,
+ "starbucks coffee",
+ "habit. birthday rewards. good coffee");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ // Create and index documents in namespace "namespace2".
+ document = CreateEmailDocument("namespace2", "namespace2/uri0", /*score=*/10,
+ "sushi belmont",
+ "fresh fish. inexpensive. good sushi.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace2", "namespace2/uri1", /*score=*/13, "peacock koriander",
+ "indian food. buffet. spicy food. kadai chicken.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace2", "namespace2/uri2", /*score=*/4,
+ "panda express",
+ "chinese food. cheap. inexpensive. kung pao.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace2", "namespace2/uri3", /*score=*/23,
+ "speederia pizza",
+ "thin-crust pizza. good and fast.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace2", "namespace2/uri4", /*score=*/8,
+ "whole foods",
+ "salads. pizza. organic food. expensive.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace2", "namespace2/uri5", /*score=*/18, "peets coffee",
+ "espresso. decaf. brewed coffee. whole beans. excellent coffee.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument(
+ "namespace2", "namespace2/uri6", /*score=*/4, "costco",
+ "bulk. cheap whole beans. frozen fish. food samples.");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ document = CreateEmailDocument("namespace2", "namespace2/uri7", /*score=*/4,
+ "starbucks coffee", "good coffee");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("coffee OR food");
+ search_spec.set_search_type(GetParam());
+  // Now restrict the query to documents in "namespace2".
+ search_spec.add_namespace_filters("namespace2");
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
+  SearchResultProto search_result_proto = icing.Search(
+      search_spec, scoring_spec, ResultSpecProto::default_instance());
+
+ // Result from namespace "namespace2" should be in descending score order
+ EXPECT_THAT(search_result_proto.status(), ProtoIsOk());
+ // Both doc5 and doc7 have "coffee" in name and text sections.
+  // Even though doc5 has more matches in the text section, doc7 is much
+  // shorter than the corpus's average document length, so it is boosted.
+  // Documents with "food" are ranked lower because the term "food" is common
+  // in this corpus and thus has a lower IDF.
+ EXPECT_THAT(GetUrisFromSearchResults(search_result_proto),
+ ElementsAre("namespace2/uri7", // 'coffee' 2 times, short doc
+ "namespace2/uri5", // 'coffee' 3 times
+ "namespace2/uri1", // 'food' 2 times
+ "namespace2/uri4", // 'food' 2 times
+ "namespace2/uri2", // 'food' 1 time
+ "namespace2/uri6")); // 'food' 1 time
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchResultShouldHaveDefaultOrderWithoutUsageTimestamp) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 3 test documents
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ // "m" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ // None of the documents have usage reports. Result should be in the default
+ // reverse insertion order.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchResultShouldBeRankedAscendingly) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 3 documents and ensures the relationship in terms of document
+ // score is: document1 < document2 < document3
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetScore(3)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+  // Intentionally inserts the documents in an order different from their
+  // score order.
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+ // "m" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ // Result should be in ascending score order
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ scoring_spec.set_order_by(ScoringSpecProto::Order::ASC);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchResultGroupingDuplicateNamespaceShouldReturnError) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ // "m" will match all 2 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ // Specify "namespace1" twice. This should result in an error.
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(ResultSpecProto::NAMESPACE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("namespace1");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace2");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace1");
+ result_grouping = result_spec.add_result_groupings();
+ entry = result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("namespace1");
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_result_proto.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchResultGroupingDuplicateSchemaShouldReturnError) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ // "m" will match all 2 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ // Specify "Message" twice. This should result in an error.
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(ResultSpecProto::SCHEMA_TYPE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_schema("Message");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_schema("nonexistentMessage");
+ result_grouping = result_spec.add_result_groupings();
+ result_grouping->set_max_results(1);
+ entry = result_grouping->add_entry_groupings();
+ entry->set_schema("Message");
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_result_proto.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+       SearchResultGroupingDuplicateNamespaceAndSchemaShouldReturnError) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ // "m" will match all 2 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ // Specify "namespace1xMessage" twice. This should result in an error.
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("namespace1");
+ entry->set_schema("Message");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace2");
+ entry->set_schema("Message");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace1");
+ entry->set_schema("Message");
+ result_grouping = result_spec.add_result_groupings();
+ result_grouping->set_max_results(1);
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace1");
+ entry->set_schema("Message");
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_result_proto.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchResultGroupingNonPositiveMaxResultsShouldReturnError) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ // "m" will match all 2 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+  // Specify zero max_results. This should result in an error.
+ ResultSpecProto result_spec;
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(0);
+ entry->set_namespace_("namespace1");
+ entry->set_schema("Message");
+  entry = result_grouping->add_entry_groupings();
+  entry->set_namespace_("namespace2");
+  entry->set_schema("Message");
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_result_proto.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+
+  // Specify negative max_results. This should also result in an error.
+  result_spec.mutable_result_groupings(0)->set_max_results(-1);
+  search_result_proto = icing.Search(search_spec, scoring_spec, result_spec);
+  EXPECT_THAT(search_result_proto.status(),
+              ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchResultGroupingMultiNamespaceGrouping) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+  // Creates 6 documents and ensures the relationship in terms of document
+  // score is: document1 < document2 < document3 < document4 < document5 <
+  // document6
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetScore(3)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document4 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri/4")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(4)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document5 =
+ DocumentBuilder()
+ .SetKey("namespace3", "uri/5")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetScore(5)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document6 =
+ DocumentBuilder()
+ .SetKey("namespace3", "uri/6")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(6)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document5).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document6).status(), ProtoIsOk());
+
+ // "m" will match all 6 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(ResultSpecProto::NAMESPACE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("namespace1");
+ result_grouping = result_spec.add_result_groupings();
+ result_grouping->set_max_results(2);
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace2");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace3");
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec);
+
+ // The lowest-scored document in "namespace1" (document1) should not be
+ // included because that grouping is capped at one result. "namespace2" and
+ // "namespace3" are grouped together, so only the two highest-scored
+ // documents between the two (both of which are in "namespace3") should be
+ // returned.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document6;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document5;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchResultGroupingMultiSchemaGrouping) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetScore(1)
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .AddDocumentProperty("sender", DocumentBuilder()
+ .SetKey("namespace", "uri1-sender")
+ .SetSchema("Person")
+ .AddStringProperty("name", "foo")
+ .Build())
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Message")
+ .SetScore(2)
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("body", "fo")
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace2", "uri3")
+ .SetSchema("Message")
+ .SetScore(3)
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("body", "fo")
+ .Build();
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ // "f" will match all 3 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("f");
+ search_spec.set_search_type(GetParam());
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(ResultSpecProto::SCHEMA_TYPE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_schema("Message");
+ result_grouping = result_spec.add_result_groupings();
+ result_grouping->set_max_results(1);
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("Email");
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec);
+
+ // The highest-scored document of each schema type, "Message" (document3)
+ // and "Email" (document1), should be returned.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document3;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document1;
+
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchResultGroupingMultiNamespaceAndSchemaGrouping) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 6 documents and ensures the relationship in terms of document
+ // score is: document1 < document2 < document3 < document4 < document5 <
+ // document6
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetScore(3)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document4 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri/4")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(4)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document5 =
+ DocumentBuilder()
+ .SetKey("namespace3", "uri/5")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetScore(5)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document6 =
+ DocumentBuilder()
+ .SetKey("namespace3", "uri/6")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(6)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document5).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document6).status(), ProtoIsOk());
+
+ // "m" will match all 6 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("namespace1");
+ entry->set_schema("Message");
+ result_grouping = result_spec.add_result_groupings();
+ result_grouping->set_max_results(1);
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace2");
+ entry->set_schema("Message");
+ result_grouping = result_spec.add_result_groupings();
+ result_grouping->set_max_results(1);
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace3");
+ entry->set_schema("Message");
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec);
+
+ // The highest-scored document in each of "namespace1xMessage" (document2),
+ // "namespace2xMessage" (document4), and "namespace3xMessage" (document6)
+ // should be returned.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document6;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document4;
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchResultGroupingNonexistentNamespaceShouldBeIgnored) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ // "m" will match all 2 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(ResultSpecProto::NAMESPACE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("namespace1");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("nonexistentNamespace");
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec);
+
+ // Only the top-ranked document in "namespace1" (document2) should be
+ // returned. The presence of "nonexistentNamespace" in the same result
+ // grouping should have no effect.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchResultGroupingNonexistentSchemaShouldBeIgnored) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ // "m" will match all 2 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(ResultSpecProto::SCHEMA_TYPE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_schema("Message");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_schema("nonexistentMessage");
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec);
+
+ // Only the top-ranked document in "Message" (document2) should be
+ // returned. The presence of "nonexistentMessage" in the same result
+ // grouping should have no effect.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document2;
+
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchResultGroupingNonexistentNamespaceAndSchemaShouldBeIgnored) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates 4 documents and ensures the relationship in terms of document
+ // score is: document1 < document2 < document3 < document4
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message1")
+ .SetScore(1)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri/2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message2")
+ .SetScore(2)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri/3")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message3")
+ .SetScore(3)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ DocumentProto document4 =
+ DocumentBuilder()
+ .SetKey("namespace2", "uri/4")
+ .SetSchema("Message")
+ .AddStringProperty("body", "message4")
+ .SetScore(4)
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
+
+ // "m" will match all 2 documents
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("m");
+ search_spec.set_search_type(GetParam());
+
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("namespace2");
+ entry->set_schema("Message");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_schema("namespace1");
+ entry->set_schema("nonexistentMessage");
+
+ SearchResultProto search_result_proto =
+ icing.Search(search_spec, scoring_spec, result_spec);
+
+ // Only the top-ranked document in "namespace2xMessage" (document4) should
+ // be returned. The presence of "namespace1xnonexistentMessage" in the same
+ // result grouping should have no effect: if either the namespace or the
+ // schema type is nonexistent, the entire entry is ignored.
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document4;
+
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SnippetNormalization) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "MDI zurich Team Meeting")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "mdi Zürich Team Meeting")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("mdi Zürich");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(2);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
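+ // Snippet the top 2 results, returning up to 2 matches per property, each
+ // with a surrounding window of at most 64 UTF-32 code units.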
+
+ SearchResultProto results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ ASSERT_THAT(results.results(), SizeIs(2));
+ const DocumentProto& result_document_1 = results.results(0).document();
+ const SnippetProto& result_snippet_1 = results.results(0).snippet();
+ EXPECT_THAT(result_document_1, EqualsProto(document_two));
+ EXPECT_THAT(result_snippet_1.entries(), SizeIs(1));
+ EXPECT_THAT(result_snippet_1.entries(0).property_name(), Eq("body"));
+ std::string_view content = GetString(
+ &result_document_1, result_snippet_1.entries(0).property_name());
+ EXPECT_THAT(
+ GetWindows(content, result_snippet_1.entries(0)),
+ ElementsAre("mdi Zürich Team Meeting", "mdi Zürich Team Meeting"));
+ EXPECT_THAT(GetMatches(content, result_snippet_1.entries(0)),
+ ElementsAre("mdi", "Zürich"));
+
+ const DocumentProto& result_document_2 = results.results(1).document();
+ const SnippetProto& result_snippet_2 = results.results(1).snippet();
+ EXPECT_THAT(result_document_2, EqualsProto(document_one));
+ EXPECT_THAT(result_snippet_2.entries(), SizeIs(1));
+ EXPECT_THAT(result_snippet_2.entries(0).property_name(), Eq("body"));
+ content = GetString(&result_document_2,
+ result_snippet_2.entries(0).property_name());
+ EXPECT_THAT(
+ GetWindows(content, result_snippet_2.entries(0)),
+ ElementsAre("MDI zurich Team Meeting", "MDI zurich Team Meeting"));
+ EXPECT_THAT(GetMatches(content, result_snippet_2.entries(0)),
+ ElementsAre("MDI", "zurich"));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SnippetNormalizationPrefix) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", "MDI zurich Team Meeting")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "mdi Zürich Team Meeting")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("md Zür");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(2);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
+
+ SearchResultProto results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ ASSERT_THAT(results.results(), SizeIs(2));
+ const DocumentProto& result_document_1 = results.results(0).document();
+ const SnippetProto& result_snippet_1 = results.results(0).snippet();
+ EXPECT_THAT(result_document_1, EqualsProto(document_two));
+ EXPECT_THAT(result_snippet_1.entries(), SizeIs(1));
+ EXPECT_THAT(result_snippet_1.entries(0).property_name(), Eq("body"));
+ std::string_view content = GetString(
+ &result_document_1, result_snippet_1.entries(0).property_name());
+ EXPECT_THAT(
+ GetWindows(content, result_snippet_1.entries(0)),
+ ElementsAre("mdi Zürich Team Meeting", "mdi Zürich Team Meeting"));
+ EXPECT_THAT(GetMatches(content, result_snippet_1.entries(0)),
+ ElementsAre("mdi", "Zürich"));
+
+ const DocumentProto& result_document_2 = results.results(1).document();
+ const SnippetProto& result_snippet_2 = results.results(1).snippet();
+ EXPECT_THAT(result_document_2, EqualsProto(document_one));
+ EXPECT_THAT(result_snippet_2.entries(), SizeIs(1));
+ EXPECT_THAT(result_snippet_2.entries(0).property_name(), Eq("body"));
+ content = GetString(&result_document_2,
+ result_snippet_2.entries(0).property_name());
+ EXPECT_THAT(
+ GetWindows(content, result_snippet_2.entries(0)),
+ ElementsAre("MDI zurich Team Meeting", "MDI zurich Team Meeting"));
+ EXPECT_THAT(GetMatches(content, result_snippet_2.entries(0)),
+ ElementsAre("MDI", "zurich"));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SnippetSectionRestrict) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status(), ProtoIsOk());
+
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "MDI zurich Team Meeting")
+ .AddStringProperty("body", "MDI zurich Team Meeting")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "MDI zurich trip")
+ .AddStringProperty("body", "Let's travel to zurich")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("body:Zür");
+ search_spec->set_search_type(GetParam());
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+ result_spec->set_num_per_page(1);
+ result_spec->mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec->mutable_snippet_spec()->set_num_matches_per_property(10);
+ result_spec->mutable_snippet_spec()->set_num_to_snippet(10);
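+ // num_per_page = 1 pushes the second matching document onto a second page,
+ // fetched below via GetNextPage.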
+
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ ASSERT_THAT(results.results(), SizeIs(1));
+
+ const DocumentProto& result_document_two = results.results(0).document();
+ const SnippetProto& result_snippet_two = results.results(0).snippet();
+ EXPECT_THAT(result_document_two, EqualsProto(document_two));
+ EXPECT_THAT(result_snippet_two.entries(), SizeIs(1));
+ EXPECT_THAT(result_snippet_two.entries(0).property_name(), Eq("body"));
+ std::string_view content = GetString(
+ &result_document_two, result_snippet_two.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_snippet_two.entries(0)),
+ ElementsAre("Let's travel to zurich"));
+ EXPECT_THAT(GetMatches(content, result_snippet_two.entries(0)),
+ ElementsAre("zurich"));
+
+ search_spec.reset();
+ scoring_spec.reset();
+ result_spec.reset();
+
+ results = icing.GetNextPage(results.next_page_token());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ ASSERT_THAT(results.results(), SizeIs(1));
+
+ const DocumentProto& result_document_one = results.results(0).document();
+ const SnippetProto& result_snippet_one = results.results(0).snippet();
+ EXPECT_THAT(result_document_one, EqualsProto(document_one));
+ EXPECT_THAT(result_snippet_one.entries(), SizeIs(1));
+ EXPECT_THAT(result_snippet_one.entries(0).property_name(), Eq("body"));
+ content = GetString(&result_document_one,
+ result_snippet_one.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_snippet_one.entries(0)),
+ ElementsAre("MDI zurich Team Meeting"));
+ EXPECT_THAT(GetMatches(content, result_snippet_one.entries(0)),
+ ElementsAre("zurich"));
+}
+
+TEST_P(IcingSearchEngineSearchTest, Hyphens) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SchemaProto schema;
+ SchemaTypeConfigProto* type = schema.add_types();
+ type->set_schema_type("MyType");
+ PropertyConfigProto* prop = type->add_properties();
+ prop->set_property_name("foo");
+ prop->set_data_type(PropertyConfigProto::DataType::STRING);
+ prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ prop->mutable_string_indexing_config()->set_term_match_type(
+ TermMatchType::EXACT_ONLY);
+ prop->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("MyType")
+ .AddStringProperty("foo", "foo bar-baz bat")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("MyType")
+ .AddStringProperty("foo", "bar for baz bat-man")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("foo:bar-baz");
+ search_spec.set_search_type(GetParam());
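+ // The PLAIN tokenizer splits on hyphens, so "bar-baz" is treated as the two
+ // terms "bar" and "baz"; both documents contain both terms in "foo".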
+
+ ResultSpecProto result_spec;
+ SearchResultProto results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ ASSERT_THAT(results.results(), SizeIs(2));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_two));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document_one));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchWithProjectionEmptyFieldPath) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ // 1. Add two email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // 2. Issue a query that will match those documents and use an empty field
+ // mask to request NO properties.
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("hello");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ // Retrieve only one result at a time to make sure that projection works when
+ // retrieving all pages.
+ result_spec.set_num_per_page(1);
+ TypePropertyMask* email_field_mask = result_spec.add_type_property_masks();
+ email_field_mask->set_schema_type("Email");
+ email_field_mask->add_paths("");
+
+ SearchResultProto results =
+ icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+
+ // 3. Verify that the returned results contain no properties.
+ DocumentProto projected_document_two = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .Build();
+ EXPECT_THAT(results.results(0).document(),
+ EqualsProto(projected_document_two));
+
+ results = icing.GetNextPage(results.next_page_token());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+ DocumentProto projected_document_one = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .Build();
+ EXPECT_THAT(results.results(0).document(),
+ EqualsProto(projected_document_one));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchWithProjectionMultipleFieldPaths) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ // 1. Add two email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // 2. Issue a query that will match those documents and request only
+ // 'sender.name' and 'subject' properties.
+ // Create all of search_spec, result_spec and scoring_spec as objects with
+ // scope that will end before the call to GetNextPage to ensure that the
+ // implementation isn't relying on references to any of them.
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("hello");
+ search_spec->set_search_type(GetParam());
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+ // Retrieve only one result at a time to make sure that projection works when
+ // retrieving all pages.
+ result_spec->set_num_per_page(1);
+ TypePropertyMask* email_field_mask = result_spec->add_type_property_masks();
+ email_field_mask->set_schema_type("Email");
+ email_field_mask->add_paths("sender.name");
+ email_field_mask->add_paths("subject");
+
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+
+ // 3. Verify that the first returned result only contains the 'sender.name'
+ // and 'subject' properties.
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty("sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .Build())
+ .AddStringProperty("subject", "Goodnight Moon!")
+ .Build();
+ EXPECT_THAT(results.results(0).document(),
+ EqualsProto(projected_document_two));
+
+ // 4. Now, delete all of the specs used in the search. GetNextPage should have
+ // no problem because it shouldn't be keeping any references to them.
+ search_spec.reset();
+ result_spec.reset();
+ scoring_spec.reset();
+
+ // 5. Verify that the second returned result only contains the 'sender.name'
+ // and 'subject' properties.
+ results = icing.GetNextPage(results.next_page_token());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty("sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .Build();
+ EXPECT_THAT(results.results(0).document(),
+ EqualsProto(projected_document_one));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFilters) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ // 1. Add two email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "hellogirl@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // 2. Issue a query with property filters of sender.name and subject for the
+ // Email schema type.
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("hello");
+ search_spec->set_search_type(GetParam());
+ TypePropertyMask* email_property_filters =
+ search_spec->add_type_property_filters();
+ email_property_filters->set_schema_type("Email");
+ email_property_filters->add_paths("sender.name");
+ email_property_filters->add_paths("subject");
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+
+ // 3. Verify that only the first document is returned. Although 'hello' is
+ // present in document_two, that document shouldn't be in the result since
+ // 'hello' only appears in its 'body' property, which is not in the
+ // specified property filter.
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_one));
+}
+
+TEST_P(IcingSearchEngineSearchTest, EmptySearchWithPropertyFilter) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ // 1. Add two email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "hellogirl@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // 2. Issue a query with a property filter
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("");
+ search_spec->set_search_type(GetParam());
+ TypePropertyMask* email_property_filters =
+ search_spec->add_type_property_filters();
+ email_property_filters->set_schema_type("Email");
+ email_property_filters->add_paths("subject");
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+
+ // 3. Verify that both documents are returned.
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(2));
+}
+
+TEST_P(IcingSearchEngineSearchTest, EmptySearchWithEmptyPropertyFilter) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ // 1. Add two email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "hellogirl@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // 2. Issue a query with a property filter
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("");
+ search_spec->set_search_type(GetParam());
+ TypePropertyMask* email_property_filters =
+ search_spec->add_type_property_filters();
+ // Add empty list for Email's property filters
+ email_property_filters->set_schema_type("Email");
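+ // With no add_paths() calls, Email's filter list stays empty. Since the
+ // query itself is empty, there are no terms to restrict, so both documents
+ // still match.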
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+
+ // 3. Verify that both documents are returned.
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(2));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFiltersOnMultipleSchema) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ // Add Person and Organization schema with a property 'name' in both.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Organization")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("address")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ // 1. Add person document
+ DocumentProto person_document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "hellogirl@aol.com")
+ .Build();
+ ASSERT_THAT(icing.Put(person_document).status(), ProtoIsOk());
+
+ // 2. Add organization document
+ DocumentProto organization_document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Organization")
+ .AddStringProperty("name", "Meg Corp")
+ .AddStringProperty("address", "Universal street")
+ .Build();
+ ASSERT_THAT(icing.Put(organization_document).status(), ProtoIsOk());
+
+ // 3. Issue a query with property filters. The Person schema has 'name' in
+ // its property filter but the Organization schema doesn't.
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("Meg");
+ search_spec->set_search_type(GetParam());
+ TypePropertyMask* person_property_filters =
+ search_spec->add_type_property_filters();
+ person_property_filters->set_schema_type("Person");
+ person_property_filters->add_paths("name");
+ TypePropertyMask* organization_property_filters =
+ search_spec->add_type_property_filters();
+ organization_property_filters->set_schema_type("Organization");
+ organization_property_filters->add_paths("address");
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+
+ // 4. Verify that only the person document is returned. Although 'Meg' is
+ // present in the organization document, it shouldn't be in the result since
+ // the 'name' field is not specified in the Organization property filter.
+ EXPECT_THAT(results.results(0).document(), EqualsProto(person_document));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchWithWildcardPropertyFilters) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ // 1. Add two email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "hellogirl@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // 2. Issue a query with property filters of sender.name and subject for the
+ // wildcard(*) schema type.
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("hello");
+ search_spec->set_search_type(GetParam());
+ TypePropertyMask* wildcard_property_filters =
+ search_spec->add_type_property_filters();
+ wildcard_property_filters->set_schema_type("*");
+ wildcard_property_filters->add_paths("sender.name");
+ wildcard_property_filters->add_paths("subject");
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+
+ // 3. Verify that only the first document is returned since the second
+ // document doesn't contain the word 'hello' in either of the fields
+ // specified in the property filter. This confirms that the property filters
+ // for the wildcard entry have been applied to the Email schema as well.
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_one));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchWithMixedPropertyFilters) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ // 1. Add two email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "hellogirl@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // 2. Issue a query with property filters of sender.name and subject for the
+ // wildcard(*) schema type plus property filters of sender.name and body for
+ // the Email schema type.
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("hello");
+ search_spec->set_search_type(GetParam());
+ TypePropertyMask* wildcard_property_filters =
+ search_spec->add_type_property_filters();
+ wildcard_property_filters->set_schema_type("*");
+ wildcard_property_filters->add_paths("sender.name");
+ wildcard_property_filters->add_paths("subject");
+ TypePropertyMask* email_property_filters =
+ search_spec->add_type_property_filters();
+ email_property_filters->set_schema_type("Email");
+ email_property_filters->add_paths("sender.name");
+ email_property_filters->add_paths("body");
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+
+ // 3. Verify that only the second document is returned since the first
+ // document doesn't contain the word 'hello' in either of the fields
+ // sender.name or body. This confirms that the property filters specified
+ // for the Email schema have been applied and the ones specified for the
+ // wildcard entry have been ignored.
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_two));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchWithNonApplicablePropertyFilters) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ // 1. Add two email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "hellogirl@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // 2. Issue a query with property filters of sender.name and subject for an
+ // unknown schema type.
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("hello");
+ search_spec->set_search_type(GetParam());
+ TypePropertyMask* email_property_filters =
+ search_spec->add_type_property_filters();
+ email_property_filters->set_schema_type("unknown");
+ email_property_filters->add_paths("sender.name");
+ email_property_filters->add_paths("subject");
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(2));
+
+ // 3. Verify that both documents are returned since each of them has the
+ // word 'hello' in at least one property. The second document being returned
+ // confirms that the body field was searched and the specified property
+ // filters were not applied to the Email schema type.
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_two));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document_one));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchWithEmptyPropertyFilter) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // 1. Add one message document
+ DocumentProto document_one = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Message")
+ .AddStringProperty("body", "Hello World!")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ // 2. Issue a query with empty property filter for Message schema.
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("hello");
+ search_spec->set_search_type(GetParam());
+ TypePropertyMask* message_property_filters =
+ search_spec->add_type_property_filters();
+ message_property_filters->set_schema_type("Message");
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+
+ // 3. Verify that no documents are returned. Although 'hello' is present in
+ // the indexed document, it shouldn't be returned since the Message property
+ // filter doesn't allow any properties to be searched.
+ ASSERT_THAT(results.results(), IsEmpty());
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchWithPropertyFilterHavingInvalidProperty) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // 1. Add one message document
+ DocumentProto document_one = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Message")
+ .AddStringProperty("body", "Hello World!")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ // 2. Issue a query with property filter having invalid/unknown property for
+ // Message schema.
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("hello");
+ search_spec->set_search_type(GetParam());
+ TypePropertyMask* message_property_filters =
+ search_spec->add_type_property_filters();
+ message_property_filters->set_schema_type("Message");
+ message_property_filters->add_paths("unknown");
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+
+ // 3. Verify that no documents are returned. Although 'hello' is present in
+ // the indexed document, it shouldn't be returned since the Message property
+ // filter doesn't allow any valid properties to be searched. Any
+ // invalid/unknown properties specified in the property filters will be
+ // ignored while searching.
+ ASSERT_THAT(results.results(), IsEmpty());
+}
+
+TEST_P(IcingSearchEngineSearchTest, SearchWithPropertyFiltersWithNesting) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ // 1. Add two email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "hellogirl@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // 2. Issue a query with a property filter of sender.emailAddress for the
+ // Email schema type.
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("hello");
+ search_spec->set_search_type(GetParam());
+ TypePropertyMask* email_property_filters =
+ search_spec->add_type_property_filters();
+ email_property_filters->set_schema_type("Email");
+ email_property_filters->add_paths("sender.emailAddress");
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ *scoring_spec = GetDefaultScoringSpec();
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+
+ // 3. Verify that only the first document is returned since the second
+ // document doesn't contain the word 'hello' in sender.emailAddress. The first
+ // document being returned confirms that the nested property
+ // sender.emailAddress was actually searched.
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_one));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchWithPropertyFilter_RelevanceScoreUnaffectedByExcludedSectionHits) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ // 1. Add two email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Hello Ryan")
+ .AddStringProperty("emailAddress", "hello@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello Hello!")
+ .AddStringProperty("body", "hello1 hello2 hello3 hello4 hello5")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "world@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello Hello!")
+ .AddStringProperty("body", "one1 two2 three3 four4 five5")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // 2. Issue a query with a property filter
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("Hello");
+ search_spec->set_search_type(GetParam());
+ TypePropertyMask* email_property_filters =
+ search_spec->add_type_property_filters();
+ email_property_filters->set_schema_type("Email");
+ email_property_filters->add_paths("subject");
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+
+ // 3. Verify that both documents are returned and have equal relevance
+ // scores. Note that the total number of tokens must be equal in the two
+ // documents for the scores to match.
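+ // (Hedged rationale, assuming Icing's RELEVANCE_SCORE is BM25F-style and
+ // normalizes by document length: equal token counts keep the normalization
+ // terms identical, so the filtered-out body hits are the only variable.)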
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ scoring_spec->set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ ASSERT_THAT(results.results(), SizeIs(2));
+ EXPECT_THAT(results.results(0).score(), DoubleEq(results.results(1).score()));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ SearchWithPropertyFilter_ExcludingSectionsWithHitsLowersRelevanceScore) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ // 1. Add an email document
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Hello Ryan")
+ .AddStringProperty("emailAddress", "hello@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello Hello!")
+ .AddStringProperty("body", "hello hello hello hello hello")
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ // 2. Issue a query without property filter
+ auto search_spec = std::make_unique<SearchSpecProto>();
+ search_spec->set_term_match_type(TermMatchType::PREFIX);
+ search_spec->set_query("Hello");
+ search_spec->set_search_type(GetParam());
+
+ auto result_spec = std::make_unique<ResultSpecProto>();
+
+ // 3. Get the relevance score without property filter
+ auto scoring_spec = std::make_unique<ScoringSpecProto>();
+ scoring_spec->set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
+ SearchResultProto results =
+ icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ ASSERT_THAT(results.results(), SizeIs(1));
+ double original_relevance_score = results.results(0).score();
+
+ // 4. Relevance score with property filter should be lower
+ TypePropertyMask* email_property_filters =
+ search_spec->add_type_property_filters();
+ email_property_filters->set_schema_type("Email");
+ email_property_filters->add_paths("subject");
+ results = icing.Search(*search_spec, *scoring_spec, *result_spec);
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).score(), Lt(original_relevance_score));
+}
+
+TEST_P(IcingSearchEngineSearchTest, QueryStatsProtoTest) {
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(5);
+
+ // Set index merge size to 6 hits. This will cause document1, document2, and
+ // document3's hits to be merged into the main index, while document4 and
+ // document5's hits remain in the lite index.
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_index_merge_size(sizeof(TermIdHitPair::Value) * 6);
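+ // Hedged worked arithmetic (assuming each message document indexes two
+ // hits, one for "message" and one for "body"): after document3 the lite
+ // index holds 3 * 2 = 6 hits, the merge fires, and document4/document5's
+ // 4 hits stay in the lite index. That is what the expectations below of
+ // 3 main-index hits and 2 lite-index hits for the term "message" rely on.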
+
+ TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates and inserts 5 documents
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
+ DocumentProto document4 = CreateMessageDocument("namespace", "uri4");
+ DocumentProto document5 = CreateMessageDocument("namespace", "uri5");
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document5).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.add_namespace_filters("namespace");
+ search_spec.add_schema_type_filters(document1.schema());
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(2);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(64);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(3);
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+
+ // Searches and gets the first page, 2 results with 2 snippets
+ SearchResultProto search_result =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ ASSERT_THAT(search_result.status(), ProtoIsOk());
+ ASSERT_THAT(search_result.results(), SizeIs(2));
+ ASSERT_THAT(search_result.next_page_token(), Ne(kInvalidNextPageToken));
+
+ // Check the stats
+ // TODO(b/305098009): deprecate search-related flat fields in query_stats.
+ QueryStatsProto exp_stats;
+ exp_stats.set_query_length(7);
+ exp_stats.set_num_terms(1);
+ exp_stats.set_num_namespaces_filtered(1);
+ exp_stats.set_num_schema_types_filtered(1);
+ exp_stats.set_ranking_strategy(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+ exp_stats.set_is_first_page(true);
+ exp_stats.set_requested_page_size(2);
+ exp_stats.set_num_results_returned_current_page(2);
+ exp_stats.set_num_documents_scored(5);
+ exp_stats.set_num_results_with_snippets(2);
+ exp_stats.set_latency_ms(5);
+ exp_stats.set_parse_query_latency_ms(5);
+ exp_stats.set_scoring_latency_ms(5);
+ exp_stats.set_ranking_latency_ms(5);
+ exp_stats.set_document_retrieval_latency_ms(5);
+ exp_stats.set_lock_acquisition_latency_ms(5);
+ exp_stats.set_num_joined_results_returned_current_page(0);
+
+ QueryStatsProto::SearchStats* exp_parent_search_stats =
+ exp_stats.mutable_parent_search_stats();
+ exp_parent_search_stats->set_query_length(7);
+ exp_parent_search_stats->set_num_terms(1);
+ exp_parent_search_stats->set_num_namespaces_filtered(1);
+ exp_parent_search_stats->set_num_schema_types_filtered(1);
+ exp_parent_search_stats->set_ranking_strategy(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+ exp_parent_search_stats->set_num_documents_scored(5);
+ exp_parent_search_stats->set_parse_query_latency_ms(5);
+ exp_parent_search_stats->set_scoring_latency_ms(5);
+ exp_parent_search_stats->set_num_fetched_hits_lite_index(2);
+ exp_parent_search_stats->set_num_fetched_hits_main_index(3);
+ exp_parent_search_stats->set_num_fetched_hits_integer_index(0);
+
+ EXPECT_THAT(search_result.query_stats(), EqualsProto(exp_stats));
+
+ // Second page, 2 results with 1 snippet
+ search_result = icing.GetNextPage(search_result.next_page_token());
+ ASSERT_THAT(search_result.status(), ProtoIsOk());
+ ASSERT_THAT(search_result.results(), SizeIs(2));
+ ASSERT_THAT(search_result.next_page_token(), Gt(kInvalidNextPageToken));
+
+ exp_stats = QueryStatsProto();
+ exp_stats.set_is_first_page(false);
+ exp_stats.set_requested_page_size(2);
+ exp_stats.set_num_results_returned_current_page(2);
+ exp_stats.set_num_results_with_snippets(1);
+ exp_stats.set_latency_ms(5);
+ exp_stats.set_document_retrieval_latency_ms(5);
+ exp_stats.set_lock_acquisition_latency_ms(5);
+ exp_stats.set_num_joined_results_returned_current_page(0);
+ EXPECT_THAT(search_result.query_stats(), EqualsProto(exp_stats));
+
+ // Third page, 1 result with 0 snippets
+ search_result = icing.GetNextPage(search_result.next_page_token());
+ ASSERT_THAT(search_result.status(), ProtoIsOk());
+ ASSERT_THAT(search_result.results(), SizeIs(1));
+ ASSERT_THAT(search_result.next_page_token(), Eq(kInvalidNextPageToken));
+
+ exp_stats = QueryStatsProto();
+ exp_stats.set_is_first_page(false);
+ exp_stats.set_requested_page_size(2);
+ exp_stats.set_num_results_returned_current_page(1);
+ exp_stats.set_num_results_with_snippets(0);
+ exp_stats.set_latency_ms(5);
+ exp_stats.set_document_retrieval_latency_ms(5);
+ exp_stats.set_lock_acquisition_latency_ms(5);
+ exp_stats.set_num_joined_results_returned_current_page(0);
+ EXPECT_THAT(search_result.query_stats(), EqualsProto(exp_stats));
+}
+
+TEST_P(IcingSearchEngineSearchTest, JoinQueryStatsProtoTest) {
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(5);
+
+ // Set index merge size to 13 hits. This will cause person1, person2, email1,
+ // email2, and email3's hits to be merged into the main index, while person3
+ // and email4's hits remain in the lite index.
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_index_merge_size(sizeof(TermIdHitPair::Value) * 13);
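+ // Hedged worked arithmetic (assuming 2 indexed hits per Person, firstName
+ // and lastName, since emailAddress has no indexing config here, and 3 hits
+ // per Email subject "test subject N"): person1, person2, and email1..email3
+ // contribute 2*2 + 3*3 = 13 hits, so the merge fires there and person3
+ // (2 hits) plus email4 (3 hits) stay in the lite index, matching the
+ // 2-main/1-lite parent and 3-main/1-lite child fetched-hit stats below.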
+
+ TestIcingSearchEngine icing(options, std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("firstName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("lastName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("personQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ DocumentProto person1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first1")
+ .AddStringProperty("lastName", "last1")
+ .AddStringProperty("emailAddress", "email1@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(1)
+ .Build();
+ DocumentProto person2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first2")
+ .AddStringProperty("lastName", "last2")
+ .AddStringProperty("emailAddress", "email2@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(2)
+ .Build();
+ DocumentProto person3 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person3")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first3")
+ .AddStringProperty("lastName", "last3")
+ .AddStringProperty("emailAddress", "email3@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(3)
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(3)
+ .Build();
+ DocumentProto email2 =
+ DocumentBuilder()
+ .SetKey("namespace", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 2")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(2)
+ .Build();
+ DocumentProto email3 =
+ DocumentBuilder()
+ .SetKey("namespace", "email3")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 3")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(1)
+ .Build();
+ DocumentProto email4 =
+ DocumentBuilder()
+ .SetKey("namespace", "email4")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 4")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(0)
+ .Build();
+
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email4).status(), ProtoIsOk());
+
+ // Parent SearchSpec
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("firstName:first");
+ search_spec.set_search_type(GetParam());
+
+ // JoinSpec
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("personQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::PREFIX);
+ nested_search_spec->set_query("subject:test");
+ nested_search_spec->set_search_type(GetParam());
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ // Parent ScoringSpec
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::JOIN_AGGREGATE_SCORE);
+ scoring_spec.set_order_by(ScoringSpecProto::Order::DESC);
+
+ // Parent ResultSpec
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ // Since we:
+ // - Use COUNT for aggregation scoring strategy.
+ // - (Default) use DOCUMENT_SCORE to score child documents.
+ // - (Default) use DESC as the ranking order.
+ //
+ // person1 with [email1, email2, email4] should have the highest aggregated
+ // score (3) and be returned first. person2 with [email3] (aggregated score =
+ // 1) should be the second, and person3 with no child (aggregated score = 0)
+ // should be the last.
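+ //
+ // Worked restatement of COUNT aggregation: score(person1) =
+ // |{email1, email2, email4}| = 3, score(person2) = |{email3}| = 1,
+ // score(person3) = |{}| = 0, which fixes the DESC page order below.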
+ SearchResultProto expected_result1;
+ expected_result1.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto1 =
+ expected_result1.mutable_results()->Add();
+ *result_proto1->mutable_document() = person1;
+ *result_proto1->mutable_joined_results()->Add()->mutable_document() = email1;
+ *result_proto1->mutable_joined_results()->Add()->mutable_document() = email2;
+ *result_proto1->mutable_joined_results()->Add()->mutable_document() = email4;
+
+ SearchResultProto expected_result2;
+ expected_result2.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto2 =
+ expected_result2.mutable_results()->Add();
+ *result_proto2->mutable_document() = person2;
+ *result_proto2->mutable_joined_results()->Add()->mutable_document() = email3;
+
+ SearchResultProto expected_result3;
+ expected_result3.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto3 =
+ expected_result3.mutable_results()->Add();
+ *result_proto3->mutable_document() = person3;
+
+ SearchResultProto search_result =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ uint64_t next_page_token = search_result.next_page_token();
+ EXPECT_THAT(next_page_token, Ne(kInvalidNextPageToken));
+ expected_result1.set_next_page_token(next_page_token);
+ ASSERT_THAT(search_result,
+ EqualsSearchResultIgnoreStatsAndScores(expected_result1));
+
+ // Check the stats
+ // TODO(b/305098009): deprecate search-related flat fields in query_stats.
+ QueryStatsProto exp_stats;
+ exp_stats.set_query_length(15);
+ exp_stats.set_num_terms(1);
+ exp_stats.set_num_namespaces_filtered(0);
+ exp_stats.set_num_schema_types_filtered(0);
+ exp_stats.set_ranking_strategy(
+ ScoringSpecProto::RankingStrategy::JOIN_AGGREGATE_SCORE);
+ exp_stats.set_is_first_page(true);
+ exp_stats.set_requested_page_size(1);
+ exp_stats.set_num_results_returned_current_page(1);
+ exp_stats.set_num_documents_scored(3);
+ exp_stats.set_num_results_with_snippets(0);
+ exp_stats.set_latency_ms(5);
+ exp_stats.set_parse_query_latency_ms(5);
+ exp_stats.set_scoring_latency_ms(5);
+ exp_stats.set_ranking_latency_ms(5);
+ exp_stats.set_document_retrieval_latency_ms(5);
+ exp_stats.set_lock_acquisition_latency_ms(5);
+ exp_stats.set_num_joined_results_returned_current_page(3);
+ exp_stats.set_join_latency_ms(5);
+ exp_stats.set_is_join_query(true);
+
+ QueryStatsProto::SearchStats* exp_parent_search_stats =
+ exp_stats.mutable_parent_search_stats();
+ exp_parent_search_stats->set_query_length(15);
+ exp_parent_search_stats->set_num_terms(1);
+ exp_parent_search_stats->set_num_namespaces_filtered(0);
+ exp_parent_search_stats->set_num_schema_types_filtered(0);
+ exp_parent_search_stats->set_ranking_strategy(
+ ScoringSpecProto::RankingStrategy::JOIN_AGGREGATE_SCORE);
+ exp_parent_search_stats->set_num_documents_scored(3);
+ exp_parent_search_stats->set_parse_query_latency_ms(5);
+ exp_parent_search_stats->set_scoring_latency_ms(5);
+ exp_parent_search_stats->set_num_fetched_hits_lite_index(1);
+ exp_parent_search_stats->set_num_fetched_hits_main_index(2);
+ exp_parent_search_stats->set_num_fetched_hits_integer_index(0);
+
+ QueryStatsProto::SearchStats* exp_child_search_stats =
+ exp_stats.mutable_child_search_stats();
+ exp_child_search_stats->set_query_length(12);
+ exp_child_search_stats->set_num_terms(1);
+ exp_child_search_stats->set_num_namespaces_filtered(0);
+ exp_child_search_stats->set_num_schema_types_filtered(0);
+ exp_child_search_stats->set_ranking_strategy(
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ exp_child_search_stats->set_num_documents_scored(4);
+ exp_child_search_stats->set_parse_query_latency_ms(5);
+ exp_child_search_stats->set_scoring_latency_ms(5);
+ exp_child_search_stats->set_num_fetched_hits_lite_index(1);
+ exp_child_search_stats->set_num_fetched_hits_main_index(3);
+ exp_child_search_stats->set_num_fetched_hits_integer_index(0);
+
+ EXPECT_THAT(search_result.query_stats(), EqualsProto(exp_stats));
+
+ // Second page, 1 child doc.
+ search_result = icing.GetNextPage(next_page_token);
+ next_page_token = search_result.next_page_token();
+ EXPECT_THAT(next_page_token, Ne(kInvalidNextPageToken));
+ expected_result2.set_next_page_token(next_page_token);
+ EXPECT_THAT(search_result,
+ EqualsSearchResultIgnoreStatsAndScores(expected_result2));
+
+ exp_stats = QueryStatsProto();
+ exp_stats.set_is_first_page(false);
+ exp_stats.set_requested_page_size(1);
+ exp_stats.set_num_results_returned_current_page(1);
+ exp_stats.set_num_results_with_snippets(0);
+ exp_stats.set_latency_ms(5);
+ exp_stats.set_document_retrieval_latency_ms(5);
+ exp_stats.set_lock_acquisition_latency_ms(5);
+ exp_stats.set_num_joined_results_returned_current_page(1);
+ EXPECT_THAT(search_result.query_stats(), EqualsProto(exp_stats));
+
+ // Third page, 0 child docs.
+ search_result = icing.GetNextPage(next_page_token);
+ next_page_token = search_result.next_page_token();
+ ASSERT_THAT(search_result.status(), ProtoIsOk());
+ ASSERT_THAT(search_result.results(), SizeIs(1));
+ ASSERT_THAT(search_result.next_page_token(), Eq(kInvalidNextPageToken));
+
+ exp_stats = QueryStatsProto();
+ exp_stats.set_is_first_page(false);
+ exp_stats.set_requested_page_size(1);
+ exp_stats.set_num_results_returned_current_page(1);
+ exp_stats.set_num_joined_results_returned_current_page(0);
+ exp_stats.set_latency_ms(5);
+ exp_stats.set_document_retrieval_latency_ms(5);
+ exp_stats.set_lock_acquisition_latency_ms(5);
+ exp_stats.set_num_results_with_snippets(0);
+ ASSERT_THAT(search_result,
+ EqualsSearchResultIgnoreStatsAndScores(expected_result3));
+ EXPECT_THAT(search_result.query_stats(), EqualsProto(exp_stats));
+
+ ASSERT_THAT(search_result.next_page_token(), Eq(kInvalidNextPageToken));
+
+ search_result = icing.GetNextPage(search_result.next_page_token());
+ ASSERT_THAT(search_result.status(), ProtoIsOk());
+ ASSERT_THAT(search_result.results(), IsEmpty());
+ ASSERT_THAT(search_result.next_page_token(), Eq(kInvalidNextPageToken));
+
+ exp_stats = QueryStatsProto();
+ exp_stats.set_is_first_page(false);
+ exp_stats.set_lock_acquisition_latency_ms(5);
+ EXPECT_THAT(search_result.query_stats(), EqualsProto(exp_stats));
+}
+
+TEST_P(IcingSearchEngineSearchTest, SnippetErrorTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Generic").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetScore(10)
+ .SetSchema("Generic")
+ .AddStringProperty("subject", "I like cats", "I like dogs",
+ "I like birds", "I like fish")
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetScore(20)
+ .SetSchema("Generic")
+ .AddStringProperty("subject", "I like red", "I like green",
+ "I like blue", "I like yellow")
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetScore(5)
+ .SetSchema("Generic")
+ .AddStringProperty("subject", "I like cupcakes", "I like donuts",
+ "I like eclairs", "I like froyo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.add_schema_type_filters("Generic");
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_query("like");
+ search_spec.set_search_type(GetParam());
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(3);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(4);
+ SearchResultProto search_results =
+ icing.Search(search_spec, scoring_spec, result_spec);
+
+ ASSERT_THAT(search_results.results(), SizeIs(3));
+ const SearchResultProto::ResultProto* result = &search_results.results(0);
+ EXPECT_THAT(result->document().uri(), Eq("uri2"));
+ ASSERT_THAT(result->snippet().entries(), SizeIs(3));
+ const SnippetProto::EntryProto* entry = &result->snippet().entries(0);
+ EXPECT_THAT(entry->property_name(), "subject[0]");
+ std::string_view content = GetString(&result->document(), "subject[0]");
+ EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like"));
+
+ entry = &result->snippet().entries(1);
+ EXPECT_THAT(entry->property_name(), "subject[1]");
+ content = GetString(&result->document(), "subject[1]");
+ EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like"));
+
+ entry = &result->snippet().entries(2);
+ EXPECT_THAT(entry->property_name(), "subject[2]");
+ content = GetString(&result->document(), "subject[2]");
+ EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like"));
+
+ result = &search_results.results(1);
+ EXPECT_THAT(result->document().uri(), Eq("uri1"));
+ ASSERT_THAT(result->snippet().entries(), SizeIs(3));
+ entry = &result->snippet().entries(0);
+ EXPECT_THAT(entry->property_name(), "subject[0]");
+ content = GetString(&result->document(), "subject[0]");
+ EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like"));
+
+ entry = &result->snippet().entries(1);
+ ASSERT_THAT(entry->property_name(), "subject[1]");
+ content = GetString(&result->document(), "subject[1]");
+ EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like"));
+
+ entry = &result->snippet().entries(2);
+ ASSERT_THAT(entry->property_name(), "subject[2]");
+ content = GetString(&result->document(), "subject[2]");
+ EXPECT_THAT(GetMatches(content, *entry), ElementsAre("like"));
+
+ result = &search_results.results(2);
+ ASSERT_THAT(result->document().uri(), Eq("uri3"));
+ ASSERT_THAT(result->snippet().entries(), IsEmpty());
+}
+
+TEST_P(IcingSearchEngineSearchTest, CJKSnippetTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF8 idx: 0 3 9 15 18
+ // UTF16 idx: 0 1 3 5 6
+ // Breaks into segments: "我", "每天", "走路", "去", "上班"
+ constexpr std::string_view kChinese = "我每天走路去上班。";
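+ // Hedged illustration (assumes C++17 constexpr std::string_view): every
+ // character above is 3 UTF-8 bytes, so the segment "走路" spans bytes
+ // [9, 15), exactly the byte offsets the snippet match below must report.
+ static_assert(std::string_view("走").size() == 3,
+ "CJK characters here are 3 UTF-8 bytes each");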
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kChinese)
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ // Search and request snippet matching but no windowing.
+ SearchSpecProto search_spec;
+ search_spec.set_query("走");
+ search_spec.set_term_match_type(TERM_MATCH_PREFIX);
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(
+ std::numeric_limits<int>::max());
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(
+ std::numeric_limits<int>::max());
+
+ // Search and make sure that we got a single successful result
+ SearchResultProto search_results = icing.Search(
+ search_spec, ScoringSpecProto::default_instance(), result_spec);
+ ASSERT_THAT(search_results.status(), ProtoIsOk());
+ ASSERT_THAT(search_results.results(), SizeIs(1));
+ const SearchResultProto::ResultProto* result = &search_results.results(0);
+ EXPECT_THAT(result->document().uri(), Eq("uri1"));
+
+ // Ensure that one and only one property was matched and it was "body"
+ ASSERT_THAT(result->snippet().entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &result->snippet().entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("body"));
+
+ // Get the content for "body" and see what the match is.
+ std::string_view content = GetString(&result->document(), "body");
+ ASSERT_THAT(content, Eq(kChinese));
+
+ // Ensure that there is one and only one match within "body"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ EXPECT_THAT(match_proto.exact_match_byte_position(), Eq(9));
+ EXPECT_THAT(match_proto.exact_match_byte_length(), Eq(6));
+ std::string_view match =
+ content.substr(match_proto.exact_match_byte_position(),
+ match_proto.exact_match_byte_length());
+ ASSERT_THAT(match, Eq("走路"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
+ EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
+}
+
+TEST_P(IcingSearchEngineSearchTest, InvalidToEmptyQueryTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // String: "Luca Brasi sleeps with the 🐟🐟🐟."
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF8 idx: 0 5 11 18 23 27 31 35 39
+ // UTF16 idx: 0 5 11 18 23 27 29 31 33
+ // Breaks into segments: "Luca", "Brasi", "sleeps", "with", "the", "🐟", "🐟"
+ // and "🐟".
+ constexpr std::string_view kSicilianMessage =
+ "Luca Brasi sleeps with the 🐟🐟🐟.";
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kSicilianMessage)
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "Some other content.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // Search and request snippet matching but no windowing.
+ SearchSpecProto search_spec;
+ search_spec.set_query("?");
+ search_spec.set_term_match_type(TERM_MATCH_PREFIX);
+ search_spec.set_search_type(GetParam());
+ ScoringSpecProto scoring_spec;
+ ResultSpecProto result_spec;
+
+ // Search and make sure that we got a single successful result
+ SearchResultProto search_results =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ if (GetParam() ==
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ // This is the actual correct behavior.
+ EXPECT_THAT(search_results.results(), IsEmpty());
+ } else {
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+ }
+
+ search_spec.set_query("。");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ if (GetParam() ==
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ // This is the actual correct behavior.
+ EXPECT_THAT(search_results.results(), IsEmpty());
+ } else {
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+ }
+
+ search_spec.set_query("-");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ if (GetParam() ==
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ // This is the actual correct behavior.
+ EXPECT_THAT(search_results.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+ } else {
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+ }
+
+ search_spec.set_query(":");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ if (GetParam() ==
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ // This is the actual correct behavior.
+ EXPECT_THAT(search_results.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+ } else {
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+ }
+
+ search_spec.set_query("OR");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ if (GetParam() ==
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ EXPECT_THAT(search_results.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+ } else {
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+ }
+
+ search_spec.set_query(" ");
+ search_results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(search_results.status(), ProtoIsOk());
+ EXPECT_THAT(search_results.results(), SizeIs(2));
+}
+
+TEST_P(IcingSearchEngineSearchTest, EmojiSnippetTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // String: "Luca Brasi sleeps with the 🐟🐟🐟."
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF8 idx: 0 5 11 18 23 27 31 35 39
+ // UTF16 idx: 0 5 11 18 23 27 29 31 33
+ // Breaks into segments: "Luca", "Brasi", "sleeps", "with", "the", "🐟", "🐟"
+ // and "🐟".
+ constexpr std::string_view kSicilianMessage =
+ "Luca Brasi sleeps with the 🐟🐟🐟.";
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kSicilianMessage)
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Message")
+ .AddStringProperty("body", "Some other content.")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ // Search and request snippet matching but no windowing.
+ SearchSpecProto search_spec;
+ search_spec.set_query("🐟");
+ search_spec.set_term_match_type(TERM_MATCH_PREFIX);
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(1);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
+
+ // Search and make sure that we got a single successful result
+ SearchResultProto search_results = icing.Search(
+ search_spec, ScoringSpecProto::default_instance(), result_spec);
+ ASSERT_THAT(search_results.status(), ProtoIsOk());
+ ASSERT_THAT(search_results.results(), SizeIs(1));
+ const SearchResultProto::ResultProto* result = &search_results.results(0);
+ EXPECT_THAT(result->document().uri(), Eq("uri1"));
+
+ // Ensure that one and only one property was matched and it was "body"
+ ASSERT_THAT(result->snippet().entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &result->snippet().entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("body"));
+
+ // Get the content for "body" and see what the match is.
+ std::string_view content = GetString(&result->document(), "body");
+ ASSERT_THAT(content, Eq(kSicilianMessage));
+
+ // Ensure that there is one and only one match within "body"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ EXPECT_THAT(match_proto.exact_match_byte_position(), Eq(27));
+ EXPECT_THAT(match_proto.exact_match_byte_length(), Eq(4));
+ std::string_view match =
+ content.substr(match_proto.exact_match_byte_position(),
+ match_proto.exact_match_byte_length());
+ ASSERT_THAT(match, Eq("🐟"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(27));
+ EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
+}
+
+TEST_P(IcingSearchEngineSearchTest, JoinByQualifiedId) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("firstName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("lastName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("personQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ DocumentProto person1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first1")
+ .AddStringProperty("lastName", "last1")
+ .AddStringProperty("emailAddress", "email1@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(1)
+ .Build();
+ DocumentProto person2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first2")
+ .AddStringProperty("lastName", "last2")
+ .AddStringProperty("emailAddress", "email2@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(2)
+ .Build();
+ DocumentProto person3 =
+ DocumentBuilder()
+ .SetKey(R"(pkg$db/name#space\\)", "person3")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first3")
+ .AddStringProperty("lastName", "last3")
+ .AddStringProperty("emailAddress", "email3@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(3)
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(3)
+ .Build();
+ DocumentProto email2 =
+ DocumentBuilder()
+ .SetKey("namespace", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 2")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(2)
+ .Build();
+ DocumentProto email3 =
+ DocumentBuilder()
+ .SetKey("namespace", "email3")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 3")
+ .AddStringProperty("personQualifiedId",
+ R"(pkg$db/name\#space\\\\#person3)") // escaped
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(1)
+ .Build();
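+ // Hedged note on the escaping above: '#' separates namespace from uri in a
+ // qualified id, so a literal '#' or '\' inside the namespace must be
+ // escaped. Unescaping email3's personQualifiedId ("\#" -> "#", "\\" -> "\")
+ // yields the namespace pkg$db/name#space\\ (two literal trailing
+ // backslashes) plus uri person3, which matches person3's key exactly.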
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email3).status(), ProtoIsOk());
+
+ // Parent SearchSpec
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("firstName:first");
+ search_spec.set_search_type(GetParam());
+
+ // JoinSpec
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("personQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::MAX);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::PREFIX);
+ nested_search_spec->set_query("subject:test");
+ nested_search_spec->set_search_type(GetParam());
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ // Parent ScoringSpec
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+
+ // Parent ResultSpec
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ // Since we:
+ // - Use MAX for aggregation scoring strategy.
+ // - (Default) use DOCUMENT_SCORE to score child documents.
+ // - (Default) use DESC as the ranking order.
+ //
+ // person1 + email1 should have the highest aggregated score (3) and be
+ // returned first. person2 + email2 (aggregated score = 2) should be the
+ // second, and person3 + email3 (aggregated score = 1) should be the last.
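+ //
+ // Worked restatement of MAX aggregation: score(person1) = max{3} = 3,
+ // score(person2) = max{2} = 2, score(person3) = max{1} = 1, each parent
+ // taking its highest-scoring joined child's document score.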
+ SearchResultProto expected_result1;
+ expected_result1.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto1 =
+ expected_result1.mutable_results()->Add();
+ *result_proto1->mutable_document() = person1;
+ *result_proto1->mutable_joined_results()->Add()->mutable_document() = email1;
+
+ SearchResultProto expected_result2;
+ expected_result2.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto2 =
+ expected_result2.mutable_results()->Add();
+ *result_proto2->mutable_document() = person2;
+ *result_proto2->mutable_joined_results()->Add()->mutable_document() = email2;
+
+ SearchResultProto expected_result3;
+ expected_result3.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto3 =
+ expected_result3.mutable_results()->Add();
+ *result_proto3->mutable_document() = person3;
+ *result_proto3->mutable_joined_results()->Add()->mutable_document() = email3;
+
+ SearchResultProto result1 =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ uint64_t next_page_token = result1.next_page_token();
+ EXPECT_THAT(next_page_token, Ne(kInvalidNextPageToken));
+ expected_result1.set_next_page_token(next_page_token);
+ EXPECT_THAT(result1,
+ EqualsSearchResultIgnoreStatsAndScores(expected_result1));
+
+ SearchResultProto result2 = icing.GetNextPage(next_page_token);
+ next_page_token = result2.next_page_token();
+ EXPECT_THAT(next_page_token, Ne(kInvalidNextPageToken));
+ expected_result2.set_next_page_token(next_page_token);
+ EXPECT_THAT(result2,
+ EqualsSearchResultIgnoreStatsAndScores(expected_result2));
+
+ SearchResultProto result3 = icing.GetNextPage(next_page_token);
+ next_page_token = result3.next_page_token();
+ EXPECT_THAT(next_page_token, Eq(kInvalidNextPageToken));
+ EXPECT_THAT(result3,
+ EqualsSearchResultIgnoreStatsAndScores(expected_result3));
+}
+
+TEST_P(IcingSearchEngineSearchTest, JoinByQualifiedIdMultipleNamespaces) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("firstName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("lastName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("personQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ DocumentProto person1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace1", "person")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first1")
+ .AddStringProperty("lastName", "last1")
+ .AddStringProperty("emailAddress", "email1@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(1)
+ .Build();
+ DocumentProto person2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace2", "person")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first2")
+ .AddStringProperty("lastName", "last2")
+ .AddStringProperty("emailAddress", "email2@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(2)
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace1#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(3)
+ .Build();
+ DocumentProto email2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 2")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace1#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(2)
+ .Build();
+ DocumentProto email3 =
+ DocumentBuilder()
+ .SetKey("namespace2", "email3")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 3")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace2#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(1)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email3).status(), ProtoIsOk());
+
+ // Parent SearchSpec
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("firstName:first");
+ search_spec.set_search_type(GetParam());
+
+ // JoinSpec
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("personQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::PREFIX);
+ nested_search_spec->set_query("subject:test");
+ nested_search_spec->set_search_type(GetParam());
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ // Parent ScoringSpec
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+
+ // Parent ResultSpec
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ // Since we:
+ // - Use COUNT for aggregation scoring strategy.
+ // - (Default) use DESC as the ranking order.
+ //
+ // pkg$db/namespace1#person + email1, email2 should have the highest
+ // aggregated score (2) and be returned first. pkg$db/namespace2#person +
+ // email3 (aggregated score = 1) should be the second.
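+ //
+ // Worked joining detail: email1 and email2 both reference
+ // "pkg$db/namespace1#person" even though they live in different namespaces
+ // themselves; the join key is the personQualifiedId value, not the child's
+ // own namespace, so COUNT(person1) = 2 and COUNT(person2) = 1.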
+ SearchResultProto expected_result1;
+ expected_result1.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto1 =
+ expected_result1.mutable_results()->Add();
+ *result_proto1->mutable_document() = person1;
+ *result_proto1->mutable_joined_results()->Add()->mutable_document() = email1;
+ *result_proto1->mutable_joined_results()->Add()->mutable_document() = email2;
+
+ SearchResultProto expected_result2;
+ expected_result2.mutable_status()->set_code(StatusProto::OK);
+ SearchResultProto::ResultProto* result_proto2 =
+ expected_result2.mutable_results()->Add();
+ *result_proto2->mutable_document() = person2;
+ *result_proto2->mutable_joined_results()->Add()->mutable_document() = email3;
+
+ SearchResultProto result1 =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ uint64_t next_page_token = result1.next_page_token();
+ EXPECT_THAT(next_page_token, Ne(kInvalidNextPageToken));
+ expected_result1.set_next_page_token(next_page_token);
+ EXPECT_THAT(result1,
+ EqualsSearchResultIgnoreStatsAndScores(expected_result1));
+
+ SearchResultProto result2 = icing.GetNextPage(next_page_token);
+ next_page_token = result2.next_page_token();
+ EXPECT_THAT(next_page_token, Eq(kInvalidNextPageToken));
+ EXPECT_THAT(result2,
+ EqualsSearchResultIgnoreStatsAndScores(expected_result2));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ JoinShouldLimitNumChildDocumentsByMaxJoinedChildPerParent) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("firstName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("lastName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("personQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ DocumentProto person1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first1")
+ .AddStringProperty("lastName", "last1")
+ .AddStringProperty("emailAddress", "email1@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(1)
+ .Build();
+ DocumentProto person2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first2")
+ .AddStringProperty("lastName", "last2")
+ .AddStringProperty("emailAddress", "email2@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(2)
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(100)
+ .Build();
+ DocumentProto email2 =
+ DocumentBuilder()
+ .SetKey("namespace", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 2")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(99)
+ .Build();
+ DocumentProto email3 =
+ DocumentBuilder()
+ .SetKey("namespace", "email3")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 3")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(98)
+ .Build();
+ DocumentProto email4 =
+ DocumentBuilder()
+ .SetKey("namespace", "email4")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 4")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(97)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email4).status(), ProtoIsOk());
+
+ // Parent SearchSpec
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("firstName:first");
+ search_spec.set_search_type(GetParam());
+
+ // JoinSpec
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("personQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::PREFIX);
+ nested_search_spec->set_query("subject:test");
+ nested_search_spec->set_search_type(GetParam());
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ // Parent ScoringSpec
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+
+ // Parent ResultSpec with max_joined_children_per_parent_to_return = 2
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+ result_spec.set_max_joined_children_per_parent_to_return(2);
+
+ // - Use COUNT for aggregation scoring strategy.
+ // - max_joined_children_per_parent_to_return = 2.
+ // - (Default) use DESC as the ranking order.
+ //
+  // person2 should have the highest aggregated score (3) since email2, email3,
+  // and email4 are joined to it, giving a COUNT aggregated score of 3.
+  // However, only email2 and email3 should be attached to person2 due to the
+  // max_joined_children_per_parent_to_return limit in result_spec.
+  // person1 should rank second (aggregated score = 1).
+ SearchResultProto::ResultProto expected_result_proto1;
+ *expected_result_proto1.mutable_document() = person2;
+ expected_result_proto1.set_score(3);
+ SearchResultProto::ResultProto* child_result_proto1 =
+ expected_result_proto1.mutable_joined_results()->Add();
+ *child_result_proto1->mutable_document() = email2;
+ child_result_proto1->set_score(99);
+  SearchResultProto::ResultProto* child_result_proto2 =
+      expected_result_proto1.mutable_joined_results()->Add();
+  *child_result_proto2->mutable_document() = email3;
+  child_result_proto2->set_score(98);
+
+  SearchResultProto::ResultProto expected_result_proto2;
+  *expected_result_proto2.mutable_document() = person1;
+  expected_result_proto2.set_score(1);
+  SearchResultProto::ResultProto* child_result_proto3 =
+      expected_result_proto2.mutable_joined_results()->Add();
+ *child_result_proto3->mutable_document() = email1;
+ child_result_proto3->set_score(100);
+
+ SearchResultProto result1 =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ uint64_t next_page_token = result1.next_page_token();
+ EXPECT_THAT(next_page_token, Ne(kInvalidNextPageToken));
+ EXPECT_THAT(result1.results(),
+ ElementsAre(EqualsProto(expected_result_proto1)));
+
+ SearchResultProto result2 = icing.GetNextPage(next_page_token);
+ next_page_token = result2.next_page_token();
+ EXPECT_THAT(next_page_token, Eq(kInvalidNextPageToken));
+ EXPECT_THAT(result2.results(),
+              ElementsAre(EqualsProto(expected_result_proto2)));
+}
+
+TEST_P(IcingSearchEngineSearchTest, JoinWithZeroMaxJoinedChildPerParent) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("firstName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("lastName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("personQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ DocumentProto person1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first1")
+ .AddStringProperty("lastName", "last1")
+ .AddStringProperty("emailAddress", "email1@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(1)
+ .Build();
+ DocumentProto person2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first2")
+ .AddStringProperty("lastName", "last2")
+ .AddStringProperty("emailAddress", "email2@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(2)
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(100)
+ .Build();
+ DocumentProto email2 =
+ DocumentBuilder()
+ .SetKey("namespace", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 2")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(99)
+ .Build();
+ DocumentProto email3 =
+ DocumentBuilder()
+ .SetKey("namespace", "email3")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 3")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(98)
+ .Build();
+ DocumentProto email4 =
+ DocumentBuilder()
+ .SetKey("namespace", "email4")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 4")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(97)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email4).status(), ProtoIsOk());
+
+ // Parent SearchSpec
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("firstName:first");
+ search_spec.set_search_type(GetParam());
+
+ // JoinSpec
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("personQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::PREFIX);
+ nested_search_spec->set_query("subject:test");
+ nested_search_spec->set_search_type(GetParam());
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ // Parent ScoringSpec
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+
+ // Parent ResultSpec with max_joined_children_per_parent_to_return = 0
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+ result_spec.set_max_joined_children_per_parent_to_return(0);
+
+ // - Use COUNT for aggregation scoring strategy.
+ // - max_joined_children_per_parent_to_return = 0.
+ // - (Default) use DESC as the ranking order.
+ //
+  // person2 should have the highest aggregated score (3) since email2, email3,
+  // and email4 are joined to it, giving a COUNT aggregated score of 3.
+  // However, no child documents should be attached to person2 due to the
+  // max_joined_children_per_parent_to_return limit in result_spec.
+  // person1 should rank second (aggregated score = 1), also with no attached
+  // child documents.
+ SearchResultProto::ResultProto expected_result_proto1;
+ *expected_result_proto1.mutable_document() = person2;
+ expected_result_proto1.set_score(3);
+
+  SearchResultProto::ResultProto expected_result_proto2;
+  *expected_result_proto2.mutable_document() = person1;
+  expected_result_proto2.set_score(1);
+
+ SearchResultProto result1 =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ uint64_t next_page_token = result1.next_page_token();
+ EXPECT_THAT(next_page_token, Ne(kInvalidNextPageToken));
+ EXPECT_THAT(result1.results(),
+ ElementsAre(EqualsProto(expected_result_proto1)));
+
+ SearchResultProto result2 = icing.GetNextPage(next_page_token);
+ next_page_token = result2.next_page_token();
+ EXPECT_THAT(next_page_token, Eq(kInvalidNextPageToken));
+ EXPECT_THAT(result2.results(),
+              ElementsAre(EqualsProto(expected_result_proto2)));
+}
+
+TEST_P(IcingSearchEngineSearchTest, JoinSnippet) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("firstName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("lastName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("personQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first")
+ .AddStringProperty("lastName", "last")
+ .AddStringProperty("emailAddress", "email@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(1)
+ .Build();
+
+ DocumentProto email =
+ DocumentBuilder()
+ .SetKey("namespace", "email")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(3)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email).status(), ProtoIsOk());
+
+ // Parent SearchSpec
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("firstName:first");
+ search_spec.set_search_type(GetParam());
+
+ // JoinSpec
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("personQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::MAX);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::PREFIX);
+ nested_search_spec->set_query("subject:test");
+ nested_search_spec->set_search_type(GetParam());
+ // Child ResultSpec (with snippet)
+ ResultSpecProto* nested_result_spec = nested_spec->mutable_result_spec();
+ nested_result_spec->mutable_snippet_spec()->set_max_window_utf32_length(64);
+ nested_result_spec->mutable_snippet_spec()->set_num_matches_per_property(1);
+ nested_result_spec->mutable_snippet_spec()->set_num_to_snippet(1);
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+
+ // Parent ScoringSpec
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+
+ // Parent ResultSpec (without snippet)
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
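+  // Snippeting is requested only on the child (Email) side; since the parent
+  // ResultSpec leaves its snippet spec unset, parent results should carry no
+  // snippet entries.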
+
+ SearchResultProto result =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(result.status(), ProtoIsOk());
+ EXPECT_THAT(result.next_page_token(), Eq(kInvalidNextPageToken));
+
+ ASSERT_THAT(result.results(), SizeIs(1));
+ // Check parent doc (person).
+ const DocumentProto& result_parent_document = result.results(0).document();
+ EXPECT_THAT(result_parent_document, EqualsProto(person));
+ EXPECT_THAT(result.results(0).snippet().entries(), IsEmpty());
+
+ // Check child doc (email).
+ ASSERT_THAT(result.results(0).joined_results(), SizeIs(1));
+ const DocumentProto& result_child_document =
+ result.results(0).joined_results(0).document();
+ const SnippetProto& result_child_snippet =
+ result.results(0).joined_results(0).snippet();
+ EXPECT_THAT(result_child_document, EqualsProto(email));
+ ASSERT_THAT(result_child_snippet.entries(), SizeIs(1));
+ EXPECT_THAT(result_child_snippet.entries(0).property_name(), Eq("subject"));
+ std::string_view content = GetString(
+ &result_child_document, result_child_snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_child_snippet.entries(0)),
+ ElementsAre("test subject"));
+ EXPECT_THAT(GetMatches(content, result_child_snippet.entries(0)),
+ ElementsAre("test"));
+}
+
+TEST_P(IcingSearchEngineSearchTest, JoinProjection) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("firstName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("lastName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("personQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ DocumentProto person =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first")
+ .AddStringProperty("lastName", "last")
+ .AddStringProperty("emailAddress", "email@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(1)
+ .Build();
+
+ DocumentProto email =
+ DocumentBuilder()
+ .SetKey("namespace", "email")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(3)
+ .Build();
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email).status(), ProtoIsOk());
+
+ // Parent SearchSpec
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("firstName:first");
+ search_spec.set_search_type(GetParam());
+
+ // JoinSpec
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("personQualifiedId");
+ join_spec->set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::MAX);
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::PREFIX);
+ nested_search_spec->set_query("subject:test");
+ nested_search_spec->set_search_type(GetParam());
+ // Child ResultSpec (with projection)
+ ResultSpecProto* nested_result_spec = nested_spec->mutable_result_spec();
+ TypePropertyMask* type_property_mask =
+ nested_result_spec->add_type_property_masks();
+ type_property_mask->set_schema_type("Email");
+ type_property_mask->add_paths("subject");
+ *nested_spec->mutable_scoring_spec() = GetDefaultScoringSpec();
+
+ // Parent ScoringSpec
+ ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
+
+ // Parent ResultSpec (with projection)
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+ type_property_mask = result_spec.add_type_property_masks();
+ type_property_mask->set_schema_type("Person");
+ type_property_mask->add_paths("emailAddress");
+
+ SearchResultProto result =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(result.status(), ProtoIsOk());
+ EXPECT_THAT(result.next_page_token(), Eq(kInvalidNextPageToken));
+
+ ASSERT_THAT(result.results(), SizeIs(1));
+ // Check parent doc (person): should contain only the "emailAddress" property.
+ DocumentProto projected_person_document =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person")
+ .SetSchema("Person")
+ .AddStringProperty("emailAddress", "email@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(1)
+ .Build();
+ EXPECT_THAT(result.results().at(0).document(),
+ EqualsProto(projected_person_document));
+
+ // Check child doc (email): should contain only the "subject" property.
+ ASSERT_THAT(result.results(0).joined_results(), SizeIs(1));
+ DocumentProto projected_email_document =
+ DocumentBuilder()
+ .SetKey("namespace", "email")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(3)
+ .Build();
+ EXPECT_THAT(result.results(0).joined_results(0).document(),
+ EqualsProto(projected_email_document));
+}
+
+TEST_F(IcingSearchEngineSearchTest, JoinWithAdvancedScoring) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("firstName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("lastName")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("personQualifiedId")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ const int32_t person1_doc_score = 10;
+ const int32_t person2_doc_score = 25;
+ const int32_t person3_doc_score = 123;
+ const int32_t email1_doc_score = 10;
+ const int32_t email2_doc_score = 15;
+ const int32_t email3_doc_score = 40;
+
+ // person1 has children email1 and email2.
+ DocumentProto person1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first1")
+ .AddStringProperty("lastName", "last1")
+ .AddStringProperty("emailAddress", "email1@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(person1_doc_score)
+ .Build();
+  // person2 has a single child: email3.
+ DocumentProto person2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first2")
+ .AddStringProperty("lastName", "last2")
+ .AddStringProperty("emailAddress", "email2@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(person2_doc_score)
+ .Build();
+  // person3 has no children.
+ DocumentProto person3 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person3")
+ .SetSchema("Person")
+ .AddStringProperty("firstName", "first3")
+ .AddStringProperty("lastName", "last3")
+ .AddStringProperty("emailAddress", "email3@gmail.com")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(person3_doc_score)
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(email1_doc_score)
+ .Build();
+ DocumentProto email2 =
+ DocumentBuilder()
+ .SetKey("namespace", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 2")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person1")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(email2_doc_score)
+ .Build();
+ DocumentProto email3 =
+ DocumentBuilder()
+ .SetKey("namespace", "email3")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 3")
+ .AddStringProperty("personQualifiedId", "pkg$db/namespace#person2")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetScore(email3_doc_score)
+ .Build();
+
+  // Set the child scoring expression and compute the expected child scores.
+ ScoringSpecProto child_scoring_spec = GetDefaultScoringSpec();
+ child_scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::ADVANCED_SCORING_EXPRESSION);
+ child_scoring_spec.set_advanced_scoring_expression(
+ "this.documentScore() * 2 + 1");
+ const int32_t exp_email1_score = email1_doc_score * 2 + 1;
+ const int32_t exp_email2_score = email2_doc_score * 2 + 1;
+ const int32_t exp_email3_score = email3_doc_score * 2 + 1;
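+  // With the document scores above: exp_email1_score = 21,
+  // exp_email2_score = 31, and exp_email3_score = 81.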
+
+  // Set the parent scoring expression and compute the expected parent scores.
+ ScoringSpecProto parent_scoring_spec = GetDefaultScoringSpec();
+ parent_scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::ADVANCED_SCORING_EXPRESSION);
+ parent_scoring_spec.set_advanced_scoring_expression(
+ "this.documentScore() * sum(this.childrenRankingSignals())");
+ const int32_t exp_person1_score =
+ person1_doc_score * (exp_email1_score + exp_email2_score);
+ const int32_t exp_person2_score = person2_doc_score * exp_email3_score;
+ const int32_t exp_person3_score = person3_doc_score * 0;
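+  // That is: exp_person1_score = 10 * (21 + 31) = 520, exp_person2_score =
+  // 25 * 81 = 2025, and exp_person3_score = 0 since person3 has no matching
+  // children.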
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(person3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(email3).status(), ProtoIsOk());
+
+ // Parent SearchSpec
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("firstName:first");
+
+ // JoinSpec
+ JoinSpecProto* join_spec = search_spec.mutable_join_spec();
+ join_spec->set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec->set_child_property_expression("personQualifiedId");
+ JoinSpecProto::NestedSpecProto* nested_spec =
+ join_spec->mutable_nested_spec();
+ SearchSpecProto* nested_search_spec = nested_spec->mutable_search_spec();
+ nested_search_spec->set_term_match_type(TermMatchType::PREFIX);
+ nested_search_spec->set_query("subject:test");
+ *nested_spec->mutable_scoring_spec() = child_scoring_spec;
+ *nested_spec->mutable_result_spec() = ResultSpecProto::default_instance();
+
+ // Parent ResultSpec
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(1);
+ result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+
+ SearchResultProto results =
+ icing.Search(search_spec, parent_scoring_spec, result_spec);
+ uint64_t next_page_token = results.next_page_token();
+ EXPECT_THAT(next_page_token, Ne(kInvalidNextPageToken));
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document().uri(), Eq("person2"));
+ // exp_person2_score = 2025
+ EXPECT_THAT(results.results(0).score(), Eq(exp_person2_score));
+
+ results = icing.GetNextPage(next_page_token);
+ next_page_token = results.next_page_token();
+ EXPECT_THAT(next_page_token, Ne(kInvalidNextPageToken));
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document().uri(), Eq("person1"));
+ // exp_person1_score = 520
+ EXPECT_THAT(results.results(0).score(), Eq(exp_person1_score));
+
+ results = icing.GetNextPage(next_page_token);
+ next_page_token = results.next_page_token();
+ EXPECT_THAT(next_page_token, Eq(kInvalidNextPageToken));
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document().uri(), Eq("person3"));
+ // exp_person3_score = 0
+ EXPECT_THAT(results.results(0).score(), Eq(exp_person3_score));
+}
+
+TEST_F(IcingSearchEngineSearchTest, NumericFilterAdvancedQuerySucceeds) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create the schema and document store
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("transaction")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("price")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("cost")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document_one = DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(1)
+ .AddInt64Property("price", 10)
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two = DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(1)
+ .AddInt64Property("price", 25)
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ DocumentProto document_three = DocumentBuilder()
+ .SetKey("namespace", "3")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(1)
+ .AddInt64Property("cost", 2)
+ .Build();
+ ASSERT_THAT(icing.Put(document_three).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("price < 20");
+ search_spec.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec.add_enabled_features(std::string(kNumericSearchFeature));
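+  // Numeric comparison queries are only supported by the advanced query
+  // processor with the numeric search feature enabled; see
+  // NumericFilterOldQueryFails below for the raw-query rejection case.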
+
+ SearchResultProto results =
+ icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_one));
+
+ search_spec.set_query("price == 25");
+ results = icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_two));
+
+ search_spec.set_query("cost > 2");
+ results = icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.results(), IsEmpty());
+
+ search_spec.set_query("cost >= 2");
+ results = icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_three));
+
+ search_spec.set_query("price <= 25");
+ results = icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results.results(), SizeIs(2));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_two));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document_one));
+}
+
+TEST_F(IcingSearchEngineSearchTest,
+ NumericFilterAdvancedQueryWithPersistenceSucceeds) {
+ IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+
+ {
+ // Create the schema and document store
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("transaction")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("price")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("cost")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ // Schema will be persisted to disk when icing goes out of scope.
+ }
+
+ DocumentProto document_one = DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(1)
+ .AddInt64Property("price", 10)
+ .Build();
+ DocumentProto document_two = DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(1)
+ .AddInt64Property("price", 25)
+ .AddInt64Property("cost", 2)
+ .Build();
+ {
+ // Ensure that icing initializes the schema and section_manager
+ // properly from the pre-existing file.
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+ // The index and document store will be persisted to disk when icing goes
+ // out of scope.
+ }
+
+ {
+ // Ensure that the index is brought back up without problems and we
+ // can query for the content that we expect.
+ IcingSearchEngine icing(icing_options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("price < 20");
+ search_spec.set_search_type(
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY);
+ search_spec.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results =
+ icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_one));
+
+ search_spec.set_query("price == 25");
+ results = icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_two));
+
+ search_spec.set_query("cost > 2");
+ results = icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.results(), IsEmpty());
+
+ search_spec.set_query("cost >= 2");
+ results = icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_two));
+
+ search_spec.set_query("price <= 25");
+ results = icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results.results(), SizeIs(2));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_two));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document_one));
+ }
+}
+
+TEST_F(IcingSearchEngineSearchTest, NumericFilterOldQueryFails) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create the schema and document store
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("transaction")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("price")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("cost")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document_one = DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(1)
+ .AddInt64Property("price", 10)
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two = DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(1)
+ .AddInt64Property("price", 25)
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ DocumentProto document_three = DocumentBuilder()
+ .SetKey("namespace", "3")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(1)
+ .AddInt64Property("cost", 2)
+ .Build();
+ ASSERT_THAT(icing.Put(document_three).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("price < 20");
+ search_spec.set_search_type(SearchSpecProto::SearchType::ICING_RAW_QUERY);
+ search_spec.add_enabled_features(std::string(kNumericSearchFeature));
+
+ SearchResultProto results =
+ icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineSearchTest, NumericFilterQueryStatsProtoTest) {
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetTimerElapsedMilliseconds(5);
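+  // Every timer read from the fake clock reports 5ms, so all latency fields
+  // in the expected query stats below are 5.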
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create the schema and document store
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("transaction")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("price")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("cost")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document_one = DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(1)
+ .AddInt64Property("price", 10)
+ .Build();
+ ASSERT_THAT(icing.Put(document_one).status(), ProtoIsOk());
+
+ DocumentProto document_two = DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(2)
+ .AddInt64Property("price", 25)
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ DocumentProto document_three = DocumentBuilder()
+ .SetKey("namespace", "3")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(3)
+ .AddInt64Property("cost", 2)
+ .Build();
+ ASSERT_THAT(icing.Put(document_three).status(), ProtoIsOk());
+
+ DocumentProto document_four = DocumentBuilder()
+ .SetKey("namespace", "3")
+ .SetSchema("transaction")
+ .SetCreationTimestampMs(4)
+ .AddInt64Property("price", 15)
+ .Build();
+ ASSERT_THAT(icing.Put(document_four).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.add_namespace_filters("namespace");
+ search_spec.add_schema_type_filters(document_one.schema());
+ search_spec.set_query("price < 20");
+ search_spec.add_enabled_features(std::string(kNumericSearchFeature));
+
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(5);
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+
+ SearchResultProto results =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ ASSERT_THAT(results.results(), SizeIs(2));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document_four));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document_one));
+
+ // Check the stats
+ // TODO(b/305098009): deprecate search-related flat fields in query_stats.
+ QueryStatsProto exp_stats;
+ exp_stats.set_query_length(10);
+ exp_stats.set_num_terms(0);
+ exp_stats.set_num_namespaces_filtered(1);
+ exp_stats.set_num_schema_types_filtered(1);
+ exp_stats.set_ranking_strategy(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+ exp_stats.set_is_first_page(true);
+ exp_stats.set_requested_page_size(5);
+ exp_stats.set_num_results_returned_current_page(2);
+ exp_stats.set_num_documents_scored(2);
+ exp_stats.set_num_results_with_snippets(0);
+ exp_stats.set_latency_ms(5);
+ exp_stats.set_parse_query_latency_ms(5);
+ exp_stats.set_scoring_latency_ms(5);
+ exp_stats.set_ranking_latency_ms(5);
+ exp_stats.set_document_retrieval_latency_ms(5);
+ exp_stats.set_lock_acquisition_latency_ms(5);
+ exp_stats.set_num_joined_results_returned_current_page(0);
+
+ QueryStatsProto::SearchStats* exp_parent_search_stats =
+ exp_stats.mutable_parent_search_stats();
+ exp_parent_search_stats->set_query_length(10);
+ exp_parent_search_stats->set_num_terms(0);
+ exp_parent_search_stats->set_num_namespaces_filtered(1);
+ exp_parent_search_stats->set_num_schema_types_filtered(1);
+ exp_parent_search_stats->set_ranking_strategy(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+ exp_parent_search_stats->set_is_numeric_query(true);
+ exp_parent_search_stats->set_num_documents_scored(2);
+ exp_parent_search_stats->set_parse_query_latency_ms(5);
+ exp_parent_search_stats->set_scoring_latency_ms(5);
+ exp_parent_search_stats->set_num_fetched_hits_lite_index(0);
+ exp_parent_search_stats->set_num_fetched_hits_main_index(0);
+  // Since we inspect a single bucket for "price" in the integer index and it
+  // contains 3 hits, we fetch all 3 hits (but filter one of them out).
+ exp_parent_search_stats->set_num_fetched_hits_integer_index(3);
+
+ EXPECT_THAT(results.query_stats(), EqualsProto(exp_stats));
+}
+
+TEST_P(IcingSearchEngineSearchTest, BarisNormalizationTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Person")
+ .SetCreationTimestampMs(1)
+ .AddStringProperty("name", "Barış")
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ DocumentProto document_two = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .SetCreationTimestampMs(1)
+ .AddStringProperty("name", "ıbar")
+ .Build();
+ ASSERT_THAT(icing.Put(document_two).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TERM_MATCH_PREFIX);
+ search_spec.set_search_type(GetParam());
+
+ ScoringSpecProto scoring_spec;
+ ResultSpecProto result_spec;
+
+ SearchResultProto exp_results;
+ exp_results.mutable_status()->set_code(StatusProto::OK);
+ *exp_results.add_results()->mutable_document() = document;
+
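+  // The Turkish and ASCII spellings below should all normalize to match the
+  // document containing "Barış".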
+ search_spec.set_query("barış");
+ SearchResultProto results =
+ icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(results, EqualsSearchResultIgnoreStatsAndScores(exp_results));
+
+ search_spec.set_query("barıs");
+ results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(results, EqualsSearchResultIgnoreStatsAndScores(exp_results));
+
+ search_spec.set_query("baris");
+ results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(results, EqualsSearchResultIgnoreStatsAndScores(exp_results));
+
+ SearchResultProto exp_results2;
+ exp_results2.mutable_status()->set_code(StatusProto::OK);
+ *exp_results2.add_results()->mutable_document() = document_two;
+ search_spec.set_query("ı");
+ results = icing.Search(search_spec, scoring_spec, result_spec);
+ EXPECT_THAT(results, EqualsSearchResultIgnoreStatsAndScores(exp_results2));
+}
+
+TEST_P(IcingSearchEngineSearchTest, LatinSnippetTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ constexpr std::string_view kLatin = "test ḞÖÖḸĬŞĤ test";
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Message")
+ .AddStringProperty("body", kLatin)
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("foo");
+ search_spec.set_term_match_type(TERM_MATCH_PREFIX);
+ search_spec.set_search_type(GetParam());
+
+ ResultSpecProto result_spec;
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(
+ std::numeric_limits<int>::max());
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(
+ std::numeric_limits<int>::max());
+
+ // Search and make sure that we got a single successful result
+ SearchResultProto search_results = icing.Search(
+ search_spec, ScoringSpecProto::default_instance(), result_spec);
+ ASSERT_THAT(search_results.status(), ProtoIsOk());
+ ASSERT_THAT(search_results.results(), SizeIs(1));
+ const SearchResultProto::ResultProto* result = &search_results.results(0);
+ EXPECT_THAT(result->document().uri(), Eq("uri1"));
+
+ // Ensure that one and only one property was matched and it was "body"
+ ASSERT_THAT(result->snippet().entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &result->snippet().entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("body"));
+
+ // Ensure that there is one and only one match within "body"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+
+  // Check the submatch. The query term is "foo", so the submatch within the
+  // full token "ḞÖÖḸĬŞĤ" is "ḞÖÖ".
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+ std::string_view match =
+ kLatin.substr(match_proto.exact_match_byte_position(),
+ match_proto.submatch_byte_length());
+ ASSERT_THAT(match, Eq("ḞÖÖ"));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ DocumentStoreNamespaceIdFingerprintCompatible) {
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
+
+  // Initialize and add some documents with
+  // document_store_namespace_id_fingerprint set to false.
+ {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_document_store_namespace_id_fingerprint(false);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Creates and inserts 3 documents
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ }
+
+  // Reinitialize with document_store_namespace_id_fingerprint set to true,
+ // and test that we are still able to read/query docs.
+ {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_document_store_namespace_id_fingerprint(true);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ ASSERT_THAT(
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance())
+ .status(),
+ ProtoIsOk());
+ ASSERT_THAT(
+ icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
+ .status(),
+ ProtoIsOk());
+ ASSERT_THAT(
+ icing.Get("namespace", "uri3", GetResultSpecProto::default_instance())
+ .status(),
+ ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+ SearchResultProto results =
+ icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results.results(), SizeIs(3));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document3));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document2));
+ EXPECT_THAT(results.results(2).document(), EqualsProto(document1));
+ }
+
+  // Reinitialize with document_store_namespace_id_fingerprint set to false,
+ // and test that we are still able to read/query docs.
+ {
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_document_store_namespace_id_fingerprint(false);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ ASSERT_THAT(
+ icing.Get("namespace", "uri1", GetResultSpecProto::default_instance())
+ .status(),
+ ProtoIsOk());
+ ASSERT_THAT(
+ icing.Get("namespace", "uri2", GetResultSpecProto::default_instance())
+ .status(),
+ ProtoIsOk());
+ ASSERT_THAT(
+ icing.Get("namespace", "uri3", GetResultSpecProto::default_instance())
+ .status(),
+ ProtoIsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message");
+ search_spec.set_search_type(GetParam());
+ SearchResultProto results =
+ icing.Search(search_spec, ScoringSpecProto::default_instance(),
+ ResultSpecProto::default_instance());
+ ASSERT_THAT(results.results(), SizeIs(3));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document3));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document2));
+ EXPECT_THAT(results.results(2).document(), EqualsProto(document1));
+ }
+}
+
+TEST_P(IcingSearchEngineSearchTest, HasPropertyQuery) {
+ if (GetParam() !=
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ GTEST_SKIP()
+ << "The hasProperty() function is only supported in advanced query.";
+ }
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Value")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("score")
+ .SetDataType(TYPE_DOUBLE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Create a document with every property.
+ DocumentProto document0 = DocumentBuilder()
+ .SetKey("icing", "uri0")
+ .SetSchema("Value")
+ .SetCreationTimestampMs(1)
+ .AddStringProperty("body", "foo")
+ .AddInt64Property("timestamp", 123)
+ .AddDoubleProperty("score", 456.789)
+ .Build();
+ // Create a document with missing body.
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema("Value")
+ .SetCreationTimestampMs(1)
+ .AddInt64Property("timestamp", 123)
+ .AddDoubleProperty("score", 456.789)
+ .Build();
+ // Create a document with missing timestamp.
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema("Value")
+ .SetCreationTimestampMs(1)
+ .AddStringProperty("body", "foo")
+ .AddDoubleProperty("score", 456.789)
+ .Build();
+
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_build_property_existence_metadata_hits(true);
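+  // hasProperty() is served from property-existence metadata hits, so this
+  // option must be enabled at indexing time; the next test covers the
+  // disabled case.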
+ IcingSearchEngine icing(options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document0).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ // Get all documents that have "body".
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_search_type(GetParam());
+ search_spec.add_enabled_features(std::string(kHasPropertyFunctionFeature));
+ search_spec.add_enabled_features(
+ std::string(kListFilterQueryLanguageFeature));
+ search_spec.set_query("hasProperty(\"body\")");
+ SearchResultProto results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(2));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document2));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document0));
+
+ // Get all documents that have "timestamp".
+ search_spec.set_query("hasProperty(\"timestamp\")");
+ results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(2));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document1));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document0));
+
+ // Get all documents that have "score".
+ search_spec.set_query("hasProperty(\"score\")");
+ results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(3));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document2));
+ EXPECT_THAT(results.results(1).document(), EqualsProto(document1));
+ EXPECT_THAT(results.results(2).document(), EqualsProto(document0));
+}
+
+TEST_P(IcingSearchEngineSearchTest,
+ HasPropertyQueryDoesNotWorkWithoutMetadataHits) {
+ if (GetParam() !=
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ GTEST_SKIP()
+ << "The hasProperty() function is only supported in advanced query.";
+ }
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Value")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("score")
+ .SetDataType(TYPE_DOUBLE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Create a document with every property.
+ DocumentProto document0 = DocumentBuilder()
+ .SetKey("icing", "uri0")
+ .SetSchema("Value")
+ .SetCreationTimestampMs(1)
+ .AddStringProperty("body", "foo")
+ .AddInt64Property("timestamp", 123)
+ .AddDoubleProperty("score", 456.789)
+ .Build();
+ // Create a document with missing body.
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema("Value")
+ .SetCreationTimestampMs(1)
+ .AddInt64Property("timestamp", 123)
+ .AddDoubleProperty("score", 456.789)
+ .Build();
+ // Create a document with missing timestamp.
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema("Value")
+ .SetCreationTimestampMs(1)
+ .AddStringProperty("body", "foo")
+ .AddDoubleProperty("score", 456.789)
+ .Build();
+
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_build_property_existence_metadata_hits(false);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document0).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ // Check that none of the following hasProperty queries can return any
+ // results.
+ //
+ // Get all documents that have "body".
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_search_type(GetParam());
+ search_spec.add_enabled_features(std::string(kHasPropertyFunctionFeature));
+ search_spec.add_enabled_features(
+ std::string(kListFilterQueryLanguageFeature));
+ search_spec.set_query("hasProperty(\"body\")");
+ SearchResultProto results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), IsEmpty());
+
+ // Get all documents that have "timestamp".
+ search_spec.set_query("hasProperty(\"timestamp\")");
+ results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), IsEmpty());
+
+ // Get all documents that have "score".
+ search_spec.set_query("hasProperty(\"score\")");
+ results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), IsEmpty());
+}
+
+TEST_P(IcingSearchEngineSearchTest, HasPropertyQueryNestedDocument) {
+ if (GetParam() !=
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ GTEST_SKIP()
+ << "The hasProperty() function is only supported in advanced query.";
+ }
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Value")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("score")
+ .SetDataType(TYPE_DOUBLE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TreeNode")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("value")
+ .SetDataTypeDocument(
+ "Value", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+  // Create a complex nested root document with the following property paths:
+ // - name
+ // - value
+ // - value.body
+ // - value.score
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "uri")
+ .SetSchema("TreeNode")
+ .SetCreationTimestampMs(1)
+ .AddStringProperty("name", "root")
+ .AddDocumentProperty("value", DocumentBuilder()
+ .SetKey("icing", "uri")
+ .SetSchema("Value")
+ .AddStringProperty("body", "foo")
+ .AddDoubleProperty("score", 456.789)
+ .Build())
+ .Build();
+
+ IcingSearchEngineOptions options = GetDefaultIcingOptions();
+ options.set_build_property_existence_metadata_hits(true);
+ IcingSearchEngine icing(options, GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ // Check that the document can be found by `hasProperty("name")`.
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ search_spec.set_search_type(GetParam());
+ search_spec.add_enabled_features(std::string(kHasPropertyFunctionFeature));
+ search_spec.add_enabled_features(
+ std::string(kListFilterQueryLanguageFeature));
+ search_spec.set_query("hasProperty(\"name\")");
+ SearchResultProto results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document));
+
+ // Check that the document can be found by `hasProperty("value")`.
+ search_spec.set_query("hasProperty(\"value\")");
+ results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document));
+
+ // Check that the document can be found by `hasProperty("value.body")`.
+ search_spec.set_query("hasProperty(\"value.body\")");
+ results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document));
+
+ // Check that the document can be found by `hasProperty("value.score")`.
+ search_spec.set_query("hasProperty(\"value.score\")");
+ results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), SizeIs(1));
+ EXPECT_THAT(results.results(0).document(), EqualsProto(document));
+
+ // Check that the document can NOT be found by `hasProperty("body")`.
+ search_spec.set_query("hasProperty(\"body\")");
+ results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), IsEmpty());
+
+ // Check that the document can NOT be found by `hasProperty("score")`.
+ search_spec.set_query("hasProperty(\"score\")");
+ results = icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(results.status(), ProtoIsOk());
+ EXPECT_THAT(results.results(), IsEmpty());
+}
+
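+// Run every test in this suite against both the raw query and the advanced
+// query search types.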
+INSTANTIATE_TEST_SUITE_P(
+ IcingSearchEngineSearchTest, IcingSearchEngineSearchTest,
+ testing::Values(
+ SearchSpecProto::SearchType::ICING_RAW_QUERY,
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY));
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine_suggest_test.cc b/icing/icing-search-engine_suggest_test.cc
new file mode 100644
index 0000000..b3aeafc
--- /dev/null
+++ b/icing/icing-search-engine_suggest_test.cc
@@ -0,0 +1,1601 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/icing-search-engine.h"
+#include "icing/jni/jni-cache.h"
+#include "icing/portable/endian.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/initialize.pb.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/proto/optimize.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/reset.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/status.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::UnorderedElementsAre;
+
+// For mocking purposes, we allow tests to provide a custom Filesystem,
+// IcingFilesystem, and Clock.
+class TestIcingSearchEngine : public IcingSearchEngine {
+ public:
+ TestIcingSearchEngine(const IcingSearchEngineOptions& options,
+ std::unique_ptr<const Filesystem> filesystem,
+ std::unique_ptr<const IcingFilesystem> icing_filesystem,
+ std::unique_ptr<Clock> clock,
+ std::unique_ptr<JniCache> jni_cache)
+ : IcingSearchEngine(options, std::move(filesystem),
+ std::move(icing_filesystem), std::move(clock),
+ std::move(jni_cache)) {}
+};
+
+std::string GetTestBaseDir() { return GetTestTempDir() + "/icing"; }
+
+// This test suite is meant to cover all tests relating to
+// IcingSearchEngine::SearchSuggestions.
+class IcingSearchEngineSuggestTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ // If we've specified using the reverse-JNI method for segmentation (i.e.
+ // not ICU), then we won't have the ICU data file included to set up.
+ // Technically, we could choose to use reverse-JNI for segmentation AND
+ // include an ICU data file, but that seems unlikely and our current BUILD
+ // setup doesn't do this.
+ // File generated via icu_data_file rule in //icing/BUILD.
+ std::string icu_data_file_path =
+ GetTestFilePath("icing/icu.dat");
+ ICING_ASSERT_OK(
+ icu_data_file_helper::SetUpICUDataFile(icu_data_file_path));
+ }
+ filesystem_.CreateDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(GetTestBaseDir().c_str());
+ }
+
+ const Filesystem* filesystem() const { return &filesystem_; }
+
+ private:
+ Filesystem filesystem_;
+};
+
+IcingSearchEngineOptions GetDefaultIcingOptions() {
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_base_dir(GetTestBaseDir());
+ return icing_options;
+}
+
+SchemaProto CreatePersonAndEmailSchema() {
+ return SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+}
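+// Note: the Email.sender property above is declared with
+// index_nested_properties=true, so terms inside a nested Person document are
+// indexed under Email's "sender.name" and "sender.emailAddress" sections.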
+
+TEST_F(IcingSearchEngineSuggestTest, SearchSuggestionsTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+  // Create and insert 6 documents so that termSix appears in 6 documents,
+  // termFive in 5, termFour in 4, termThree in 3, termTwo in 2 and termOne
+  // in 1.
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty(
+ "subject", "termOne termTwo termThree termFour termFive termSix")
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject",
+ "termTwo termThree termFour termFive termSix")
+ .Build();
+ DocumentProto document3 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "termThree termFour termFive termSix")
+ .Build();
+ DocumentProto document4 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri4")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "termFour termFive termSix")
+ .Build();
+ DocumentProto document5 =
+ DocumentBuilder()
+ .SetKey("namespace", "uri5")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "termFive termSix")
+ .Build();
+ DocumentProto document6 = DocumentBuilder()
+ .SetKey("namespace", "uri6")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "termSix")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document5).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document6).status(), ProtoIsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("t");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+  // Query all suggestions; they are returned ranked by document count. Terms
+  // are normalized to lowercase, hence the lowercase suggestions.
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+  ASSERT_THAT(response.suggestions().at(0).query(), Eq("termsix"));
+  ASSERT_THAT(response.suggestions().at(1).query(), Eq("termfive"));
+  ASSERT_THAT(response.suggestions().at(2).query(), Eq("termfour"));
+  ASSERT_THAT(response.suggestions().at(3).query(), Eq("termthree"));
+  ASSERT_THAT(response.suggestions().at(4).query(), Eq("termtwo"));
+  ASSERT_THAT(response.suggestions().at(5).query(), Eq("termone"));
+
+  // Query only the first three suggestions; they are returned in the same
+  // ranked order.
+ suggestion_spec.set_num_to_return(3);
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+  ASSERT_THAT(response.suggestions().at(0).query(), Eq("termsix"));
+  ASSERT_THAT(response.suggestions().at(1).query(), Eq("termfive"));
+  ASSERT_THAT(response.suggestions().at(2).query(), Eq("termfour"));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_ShouldReturnInOneNamespace) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo fool")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFoo;
+ suggestionFoo.set_query("foo");
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+
+ // namespace1 has 2 results.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.add_namespace_filters("namespace1");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFoo),
+ EqualsProto(suggestionFool)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_ShouldReturnInMultipleNamespace) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fo")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace3", "uri3")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFoo;
+ suggestionFoo.set_query("foo");
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+
+  // namespace2 and namespace3 together have 2 results.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.add_namespace_filters("namespace2");
+ suggestion_spec.add_namespace_filters("namespace3");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFoo),
+ EqualsProto(suggestionFool)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest, SearchSuggestionsTest_NamespaceNotFound) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fo")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+  // Search in the non-existent namespace3.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.add_namespace_filters("namespace3");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ EXPECT_THAT(response.status().code(), Eq(StatusProto::OK));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_OtherNamespaceDontContributeToHitCount) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+  // Index 4 documents:
+  // namespace1 has 2 hits for termone;
+  // namespace2 has 2 hits for termtwo and 1 hit for termone.
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "termone")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "termone")
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "termone termtwo")
+ .Build();
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace2", "uri3")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "termtwo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionTermOne;
+ suggestionTermOne.set_query("termone");
+ SuggestionResponse::Suggestion suggestionTermTwo;
+ suggestionTermTwo.set_query("termtwo");
+
+  // Only search suggestions for namespace2. The correct order should be
+  // {"termtwo", "termone"}. If namespace1 were not filtered out when
+  // calculating the score, the order would be {"termone", "termtwo"}.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("t");
+ suggestion_spec.add_namespace_filters("namespace2");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ ElementsAre(EqualsProto(suggestionTermTwo),
+ EqualsProto(suggestionTermOne)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest, SearchSuggestionsTest_DeletionTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+
+ // namespace1 has this suggestion
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.add_namespace_filters("namespace1");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+
+ // namespace2 has this suggestion
+ suggestion_spec.clear_namespace_filters();
+ suggestion_spec.add_namespace_filters("namespace2");
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+
+  // Delete the document from namespace1.
+ EXPECT_THAT(icing.Delete("namespace1", "uri1").status(), ProtoIsOk());
+
+  // Now namespace1 will return no suggestions.
+ suggestion_spec.clear_namespace_filters();
+ suggestion_spec.add_namespace_filters("namespace1");
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(), IsEmpty());
+
+  // namespace2 still has this suggestion, which proves that namespace1 finds
+  // nothing because the suggestion is filtered out, not because it no longer
+  // exists.
+ suggestion_spec.add_namespace_filters("namespace2");
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_ShouldReturnInOneDocument) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+ SuggestionResponse::Suggestion suggestionFoo;
+ suggestionFoo.set_query("foo");
+
+ // Only search in namespace1,uri1
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+ NamespaceDocumentUriGroup* namespace1_uri1 =
+ suggestion_spec.add_document_uri_filters();
+ namespace1_uri1->set_namespace_("namespace1");
+ namespace1_uri1->add_document_uris("uri1");
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+
+ // Only search in namespace1,uri2
+ suggestion_spec.clear_document_uri_filters();
+ NamespaceDocumentUriGroup* namespace1_uri2 =
+ suggestion_spec.add_document_uri_filters();
+ namespace1_uri2->set_namespace_("namespace1");
+ namespace1_uri2->add_document_uris("uri2");
+
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFoo)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_ShouldReturnInMultipleDocument) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace1", "uri3")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+ SuggestionResponse::Suggestion suggestionFoo;
+ suggestionFoo.set_query("foo");
+
+  // Only search the documents namespace1,uri1 and namespace1,uri2.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+ NamespaceDocumentUriGroup* namespace1_uri1_uri2 =
+ suggestion_spec.add_document_uri_filters();
+ namespace1_uri1_uri2->set_namespace_("namespace1");
+ namespace1_uri1_uri2->add_document_uris("uri1");
+ namespace1_uri1_uri2->add_document_uris("uri2");
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool),
+ EqualsProto(suggestionFoo)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_ShouldReturnInDesiredDocumentAndNamespace) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace3", "uri3")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+ SuggestionResponse::Suggestion suggestionFoo;
+ suggestionFoo.set_query("foo");
+
+  // Only search the document namespace1,uri1 and all documents under namespace2.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+ suggestion_spec.add_namespace_filters("namespace1");
+ suggestion_spec.add_namespace_filters("namespace2");
+ NamespaceDocumentUriGroup* namespace1_uri1 =
+ suggestion_spec.add_document_uri_filters();
+ namespace1_uri1->set_namespace_("namespace1");
+ namespace1_uri1->add_document_uris("uri1");
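+  // The two filters combine: namespace1 is further restricted to uri1 by the
+  // document uri filter, while namespace2 has no uri group and contributes
+  // all of its documents.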
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool),
+ EqualsProto(suggestionFoo)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_DocumentIdDoesntExist) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+  // Search for a non-existent document id: namespace3,uri3.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+ suggestion_spec.add_namespace_filters("namespace3");
+ NamespaceDocumentUriGroup* namespace3_uri3 =
+ suggestion_spec.add_document_uri_filters();
+ namespace3_uri3->set_namespace_("namespace3");
+ namespace3_uri3->add_document_uris("uri3");
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(), IsEmpty());
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_DocumentIdFilterDoesntMatchNamespaceFilter) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+  // Search for the document namespace1,uri1 while the namespace filter only
+  // allows namespace2.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+ NamespaceDocumentUriGroup* namespace1_uri1 =
+ suggestion_spec.add_document_uri_filters();
+ namespace1_uri1->set_namespace_("namespace1");
+ namespace1_uri1->add_document_uris("uri1");
+ suggestion_spec.add_namespace_filters("namespace2");
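+  // A document uri filter whose namespace is absent from the namespace
+  // filter is inconsistent, so the request should be rejected.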
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ EXPECT_THAT(response.status().code(), Eq(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_EmptyDocumentIdInNamespace) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+  // Give an empty document uri list for namespace1.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+ NamespaceDocumentUriGroup* namespace1_uri1 =
+ suggestion_spec.add_document_uri_filters();
+ namespace1_uri1->set_namespace_("namespace1");
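+  // A document uri group that names a namespace but no uris is invalid.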
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ EXPECT_THAT(response.status().code(), Eq(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_ShouldReturnInDesiredSchemaType) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .AddDocumentProperty("sender", DocumentBuilder()
+ .SetKey("namespace", "uri1-sender")
+ .SetSchema("Person")
+ .AddStringProperty("name", "foo")
+ .Build())
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Message")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("body", "fo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+ SuggestionResponse::Suggestion suggestionFoo;
+ suggestionFoo.set_query("foo");
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+ suggestion_spec.add_schema_type_filters("Email");
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFoo),
+ EqualsProto(suggestionFool)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest, SearchSuggestionsTest_SchemaTypeNotFound) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("body", "fo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+ suggestion_spec.add_schema_type_filters("Email");
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(), IsEmpty());
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_ShouldReturnInDesiredProperty) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .AddDocumentProperty("sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1-sender")
+ .SetSchema("Person")
+ .AddStringProperty("name", "foo")
+ .AddStringProperty("emailAddress", "fo")
+ .Build())
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+ SuggestionResponse::Suggestion suggestionFoo;
+ suggestionFoo.set_query("foo");
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ // Only search in subject.
+ TypePropertyMask* mask = suggestion_spec.add_type_property_filters();
+ mask->set_schema_type("Email");
+ mask->add_paths("subject");
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+
+ // Search in subject and sender.name
+ suggestion_spec.clear_type_property_filters();
+ mask = suggestion_spec.add_type_property_filters();
+ mask->set_schema_type("Email");
+ mask->add_paths("subject");
+ mask->add_paths("sender.name");
+
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFoo),
+ EqualsProto(suggestionFool)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_NestedPropertyReturnNothing) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .AddDocumentProperty("sender", DocumentBuilder()
+ .SetKey("namespace", "uri1-sender")
+ .SetSchema("Person")
+ .AddStringProperty("name", "foo")
+ .Build())
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ // Only search in Person.name.
+ suggestion_spec.add_schema_type_filters("Person");
+ TypePropertyMask* mask = suggestion_spec.add_type_property_filters();
+ mask->set_schema_type("Person");
+ mask->add_paths("name");
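+  // The only Person here exists nested inside the Email document, so
+  // filtering on the Person schema type should match nothing.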
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(), IsEmpty());
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_PropertyFilterAndSchemaFilter) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .AddDocumentProperty("sender", DocumentBuilder()
+ .SetKey("namespace", "uri1-sender")
+ .SetSchema("Person")
+ .AddStringProperty("name", "foo")
+ .Build())
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Message")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("body", "fo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFoo;
+ suggestionFoo.set_query("foo");
+ SuggestionResponse::Suggestion suggestionFo;
+ suggestionFo.set_query("fo");
+
+ // Search in sender.name of Email and everything in Message.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+ suggestion_spec.add_schema_type_filters("Email");
+ suggestion_spec.add_schema_type_filters("Message");
+ TypePropertyMask* mask1 = suggestion_spec.add_type_property_filters();
+ mask1->set_schema_type("Email");
+ mask1->add_paths("sender.name");
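+  // Message has no property mask of its own, so all of its properties are
+  // searched.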
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFoo),
+ EqualsProto(suggestionFo)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_PropertyFilterNotMatchSchemaFilter) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("body", "fo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+  // Search in sender.name of Email, but the schema type filter only allows
+  // Message.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+ suggestion_spec.add_schema_type_filters("Message");
+ TypePropertyMask* mask1 = suggestion_spec.add_type_property_filters();
+ mask1->set_schema_type("Email");
+ mask1->add_paths("sender.name");
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ EXPECT_THAT(response.status().code(), Eq(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_OrderByTermFrequency) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Message")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty(
+ "body", "termthree termthree termthree termtwo termtwo termone")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+  // Rank suggestions by term frequency within the single document.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("t");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::EXACT_ONLY);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::TERM_FREQUENCY);
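+  // TERM_FREQUENCY ranks by how often each term occurs in the matched
+  // documents: termthree occurs 3 times, termtwo twice and termone once.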
+
+ SuggestionResponse::Suggestion suggestionTermOne;
+ suggestionTermOne.set_query("termone");
+ SuggestionResponse::Suggestion suggestionTermTwo;
+ suggestionTermTwo.set_query("termtwo");
+ SuggestionResponse::Suggestion suggestionTermThree;
+ suggestionTermThree.set_query("termthree");
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ ElementsAre(EqualsProto(suggestionTermThree),
+ EqualsProto(suggestionTermTwo),
+ EqualsProto(suggestionTermOne)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest, SearchSuggestionsTest_ExpiredTest) {
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(500)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(1000)
+ .AddStringProperty("subject", "fool")
+ .Build();
+ {
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSystemTimeMilliseconds(400);
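+    // At time 400 neither document has expired: document1 expires at
+    // 100 + 500 = 600 and document2 at 100 + 1000 = 1100.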
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+
+ // namespace1 has this suggestion
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.add_namespace_filters("namespace1");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+
+ // namespace2 has this suggestion
+ suggestion_spec.clear_namespace_filters();
+ suggestion_spec.add_namespace_filters("namespace2");
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+ }
+  // Reinitialize with the fake clock moved forward so that document1 has
+  // expired.
+ {
+ // Time needs to be past document1 creation time (100) + ttl (500) for it
+ // to count as "expired". document2 is not expired since its ttl is 1000.
+ auto fake_clock = std::make_unique<FakeClock>();
+ fake_clock->SetSystemTimeMilliseconds(800);
+
+ TestIcingSearchEngine icing(GetDefaultIcingOptions(),
+ std::make_unique<Filesystem>(),
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f");
+ suggestion_spec.add_namespace_filters("namespace1");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+    // Now namespace1 will return no suggestions.
+ suggestion_spec.clear_namespace_filters();
+ suggestion_spec.add_namespace_filters("namespace1");
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(), IsEmpty());
+
+ // namespace2 still has this suggestion
+ SuggestionResponse::Suggestion suggestionFool;
+ suggestionFool.set_query("fool");
+
+ suggestion_spec.add_namespace_filters("namespace2");
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionFool)));
+ }
+}
+
+TEST_F(IcingSearchEngineSuggestTest, SearchSuggestionsTest_EmptyPrefix) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ ASSERT_THAT(icing.SearchSuggestions(suggestion_spec).status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_NonPositiveNumToReturn) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("prefix");
+ suggestion_spec.set_num_to_return(0);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ ASSERT_THAT(icing.SearchSuggestions(suggestion_spec).status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+}
+
+TEST_F(IcingSearchEngineSuggestTest, SearchSuggestionsTest_MultipleTerms_And) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "bar fo")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionBarFo;
+ suggestionBarFo.set_query("bar fo");
+
+  // Search for "bar AND f"; only document1 should match.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("bar f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionBarFo)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest, SearchSuggestionsTest_MultipleTerms_Or) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "bar fo")
+ .Build();
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "cat foo")
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .Build();
+  ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+  ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+  ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ SuggestionResponse::Suggestion suggestionBarCatFo;
+ suggestionBarCatFo.set_query("bar OR cat fo");
+ SuggestionResponse::Suggestion suggestionBarCatFoo;
+ suggestionBarCatFoo.set_query("bar OR cat foo");
+
+  // Search for "(bar OR cat) AND f"; both document1 ("bar fo") and document2
+  // ("cat foo") could match.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("bar OR cat f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionBarCatFo),
+ EqualsProto(suggestionBarCatFoo)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_PropertyRestriction) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool")
+ .AddDocumentProperty("sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1-sender")
+ .SetSchema("Person")
+ .AddStringProperty("name", "foo")
+ .AddStringProperty("emailAddress", "fo")
+ .Build())
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+
+ // Add property restriction, only search for subject.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("subject:f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ SuggestionResponse::Suggestion suggestionSubjectFool;
+ suggestionSubjectFool.set_query("subject:fool");
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionSubjectFool)));
+
+ // Add property restriction, only search for nested sender.name
+ suggestion_spec.set_prefix("sender.name:f");
+ SuggestionResponse::Suggestion suggestionSenderNameFoo;
+ suggestionSenderNameFoo.set_query("sender.name:foo");
+
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionSenderNameFoo)));
+
+  // Add property restriction, only search in a non-existent section.
+ suggestion_spec.set_prefix("none:f");
+
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(), IsEmpty());
+}
+
+TEST_F(IcingSearchEngineSuggestTest,
+ SearchSuggestionsTest_AndOperatorPlusPropertyRestriction) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "bar fo") // "bar fo"
+ .AddStringProperty("body", "fool")
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+          .AddStringProperty("subject", "bar cat foo")  // "bar cat foo"
+ .AddStringProperty("body", "fool")
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace1", "uri3")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool") // "fool"
+ .AddStringProperty("body", "fool")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ // Search for "bar AND subject:f"
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("bar subject:f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
+ SuggestionResponse::Suggestion suggestionBarSubjectFo;
+ suggestionBarSubjectFo.set_query("bar subject:fo");
+ SuggestionResponse::Suggestion suggestionBarSubjectFoo;
+ suggestionBarSubjectFoo.set_query("bar subject:foo");
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionBarSubjectFo),
+ EqualsProto(suggestionBarSubjectFoo)));
+
+ // Search for "bar AND cat AND subject:f"
+ suggestion_spec.set_prefix("bar cat subject:f");
+ SuggestionResponse::Suggestion suggestionBarCatSubjectFoo;
+ suggestionBarCatSubjectFoo.set_query("bar cat subject:foo");
+
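+ // Only document2 contains both "bar" and "cat", so its subject term "foo" is
+ // the only valid completion.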
+ response = icing.SearchSuggestions(suggestion_spec);
+ ASSERT_THAT(response.status(), ProtoIsOk());
+ ASSERT_THAT(response.suggestions(),
+ UnorderedElementsAre(EqualsProto(suggestionBarCatSubjectFoo)));
+}
+
+TEST_F(IcingSearchEngineSuggestTest, SearchSuggestionsTest_InvalidPrefixTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "bar fo") // "bar fo"
+ .AddStringProperty("body", "fool")
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "bar cat foo") // "bar cat fool"
+ .AddStringProperty("body", "fool")
+ .Build();
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace1", "uri3")
+ .SetSchema("Email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "fool") // "fool"
+ .AddStringProperty("body", "fool")
+ .Build();
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ // Search for "f OR"
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f OR");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ suggestion_spec.mutable_scoring_spec()->set_rank_by(
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT);
+
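+ // A prefix ending in a dangling OR is not a valid query. The ICING_RAW_QUERY
+ // search type tolerates it and simply returns no suggestions, while other
+ // search types reject it with INVALID_ARGUMENT.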
+ SuggestionResponse response = icing.SearchSuggestions(suggestion_spec);
+ if (SearchSpecProto::default_instance().search_type() ==
+ SearchSpecProto::SearchType::ICING_RAW_QUERY) {
+ EXPECT_THAT(response.status(), ProtoIsOk());
+ EXPECT_THAT(response.suggestions(), IsEmpty());
+ } else {
+ EXPECT_THAT(response.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+ EXPECT_THAT(response.suggestions(), IsEmpty());
+ }
+
+ // TODO(b/208654892): Update hyphen handling so that '-' is only treated as a
+ // hyphen within a TEXT token (rather than as a MINUS token) when it is
+ // surrounded on both sides by TEXT, not merely preceded by TEXT.
+ // Search for "f-"
+ suggestion_spec.set_prefix("f-");
+ response = icing.SearchSuggestions(suggestion_spec);
+ EXPECT_THAT(response.status(), ProtoIsOk());
+ EXPECT_THAT(response.suggestions(), IsEmpty());
+
+ // Search for "f:"
+ suggestion_spec.set_prefix("f:");
+ response = icing.SearchSuggestions(suggestion_spec);
+ if (SearchSpecProto::default_instance().search_type() ==
+ SearchSpecProto::SearchType::ICING_RAW_QUERY) {
+ EXPECT_THAT(response.status(), ProtoIsOk());
+ EXPECT_THAT(response.suggestions(), IsEmpty());
+ } else {
+ EXPECT_THAT(response.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+ EXPECT_THAT(response.suggestions(), IsEmpty());
+ }
+
+ // Search for "OR OR - :"
+ suggestion_spec.set_prefix("OR OR - :");
+ response = icing.SearchSuggestions(suggestion_spec);
+ if (SearchSpecProto::default_instance().search_type() ==
+ SearchSpecProto::SearchType::ICING_RAW_QUERY) {
+ EXPECT_THAT(response.status(), ProtoIsOk());
+ EXPECT_THAT(response.suggestions(), IsEmpty());
+ } else {
+ EXPECT_THAT(response.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
+ EXPECT_THAT(response.suggestions(), IsEmpty());
+ }
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/icing-search-engine_test.cc b/icing/icing-search-engine_test.cc
index b0946c9..ddb83a8 100644
--- a/icing/icing-search-engine_test.cc
+++ b/icing/icing-search-engine_test.cc
@@ -26,19 +26,30 @@
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/jni/jni-cache.h"
+#include "icing/portable/endian.h"
#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
#include "icing/proto/initialize.pb.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/proto/optimize.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/reset.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/status.pb.h"
-#include "icing/schema/schema-store.h"
-#include "icing/schema/section.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/schema-builder.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
-#include "icing/testing/snippet-helpers.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -48,13 +59,11 @@ namespace lib {
namespace {
using ::icing::lib::portable_equals_proto::EqualsProto;
-using ::testing::_;
using ::testing::Eq;
+using ::testing::Ge;
using ::testing::Gt;
using ::testing::HasSubstr;
using ::testing::IsEmpty;
-using ::testing::Lt;
-using ::testing::Matcher;
using ::testing::Return;
using ::testing::SizeIs;
using ::testing::StrEq;
@@ -65,19 +74,33 @@ class TestIcingSearchEngine : public IcingSearchEngine {
public:
TestIcingSearchEngine(const IcingSearchEngineOptions& options,
std::unique_ptr<const Filesystem> filesystem,
- std::unique_ptr<FakeClock> clock)
- : IcingSearchEngine(options, std::move(filesystem), std::move(clock)) {}
+ std::unique_ptr<const IcingFilesystem> icing_filesystem,
+ std::unique_ptr<Clock> clock,
+ std::unique_ptr<JniCache> jni_cache)
+ : IcingSearchEngine(options, std::move(filesystem),
+ std::move(icing_filesystem), std::move(clock),
+ std::move(jni_cache)) {}
};
std::string GetTestBaseDir() { return GetTestTempDir() + "/icing"; }
+// This test suite is meant to cover the IcingSearchEngine APIs that are not
+// specifically covered by the other IcingSearchEngine*Test suites.
class IcingSearchEngineTest : public testing::Test {
protected:
void SetUp() override {
- // File generated via icu_data_file rule in //icing/BUILD.
- std::string icu_data_file_path =
- GetTestFilePath("icing/icu.dat");
- ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(icu_data_file_path));
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ // If we've specified CFString or reverse-JNI segmentation (i.e. not ICU),
+ // then we won't have the ICU data file included to set up.
+ // Technically, we could choose to use reverse-JNI for segmentation AND
+ // include an ICU data file, but that seems unlikely and our current BUILD
+ // setup doesn't do this.
+ // File generated via icu_data_file rule in //icing/BUILD.
+ std::string icu_data_file_path =
+ GetTestFilePath("icing/icu.dat");
+ ICING_ASSERT_OK(
+ icu_data_file_helper::SetUpICUDataFile(icu_data_file_path));
+ }
filesystem_.CreateDirectoryRecursively(GetTestBaseDir().c_str());
}
@@ -91,21 +114,9 @@ class IcingSearchEngineTest : public testing::Test {
Filesystem filesystem_;
};
-constexpr int kMaxSupportedDocumentSize = (1u << 24) - 1;
-
// Non-zero value so we don't override it to be the current time
constexpr int64_t kDefaultCreationTimestampMs = 1575492852000;
-std::string GetDocumentDir() { return GetTestBaseDir() + "/document_dir"; }
-
-std::string GetIndexDir() { return GetTestBaseDir() + "/index_dir"; }
-
-std::string GetSchemaDir() { return GetTestBaseDir() + "/schema_dir"; }
-
-std::string GetHeaderFilename() {
- return GetTestBaseDir() + "/icing_search_engine_header";
-}
-
IcingSearchEngineOptions GetDefaultIcingOptions() {
IcingSearchEngineOptions icing_options;
icing_options.set_base_dir(GetTestBaseDir());
@@ -122,42 +133,48 @@ DocumentProto CreateMessageDocument(std::string name_space, std::string uri) {
}
SchemaProto CreateMessageSchema() {
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("Message");
-
- auto body = type->add_properties();
- body->set_property_name("body");
- body->set_data_type(PropertyConfigProto::DataType::STRING);
- body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- return schema;
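+ // The same "Message" schema as before, now expressed with the SchemaBuilder
+ // helper: a single required, prefix-indexed "body" string property.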
+ return SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
}
-SchemaProto CreateEmailSchema() {
- SchemaProto schema;
- auto* type = schema.add_types();
- type->set_schema_type("Email");
-
- auto* body = type->add_properties();
- body->set_property_name("body");
- body->set_data_type(PropertyConfigProto::DataType::STRING);
- body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- auto* subj = type->add_properties();
- subj->set_property_name("subject");
- subj->set_data_type(PropertyConfigProto::DataType::STRING);
- subj->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- subj->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- subj->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- return schema;
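+// Builds a schema with a "Person" type and an "Email" type whose "sender"
+// property is a nested Person document with its nested properties indexed.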
+SchemaProto CreatePersonAndEmailSchema() {
+ return SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
}
ScoringSpecProto GetDefaultScoringSpec() {
@@ -166,634 +183,38 @@ ScoringSpecProto GetDefaultScoringSpec() {
return scoring_spec;
}
-TEST_F(IcingSearchEngineTest, SimpleInitialization) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- DocumentProto document = CreateMessageDocument("namespace", "uri");
- ASSERT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(DocumentProto(document)).status().code(),
- Eq(StatusProto::OK));
-}
-
-TEST_F(IcingSearchEngineTest, InitializingAgainSavesNonPersistedData) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- DocumentProto document = CreateMessageDocument("namespace", "uri");
- ASSERT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
-
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = document;
-
- ASSERT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
-
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, MaxIndexMergeSizeReturnsInvalidArgument) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- options.set_index_merge_size(std::numeric_limits<int32_t>::max());
- IcingSearchEngine icing(options);
- EXPECT_THAT(icing.Initialize().status().code(),
- Eq(StatusProto::INVALID_ARGUMENT));
-}
-
-TEST_F(IcingSearchEngineTest, NegativeMergeSizeReturnsInvalidArgument) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- options.set_index_merge_size(-1);
- IcingSearchEngine icing(options);
- EXPECT_THAT(icing.Initialize().status().code(),
- Eq(StatusProto::INVALID_ARGUMENT));
-}
-
-TEST_F(IcingSearchEngineTest, ZeroMergeSizeReturnsInvalidArgument) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- options.set_index_merge_size(0);
- IcingSearchEngine icing(options);
- EXPECT_THAT(icing.Initialize().status().code(),
- Eq(StatusProto::INVALID_ARGUMENT));
-}
-
-TEST_F(IcingSearchEngineTest, GoodIndexMergeSizeReturnsOk) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- // One is fine, if a bit weird. It just means that the lite index will be
- // smaller and will request a merge any time content is added to it.
- options.set_index_merge_size(1);
- IcingSearchEngine icing(options);
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-}
-
-TEST_F(IcingSearchEngineTest,
- NegativeMaxTokensPerDocSizeReturnsInvalidArgument) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- options.set_max_tokens_per_doc(-1);
- IcingSearchEngine icing(options);
- EXPECT_THAT(icing.Initialize().status().code(),
- Eq(StatusProto::INVALID_ARGUMENT));
-}
-
-TEST_F(IcingSearchEngineTest, ZeroMaxTokensPerDocSizeReturnsInvalidArgument) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- options.set_max_tokens_per_doc(0);
- IcingSearchEngine icing(options);
- EXPECT_THAT(icing.Initialize().status().code(),
- Eq(StatusProto::INVALID_ARGUMENT));
-}
-
-TEST_F(IcingSearchEngineTest, GoodMaxTokensPerDocSizeReturnsOk) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- // INT_MAX is valid - it just means that we shouldn't limit the number of
- // tokens per document. It would be pretty inconceivable that anyone would
- // produce such a document - the text being indexed alone would take up at
- // least ~4.3 GiB! - and the document would be rejected before indexing
- // for exceeding max_document_size, but there's no reason to explicitly
- // bar it.
- options.set_max_tokens_per_doc(std::numeric_limits<int32_t>::max());
- IcingSearchEngine icing(options);
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-}
-
-TEST_F(IcingSearchEngineTest, NegativeMaxTokenLenReturnsInvalidArgument) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- options.set_max_token_length(-1);
- IcingSearchEngine icing(options);
- EXPECT_THAT(icing.Initialize().status().code(),
- Eq(StatusProto::INVALID_ARGUMENT));
-}
-
-TEST_F(IcingSearchEngineTest, ZeroMaxTokenLenReturnsInvalidArgument) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- options.set_max_token_length(0);
- IcingSearchEngine icing(options);
- EXPECT_THAT(icing.Initialize().status().code(),
- Eq(StatusProto::INVALID_ARGUMENT));
-}
-
-TEST_F(IcingSearchEngineTest, MaxTokenLenReturnsOkAndTruncatesTokens) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- // A length of 1 is allowed - even though it would be strange to want
- // this.
- options.set_max_token_length(1);
- IcingSearchEngine icing(options);
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- DocumentProto document = CreateMessageDocument("namespace", "uri");
- EXPECT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
-
- // "message" should have been truncated to "m"
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- // The indexed tokens were truncated to length of 1, so "m" will match
- search_spec.set_query("m");
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document;
-
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-
- // The query token is also truncated to length of 1, so "me"->"m" matches "m"
- search_spec.set_query("me");
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-
- // The query token is still truncated to length of 1, so "massage"->"m"
- // matches "m"
- search_spec.set_query("massage");
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest,
- MaxIntMaxTokenLenReturnsOkTooLargeTokenReturnsResourceExhausted) {
- IcingSearchEngineOptions options = GetDefaultIcingOptions();
- // Set token length to max. This is allowed (it just means never to
- // truncate tokens). However, this does mean that tokens that exceed the
- // size of the lexicon will cause indexing to fail.
- options.set_max_token_length(std::numeric_limits<int32_t>::max());
- IcingSearchEngine icing(options);
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- // Add a document that just barely fits under the max document limit.
- // This will still fail to index because we won't actually have enough
- // room in the lexicon to fit this content.
- std::string enormous_string(kMaxSupportedDocumentSize - 256, 'p');
- DocumentProto document =
- DocumentBuilder()
- .SetKey("namespace", "uri")
- .SetSchema("Message")
- .AddStringProperty("body", std::move(enormous_string))
- .Build();
- EXPECT_THAT(icing.Put(document).status().code(),
- Eq(StatusProto::OUT_OF_SPACE));
-
- SearchSpecProto search_spec;
- search_spec.set_query("p");
- search_spec.set_term_match_type(TermMatchType::PREFIX);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, FailToCreateDocStore) {
- auto mock_filesystem = std::make_unique<MockFilesystem>();
- // This fails DocumentStore::Create()
- ON_CALL(*mock_filesystem, CreateDirectoryRecursively(_))
- .WillByDefault(Return(false));
-
- TestIcingSearchEngine icing(GetDefaultIcingOptions(),
- std::move(mock_filesystem),
- std::make_unique<FakeClock>());
-
- InitializeResultProto initialize_result_proto = icing.Initialize();
- EXPECT_THAT(initialize_result_proto.status().code(),
- Eq(StatusProto::INTERNAL));
- EXPECT_THAT(initialize_result_proto.status().message(),
- HasSubstr("Could not create directory"));
-}
-
-TEST_F(IcingSearchEngineTest,
- CircularReferenceCreateSectionManagerReturnsInvalidArgument) {
- // Create a type config with a circular reference.
- SchemaProto schema;
- auto* type = schema.add_types();
- type->set_schema_type("Message");
-
- auto* body = type->add_properties();
- body->set_property_name("recipient");
- body->set_schema_type("Person");
- body->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- type = schema.add_types();
- type->set_schema_type("Person");
-
- body = type->add_properties();
- body->set_property_name("recipient");
- body->set_schema_type("Message");
- body->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(schema).status().code(),
- Eq(StatusProto::INVALID_ARGUMENT));
-}
-
-TEST_F(IcingSearchEngineTest, PutWithoutSchemaFailedPrecondition) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- DocumentProto document = CreateMessageDocument("namespace", "uri");
- PutResultProto put_result_proto = icing.Put(document);
- EXPECT_THAT(put_result_proto.status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
- EXPECT_THAT(put_result_proto.status().message(), HasSubstr("Schema not set"));
-}
-
-TEST_F(IcingSearchEngineTest, FailToReadSchema) {
- IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
-
- {
- // Successfully initialize and set a schema
- IcingSearchEngine icing(icing_options);
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- }
-
- auto mock_filesystem = std::make_unique<MockFilesystem>();
-
- // This fails FileBackedProto::Read() when we try to check the schema we
- // had previously set
- ON_CALL(*mock_filesystem,
- OpenForRead(Eq(icing_options.base_dir() + "/schema_dir/schema.pb")))
- .WillByDefault(Return(-1));
-
- TestIcingSearchEngine test_icing(icing_options, std::move(mock_filesystem),
- std::make_unique<FakeClock>());
-
- InitializeResultProto initialize_result_proto = test_icing.Initialize();
- EXPECT_THAT(initialize_result_proto.status().code(),
- Eq(StatusProto::INTERNAL));
- EXPECT_THAT(initialize_result_proto.status().message(),
- HasSubstr("Unable to open file for read"));
-}
-
-TEST_F(IcingSearchEngineTest, FailToWriteSchema) {
- IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
-
- auto mock_filesystem = std::make_unique<MockFilesystem>();
- // This fails FileBackedProto::Write()
- ON_CALL(*mock_filesystem,
- OpenForWrite(Eq(icing_options.base_dir() + "/schema_dir/schema.pb")))
- .WillByDefault(Return(-1));
-
- TestIcingSearchEngine icing(icing_options, std::move(mock_filesystem),
- std::make_unique<FakeClock>());
-
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- SetSchemaResultProto set_schema_result_proto =
- icing.SetSchema(CreateMessageSchema());
- EXPECT_THAT(set_schema_result_proto.status().code(),
- Eq(StatusProto::INTERNAL));
- EXPECT_THAT(set_schema_result_proto.status().message(),
- HasSubstr("Unable to open file for write"));
-}
-
-TEST_F(IcingSearchEngineTest, SetSchema) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- auto message_document = CreateMessageDocument("namespace", "uri");
-
- auto schema_with_message = CreateMessageSchema();
-
- SchemaProto schema_with_email;
- SchemaTypeConfigProto* type = schema_with_email.add_types();
- type->set_schema_type("Email");
- PropertyConfigProto* property = type->add_properties();
- property->set_property_name("title");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
-
- SchemaProto schema_with_email_and_message = schema_with_email;
- type = schema_with_email_and_message.add_types();
- type->set_schema_type("Message");
- property = type->add_properties();
- property->set_property_name("body");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
-
- // Create an arbitrary invalid schema
- SchemaProto invalid_schema;
- SchemaTypeConfigProto* empty_type = invalid_schema.add_types();
- empty_type->set_schema_type("");
-
- // Make sure we can't set invalid schemas
- EXPECT_THAT(icing.SetSchema(invalid_schema).status().code(),
- Eq(StatusProto::INVALID_ARGUMENT));
-
- // Can add a document of a set schema
- EXPECT_THAT(icing.SetSchema(schema_with_message).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(message_document).status().code(), Eq(StatusProto::OK));
-
- // Schema with Email doesn't have Message, so it would result in incompatible
- // data
- EXPECT_THAT(icing.SetSchema(schema_with_email).status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
-
- // Can expand the set of schema types and add a document of a new
- // schema type
- EXPECT_THAT(icing.SetSchema(SchemaProto(schema_with_email_and_message))
- .status()
- .code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(message_document).status().code(), Eq(StatusProto::OK));
-
- // Can't add a document whose schema isn't set
- auto photo_document = DocumentBuilder()
- .SetKey("namespace", "uri")
- .SetSchema("Photo")
- .AddStringProperty("creator", "icing")
- .Build();
- PutResultProto put_result_proto = icing.Put(photo_document);
- EXPECT_THAT(put_result_proto.status().code(), Eq(StatusProto::NOT_FOUND));
- EXPECT_THAT(put_result_proto.status().message(),
- HasSubstr("'Photo' not found"));
-}
-
-TEST_F(IcingSearchEngineTest, SetSchemaTriggersIndexRestorationAndReturnsOk) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- SchemaProto schema_with_no_indexed_property = CreateMessageSchema();
- schema_with_no_indexed_property.mutable_types(0)
- ->mutable_properties(0)
- ->clear_indexing_config();
-
- EXPECT_THAT(icing.SetSchema(schema_with_no_indexed_property).status().code(),
- Eq(StatusProto::OK));
- // Nothing will be indexed and Search() won't return anything.
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_query("message");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SearchResultProto empty_result;
- empty_result.mutable_status()->set_code(StatusProto::OK);
-
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(empty_result));
-
- SchemaProto schema_with_indexed_property = CreateMessageSchema();
- // Index restoration should be triggered here because the new schema requires
- // more properties to be indexed.
- EXPECT_THAT(icing.SetSchema(schema_with_indexed_property).status().code(),
- Eq(StatusProto::OK));
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- CreateMessageDocument("namespace", "uri");
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, SetSchemaRevalidatesDocumentsAndReturnsOk) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- SchemaProto schema_with_optional_subject;
- auto type = schema_with_optional_subject.add_types();
- type->set_schema_type("email");
-
- // Add an OPTIONAL property
- auto property = type->add_properties();
- property->set_property_name("subject");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
-
- EXPECT_THAT(icing.SetSchema(schema_with_optional_subject).status().code(),
- Eq(StatusProto::OK));
-
- DocumentProto email_document_without_subject =
- DocumentBuilder()
- .SetKey("namespace", "without_subject")
- .SetSchema("email")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- DocumentProto email_document_with_subject =
- DocumentBuilder()
- .SetKey("namespace", "with_subject")
- .SetSchema("email")
- .AddStringProperty("subject", "foo")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
-
- EXPECT_THAT(icing.Put(email_document_without_subject).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(email_document_with_subject).status().code(),
- Eq(StatusProto::OK));
-
- SchemaProto schema_with_required_subject;
- type = schema_with_required_subject.add_types();
- type->set_schema_type("email");
-
- // Add a REQUIRED property
- property = type->add_properties();
- property->set_property_name("subject");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
-
- // Can't set the schema since it's incompatible
- SetSchemaResultProto expected_set_schema_result_proto;
- expected_set_schema_result_proto.mutable_status()->set_code(
- StatusProto::FAILED_PRECONDITION);
- expected_set_schema_result_proto.mutable_status()->set_message(
- "Schema is incompatible.");
- expected_set_schema_result_proto.add_incompatible_schema_types("email");
-
- EXPECT_THAT(icing.SetSchema(schema_with_required_subject),
- EqualsProto(expected_set_schema_result_proto));
-
- // Force set it
- expected_set_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
- expected_set_schema_result_proto.mutable_status()->clear_message();
- EXPECT_THAT(icing.SetSchema(schema_with_required_subject,
- /*ignore_errors_and_delete_documents=*/true),
- EqualsProto(expected_set_schema_result_proto));
-
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = email_document_with_subject;
-
- EXPECT_THAT(icing.Get("namespace", "with_subject"),
- EqualsProto(expected_get_result_proto));
-
- // The document without a subject got deleted because it failed validation
- // against the new schema
- expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
- expected_get_result_proto.mutable_status()->set_message(
- "Document (namespace, without_subject) not found.");
- expected_get_result_proto.clear_document();
-
- EXPECT_THAT(icing.Get("namespace", "without_subject"),
- EqualsProto(expected_get_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, SetSchemaDeletesDocumentsAndReturnsOk) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("email");
- type = schema.add_types();
- type->set_schema_type("message");
-
- EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
-
- DocumentProto email_document =
- DocumentBuilder()
- .SetKey("namespace", "email_uri")
- .SetSchema("email")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- DocumentProto message_document =
- DocumentBuilder()
- .SetKey("namespace", "message_uri")
- .SetSchema("message")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
-
- EXPECT_THAT(icing.Put(email_document).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(message_document).status().code(), Eq(StatusProto::OK));
-
- // Clear the schema and only add the "email" type, essentially deleting the
- // "message" type
- SchemaProto new_schema;
- type = new_schema.add_types();
- type->set_schema_type("email");
-
- // Can't set the schema since it's incompatible
- SetSchemaResultProto expected_result;
- expected_result.mutable_status()->set_code(StatusProto::FAILED_PRECONDITION);
- expected_result.mutable_status()->set_message("Schema is incompatible.");
- expected_result.add_deleted_schema_types("message");
-
- EXPECT_THAT(icing.SetSchema(new_schema), EqualsProto(expected_result));
-
- // Force set it
- expected_result.mutable_status()->set_code(StatusProto::OK);
- expected_result.mutable_status()->clear_message();
- EXPECT_THAT(icing.SetSchema(new_schema,
- /*ignore_errors_and_delete_documents=*/true),
- EqualsProto(expected_result));
-
- // "email" document is still there
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = email_document;
-
- EXPECT_THAT(icing.Get("namespace", "email_uri"),
- EqualsProto(expected_get_result_proto));
-
- // "message" document got deleted
- expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
- expected_get_result_proto.mutable_status()->set_message(
- "Document (namespace, message_uri) not found.");
- expected_get_result_proto.clear_document();
-
- EXPECT_THAT(icing.Get("namespace", "message_uri"),
- EqualsProto(expected_get_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, GetSchemaNotFound) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- EXPECT_THAT(icing.GetSchema().status().code(), Eq(StatusProto::NOT_FOUND));
-}
-
-TEST_F(IcingSearchEngineTest, GetSchemaOk) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- GetSchemaResultProto expected_get_schema_result_proto;
- expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema();
- EXPECT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, GetSchemaTypeFailedPrecondition) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- GetSchemaTypeResultProto get_schema_type_result_proto =
- icing.GetSchemaType("nonexistent_schema");
- EXPECT_THAT(get_schema_type_result_proto.status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
- EXPECT_THAT(get_schema_type_result_proto.status().message(),
- HasSubstr("Schema not set"));
-}
-
-TEST_F(IcingSearchEngineTest, GetSchemaTypeOk) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- GetSchemaTypeResultProto expected_get_schema_type_result_proto;
- expected_get_schema_type_result_proto.mutable_status()->set_code(
- StatusProto::OK);
- *expected_get_schema_type_result_proto.mutable_schema_type_config() =
- CreateMessageSchema().types(0);
- EXPECT_THAT(icing.GetSchemaType(CreateMessageSchema().types(0).schema_type()),
- EqualsProto(expected_get_schema_type_result_proto));
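+// Builds a UsageReport for the given document key with the provided usage
+// timestamp and usage type.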
+UsageReport CreateUsageReport(std::string name_space, std::string uri,
+ int64_t timestamp_ms,
+ UsageReport::UsageType usage_type) {
+ UsageReport usage_report;
+ usage_report.set_document_namespace(name_space);
+ usage_report.set_document_uri(uri);
+ usage_report.set_usage_timestamp_ms(timestamp_ms);
+ usage_report.set_usage_type(usage_type);
+ return usage_report;
}
TEST_F(IcingSearchEngineTest, GetDocument) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
// Simple put and get
- ASSERT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(CreateMessageDocument("namespace", "uri")).status(),
+ ProtoIsOk());
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
*expected_get_result_proto.mutable_document() =
CreateMessageDocument("namespace", "uri");
- ASSERT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
+ ASSERT_THAT(
+ icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
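+ // Get() now takes a GetResultSpecProto; the default instance returns the
+ // whole document with no projection applied.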
// Put an invalid document
PutResultProto put_result_proto = icing.Put(DocumentProto());
- EXPECT_THAT(put_result_proto.status().code(),
- Eq(StatusProto::INVALID_ARGUMENT));
+ EXPECT_THAT(put_result_proto.status(),
+ ProtoStatusIs(StatusProto::INVALID_ARGUMENT));
EXPECT_THAT(put_result_proto.status().message(),
HasSubstr("'namespace' is empty"));
@@ -802,1785 +223,624 @@ TEST_F(IcingSearchEngineTest, GetDocument) {
expected_get_result_proto.mutable_status()->set_message(
"Document (wrong, uri) not found.");
expected_get_result_proto.clear_document();
- ASSERT_THAT(icing.Get("wrong", "uri"),
+ ASSERT_THAT(icing.Get("wrong", "uri", GetResultSpecProto::default_instance()),
EqualsProto(expected_get_result_proto));
}
-TEST_F(IcingSearchEngineTest, SearchReturnsValidResults) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- DocumentProto document_one = CreateMessageDocument("namespace", "uri1");
- ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK));
-
- DocumentProto document_two = CreateMessageDocument("namespace", "uri2");
- ASSERT_THAT(icing.Put(document_two).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message");
-
- ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
- result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
- result_spec.mutable_snippet_spec()->set_num_to_snippet(1);
-
- SearchResultProto results =
- icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
- EXPECT_THAT(results.status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(results.results(), SizeIs(2));
- EXPECT_THAT(results.results(0).document(), EqualsProto(document_two));
- EXPECT_THAT(GetMatch(results.results(0).document(),
- results.results(0).snippet(), "body",
- /*snippet_index=*/0),
- Eq("message"));
- EXPECT_THAT(
- GetWindow(results.results(0).document(), results.results(0).snippet(),
- "body", /*snippet_index=*/0),
- Eq("message body"));
- EXPECT_THAT(results.results(1).document(), EqualsProto(document_one));
- EXPECT_THAT(
- GetMatch(results.results(1).document(), results.results(1).snippet(),
- "body", /*snippet_index=*/0),
- IsEmpty());
- EXPECT_THAT(
- GetWindow(results.results(1).document(), results.results(1).snippet(),
- "body", /*snippet_index=*/0),
- IsEmpty());
-
- search_spec.set_query("foo");
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, SearchReturnsOneResult) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- DocumentProto document_one = CreateMessageDocument("namespace", "uri1");
- ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK));
-
- DocumentProto document_two = CreateMessageDocument("namespace", "uri2");
- ASSERT_THAT(icing.Put(document_two).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message");
-
- ResultSpecProto result_spec;
- result_spec.set_num_per_page(1);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document_two;
-
- SearchResultProto search_result_proto =
- icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
- EXPECT_THAT(search_result_proto.status().code(), Eq(StatusProto::OK));
- // The token is a random number so we don't verify it.
- expected_search_result_proto.set_next_page_token(
- search_result_proto.next_page_token());
- EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, SearchZeroResultLimitReturnsEmptyResults) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("");
-
- ResultSpecProto result_spec;
- result_spec.set_num_per_page(0);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), result_spec),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, SearchNegativeResultLimitReturnsInvalidArgument) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("");
-
- ResultSpecProto result_spec;
- result_spec.set_num_per_page(-5);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(
- StatusProto::INVALID_ARGUMENT);
- expected_search_result_proto.mutable_status()->set_message(
- "ResultSpecProto.num_per_page cannot be negative.");
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(), result_spec),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, SearchWithPersistenceReturnsValidResults) {
- IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
-
- {
- // Set the schema up beforehand.
- IcingSearchEngine icing(icing_options);
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- // Schema will be persisted to disk when icing goes out of scope.
- }
-
- {
- // Ensure that icing initializes the schema and section_manager
- // properly from the pre-existing file.
- IcingSearchEngine icing(icing_options);
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
- // The index and document store will be persisted to disk when icing goes
- // out of scope.
- }
-
- {
- // Ensure that the index is brought back up without problems and we
- // can query for the content that we expect.
- IcingSearchEngine icing(icing_options);
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message");
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- CreateMessageDocument("namespace", "uri");
-
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-
- search_spec.set_query("foo");
-
- SearchResultProto empty_result;
- empty_result.mutable_status()->set_code(StatusProto::OK);
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(empty_result));
- }
-}
-
-TEST_F(IcingSearchEngineTest, SearchShouldReturnEmpty) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message");
-
- // Empty result, no next-page token
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
-
- SearchResultProto search_result_proto =
- icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance());
-
- EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, SearchShouldReturnMultiplePages) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- // Creates and inserts 5 documents
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
- DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
- DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
- DocumentProto document4 = CreateMessageDocument("namespace", "uri4");
- DocumentProto document5 = CreateMessageDocument("namespace", "uri5");
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document4).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document5).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message");
-
- ResultSpecProto result_spec;
- result_spec.set_num_per_page(2);
-
- // Searches and gets the first page, 2 results
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document5;
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document4;
- SearchResultProto search_result_proto =
- icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
- EXPECT_THAT(search_result_proto.next_page_token(), Gt(kInvalidNextPageToken));
- uint64_t next_page_token = search_result_proto.next_page_token();
- // Since the token is a random number, we don't need to verify it.
- expected_search_result_proto.set_next_page_token(next_page_token);
- EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
-
- // Second page, 2 results
- expected_search_result_proto.clear_results();
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document3;
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document2;
- EXPECT_THAT(icing.GetNextPage(next_page_token),
- EqualsProto(expected_search_result_proto));
-
- // Third page, 1 result
- expected_search_result_proto.clear_results();
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document1;
- EXPECT_THAT(icing.GetNextPage(next_page_token),
- EqualsProto(expected_search_result_proto));
-
- // No more results
- expected_search_result_proto.clear_results();
- expected_search_result_proto.clear_next_page_token();
- EXPECT_THAT(icing.GetNextPage(next_page_token),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, SearchWithNoScoringShouldReturnMultiplePages) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- // Creates and inserts 5 documents
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
- DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
- DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
- DocumentProto document4 = CreateMessageDocument("namespace", "uri4");
- DocumentProto document5 = CreateMessageDocument("namespace", "uri5");
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document4).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document5).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message");
-
- ScoringSpecProto scoring_spec;
- scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::NONE);
-
- ResultSpecProto result_spec;
- result_spec.set_num_per_page(2);
-
- // Searches and gets the first page, 2 results
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document5;
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document4;
- SearchResultProto search_result_proto =
- icing.Search(search_spec, scoring_spec, result_spec);
- EXPECT_THAT(search_result_proto.next_page_token(), Gt(kInvalidNextPageToken));
- uint64_t next_page_token = search_result_proto.next_page_token();
- // Since the token is a random number, we don't need to verify it.
- expected_search_result_proto.set_next_page_token(next_page_token);
- EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
-
- // Second page, 2 results
- expected_search_result_proto.clear_results();
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document3;
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document2;
- EXPECT_THAT(icing.GetNextPage(next_page_token),
- EqualsProto(expected_search_result_proto));
-
- // Third page, 1 result
- expected_search_result_proto.clear_results();
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document1;
- EXPECT_THAT(icing.GetNextPage(next_page_token),
- EqualsProto(expected_search_result_proto));
-
- // No more results
- expected_search_result_proto.clear_results();
- expected_search_result_proto.clear_next_page_token();
- EXPECT_THAT(icing.GetNextPage(next_page_token),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, ShouldReturnMultiplePagesWithSnippets) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- // Creates and inserts 5 documents
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
- DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
- DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
- DocumentProto document4 = CreateMessageDocument("namespace", "uri4");
- DocumentProto document5 = CreateMessageDocument("namespace", "uri5");
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document4).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document5).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message");
-
- ResultSpecProto result_spec;
- result_spec.set_num_per_page(2);
- result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
- result_spec.mutable_snippet_spec()->set_num_matches_per_property(1);
- result_spec.mutable_snippet_spec()->set_num_to_snippet(3);
-
- // Searches and gets the first page, 2 results with 2 snippets
- SearchResultProto search_result =
- icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
- ASSERT_THAT(search_result.status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(search_result.results(), SizeIs(2));
- ASSERT_THAT(search_result.next_page_token(), Gt(kInvalidNextPageToken));
-
- EXPECT_THAT(search_result.results(0).document(), EqualsProto(document5));
- EXPECT_THAT(GetMatch(search_result.results(0).document(),
- search_result.results(0).snippet(), "body",
- /*snippet_index=*/0),
- Eq("message"));
- EXPECT_THAT(GetWindow(search_result.results(0).document(),
- search_result.results(0).snippet(), "body",
- /*snippet_index=*/0),
- Eq("message body"));
- EXPECT_THAT(search_result.results(1).document(), EqualsProto(document4));
- EXPECT_THAT(GetMatch(search_result.results(1).document(),
- search_result.results(1).snippet(), "body",
- /*snippet_index=*/0),
- Eq("message"));
- EXPECT_THAT(GetWindow(search_result.results(1).document(),
- search_result.results(1).snippet(), "body",
- /*snippet_index=*/0),
- Eq("message body"));
-
- // Second page, 2 results with 1 snippet
- search_result = icing.GetNextPage(search_result.next_page_token());
- ASSERT_THAT(search_result.status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(search_result.results(), SizeIs(2));
- ASSERT_THAT(search_result.next_page_token(), Gt(kInvalidNextPageToken));
-
- EXPECT_THAT(search_result.results(0).document(), EqualsProto(document3));
- EXPECT_THAT(GetMatch(search_result.results(0).document(),
- search_result.results(0).snippet(), "body",
- /*snippet_index=*/0),
- Eq("message"));
- EXPECT_THAT(GetWindow(search_result.results(0).document(),
- search_result.results(0).snippet(), "body",
- /*snippet_index=*/0),
- Eq("message body"));
- EXPECT_THAT(search_result.results(1).document(), EqualsProto(document2));
- EXPECT_THAT(search_result.results(1).snippet().entries_size(), Eq(0));
-
- // Third page, 1 result with 0 snippets
- search_result = icing.GetNextPage(search_result.next_page_token());
- ASSERT_THAT(search_result.status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(search_result.results(), SizeIs(1));
- ASSERT_THAT(search_result.next_page_token(), Gt(kInvalidNextPageToken));
-
- EXPECT_THAT(search_result.results(0).document(), EqualsProto(document1));
- EXPECT_THAT(search_result.results(0).snippet().entries_size(), Eq(0));
-}
-
-TEST_F(IcingSearchEngineTest, ShouldInvalidateNextPageToken) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
- DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message");
-
- ResultSpecProto result_spec;
- result_spec.set_num_per_page(1);
-
- // Searches and gets the first page, 1 result
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document2;
- SearchResultProto search_result_proto =
- icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
- EXPECT_THAT(search_result_proto.next_page_token(), Gt(kInvalidNextPageToken));
- uint64_t next_page_token = search_result_proto.next_page_token();
- // Since the token is a random number, we don't need to verify its exact value
- expected_search_result_proto.set_next_page_token(next_page_token);
- EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
- // Now document1 is still to be fetched.
-
- // Invalidates token
- icing.InvalidateNextPageToken(next_page_token);
-
- // Tries to fetch the second page, no results since it's invalidated
- expected_search_result_proto.clear_results();
- expected_search_result_proto.clear_next_page_token();
- EXPECT_THAT(icing.GetNextPage(next_page_token),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest,
- AllPageTokensShouldBeInvalidatedAfterOptimization) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
+TEST_F(IcingSearchEngineTest, GetDocumentProjectionEmpty) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
- DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("message");
-
- ResultSpecProto result_spec;
- result_spec.set_num_per_page(1);
-
- // Searches and gets the first page, 1 result
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document2;
- SearchResultProto search_result_proto =
- icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
- EXPECT_THAT(search_result_proto.next_page_token(), Gt(kInvalidNextPageToken));
- uint64_t next_page_token = search_result_proto.next_page_token();
- // Since the token is a random number, we don't need to verify its exact value
- expected_search_result_proto.set_next_page_token(next_page_token);
- EXPECT_THAT(search_result_proto, EqualsProto(expected_search_result_proto));
- // Now document1 is still to be fetched.
-
- OptimizeResultProto optimize_result_proto;
- optimize_result_proto.mutable_status()->set_code(StatusProto::OK);
- optimize_result_proto.mutable_status()->set_message("");
- ASSERT_THAT(icing.Optimize(), EqualsProto(optimize_result_proto));
-
- // Tries to fetch the second page, no results since all tokens have been
- // invalidated during Optimize()
- expected_search_result_proto.clear_results();
- expected_search_result_proto.clear_next_page_token();
- EXPECT_THAT(icing.GetNextPage(next_page_token),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, OptimizationShouldRemoveDeletedDocs) {
- IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ GetResultSpecProto result_spec;
+ TypePropertyMask* mask = result_spec.add_type_property_masks();
+ mask->set_schema_type(document.schema());
+ mask->add_paths("");
GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
- expected_get_result_proto.mutable_status()->set_message(
- "Document (namespace, uri1) not found.");
- {
- IcingSearchEngine icing(icing_options);
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
-
- // Deletes document1
- ASSERT_THAT(icing.Delete("namespace", "uri1").status().code(),
- Eq(StatusProto::OK));
- const std::string document_log_path =
- icing_options.base_dir() + "/document_dir/document_log";
- int64_t document_log_size_before =
- filesystem()->GetFileSize(document_log_path.c_str());
- ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::OK));
- int64_t document_log_size_after =
- filesystem()->GetFileSize(document_log_path.c_str());
-
- // Validates that document can't be found right after Optimize()
- EXPECT_THAT(icing.Get("namespace", "uri1"),
- EqualsProto(expected_get_result_proto));
- // Validates that document is actually removed from document log
- EXPECT_THAT(document_log_size_after, Lt(document_log_size_before));
- } // Destroys IcingSearchEngine to make sure nothing is cached.
-
- IcingSearchEngine icing(icing_options);
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Get("namespace", "uri1"),
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document;
+ expected_get_result_proto.mutable_document()->clear_properties();
+ ASSERT_THAT(icing.Get("namespace", "uri", result_spec),
EqualsProto(expected_get_result_proto));
}
-TEST_F(IcingSearchEngineTest, OptimizationShouldDeleteTemporaryDirectory) {
- IcingSearchEngineOptions icing_options = GetDefaultIcingOptions();
- IcingSearchEngine icing(icing_options);
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- // Create a tmp dir that will be used in Optimize() to swap files; this
- // validates that any tmp dirs will be deleted before use.
- const std::string tmp_dir =
- icing_options.base_dir() + "/document_dir_optimize_tmp";
-
- const std::string tmp_file = tmp_dir + "/file";
- ASSERT_TRUE(filesystem()->CreateDirectory(tmp_dir.c_str()));
- ScopedFd fd(filesystem()->OpenForWrite(tmp_file.c_str()));
- ASSERT_TRUE(fd.is_valid());
- ASSERT_TRUE(filesystem()->Write(fd.get(), "1234", 4));
- fd.reset();
-
- EXPECT_THAT(icing.Optimize().status().code(), Eq(StatusProto::OK));
-
- EXPECT_FALSE(filesystem()->DirectoryExists(tmp_dir.c_str()));
- EXPECT_FALSE(filesystem()->FileExists(tmp_file.c_str()));
-}
-
-TEST_F(IcingSearchEngineTest, GetOptimizeInfoHasCorrectStats) {
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "message body")
- .SetCreationTimestampMs(100)
- .SetTtlMs(500)
- .Build();
+TEST_F(IcingSearchEngineTest, GetDocumentWildCardProjectionEmpty) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
- auto fake_clock = std::make_unique<FakeClock>();
- fake_clock->SetSystemTimeMilliseconds(1000);
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
- TestIcingSearchEngine icing(GetDefaultIcingOptions(),
- std::make_unique<Filesystem>(),
- std::move(fake_clock));
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- // Just initialized, nothing is optimizable yet.
- GetOptimizeInfoResultProto optimize_info = icing.GetOptimizeInfo();
- EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0));
- EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0));
-
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
-
- // Only have active documents, nothing is optimizable yet.
- optimize_info = icing.GetOptimizeInfo();
- EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0));
- EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0));
-
- // Deletes document1
- ASSERT_THAT(icing.Delete("namespace", "uri1").status().code(),
- Eq(StatusProto::OK));
-
- optimize_info = icing.GetOptimizeInfo();
- EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(optimize_info.optimizable_docs(), Eq(1));
- EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Gt(0));
- int64_t first_estimated_optimizable_bytes =
- optimize_info.estimated_optimizable_bytes();
-
- // Add a second document, but it'll be expired since the time (1000) is
- // greater than the document's creation timestamp (100) + the document's ttl
- // (500)
- ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
-
- optimize_info = icing.GetOptimizeInfo();
- EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(optimize_info.optimizable_docs(), Eq(2));
- EXPECT_THAT(optimize_info.estimated_optimizable_bytes(),
- Gt(first_estimated_optimizable_bytes));
-
- // Optimize
- ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::OK));
-
- // Nothing is optimizable now that everything has been optimized away.
- optimize_info = icing.GetOptimizeInfo();
- EXPECT_THAT(optimize_info.status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(optimize_info.optimizable_docs(), Eq(0));
- EXPECT_THAT(optimize_info.estimated_optimizable_bytes(), Eq(0));
-}
-
-TEST_F(IcingSearchEngineTest, GetAndPutShouldWorkAfterOptimization) {
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
- DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
- DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
+ GetResultSpecProto result_spec;
+ TypePropertyMask* mask = result_spec.add_type_property_masks();
+ mask->set_schema_type("*");
+ mask->add_paths("");
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = document1;
-
- {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::OK));
-
- // Validates that Get() and Put() are good right after Optimize()
- EXPECT_THAT(icing.Get("namespace", "uri1"),
- EqualsProto(expected_get_result_proto));
- EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
- } // Destroys IcingSearchEngine to make sure nothing is cached.
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Get("namespace", "uri1"),
- EqualsProto(expected_get_result_proto));
-
- *expected_get_result_proto.mutable_document() = document2;
- EXPECT_THAT(icing.Get("namespace", "uri2"),
+ *expected_get_result_proto.mutable_document() = document;
+ expected_get_result_proto.mutable_document()->clear_properties();
+ ASSERT_THAT(icing.Get("namespace", "uri", result_spec),
EqualsProto(expected_get_result_proto));
-
- EXPECT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
}
-TEST_F(IcingSearchEngineTest, DeleteShouldWorkAfterOptimization) {
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
- DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
- {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::OK));
-
- // Validates that Delete() works right after Optimize()
- EXPECT_THAT(icing.Delete("namespace", "uri1").status().code(),
- Eq(StatusProto::OK));
-
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(
- StatusProto::NOT_FOUND);
- expected_get_result_proto.mutable_status()->set_message(
- "Document (namespace, uri1) not found.");
- EXPECT_THAT(icing.Get("namespace", "uri1"),
- EqualsProto(expected_get_result_proto));
-
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- expected_get_result_proto.mutable_status()->clear_message();
- *expected_get_result_proto.mutable_document() = document2;
- EXPECT_THAT(icing.Get("namespace", "uri2"),
- EqualsProto(expected_get_result_proto));
- } // Destroys IcingSearchEngine to make sure nothing is cached.
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Delete("namespace", "uri2").status().code(),
- Eq(StatusProto::OK));
-
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
- expected_get_result_proto.mutable_status()->set_message(
- "Document (namespace, uri1) not found.");
- EXPECT_THAT(icing.Get("namespace", "uri1"),
- EqualsProto(expected_get_result_proto));
-
- expected_get_result_proto.mutable_status()->set_message(
- "Document (namespace, uri2) not found.");
- EXPECT_THAT(icing.Get("namespace", "uri2"),
- EqualsProto(expected_get_result_proto));
-}
+TEST_F(IcingSearchEngineTest, GetDocumentProjectionMultipleFieldPaths) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
-TEST_F(IcingSearchEngineTest, DeleteBySchemaType) {
- SchemaProto schema;
- // Add an email type
- auto type = schema.add_types();
- type->set_schema_type("email");
- auto property = type->add_properties();
- property->set_property_name("subject");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- // Add a message type
- type = schema.add_types();
- type->set_schema_type("message");
- property = type->add_properties();
- property->set_property_name("body");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- DocumentProto document1 =
- DocumentBuilder()
- .SetKey("namespace1", "uri1")
- .SetSchema("message")
- .AddStringProperty("body", "message body1")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- DocumentProto document2 =
+ // 1. Add an email document
+ DocumentProto document =
DocumentBuilder()
- .SetKey("namespace2", "uri2")
- .SetSchema("email")
- .AddStringProperty("subject", "message body2")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
.Build();
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
-
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = document1;
- EXPECT_THAT(icing.Get("namespace1", "uri1"),
- EqualsProto(expected_get_result_proto));
-
- *expected_get_result_proto.mutable_document() = document2;
- EXPECT_THAT(icing.Get("namespace2", "uri2"),
- EqualsProto(expected_get_result_proto));
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
- // Delete the first type. The first doc should be irretrievable. The
- // second should still be present.
- EXPECT_THAT(icing.DeleteBySchemaType("message").status().code(),
- Eq(StatusProto::OK));
-
- expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
- expected_get_result_proto.mutable_status()->set_message(
- "Document (namespace1, uri1) not found.");
- expected_get_result_proto.clear_document();
- EXPECT_THAT(icing.Get("namespace1", "uri1"),
- EqualsProto(expected_get_result_proto));
+ GetResultSpecProto result_spec;
+ TypePropertyMask* mask = result_spec.add_type_property_masks();
+ mask->set_schema_type("Email");
+ mask->add_paths("sender.name");
+ mask->add_paths("subject");
+ // 2. Verify that the returned result only contains the 'sender.name'
+ // property and the 'subject' property.
+ GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- expected_get_result_proto.mutable_status()->clear_message();
- *expected_get_result_proto.mutable_document() = document2;
- EXPECT_THAT(icing.Get("namespace2", "uri2"),
- EqualsProto(expected_get_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, DeleteByNamespace) {
- DocumentProto document1 =
- DocumentBuilder()
- .SetKey("namespace1", "uri1")
- .SetSchema("Message")
- .AddStringProperty("body", "message body1")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- DocumentProto document2 =
+ *expected_get_result_proto.mutable_document() =
DocumentBuilder()
- .SetKey("namespace2", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "message body2")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty("sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
.Build();
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
-
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = document1;
- EXPECT_THAT(icing.Get("namespace1", "uri1"),
- EqualsProto(expected_get_result_proto));
-
- *expected_get_result_proto.mutable_document() = document2;
- EXPECT_THAT(icing.Get("namespace2", "uri2"),
+ ASSERT_THAT(icing.Get("namespace", "uri1", result_spec),
EqualsProto(expected_get_result_proto));
-
- // Delete the first namespace. The first doc should be irretrievable. The
- // second should still be present.
- EXPECT_THAT(icing.DeleteByNamespace("namespace1").status().code(),
- Eq(StatusProto::OK));
-
- expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
- expected_get_result_proto.mutable_status()->set_message(
- "Document (namespace1, uri1) not found.");
- expected_get_result_proto.clear_document();
- EXPECT_THAT(icing.Get("namespace1", "uri1"),
- EqualsProto(expected_get_result_proto));
-
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- expected_get_result_proto.mutable_status()->clear_message();
- *expected_get_result_proto.mutable_document() = document2;
- EXPECT_THAT(icing.Get("namespace2", "uri2"),
- EqualsProto(expected_get_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, SetSchemaShouldWorkAfterOptimization) {
- // Creates 3 test schemas
- SchemaProto schema1 = SchemaProto(CreateMessageSchema());
-
- SchemaProto schema2 = SchemaProto(schema1);
- auto new_property2 = schema2.mutable_types(0)->add_properties();
- new_property2->set_property_name("property2");
- new_property2->set_data_type(PropertyConfigProto::DataType::STRING);
- new_property2->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- new_property2->mutable_indexing_config()->set_term_match_type(
- TermMatchType::PREFIX);
- new_property2->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- SchemaProto schema3 = SchemaProto(schema2);
- auto new_property3 = schema3.mutable_types(0)->add_properties();
- new_property3->set_property_name("property3");
- new_property3->set_data_type(PropertyConfigProto::DataType::STRING);
- new_property3->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- new_property3->mutable_indexing_config()->set_term_match_type(
- TermMatchType::PREFIX);
- new_property3->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(schema1).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::OK));
-
- // Validates that SetSchema() works right after Optimize()
- EXPECT_THAT(icing.SetSchema(schema2).status().code(), Eq(StatusProto::OK));
- } // Destroys IcingSearchEngine to make sure nothing is cached.
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(schema3).status().code(), Eq(StatusProto::OK));
-}
-
-TEST_F(IcingSearchEngineTest, SearchShouldWorkAfterOptimization) {
- DocumentProto document = CreateMessageDocument("namespace", "uri");
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("m");
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document;
-
- {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Optimize().status().code(), Eq(StatusProto::OK));
-
- // Validates that Search() works right after Optimize()
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
- } // Destroys IcingSearchEngine to make sure nothing is cached.
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
}
-TEST_F(IcingSearchEngineTest, IcingShouldWorkFineIfOptimizationIsAborted) {
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
- {
- // Initializes a normal icing to create files needed
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- }
+TEST_F(IcingSearchEngineTest, GetDocumentWildcardProjectionMultipleFieldPaths) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
- // Creates a mock filesystem in which DeleteDirectoryRecursively() always
- // fails. This will fail IcingSearchEngine::OptimizeDocumentStore() and make
- // it return ABORTED_ERROR.
- auto mock_filesystem = std::make_unique<MockFilesystem>();
- ON_CALL(*mock_filesystem, DeleteDirectoryRecursively)
- .WillByDefault(Return(false));
-
- TestIcingSearchEngine icing(GetDefaultIcingOptions(),
- std::move(mock_filesystem),
- std::make_unique<FakeClock>());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Optimize().status().code(), Eq(StatusProto::ABORTED));
+ // 1. Add an email document
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
- // Now that optimization has been aborted, we verify that document-related
- // functions still work as expected.
+ GetResultSpecProto result_spec;
+ TypePropertyMask* mask = result_spec.add_type_property_masks();
+ mask->set_schema_type("*");
+ mask->add_paths("sender.name");
+ mask->add_paths("subject");
+ // 2. Verify that the returned result only contains the 'sender.name'
+ // property and the 'subject' property.
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = document1;
- EXPECT_THAT(icing.Get("namespace", "uri1"),
+ *expected_get_result_proto.mutable_document() =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty("sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .Build();
+ ASSERT_THAT(icing.Get("namespace", "uri1", result_spec),
EqualsProto(expected_get_result_proto));
-
- DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
-
- EXPECT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_query("m");
- search_spec.set_term_match_type(TermMatchType::PREFIX);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document2;
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document1;
-
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
}
TEST_F(IcingSearchEngineTest,
- OptimizationShouldRecoverIfFileDirectoriesAreMissing) {
- // Creates a mock filesystem in which SwapFiles() always fails and deletes the
- // directories. This will fail IcingSearchEngine::OptimizeDocumentStore().
- auto mock_filesystem = std::make_unique<MockFilesystem>();
- ON_CALL(*mock_filesystem, SwapFiles)
- .WillByDefault([this](const char* one, const char* two) {
- filesystem()->DeleteDirectoryRecursively(one);
- filesystem()->DeleteDirectoryRecursively(two);
- return false;
- });
+ GetDocumentSpecificProjectionOverridesWildcardProjection) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(),
+ ProtoIsOk());
- TestIcingSearchEngine icing(GetDefaultIcingOptions(),
- std::move(mock_filesystem),
- std::make_unique<FakeClock>());
-
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- ASSERT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
-
- // Optimize() fails due to filesystem error
- EXPECT_THAT(icing.Optimize().status().code(),
- Eq(StatusProto::WARNING_DATA_LOSS));
-
- // Document is not found because original file directory is missing
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
- expected_get_result_proto.mutable_status()->set_message(
- "Document (namespace, uri) not found.");
- EXPECT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
-
- DocumentProto new_document =
+ // 1. Add an email document
+ DocumentProto document =
DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "new body")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
+ .Build())
+ .AddStringProperty("subject", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
.Build();
-
- EXPECT_THAT(icing.Put(new_document).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_query("m");
- search_spec.set_term_match_type(TermMatchType::PREFIX);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
-
- // Searching old content returns nothing because original file directory is
- // missing
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-
- search_spec.set_query("n");
-
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- new_document;
-
- // Searching new content returns the new document
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, OptimizationShouldRecoverIfDataFilesAreMissing) {
- // Creates a mock filesystem in which SwapFiles() always fails and empties the
- // directories. This will fail IcingSearchEngine::OptimizeDocumentStore().
- auto mock_filesystem = std::make_unique<MockFilesystem>();
- ON_CALL(*mock_filesystem, SwapFiles)
- .WillByDefault([this](const char* one, const char* two) {
- filesystem()->DeleteDirectoryRecursively(one);
- filesystem()->CreateDirectoryRecursively(one);
- filesystem()->DeleteDirectoryRecursively(two);
- filesystem()->CreateDirectoryRecursively(two);
- return false;
- });
-
- TestIcingSearchEngine icing(GetDefaultIcingOptions(),
- std::move(mock_filesystem),
- std::make_unique<FakeClock>());
-
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- ASSERT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
-
- // Optimize() fails due to filesystem error
- EXPECT_THAT(icing.Optimize().status().code(),
- Eq(StatusProto::WARNING_DATA_LOSS));
-
- // Document is not found because original files are missing
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ // 2. Add type property masks for the wildcard and the specific type of the
+ // document 'Email'. The wildcard should be ignored and only the 'Email'
+ // projection should apply.
+ GetResultSpecProto result_spec;
+ TypePropertyMask* mask = result_spec.add_type_property_masks();
+ mask->set_schema_type("*");
+ mask->add_paths("subject");
+ mask = result_spec.add_type_property_masks();
+ mask->set_schema_type("Email");
+ mask->add_paths("body");
+
+ // 3. Verify that the returned result only contains the 'body' property.
GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
- expected_get_result_proto.mutable_status()->set_message(
- "Document (namespace, uri) not found.");
- EXPECT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
-
- DocumentProto new_document =
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() =
DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "new body")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
.Build();
-
- EXPECT_THAT(icing.Put(new_document).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_query("m");
- search_spec.set_term_match_type(TermMatchType::PREFIX);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
-
- // Searching old content returns nothing because original files are missing
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-
- search_spec.set_query("n");
-
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- new_document;
-
- // Searching new content returns the new document
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, SearchIncludesDocumentsBeforeTtl) {
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("Message");
-
- auto body = type->add_properties();
- body->set_property_name("body");
- body->set_data_type(PropertyConfigProto::DataType::STRING);
- body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- DocumentProto document = DocumentBuilder()
- .SetKey("namespace", "uri")
- .SetSchema("Message")
- .AddStringProperty("body", "message body")
- .SetCreationTimestampMs(100)
- .SetTtlMs(500)
- .Build();
-
- SearchSpecProto search_spec;
- search_spec.set_query("message");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document;
-
- // Time just has to be less than the document's creation timestamp (100) + the
- // document's ttl (500)
- auto fake_clock = std::make_unique<FakeClock>();
- fake_clock->SetSystemTimeMilliseconds(400);
-
- TestIcingSearchEngine icing(GetDefaultIcingOptions(),
- std::make_unique<Filesystem>(),
- std::move(fake_clock));
-
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
-
- // Check that the document is returned as part of search results
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, SearchDoesntIncludeDocumentsPastTtl) {
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("Message");
-
- auto body = type->add_properties();
- body->set_property_name("body");
- body->set_data_type(PropertyConfigProto::DataType::STRING);
- body->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- DocumentProto document = DocumentBuilder()
- .SetKey("namespace", "uri")
- .SetSchema("Message")
- .AddStringProperty("body", "message body")
- .SetCreationTimestampMs(100)
- .SetTtlMs(500)
- .Build();
-
- SearchSpecProto search_spec;
- search_spec.set_query("message");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
-
- // Time just has to be greater than the document's creation timestamp (100) +
- // the document's ttl (500)
- auto fake_clock = std::make_unique<FakeClock>();
- fake_clock->SetSystemTimeMilliseconds(700);
-
- TestIcingSearchEngine icing(GetDefaultIcingOptions(),
- std::make_unique<Filesystem>(),
- std::move(fake_clock));
-
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
-
- // Check that the document is not returned as part of search results
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
+ ASSERT_THAT(icing.Get("namespace", "uri1", result_spec),
+ EqualsProto(expected_get_result_proto));
}
-TEST_F(IcingSearchEngineTest, SearchWorksAfterSchemaTypesCompatiblyModified) {
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("message");
-
- auto property = type_config->add_properties();
- property->set_property_name("body");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+TEST_F(IcingSearchEngineTest, GetDocumentProjectionPolymorphism) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Artist")
+ .AddParentType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("company")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
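+ // Note that Artist declares Person as a parent type above; this is what
+ // should let a Person projection mask merge into Artist's mask below.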
- DocumentProto message_document =
+ // Add a person document and an artist document
+ DocumentProto document_person =
DocumentBuilder()
- .SetKey("namespace", "message_uri")
- .SetSchema("message")
- .AddStringProperty("body", "foo")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
.Build();
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(message_document).status().code(), Eq(StatusProto::OK));
-
- // Make sure we can search for message document
- SearchSpecProto search_spec;
- search_spec.set_query("foo");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
-
- // The message isn't indexed, so we get nothing
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-
- // With just the schema type filter, we can search for the message
- search_spec.Clear();
- search_spec.add_schema_type_filters("message");
-
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- message_document;
-
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-
- // Since SchemaTypeIds are assigned based on order in the SchemaProto, this
- // will force a change in the DocumentStore's cached SchemaTypeIds
- schema.clear_types();
- type_config = schema.add_types();
- type_config->set_schema_type("email");
-
- // Adding a new indexed property will require reindexing
- type_config = schema.add_types();
- type_config->set_schema_type("message");
-
- property = type_config->add_properties();
- property->set_property_name("body");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::PREFIX);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
-
- search_spec.Clear();
- search_spec.set_query("foo");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
- search_spec.add_schema_type_filters("message");
-
- // We can still search for the message document
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, RecoverFromMissingHeaderFile) {
- SearchSpecProto search_spec;
- search_spec.set_query("message");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- CreateMessageDocument("namespace", "uri");
-
+ DocumentProto document_artist =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Artist")
+ .AddStringProperty("name", "Meg Artist")
+ .AddStringProperty("emailAddress", "artist@aol.com")
+ .AddStringProperty("company", "aol")
+ .Build();
+ ASSERT_THAT(icing.Put(document_person).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document_artist).status(), ProtoIsOk());
+
+ // Add type property masks
+ GetResultSpecProto result_spec;
+ TypePropertyMask* person_type_property_mask =
+ result_spec.add_type_property_masks();
+ person_type_property_mask->set_schema_type("Person");
+ person_type_property_mask->add_paths("name");
+ // Since Artist is a child type of Person, the TypePropertyMask for Person
+ // will be merged into Artist's TypePropertyMask by polymorphism, so 'name'
+ // will also appear in Artist's projection results.
+ TypePropertyMask* artist_type_property_mask =
+ result_spec.add_type_property_masks();
+ artist_type_property_mask->set_schema_type("Artist");
+ artist_type_property_mask->add_paths("emailAddress");
+
+ // Verify that the returned person document only contains the 'name' property,
+ // and the returned artist document contains both the 'name' and 'emailAddress'
+ // properties.
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
*expected_get_result_proto.mutable_document() =
- CreateMessageDocument("namespace", "uri");
-
- {
- // Basic initialization/setup
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
- } // This should shut down IcingSearchEngine and persist anything it needs to
-
- EXPECT_TRUE(filesystem()->DeleteFile(GetHeaderFilename().c_str()));
-
- // We should be able to recover from this and access all our previous data
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- // Checks that DocumentLog is still ok
- EXPECT_THAT(icing.Get("namespace", "uri"),
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .Build();
+ ASSERT_THAT(icing.Get("namespace", "uri1", result_spec),
EqualsProto(expected_get_result_proto));
- // Checks that the index is still ok so we can search over it
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-
- // Checks that Schema is still valid since it'll be needed to validate the document
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
-}
-
-TEST_F(IcingSearchEngineTest, RecoverFromInvalidHeaderMagic) {
- SearchSpecProto search_spec;
- search_spec.set_query("message");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- CreateMessageDocument("namespace", "uri");
-
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
*expected_get_result_proto.mutable_document() =
- CreateMessageDocument("namespace", "uri");
-
- {
- // Basic initialization/setup
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
- } // This should shut down IcingSearchEngine and persist anything it needs to
-
- // Change the header's magic value
- int32_t invalid_magic = 1; // Anything that's not the actual kMagic value.
- filesystem()->PWrite(GetHeaderFilename().c_str(),
- offsetof(IcingSearchEngine::Header, magic),
- &invalid_magic, sizeof(invalid_magic));
-
- // We should be able to recover from this and access all our previous data
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- // Checks that DocumentLog is still ok
- EXPECT_THAT(icing.Get("namespace", "uri"),
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Artist")
+ .AddStringProperty("name", "Meg Artist")
+ .AddStringProperty("emailAddress", "artist@aol.com")
+ .Build();
+ ASSERT_THAT(icing.Get("namespace", "uri2", result_spec),
EqualsProto(expected_get_result_proto));
-
- // Checks that the index is still ok so we can search over it
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-
- // Checks that Schema is still valid since it'll be needed to validate the document
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
}
-TEST_F(IcingSearchEngineTest, RecoverFromInvalidHeaderChecksum) {
- SearchSpecProto search_spec;
- search_spec.set_query("message");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+TEST_F(IcingSearchEngineTest, GetDocumentProjectionMultipleParentPolymorphism) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("content")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("note")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("EmailMessage")
+ .AddParentType("Email")
+ .AddParentType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("content")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("note")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
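+ // Note that EmailMessage declares both Email and Message as parent types, so
+ // masks for either parent are expected to merge into its projection.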
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- CreateMessageDocument("namespace", "uri");
+ // Add an email document and a message document
+ DocumentProto document_email =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("sender", "sender1")
+ .AddStringProperty("recipient", "recipient1")
+ .Build();
+ DocumentProto document_message = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Message")
+ .AddStringProperty("content", "content1")
+ .AddStringProperty("note", "note1")
+ .Build();
+ // Add an EmailMessage document
+ DocumentProto document_email_message =
+ DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("EmailMessage")
+ .AddStringProperty("sender", "sender2")
+ .AddStringProperty("recipient", "recipient2")
+ .AddStringProperty("content", "content2")
+ .AddStringProperty("note", "note2")
+ .Build();
+ ASSERT_THAT(icing.Put(document_email).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document_message).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document_email_message).status(), ProtoIsOk());
+
+ // Add type property masks for Email and Message, both of which will apply
+ // to EmailMessage.
+ GetResultSpecProto result_spec;
+ TypePropertyMask* email_type_property_mask =
+ result_spec.add_type_property_masks();
+ email_type_property_mask->set_schema_type("Email");
+ email_type_property_mask->add_paths("sender");
+
+ TypePropertyMask* message_type_property_mask =
+ result_spec.add_type_property_masks();
+ message_type_property_mask->set_schema_type("Message");
+ message_type_property_mask->add_paths("content");
+
+ // Verify that
+ // - The returned email document only contains the 'sender' property.
+ // - The returned message document only contains the 'content' property.
+ // - The returned email message document contains both the 'sender' and
+ // 'content' properties.
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
*expected_get_result_proto.mutable_document() =
- CreateMessageDocument("namespace", "uri");
-
- {
- // Basic initialization/setup
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
- } // This should shut down IcingSearchEngine and persist anything it needs to
-
- // Change the header's checksum value
- uint32_t invalid_checksum =
- 1; // Anything that's not the actual checksum value
- filesystem()->PWrite(GetHeaderFilename().c_str(),
- offsetof(IcingSearchEngine::Header, checksum),
- &invalid_checksum, sizeof(invalid_checksum));
-
- // We should be able to recover from this and access all our previous data
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- // Checks that DocumentLog is still ok
- EXPECT_THAT(icing.Get("namespace", "uri"),
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("sender", "sender1")
+ .Build();
+ ASSERT_THAT(icing.Get("namespace", "uri1", result_spec),
EqualsProto(expected_get_result_proto));
- // Checks that the index is still ok so we can search over it
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-
- // Checks that Schema is still valid since it'll be needed to validate the document
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
-}
-
-TEST_F(IcingSearchEngineTest, UnableToRecoverFromCorruptSchema) {
- {
- // Basic initialization/setup
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
-
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() =
- CreateMessageDocument("namespace", "uri");
-
- EXPECT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
- } // This should shut down IcingSearchEngine and persist anything it needs to
-
- const std::string schema_file =
- absl_ports::StrCat(GetSchemaDir(), "/schema.pb");
- const std::string corrupt_data = "1234";
- EXPECT_TRUE(filesystem()->Write(schema_file.c_str(), corrupt_data.data(),
- corrupt_data.size()));
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::INTERNAL));
-}
-
-TEST_F(IcingSearchEngineTest, UnableToRecoverFromCorruptDocumentLog) {
- {
- // Basic initialization/setup
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
-
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() =
- CreateMessageDocument("namespace", "uri");
-
- EXPECT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
- } // This should shut down IcingSearchEngine and persist anything it needs to
-
- const std::string document_log_file =
- absl_ports::StrCat(GetDocumentDir(), "/document_log");
- const std::string corrupt_data = "1234";
- EXPECT_TRUE(filesystem()->Write(document_log_file.c_str(),
- corrupt_data.data(), corrupt_data.size()));
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::INTERNAL));
-}
-
-TEST_F(IcingSearchEngineTest, RecoverFromInconsistentSchemaStore) {
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
- DocumentProto document2_with_additional_property =
+ *expected_get_result_proto.mutable_document() =
DocumentBuilder()
.SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
.SetSchema("Message")
- .AddStringProperty("additional", "content")
- .AddStringProperty("body", "message body")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .AddStringProperty("content", "content1")
.Build();
+ ASSERT_THAT(icing.Get("namespace", "uri2", result_spec),
+ EqualsProto(expected_get_result_proto));
- {
- // Initializes folder and schema
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("Message");
-
- auto property = type->add_properties();
- property->set_property_name("body");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::PREFIX);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- property = type->add_properties();
- property->set_property_name("additional");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
-
- EXPECT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document2_with_additional_property).status().code(),
- Eq(StatusProto::OK));
-
- // Won't get us anything because "additional" isn't marked as an indexed
- // property in the schema
- SearchSpecProto search_spec;
- search_spec.set_query("additional:content");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
- } // This should shut down IcingSearchEngine and persist anything it needs to
-
- {
- // This schema will change the SchemaTypeIds from the previous schema_
- // (since SchemaTypeIds are assigned based on order of the types, and this
- // new schema changes the ordering of previous types)
- SchemaProto new_schema;
- auto type = new_schema.add_types();
- type->set_schema_type("Email");
-
- type = new_schema.add_types();
- type->set_schema_type("Message");
-
- // Adding a new property changes the SectionIds (since SectionIds are
- // assigned based on alphabetical order of indexed sections, marking
- // "additional" as an indexed property will push the "body" property to a
- // different SectionId)
- auto property = type->add_properties();
- property->set_property_name("body");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::PREFIX);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- property = type->add_properties();
- property->set_property_name("additional");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::PREFIX);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(filesystem(), GetSchemaDir()));
- ICING_EXPECT_OK(schema_store->SetSchema(new_schema));
- } // Will persist new schema
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
-  // We can insert an Email document since we kept the new schema
- DocumentProto email_document =
+ *expected_get_result_proto.mutable_document() =
DocumentBuilder()
- .SetKey("namespace", "email_uri")
- .SetSchema("Email")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .SetKey("namespace", "uri3")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("EmailMessage")
+ .AddStringProperty("sender", "sender2")
+ .AddStringProperty("content", "content2")
.Build();
- EXPECT_THAT(icing.Put(email_document).status().code(), Eq(StatusProto::OK));
-
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = email_document;
-
- EXPECT_THAT(icing.Get("namespace", "email_uri"),
+ ASSERT_THAT(icing.Get("namespace", "uri3", result_spec),
EqualsProto(expected_get_result_proto));
-
- SearchSpecProto search_spec;
-
- // The section restrict will ensure we are using the correct, updated
- // SectionId in the Index
- search_spec.set_query("additional:content");
-
- // Schema type filter will ensure we're using the correct, updated
- // SchemaTypeId in the DocumentStore
- search_spec.add_schema_type_filters("Message");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document2_with_additional_property;
-
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
}
-TEST_F(IcingSearchEngineTest, RecoverFromInconsistentDocumentStore) {
- DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
- DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+TEST_F(IcingSearchEngineTest, GetDocumentProjectionDiamondPolymorphism) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+
+ // Create a schema with a diamond inheritance relation.
+  //          Object
+  //         /      \
+  //     Email      Message
+  //         \      /
+  //       EmailMessage
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Object").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("objectId")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddParentType("Object")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("objectId")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddParentType("Object")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("objectId")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("content")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("note")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("EmailMessage")
+ .AddParentType("Email")
+ .AddParentType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("objectId")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("content")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("note")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(icing.SetSchema(schema).status(), ProtoIsOk());
- {
- // Initializes folder and schema, index one document
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- } // This should shut down IcingSearchEngine and persist anything it needs to
+ // Add an email document and a message document
+ DocumentProto document_email =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("objectId", "object1")
+ .AddStringProperty("sender", "sender1")
+ .AddStringProperty("recipient", "recipient1")
+ .Build();
+ DocumentProto document_message = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Message")
+ .AddStringProperty("objectId", "object2")
+ .AddStringProperty("content", "content1")
+ .AddStringProperty("note", "note1")
+ .Build();
+  // Add an EmailMessage document
+ DocumentProto document_email_message =
+ DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("EmailMessage")
+ .AddStringProperty("objectId", "object3")
+ .AddStringProperty("sender", "sender2")
+ .AddStringProperty("recipient", "recipient2")
+ .AddStringProperty("content", "content2")
+ .AddStringProperty("note", "note2")
+ .Build();
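+  // EmailMessage inherits from both Email and Message, so it carries all five
+  // properties declared across the two parent types.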
- {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(filesystem(), GetSchemaDir()));
- ICING_EXPECT_OK(schema_store->SetSchema(CreateMessageSchema()));
-
- // Puts a second document into DocumentStore but doesn't index it.
- FakeClock fake_clock;
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(filesystem(), GetDocumentDir(), &fake_clock,
- schema_store.get()));
- ICING_EXPECT_OK(document_store->Put(document2));
- }
+ ASSERT_THAT(icing.Put(document_email).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document_message).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document_email_message).status(), ProtoIsOk());
- IcingSearchEngine icing(GetDefaultIcingOptions());
- // Index Restoration should be triggered here and document2 should be
- // indexed.
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ // Add type property masks for Object, which should apply to Email, Message
+ // and EmailMessage.
+ GetResultSpecProto result_spec;
+ TypePropertyMask* email_type_property_mask =
+ result_spec.add_type_property_masks();
+ email_type_property_mask->set_schema_type("Object");
+ email_type_property_mask->add_paths("objectId");
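+  // Because type property masks apply polymorphically, the single "Object"
+  // mask above is expected to act as if each subtype had its own mask, e.g.
+  // (sketch of a hypothetical explicit equivalent):
+  //   TypePropertyMask* mask = result_spec.add_type_property_masks();
+  //   mask->set_schema_type("EmailMessage");
+  //   mask->add_paths("objectId");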
+ // Verify that all the documents only contain the 'objectId' property.
GetResultProto expected_get_result_proto;
expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = document1;
-
- // DocumentStore kept the additional document
- EXPECT_THAT(icing.Get("namespace", "uri1"),
+ *expected_get_result_proto.mutable_document() =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("objectId", "object1")
+ .Build();
+ ASSERT_THAT(icing.Get("namespace", "uri1", result_spec),
EqualsProto(expected_get_result_proto));
- *expected_get_result_proto.mutable_document() = document2;
- EXPECT_THAT(icing.Get("namespace", "uri2"),
+ *expected_get_result_proto.mutable_document() =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Message")
+ .AddStringProperty("objectId", "object2")
+ .Build();
+ ASSERT_THAT(icing.Get("namespace", "uri2", result_spec),
EqualsProto(expected_get_result_proto));
- // We indexed the additional document
- SearchSpecProto search_spec;
- search_spec.set_query("message");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document2;
-
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document1;
-
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, RecoverFromInconsistentIndex) {
- SearchSpecProto search_spec;
- search_spec.set_query("message");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- CreateMessageDocument("namespace", "uri");
-
- {
- // Initializes folder and schema, index one document
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
- } // This should shut down IcingSearchEngine and persist anything it needs to
-
- // Pretend we lost the entire index
- EXPECT_TRUE(filesystem()->DeleteDirectoryRecursively(
- absl_ports::StrCat(GetIndexDir(), "/idx/lite.").c_str()));
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- // Check that our index is ok by searching over the restored index
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-}
-
-TEST_F(IcingSearchEngineTest, RecoverFromCorruptIndex) {
- SearchSpecProto search_spec;
- search_spec.set_query("message");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
-
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- CreateMessageDocument("namespace", "uri");
-
- {
- // Initializes folder and schema, index one document
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
- } // This should shut down IcingSearchEngine and persist anything it needs to
-
- // Pretend index is corrupted
- const std::string index_hit_buffer_file = GetIndexDir() + "/idx/lite.hb";
- ScopedFd fd(filesystem()->OpenForWrite(index_hit_buffer_file.c_str()));
- ASSERT_TRUE(fd.is_valid());
- ASSERT_TRUE(filesystem()->Write(fd.get(), "1234", 4));
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
-
- // Check that our index is ok by searching over the restored index
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
+ *expected_get_result_proto.mutable_document() =
+ DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("EmailMessage")
+ .AddStringProperty("objectId", "object3")
+ .Build();
+ ASSERT_THAT(icing.Get("namespace", "uri3", result_spec),
+ EqualsProto(expected_get_result_proto));
}
-TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByDocumentScore) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
+TEST_F(IcingSearchEngineTest, OlderUsageTimestampShouldNotOverrideNewerOnes) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
- // Creates 3 documents and ensures the relationship in terms of document
- // score is: document1 < document2 < document3
+ // Creates 3 test documents
DocumentProto document1 =
DocumentBuilder()
.SetKey("namespace", "uri/1")
.SetSchema("Message")
.AddStringProperty("body", "message1")
- .SetScore(1)
.SetCreationTimestampMs(kDefaultCreationTimestampMs)
.Build();
DocumentProto document2 =
@@ -2588,354 +848,274 @@ TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByDocumentScore) {
.SetKey("namespace", "uri/2")
.SetSchema("Message")
.AddStringProperty("body", "message2")
- .SetScore(2)
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- DocumentProto document3 =
- DocumentBuilder()
- .SetKey("namespace", "uri/3")
- .SetSchema("Message")
- .AddStringProperty("body", "message3")
- .SetScore(3)
.SetCreationTimestampMs(kDefaultCreationTimestampMs)
.Build();
- // Intentionally inserts the documents in the order that is different than
- // their score order
- ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
-
- // "m" will match all 3 documents
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+
+  // Report usage for doc1 and doc2. The newer timestamp 5000 shouldn't be
+  // overridden by the older timestamp 1000. The order will be doc1 > doc2 when
+  // ranked by USAGE_TYPE1_LAST_USED_TIMESTAMP.
+ UsageReport usage_report_doc1_time1 = CreateUsageReport(
+ /*name_space=*/"namespace", /*uri=*/"uri/1", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE1);
+ UsageReport usage_report_doc1_time5 = CreateUsageReport(
+ /*name_space=*/"namespace", /*uri=*/"uri/1", /*timestamp_ms=*/5000,
+ UsageReport::USAGE_TYPE1);
+ UsageReport usage_report_doc2_time3 = CreateUsageReport(
+ /*name_space=*/"namespace", /*uri=*/"uri/2", /*timestamp_ms=*/3000,
+ UsageReport::USAGE_TYPE1);
+ ASSERT_THAT(icing.ReportUsage(usage_report_doc1_time5).status(), ProtoIsOk());
+ ASSERT_THAT(icing.ReportUsage(usage_report_doc2_time3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.ReportUsage(usage_report_doc1_time1).status(), ProtoIsOk());
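+  // Note the deliberate submission order above: the report with the older
+  // timestamp (1000) arrives last, so the store should keep the newer value
+  // for doc1, i.e. max(5000, 1000) = 5000.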
+
+ // "m" will match both documents
SearchSpecProto search_spec;
search_spec.set_term_match_type(TermMatchType::PREFIX);
search_spec.set_query("m");
- // Result should be in descending score order
+ // Result should be in descending USAGE_TYPE1_LAST_USED_TIMESTAMP order
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
*expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document3;
+ document1;
*expected_search_result_proto.mutable_results()->Add()->mutable_document() =
document2;
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document1;
- ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
- scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
- EXPECT_THAT(icing.Search(search_spec, scoring_spec,
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP);
+ SearchResultProto search_result_proto = icing.Search(
+ search_spec, scoring_spec, ResultSpecProto::default_instance());
+ EXPECT_THAT(search_result_proto, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
}
-TEST_F(IcingSearchEngineTest, SearchShouldAllowNoScoring) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- // Creates 3 documents and ensures the relationship of them is:
- // document1 < document2 < document3
- DocumentProto document1 = DocumentBuilder()
- .SetKey("namespace", "uri/1")
- .SetSchema("Message")
- .AddStringProperty("body", "message1")
- .SetScore(1)
- .SetCreationTimestampMs(1571111111111)
- .Build();
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace", "uri/2")
- .SetSchema("Message")
- .AddStringProperty("body", "message2")
- .SetScore(2)
- .SetCreationTimestampMs(1572222222222)
- .Build();
- DocumentProto document3 = DocumentBuilder()
- .SetKey("namespace", "uri/3")
- .SetSchema("Message")
- .AddStringProperty("body", "message3")
- .SetScore(3)
- .SetCreationTimestampMs(1573333333333)
- .Build();
-
- // Intentionally inserts the documents in the order that is different than
- // their score order
- ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
-
- // "m" will match all 3 documents
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("m");
+TEST_F(IcingSearchEngineTest, ImplicitPersistToDiskFullSavesEverything) {
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
+ {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Put(document).status(), ProtoIsOk());
+ } // Destructing calls a PersistToDisk(FULL)
+
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+
+ // There should be no recovery since everything should be saved properly.
+ InitializeResultProto init_result = icing.Initialize();
+ EXPECT_THAT(init_result.status(), ProtoIsOk());
+ EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Schema is still intact.
+ GetSchemaResultProto expected_get_schema_result_proto;
+ expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema();
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document2;
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document1;
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document3;
-
- // Results should not be ranked by score but returned in reverse insertion
- // order.
- ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
- scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::NONE);
- EXPECT_THAT(icing.Search(search_spec, scoring_spec,
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
-}
+ EXPECT_THAT(icing.GetSchema(), EqualsProto(expected_get_schema_result_proto));
-TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedByCreationTimestamp) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- // Creates 3 documents and ensures the relationship in terms of creation
- // timestamp score is: document1 < document2 < document3
- DocumentProto document1 = DocumentBuilder()
- .SetKey("namespace", "uri/1")
- .SetSchema("Message")
- .AddStringProperty("body", "message1")
- .SetCreationTimestampMs(1571111111111)
- .Build();
- DocumentProto document2 = DocumentBuilder()
- .SetKey("namespace", "uri/2")
- .SetSchema("Message")
- .AddStringProperty("body", "message2")
- .SetCreationTimestampMs(1572222222222)
- .Build();
- DocumentProto document3 = DocumentBuilder()
- .SetKey("namespace", "uri/3")
- .SetSchema("Message")
- .AddStringProperty("body", "message3")
- .SetCreationTimestampMs(1573333333333)
- .Build();
-
- // Intentionally inserts the documents in the order that is different than
- // their score order
- ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
-
- // "m" will match all 3 documents
+ // Documents are still intact.
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document;
+
+ EXPECT_THAT(
+ icing.Get("namespace", "uri", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Index is still intact.
SearchSpecProto search_spec;
search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("m");
+ search_spec.set_query("message"); // Content in the Message document.
- // Result should be in descending timestamp order
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
*expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document3;
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document2;
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document1;
+ document;
- ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
- scoring_spec.set_rank_by(
- ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
- EXPECT_THAT(icing.Search(search_spec, scoring_spec,
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
+ SearchResultProto actual_results =
+ icing.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
}
-TEST_F(IcingSearchEngineTest, SearchResultShouldBeRankedAscendingly) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
+TEST_F(IcingSearchEngineTest, ExplicitPersistToDiskFullSavesEverything) {
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
- // Creates 3 documents and ensures the relationship in terms of document
- // score is: document1 < document2 < document3
- DocumentProto document1 =
- DocumentBuilder()
- .SetKey("namespace", "uri/1")
- .SetSchema("Message")
- .AddStringProperty("body", "message1")
- .SetScore(1)
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- DocumentProto document2 =
- DocumentBuilder()
- .SetKey("namespace", "uri/2")
- .SetSchema("Message")
- .AddStringProperty("body", "message2")
- .SetScore(2)
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- DocumentProto document3 =
- DocumentBuilder()
- .SetKey("namespace", "uri/3")
- .SetSchema("Message")
- .AddStringProperty("body", "message3")
- .SetScore(3)
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
+ // Add schema and documents to our first icing1 instance.
+ IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk());
+ EXPECT_THAT(icing1.PersistToDisk(PersistType::FULL).status(), ProtoIsOk());
+
+  // Initialize a second icing2 instance which should have its own memory
+ // space. If data from icing1 isn't being persisted to the files, then icing2
+ // won't be able to see those changes.
+ IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache());
+
+ // There should be no recovery since everything should be saved properly.
+ InitializeResultProto init_result = icing2.Initialize();
+ EXPECT_THAT(init_result.status(), ProtoIsOk());
+ EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // Schema is still intact.
+ GetSchemaResultProto expected_get_schema_result_proto;
+ expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema();
- // Intentionally inserts the documents in the order that is different than
- // their score order
- ASSERT_THAT(icing.Put(document2).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document3).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(document1).status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing2.GetSchema(),
+ EqualsProto(expected_get_schema_result_proto));
- // "m" will match all 3 documents
+ // Documents are still intact.
+ GetResultProto expected_get_result_proto;
+ expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_result_proto.mutable_document() = document;
+
+ EXPECT_THAT(
+ icing2.Get("namespace", "uri", GetResultSpecProto::default_instance()),
+ EqualsProto(expected_get_result_proto));
+
+ // Index is still intact.
SearchSpecProto search_spec;
search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("m");
+ search_spec.set_query("message"); // Content in the Message document.
- // Result should be in ascending score order
SearchResultProto expected_search_result_proto;
expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
*expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document1;
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document2;
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- document3;
+ document;
- ScoringSpecProto scoring_spec = GetDefaultScoringSpec();
- scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
- scoring_spec.set_order_by(ScoringSpecProto::Order::ASC);
- EXPECT_THAT(icing.Search(search_spec, scoring_spec,
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
+ SearchResultProto actual_results =
+ icing2.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
}
-TEST_F(IcingSearchEngineTest,
- SetSchemaCanNotDetectPreviousSchemaWasLostWithoutDocuments) {
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("Message");
+TEST_F(IcingSearchEngineTest, NoPersistToDiskLosesAllDocumentsAndIndex) {
+ IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
+ EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk());
+ EXPECT_THAT(
+ icing1.Get("namespace", "uri", GetResultSpecProto::default_instance())
+ .document(),
+ EqualsProto(document));
+
+ // It's intentional that no PersistToDisk call is made before initializing a
+ // second instance of icing.
+
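+  // (icing1 is still alive here, so any document-log bytes already on disk
+  // may be unflushed or lack a valid checksum; that is why a data loss is
+  // expected below.)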
+ IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto init_result = icing2.Initialize();
+ EXPECT_THAT(init_result.status(), ProtoIsOk());
+ EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::PARTIAL_LOSS));
+ EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::DATA_LOSS));
+ EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+ EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ // The document shouldn't be found because we forgot to call
+ // PersistToDisk(LITE)!
+ EXPECT_THAT(
+ icing2.Get("namespace", "uri", GetResultSpecProto::default_instance())
+ .status(),
+ ProtoStatusIs(StatusProto::NOT_FOUND));
- auto body = type->add_properties();
- body->set_property_name("body");
- body->set_data_type(PropertyConfigProto::DataType::STRING);
- body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ // Searching also shouldn't get us anything because the index wasn't
+ // recovered.
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message"); // Content in the Message document.
- // Make an incompatible schema, a previously OPTIONAL field is REQUIRED
- SchemaProto incompatible_schema = schema;
- incompatible_schema.mutable_types(0)->mutable_properties(0)->set_cardinality(
- PropertyConfigProto::Cardinality::REQUIRED);
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
- } // This should shut down IcingSearchEngine and persist anything it needs to
-
- ASSERT_TRUE(filesystem()->DeleteDirectoryRecursively(GetSchemaDir().c_str()));
-
- // Since we don't have any documents yet, we can't detect this edge-case. But
- // it should be fine since there aren't any documents to be invalidated.
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(incompatible_schema).status().code(),
- Eq(StatusProto::OK));
+ SearchResultProto actual_results =
+ icing2.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
}
-TEST_F(IcingSearchEngineTest, SetSchemaCanDetectPreviousSchemaWasLost) {
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("Message");
-
- auto body = type->add_properties();
- body->set_property_name("body");
- body->set_data_type(PropertyConfigProto::DataType::STRING);
- body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- body->mutable_indexing_config()->set_term_match_type(TermMatchType::PREFIX);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+TEST_F(IcingSearchEngineTest, PersistToDiskLiteSavesGroundTruth) {
+ DocumentProto document = CreateMessageDocument("namespace", "uri");
- // Make an incompatible schema, a previously OPTIONAL field is REQUIRED
- SchemaProto incompatible_schema = schema;
- incompatible_schema.mutable_types(0)->mutable_properties(0)->set_cardinality(
- PropertyConfigProto::Cardinality::REQUIRED);
+ IcingSearchEngine icing1(GetDefaultIcingOptions(), GetTestJniCache());
+ EXPECT_THAT(icing1.Initialize().status(), ProtoIsOk());
+ EXPECT_THAT(icing1.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ EXPECT_THAT(icing1.Put(document).status(), ProtoIsOk());
+ EXPECT_THAT(icing1.PersistToDisk(PersistType::LITE).status(), ProtoIsOk());
+ EXPECT_THAT(
+ icing1.Get("namespace", "uri", GetResultSpecProto::default_instance())
+ .document(),
+ EqualsProto(document));
+
+ IcingSearchEngine icing2(GetDefaultIcingOptions(), GetTestJniCache());
+ InitializeResultProto init_result = icing2.Initialize();
+ EXPECT_THAT(init_result.status(), ProtoIsOk());
+ EXPECT_THAT(init_result.initialize_stats().document_store_data_status(),
+ Eq(InitializeStatsProto::NO_DATA_LOSS));
+ EXPECT_THAT(init_result.initialize_stats().schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+  // A checksum mismatch gets reported as an IO error. PersistToDisk(LITE) only
+  // flushes the ground truth, not the derived files of the document store and
+  // index, so reinitializing will trigger a checksum mismatch for both.
+ EXPECT_THAT(init_result.initialize_stats().document_store_recovery_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+ EXPECT_THAT(init_result.initialize_stats().index_restoration_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
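+  // Contrast this with the PersistToDisk(FULL) tests above, where both of
+  // these recovery causes are expected to be InitializeStatsProto::NONE
+  // because FULL also flushes the derived files.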
+
+ // Schema is still intact.
+ GetSchemaResultProto expected_get_schema_result_proto;
+ expected_get_schema_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_get_schema_result_proto.mutable_schema() = CreateMessageSchema();
- SearchSpecProto search_spec;
- search_spec.set_query("message");
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ EXPECT_THAT(icing2.GetSchema(),
+ EqualsProto(expected_get_schema_result_proto));
- {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(schema).status().code(), Eq(StatusProto::OK));
-
- DocumentProto document = CreateMessageDocument("namespace", "uri");
- ASSERT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
-
- // Can retrieve by namespace/uri
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() = document;
-
- ASSERT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
-
- // Can search for it
- SearchResultProto expected_search_result_proto;
- expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
- CreateMessageDocument("namespace", "uri");
- ASSERT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(expected_search_result_proto));
- } // This should shut down IcingSearchEngine and persist anything it needs to
-
- ASSERT_TRUE(filesystem()->DeleteDirectoryRecursively(GetSchemaDir().c_str()));
-
- // Setting the new, different schema will remove incompatible documents
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(incompatible_schema).status().code(),
- Eq(StatusProto::OK));
-
- // Can't retrieve by namespace/uri
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::NOT_FOUND);
- expected_get_result_proto.mutable_status()->set_message(
- "Document (namespace, uri) not found.");
-
- EXPECT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
+ // The document should be found because we called PersistToDisk(LITE)!
+ EXPECT_THAT(
+ icing2.Get("namespace", "uri", GetResultSpecProto::default_instance())
+ .document(),
+ EqualsProto(document));
- // Can't search for it
- SearchResultProto empty_result;
- empty_result.mutable_status()->set_code(StatusProto::OK);
- EXPECT_THAT(icing.Search(search_spec, GetDefaultScoringSpec(),
- ResultSpecProto::default_instance()),
- EqualsProto(empty_result));
-}
+ // Recovered index is still intact.
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_query("message"); // Content in the Message document.
-TEST_F(IcingSearchEngineTest, PersistToDisk) {
- GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
- *expected_get_result_proto.mutable_document() =
- CreateMessageDocument("namespace", "uri");
+ SearchResultProto expected_search_result_proto;
+ expected_search_result_proto.mutable_status()->set_code(StatusProto::OK);
+ *expected_search_result_proto.mutable_results()->Add()->mutable_document() =
+ document;
- {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
-
- // Persisting shouldn't affect anything
- EXPECT_THAT(icing.PersistToDisk().status().code(), Eq(StatusProto::OK));
-
- EXPECT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
- } // Destructing persists as well
-
- IcingSearchEngine icing(GetDefaultIcingOptions());
- EXPECT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Get("namespace", "uri"),
- EqualsProto(expected_get_result_proto));
+ SearchResultProto actual_results =
+ icing2.Search(search_spec, GetDefaultScoringSpec(),
+ ResultSpecProto::default_instance());
+ EXPECT_THAT(actual_results, EqualsSearchResultIgnoreStatsAndScores(
+ expected_search_result_proto));
}
TEST_F(IcingSearchEngineTest, ResetOk) {
@@ -2943,22 +1123,21 @@ TEST_F(IcingSearchEngineTest, ResetOk) {
SchemaProto empty_schema = SchemaProto(message_schema);
empty_schema.clear_types();
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(message_schema).status().code(),
- Eq(StatusProto::OK));
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(message_schema).status(), ProtoIsOk());
int64_t empty_state_size =
filesystem()->GetFileDiskUsage(GetTestBaseDir().c_str());
DocumentProto document = CreateMessageDocument("namespace", "uri");
- ASSERT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
// Check that things have been added
EXPECT_THAT(filesystem()->GetDiskUsage(GetTestBaseDir().c_str()),
Gt(empty_state_size));
- EXPECT_THAT(icing.Reset().status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(icing.Reset().status(), ProtoIsOk());
// Check that we're back to an empty state
EXPECT_EQ(filesystem()->GetFileDiskUsage(GetTestBaseDir().c_str()),
@@ -2967,297 +1146,38 @@ TEST_F(IcingSearchEngineTest, ResetOk) {
// Sanity check that we can still call other APIs. If things aren't cleared,
// then this should raise an error since the empty schema is incompatible with
// the old message_schema.
- EXPECT_THAT(icing.SetSchema(empty_schema).status().code(),
- Eq(StatusProto::OK));
+ EXPECT_THAT(icing.SetSchema(empty_schema).status(), ProtoIsOk());
}
-TEST_F(IcingSearchEngineTest, ResetAbortedError) {
+TEST_F(IcingSearchEngineTest, ResetDeleteFailureCausesInternalError) {
auto mock_filesystem = std::make_unique<MockFilesystem>();
- // This fails IcingSearchEngine::Reset(). But since we didn't actually delete
- // anything, we'll be able to consider this just an ABORTED call.
+ // This fails IcingSearchEngine::Reset() with status code INTERNAL and leaves
+ // the IcingSearchEngine instance in an uninitialized state.
ON_CALL(*mock_filesystem,
DeleteDirectoryRecursively(StrEq(GetTestBaseDir().c_str())))
.WillByDefault(Return(false));
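+  // With DeleteDirectoryRecursively mocked to fail, Reset() cannot clear the
+  // base directory, so the INTERNAL status and the FAILED_PRECONDITION on the
+  // follow-up Get below are the expected outcome.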
TestIcingSearchEngine icing(GetDefaultIcingOptions(),
std::move(mock_filesystem),
- std::make_unique<FakeClock>());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
+ std::make_unique<IcingFilesystem>(),
+ std::make_unique<FakeClock>(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
DocumentProto document = CreateMessageDocument("namespace", "uri");
- ASSERT_THAT(icing.Put(document).status().code(), Eq(StatusProto::OK));
- EXPECT_THAT(icing.Reset().status().code(), Eq(StatusProto::ABORTED));
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+ EXPECT_THAT(icing.Reset().status(), ProtoStatusIs(StatusProto::INTERNAL));
- // Everything is still intact.
- // Can get old data.
GetResultProto expected_get_result_proto;
- expected_get_result_proto.mutable_status()->set_code(StatusProto::OK);
+ expected_get_result_proto.mutable_status()->set_code(
+ StatusProto::FAILED_PRECONDITION);
*expected_get_result_proto.mutable_document() = document;
- EXPECT_THAT(icing.Get(document.namespace_(), document.uri()),
- EqualsProto(expected_get_result_proto));
-
- // Can add new data.
- EXPECT_THAT(
- icing.Put(CreateMessageDocument("namespace", "uri")).status().code(),
- Eq(StatusProto::OK));
-}
-
-TEST_F(IcingSearchEngineTest, ResetInternalError) {
- auto mock_filesystem = std::make_unique<MockFilesystem>();
-
- // Let all other calls succeed.
- EXPECT_CALL(*mock_filesystem, Write(Matcher<const char*>(_), _, _))
- .WillRepeatedly(Return(true));
-
- // This prevents IcingSearchEngine from creating a DocumentStore instance on
- // reinitialization
- const std::string document_log_path =
- GetTestBaseDir() + "/document_dir/document_log";
- EXPECT_CALL(
- *mock_filesystem,
- Write(Matcher<const char*>(StrEq(document_log_path.c_str())), _, _))
- .WillOnce(Return(true))
- .WillOnce(Return(false));
-
- TestIcingSearchEngine icing(GetDefaultIcingOptions(),
- std::move(mock_filesystem),
- std::make_unique<FakeClock>());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
- EXPECT_THAT(icing.Reset().status().code(), Eq(StatusProto::INTERNAL));
-}
-
-TEST_F(IcingSearchEngineTest, SnippetNormalization) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- DocumentProto document_one =
- DocumentBuilder()
- .SetKey("namespace", "uri1")
- .SetSchema("Message")
- .AddStringProperty("body", "MDI zurich Team Meeting")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK));
-
- DocumentProto document_two =
- DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "mdi Zürich Team Meeting")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- ASSERT_THAT(icing.Put(document_two).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
- search_spec.set_query("mdi Zürich");
-
- ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
- result_spec.mutable_snippet_spec()->set_num_matches_per_property(2);
- result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
-
- SearchResultProto results =
- icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
- EXPECT_THAT(results.status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(results.results(), SizeIs(2));
- const DocumentProto& result_document_1 = results.results(0).document();
- const SnippetProto& result_snippet_1 = results.results(0).snippet();
- EXPECT_THAT(result_document_1, EqualsProto(document_two));
- EXPECT_THAT(GetMatch(result_document_1, result_snippet_1, "body",
- /*snippet_index=*/0),
- Eq("mdi"));
- EXPECT_THAT(GetWindow(result_document_1, result_snippet_1, "body",
- /*snippet_index=*/0),
- Eq("mdi Zürich Team Meeting"));
- EXPECT_THAT(GetMatch(result_document_1, result_snippet_1, "body",
- /*snippet_index=*/1),
- Eq("Zürich"));
- EXPECT_THAT(GetWindow(result_document_1, result_snippet_1, "body",
- /*snippet_index=*/1),
- Eq("mdi Zürich Team Meeting"));
-
- const DocumentProto& result_document_2 = results.results(1).document();
- const SnippetProto& result_snippet_2 = results.results(1).snippet();
- EXPECT_THAT(result_document_2, EqualsProto(document_one));
- EXPECT_THAT(GetMatch(result_document_2, result_snippet_2, "body",
- /*snippet_index=*/0),
- Eq("MDI"));
- EXPECT_THAT(GetWindow(result_document_2, result_snippet_2, "body",
- /*snippet_index=*/0),
- Eq("MDI zurich Team Meeting"));
- EXPECT_THAT(GetMatch(result_document_2, result_snippet_2, "body",
- /*snippet_index=*/1),
- Eq("zurich"));
- EXPECT_THAT(GetWindow(result_document_2, result_snippet_2, "body",
- /*snippet_index=*/1),
- Eq("MDI zurich Team Meeting"));
-}
-
-TEST_F(IcingSearchEngineTest, SnippetNormalizationPrefix) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
-
- DocumentProto document_one =
- DocumentBuilder()
- .SetKey("namespace", "uri1")
- .SetSchema("Message")
- .AddStringProperty("body", "MDI zurich Team Meeting")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK));
-
- DocumentProto document_two =
- DocumentBuilder()
- .SetKey("namespace", "uri2")
- .SetSchema("Message")
- .AddStringProperty("body", "mdi Zürich Team Meeting")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- ASSERT_THAT(icing.Put(document_two).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("md Zür");
-
- ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
- result_spec.mutable_snippet_spec()->set_num_matches_per_property(2);
- result_spec.mutable_snippet_spec()->set_num_to_snippet(2);
-
- SearchResultProto results =
- icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
- EXPECT_THAT(results.status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(results.results(), SizeIs(2));
- const DocumentProto& result_document_1 = results.results(0).document();
- const SnippetProto& result_snippet_1 = results.results(0).snippet();
- EXPECT_THAT(result_document_1, EqualsProto(document_two));
- EXPECT_THAT(GetMatch(result_document_1, result_snippet_1, "body",
- /*snippet_index=*/0),
- Eq("mdi"));
- EXPECT_THAT(GetWindow(result_document_1, result_snippet_1, "body",
- /*snippet_index=*/0),
- Eq("mdi Zürich Team Meeting"));
- EXPECT_THAT(GetMatch(result_document_1, result_snippet_1, "body",
- /*snippet_index=*/1),
- Eq("Zürich"));
- EXPECT_THAT(GetWindow(result_document_1, result_snippet_1, "body",
- /*snippet_index=*/1),
- Eq("mdi Zürich Team Meeting"));
-
- const DocumentProto& result_document_2 = results.results(1).document();
- const SnippetProto& result_snippet_2 = results.results(1).snippet();
- EXPECT_THAT(result_document_2, EqualsProto(document_one));
- EXPECT_THAT(GetMatch(result_document_2, result_snippet_2, "body",
- /*snippet_index=*/0),
- Eq("MDI"));
- EXPECT_THAT(GetWindow(result_document_2, result_snippet_2, "body",
- /*snippet_index=*/0),
- Eq("MDI zurich Team Meeting"));
- EXPECT_THAT(GetMatch(result_document_2, result_snippet_2, "body",
- /*snippet_index=*/1),
- Eq("zurich"));
- EXPECT_THAT(GetWindow(result_document_2, result_snippet_2, "body",
- /*snippet_index=*/1),
- Eq("MDI zurich Team Meeting"));
-}
-
-TEST_F(IcingSearchEngineTest, SnippetSectionRestrict) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateEmailSchema()).status().code(),
- Eq(StatusProto::OK));
-
- DocumentProto document_one =
- DocumentBuilder()
- .SetKey("namespace", "uri1")
- .SetSchema("Email")
- .AddStringProperty("subject", "MDI zurich Team Meeting")
- .AddStringProperty("body", "MDI zurich Team Meeting")
- .SetCreationTimestampMs(kDefaultCreationTimestampMs)
- .Build();
- ASSERT_THAT(icing.Put(document_one).status().code(), Eq(StatusProto::OK));
-
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::PREFIX);
- search_spec.set_query("body:Zür");
-
- ResultSpecProto result_spec;
- result_spec.mutable_snippet_spec()->set_max_window_bytes(64);
- result_spec.mutable_snippet_spec()->set_num_matches_per_property(10);
- result_spec.mutable_snippet_spec()->set_num_to_snippet(10);
-
- SearchResultProto results =
- icing.Search(search_spec, GetDefaultScoringSpec(), result_spec);
- EXPECT_THAT(results.status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(results.results(), SizeIs(1));
- const DocumentProto& result_document = results.results(0).document();
- const SnippetProto& result_snippet = results.results(0).snippet();
- EXPECT_THAT(result_document, EqualsProto(document_one));
- EXPECT_THAT(
- GetMatch(result_document, result_snippet, "body", /*snippet_index=*/0),
- Eq("zurich"));
- EXPECT_THAT(
- GetWindow(result_document, result_snippet, "body", /*snippet_index=*/0),
- Eq("MDI zurich Team Meeting"));
- EXPECT_THAT(
- GetMatch(result_document, result_snippet, "subject", /*snippet_index=*/0),
- IsEmpty());
- EXPECT_THAT(GetWindow(result_document, result_snippet, "subject",
- /*snippet_index=*/0),
- IsEmpty());
-}
-
-TEST_F(IcingSearchEngineTest, UninitializedInstanceFailsSafely) {
- IcingSearchEngine icing(GetDefaultIcingOptions());
-
- SchemaProto email_schema = CreateMessageSchema();
- EXPECT_THAT(icing.SetSchema(email_schema).status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
- EXPECT_THAT(icing.GetSchema().status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
- EXPECT_THAT(
- icing.GetSchemaType(email_schema.types(0).schema_type()).status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
-
- DocumentProto doc = CreateMessageDocument("namespace", "uri");
- EXPECT_THAT(icing.Put(doc).status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
- EXPECT_THAT(icing.Get(doc.namespace_(), doc.uri()).status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
- EXPECT_THAT(icing.Delete(doc.namespace_(), doc.uri()).status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
- EXPECT_THAT(icing.DeleteByNamespace(doc.namespace_()).status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
- EXPECT_THAT(icing.DeleteBySchemaType(email_schema.types(0).schema_type())
- .status()
- .code(),
- Eq(StatusProto::FAILED_PRECONDITION));
-
- SearchSpecProto search_spec = SearchSpecProto::default_instance();
- ScoringSpecProto scoring_spec = ScoringSpecProto::default_instance();
- ResultSpecProto result_spec = ResultSpecProto::default_instance();
- EXPECT_THAT(
- icing.Search(search_spec, scoring_spec, result_spec).status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
- constexpr int kSomePageToken = 12;
- EXPECT_THAT(icing.GetNextPage(kSomePageToken).status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
- icing.InvalidateNextPageToken(kSomePageToken); // Verify this doesn't crash.
-
- EXPECT_THAT(icing.PersistToDisk().status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
- EXPECT_THAT(icing.Optimize().status().code(),
- Eq(StatusProto::FAILED_PRECONDITION));
+ EXPECT_THAT(icing
+ .Get(document.namespace_(), document.uri(),
+ GetResultSpecProto::default_instance())
+ .status(),
+ ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
}
TEST_F(IcingSearchEngineTest, GetAllNamespaces) {
@@ -3298,47 +1218,43 @@ TEST_F(IcingSearchEngineTest, GetAllNamespaces) {
TestIcingSearchEngine icing(GetDefaultIcingOptions(),
std::make_unique<Filesystem>(),
- std::move(fake_clock));
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status().code(),
- Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
// No namespaces exist yet
GetAllNamespacesResultProto result = icing.GetAllNamespaces();
- EXPECT_THAT(result.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(result.status(), ProtoIsOk());
EXPECT_THAT(result.namespaces(), IsEmpty());
- ASSERT_THAT(icing.Put(namespace1).status().code(), Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(namespace2_uri1).status().code(),
- Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(namespace2_uri2).status().code(),
- Eq(StatusProto::OK));
- ASSERT_THAT(icing.Put(namespace3).status().code(), Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Put(namespace1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(namespace2_uri1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(namespace2_uri2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(namespace3).status(), ProtoIsOk());
// All namespaces should exist now
result = icing.GetAllNamespaces();
- EXPECT_THAT(result.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(result.status(), ProtoIsOk());
EXPECT_THAT(result.namespaces(),
UnorderedElementsAre("namespace1", "namespace2", "namespace3"));
// After deleting namespace2_uri1 document, we still have namespace2_uri2 in
// "namespace2" so it should still show up
- ASSERT_THAT(icing.Delete("namespace2", "uri1").status().code(),
- Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Delete("namespace2", "uri1").status(), ProtoIsOk());
result = icing.GetAllNamespaces();
- EXPECT_THAT(result.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(result.status(), ProtoIsOk());
EXPECT_THAT(result.namespaces(),
UnorderedElementsAre("namespace1", "namespace2", "namespace3"));
// After deleting namespace2_uri2 document, we no longer have any documents
// in "namespace2"
- ASSERT_THAT(icing.Delete("namespace2", "uri2").status().code(),
- Eq(StatusProto::OK));
+ ASSERT_THAT(icing.Delete("namespace2", "uri2").status(), ProtoIsOk());
result = icing.GetAllNamespaces();
- EXPECT_THAT(result.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(result.status(), ProtoIsOk());
EXPECT_THAT(result.namespaces(),
UnorderedElementsAre("namespace1", "namespace3"));
}
@@ -3352,16 +1268,112 @@ TEST_F(IcingSearchEngineTest, GetAllNamespaces) {
TestIcingSearchEngine icing(GetDefaultIcingOptions(),
std::make_unique<Filesystem>(),
- std::move(fake_clock));
- ASSERT_THAT(icing.Initialize().status().code(), Eq(StatusProto::OK));
+ std::make_unique<IcingFilesystem>(),
+ std::move(fake_clock), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
// Only valid document left is the one in "namespace1"
GetAllNamespacesResultProto result = icing.GetAllNamespaces();
- EXPECT_THAT(result.status().code(), Eq(StatusProto::OK));
+ EXPECT_THAT(result.status(), ProtoIsOk());
EXPECT_THAT(result.namespaces(), UnorderedElementsAre("namespace1"));
}
}
+TEST_F(IcingSearchEngineTest, StorageInfoTest) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Create three documents.
+ DocumentProto document1 = CreateMessageDocument("namespace", "uri1");
+ DocumentProto document2 = CreateMessageDocument("namespace", "uri2");
+ DocumentProto document3 = CreateMessageDocument("namespace", "uri3");
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+
+ // Ensure that total_storage_size is set. All the other stats are covered by
+ // the classes that generate them.
+ StorageInfoResultProto result = icing.GetStorageInfo();
+ EXPECT_THAT(result.status(), ProtoIsOk());
+ EXPECT_THAT(result.storage_info().total_storage_size(), Ge(0));
+}
+
+TEST_F(IcingSearchEngineTest, GetDebugInfoVerbosityBasicSucceeds) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Create a document.
+ DocumentProto document = CreateMessageDocument("namespace", "email");
+ ASSERT_THAT(icing.Put(document).status(), ProtoIsOk());
+
+ DebugInfoResultProto result = icing.GetDebugInfo(DebugInfoVerbosity::BASIC);
+ EXPECT_THAT(result.status(), ProtoIsOk());
+
+ // Some sanity checks
+ DebugInfoProto debug_info = result.debug_info();
+ EXPECT_THAT(
+ debug_info.document_info().document_storage_info().num_alive_documents(),
+ Eq(1));
+ EXPECT_THAT(debug_info.document_info().corpus_info(),
+ IsEmpty()); // because verbosity=BASIC
+ EXPECT_THAT(debug_info.schema_info().crc(), Gt(0));
+}
+
+TEST_F(IcingSearchEngineTest,
+ GetDebugInfoVerbosityDetailedSucceedsWithCorpusInfo) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+
+ // Create 4 documents.
+ DocumentProto document1 = CreateMessageDocument("namespace1", "email/1");
+ DocumentProto document2 = CreateMessageDocument("namespace1", "email/2");
+ DocumentProto document3 = CreateMessageDocument("namespace2", "email/3");
+ DocumentProto document4 = CreateMessageDocument("namespace2", "email/4");
+ ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk());
+ ASSERT_THAT(icing.Put(document4).status(), ProtoIsOk());
+
+ DebugInfoResultProto result =
+ icing.GetDebugInfo(DebugInfoVerbosity::DETAILED);
+ EXPECT_THAT(result.status(), ProtoIsOk());
+
+ // Some sanity checks
+ DebugInfoProto debug_info = result.debug_info();
+ EXPECT_THAT(
+ debug_info.document_info().document_storage_info().num_alive_documents(),
+ Eq(4));
+ EXPECT_THAT(debug_info.document_info().corpus_info(), SizeIs(2));
+ EXPECT_THAT(debug_info.schema_info().crc(), Gt(0));
+}
+
+TEST_F(IcingSearchEngineTest, GetDebugInfoUninitialized) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ DebugInfoResultProto result =
+ icing.GetDebugInfo(DebugInfoVerbosity::DETAILED);
+ EXPECT_THAT(result.status(), ProtoStatusIs(StatusProto::FAILED_PRECONDITION));
+}
+
+TEST_F(IcingSearchEngineTest, GetDebugInfoNoSchemaNoDocumentsSucceeds) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ DebugInfoResultProto result =
+ icing.GetDebugInfo(DebugInfoVerbosity::DETAILED);
+ ASSERT_THAT(result.status(), ProtoIsOk());
+}
+
+TEST_F(IcingSearchEngineTest, GetDebugInfoWithSchemaNoDocumentsSucceeds) {
+ IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache());
+ ASSERT_THAT(icing.Initialize().status(), ProtoIsOk());
+ ASSERT_THAT(icing.SetSchema(CreateMessageSchema()).status(), ProtoIsOk());
+ DebugInfoResultProto result =
+ icing.GetDebugInfo(DebugInfoVerbosity::DETAILED);
+ ASSERT_THAT(result.status(), ProtoIsOk());
+}
+
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/index/data-indexing-handler.h b/icing/index/data-indexing-handler.h
new file mode 100644
index 0000000..16a1796
--- /dev/null
+++ b/icing/index/data-indexing-handler.h
@@ -0,0 +1,69 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_DATA_INDEXING_HANDLER_H_
+#define ICING_INDEX_DATA_INDEXING_HANDLER_H_
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/store/document-id.h"
+#include "icing/util/clock.h"
+#include "icing/util/tokenized-document.h"
+
+namespace icing {
+namespace lib {
+
+// Parent class for indexing different types of data in TokenizedDocument.
+class DataIndexingHandler {
+ public:
+ explicit DataIndexingHandler(const Clock* clock) : clock_(*clock) {}
+
+ virtual ~DataIndexingHandler() = default;
+
+  // Handles the indexing process: adds data into the specific type of index
+  // (e.g. term index, integer index, qualified id type joinable index) for
+  // all contents of the corresponding data type in tokenized_document.
+ // For example, IntegerSectionIndexingHandler::Handle should add data into
+ // integer index for all contents in tokenized_document.integer_sections.
+ //
+  // It should also handle the last added DocumentId properly (based on
+  // recovery_mode_) to avoid re-adding previously indexed documents.
+ //
+ // tokenized_document: document object with different types of tokenized data.
+ // document_id: id of the document.
+ // recovery_mode: decides how to handle document_id <=
+ // last_added_document_id. If in recovery_mode, then
+ // Handle() will simply return OK immediately. Otherwise,
+ // returns INVALID_ARGUMENT_ERROR.
+ // put_document_stats: object for collecting stats during indexing. It can be
+ // nullptr.
+ //
+  // Returns:
+ // - OK on success.
+ // - INVALID_ARGUMENT_ERROR if document_id is invalid OR document_id is less
+ // than or equal to the document_id of a previously indexed document in
+  //     non-recovery mode.
+ // - Any other errors. It depends on each implementation.
+ virtual libtextclassifier3::Status Handle(
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
+ bool recovery_mode, PutDocumentStatsProto* put_document_stats) = 0;
+
+ protected:
+ const Clock& clock_; // Does not own.
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_DATA_INDEXING_HANDLER_H_
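
To make the contract above concrete, here is a minimal sketch of a subclass.
The class name, the last_added_document_id_ bookkeeping, and the elided index
write are hypothetical illustrations, not part of this change; the sketch also
assumes icing/absl_ports/canonical_errors.h for the error helpers.

    // Illustrative sketch only: a hypothetical handler following the
    // documented Handle() contract.
    class SketchIndexingHandler : public DataIndexingHandler {
     public:
      explicit SketchIndexingHandler(const Clock* clock)
          : DataIndexingHandler(clock) {}

      libtextclassifier3::Status Handle(
          const TokenizedDocument& tokenized_document, DocumentId document_id,
          bool recovery_mode,
          PutDocumentStatsProto* put_document_stats) override {
        if (!IsDocumentIdValid(document_id)) {
          return absl_ports::InvalidArgumentError("Invalid DocumentId");
        }
        if (last_added_document_id_ != kInvalidDocumentId &&
            document_id <= last_added_document_id_) {
          // In recovery mode this document was already indexed; skip it.
          if (recovery_mode) {
            return libtextclassifier3::Status::OK;
          }
          return absl_ports::InvalidArgumentError(
              "DocumentId must be greater than the last added document_id");
        }
        last_added_document_id_ = document_id;
        // ... write tokenized_document contents into the backing index ...
        return libtextclassifier3::Status::OK;
      }

     private:
      DocumentId last_added_document_id_ = kInvalidDocumentId;
    };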
diff --git a/icing/index/hit/doc-hit-info.cc b/icing/index/hit/doc-hit-info.cc
deleted file mode 100644
index 80dbbde..0000000
--- a/icing/index/hit/doc-hit-info.cc
+++ /dev/null
@@ -1,62 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/index/hit/doc-hit-info.h"
-
-#include "icing/legacy/core/icing-string-util.h"
-
-namespace icing {
-namespace lib {
-
-bool DocHitInfo::operator<(const DocHitInfo& other) const {
- if (document_id() != other.document_id()) {
- // Sort by document_id descending. This mirrors how the individual hits that
- // are collapsed into this DocHitInfo would sort with other hits -
- // document_ids are inverted when encoded in hits. Hits are encoded this way
- // because they are appended to posting lists and the most recent value
- // appended to a posting list must have the smallest encoded value of any
- // hit on the posting list.
- return document_id() > other.document_id();
- }
- if (hit_section_ids_mask() != other.hit_section_ids_mask()) {
- return hit_section_ids_mask() < other.hit_section_ids_mask();
- }
- // Doesn't matter which way we compare this array, as long as
- // DocHitInfo is unequal when it is unequal.
- return memcmp(max_hit_score_, other.max_hit_score_, sizeof(max_hit_score_)) <
- 0;
-}
-
-void DocHitInfo::UpdateSection(SectionId section_id, Hit::Score hit_score) {
- SectionIdMask section_id_mask = (1u << section_id);
- if (hit_section_ids_mask() & section_id_mask) {
- max_hit_score_[section_id] =
- std::max(max_hit_score_[section_id], hit_score);
- } else {
- max_hit_score_[section_id] = hit_score;
- hit_section_ids_mask_ |= section_id_mask;
- }
-}
-
-void DocHitInfo::MergeSectionsFrom(const DocHitInfo& other) {
- SectionIdMask other_mask = other.hit_section_ids_mask();
- while (other_mask) {
- SectionId section_id = __builtin_ctz(other_mask);
- UpdateSection(section_id, other.max_hit_score(section_id));
- other_mask &= ~(1u << section_id);
- }
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/index/hit/doc-hit-info.h b/icing/index/hit/doc-hit-info.h
index 386822d..2770de2 100644
--- a/icing/index/hit/doc-hit-info.h
+++ b/icing/index/hit/doc-hit-info.h
@@ -25,19 +25,16 @@
namespace icing {
namespace lib {
-// DocHitInfo provides a collapsed view of all hits for a specific term and doc.
-// Hits contain a document_id, section_id and a hit score. The information in
-// multiple hits is collapse into a DocHitInfo by providing a SectionIdMask of
-// all sections that contained a hit for this term as well as the highest hit
-// score of any hit for each section.
+// DocHitInfo provides a collapsed view of all hits for a specific doc.
+// Hits contain a document_id and section_id. The information in multiple hits
+// is collapsed into a DocHitInfo by providing a SectionIdMask of all sections
+// that contained a hit for this term.
class DocHitInfo {
public:
explicit DocHitInfo(DocumentId document_id_in = kInvalidDocumentId,
SectionIdMask hit_section_ids_mask = kSectionIdMaskNone)
: document_id_(document_id_in),
- hit_section_ids_mask_(hit_section_ids_mask) {
- memset(max_hit_score_, Hit::kMaxHitScore, sizeof(max_hit_score_));
- }
+ hit_section_ids_mask_(hit_section_ids_mask) {}
DocumentId document_id() const { return document_id_; }
@@ -49,38 +46,44 @@ class DocHitInfo {
hit_section_ids_mask_ = section_id_mask;
}
- Hit::Score max_hit_score(SectionId section_id) const {
- return max_hit_score_[section_id];
+ bool operator<(const DocHitInfo& other) const {
+ if (document_id() != other.document_id()) {
+ // Sort by document_id descending. This mirrors how the individual hits
+ // that are collapsed into this DocHitInfo would sort with other hits -
+ // document_ids are inverted when encoded in hits. Hits are encoded this
+ // way because they are appended to posting lists and the most recent
+ // value appended to a posting list must have the smallest encoded value
+ // of any hit on the posting list.
+ return document_id() > other.document_id();
+ }
+ return hit_section_ids_mask() < other.hit_section_ids_mask();
}
-
- bool operator<(const DocHitInfo& other) const;
bool operator==(const DocHitInfo& other) const {
- return (*this < other) == (other < *this);
+ return document_id_ == other.document_id_ &&
+ hit_section_ids_mask_ == other.hit_section_ids_mask_;
}
- // Updates the hit_section_ids_mask and max_hit_score for the section, if
- // necessary.
- void UpdateSection(SectionId section_id, Hit::Score hit_score);
+ // Updates the hit_section_ids_mask for the section, if necessary.
+ void UpdateSection(SectionId section_id) {
+ hit_section_ids_mask_ |= (UINT64_C(1) << section_id);
+ }
- // Merges the sections of other into this. The hit_section_ids_masks are or'd
- // and the max hit score for each section between the two is set.
+ // Merges the sections of other into this. The hit_section_ids_masks are or'd.
//
// This does not affect the DocumentId of this or other. If callers care about
// only merging sections for DocHitInfos with the same DocumentId, callers
// should check this themselves.
- void MergeSectionsFrom(const DocHitInfo& other);
+ void MergeSectionsFrom(const SectionIdMask& other_hit_section_ids_mask) {
+ hit_section_ids_mask_ |= other_hit_section_ids_mask;
+ }
private:
DocumentId document_id_;
SectionIdMask hit_section_ids_mask_;
- Hit::Score max_hit_score_[kMaxSectionId + 1];
} __attribute__((packed));
-static_assert(sizeof(DocHitInfo) == 22, "");
+static_assert(sizeof(DocHitInfo) == 12, "");
// TODO(b/138991332) decide how to remove/replace all is_packed_pod assertions.
static_assert(icing_is_packed_pod<DocHitInfo>::value, "go/icing-ubsan");
-static_assert(sizeof(Hit::Score) == 1,
- "Change how max_hit_score_ is initialized if changing the type "
- "of Hit::Score");
} // namespace lib
} // namespace icing
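
A short usage sketch of the mask-based API above; the document and section ids
are arbitrary illustrative values.

    // Illustrative usage of the mask-based DocHitInfo API.
    DocHitInfo info(/*document_id_in=*/5);
    info.UpdateSection(/*section_id=*/3);   // mask is now 0b1000
    info.UpdateSection(/*section_id=*/0);   // mask is now 0b1001

    DocHitInfo other(/*document_id_in=*/5);
    other.UpdateSection(/*section_id=*/1);  // other's mask is 0b0010

    // Masks are simply or'd together; neither document_id is touched.
    info.MergeSectionsFrom(other.hit_section_ids_mask());
    // info.hit_section_ids_mask() is now 0b1011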
diff --git a/icing/index/hit/doc-hit-info_test.cc b/icing/index/hit/doc-hit-info_test.cc
index d8adbc1..13eca9a 100644
--- a/icing/index/hit/doc-hit-info_test.cc
+++ b/icing/index/hit/doc-hit-info_test.cc
@@ -14,143 +14,29 @@
#include "icing/index/hit/doc-hit-info.h"
-#include "icing/index/hit/hit.h"
-#include "icing/schema/section.h"
-#include "icing/store/document-id.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
namespace icing {
namespace lib {
using ::testing::ElementsAre;
-using ::testing::Eq;
-using ::testing::IsTrue;
using ::testing::Ne;
-constexpr DocumentId kSomeDocumentId = 12;
-constexpr DocumentId kSomeOtherDocumentId = 54;
-
-TEST(DocHitInfoTest, InitialMaxHitScores) {
- DocHitInfo info(kSomeDocumentId);
- for (SectionId i = 0; i <= kMaxSectionId; ++i) {
- EXPECT_THAT(info.max_hit_score(i), Eq(Hit::kMaxHitScore));
- }
-}
-
-TEST(DocHitInfoTest, UpdateHitScores) {
- DocHitInfo info(kSomeDocumentId);
- ASSERT_THAT(info.max_hit_score(3), Eq(Hit::kMaxHitScore));
-
- // Updating a section for the first time, should change its max hit score,
- // even though the hit score (16) may be lower than the current value returned
- // by info.max_hit_score(3) (kMaxHitScore)
- info.UpdateSection(3, 16);
- EXPECT_THAT(info.max_hit_score(3), Eq(16));
-
- // Updating a section with a hit score lower than the previously set one
- // should not update max hit score.
- info.UpdateSection(3, 15);
- EXPECT_THAT(info.max_hit_score(3), Eq(16));
-
- // Updating a section with a hit score higher than the previously set one
- // should update the max hit score.
- info.UpdateSection(3, 17);
- EXPECT_THAT(info.max_hit_score(3), Eq(17));
-
- // Updating a section with kMaxHitScore should *always* set the max hit
- // score to kMaxHitScore (regardless of what value kMaxHitScore is
- // defined with).
- info.UpdateSection(3, Hit::kMaxHitScore);
- EXPECT_THAT(info.max_hit_score(3), Eq(Hit::kMaxHitScore));
-
- // Updating a section that has had kMaxHitScore explicitly set, should
- // *never* change the max hit score (regardless of what value kMaxHitScore
- // is defined with).
- info.UpdateSection(3, 16);
- EXPECT_THAT(info.max_hit_score(3), Eq(Hit::kMaxHitScore));
-}
-
-TEST(DocHitInfoTest, UpdateSectionIdMask) {
- DocHitInfo info(kSomeDocumentId);
- EXPECT_THAT(info.hit_section_ids_mask(), Eq(kSectionIdMaskNone));
-
- info.UpdateSection(3, 16);
- EXPECT_THAT(info.hit_section_ids_mask() & 1U << 3, IsTrue());
-
- // Calling update again shouldn't do anything
- info.UpdateSection(3, 15);
- EXPECT_THAT(info.hit_section_ids_mask() & 1U << 3, IsTrue());
-
- // Updating another section shouldn't do anything
- info.UpdateSection(2, 77);
- EXPECT_THAT(info.hit_section_ids_mask() & 1U << 3, IsTrue());
-}
-
-TEST(DocHitInfoTest, MergeSectionsFromDifferentDocumentId) {
- // Merging infos with different document_ids works.
- DocHitInfo info1(kSomeDocumentId);
- DocHitInfo info2(kSomeOtherDocumentId);
- info2.UpdateSection(7, 12);
- info1.MergeSectionsFrom(info2);
- EXPECT_THAT(info1.max_hit_score(7), Eq(12));
- EXPECT_THAT(info1.document_id(), Eq(kSomeDocumentId));
-}
-
-TEST(DocHitInfoTest, MergeSectionsFromKeepsOldSection) {
- // Merging shouldn't override sections that are present info1, but not present
- // in info2.
- DocHitInfo info1(kSomeDocumentId);
- info1.UpdateSection(3, 16);
- DocHitInfo info2(kSomeDocumentId);
- info1.MergeSectionsFrom(info2);
- EXPECT_THAT(info1.max_hit_score(3), Eq(16));
-}
-
-TEST(DocHitInfoTest, MergeSectionsFromAddsNewSection) {
- // Merging should add sections that were not present in info1, but are present
- // in info2.
- DocHitInfo info1(kSomeDocumentId);
- DocHitInfo info2(kSomeDocumentId);
- info2.UpdateSection(7, 12);
- info1.MergeSectionsFrom(info2);
- EXPECT_THAT(info1.max_hit_score(7), Eq(12));
-}
-
-TEST(DocHitInfoTest, MergeSectionsFromSetsHigherHitScore) {
- // Merging should override the value of a section in info1 if the same section
- // is present in info2 with a higher hit score.
- DocHitInfo info1(kSomeDocumentId);
- info1.UpdateSection(2, 77);
- DocHitInfo info2(kSomeDocumentId);
- info2.UpdateSection(2, 89);
- info1.MergeSectionsFrom(info2);
- EXPECT_THAT(info1.max_hit_score(2), Eq(89));
-}
-
-TEST(DocHitInfoTest, MergeSectionsFromDoesNotSetLowerHitScore) {
- // Merging should not override the hit score of a section in info1 if the same
- // section is present in info2 but with a lower hit score.
- DocHitInfo info1(kSomeDocumentId);
- info1.UpdateSection(5, 108);
- DocHitInfo info2(kSomeDocumentId);
- info2.UpdateSection(5, 13);
- info1.MergeSectionsFrom(info2);
- EXPECT_THAT(info1.max_hit_score(5), Eq(108));
-}
-
TEST(DocHitInfoTest, Comparison) {
constexpr DocumentId kDocumentId = 1;
DocHitInfo info(kDocumentId);
- info.UpdateSection(1, 12);
+ info.UpdateSection(1);
constexpr DocumentId kHighDocumentId = 15;
DocHitInfo high_document_id_info(kHighDocumentId);
- high_document_id_info.UpdateSection(1, 12);
+ high_document_id_info.UpdateSection(1);
DocHitInfo high_section_id_info(kDocumentId);
- high_section_id_info.UpdateSection(1, 12);
- high_section_id_info.UpdateSection(6, Hit::kMaxHitScore);
+ high_section_id_info.UpdateSection(1);
+ high_section_id_info.UpdateSection(6);
std::vector<DocHitInfo> infos{info, high_document_id_info,
high_section_id_info};
@@ -160,10 +46,10 @@ TEST(DocHitInfoTest, Comparison) {
// There are no requirements for how DocHitInfos with the same DocumentIds and
// hit masks will compare, but they must not be equal.
- DocHitInfo different_hit_score_info(kDocumentId);
- different_hit_score_info.UpdateSection(1, 76);
- EXPECT_THAT(info < different_hit_score_info,
- Ne(different_hit_score_info < info));
+ DocHitInfo different_term_frequency_info(kDocumentId);
+ different_term_frequency_info.UpdateSection(2);
+ EXPECT_THAT(info < different_term_frequency_info,
+ Ne(different_term_frequency_info < info));
}
} // namespace lib
diff --git a/icing/index/hit/hit.cc b/icing/index/hit/hit.cc
index 1852bd5..493e62b 100644
--- a/icing/index/hit/hit.cc
+++ b/icing/index/hit/hit.cc
@@ -30,13 +30,25 @@ enum FlagOffset {
// This hit represents a prefix of a longer term. If exact matches are
// required, then this hit should be ignored.
kPrefixHit = 1,
- // Whether or not the hit has a hit score other than kMaxHitScore.
- kHasScore = 2,
+ // Whether or not the hit has a term_frequency other than
+ // kDefaultTermFrequency.
+ kHasTermFrequency = 2,
kNumFlags = 3,
};
+
+static_assert(kDocumentIdBits + kSectionIdBits + kNumFlags <
+ sizeof(Hit::Value) * 8,
+              "Hit::kInvalidValue contains a risky value and we should have "
+ "least one unused bit to avoid potential bugs. Please follow the "
+ "process mentioned in hit.h to correct the value of "
+ "Hit::kInvalidValue and remove this static_assert afterwards.");
+
static_assert(kDocumentIdBits + kSectionIdBits + kNumFlags <=
sizeof(Hit::Value) * 8,
"HitOverflow");
+static_assert(kDocumentIdBits == 22, "");
+static_assert(kSectionIdBits == 6, "");
+static_assert(kNumFlags == 3, "");
inline DocumentId InvertDocumentId(DocumentId document_id) {
static_assert(kMaxDocumentId <= (std::numeric_limits<DocumentId>::max() - 1),
@@ -51,9 +63,35 @@ inline DocumentId InvertDocumentId(DocumentId document_id) {
} // namespace
-Hit::Hit(SectionId section_id, DocumentId document_id, Hit::Score score,
- bool is_in_prefix_section, bool is_prefix_hit)
- : score_(score) {
+BasicHit::BasicHit(SectionId section_id, DocumentId document_id) {
+ // Values are stored so that when sorted, they appear in document_id
+ // descending, section_id ascending, order. So inverted document_id appears in
+ // the most significant bits, followed by (uninverted) section_id.
+ Value temp_value = 0;
+ bit_util::BitfieldSet(/*new_value=*/InvertDocumentId(document_id),
+ /*lsb_offset=*/kSectionIdBits, /*len=*/kDocumentIdBits,
+ /*value_out=*/&temp_value);
+ bit_util::BitfieldSet(/*new_value=*/section_id, /*lsb_offset=*/0,
+ /*len=*/kSectionIdBits, /*value_out=*/&temp_value);
+ value_ = temp_value;
+}
+
+DocumentId BasicHit::document_id() const {
+ DocumentId inverted_document_id = bit_util::BitfieldGet(
+ value_, /*lsb_offset=*/kSectionIdBits, /*len=*/kDocumentIdBits);
+ // Undo the document_id inversion.
+ return InvertDocumentId(inverted_document_id);
+}
+
+SectionId BasicHit::section_id() const {
+ return bit_util::BitfieldGet(value_, /*lsb_offset=*/0,
+ /*len=*/kSectionIdBits);
+}
+
+Hit::Hit(SectionId section_id, DocumentId document_id,
+ Hit::TermFrequency term_frequency, bool is_in_prefix_section,
+ bool is_prefix_hit)
+ : term_frequency_(term_frequency) {
// Values are stored so that when sorted, they appear in document_id
// descending, section_id ascending, order. Also, all else being
// equal, non-prefix hits sort before prefix hits. So inverted
@@ -64,9 +102,11 @@ Hit::Hit(SectionId section_id, DocumentId document_id, Hit::Score score,
kSectionIdBits + kNumFlags, kDocumentIdBits,
&temp_value);
bit_util::BitfieldSet(section_id, kNumFlags, kSectionIdBits, &temp_value);
- bit_util::BitfieldSet(score != kMaxHitScore, kHasScore, 1, &temp_value);
- bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, 1, &temp_value);
- bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection, 1, &temp_value);
+ bit_util::BitfieldSet(term_frequency != kDefaultTermFrequency,
+ kHasTermFrequency, /*len=*/1, &temp_value);
+ bit_util::BitfieldSet(is_prefix_hit, kPrefixHit, /*len=*/1, &temp_value);
+ bit_util::BitfieldSet(is_in_prefix_section, kInPrefixSection,
+ /*len=*/1, &temp_value);
value_ = temp_value;
}
@@ -81,8 +121,8 @@ SectionId Hit::section_id() const {
return bit_util::BitfieldGet(value(), kNumFlags, kSectionIdBits);
}
-bool Hit::has_score() const {
- return bit_util::BitfieldGet(value(), kHasScore, 1);
+bool Hit::has_term_frequency() const {
+ return bit_util::BitfieldGet(value(), kHasTermFrequency, 1);
}
bool Hit::is_prefix_hit() const {
@@ -93,6 +133,11 @@ bool Hit::is_in_prefix_section() const {
return bit_util::BitfieldGet(value(), kInPrefixSection, 1);
}
+Hit Hit::TranslateHit(Hit old_hit, DocumentId new_document_id) {
+ return Hit(old_hit.section_id(), new_document_id, old_hit.term_frequency(),
+ old_hit.is_in_prefix_section(), old_hit.is_prefix_hit());
+}
+
bool Hit::EqualsDocumentIdAndSectionId::operator()(const Hit& hit1,
const Hit& hit2) const {
return (hit1.value() >> kNumFlags) == (hit2.value() >> kNumFlags);
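
As a standalone illustration of the inverted-document_id layout described
above: the constants below restate kDocumentIdBits/kSectionIdBits for the
sketch only and are not new definitions.

    #include <cstdint>

    // Sketch of the BasicHit value layout: 4 unused bits, then the inverted
    // 22-bit document_id, then the 6-bit section_id.
    constexpr uint32_t kSketchDocumentIdBits = 22;
    constexpr uint32_t kSketchSectionIdBits = 6;
    // The -2 mirrors kMaxDocumentId: valid inverted ids are then never 0.
    constexpr uint32_t kSketchMaxDocumentId =
        (uint32_t{1} << kSketchDocumentIdBits) - 2;

    constexpr uint32_t EncodeBasicHit(uint32_t document_id,
                                      uint32_t section_id) {
      // Higher document_ids invert to smaller values, so sorting encoded
      // values ascending yields document_id-descending order.
      uint32_t inverted_document_id = kSketchMaxDocumentId - document_id + 1;
      return (inverted_document_id << kSketchSectionIdBits) | section_id;
    }

    // A hit in a newer (higher-id) document encodes to a smaller value.
    static_assert(EncodeBasicHit(/*document_id=*/100, /*section_id=*/0) <
                  EncodeBasicHit(/*document_id=*/10, /*section_id=*/0));
    // Within one document, section_ids sort ascending.
    static_assert(EncodeBasicHit(/*document_id=*/100, /*section_id=*/1) <
                  EncodeBasicHit(/*document_id=*/100, /*section_id=*/2));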
diff --git a/icing/index/hit/hit.h b/icing/index/hit/hit.h
index d1be204..111b320 100644
--- a/icing/index/hit/hit.h
+++ b/icing/index/hit/hit.h
@@ -15,6 +15,7 @@
#ifndef ICING_INDEX_HIT_HIT_H_
#define ICING_INDEX_HIT_HIT_H_
+#include <array>
#include <cstdint>
#include <limits>
@@ -25,56 +26,131 @@
namespace icing {
namespace lib {
+// BasicHit is a specific encoding that refers to content within a document. A
+// basic hit consists of:
+// - a DocumentId
+// - a SectionId
+// referring to the document and section that the hit corresponds to.
+//
+// The hit is the most basic unit of the index and, when grouped together by
+// term, can be used to encode what terms appear in what documents.
+//
+// BasicHit is for indices (e.g. numeric index) that don't require term
+// frequency.
+class BasicHit {
+ public:
+ // The datatype used to encode BasicHit information: the document_id and
+ // section_id.
+ using Value = uint32_t;
+
+ // WARNING: Changing this value will invalidate any pre-existing posting lists
+ // on user devices.
+ //
+ // kInvalidValue contains:
+ // - 0 for unused bits. Note that unused bits are always 0 for both valid and
+ // invalid BasicHit values.
+ // - Inverted kInvalidDocumentId
+ // - SectionId 0 (valid), which is ok because inverted kInvalidDocumentId has
+ // already invalidated the value. In fact, we currently use all 2^6 section
+ // ids and there is no "invalid section id", so it doesn't matter what
+ // SectionId we set for kInvalidValue.
+ static constexpr Value kInvalidValue = 0;
+
+ explicit BasicHit(SectionId section_id, DocumentId document_id);
+
+ explicit BasicHit() : value_(kInvalidValue) {}
+
+ bool is_valid() const { return value_ != kInvalidValue; }
+ Value value() const { return value_; }
+ DocumentId document_id() const;
+ SectionId section_id() const;
+
+ bool operator<(const BasicHit& h2) const { return value_ < h2.value_; }
+ bool operator==(const BasicHit& h2) const { return value_ == h2.value_; }
+
+ private:
+ // Value bits layout: 4 unused + 22 document_id + 6 section id.
+ Value value_;
+} __attribute__((packed));
+static_assert(sizeof(BasicHit) == 4, "");
+
// Hit is a specific encoding that refers to content within a document. A hit
// consists of:
// - a DocumentId
// - a SectionId
// referring to the document and section that the hit corresponds to, as well as
// metadata about the hit:
-// - whether the Hit has a Score other than the default value
+// - whether the Hit has a TermFrequency other than the default value
// - whether the Hit does not appear exactly in the document, but instead
// represents a term that is a prefix of a term in the document
// - whether the Hit came from a section that has prefix expansion enabled
-// and a score for the hit. Ranging from [0,255] a higher score indicates a
-// higher quality hit.
+// and a term frequency for the hit.
+//
// The hit is the most basic unit of the index and, when grouped together by
// term, can be used to encode what terms appear in what documents.
class Hit {
public:
// The datatype used to encode Hit information: the document_id, section_id
- // and the has_score, prefix hit and in prefix section flags.
+ // and the has_term_frequency, prefix hit and in prefix section flags.
using Value = uint32_t;
// WARNING: Changing this value will invalidate any pre-existing posting lists
// on user devices.
+ //
+ // WARNING:
+ // - Hit::kInvalidValue should contain inverted kInvalidDocumentId, which is
+ // b'00...0. However, currently we set it as UINT32_MAX and actually it
+ // contains b'11...1, which is the inverted document_id 0.
+ // - It means Hit::kInvalidValue contains valid (document_id, section_id,
+  // - This means Hit::kInvalidValue contains a valid (document_id, section_id,
+  //   flags) combination, so we potentially cannot distinguish whether a Hit
+  //   is invalid or not. Invalidity is an essential feature for posting lists,
+  //   since we use it to determine the state of a posting list.
+ // unused bit(s) are set as 1 for Hit::kInvalidValue and 0 for all valid
+ // Hits. In other words, the unused bit(s) are actually serving as "invalid
+ // flag".
+ // - If we want to exhaust all unused bits in the future, then we have to
+ // change Hit::kInvalidValue to set the inverted document_id section
+ // correctly (b'00...0, refer to BasicHit::kInvalidValue as an example).
+  // - Also, this problem is guarded by a static_assert in hit.cc. If all
+  //   unused bits are exhausted, the static_assert will detect this and fail.
+  //   We can safely remove the static_assert check after following the above
+  //   process to resolve the incorrect Hit::kInvalidValue issue.
static constexpr Value kInvalidValue = std::numeric_limits<Value>::max();
// Docs are sorted in reverse, and 0 is never used as the inverted
// DocumentId (because it is the inverse of kInvalidValue), so it is always
// the max in a descending sort.
static constexpr Value kMaxDocumentIdSortValue = 0;
- // A score reflecting the "quality" of this hit. The higher the score, the
- // higher quality the hit.
- using Score = uint8_t;
- // By default, hits are given the highest possible score.
- static constexpr Score kMaxHitScore = std::numeric_limits<Score>::max();
+ // The Term Frequency of a Hit.
+ using TermFrequency = uint8_t;
+ using TermFrequencyArray = std::array<Hit::TermFrequency, kTotalNumSections>;
+ // Max TermFrequency is 255.
+ static constexpr TermFrequency kMaxTermFrequency =
+ std::numeric_limits<TermFrequency>::max();
+ static constexpr TermFrequency kDefaultTermFrequency = 1;
+ static constexpr TermFrequency kNoTermFrequency = 0;
- explicit Hit(Value value = kInvalidValue, Score score = kMaxHitScore)
- : value_(value), score_(score) {}
- Hit(SectionId section_id, DocumentId document_id, Score score,
- bool is_in_prefix_section = false, bool is_prefix_hit = false);
+ explicit Hit(Value value = kInvalidValue,
+ TermFrequency term_frequency = kDefaultTermFrequency)
+ : value_(value), term_frequency_(term_frequency) {}
+ Hit(SectionId section_id, DocumentId document_id,
+ TermFrequency term_frequency, bool is_in_prefix_section = false,
+ bool is_prefix_hit = false);
bool is_valid() const { return value() != kInvalidValue; }
Value value() const { return value_; }
DocumentId document_id() const;
SectionId section_id() const;
- // Whether or not the hit contains a non-default score. Hits with non-default
- // score are considered to be of lower quality.
- bool has_score() const;
- Score score() const { return score_; }
+ // Whether or not the hit contains a valid term frequency.
+ bool has_term_frequency() const;
+ TermFrequency term_frequency() const { return term_frequency_; }
bool is_prefix_hit() const;
bool is_in_prefix_section() const;
+ // Creates a new hit based on old_hit but with new_document_id set.
+ static Hit TranslateHit(Hit old_hit, DocumentId new_document_id);
+
bool operator<(const Hit& h2) const { return value() < h2.value(); }
bool operator==(const Hit& h2) const { return value() == h2.value(); }
@@ -83,10 +159,10 @@ class Hit {
};
private:
- // Value and score must be in this order.
- // Value bits layout: 5 unused + 20 document_id + 4 section id + 3 flags.
+ // Value and TermFrequency must be in this order.
+ // Value bits layout: 1 unused + 22 document_id + 6 section id + 3 flags.
Value value_;
- Score score_;
+ TermFrequency term_frequency_;
} __attribute__((packed));
static_assert(sizeof(Hit) == 5, "");
// TODO(b/138991332) decide how to remove/replace all is_packed_pod assertions.
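
A brief sketch of how the term-frequency flag described above behaves; the ids
and frequencies are illustrative values only.

    // The kHasTermFrequency flag is set iff the supplied term frequency
    // differs from Hit::kDefaultTermFrequency (1).
    Hit default_tf_hit(/*section_id=*/2, /*document_id=*/7,
                       Hit::kDefaultTermFrequency);
    // default_tf_hit.has_term_frequency() == false
    // default_tf_hit.term_frequency() == Hit::kDefaultTermFrequency

    Hit explicit_tf_hit(/*section_id=*/2, /*document_id=*/7,
                        /*term_frequency=*/42);
    // explicit_tf_hit.has_term_frequency() == true
    // explicit_tf_hit.term_frequency() == 42

    // TranslateHit rebuilds a hit for a new document_id (e.g. after an
    // optimize pass), preserving section_id, term frequency and both flags.
    Hit translated = Hit::TranslateHit(explicit_tf_hit, /*new_document_id=*/3);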
diff --git a/icing/index/hit/hit_test.cc b/icing/index/hit/hit_test.cc
index 17db66b..0086d91 100644
--- a/icing/index/hit/hit_test.cc
+++ b/icing/index/hit/hit_test.cc
@@ -26,6 +26,7 @@ namespace {
using ::testing::ElementsAre;
using ::testing::Eq;
+using ::testing::Ge;
using ::testing::IsFalse;
using ::testing::IsTrue;
using ::testing::Lt;
@@ -33,46 +34,103 @@ using ::testing::Not;
static constexpr DocumentId kSomeDocumentId = 24;
static constexpr SectionId kSomeSectionid = 5;
-static constexpr Hit::Score kSomeHitScore = 57;
+static constexpr Hit::TermFrequency kSomeTermFrequency = 57;
-TEST(HitTest, HasScoreFlag) {
- Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore);
- EXPECT_THAT(h1.has_score(), IsFalse());
- EXPECT_THAT(h1.score(), Eq(Hit::kMaxHitScore));
+TEST(BasicHitTest, Accessors) {
+ BasicHit h1(kSomeSectionid, kSomeDocumentId);
+ EXPECT_THAT(h1.document_id(), Eq(kSomeDocumentId));
+ EXPECT_THAT(h1.section_id(), Eq(kSomeSectionid));
+}
+
+TEST(BasicHitTest, Invalid) {
+ BasicHit default_invalid;
+ EXPECT_THAT(default_invalid.is_valid(), IsFalse());
+
+ // Also make sure the invalid BasicHit contains an invalid document id.
+ EXPECT_THAT(default_invalid.document_id(), Eq(kInvalidDocumentId));
+ EXPECT_THAT(default_invalid.section_id(), Eq(kMinSectionId));
+}
+
+TEST(BasicHitTest, Valid) {
+ BasicHit maximum_document_id_hit(kSomeSectionid, kMaxDocumentId);
+ EXPECT_THAT(maximum_document_id_hit.is_valid(), IsTrue());
+
+ BasicHit maximum_section_id_hit(kMaxSectionId, kSomeDocumentId);
+ EXPECT_THAT(maximum_section_id_hit.is_valid(), IsTrue());
+
+ BasicHit minimum_document_id_hit(kSomeSectionid, kMinDocumentId);
+ EXPECT_THAT(minimum_document_id_hit.is_valid(), IsTrue());
+
+ BasicHit minimum_section_id_hit(kMinSectionId, kSomeDocumentId);
+ EXPECT_THAT(minimum_section_id_hit.is_valid(), IsTrue());
+
+ BasicHit all_maximum_hit(kMaxSectionId, kMaxDocumentId);
+ EXPECT_THAT(all_maximum_hit.is_valid(), IsTrue());
- Hit h2(kSomeSectionid, kSomeDocumentId, kSomeHitScore);
- EXPECT_THAT(h2.has_score(), IsTrue());
- EXPECT_THAT(h2.score(), Eq(kSomeHitScore));
+ BasicHit all_minimum_hit(kMinSectionId, kMinDocumentId);
+ EXPECT_THAT(all_minimum_hit.is_valid(), IsTrue());
+
+  // We use an invalid BasicHit for std::lower_bound. Verify that the value of
+  // the smallest valid BasicHit (which contains kMinSectionId, kMaxDocumentId)
+  // is >= BasicHit::kInvalidValue.
+ BasicHit smallest_hit(kMinSectionId, kMaxDocumentId);
+ ASSERT_THAT(smallest_hit.is_valid(), IsTrue());
+ EXPECT_THAT(smallest_hit.value(), Ge(BasicHit::kInvalidValue));
+}
+
+TEST(BasicHitTest, Comparison) {
+ BasicHit hit(/*section_id=*/1, /*document_id=*/243);
+  // Encoded hit values sort in ascending order, with document_ids inverted in
+  // the encoding. So a hit with a lower document_id compares greater than one
+  // with a higher document_id.
+ BasicHit higher_document_id_hit(/*section_id=*/1, /*document_id=*/2409);
+ BasicHit higher_section_id_hit(/*section_id=*/15, /*document_id=*/243);
+
+ std::vector<BasicHit> hits{hit, higher_document_id_hit,
+ higher_section_id_hit};
+ std::sort(hits.begin(), hits.end());
+ EXPECT_THAT(hits,
+ ElementsAre(higher_document_id_hit, hit, higher_section_id_hit));
+}
+
+TEST(HitTest, HasTermFrequencyFlag) {
+ Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency);
+ EXPECT_THAT(h1.has_term_frequency(), IsFalse());
+ EXPECT_THAT(h1.term_frequency(), Eq(Hit::kDefaultTermFrequency));
+
+ Hit h2(kSomeSectionid, kSomeDocumentId, kSomeTermFrequency);
+ EXPECT_THAT(h2.has_term_frequency(), IsTrue());
+ EXPECT_THAT(h2.term_frequency(), Eq(kSomeTermFrequency));
}
TEST(HitTest, IsPrefixHitFlag) {
- Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore);
+ Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency);
EXPECT_THAT(h1.is_prefix_hit(), IsFalse());
- Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore,
+ Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/false);
EXPECT_THAT(h2.is_prefix_hit(), IsFalse());
- Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore,
+ Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false, /*is_prefix_hit=*/true);
EXPECT_THAT(h3.is_prefix_hit(), IsTrue());
}
TEST(HitTest, IsInPrefixSectionFlag) {
- Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore);
+ Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency);
EXPECT_THAT(h1.is_in_prefix_section(), IsFalse());
- Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore,
+ Hit h2(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/false);
EXPECT_THAT(h2.is_in_prefix_section(), IsFalse());
- Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore,
+ Hit h3(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/true);
EXPECT_THAT(h3.is_in_prefix_section(), IsTrue());
}
TEST(HitTest, Accessors) {
- Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kMaxHitScore);
+ Hit h1(kSomeSectionid, kSomeDocumentId, Hit::kDefaultTermFrequency);
EXPECT_THAT(h1.document_id(), Eq(kSomeDocumentId));
EXPECT_THAT(h1.section_id(), Eq(kSomeSectionid));
}
@@ -88,47 +146,64 @@ TEST(HitTest, Valid) {
Hit explicit_valid(kSomeValue);
EXPECT_THAT(explicit_valid.is_valid(), IsTrue());
- Hit maximum_document_id_hit(kSomeSectionid, kMaxDocumentId, kSomeHitScore);
+ Hit maximum_document_id_hit(kSomeSectionid, kMaxDocumentId,
+ kSomeTermFrequency);
EXPECT_THAT(maximum_document_id_hit.is_valid(), IsTrue());
- Hit maximum_section_id_hit(kMaxSectionId, kSomeDocumentId, kSomeHitScore);
+ Hit maximum_section_id_hit(kMaxSectionId, kSomeDocumentId,
+ kSomeTermFrequency);
EXPECT_THAT(maximum_section_id_hit.is_valid(), IsTrue());
- Hit minimum_document_id_hit(kSomeSectionid, 0, kSomeHitScore);
+ Hit minimum_document_id_hit(kSomeSectionid, 0, kSomeTermFrequency);
EXPECT_THAT(minimum_document_id_hit.is_valid(), IsTrue());
- Hit minimum_section_id_hit(0, kSomeDocumentId, kSomeHitScore);
+ Hit minimum_section_id_hit(0, kSomeDocumentId, kSomeTermFrequency);
EXPECT_THAT(minimum_section_id_hit.is_valid(), IsTrue());
+
+  // We use a Hit with value Hit::kMaxDocumentIdSortValue for std::lower_bound
+  // in the lite index. Verify that the value of the smallest valid Hit (which
+  // contains kMinSectionId, kMaxDocumentId, and all 3 flags false) is >=
+  // Hit::kMaxDocumentIdSortValue.
+ Hit smallest_hit(kMinSectionId, kMaxDocumentId, Hit::kDefaultTermFrequency);
+ ASSERT_THAT(smallest_hit.is_valid(), IsTrue());
+ ASSERT_THAT(smallest_hit.has_term_frequency(), IsFalse());
+ ASSERT_THAT(smallest_hit.is_prefix_hit(), IsFalse());
+ ASSERT_THAT(smallest_hit.is_in_prefix_section(), IsFalse());
+ EXPECT_THAT(smallest_hit.value(), Ge(Hit::kMaxDocumentIdSortValue));
}
TEST(HitTest, Comparison) {
- Hit hit(1, 243, Hit::kMaxHitScore);
+ Hit hit(1, 243, Hit::kDefaultTermFrequency);
// DocumentIds are sorted in ascending order. So a hit with a lower
// document_id should be considered greater than one with a higher
// document_id.
- Hit higher_document_id_hit(1, 2409, Hit::kMaxHitScore);
- Hit higher_section_id_hit(15, 243, Hit::kMaxHitScore);
- // Whether or not a hit score was set is considered, but the score itself is
- // not.
- Hit hitscore_hit(1, 243, 12);
- Hit prefix_hit(1, 243, Hit::kMaxHitScore, /*is_in_prefix_section=*/false,
+ Hit higher_document_id_hit(1, 2409, Hit::kDefaultTermFrequency);
+ Hit higher_section_id_hit(15, 243, Hit::kDefaultTermFrequency);
+ // Whether or not a term frequency was set is considered, but the term
+ // frequency itself is not.
+ Hit term_frequency_hit(1, 243, 12);
+ Hit prefix_hit(1, 243, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false,
/*is_prefix_hit=*/true);
- Hit hit_in_prefix_section(1, 243, Hit::kMaxHitScore,
+ Hit hit_in_prefix_section(1, 243, Hit::kDefaultTermFrequency,
/*is_in_prefix_section=*/true,
/*is_prefix_hit=*/false);
- std::vector<Hit> hits{
- hit, higher_document_id_hit, higher_section_id_hit, hitscore_hit,
- prefix_hit, hit_in_prefix_section};
+ std::vector<Hit> hits{hit,
+ higher_document_id_hit,
+ higher_section_id_hit,
+ term_frequency_hit,
+ prefix_hit,
+ hit_in_prefix_section};
std::sort(hits.begin(), hits.end());
- EXPECT_THAT(hits,
- ElementsAre(higher_document_id_hit, hit, hit_in_prefix_section,
- prefix_hit, hitscore_hit, higher_section_id_hit));
-
- Hit higher_hitscore_hit(1, 243, 108);
- // Hit score value is not considered when comparing hits.
- EXPECT_THAT(hitscore_hit, Not(Lt(higher_hitscore_hit)));
- EXPECT_THAT(higher_hitscore_hit, Not(Lt(hitscore_hit)));
+ EXPECT_THAT(
+ hits, ElementsAre(higher_document_id_hit, hit, hit_in_prefix_section,
+ prefix_hit, term_frequency_hit, higher_section_id_hit));
+
+ Hit higher_term_frequency_hit(1, 243, 108);
+ // The term frequency value is not considered when comparing hits.
+ EXPECT_THAT(term_frequency_hit, Not(Lt(higher_term_frequency_hit)));
+ EXPECT_THAT(higher_term_frequency_hit, Not(Lt(term_frequency_hit)));
}
} // namespace
diff --git a/icing/index/index-processor.cc b/icing/index/index-processor.cc
index 7076257..9a773e8 100644
--- a/icing/index/index-processor.cc
+++ b/icing/index/index-processor.cc
@@ -14,98 +14,34 @@
#include "icing/index/index-processor.h"
-#include <cstdint>
#include <memory>
-#include <string>
-#include <string_view>
-#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/absl_ports/str_cat.h"
-#include "icing/index/index.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/proto/document.pb.h"
-#include "icing/proto/schema.pb.h"
-#include "icing/proto/term.pb.h"
-#include "icing/schema/section-manager.h"
-#include "icing/schema/section.h"
+#include "icing/index/data-indexing-handler.h"
+#include "icing/proto/logging.pb.h"
#include "icing/store/document-id.h"
-#include "icing/tokenization/language-segmenter.h"
-#include "icing/tokenization/token.h"
-#include "icing/tokenization/tokenizer-factory.h"
-#include "icing/tokenization/tokenizer.h"
-#include "icing/transform/normalizer.h"
#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
namespace icing {
namespace lib {
-libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>>
-IndexProcessor::Create(const SchemaStore* schema_store,
- const LanguageSegmenter* lang_segmenter,
- const Normalizer* normalizer, Index* index,
- const IndexProcessor::Options& options) {
- ICING_RETURN_ERROR_IF_NULL(schema_store);
- ICING_RETURN_ERROR_IF_NULL(lang_segmenter);
- ICING_RETURN_ERROR_IF_NULL(normalizer);
- ICING_RETURN_ERROR_IF_NULL(index);
-
- return std::unique_ptr<IndexProcessor>(new IndexProcessor(
- schema_store, lang_segmenter, normalizer, index, options));
-}
-
libtextclassifier3::Status IndexProcessor::IndexDocument(
- const DocumentProto& document, DocumentId document_id) {
- if (index_->last_added_document_id() != kInvalidDocumentId &&
- document_id <= index_->last_added_document_id()) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "DocumentId %d must be greater than last added document_id %d",
- document_id, index_->last_added_document_id()));
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
+ PutDocumentStatsProto* put_document_stats) {
+ std::unique_ptr<Timer> index_timer = clock_.GetNewTimer();
+
+ for (auto& data_indexing_handler : data_indexing_handlers_) {
+ ICING_RETURN_IF_ERROR(data_indexing_handler->Handle(
+ tokenized_document, document_id, recovery_mode_, put_document_stats));
}
- ICING_ASSIGN_OR_RETURN(std::vector<Section> sections,
- schema_store_.ExtractSections(document));
- uint32_t num_tokens = 0;
- libtextclassifier3::Status overall_status;
- for (const Section& section : sections) {
- // TODO(b/152934343): pass real namespace ids in
- Index::Editor editor =
- index_->Edit(document_id, section.metadata.id,
- section.metadata.term_match_type, /*namespace_id=*/0);
- for (std::string_view subcontent : section.content) {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- section.metadata.tokenizer, &lang_segmenter_));
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr,
- tokenizer->Tokenize(subcontent));
- while (itr->Advance()) {
- if (++num_tokens > options_.max_tokens_per_document) {
- switch (options_.token_limit_behavior) {
- case Options::TokenLimitBehavior::kReturnError:
- return absl_ports::ResourceExhaustedError(
- "Max number of tokens reached!");
- case Options::TokenLimitBehavior::kSuppressError:
- return libtextclassifier3::Status::OK;
- }
- }
- std::string term = normalizer_.NormalizeTerm(itr->GetToken().text);
- // Add this term to the index. Even if adding this hit fails, we keep
- // trying to add more hits because it's possible that future hits could
- // still be added successfully. For instance if the lexicon is full, we
- // might fail to add a hit for a new term, but should still be able to
- // add hits for terms that are already in the index.
- auto status = editor.AddHit(term.c_str());
- if (overall_status.ok() && !status.ok()) {
- // If we've succeeded to add everything so far, set overall_status to
- // represent this new failure. If we've already failed, no need to
- // update the status - we're already going to return a resource
- // exhausted error.
- overall_status = status;
- }
- }
- }
+
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_index_latency_ms(
+ index_timer->GetElapsedMilliseconds());
}
- return overall_status;
+
+ return libtextclassifier3::Status::OK;
}
} // namespace lib
diff --git a/icing/index/index-processor.h b/icing/index/index-processor.h
index c3ccac3..9b96f00 100644
--- a/icing/index/index-processor.h
+++ b/icing/index/index-processor.h
@@ -16,84 +16,49 @@
#define ICING_INDEX_INDEX_PROCESSOR_H_
#include <cstdint>
-#include <string>
+#include <memory>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/index/index.h"
-#include "icing/proto/document.pb.h"
-#include "icing/schema/schema-store.h"
-#include "icing/schema/section-manager.h"
+#include "icing/index/data-indexing-handler.h"
+#include "icing/proto/logging.pb.h"
#include "icing/store/document-id.h"
-#include "icing/tokenization/language-segmenter.h"
-#include "icing/tokenization/token.h"
-#include "icing/transform/normalizer.h"
+#include "icing/util/tokenized-document.h"
namespace icing {
namespace lib {
class IndexProcessor {
public:
- struct Options {
- int32_t max_tokens_per_document;
+ explicit IndexProcessor(std::vector<std::unique_ptr<DataIndexingHandler>>&&
+ data_indexing_handlers,
+ const Clock* clock, bool recovery_mode = false)
+ : data_indexing_handlers_(std::move(data_indexing_handlers)),
+ clock_(*clock),
+ recovery_mode_(recovery_mode) {}
- // Indicates how a document exceeding max_tokens_per_document should be
- // handled.
- enum class TokenLimitBehavior {
- // When set, the first max_tokens_per_document will be indexed. If the
- // token count exceeds max_tokens_per_document, a ResourceExhausted error
- // will be returned.
- kReturnError,
- // When set, the first max_tokens_per_document will be indexed. If the
- // token count exceeds max_tokens_per_document, OK will be returned.
- kSuppressError,
- };
- TokenLimitBehavior token_limit_behavior;
- };
-
- // Factory function to create an IndexProcessor which does not take ownership
- // of any input components, and all pointers must refer to valid objects that
- // outlive the created IndexProcessor instance.
+  // Adds the tokenized document to the index, associated with document_id. If
+  // the number of tokens in the document exceeds max_tokens_per_document, then
+  // only the first max_tokens_per_document will be added to the index. All
+  // tokens of length exceeding max_token_length will be shortened to
+  // max_token_length.
//
- // Returns:
- // An IndexProcessor on success
- // FAILED_PRECONDITION if any of the pointers is null.
- static libtextclassifier3::StatusOr<std::unique_ptr<IndexProcessor>> Create(
- const SchemaStore* schema_store, const LanguageSegmenter* lang_segmenter,
- const Normalizer* normalizer, Index* index, const Options& options);
-
- // Add document to the index, associated with document_id. If the number of
- // tokens in the document exceeds max_tokens_per_document, then only the first
- // max_tokens_per_document will be added to the index. All tokens of length
- // exceeding max_token_length will be shortened to max_token_length.
+ // Indexing a document *may* trigger an index merge. If a merge fails, then
+ // all content in the index will be lost.
+ //
+ // If put_document_stats is present, the fields related to indexing will be
+ // populated.
//
// Returns:
- // INVALID_ARGUMENT if document_id is less than the document_id of a
- // previously indexed
- // document or tokenization fails.
- // RESOURCE_EXHAUSTED if the index is full and can't add anymore content.
- // NOT_FOUND if there is no definition for the document's schema type.
- // INTERNAL_ERROR if any other errors occur
- libtextclassifier3::Status IndexDocument(const DocumentProto& document,
- DocumentId document_id);
+ // - OK on success.
+ // - Any DataIndexingHandler errors.
+ libtextclassifier3::Status IndexDocument(
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
+ PutDocumentStatsProto* put_document_stats = nullptr);
private:
- IndexProcessor(const SchemaStore* schema_store,
- const LanguageSegmenter* lang_segmenter,
- const Normalizer* normalizer, Index* index,
- const Options& options)
- : schema_store_(*schema_store),
- lang_segmenter_(*lang_segmenter),
- normalizer_(*normalizer),
- index_(index),
- options_(options) {}
-
- std::string NormalizeToken(const Token& token);
-
- const SchemaStore& schema_store_;
- const LanguageSegmenter& lang_segmenter_;
- const Normalizer& normalizer_;
- Index* const index_;
- const Options options_;
+ std::vector<std::unique_ptr<DataIndexingHandler>> data_indexing_handlers_;
+ const Clock& clock_; // Does not own.
+ bool recovery_mode_;
};
} // namespace lib
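
A minimal wiring sketch for the new constructor and IndexDocument signature
above. The handler variables and the surrounding setup (clock, tokenized
document, an enclosing function returning libtextclassifier3::Status) are
assumed and not shown.

    // Illustrative only; handler creation and error handling are elided.
    Clock clock;
    std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
    handlers.push_back(std::move(term_indexing_handler));     // assumed
    handlers.push_back(std::move(integer_indexing_handler));  // assumed

    IndexProcessor index_processor(std::move(handlers), &clock,
                                   /*recovery_mode=*/false);
    PutDocumentStatsProto put_document_stats;
    ICING_RETURN_IF_ERROR(index_processor.IndexDocument(
        tokenized_document, /*document_id=*/0, &put_document_stats));
    // put_document_stats.index_latency_ms() now covers all handlers' work.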
diff --git a/icing/index/index-processor_benchmark.cc b/icing/index/index-processor_benchmark.cc
index 00d116f..8f5e319 100644
--- a/icing/index/index-processor_benchmark.cc
+++ b/icing/index/index-processor_benchmark.cc
@@ -12,32 +12,50 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
+#include "third_party/absl/flags/flag.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/index/data-indexing-handler.h"
#include "icing/index/index-processor.h"
#include "icing/index/index.h"
+#include "icing/index/integer-section-indexing-handler.h"
+#include "icing/index/numeric/integer-index.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/index/term-indexing-handler.h"
#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/index/icing-filesystem.h"
#include "icing/schema/schema-store.h"
-#include "icing/schema/schema-util.h"
-#include "icing/schema/section-manager.h"
+#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "icing/util/clock.h"
#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
+#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
// //icing/index:index-processor_benchmark
//
// $ blaze-bin/icing/index/index-processor_benchmark
-// --benchmarks=all
+// --benchmark_filter=all
//
// Run on an Android device:
// Make target //icing/tokenization:language-segmenter depend on
@@ -53,7 +71,8 @@
// $ adb push blaze-bin/icing/index/index-processor_benchmark
// /data/local/tmp/
//
-// $ adb shell /data/local/tmp/index-processor_benchmark --benchmarks=all
+// $ adb shell /data/local/tmp/index-processor_benchmark
+// --benchmark_filter=all
// --adb
// Flag to tell the benchmark that it'll be run on an Android device via adb,
@@ -65,6 +84,8 @@ namespace lib {
namespace {
+using ::testing::IsTrue;
+
// Creates a fake type config with 10 properties (p0 - p9)
void CreateFakeTypeConfig(SchemaTypeConfigProto* type_config) {
type_config->set_schema_type("Fake_Type");
@@ -74,11 +95,11 @@ void CreateFakeTypeConfig(SchemaTypeConfigProto* type_config) {
property->set_property_name(
IcingStringUtil::StringPrintf("p%d", i)); // p0 - p9
property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property->mutable_indexing_config()->set_term_match_type(
+ property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ property->mutable_string_indexing_config()->set_term_match_type(
TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ property->mutable_string_indexing_config()->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
}
}
@@ -132,10 +153,13 @@ DocumentProto CreateDocumentWithHiragana(int content_length) {
.Build();
}
-std::unique_ptr<Index> CreateIndex(const IcingFilesystem& filesystem,
+std::unique_ptr<Index> CreateIndex(const IcingFilesystem& icing_filesystem,
+ const Filesystem& filesystem,
const std::string& index_dir) {
- Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10);
- return Index::Create(options, &filesystem).ValueOrDie();
+ Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ return Index::Create(options, &filesystem, &icing_filesystem).ValueOrDie();
}
std::unique_ptr<Normalizer> CreateNormalizer() {
@@ -145,14 +169,20 @@ std::unique_ptr<Normalizer> CreateNormalizer() {
.ValueOrDie();
}
-std::unique_ptr<SchemaStore> CreateSchemaStore() {
- Filesystem filesystem;
+std::unique_ptr<SchemaStore> CreateSchemaStore(const Filesystem& filesystem,
+ const Clock* clock,
+ const std::string& base_dir) {
+ std::string schema_store_dir = base_dir + "/schema_store_test";
+ filesystem.CreateDirectoryRecursively(schema_store_dir.c_str());
+
std::unique_ptr<SchemaStore> schema_store =
- SchemaStore::Create(&filesystem, GetTestTempDir()).ValueOrDie();
+ SchemaStore::Create(&filesystem, schema_store_dir, clock).ValueOrDie();
SchemaProto schema;
CreateFakeTypeConfig(schema.add_types());
- auto set_schema_status = schema_store->SetSchema(schema);
+ auto set_schema_status = schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false);
if (!set_schema_status.ok()) {
ICING_LOG(ERROR) << set_schema_status.status().error_message();
@@ -161,22 +191,27 @@ std::unique_ptr<SchemaStore> CreateSchemaStore() {
return schema_store;
}
-void CleanUp(const IcingFilesystem& filesystem, const std::string& index_dir) {
- filesystem.DeleteDirectoryRecursively(index_dir.c_str());
+libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DataIndexingHandler>>>
+CreateDataIndexingHandlers(const Clock* clock, const Normalizer* normalizer,
+ Index* index, NumericIndex<int64_t>* integer_index) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<TermIndexingHandler> term_indexing_handler,
+ TermIndexingHandler::Create(
+ clock, normalizer, index,
+ /*build_property_existence_metadata_hits=*/true));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<IntegerSectionIndexingHandler>
+ integer_section_indexing_handler,
+ IntegerSectionIndexingHandler::Create(clock, integer_index));
+
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(term_indexing_handler));
+ handlers.push_back(std::move(integer_section_indexing_handler));
+ return handlers;
}
-std::unique_ptr<IndexProcessor> CreateIndexProcessor(
- const SchemaStore* schema_store,
- const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
- Index* index) {
- IndexProcessor::Options processor_options{};
- processor_options.max_tokens_per_document = 1024 * 1024 * 10;
- processor_options.token_limit_behavior =
- IndexProcessor::Options::TokenLimitBehavior::kReturnError;
-
- return IndexProcessor::Create(schema_store, language_segmenter, normalizer,
- index, processor_options)
- .ValueOrDie();
+void CleanUp(const Filesystem& filesystem, const std::string& base_dir) {
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
}
void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
@@ -186,29 +221,58 @@ void BM_IndexDocumentWithOneProperty(benchmark::State& state) {
GetTestFilePath("icing/icu.dat")));
}
- IcingFilesystem filesystem;
- std::string index_dir = GetTestTempDir() + "/index_test/";
-
- CleanUp(filesystem, index_dir);
-
- std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ IcingFilesystem icing_filesystem;
+ Filesystem filesystem;
+ std::string base_dir = GetTestTempDir() + "/index_processor_benchmark";
+ std::string index_dir = base_dir + "/index_test/";
+ std::string integer_index_dir = base_dir + "/integer_index_test/";
+
+ CleanUp(filesystem, base_dir);
+ ASSERT_THAT(filesystem.CreateDirectoryRecursively(base_dir.c_str()),
+ IsTrue());
+
+ std::unique_ptr<Index> index =
+ CreateIndex(icing_filesystem, filesystem, index_dir);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ IntegerIndex::Create(filesystem, integer_index_dir,
+ IntegerIndex::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv=*/true));
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
- std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
- std::unique_ptr<IndexProcessor> index_processor =
- CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
- normalizer.get(), index.get());
+ Clock clock;
+ std::unique_ptr<SchemaStore> schema_store =
+ CreateSchemaStore(filesystem, &clock, base_dir);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers,
+ CreateDataIndexingHandlers(&clock, normalizer.get(), index.get(),
+ integer_index.get()));
+ auto index_processor =
+ std::make_unique<IndexProcessor>(std::move(handlers), &clock);
DocumentProto input_document = CreateDocumentWithOneProperty(state.range(0));
+ TokenizedDocument tokenized_document(std::move(
+ TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+ input_document)
+ .ValueOrDie()));
DocumentId document_id = 0;
for (auto _ : state) {
ICING_ASSERT_OK(
- index_processor->IndexDocument(input_document, document_id++));
+ index_processor->IndexDocument(tokenized_document, document_id++));
}
- CleanUp(filesystem, index_dir);
+ index_processor.reset();
+ schema_store.reset();
+ normalizer.reset();
+ language_segmenter.reset();
+ integer_index.reset();
+ index.reset();
+
+ CleanUp(filesystem, base_dir);
}
BENCHMARK(BM_IndexDocumentWithOneProperty)
->Arg(1000)
@@ -233,30 +297,59 @@ void BM_IndexDocumentWithTenProperties(benchmark::State& state) {
GetTestFilePath("icing/icu.dat")));
}
- IcingFilesystem filesystem;
- std::string index_dir = GetTestTempDir() + "/index_test/";
-
- CleanUp(filesystem, index_dir);
-
- std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ IcingFilesystem icing_filesystem;
+ Filesystem filesystem;
+ std::string base_dir = GetTestTempDir() + "/index_processor_benchmark";
+ std::string index_dir = base_dir + "/index_test/";
+ std::string integer_index_dir = base_dir + "/integer_index_test/";
+
+ CleanUp(filesystem, base_dir);
+ ASSERT_THAT(filesystem.CreateDirectoryRecursively(base_dir.c_str()),
+ IsTrue());
+
+ std::unique_ptr<Index> index =
+ CreateIndex(icing_filesystem, filesystem, index_dir);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ IntegerIndex::Create(filesystem, integer_index_dir,
+ IntegerIndex::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv=*/true));
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
- std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
- std::unique_ptr<IndexProcessor> index_processor =
- CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
- normalizer.get(), index.get());
+ Clock clock;
+ std::unique_ptr<SchemaStore> schema_store =
+ CreateSchemaStore(filesystem, &clock, base_dir);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers,
+ CreateDataIndexingHandlers(&clock, normalizer.get(), index.get(),
+ integer_index.get()));
+ auto index_processor =
+ std::make_unique<IndexProcessor>(std::move(handlers), &clock);
DocumentProto input_document =
CreateDocumentWithTenProperties(state.range(0));
+ TokenizedDocument tokenized_document(std::move(
+ TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+ input_document)
+ .ValueOrDie()));
DocumentId document_id = 0;
for (auto _ : state) {
ICING_ASSERT_OK(
- index_processor->IndexDocument(input_document, document_id++));
+ index_processor->IndexDocument(tokenized_document, document_id++));
}
- CleanUp(filesystem, index_dir);
+ index_processor.reset();
+ schema_store.reset();
+ normalizer.reset();
+ language_segmenter.reset();
+ integer_index.reset();
+ index.reset();
+
+ CleanUp(filesystem, base_dir);
}
BENCHMARK(BM_IndexDocumentWithTenProperties)
->Arg(1000)
@@ -281,30 +374,59 @@ void BM_IndexDocumentWithDiacriticLetters(benchmark::State& state) {
GetTestFilePath("icing/icu.dat")));
}
- IcingFilesystem filesystem;
- std::string index_dir = GetTestTempDir() + "/index_test/";
-
- CleanUp(filesystem, index_dir);
-
- std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ IcingFilesystem icing_filesystem;
+ Filesystem filesystem;
+ std::string base_dir = GetTestTempDir() + "/index_processor_benchmark";
+ std::string index_dir = base_dir + "/index_test/";
+ std::string integer_index_dir = base_dir + "/integer_index_test/";
+
+ CleanUp(filesystem, base_dir);
+ ASSERT_THAT(filesystem.CreateDirectoryRecursively(base_dir.c_str()),
+ IsTrue());
+
+ std::unique_ptr<Index> index =
+ CreateIndex(icing_filesystem, filesystem, index_dir);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ IntegerIndex::Create(filesystem, integer_index_dir,
+ IntegerIndex::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv=*/true));
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
- std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
- std::unique_ptr<IndexProcessor> index_processor =
- CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
- normalizer.get(), index.get());
+ Clock clock;
+ std::unique_ptr<SchemaStore> schema_store =
+ CreateSchemaStore(filesystem, &clock, base_dir);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers,
+ CreateDataIndexingHandlers(&clock, normalizer.get(), index.get(),
+ integer_index.get()));
+ auto index_processor =
+ std::make_unique<IndexProcessor>(std::move(handlers), &clock);
DocumentProto input_document =
CreateDocumentWithDiacriticLetters(state.range(0));
+ TokenizedDocument tokenized_document(std::move(
+ TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+ input_document)
+ .ValueOrDie()));
DocumentId document_id = 0;
for (auto _ : state) {
ICING_ASSERT_OK(
- index_processor->IndexDocument(input_document, document_id++));
+ index_processor->IndexDocument(tokenized_document, document_id++));
}
- CleanUp(filesystem, index_dir);
+ index_processor.reset();
+ schema_store.reset();
+ normalizer.reset();
+ language_segmenter.reset();
+ integer_index.reset();
+ index.reset();
+
+ CleanUp(filesystem, base_dir);
}
BENCHMARK(BM_IndexDocumentWithDiacriticLetters)
->Arg(1000)
@@ -329,29 +451,58 @@ void BM_IndexDocumentWithHiragana(benchmark::State& state) {
GetTestFilePath("icing/icu.dat")));
}
- IcingFilesystem filesystem;
- std::string index_dir = GetTestTempDir() + "/index_test/";
-
- CleanUp(filesystem, index_dir);
-
- std::unique_ptr<Index> index = CreateIndex(filesystem, index_dir);
+ IcingFilesystem icing_filesystem;
+ Filesystem filesystem;
+ std::string base_dir = GetTestTempDir() + "/index_processor_benchmark";
+ std::string index_dir = base_dir + "/index_test/";
+ std::string integer_index_dir = base_dir + "/integer_index_test/";
+
+ CleanUp(filesystem, base_dir);
+ ASSERT_THAT(filesystem.CreateDirectoryRecursively(base_dir.c_str()),
+ IsTrue());
+
+ std::unique_ptr<Index> index =
+ CreateIndex(icing_filesystem, filesystem, index_dir);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ IntegerIndex::Create(filesystem, integer_index_dir,
+ IntegerIndex::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv=*/true));
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
- std::unique_ptr<SchemaStore> schema_store = CreateSchemaStore();
- std::unique_ptr<IndexProcessor> index_processor =
- CreateIndexProcessor(schema_store.get(), language_segmenter.get(),
- normalizer.get(), index.get());
+ Clock clock;
+ std::unique_ptr<SchemaStore> schema_store =
+ CreateSchemaStore(filesystem, &clock, base_dir);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers,
+ CreateDataIndexingHandlers(&clock, normalizer.get(), index.get(),
+ integer_index.get()));
+ auto index_processor =
+ std::make_unique<IndexProcessor>(std::move(handlers), &clock);
DocumentProto input_document = CreateDocumentWithHiragana(state.range(0));
+ TokenizedDocument tokenized_document(std::move(
+ TokenizedDocument::Create(schema_store.get(), language_segmenter.get(),
+ input_document)
+ .ValueOrDie()));
DocumentId document_id = 0;
for (auto _ : state) {
ICING_ASSERT_OK(
- index_processor->IndexDocument(input_document, document_id++));
+ index_processor->IndexDocument(tokenized_document, document_id++));
}
- CleanUp(filesystem, index_dir);
+ index_processor.reset();
+ schema_store.reset();
+ normalizer.reset();
+ language_segmenter.reset();
+ integer_index.reset();
+ index.reset();
+
+ CleanUp(filesystem, base_dir);
}
BENCHMARK(BM_IndexDocumentWithHiragana)
->Arg(1000)
diff --git a/icing/index/index-processor_test.cc b/icing/index/index-processor_test.cc
index 8dfb9c2..3d1be68 100644
--- a/icing/index/index-processor_test.cc
+++ b/icing/index/index-processor_test.cc
@@ -19,162 +19,341 @@
#include <memory>
#include <string>
#include <string_view>
+#include <unordered_map>
#include <utility>
#include <vector>
+#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/absl_ports/str_join.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/index/data-indexing-handler.h"
#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
#include "icing/index/index.h"
+#include "icing/index/integer-section-indexing-handler.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/integer-index.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/index/term-indexing-handler.h"
+#include "icing/index/term-property-id.h"
+#include "icing/join/qualified-id-join-index-impl-v1.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/join/qualified-id-join-indexing-handler.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/portable/platform.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
#include "icing/schema/schema-store.h"
-#include "icing/schema/schema-util.h"
-#include "icing/schema/section-manager.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/random-string.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "icing/util/crc32.h"
+#include "icing/util/tokenized-document.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
namespace {
-// type and property names of FakeType
+constexpr std::string_view kIpsumText =
+ "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Nulla convallis "
+ "scelerisque orci quis hendrerit. Sed augue turpis, sodales eu gravida "
+ "nec, scelerisque nec leo. Maecenas accumsan interdum commodo. Aliquam "
+ "mattis sapien est, sit amet interdum risus dapibus sed. Maecenas leo "
+ "erat, fringilla in nisl a, venenatis gravida metus. Phasellus venenatis, "
+ "orci in aliquet mattis, lectus sapien volutpat arcu, sed hendrerit ligula "
+ "arcu nec mauris. Integer dolor mi, rhoncus eget gravida et, pulvinar et "
+ "nunc. Aliquam ac sollicitudin nisi. Vivamus sit amet urna vestibulum, "
+ "tincidunt eros sed, efficitur nisl. Fusce non neque accumsan, sagittis "
+ "nisi eget, sagittis turpis. Ut pulvinar nibh eu purus feugiat faucibus. "
+ "Donec tellus nulla, tincidunt vel lacus id, bibendum fermentum turpis. "
+ "Nullam ultrices sed nibh vitae aliquet. Ut risus neque, consectetur "
+ "vehicula posuere vitae, convallis eu lorem. Donec semper augue eu nibh "
+ "placerat semper.";
+
+// Schema types.
constexpr std::string_view kFakeType = "FakeType";
+constexpr std::string_view kNestedType = "NestedType";
+
+// Indexable properties and section ids. A property's section id is determined
+// by the lexicographical order of the indexable property paths.
constexpr std::string_view kExactProperty = "exact";
+constexpr std::string_view kIndexableIntegerProperty = "indexableInteger";
constexpr std::string_view kPrefixedProperty = "prefixed";
+constexpr std::string_view kRepeatedProperty = "repeated";
+constexpr std::string_view kRfc822Property = "rfc822";
+constexpr std::string_view kSubProperty = "submessage"; // submessage.nested
+constexpr std::string_view kNestedProperty = "nested"; // submessage.nested
+// TODO(b/246964044): Remove the ifdef guard when the url-tokenizer is ready
+// for export to Android.
+#ifdef ENABLE_URL_TOKENIZER
+constexpr std::string_view kUrlExactProperty = "urlExact";
+constexpr std::string_view kUrlPrefixedProperty = "urlPrefixed";
+#endif // ENABLE_URL_TOKENIZER
+constexpr std::string_view kVerbatimExactProperty = "verbatimExact";
+constexpr std::string_view kVerbatimPrefixedProperty = "verbatimPrefixed";
+
+constexpr SectionId kExactSectionId = 0;
+constexpr SectionId kIndexableIntegerSectionId = 1;
+constexpr SectionId kPrefixedSectionId = 2;
+constexpr SectionId kRepeatedSectionId = 3;
+constexpr SectionId kRfc822SectionId = 4;
+constexpr SectionId kNestedSectionId = 5; // submessage.nested
+#ifdef ENABLE_URL_TOKENIZER
+constexpr SectionId kUrlExactSectionId = 6;
+constexpr SectionId kUrlPrefixedSectionId = 7;
+constexpr SectionId kVerbatimExactSectionId = 8;
+constexpr SectionId kVerbatimPrefixedSectionId = 9;
+#else // !ENABLE_URL_TOKENIZER
+constexpr SectionId kVerbatimExactSectionId = 6;
+constexpr SectionId kVerbatimPrefixedSectionId = 7;
+#endif // ENABLE_URL_TOKENIZER
+
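To make the lexicographic rule above concrete, this standalone snippet (illustrative, not part of the patch) sorts the indexable property paths of the test schema and recovers exactly the section ids listed for the !ENABLE_URL_TOKENIZER branch:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main() {
  // Indexable property paths from the test schema (URL properties omitted).
  std::vector<std::string> paths = {
      "exact",  "indexableInteger",  "prefixed",      "repeated",
      "rfc822", "submessage.nested", "verbatimExact", "verbatimPrefixed"};
  std::sort(paths.begin(), paths.end());
  // Prints exact=0, indexableInteger=1, prefixed=2, repeated=3, rfc822=4,
  // submessage.nested=5, verbatimExact=6, verbatimPrefixed=7, matching the
  // k*SectionId constants above.
  for (std::size_t i = 0; i < paths.size(); ++i) {
    std::cout << paths[i] << "=" << i << "\n";
  }
  return 0;
}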
+// Other non-indexable properties.
constexpr std::string_view kUnindexedProperty1 = "unindexed1";
constexpr std::string_view kUnindexedProperty2 = "unindexed2";
-constexpr std::string_view kSubProperty = "submessage";
-constexpr std::string_view kNestedProperty = "nested";
-constexpr std::string_view kRepeatedProperty = "repeated";
constexpr DocumentId kDocumentId0 = 0;
constexpr DocumentId kDocumentId1 = 1;
-constexpr SectionId kExactSectionId = 0;
-constexpr SectionId kPrefixedSectionId = 1;
-constexpr SectionId kRepeatedSectionId = 2;
-constexpr SectionId kNestedSectionId = 3;
-
using Cardinality = PropertyConfigProto::Cardinality;
using DataType = PropertyConfigProto::DataType;
using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::IsEmpty;
+using ::testing::IsTrue;
+using ::testing::SizeIs;
using ::testing::Test;
+#ifdef ENABLE_URL_TOKENIZER
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL =
+ StringIndexingConfig::TokenizerType::URL;
+#endif // ENABLE_URL_TOKENIZER
+
class IndexProcessorTest : public Test {
protected:
void SetUp() override {
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
-
- index_dir_ = GetTestTempDir() + "/index_test/";
- Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
- ICING_ASSERT_OK_AND_ASSIGN(index_,
- Index::Create(options, &icing_filesystem_));
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ base_dir_ = GetTestTempDir() + "/index_processor_test";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ index_dir_ = base_dir_ + "/index";
+ integer_index_dir_ = base_dir_ + "/integer_index";
+ qualified_id_join_index_dir_ = base_dir_ + "/qualified_id_join_index";
+ schema_store_dir_ = base_dir_ + "/schema_store";
+ doc_store_dir_ = base_dir_ + "/doc_store";
+
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_, Index::Create(options, &filesystem_, &icing_filesystem_));
- ICING_ASSERT_OK_AND_ASSIGN(lang_segmenter_,
- language_segmenter_factory::Create());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ integer_index_,
+ IntegerIndex::Create(
+ filesystem_, integer_index_dir_,
+ IntegerIndex::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv=*/false));
+
+ ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_,
+ QualifiedIdJoinIndexImplV1::Create(
+ filesystem_, qualified_id_join_index_dir_,
+ /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false));
+
+ language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
ICING_ASSERT_OK_AND_ASSIGN(
normalizer_,
normalizer_factory::Create(
-
/*max_term_byte_size=*/std::numeric_limits<int32_t>::max()));
+ ASSERT_TRUE(
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()));
ICING_ASSERT_OK_AND_ASSIGN(
- schema_store_, SchemaStore::Create(&filesystem_, GetTestTempDir()));
- SchemaProto schema;
- CreateFakeTypeConfig(schema.add_types());
- ICING_ASSERT_OK(schema_store_->SetSchema(schema));
-
- IndexProcessor::Options processor_options;
- processor_options.max_tokens_per_document = 1000;
- processor_options.token_limit_behavior =
- IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kFakeType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kExactProperty)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPrefixedProperty)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kUnindexedProperty1)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kUnindexedProperty2)
+ .SetDataType(TYPE_BYTES)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kRepeatedProperty)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kVerbatimExactProperty)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kVerbatimPrefixedProperty)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kRfc822Property)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_RFC822)
+ .SetCardinality(CARDINALITY_REPEATED))
+#ifdef ENABLE_URL_TOKENIZER
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kUrlExactProperty)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_URL)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kUrlPrefixedProperty)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_URL)
+ .SetCardinality(CARDINALITY_REPEATED))
+#endif // ENABLE_URL_TOKENIZER
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kIndexableIntegerProperty)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kSubProperty)
+ .SetDataTypeDocument(
+ kNestedType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kNestedType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kNestedProperty)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ doc_store_ = std::move(create_result.document_store);
ICING_ASSERT_OK_AND_ASSIGN(
- index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), index_.get(),
- processor_options));
+ std::unique_ptr<TermIndexingHandler> term_indexing_handler,
+ TermIndexingHandler::Create(
+ &fake_clock_, normalizer_.get(), index_.get(),
+ /*build_property_existence_metadata_hits=*/true));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler>
+ integer_section_indexing_handler,
+ IntegerSectionIndexingHandler::Create(
+ &fake_clock_, integer_index_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler>
+ qualified_id_join_indexing_handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(term_indexing_handler));
+ handlers.push_back(std::move(integer_section_indexing_handler));
+ handlers.push_back(std::move(qualified_id_join_indexing_handler));
+
+ index_processor_ =
+ std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
+
+ mock_icing_filesystem_ = std::make_unique<IcingMockFilesystem>();
}
void TearDown() override {
- filesystem_.DeleteDirectoryRecursively(index_dir_.c_str());
- }
-
- std::unique_ptr<IndexProcessor> index_processor_;
- std::unique_ptr<LanguageSegmenter> lang_segmenter_;
- std::unique_ptr<Normalizer> normalizer_;
- std::unique_ptr<Index> index_;
- std::unique_ptr<SchemaStore> schema_store_;
-
- private:
- static void AddProperty(std::string_view name, DataType::Code type,
- Cardinality::Code cardinality,
- TermMatchType::Code term_match_type,
- SchemaTypeConfigProto* type_config) {
- auto* prop = type_config->add_properties();
- prop->set_property_name(std::string(name));
- prop->set_data_type(type);
- prop->set_cardinality(cardinality);
- prop->mutable_indexing_config()->set_term_match_type(term_match_type);
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ index_processor_.reset();
+ doc_store_.reset();
+ schema_store_.reset();
+ normalizer_.reset();
+ lang_segmenter_.reset();
+ qualified_id_join_index_.reset();
+ integer_index_.reset();
+ index_.reset();
+
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
}
- static void CreateFakeTypeConfig(SchemaTypeConfigProto* type_config) {
- type_config->set_schema_type(std::string(kFakeType));
-
- AddProperty(std::string(kExactProperty), DataType::STRING,
- Cardinality::REQUIRED, TermMatchType::EXACT_ONLY, type_config);
-
- AddProperty(std::string(kPrefixedProperty), DataType::STRING,
- Cardinality::OPTIONAL, TermMatchType::PREFIX, type_config);
-
- // Don't set IndexingConfig
- auto* prop = type_config->add_properties();
- prop->set_property_name(std::string(kUnindexedProperty1));
- prop->set_data_type(DataType::STRING);
- prop->set_cardinality(Cardinality::OPTIONAL);
-
- AddProperty(std::string(kUnindexedProperty2), DataType::BYTES,
- Cardinality::OPTIONAL, TermMatchType::UNKNOWN, type_config);
-
- AddProperty(std::string(kRepeatedProperty), DataType::STRING,
- Cardinality::REPEATED, TermMatchType::PREFIX, type_config);
-
- AddProperty(kSubProperty, DataType::DOCUMENT, Cardinality::OPTIONAL,
- TermMatchType::UNKNOWN, type_config);
-
- std::string recipients_name =
- absl_ports::StrCat(kSubProperty, kPropertySeparator, kNestedProperty);
- AddProperty(recipients_name, DataType::STRING, Cardinality::OPTIONAL,
- TermMatchType::PREFIX, type_config);
- }
+ std::unique_ptr<IcingMockFilesystem> mock_icing_filesystem_;
Filesystem filesystem_;
IcingFilesystem icing_filesystem_;
+ FakeClock fake_clock_;
+ std::string base_dir_;
std::string index_dir_;
+ std::string integer_index_dir_;
+ std::string qualified_id_join_index_dir_;
+ std::string schema_store_dir_;
+ std::string doc_store_dir_;
+
+ std::unique_ptr<Index> index_;
+ std::unique_ptr<NumericIndex<int64_t>> integer_index_;
+ std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_;
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> doc_store_;
+
+ std::unique_ptr<IndexProcessor> index_processor_;
};
std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
@@ -185,31 +364,18 @@ std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
return infos;
}
-TEST_F(IndexProcessorTest, CreationWithNullPointerShouldFail) {
- IndexProcessor::Options processor_options;
- processor_options.max_tokens_per_document = 1000;
- processor_options.token_limit_behavior =
- IndexProcessor::Options::TokenLimitBehavior::kReturnError;
-
- EXPECT_THAT(IndexProcessor::Create(/*schema_store=*/nullptr,
- lang_segmenter_.get(), normalizer_.get(),
- index_.get(), processor_options),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-
- EXPECT_THAT(IndexProcessor::Create(
- schema_store_.get(), /*lang_segmenter=*/nullptr,
- normalizer_.get(), index_.get(), processor_options),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-
- EXPECT_THAT(IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- /*normalizer=*/nullptr, index_.get(),
- processor_options),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-
- EXPECT_THAT(IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), /*index=*/nullptr,
- processor_options),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+std::vector<DocHitInfoTermFrequencyPair> GetHitsWithTermFrequency(
+ std::unique_ptr<DocHitInfoIterator> iterator) {
+ std::vector<DocHitInfoTermFrequencyPair> infos;
+ while (iterator->Advance().ok()) {
+ std::vector<TermMatchInfo> matched_terms_stats;
+ iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ for (const TermMatchInfo& term_match_info : matched_terms_stats) {
+ infos.push_back(DocHitInfoTermFrequencyPair(
+ iterator->doc_hit_info(), term_match_info.term_frequencies));
+ }
+ }
+ return infos;
}
TEST_F(IndexProcessorTest, NoTermMatchTypeContent) {
@@ -221,8 +387,29 @@ TEST_F(IndexProcessorTest, NoTermMatchTypeContent) {
.AddBytesProperty(std::string(kUnindexedProperty2),
"attachment bytes")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
- EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+}
+
+TEST_F(IndexProcessorTest, NoValidContent) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "?...!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexProcessorTest, OneDoc) {
@@ -232,19 +419,30 @@ TEST_F(IndexProcessorTest, OneDoc) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), "hello world")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("hello", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
- EXPECT_THAT(GetHits(std::move(itr)),
- ElementsAre(EqualsDocHitInfo(
- kDocumentId0, std::vector<SectionId>{kExactSectionId})));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("hello", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{
+ {kExactSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expectedMap)));
ICING_ASSERT_OK_AND_ASSIGN(
- itr, index_->GetIterator("hello", 1U << kPrefixedSectionId,
- TermMatchType::EXACT_ONLY));
+ itr, index_->GetIterator(
+ "hello", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ 1U << kPrefixedSectionId, TermMatchType::EXACT_ONLY));
EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
}
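The second lookup above restricts the iterator to one section with a bitmask. A small sketch of the masking convention these tests rely on (the exact mask and id types are assumptions; only the `1U << section_id` pattern comes from the test itself):

#include <cstdint>

// Assumed shape: one bit per section id, with kSectionIdMaskAll covering all.
using SectionId = int8_t;
using SectionIdMask = int64_t;

constexpr SectionIdMask kSectionIdMaskAll = ~SectionIdMask{0};
constexpr SectionId kPrefixedSectionId = 2;

// Bit k of the mask selects section k; this mask matches only "prefixed".
constexpr SectionIdMask kOnlyPrefixed = SectionIdMask{1} << kPrefixedSectionId;

static_assert((kSectionIdMaskAll & kOnlyPrefixed) == kOnlyPrefixed,
              "the all-sections mask covers every single-section mask");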
@@ -256,35 +454,69 @@ TEST_F(IndexProcessorTest, MultipleDocs) {
.AddStringProperty(std::string(kExactProperty), "hello world")
.AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+ std::string coffeeRepeatedString = "coffee";
+ for (int i = 0; i < Hit::kMaxTermFrequency + 1; i++) {
+ coffeeRepeatedString += " coffee";
+ }
+
document =
DocumentBuilder()
.SetKey("icing", "fake_type/2")
.SetSchema(std::string(kFakeType))
- .AddStringProperty(std::string(kExactProperty), "pitbull")
- .AddStringProperty(std::string(kPrefixedProperty), "mr. world wide")
+ .AddStringProperty(std::string(kExactProperty), coffeeRepeatedString)
+ .AddStringProperty(std::string(kPrefixedProperty),
+ "mr. world world wide")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("world", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("world", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expectedMap1{
+ {kPrefixedSectionId, 2}};
+ std::unordered_map<SectionId, Hit::TermFrequency> expectedMap2{
+ {kExactSectionId, 1}};
EXPECT_THAT(
- GetHits(std::move(itr)),
- ElementsAre(EqualsDocHitInfo(kDocumentId1,
- std::vector<SectionId>{kPrefixedSectionId}),
- EqualsDocHitInfo(kDocumentId0,
- std::vector<SectionId>{kExactSectionId})));
+ hits, ElementsAre(
+ EqualsDocHitInfoWithTermFrequency(kDocumentId1, expectedMap1),
+ EqualsDocHitInfoWithTermFrequency(kDocumentId0, expectedMap2)));
ICING_ASSERT_OK_AND_ASSIGN(
- itr, index_->GetIterator("world", 1U << kPrefixedSectionId,
- TermMatchType::EXACT_ONLY));
- EXPECT_THAT(GetHits(std::move(itr)),
- ElementsAre(EqualsDocHitInfo(
- kDocumentId1, std::vector<SectionId>{kPrefixedSectionId})));
+ itr, index_->GetIterator(
+ "world", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ 1U << kPrefixedSectionId, TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{
+ {kPrefixedSectionId, 2}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId1, expectedMap)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("coffee", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ expectedMap = {{kExactSectionId, Hit::kMaxTermFrequency}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId1, expectedMap)));
}
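The "coffee" half of this test writes Hit::kMaxTermFrequency + 1 occurrences into one section and then expects the stored frequency to equal Hit::kMaxTermFrequency, i.e. per-section term frequencies saturate instead of overflowing. A tiny sketch of that saturation rule (the concrete type and ceiling are assumptions inferred from the test, not taken from the index code):

#include <algorithm>
#include <cstdint>

// Assumed: term frequencies live in a narrow unsigned type and clamp at a
// fixed ceiling; the test only guarantees the clamping behavior itself.
using TermFrequency = uint8_t;
constexpr TermFrequency kMaxTermFrequency = 255;

TermFrequency ClampTermFrequency(int occurrences_in_section) {
  return static_cast<TermFrequency>(
      std::min(occurrences_in_section, static_cast<int>(kMaxTermFrequency)));
}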
TEST_F(IndexProcessorTest, DocWithNestedProperty) {
@@ -296,16 +528,25 @@ TEST_F(IndexProcessorTest, DocWithNestedProperty) {
.AddDocumentProperty(
std::string(kSubProperty),
DocumentBuilder()
+ .SetKey("icing", "nested_type/1")
+ .SetSchema(std::string(kNestedType))
.AddStringProperty(std::string(kNestedProperty),
"rocky raccoon")
.Build())
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("rocky", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("rocky", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kNestedSectionId})));
@@ -320,108 +561,92 @@ TEST_F(IndexProcessorTest, DocWithRepeatedProperty) {
.AddStringProperty(std::string(kRepeatedProperty), "rocky",
"italian stallion")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("italian", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("italian", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kRepeatedSectionId})));
}
-TEST_F(IndexProcessorTest, TooManyTokensReturnError) {
- // Only allow the first four tokens ("hello", "world", "good", "night") to be
- // indexed.
- IndexProcessor::Options options;
- options.max_tokens_per_document = 4;
- options.token_limit_behavior =
- IndexProcessor::Options::TokenLimitBehavior::kReturnError;
+// TODO(b/196771754) This test is disabled on Android because it takes too long
+// to generate all of the unique terms and the test times out. Try storing these
+// unique terms in a file that the test can read from.
+#ifndef __ANDROID__
- ICING_ASSERT_OK_AND_ASSIGN(
- index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), index_.get(), options));
+TEST_F(IndexProcessorTest, HitBufferExhaustedTest) {
+ // Testing has shown that adding ~600,000 hits will fill up the hit buffer.
+  std::vector<std::string> unique_terms = GenerateUniqueTerms(200000);
+  std::string content = absl_ports::StrJoin(unique_terms, " ");
DocumentProto document =
DocumentBuilder()
.SetKey("icing", "fake_type/1")
.SetSchema(std::string(kFakeType))
- .AddStringProperty(std::string(kExactProperty), "hello world")
- .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
+ .AddStringProperty(std::string(kExactProperty), content)
+ .AddStringProperty(std::string(kPrefixedProperty), content)
+ .AddStringProperty(std::string(kRepeatedProperty), content)
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
- StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED,
+ testing::HasSubstr("Hit buffer is full!")));
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
-
- // "night" should have been indexed.
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("night", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
- EXPECT_THAT(GetHits(std::move(itr)),
- ElementsAre(EqualsDocHitInfo(
- kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
-
- // "moon" should not have been.
- ICING_ASSERT_OK_AND_ASSIGN(itr,
- index_->GetIterator("moon", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
- EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
}
-TEST_F(IndexProcessorTest, TooManyTokensSuppressError) {
- // Only allow the first four tokens ("hello", "world", "good", "night") to be
- // indexed.
- IndexProcessor::Options options;
- options.max_tokens_per_document = 4;
- options.token_limit_behavior =
- IndexProcessor::Options::TokenLimitBehavior::kSuppressError;
-
- ICING_ASSERT_OK_AND_ASSIGN(
- index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), index_.get(), options));
+TEST_F(IndexProcessorTest, LexiconExhaustedTest) {
+ // Testing has shown that adding ~300,000 terms generated this way will
+ // fill up the lexicon.
+  std::vector<std::string> unique_terms = GenerateUniqueTerms(300000);
+  std::string content = absl_ports::StrJoin(unique_terms, " ");
DocumentProto document =
DocumentBuilder()
.SetKey("icing", "fake_type/1")
.SetSchema(std::string(kFakeType))
- .AddStringProperty(std::string(kExactProperty), "hello world")
- .AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
+ .AddStringProperty(std::string(kExactProperty), content)
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
-
- // "night" should have been indexed.
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("night", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
- EXPECT_THAT(GetHits(std::move(itr)),
- ElementsAre(EqualsDocHitInfo(
- kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
-
- // "moon" should not have been.
- ICING_ASSERT_OK_AND_ASSIGN(itr,
- index_->GetIterator("moon", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
- EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
}
+#endif // __ANDROID__
+
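Both exhaustion tests above lean on GenerateUniqueTerms(), presumably from icing/testing/random-string.h (included at the top of this file). For illustration, a self-contained equivalent that mints n distinct lowercase terms; this sketches one possible scheme and is not the real helper:

#include <algorithm>
#include <string>
#include <vector>

// Bijective base-26 enumeration: a, b, ..., z, aa, ab, ... No two generated
// terms collide, which is all the exhaustion tests need.
std::vector<std::string> MakeUniqueTerms(int n) {
  std::vector<std::string> terms;
  terms.reserve(n);
  for (int i = 0; i < n; ++i) {
    std::string term;
    int v = i;
    while (true) {
      term.push_back(static_cast<char>('a' + v % 26));
      if (v < 26) break;
      v = v / 26 - 1;
    }
    std::reverse(term.begin(), term.end());
    terms.push_back(std::move(term));
  }
  return terms;
}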
TEST_F(IndexProcessorTest, TooLongTokens) {
  // Only allow tokens of length four, truncating "hello", "world", and
  // "night".
- IndexProcessor::Options options;
- options.max_tokens_per_document = 1000;
-
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
/*max_term_byte_size=*/4));
ICING_ASSERT_OK_AND_ASSIGN(
- index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer.get(), index_.get(), options));
+ std::unique_ptr<TermIndexingHandler> term_indexing_handler,
+ TermIndexingHandler::Create(
+ &fake_clock_, normalizer.get(), index_.get(),
+ /*build_property_existence_metadata_hits=*/true));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(term_indexing_handler));
+
+ index_processor_ =
+ std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
DocumentProto document =
DocumentBuilder()
@@ -430,27 +655,36 @@ TEST_F(IndexProcessorTest, TooLongTokens) {
.AddStringProperty(std::string(kExactProperty), "hello world")
.AddStringProperty(std::string(kPrefixedProperty), "good night moon!")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
// "good" should have been indexed normally.
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("good", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("good", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
// "night" should not have been.
- ICING_ASSERT_OK_AND_ASSIGN(itr,
- index_->GetIterator("night", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("night", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
// "night" should have been truncated to "nigh".
- ICING_ASSERT_OK_AND_ASSIGN(itr,
- index_->GetIterator("nigh", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("nigh", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
@@ -463,7 +697,12 @@ TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), "best rocky movies")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
document =
@@ -472,13 +711,20 @@ TEST_F(IndexProcessorTest, NonPrefixedContentPrefixQuery) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kPrefixedProperty), "rocky raccoon")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
  // Only document_id 1 should surface in a prefix query for "Rock".
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("rock", kSectionIdMaskAll, TermMatchType::PREFIX));
+ index_->GetIterator("rock", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId1, std::vector<SectionId>{kPrefixedSectionId})));
@@ -491,7 +737,12 @@ TEST_F(IndexProcessorTest, TokenNormalization) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
document =
@@ -500,12 +751,19 @@ TEST_F(IndexProcessorTest, TokenNormalization) {
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), "all lower case")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("case", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("case", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(
GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(kDocumentId1,
@@ -520,29 +778,138 @@ TEST_F(IndexProcessorTest, OutOfOrderDocumentIds) {
.SetKey("icing", "fake_type/1")
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
+ .AddInt64Property(std::string(kIndexableIntegerProperty), 123)
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId1), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
- // Indexing a document with document_id < last_added_document_id should cause
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t index_element_size,
+ index_->GetElementsSize());
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 integer_index_crc,
+ integer_index_->UpdateChecksums());
+
+ // Indexing a document with document_id <= last_added_document_id should cause
// a failure.
document =
DocumentBuilder()
.SetKey("icing", "fake_type/2")
.SetSchema(std::string(kFakeType))
.AddStringProperty(std::string(kExactProperty), "all lower case")
+ .AddInt64Property(std::string(kIndexableIntegerProperty), 456)
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ // Verify that both index_ and integer_index_ are unchanged.
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+ EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(index_element_size));
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1));
+ EXPECT_THAT(integer_index_->UpdateChecksums(),
+ IsOkAndHolds(integer_index_crc));
  // As should indexing a document with document_id == last_added_document_id.
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId1),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ // Verify that both index_ and integer_index_ are unchanged.
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+ EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(index_element_size));
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1));
+ EXPECT_THAT(integer_index_->UpdateChecksums(),
+ IsOkAndHolds(integer_index_crc));
+}
+TEST_F(IndexProcessorTest, OutOfOrderDocumentIdsInRecoveryMode) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIndexingHandler> term_indexing_handler,
+ TermIndexingHandler::Create(
+ &fake_clock_, normalizer_.get(), index_.get(),
+ /*build_property_existence_metadata_hits=*/true));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<IntegerSectionIndexingHandler>
+ integer_section_indexing_handler,
+ IntegerSectionIndexingHandler::Create(
+ &fake_clock_, integer_index_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler>
+ qualified_id_join_indexing_handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(term_indexing_handler));
+ handlers.push_back(std::move(integer_section_indexing_handler));
+ handlers.push_back(std::move(qualified_id_join_indexing_handler));
+
+ IndexProcessor index_processor(std::move(handlers), &fake_clock_,
+ /*recovery_mode=*/true);
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "ALL UPPER CASE")
+ .AddInt64Property(std::string(kIndexableIntegerProperty), 123)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor.IndexDocument(tokenized_document, kDocumentId1),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t index_element_size,
+ index_->GetElementsSize());
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 integer_index_crc,
+ integer_index_->UpdateChecksums());
+
+ // Indexing a document with document_id <= last_added_document_id in recovery
+  // mode should not return an error, but IndexProcessor should still ignore it
+ // and index data should remain unchanged.
+ document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/2")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), "all lower case")
+ .AddInt64Property(std::string(kIndexableIntegerProperty), 456)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor.IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ // Verify that both index_ and integer_index_ are unchanged.
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+ EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(index_element_size));
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1));
+ EXPECT_THAT(integer_index_->UpdateChecksums(),
+ IsOkAndHolds(integer_index_crc));
+
+  // As should indexing a document with document_id == last_added_document_id.
+ EXPECT_THAT(index_processor.IndexDocument(tokenized_document, kDocumentId1),
+ IsOk());
+ // Verify that both index_ and integer_index_ are unchanged.
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+ EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(index_element_size));
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kDocumentId1));
+ EXPECT_THAT(integer_index_->UpdateChecksums(),
+ IsOkAndHolds(integer_index_crc));
}
TEST_F(IndexProcessorTest, NonAsciiIndexing) {
+ language_segmenter_factory::SegmenterOptions segmenter_options(
+ ULOC_SIMPLIFIED_CHINESE);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
+
DocumentProto document =
DocumentBuilder()
.SetKey("icing", "fake_type/1")
@@ -550,12 +917,19 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) {
.AddStringProperty(std::string(kExactProperty),
"你好,世界!你好:世界。“你好”世界?")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("你好", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("你好", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kExactSectionId})));
@@ -563,24 +937,13 @@ TEST_F(IndexProcessorTest, NonAsciiIndexing) {
TEST_F(IndexProcessorTest,
LexiconFullIndexesSmallerTokensReturnsResourceExhausted) {
- IndexProcessor::Options processor_options;
- processor_options.max_tokens_per_document = 1000;
- processor_options.token_limit_behavior =
- IndexProcessor::Options::TokenLimitBehavior::kReturnError;
-
- ICING_ASSERT_OK_AND_ASSIGN(
- index_processor_,
- IndexProcessor::Create(schema_store_.get(), lang_segmenter_.get(),
- normalizer_.get(), index_.get(),
- processor_options));
-
// This is the maximum token length that an empty lexicon constructed for a
// lite index with merge size of 1MiB can support.
constexpr int kMaxTokenLength = 16777217;
// Create a string "ppppppp..." with a length that is too large to fit into
// the lexicon.
std::string enormous_string(kMaxTokenLength + 1, 'p');
- DocumentProto document =
+ DocumentProto document_one =
DocumentBuilder()
.SetKey("icing", "fake_type/1")
.SetSchema(std::string(kFakeType))
@@ -588,23 +951,656 @@ TEST_F(IndexProcessorTest,
absl_ports::StrCat(enormous_string, " foo"))
.AddStringProperty(std::string(kPrefixedProperty), "bar baz")
.Build();
- EXPECT_THAT(index_processor_->IndexDocument(document, kDocumentId0),
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document_one));
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+}
+
+TEST_F(IndexProcessorTest, IndexingDocAutomaticMerge) {
+ // Create the index with a smaller index_merge_size - merging every time we
+ // add 101 documents. This will result in a small LiteIndex, which will be
+ // easier to fill up. The LiteIndex itself will have a size larger than the
+ // index_merge_size because it adds extra buffer to ensure that it always has
+ // room to fit whatever document will trigger the merge.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kExactProperty), kIpsumText)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ Index::Options options(index_dir_,
+ /*index_merge_size=*/document.ByteSizeLong() * 100,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_, Index::Create(options, &filesystem_, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIndexingHandler> term_indexing_handler,
+ TermIndexingHandler::Create(
+ &fake_clock_, normalizer_.get(), index_.get(),
+ /*build_property_existence_metadata_hits=*/true));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(term_indexing_handler));
+
+ index_processor_ =
+ std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
+
+ DocumentId doc_id = 0;
+  // We determined experimentally that indexing 3373 documents with this text
+ // will cause the LiteIndex to fill up. Further indexing will fail unless the
+ // index processor properly merges the LiteIndex into the MainIndex and
+ // empties the LiteIndex.
+ constexpr int kNumDocsLiteIndexExhaustion = 3373;
+ for (; doc_id < kNumDocsLiteIndexExhaustion; ++doc_id) {
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
+ }
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
+}
+
+TEST_F(IndexProcessorTest, IndexingDocMergeFailureResets) {
+ // 1. Setup a mock filesystem to fail to grow the main index.
+ auto open_write_lambda = [this](const char* filename) {
+ std::string main_lexicon_suffix =
+ "/main-lexicon.prop." +
+ std::to_string(GetHasHitsInPrefixSectionPropertyId());
+ std::string filename_string(filename);
+ if (filename_string.length() >= main_lexicon_suffix.length() &&
+ filename_string.substr(
+ filename_string.length() - main_lexicon_suffix.length(),
+ main_lexicon_suffix.length()) == main_lexicon_suffix) {
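+      // Returning an invalid file descriptor simulates an I/O failure when
+      // the main lexicon's property file is opened for writing.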
+ return -1;
+ }
+ return this->filesystem_.OpenForWrite(filename);
+ };
+ ON_CALL(*mock_icing_filesystem_, OpenForWrite)
+ .WillByDefault(open_write_lambda);
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPrefixedProperty), kIpsumText)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ // 2. Recreate the index with the mock filesystem and a merge size that will
+ // only allow one document to be added before requiring a merge.
+ Index::Options options(index_dir_,
+ /*index_merge_size=*/document.ByteSizeLong(),
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/16);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_,
+ Index::Create(options, &filesystem_, mock_icing_filesystem_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIndexingHandler> term_indexing_handler,
+ TermIndexingHandler::Create(
+ &fake_clock_, normalizer_.get(), index_.get(),
+ /*build_property_existence_metadata_hits=*/true));
+ std::vector<std::unique_ptr<DataIndexingHandler>> handlers;
+ handlers.push_back(std::move(term_indexing_handler));
+
+ index_processor_ =
+ std::make_unique<IndexProcessor>(std::move(handlers), &fake_clock_);
+
+ // 3. Index one document. This should fit in the LiteIndex without requiring a
+ // merge.
+ DocumentId doc_id = 0;
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
+
+ // 4. Add one more document to trigger a merge, which should fail and result
+ // in a Reset.
+ ++doc_id;
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+ StatusIs(libtextclassifier3::StatusCode::DATA_LOSS));
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ // 5. Indexing a new document should succeed.
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, doc_id),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(doc_id));
+}
+
+TEST_F(IndexProcessorTest, ExactVerbatimProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kVerbatimExactProperty),
+ "Hello, world!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(1));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
- EXPECT_THAT(GetHits(std::move(itr)),
- ElementsAre(EqualsDocHitInfo(
- kDocumentId0, std::vector<SectionId>{kExactSectionId})));
+ index_->GetIterator("Hello, world!", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{
+ {kVerbatimExactSectionId, 1}};
+
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expectedMap)));
+}
+
+TEST_F(IndexProcessorTest, PrefixVerbatimProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kVerbatimPrefixedProperty),
+ "Hello, world!")
+ .Build();
ICING_ASSERT_OK_AND_ASSIGN(
- itr,
- index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
- EXPECT_THAT(GetHits(std::move(itr)),
- ElementsAre(EqualsDocHitInfo(
- kDocumentId0, std::vector<SectionId>{kPrefixedSectionId})));
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(1));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ // We expect to match the document we indexed as "Hello, w" is a prefix
+ // of "Hello, world!"
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("Hello, w", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expectedMap{
+ {kVerbatimPrefixedSectionId, 1}};
+
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expectedMap)));
+}
+
+TEST_F(IndexProcessorTest, VerbatimPropertyDoesntMatchSubToken) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kVerbatimPrefixedProperty),
+ "Hello, world!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(1));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("world", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ std::vector<DocHitInfo> hits = GetHits(std::move(itr));
+
+ // We should not have hits for term "world" as the index processor should
+  // create a sole token "Hello, world!" for the document.
+ EXPECT_THAT(hits, IsEmpty());
+}
+
+// Some phrases that should exactly match RFC822 tokens. We normalize the
+// tokens, so the case of the string property shouldn't matter.
+TEST_F(IndexProcessorTest, Rfc822PropertyExact) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kRfc822Property),
+ "<AlexSav@GOOGLE.com>")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
+ {kRfc822SectionId, 2}};
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("alexsav", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ expected_map = {{kRfc822SectionId, 1}};
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("com", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("alexsav@google.com", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+}
+
+TEST_F(IndexProcessorTest, Rfc822PropertyExactShouldNotReturnPrefix) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kRfc822Property),
+ "<AlexSav@GOOGLE.com>")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
+ {kRfc822SectionId, 2}};
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("alexsa", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfo> hits = GetHits(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+}
+
+// Some prefixes of generated RFC822 tokens.
+#ifdef ENABLE_RFC822_PROPERTY_PREFIX_TEST
+// ENABLE_RFC822_PROPERTY_PREFIX_TEST won't be defined, so this test will not be
+// compiled.
+// TODO(b/250648165): Remove #ifdef to enable this test after fixing the
+//                    nondeterministic behavior of prefix query term frequency in
+// lite index.
+//
+TEST_F(IndexProcessorTest, Rfc822PropertyPrefix) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kRfc822Property),
+ "<alexsav@google.com>")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
+ {kRfc822SectionId, 1}};
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("alexsav@", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("goog", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("ale", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+}
+#endif // ENABLE_RFC822_PROPERTY_PREFIX_TEST
+
+TEST_F(IndexProcessorTest, Rfc822PropertyNoMatch) {
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kRfc822Property),
+ "<alexsav@google.com>")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("abc.xyz", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ std::vector<DocHitInfo> hits = GetHits(std::move(itr));
+
+ EXPECT_THAT(hits, IsEmpty());
+}
+
+#ifdef ENABLE_URL_TOKENIZER
+TEST_F(IndexProcessorTest, ExactUrlProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kUrlExactProperty),
+ "http://www.google.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("google", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
+ {kUrlExactSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("http", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ expected_map = {{kUrlExactSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("www.google.com", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ expected_map = {{kUrlExactSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("http://www.google.com", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ expected_map = {{kUrlExactSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+}
+
+TEST_F(IndexProcessorTest, ExactUrlPropertyDoesNotMatchPrefix) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kUrlExactProperty),
+ "https://mail.google.com/calendar/render")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(8));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("co", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("mail.go", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("mail.google.com", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+}
+
+TEST_F(IndexProcessorTest, PrefixUrlProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kUrlPrefixedProperty),
+ "http://www.google.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(7));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+ // "goo" is a prefix of "google" and "google.com"
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("goo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
+ {kUrlPrefixedSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ // "http" is a prefix of "http" and "http://www.google.com"
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("http", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ expected_map = {{kUrlPrefixedSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+
+ // "www.go" is a prefix of "www.google.com"
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("www.go", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ expected_map = {{kUrlPrefixedSectionId, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ kDocumentId0, expected_map)));
+}
+
+TEST_F(IndexProcessorTest, PrefixUrlPropertyNoMatch) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kUrlPrefixedProperty),
+ "https://mail.google.com/calendar/render")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(8));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+ EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+
+  // No token starts with "gle", so we should have no hits.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("gle", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("w.goo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+
+  // Tokens have separators removed, so no hits here.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator(".com", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("calendar/render", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ hits = GetHitsWithTermFrequency(std::move(itr));
+ EXPECT_THAT(hits, IsEmpty());
+}
+#endif // ENABLE_URL_TOKENIZER
+
+TEST_F(IndexProcessorTest, IndexableIntegerProperty) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddInt64Property(std::string(kIndexableIntegerProperty), 1, 2, 3, 4,
+ 5)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ // Expected to have 1 integer section.
+ EXPECT_THAT(tokenized_document.integer_sections(), SizeIs(1));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/1,
+ /*key_upper=*/5, *doc_store_, *schema_store_,
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kIndexableIntegerSectionId})));
+}
+
+TEST_F(IndexProcessorTest, IndexableIntegerPropertyNoMatch) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddInt64Property(std::string(kIndexableIntegerProperty), 1, 2, 3, 4,
+ 5)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ // Expected to have 1 integer section.
+ EXPECT_THAT(tokenized_document.integer_sections(), SizeIs(1));
+
+ EXPECT_THAT(index_processor_->IndexDocument(tokenized_document, kDocumentId0),
+ IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ integer_index_->GetIterator(kIndexableIntegerProperty, /*key_lower=*/-1,
+ /*key_upper=*/0, *doc_store_, *schema_store_,
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
}
} // namespace
diff --git a/icing/index/index.cc b/icing/index/index.cc
index 927acaf..98058be 100644
--- a/icing/index/index.cc
+++ b/icing/index/index.cc
@@ -14,26 +14,38 @@
#include "icing/index/index.h"
+#include <algorithm>
+#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <utility>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
#include "icing/index/hit/hit.h"
-#include "icing/index/iterator/doc-hit-info-iterator-term.h"
+#include "icing/index/iterator/doc-hit-info-iterator-or.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
-#include "icing/index/lite-index.h"
+#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
+#include "icing/index/lite/lite-index.h"
+#include "icing/index/main/doc-hit-info-iterator-term-main.h"
+#include "icing/index/main/main-index.h"
#include "icing/index/term-id-codec.h"
-#include "icing/index/term-property-id.h"
+#include "icing/index/term-metadata.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/index/icing-dynamic-trie.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
+#include "icing/scoring/ranker.h"
+#include "icing/store/document-id.h"
+#include "icing/store/suggestion-result-checker.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
@@ -53,35 +65,92 @@ libtextclassifier3::StatusOr<LiteIndex::Options> CreateLiteIndexOptions(
"Requested hit buffer size %d is too large.",
options.index_merge_size));
}
- return LiteIndex::Options(options.base_dir + "/idx/lite.",
- options.index_merge_size);
+ return LiteIndex::Options(
+ options.base_dir + "/idx/lite.", options.index_merge_size,
+ options.lite_index_sort_at_indexing, options.lite_index_sort_size,
+ options.include_property_existence_metadata_hits);
+}
+
+std::string MakeMainIndexFilepath(const std::string& base_dir) {
+ return base_dir + "/idx/main";
}
-// TODO(tjbarron) implement for real when the main index is added.
IcingDynamicTrie::Options GetMainLexiconOptions() {
+  // The default values for IcingDynamicTrie::Options are fine for the main
+ // lexicon.
return IcingDynamicTrie::Options();
}
-// Helper function to check if a term is in the given namespaces.
-// TODO(samzheng): Implement a method PropertyReadersAll.HasAnyProperty().
-bool IsTermInNamespaces(
- const IcingDynamicTrie::PropertyReadersAll& property_reader,
- uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) {
- for (NamespaceId namespace_id : namespace_ids) {
- if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id),
- value_index)) {
- return true;
+enum class MergeAction { kTakeLiteTerm, kTakeMainTerm, kMergeTerms };
+
+// Merges the TermMetadata from the lite index and the main index. If a term
+// exists in both indices, its hit counts are summed before being pushed onto
+// the term heap. The heap is a min-heap, which lets us skip some pushes; the
+// time complexity is O(N lg K), where N is the total number of terms and K is
+// num_to_return.
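+// Example (assuming both input lists are sorted by term content, as the merge
+// below requires): lite = {("foo", 2)}, main = {("bar", 1), ("foo", 3)} with
+// num_to_return = 2 yields {("foo", 5), ("bar", 1)}.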
+std::vector<TermMetadata> MergeAndRankTermMetadatas(
+ std::vector<TermMetadata> lite_term_metadata_list,
+ std::vector<TermMetadata> main_term_metadata_list, int num_to_return) {
+ std::vector<TermMetadata> merged_term_metadata_heap;
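+  // The heap never holds more than num_to_return entries, so cap the
+  // reservation at that size.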
+ merged_term_metadata_heap.reserve(
+ std::min(lite_term_metadata_list.size() + main_term_metadata_list.size(),
+ static_cast<size_t>(num_to_return)));
+
+ auto lite_term_itr = lite_term_metadata_list.begin();
+ auto main_term_itr = main_term_metadata_list.begin();
+ MergeAction merge_action;
+ while (lite_term_itr != lite_term_metadata_list.end() ||
+ main_term_itr != main_term_metadata_list.end()) {
+    // Look at the next metadata in each list, if available, to decide how to
+    // merge.
+ if (main_term_itr == main_term_metadata_list.end()) {
+ merge_action = MergeAction::kTakeLiteTerm;
+ } else if (lite_term_itr == lite_term_metadata_list.end()) {
+ merge_action = MergeAction::kTakeMainTerm;
+ } else if (lite_term_itr->content < main_term_itr->content) {
+ merge_action = MergeAction::kTakeLiteTerm;
+ } else if (main_term_itr->content < lite_term_itr->content) {
+ merge_action = MergeAction::kTakeMainTerm;
+ } else {
+ // The next metadatas refer to the same term. Combine them.
+ merge_action = MergeAction::kMergeTerms;
+ }
+ switch (merge_action) {
+ case MergeAction::kTakeLiteTerm:
+ PushToTermHeap(std::move(*lite_term_itr), num_to_return,
+ merged_term_metadata_heap);
+ ++lite_term_itr;
+ break;
+ case MergeAction::kTakeMainTerm:
+ PushToTermHeap(std::move(*main_term_itr), num_to_return,
+ merged_term_metadata_heap);
+ ++main_term_itr;
+ break;
+ case MergeAction::kMergeTerms:
+ int total_est_hit_count = lite_term_itr->score + main_term_itr->score;
+ PushToTermHeap(TermMetadata(std::move(lite_term_itr->content),
+ total_est_hit_count),
+ num_to_return, merged_term_metadata_heap);
+ ++lite_term_itr;
+ ++main_term_itr;
+ break;
}
}
-
- return false;
+  // Reverse the list since terms are popped from the min-heap in increasing
+  // order but must be returned in decreasing order.
+ std::vector<TermMetadata> merged_term_metadata_list =
+ PopAllTermsFromHeap(merged_term_metadata_heap);
+ std::reverse(merged_term_metadata_list.begin(),
+ merged_term_metadata_list.end());
+ return merged_term_metadata_list;
}
} // namespace
libtextclassifier3::StatusOr<std::unique_ptr<Index>> Index::Create(
- const Options& options, const IcingFilesystem* filesystem) {
+ const Options& options, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem) {
ICING_RETURN_ERROR_IF_NULL(filesystem);
+ ICING_RETURN_ERROR_IF_NULL(icing_filesystem);
ICING_ASSIGN_OR_RETURN(LiteIndex::Options lite_index_options,
CreateLiteIndexOptions(options));
@@ -91,81 +160,174 @@ libtextclassifier3::StatusOr<std::unique_ptr<Index>> Index::Create(
IcingDynamicTrie::max_value_index(GetMainLexiconOptions()),
IcingDynamicTrie::max_value_index(
lite_index_options.lexicon_options)));
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<LiteIndex> lite_index,
- LiteIndex::Create(lite_index_options, filesystem));
- return std::unique_ptr<Index>(
- new Index(options, std::move(term_id_codec), std::move(lite_index)));
+
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<LiteIndex> lite_index,
+ LiteIndex::Create(lite_index_options, icing_filesystem));
+ // Sort the lite index if we've enabled sorting the HitBuffer at indexing
+ // time, and there's an unsorted tail exceeding the threshold.
+ if (options.lite_index_sort_at_indexing &&
+ lite_index->HasUnsortedHitsExceedingSortThreshold()) {
+ lite_index->SortHits();
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(MakeMainIndexFilepath(options.base_dir), filesystem,
+ icing_filesystem));
+ return std::unique_ptr<Index>(new Index(options, std::move(term_id_codec),
+ std::move(lite_index),
+ std::move(main_index), filesystem));
+}
+
+/* static */ libtextclassifier3::StatusOr<int> Index::ReadFlashIndexMagic(
+ const Filesystem* filesystem, const std::string& base_dir) {
+ return MainIndex::ReadFlashIndexMagic(filesystem,
+ MakeMainIndexFilepath(base_dir));
+}
+
+libtextclassifier3::Status Index::TruncateTo(DocumentId document_id) {
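+  // Each sub-index can only be truncated by resetting it entirely, so reset
+  // whichever one contains hits for document ids beyond document_id.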
+ if (lite_index_->last_added_document_id() != kInvalidDocumentId &&
+ lite_index_->last_added_document_id() > document_id) {
+ ICING_VLOG(1) << "Clipping to " << document_id
+ << ". Throwing out lite index which is at "
+ << lite_index_->last_added_document_id();
+ ICING_RETURN_IF_ERROR(lite_index_->Reset());
+ }
+ if (main_index_->last_added_document_id() != kInvalidDocumentId &&
+ main_index_->last_added_document_id() > document_id) {
+ ICING_VLOG(1) << "Clipping to " << document_id
+ << ". Throwing out lite index which is at "
+ << main_index_->last_added_document_id();
+ ICING_RETURN_IF_ERROR(main_index_->Reset());
+ }
+ return libtextclassifier3::Status::OK;
}
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
-Index::GetIterator(const std::string& term, SectionIdMask section_id_mask,
- TermMatchType::Code term_match_type) {
+Index::GetIterator(const std::string& term, int term_start_index,
+ int unnormalized_term_length, SectionIdMask section_id_mask,
+ TermMatchType::Code term_match_type,
+ bool need_hit_term_frequency) {
+ std::unique_ptr<DocHitInfoIterator> lite_itr;
+ std::unique_ptr<DocHitInfoIterator> main_itr;
switch (term_match_type) {
case TermMatchType::EXACT_ONLY:
- return std::make_unique<DocHitInfoIteratorTermExact>(
- term_id_codec_.get(), lite_index_.get(), term, section_id_mask);
+ lite_itr = std::make_unique<DocHitInfoIteratorTermLiteExact>(
+ term_id_codec_.get(), lite_index_.get(), term, term_start_index,
+ unnormalized_term_length, section_id_mask, need_hit_term_frequency);
+ main_itr = std::make_unique<DocHitInfoIteratorTermMainExact>(
+ main_index_.get(), term, term_start_index, unnormalized_term_length,
+ section_id_mask, need_hit_term_frequency);
+ break;
case TermMatchType::PREFIX:
- return std::make_unique<DocHitInfoIteratorTermPrefix>(
- term_id_codec_.get(), lite_index_.get(), term, section_id_mask);
+ lite_itr = std::make_unique<DocHitInfoIteratorTermLitePrefix>(
+ term_id_codec_.get(), lite_index_.get(), term, term_start_index,
+ unnormalized_term_length, section_id_mask, need_hit_term_frequency);
+ main_itr = std::make_unique<DocHitInfoIteratorTermMainPrefix>(
+ main_index_.get(), term, term_start_index, unnormalized_term_length,
+ section_id_mask, need_hit_term_frequency);
+ break;
default:
return absl_ports::InvalidArgumentError(
absl_ports::StrCat("Invalid TermMatchType: ",
TermMatchType::Code_Name(term_match_type)));
}
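+  // A term may have hits in both the lite and the main index, so return an OR
+  // over both iterators.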
+ return std::make_unique<DocHitInfoIteratorOr>(std::move(lite_itr),
+ std::move(main_itr));
}
libtextclassifier3::StatusOr<std::vector<TermMetadata>>
-Index::FindTermsByPrefix(const std::string& prefix,
- const std::vector<NamespaceId>& namespace_ids,
- int num_to_return) {
- std::vector<TermMetadata> term_metadata_list;
- if (num_to_return <= 0) {
- return term_metadata_list;
- }
-
+Index::FindLiteTermsByPrefix(
+ const std::string& prefix,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
+ const SuggestionResultChecker* suggestion_result_checker) {
// Finds all the terms that start with the given prefix in the lexicon.
IcingDynamicTrie::Iterator term_iterator(lite_index_->lexicon(),
prefix.c_str());
- // A property reader to help check if a term has some property.
- IcingDynamicTrie::PropertyReadersAll property_reader(lite_index_->lexicon());
-
- while (term_iterator.IsValid() && term_metadata_list.size() < num_to_return) {
+ std::vector<TermMetadata> term_metadata_list;
+ while (term_iterator.IsValid()) {
uint32_t term_value_index = term_iterator.GetValueIndex();
- // Skips the terms that don't exist in the given namespaces. We won't skip
- // any terms if namespace_ids is empty.
- if (!namespace_ids.empty() &&
- !IsTermInNamespaces(property_reader, term_value_index, namespace_ids)) {
- term_iterator.Advance();
- continue;
- }
-
ICING_ASSIGN_OR_RETURN(
uint32_t term_id,
term_id_codec_->EncodeTvi(term_value_index, TviType::LITE),
absl_ports::InternalError("Failed to access terms in lexicon."));
-
- term_metadata_list.emplace_back(term_iterator.GetKey(),
- lite_index_->CountHits(term_id));
+ ICING_ASSIGN_OR_RETURN(
+ int hit_score,
+ lite_index_->ScoreHits(term_id, score_by, suggestion_result_checker));
+ if (hit_score > 0) {
+      // At least one document in the given namespaces has this term.
+ term_metadata_list.push_back(
+ TermMetadata(term_iterator.GetKey(), hit_score));
+ }
term_iterator.Advance();
}
-
return term_metadata_list;
}
-libtextclassifier3::Status Index::Editor::AddHit(const char* term,
- Hit::Score score) {
+libtextclassifier3::StatusOr<std::vector<TermMetadata>>
+Index::FindTermsByPrefix(
+ const std::string& prefix, int num_to_return,
+ TermMatchType::Code scoring_match_type,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,
+ const SuggestionResultChecker* suggestion_result_checker) {
+ std::vector<TermMetadata> term_metadata_list;
+ if (num_to_return <= 0) {
+ return term_metadata_list;
+ }
+ // Get results from the LiteIndex.
+  // TODO(b/250648165): Support scoring terms by prefix_hit in lite_index.
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<TermMetadata> lite_term_metadata_list,
+ FindLiteTermsByPrefix(prefix, rank_by, suggestion_result_checker));
+ // Append results from the MainIndex.
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<TermMetadata> main_term_metadata_list,
+ main_index_->FindTermsByPrefix(prefix, scoring_match_type, rank_by,
+ suggestion_result_checker));
+ return MergeAndRankTermMetadatas(std::move(lite_term_metadata_list),
+ std::move(main_term_metadata_list),
+ num_to_return);
+}
+
+IndexStorageInfoProto Index::GetStorageInfo() const {
+ IndexStorageInfoProto storage_info;
+ int64_t directory_size = filesystem_->GetDiskUsage(options_.base_dir.c_str());
+ storage_info.set_index_size(Filesystem::SanitizeFileSize(directory_size));
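+  // Thread the proto through both sub-indexes so that each fills in its own
+  // fields.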
+ storage_info = lite_index_->GetStorageInfo(std::move(storage_info));
+ return main_index_->GetStorageInfo(std::move(storage_info));
+}
+
+libtextclassifier3::Status Index::Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ DocumentId new_last_added_document_id) {
+ if (main_index_->last_added_document_id() != kInvalidDocumentId) {
+ ICING_RETURN_IF_ERROR(main_index_->Optimize(document_id_old_to_new));
+ }
+ return lite_index_->Optimize(document_id_old_to_new, term_id_codec_.get(),
+ new_last_added_document_id);
+}
+
+libtextclassifier3::Status Index::Editor::BufferTerm(const char* term) {
// Step 1: See if this term is already in the lexicon
uint32_t tvi;
- auto tvi_or = lite_index_->FindTerm(term);
+ auto tvi_or = lite_index_->GetTermId(term);
// Step 2: Update the lexicon, either add the term or update its properties
if (tvi_or.ok()) {
+ tvi = tvi_or.ValueOrDie();
+ if (seen_tokens_.find(tvi) != seen_tokens_.end()) {
+ ICING_VLOG(1) << "Updating term frequency for term " << term;
+ if (seen_tokens_[tvi] != Hit::kMaxTermFrequency) {
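+        // Saturate at kMaxTermFrequency instead of overflowing.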
+ ++seen_tokens_[tvi];
+ }
+ return libtextclassifier3::Status::OK;
+ }
ICING_VLOG(1) << "Term " << term
<< " is already present in lexicon. Updating.";
- tvi = tvi_or.ValueOrDie();
// Already in the lexicon. Just update the properties.
ICING_RETURN_IF_ERROR(lite_index_->UpdateTermProperties(
tvi, term_match_type_ == TermMatchType::PREFIX, namespace_id_));
@@ -175,13 +337,20 @@ libtextclassifier3::Status Index::Editor::AddHit(const char* term,
ICING_ASSIGN_OR_RETURN(
tvi, lite_index_->InsertTerm(term, term_match_type_, namespace_id_));
}
+ // Token seen for the first time in the current document.
+ seen_tokens_[tvi] = 1;
+ return libtextclassifier3::Status::OK;
+}
- // Step 3: Add the hit itself
- Hit hit(section_id_, document_id_, score,
- term_match_type_ == TermMatchType::PREFIX);
- ICING_ASSIGN_OR_RETURN(uint32_t term_id,
- term_id_codec_->EncodeTvi(tvi, TviType::LITE));
- return lite_index_->AddHit(term_id, hit);
+libtextclassifier3::Status Index::Editor::IndexAllBufferedTerms() {
+ for (auto itr = seen_tokens_.begin(); itr != seen_tokens_.end(); itr++) {
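+    // Each buffered term becomes one hit whose term frequency is the number
+    // of times the term was buffered for this document's section.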
+ Hit hit(section_id_, document_id_, /*term_frequency=*/itr->second,
+ term_match_type_ == TermMatchType::PREFIX);
+ ICING_ASSIGN_OR_RETURN(
+ uint32_t term_id, term_id_codec_->EncodeTvi(itr->first, TviType::LITE));
+ ICING_RETURN_IF_ERROR(lite_index_->AddHit(term_id, hit));
+ }
+ return libtextclassifier3::Status::OK;
}
} // namespace lib
diff --git a/icing/index/index.h b/icing/index/index.h
index f30c8ad..a5d75c4 100644
--- a/icing/index/index.h
+++ b/icing/index/index.h
@@ -18,22 +18,31 @@
#include <cstdint>
#include <memory>
#include <string>
-#include <unordered_set>
+#include <unordered_map>
#include <utility>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/filesystem.h"
#include "icing/index/hit/hit.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
-#include "icing/index/lite-index.h"
+#include "icing/index/lite/lite-index.h"
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/main/main-index-merger.h"
+#include "icing/index/main/main-index.h"
#include "icing/index/term-id-codec.h"
#include "icing/index/term-metadata.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/store/namespace-id.h"
-#include "icing/util/crc32.h"
+#include "icing/store/suggestion-result-checker.h"
+#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
@@ -61,11 +70,22 @@ namespace lib {
class Index {
public:
struct Options {
- explicit Options(const std::string& base_dir, uint32_t index_merge_size)
- : base_dir(base_dir), index_merge_size(index_merge_size) {}
+ explicit Options(const std::string& base_dir, uint32_t index_merge_size,
+ bool lite_index_sort_at_indexing,
+ uint32_t lite_index_sort_size,
+ bool include_property_existence_metadata_hits = false)
+ : base_dir(base_dir),
+ index_merge_size(index_merge_size),
+ lite_index_sort_at_indexing(lite_index_sort_at_indexing),
+ lite_index_sort_size(lite_index_sort_size),
+ include_property_existence_metadata_hits(
+ include_property_existence_metadata_hits) {}
std::string base_dir;
int32_t index_merge_size;
+ bool lite_index_sort_at_indexing;
+ int32_t lite_index_sort_size;
+ bool include_property_existence_metadata_hits;
};
// Creates an instance of Index in the directory pointed by file_dir.
@@ -76,15 +96,32 @@ class Index {
// INVALID_ARGUMENT if options have invalid values
// INTERNAL on I/O error
static libtextclassifier3::StatusOr<std::unique_ptr<Index>> Create(
- const Options& options, const IcingFilesystem* filesystem);
+ const Options& options, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem);
+
+ // Reads magic from existing flash (main) index file header. We need this
+ // during Icing initialization phase to determine the version.
+ //
+ // Returns
+ // Valid magic on success
+  //   NOT_FOUND if the flash index doesn't exist
+ // INTERNAL on I/O error
+ static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic(
+ const Filesystem* filesystem, const std::string& base_dir);
// Clears all files created by the index. Returns OK if all files were
// cleared.
- libtextclassifier3::Status Reset() { return lite_index_->Reset(); }
+ libtextclassifier3::Status Reset() {
+ ICING_RETURN_IF_ERROR(lite_index_->Reset());
+ return main_index_->Reset();
+ }
// Brings components of the index into memory in anticipation of a query in
// order to reduce latency.
- void Warm() { lite_index_->Warm(); }
+ void Warm() {
+ lite_index_->Warm();
+ main_index_->Warm();
+ }
// Syncs all the data and metadata changes to disk.
//
@@ -92,25 +129,53 @@ class Index {
// OK on success
// INTERNAL on I/O errors
libtextclassifier3::Status PersistToDisk() {
- return lite_index_->PersistToDisk();
+ ICING_RETURN_IF_ERROR(lite_index_->PersistToDisk());
+ return main_index_->PersistToDisk();
}
- // Compute the checksum over the entire Index's subcomponents.
- Crc32 ComputeChecksum() { return lite_index_->ComputeChecksum(); }
+ // Discard parts of the index if they contain data for document ids greater
+ // than document_id.
+ //
+ // NOTE: This means that TruncateTo(kInvalidDocumentId) will have no effect.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL on I/O errors
+ libtextclassifier3::Status TruncateTo(DocumentId document_id);
// DocumentIds are always inserted in increasing order. Returns the largest
// document_id added to the index.
DocumentId last_added_document_id() const {
- return lite_index_->last_added_document_id();
+ DocumentId lite_document_id = lite_index_->last_added_document_id();
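+    // New hits always land in the lite index first, so its value is the most
+    // recent; fall back to the main index only when the lite index is empty.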
+ if (lite_document_id != kInvalidDocumentId) {
+ return lite_document_id;
+ }
+ return main_index_->last_added_document_id();
+ }
+
+  // Sets last_added_document_id to document_id so long as document_id >=
+  // last_added_document_id().
+ void set_last_added_document_id(DocumentId document_id) {
+ DocumentId lite_document_id = lite_index_->last_added_document_id();
+ if (lite_document_id == kInvalidDocumentId ||
+ document_id >= lite_document_id) {
+ lite_index_->set_last_added_document_id(document_id);
+ }
}
// Returns debug information for the index in out.
- // verbosity <= 0, simplest debug information - just the lexicons and lite
- // index.
- // verbosity > 0, more detailed debug information including raw postings
- // lists.
- void GetDebugInfo(int verbosity, std::string* out) const {
- lite_index_->GetDebugInfo(verbosity, out);
+ // verbosity = BASIC, simplest debug information - just the lexicons and lite
+ // index.
+ // verbosity = DETAILED, more detailed debug information including raw
+ // postings lists.
+ IndexDebugInfoProto GetDebugInfo(DebugInfoVerbosity::Code verbosity) const {
+ IndexDebugInfoProto debug_info;
+ *debug_info.mutable_index_storage_info() = GetStorageInfo();
+ *debug_info.mutable_lite_index_info() =
+ lite_index_->GetDebugInfo(verbosity);
+ *debug_info.mutable_main_index_info() =
+ main_index_->GetDebugInfo(verbosity);
+ return debug_info;
}
  // Returns the byte size of all the elements held in the index. This
@@ -121,33 +186,47 @@ class Index {
// Byte size on success
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<int64_t> GetElementsSize() const {
- return lite_index_->GetElementsSize();
+ ICING_ASSIGN_OR_RETURN(int64_t lite_index_size,
+ lite_index_->GetElementsSize());
+ ICING_ASSIGN_OR_RETURN(int64_t main_index_size,
+ main_index_->GetElementsSize());
+ return lite_index_size + main_index_size;
}
+ // Calculates the StorageInfo for the Index.
+ //
+ // If an IO error occurs while trying to calculate the value for a field, then
+ // that field will be set to -1.
+ IndexStorageInfoProto GetStorageInfo() const;
+
// Create an iterator to iterate through all doc hit infos in the index that
- // match the term. section_id_mask can be set to ignore hits from sections not
- // listed in the mask. Eg. section_id_mask = 1U << 3; would only return hits
- // that occur in section 3.
+ // match the term. term_start_index is the start index of the given term in
+ // the search query. unnormalized_term_length is the length of the given
+  // unnormalized term in the search query. section_id_mask can be set to
+  // ignore hits from sections not listed in the mask.
+ // Eg. section_id_mask = 1U << 3; would only return hits that occur in
+ // section 3.
//
// Returns:
// unique ptr to a valid DocHitInfoIterator that matches the term
// INVALID_ARGUMENT if given an invalid term_match_type
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
- const std::string& term, SectionIdMask section_id_mask,
- TermMatchType::Code term_match_type);
+ const std::string& term, int term_start_index,
+ int unnormalized_term_length, SectionIdMask section_id_mask,
+ TermMatchType::Code term_match_type, bool need_hit_term_frequency = true);
// Finds terms with the given prefix in the given namespaces. If
- // 'namespace_ids' is empty, returns results from all the namespaces. The
- // input prefix must be normalized, otherwise inaccurate results may be
- // returned. Results are not sorted specifically and are in their original
- // order. Number of results are no more than 'num_to_return'.
+ // 'namespace_ids' is empty, returns results from all the namespaces. Results
+  // are sorted in decreasing order of hit count. The number of results is no
+  // more than 'num_to_return'.
//
// Returns:
// A list of TermMetadata on success
// INTERNAL_ERROR if failed to access term data.
libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
- const std::string& prefix, const std::vector<NamespaceId>& namespace_ids,
- int num_to_return);
+ const std::string& prefix, int num_to_return,
+ TermMatchType::Code scoring_match_type,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,
+ const SuggestionResultChecker* suggestion_result_checker);
// A class that can be used to add hits to the index.
//
@@ -170,14 +249,16 @@ class Index {
namespace_id_(namespace_id),
section_id_(section_id) {}
- libtextclassifier3::Status AddHit(const char* term,
- Hit::Score score = Hit::kMaxHitScore);
+ // Buffer the term in seen_tokens_.
+ libtextclassifier3::Status BufferTerm(const char* term);
+ // Index all the terms stored in seen_tokens_.
+ libtextclassifier3::Status IndexAllBufferedTerms();
private:
// The Editor is able to store previously seen terms as TermIds. This is
    // more efficient than a client doing this externally because TermIds are
// not exposed to clients.
- std::unordered_set<uint32_t> seen_tokens_;
+ std::unordered_map<uint32_t, Hit::TermFrequency> seen_tokens_;
const TermIdCodec* term_id_codec_;
LiteIndex* lite_index_;
DocumentId document_id_;
@@ -191,16 +272,71 @@ class Index {
section_id, term_match_type, namespace_id);
}
+ bool WantsMerge() const { return lite_index_->WantsMerge(); }
+
+ // Merges newly-added hits in the LiteIndex into the MainIndex.
+ //
+ // RETURNS:
+ // - INTERNAL on IO error while writing to the MainIndex.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index.
+ libtextclassifier3::Status Merge() {
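+    // Merge the lite lexicon into the main lexicon, translate the lite hits
+    // into main term ids, add them to the main index, and finally clear the
+    // lite index.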
+ ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs outputs,
+ main_index_->MergeLexicon(lite_index_->lexicon()));
+ ICING_ASSIGN_OR_RETURN(std::vector<TermIdHitPair> term_id_hit_pairs,
+ MainIndexMerger::TranslateAndExpandLiteHits(
+ *lite_index_, *term_id_codec_, outputs));
+ ICING_RETURN_IF_ERROR(main_index_->AddHits(
+ *term_id_codec_, std::move(outputs.backfill_map),
+ std::move(term_id_hit_pairs), lite_index_->last_added_document_id()));
+ ICING_RETURN_IF_ERROR(main_index_->PersistToDisk());
+ return lite_index_->Reset();
+ }
+
+ // Whether the LiteIndex HitBuffer requires sorting. This is only true if
+ // Icing has enabled sorting during indexing time, and the HitBuffer's
+ // unsorted tail has exceeded the lite_index_sort_size.
+ bool LiteIndexNeedSort() const {
+ return options_.lite_index_sort_at_indexing &&
+ lite_index_->HasUnsortedHitsExceedingSortThreshold();
+ }
+
+ // Sorts the LiteIndex HitBuffer.
+  void SortLiteIndex() { lite_index_->SortHits(); }
+
+ // Reduces internal file sizes by reclaiming space of deleted documents.
+ // new_last_added_document_id will be used to update the last added document
+ // id in the lite index.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error, this indicates that the index may be in an
+ // invalid state and should be cleared.
+ libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ DocumentId new_last_added_document_id);
+
private:
Index(const Options& options, std::unique_ptr<TermIdCodec> term_id_codec,
- std::unique_ptr<LiteIndex>&& lite_index)
+ std::unique_ptr<LiteIndex> lite_index,
+ std::unique_ptr<MainIndex> main_index, const Filesystem* filesystem)
: lite_index_(std::move(lite_index)),
+ main_index_(std::move(main_index)),
options_(options),
- term_id_codec_(std::move(term_id_codec)) {}
+ term_id_codec_(std::move(term_id_codec)),
+ filesystem_(filesystem) {}
+
+ libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindLiteTermsByPrefix(
+ const std::string& prefix,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code rank_by,
+ const SuggestionResultChecker* suggestion_result_checker);
std::unique_ptr<LiteIndex> lite_index_;
+ std::unique_ptr<MainIndex> main_index_;
const Options options_;
std::unique_ptr<TermIdCodec> term_id_codec_;
+ const Filesystem* filesystem_;
};
} // namespace lib
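
Taken together, the header changes above replace the old single-shot
AddHit() flow with a buffer-then-flush Editor plus explicit maintenance
hooks. A minimal usage sketch, assuming an already-created Index instance
(identifiers other than the Index API itself are illustrative):

  Index::Editor edit = index->Edit(document_id, section_id,
                                   TermMatchType::EXACT_ONLY,
                                   /*namespace_id=*/0);
  ICING_RETURN_IF_ERROR(edit.BufferTerm("foo"));
  ICING_RETURN_IF_ERROR(edit.BufferTerm("bar"));
  // Hits become visible to queries only after the buffered terms are flushed.
  ICING_RETURN_IF_ERROR(edit.IndexAllBufferedTerms());

  // Periodic maintenance: move newly-added lite-index hits into the main
  // index once the lite index asks for it.
  if (index->WantsMerge()) {
    ICING_RETURN_IF_ERROR(index->Merge());
  }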
diff --git a/icing/index/index_test.cc b/icing/index/index_test.cc
index 070e82a..04a6bb7 100644
--- a/icing/index/index_test.cc
+++ b/icing/index/index_test.cc
@@ -14,12 +14,16 @@
#include "icing/index/index.h"
+#include <unistd.h>
+
+#include <algorithm>
#include <cstdint>
#include <limits>
#include <memory>
#include <random>
#include <string>
#include <string_view>
+#include <unordered_map>
#include <utility>
#include <vector>
@@ -31,65 +35,101 @@
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/storage.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
+#include "icing/testing/always-true-suggestion-result-checker-impl.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/random-string.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
namespace icing {
namespace lib {
namespace {
+using ::testing::ContainerEq;
using ::testing::ElementsAre;
using ::testing::Eq;
+using ::testing::Ge;
using ::testing::Gt;
using ::testing::IsEmpty;
+using ::testing::IsFalse;
using ::testing::IsTrue;
+using ::testing::Ne;
using ::testing::NiceMock;
using ::testing::Not;
+using ::testing::Return;
using ::testing::SizeIs;
+using ::testing::StrEq;
+using ::testing::StrNe;
using ::testing::Test;
using ::testing::UnorderedElementsAre;
+int GetBlockSize() { return getpagesize(); }
+
class IndexTest : public Test {
protected:
void SetUp() override {
index_dir_ = GetTestTempDir() + "/index_test/";
- Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
- ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &filesystem_));
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_, Index::Create(options, &filesystem_, &icing_filesystem_));
}
void TearDown() override {
- filesystem_.DeleteDirectoryRecursively(index_dir_.c_str());
+ index_.reset();
+ icing_filesystem_.DeleteDirectoryRecursively(index_dir_.c_str());
}
- std::unique_ptr<Index> index_;
+ std::vector<DocHitInfo> GetHits(
+ std::unique_ptr<DocHitInfoIterator> iterator) {
+ std::vector<DocHitInfo> infos;
+ while (iterator->Advance().ok()) {
+ infos.push_back(iterator->doc_hit_info());
+ }
+ return infos;
+ }
+
+ libtextclassifier3::StatusOr<std::vector<DocHitInfo>> GetHits(
+ std::string term, int term_start_index, int unnormalized_term_length,
+ TermMatchType::Code match_type) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator(term, term_start_index, unnormalized_term_length,
+ kSectionIdMaskAll, match_type));
+ return GetHits(std::move(itr));
+ }
+
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
std::string index_dir_;
- IcingFilesystem filesystem_;
+ std::unique_ptr<Index> index_;
};
constexpr DocumentId kDocumentId0 = 0;
constexpr DocumentId kDocumentId1 = 1;
constexpr DocumentId kDocumentId2 = 2;
+constexpr DocumentId kDocumentId3 = 3;
+constexpr DocumentId kDocumentId4 = 4;
+constexpr DocumentId kDocumentId5 = 5;
+constexpr DocumentId kDocumentId6 = 6;
+constexpr DocumentId kDocumentId7 = 7;
+constexpr DocumentId kDocumentId8 = 8;
constexpr SectionId kSectionId2 = 2;
constexpr SectionId kSectionId3 = 3;
-std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
- std::vector<DocHitInfo> infos;
- while (iterator->Advance().ok()) {
- infos.push_back(iterator->doc_hit_info());
- }
- return infos;
-}
-
MATCHER_P2(EqualsDocHitInfo, document_id, sections, "") {
const DocHitInfo& actual = arg;
SectionIdMask section_mask = kSectionIdMaskNone;
for (SectionId section : sections) {
- section_mask |= 1U << section;
+ section_mask |= UINT64_C(1) << section;
}
*result_listener << "actual is {document_id=" << actual.document_id()
<< ", section_mask=" << actual.hit_section_ids_mask()
@@ -102,52 +142,111 @@ MATCHER_P2(EqualsDocHitInfo, document_id, sections, "") {
MATCHER_P2(EqualsTermMetadata, content, hit_count, "") {
const TermMetadata& actual = arg;
*result_listener << "actual is {content=" << actual.content
- << ", hit_count=" << actual.hit_count
+ << ", score=" << actual.score
<< "}, but expected was {content=" << content
- << ", hit_count=" << hit_count << "}.";
- return actual.content == content && actual.hit_count == hit_count;
+ << ", score=" << hit_count << "}.";
+ return actual.content == content && actual.score == hit_count;
}
TEST_F(IndexTest, CreationWithNullPointerShouldFail) {
- Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
- EXPECT_THAT(Index::Create(options, /*filesystem=*/nullptr),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ EXPECT_THAT(
+ Index::Create(options, &filesystem_, /*icing_filesystem=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ Index::Create(options, /*filesystem=*/nullptr, &icing_filesystem_),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
TEST_F(IndexTest, EmptyIndex) {
- // Assert
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(itr->Advance(),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
ICING_ASSERT_OK_AND_ASSIGN(
- itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ itr, index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
EXPECT_THAT(itr->Advance(),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST_F(IndexTest, EmptyIndexAfterMerge) {
+  // Merging an empty index should succeed but have no effect.
+ ICING_ASSERT_OK(index_->Merge());
- EXPECT_THAT(index_->last_added_document_id(), Eq(kInvalidDocumentId));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(itr->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(itr->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST_F(IndexTest, CreationWithLiteIndexSortAtIndexingEnabledShouldSort) {
+ // Make the index with lite_index_sort_at_indexing=false and a very small sort
+ // threshold.
+ Index::Options options(index_dir_, /*index_merge_size=*/1024,
+ /*lite_index_sort_at_indexing=*/false,
+ /*lite_index_sort_size=*/16);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_, Index::Create(options, &filesystem_, &icing_filesystem_));
+
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("bar"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("baz"), IsOk());
+ ASSERT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ // Persist and recreate the index with lite_index_sort_at_indexing=true
+ ASSERT_THAT(index_->PersistToDisk(), IsOk());
+ options = Index::Options(index_dir_, /*index_merge_size=*/1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/16);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_, Index::Create(options, &filesystem_, &icing_filesystem_));
+
+  // Even though the unsorted HitBuffer exceeded the sort threshold before
+  // recreation, recreating the index with lite_index_sort_at_indexing=true
+  // should have sorted it at initialization, so no sort is pending now.
+ EXPECT_THAT(index_->LiteIndexNeedSort(), IsFalse());
}
TEST_F(IndexTest, AdvancePastEnd) {
- // Act
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("bar", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ index_->GetIterator("bar", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(itr->Advance(),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
EXPECT_THAT(itr->doc_hit_info(),
EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>()));
ICING_ASSERT_OK_AND_ASSIGN(
- itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ itr, index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
EXPECT_THAT(itr->Advance(), IsOk());
EXPECT_THAT(itr->Advance(),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
@@ -155,250 +254,969 @@ TEST_F(IndexTest, AdvancePastEnd) {
EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>()));
}
+TEST_F(IndexTest, AdvancePastEndAfterMerge) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("bar", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(itr->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(itr->doc_hit_info(),
+ EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(itr->Advance(), IsOk());
+ EXPECT_THAT(itr->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(itr->doc_hit_info(),
+ EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>()));
+}
+
+TEST_F(IndexTest, IteratorGetCallStats_mainIndexOnly) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ // Merge the index.
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+
+ // Before Advance().
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0));
+
+ // 1st Advance().
+ ICING_ASSERT_OK(itr->Advance());
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/1,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1));
+
+ // 2nd Advance().
+ ICING_ASSERT_OK(itr->Advance());
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/2,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1));
+
+ // 3rd Advance().
+ ASSERT_THAT(itr->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/2,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1));
+}
+
+TEST_F(IndexTest, IteratorGetCallStats_liteIndexOnly) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+
+ // Before Advance().
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0));
+
+ // 1st Advance().
+ ICING_ASSERT_OK(itr->Advance());
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/1,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0));
+
+ // 2nd Advance().
+ ICING_ASSERT_OK(itr->Advance());
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/2,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0));
+
+ // 3rd Advance().
+ ASSERT_THAT(itr->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/2,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0));
+}
+
+TEST_F(IndexTest, IteratorGetCallStats) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ // Merge the index. 2 hits for "foo" will be merged into the main index.
+ ICING_ASSERT_OK(index_->Merge());
+
+  // Insert 2 more hits for "foo". They will go into the lite index.
+ edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+
+ // Before Advance().
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0));
+
+  // 1st Advance(). DocHitInfoIteratorOr advances both the left and right
+  // iterators (i.e. the lite and main index iterators) once, compares
+  // document ids, and returns the hit with the larger document id. In this
+  // case, the hit from the lite index is chosen and returned.
+ ICING_ASSERT_OK(itr->Advance());
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/1,
+ /*num_leaf_advance_calls_main_index=*/1,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1));
+
+  // 2nd Advance(). Since the lite index iterator had the larger document id
+  // in the previous round, we advance it in this round. The hit from the
+  // lite index is still chosen and returned.
+ ICING_ASSERT_OK(itr->Advance());
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/2,
+ /*num_leaf_advance_calls_main_index=*/1,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1));
+
+  // 3rd Advance(). Since the lite index iterator had the larger document id
+  // in the previous round, we advance it again. The lite index has no more
+  // hits, so the hit from the main index is chosen and returned.
+ ICING_ASSERT_OK(itr->Advance());
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/2,
+ /*num_leaf_advance_calls_main_index=*/1,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1));
+
+ // 4th Advance(). Advance main index.
+ ICING_ASSERT_OK(itr->Advance());
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/2,
+ /*num_leaf_advance_calls_main_index=*/2,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1));
+
+ // 5th Advance(). Reach the end.
+ ASSERT_THAT(itr->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(
+ itr->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/2,
+ /*num_leaf_advance_calls_main_index=*/2,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1));
+}
+
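+// A sketch of how the per-storage call stats above might be consumed outside
+// of tests (field names mirror the matcher arguments; the decision rule is
+// illustrative, not an API contract):
+//   while (itr->Advance().ok()) { /* consume hits */ }
+//   auto stats = itr->GetCallStats();
+//   if (stats.num_leaf_advance_calls_lite_index >
+//       stats.num_leaf_advance_calls_main_index) {
+//     // Most work hit the lite index; Merge() may speed up later queries.
+//   }
+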
TEST_F(IndexTest, SingleHitSingleTermIndex) {
- // Act
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- // Assert
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, SingleHitSingleTermIndexAfterMerge) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, SingleHitSingleTermIndexAfterOptimize) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId2);
+
+ ICING_ASSERT_OK(index_->Optimize(/*document_id_old_to_new=*/{0, 1, 2},
+ /*new_last_added_document_id=*/2));
+ EXPECT_THAT(
+ GetHits("foo", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ kDocumentId2, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId2);
+
+ // Mapping to a different docid will translate the hit
+ ICING_ASSERT_OK(index_->Optimize(
+ /*document_id_old_to_new=*/{0, kInvalidDocumentId, kDocumentId1},
+ /*new_last_added_document_id=*/1));
+ EXPECT_THAT(
+ GetHits("foo", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ kDocumentId1, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId1);
+
+ // Mapping to kInvalidDocumentId will remove the hit.
+ ICING_ASSERT_OK(
+ index_->Optimize(/*document_id_old_to_new=*/{0, kInvalidDocumentId},
+ /*new_last_added_document_id=*/0));
+ EXPECT_THAT(
+ GetHits("foo", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId0);
+}
+
+TEST_F(IndexTest, SingleHitSingleTermIndexAfterMergeAndOptimize) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId2);
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK(index_->Optimize(/*document_id_old_to_new=*/{0, 1, 2},
+ /*new_last_added_document_id=*/2));
+ EXPECT_THAT(
+ GetHits("foo", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ kDocumentId2, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId2);
+
+ // Mapping to a different docid will translate the hit
+ ICING_ASSERT_OK(index_->Optimize(
+ /*document_id_old_to_new=*/{0, kInvalidDocumentId, kDocumentId1},
+ /*new_last_added_document_id=*/1));
+ EXPECT_THAT(
+ GetHits("foo", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ kDocumentId1, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId1);
+
+ // Mapping to kInvalidDocumentId will remove the hit.
+ ICING_ASSERT_OK(
+ index_->Optimize(/*document_id_old_to_new=*/{0, kInvalidDocumentId},
+ /*new_last_added_document_id=*/0));
+ EXPECT_THAT(
+ GetHits("foo", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+  EXPECT_EQ(index_->last_added_document_id(), kDocumentId0);
}
TEST_F(IndexTest, SingleHitMultiTermIndex) {
- // Act
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
- EXPECT_THAT(edit.AddHit("bar"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- // Assert
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+TEST_F(IndexTest, SingleHitMultiTermIndexAfterMerge) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, MultiHitMultiTermIndexAfterOptimize) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId2);
+
+ ICING_ASSERT_OK(index_->Optimize(/*document_id_old_to_new=*/{0, 1, 2},
+ /*new_last_added_document_id=*/2));
+ EXPECT_THAT(
+ GetHits("foo", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0,
+ std::vector<SectionId>{kSectionId2}))));
+ EXPECT_THAT(
+ GetHits("bar", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ kDocumentId1, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId2);
+
+ // Delete document id 1, and document id 2 is translated to 1.
+ ICING_ASSERT_OK(
+ index_->Optimize(/*document_id_old_to_new=*/{0, kInvalidDocumentId, 1},
+ /*new_last_added_document_id=*/1));
+ EXPECT_THAT(
+ GetHits("foo", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0,
+ std::vector<SectionId>{kSectionId2}))));
+ EXPECT_THAT(
+ GetHits("bar", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId1);
+
+ // Delete all the rest documents.
+ ICING_ASSERT_OK(index_->Optimize(
+ /*document_id_old_to_new=*/{kInvalidDocumentId, kInvalidDocumentId},
+ /*new_last_added_document_id=*/kInvalidDocumentId));
+ EXPECT_THAT(
+ GetHits("foo", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(
+ GetHits("bar", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_EQ(index_->last_added_document_id(), kInvalidDocumentId);
+}
+
+TEST_F(IndexTest, MultiHitMultiTermIndexAfterMergeAndOptimize) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId2);
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK(index_->Optimize(/*document_id_old_to_new=*/{0, 1, 2},
+ /*new_last_added_document_id=*/2));
+ EXPECT_THAT(
+ GetHits("foo", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0,
+ std::vector<SectionId>{kSectionId2}))));
+ EXPECT_THAT(
+ GetHits("bar", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ kDocumentId1, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId2);
+
+ // Delete document id 1, and document id 2 is translated to 1.
+ ICING_ASSERT_OK(
+ index_->Optimize(/*document_id_old_to_new=*/{0, kInvalidDocumentId, 1},
+ /*new_last_added_document_id=*/1));
+ EXPECT_THAT(
+ GetHits("foo", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0,
+ std::vector<SectionId>{kSectionId2}))));
+ EXPECT_THAT(
+ GetHits("bar", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_EQ(index_->last_added_document_id(), kDocumentId1);
+
+ // Delete all the rest documents.
+ ICING_ASSERT_OK(index_->Optimize(
+ /*document_id_old_to_new=*/{kInvalidDocumentId, kInvalidDocumentId},
+ /*new_last_added_document_id=*/kInvalidDocumentId));
+ EXPECT_THAT(
+ GetHits("foo", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(
+ GetHits("bar", /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_EQ(index_->last_added_document_id(), kInvalidDocumentId);
}
TEST_F(IndexTest, NoHitMultiTermIndex) {
- // Act
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
- EXPECT_THAT(edit.AddHit("bar"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- // Assert
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("baz", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ index_->GetIterator("baz", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(itr->Advance(),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST_F(IndexTest, NoHitMultiTermIndexAfterMerge) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("baz", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(itr->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
}
TEST_F(IndexTest, MultiHitMultiTermIndex) {
- // Act
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("bar"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- // Assert
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(
+ EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, MultiHitMultiTermIndexAfterMerge) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(
GetHits(std::move(itr)),
ElementsAre(
EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId2));
}
TEST_F(IndexTest, MultiHitSectionRestrict) {
- // Act
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- // Assert
SectionIdMask desired_section = 1U << kSectionId2;
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", desired_section, TermMatchType::EXACT_ONLY));
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, desired_section,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, MultiHitSectionRestrictAfterMerge) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ SectionIdMask desired_section = 1U << kSectionId2;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, desired_section,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
}
TEST_F(IndexTest, SingleHitDedupeIndex) {
- // Act
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t size, index_->GetElementsSize());
+ EXPECT_THAT(size, Eq(0));
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(size, index_->GetElementsSize());
+ EXPECT_THAT(size, Gt(0));
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t new_size, index_->GetElementsSize());
+ EXPECT_THAT(new_size, Eq(size));
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- // Assert
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, PrefixHit) {
- // Act
Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
TermMatchType::PREFIX, /*namespace_id=*/0);
- ASSERT_THAT(edit.AddHit("fool"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- // Assert
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::PREFIX));
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
+TEST_F(IndexTest, PrefixHitAfterMerge) {
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
}
TEST_F(IndexTest, MultiPrefixHit) {
- // Act
Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
TermMatchType::PREFIX, /*namespace_id=*/0);
- ASSERT_THAT(edit.AddHit("fool"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- ASSERT_THAT(edit.AddHit("foo"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- // Assert
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::PREFIX));
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
EXPECT_THAT(
GetHits(std::move(itr)),
ElementsAre(
EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, MultiPrefixHitAfterMerge) {
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(
+ EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
}
TEST_F(IndexTest, NoExactHitInPrefixQuery) {
- // Act
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- ASSERT_THAT(edit.AddHit("fool"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
/*namespace_id=*/0);
- ASSERT_THAT(edit.AddHit("foo"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- // Assert
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::PREFIX));
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId1, std::vector<SectionId>{kSectionId3})));
+}
+
+TEST_F(IndexTest, NoExactHitInPrefixQueryAfterMerge) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId1, std::vector<SectionId>{kSectionId3})));
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId1));
}
TEST_F(IndexTest, PrefixHitDedupe) {
- // Act
Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
TermMatchType::PREFIX, /*namespace_id=*/0);
- ASSERT_THAT(edit.AddHit("foo"), IsOk());
- ASSERT_THAT(edit.AddHit("fool"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- // Assert
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::PREFIX));
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, PrefixHitDedupeAfterMerge) {
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
TEST_F(IndexTest, PrefixToString) {
SectionIdMask id_mask = (1U << kSectionId2) | (1U << kSectionId3);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", id_mask, TermMatchType::PREFIX));
- EXPECT_THAT(itr->ToString(), Eq("0000000000001100:foo*"));
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, id_mask,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(itr->ToString(), Eq("(0000000000000000000000000000000000000000000"
+ "000000000000000001100:foo* OR "
+ "00000000000000000000000000000000000000000000"
+ "00000000000000001100:foo*)"));
- ICING_ASSERT_OK_AND_ASSIGN(itr, index_->GetIterator("foo", kSectionIdMaskAll,
- TermMatchType::PREFIX));
- EXPECT_THAT(itr->ToString(), Eq("1111111111111111:foo*"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ EXPECT_THAT(itr->ToString(), Eq("(1111111111111111111111111111111111111111111"
+ "111111111111111111111:foo* OR "
+ "11111111111111111111111111111111111111111111"
+ "11111111111111111111:foo*)"));
- ICING_ASSERT_OK_AND_ASSIGN(itr, index_->GetIterator("foo", kSectionIdMaskNone,
- TermMatchType::PREFIX));
- EXPECT_THAT(itr->ToString(), Eq("0000000000000000:foo*"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskNone, TermMatchType::PREFIX));
+ EXPECT_THAT(itr->ToString(), Eq("(0000000000000000000000000000000000000000000"
+ "000000000000000000000:foo* OR "
+ "00000000000000000000000000000000000000000000"
+ "00000000000000000000:foo*)"));
}
TEST_F(IndexTest, ExactToString) {
SectionIdMask id_mask = (1U << kSectionId2) | (1U << kSectionId3);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("foo", id_mask, TermMatchType::EXACT_ONLY));
- EXPECT_THAT(itr->ToString(), Eq("0000000000001100:foo"));
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, id_mask,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(itr->ToString(), Eq("(0000000000000000000000000000000000000000000"
+ "000000000000000001100:foo OR "
+ "00000000000000000000000000000000000000000000"
+ "00000000000000001100:foo)"));
ICING_ASSERT_OK_AND_ASSIGN(
- itr,
- index_->GetIterator("foo", kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
- EXPECT_THAT(itr->ToString(), Eq("1111111111111111:foo"));
+ itr, index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(itr->ToString(), Eq("(1111111111111111111111111111111111111111111"
+ "111111111111111111111:foo OR "
+ "11111111111111111111111111111111111111111111"
+ "11111111111111111111:foo)"));
- ICING_ASSERT_OK_AND_ASSIGN(itr,
- index_->GetIterator("foo", kSectionIdMaskNone,
- TermMatchType::EXACT_ONLY));
- EXPECT_THAT(itr->ToString(), Eq("0000000000000000:foo"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskNone, TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(itr->ToString(), Eq("(0000000000000000000000000000000000000000000"
+ "000000000000000000000:foo OR "
+ "00000000000000000000000000000000000000000000"
+ "00000000000000000000:foo)"));
}
TEST_F(IndexTest, NonAsciiTerms) {
Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
TermMatchType::PREFIX, /*namespace_id=*/0);
- ASSERT_THAT(edit.AddHit("こんにちは"), IsOk());
- ASSERT_THAT(edit.AddHit("あなた"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("こんにちは"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("あなた"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("こんに", kSectionIdMaskAll, TermMatchType::PREFIX));
+ index_->GetIterator("こんに", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
- ICING_ASSERT_OK_AND_ASSIGN(itr,
- index_->GetIterator("あなた", kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("あなた", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, NonAsciiTermsAfterMerge) {
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("こんにちは"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("あなた"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("こんに", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("あなた", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::EXACT_ONLY));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
@@ -406,24 +1224,35 @@ TEST_F(IndexTest, NonAsciiTerms) {
TEST_F(IndexTest, FullIndex) {
// Make a smaller index so that it's easier to fill up.
- Index::Options options(index_dir_, /*index_merge_size=*/1024);
- ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &filesystem_));
+ Index::Options options(index_dir_, /*index_merge_size=*/1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_, Index::Create(options, &filesystem_, &icing_filesystem_));
+
std::default_random_engine random;
- libtextclassifier3::Status status = libtextclassifier3::Status::OK;
- constexpr int kTokenSize = 5;
- DocumentId document_id = 0;
std::vector<std::string> query_terms;
+ std::string prefix = "prefix";
+ for (int i = 0; i < 2600; ++i) {
+ constexpr int kTokenSize = 5;
+ query_terms.push_back(prefix +
+ RandomString(kAlNumAlphabet, kTokenSize, &random));
+ }
+
+ DocumentId document_id = 0;
+ libtextclassifier3::Status status = libtextclassifier3::Status::OK;
+ std::uniform_int_distribution<size_t> uniform(0u, query_terms.size() - 1);
while (status.ok()) {
for (int i = 0; i < 100; ++i) {
Index::Editor edit =
- index_->Edit(document_id, kSectionId2, TermMatchType::EXACT_ONLY,
+ index_->Edit(document_id, kSectionId2, TermMatchType::PREFIX,
/*namespace_id=*/0);
- std::string term = RandomString(kAlNumAlphabet, kTokenSize, &random);
- status = edit.AddHit(term.c_str());
- if (i % 50 == 0) {
- // Remember one out of every fifty terms to query for later.
- query_terms.push_back(std::move(term));
+ size_t idx = uniform(random);
+ status = edit.BufferTerm(query_terms.at(idx).c_str());
+ if (!status.ok()) {
+ break;
}
+ status = edit.IndexAllBufferedTerms();
if (!status.ok()) {
break;
}
@@ -431,36 +1260,379 @@ TEST_F(IndexTest, FullIndex) {
++document_id;
}
- // Assert
// Adding more hits should fail.
Index::Editor edit =
- index_->Edit(document_id + 1, kSectionId2, TermMatchType::EXACT_ONLY,
+ index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX,
/*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("foo"),
- StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
- EXPECT_THAT(edit.AddHit("bar"),
- StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
- EXPECT_THAT(edit.AddHit("baz"),
+ std::string term = prefix + "foo";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ term = prefix + "bar";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ term = prefix + "baz";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
- for (const std::string& term : query_terms) {
+  for (size_t i = 0; i < query_terms.size(); i += 25) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator(term.c_str(), kSectionIdMaskAll,
- TermMatchType::EXACT_ONLY));
+ index_->GetIterator(query_terms.at(i).c_str(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
// Each query term should contain at least one hit - there may have been
// other hits for this term that were added.
EXPECT_THAT(itr->Advance(), IsOk());
}
- EXPECT_THAT(index_->last_added_document_id(), Eq(document_id - 1));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> last_itr,
+ index_->GetIterator(prefix.c_str(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(last_itr->Advance(), IsOk());
+ EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id - 1));
+}
+
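+// Note: when the lite index is full, BufferTerm() can still succeed (terms
+// are buffered in memory) while IndexAllBufferedTerms() returns
+// RESOURCE_EXHAUSTED; the next test shows that Merge() drains the lite index
+// and makes room for new hits.
+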
+TEST_F(IndexTest, FullIndexMerge) {
+ // Make a smaller index so that it's easier to fill up.
+ Index::Options options(index_dir_, /*index_merge_size=*/1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_, Index::Create(options, &filesystem_, &icing_filesystem_));
+
+ std::default_random_engine random;
+ std::vector<std::string> query_terms;
+ std::string prefix = "prefix";
+ for (int i = 0; i < 2600; ++i) {
+ constexpr int kTokenSize = 5;
+ query_terms.push_back(prefix +
+ RandomString(kAlNumAlphabet, kTokenSize, &random));
+ }
+
+ DocumentId document_id = 0;
+ libtextclassifier3::Status status = libtextclassifier3::Status::OK;
+ std::uniform_int_distribution<size_t> uniform(0u, query_terms.size() - 1);
+ while (status.ok()) {
+ for (int i = 0; i < 100; ++i) {
+ Index::Editor edit =
+ index_->Edit(document_id, kSectionId2, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ size_t idx = uniform(random);
+ status = edit.BufferTerm(query_terms.at(idx).c_str());
+ if (!status.ok()) {
+ break;
+ }
+ status = edit.IndexAllBufferedTerms();
+ if (!status.ok()) {
+ break;
+ }
+ }
+ ++document_id;
+ }
+ EXPECT_THAT(status,
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+
+ // Adding more hits should fail.
+ Index::Editor edit =
+ index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ std::string term = prefix + "foo";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ term = prefix + "bar";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ term = prefix + "baz";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> last_itr,
+ index_->GetIterator(prefix.c_str(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(last_itr->Advance(), IsOk());
+ EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id - 1));
+
+  // After merging with the main index, adding more hits should succeed.
+ ICING_ASSERT_OK(index_->Merge());
+  edit = index_->Edit(document_id + 1, kSectionId2, TermMatchType::PREFIX,
+                      /*namespace_id=*/0);
+  term = prefix + "foo";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ term = prefix + "bar";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ term = prefix + "baz";
+ EXPECT_THAT(edit.BufferTerm(term.c_str()), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator(prefix + "bar", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ // We know that "bar" should have at least one hit because we just added it!
+ EXPECT_THAT(itr->Advance(), IsOk());
+ EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(document_id + 1));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ last_itr, index_->GetIterator(prefix.c_str(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ EXPECT_THAT(last_itr->Advance(), IsOk());
+ EXPECT_THAT(last_itr->doc_hit_info().document_id(), Eq(document_id + 1));
+}
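+
+// A hedged sketch (not part of the test above) of the recovery pattern
+// FullIndexMerge exercises: when indexing returns RESOURCE_EXHAUSTED, the
+// caller merges the lite index into the main index and retries.
+// absl_ports::IsResourceExhausted() is assumed to exist for this sketch.
+//
+//   libtextclassifier3::Status s = edit.IndexAllBufferedTerms();
+//   if (absl_ports::IsResourceExhausted(s)) {
+//     ICING_RETURN_IF_ERROR(index->Merge());  // drain the lite index
+//     edit = index->Edit(doc_id, section_id, match_type, namespace_id);
+//     ICING_RETURN_IF_ERROR(edit.BufferTerm(term));  // re-buffer and retry
+//     ICING_RETURN_IF_ERROR(edit.IndexAllBufferedTerms());
+//   }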
+
+TEST_F(IndexTest, OptimizeShouldWorkForEmptyIndex) {
+ // Optimizing an empty index should succeed, but have no effect.
+ ICING_ASSERT_OK(
+ index_->Optimize(std::vector<DocumentId>(),
+ /*new_last_added_document_id=*/kInvalidDocumentId));
+ EXPECT_EQ(index_->last_added_document_id(), kInvalidDocumentId);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("", kSectionIdMaskAll, /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("", kSectionIdMaskAll, /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
+
+TEST_F(IndexTest, IndexShouldWorkAtSectionLimit) {
+ std::string prefix = "prefix";
+ std::default_random_engine random;
+ std::vector<std::string> query_terms;
+ // Add 4096 hits: the first 1024 get merged into the main index, leaving
+ // the remaining 3072 in the lite index.
+ for (int i = 0; i < 4096; ++i) {
+ if (i == 1024) {
+ ICING_ASSERT_OK(index_->Merge());
+ }
+ // Generate a unique term for document i.
+ query_terms.push_back(prefix + RandomString("abcdefg", 5, &random) +
+ std::to_string(i));
+ TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+ SectionId section_id = i % 64;
+ if (section_id == 2) {
+ // Make section 2 an exact section.
+ term_match_type = TermMatchType::EXACT_ONLY;
+ }
+ Index::Editor edit = index_->Edit(/*document_id=*/i, section_id,
+ term_match_type, /*namespace_id=*/0);
+ ICING_ASSERT_OK(edit.BufferTerm(query_terms.at(i).c_str()));
+ ICING_ASSERT_OK(edit.IndexAllBufferedTerms());
+ }
+
+ std::vector<DocHitInfo> exp_prefix_hits;
+ for (int i = 0; i < 4096; ++i) {
+ if (i % 64 == 2) {
+ // Section 2 is an exact section, so we should not see any hits in
+ // prefix search.
+ continue;
+ }
+ exp_prefix_hits.push_back(DocHitInfo(i));
+ exp_prefix_hits.back().UpdateSection(/*section_id=*/i % 64);
+ }
+ std::reverse(exp_prefix_hits.begin(), exp_prefix_hits.end());
+
+ // Check prefix search.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<DocHitInfo> hits,
+ GetHits(prefix, /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(hits, ContainerEq(exp_prefix_hits));
+
+ // Check exact search.
+ for (int i = 0; i < 4096; ++i) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ hits,
+ GetHits(query_terms[i], /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
+ i, std::vector<SectionId>{(SectionId)(i % 64)})));
+ }
+}
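+
+// A note on the section math above, assuming the 64-section limit this test
+// targets: SectionId ranges over [0, 63], so i % 64 cycles through every
+// section, and kSectionIdMaskAll is the all-ones 64-bit mask, e.g.
+//
+//   SectionIdMask mask_for_one = UINT64_C(1) << section_id;
+//   SectionIdMask mask_for_all = ~UINT64_C(0);  // == kSectionIdMaskAll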
+
+// Skip this test on Android because it times out.
+#if !defined(__ANDROID__)
+TEST_F(IndexTest, IndexShouldWorkAtDocumentLimit) {
+ std::string prefix = "pre";
+ std::default_random_engine random;
+ const int max_lite_index_size = 1024 * 1024 / 8;
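+ // 1024 * 1024 / 8 = 131072, so a merge is forced every 131072 documents
+ // and the final partial batch stays in the lite index (asserted below).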
+ int lite_index_size = 0;
+ for (int i = 0; i <= kMaxDocumentId; ++i) {
+ if (i % max_lite_index_size == 0 && i != 0) {
+ ICING_ASSERT_OK(index_->Merge());
+ lite_index_size = 0;
+ }
+ std::string term;
+ TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+ SectionId section_id = i % 64;
+ if (section_id == 2) {
+ // Make section 2 an exact section.
+ term_match_type = TermMatchType::EXACT_ONLY;
+ term = std::to_string(i);
+ } else {
+ term = prefix + RandomString("abcd", 5, &random);
+ }
+ Index::Editor edit = index_->Edit(/*document_id=*/i, section_id,
+ term_match_type, /*namespace_id=*/0);
+ ICING_ASSERT_OK(edit.BufferTerm(term.c_str()));
+ ICING_ASSERT_OK(edit.IndexAllBufferedTerms());
+ ++lite_index_size;
+ index_->set_last_added_document_id(i);
+ }
+ // Ensure that the lite index still contains some data to better test both
+ // indices.
+ ASSERT_THAT(lite_index_size, Eq(max_lite_index_size - 1));
+ EXPECT_EQ(index_->last_added_document_id(), kMaxDocumentId);
+
+ std::vector<DocHitInfo> exp_prefix_hits;
+ for (int i = 0; i <= kMaxDocumentId; ++i) {
+ if (i % 64 == 2) {
+ // Section 2 is an exact section, so we should not see any hits in
+ // prefix search.
+ continue;
+ }
+ exp_prefix_hits.push_back(DocHitInfo(i));
+ exp_prefix_hits.back().UpdateSection(/*section_id=*/i % 64);
+ }
+ std::reverse(exp_prefix_hits.begin(), exp_prefix_hits.end());
+
+ // Check prefix search.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<DocHitInfo> hits,
+ GetHits(prefix, /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(hits, ContainerEq(exp_prefix_hits));
+
+ // Check exact search.
+ for (int i = 0; i <= kMaxDocumentId; ++i) {
+ if (i % 64 == 2) {
+ // Only section 2 is an exact section.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ hits,
+ GetHits(std::to_string(i), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
+ i, std::vector<SectionId>{(SectionId)(2)})));
+ }
+ }
+}
+#endif // if !defined(__ANDROID__)
+
+TEST_F(IndexTest, IndexOptimize) {
+ std::string prefix = "prefix";
+ std::default_random_engine random;
+ std::vector<std::string> query_terms;
+ // Add 1024 hits to main index, and 1024 hits to lite index.
+ for (int i = 0; i < 2048; ++i) {
+ if (i == 1024) {
+ ICING_ASSERT_OK(index_->Merge());
+ }
+ // Generate a unique term for document i.
+ query_terms.push_back(prefix + RandomString("abcdefg", 5, &random) +
+ std::to_string(i));
+ TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+ SectionId section_id = i % 64;
+ if (section_id == 2) {
+ // Make section 2 an exact section.
+ term_match_type = TermMatchType::EXACT_ONLY;
+ }
+ Index::Editor edit = index_->Edit(/*document_id=*/i, section_id,
+ term_match_type, /*namespace_id=*/0);
+ ICING_ASSERT_OK(edit.BufferTerm(query_terms.at(i).c_str()));
+ ICING_ASSERT_OK(edit.IndexAllBufferedTerms());
+ index_->set_last_added_document_id(i);
+ }
+
+ // Delete every third document (ids where i % 3 == 0).
+ DocumentId document_id = 0;
+ DocumentId new_last_added_document_id = kInvalidDocumentId;
+ std::vector<DocumentId> document_id_old_to_new;
+ for (int i = 0; i < 2048; ++i) {
+ if (i % 3 == 0) {
+ document_id_old_to_new.push_back(kInvalidDocumentId);
+ } else {
+ new_last_added_document_id = document_id++;
+ document_id_old_to_new.push_back(new_last_added_document_id);
+ }
+ }
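+
+ // A worked example of the remapping built above (matching the i % 3 == 0
+ // deletion rule):
+ //   old 0 -> kInvalidDocumentId (deleted)
+ //   old 1 -> new 0
+ //   old 2 -> new 1
+ //   old 3 -> kInvalidDocumentId (deleted)
+ //   old 4 -> new 2
+ // Optimize() compacts every posting list according to this old-to-new map.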
+
+ std::vector<DocHitInfo> exp_prefix_hits;
+ for (int i = 0; i < 2048; ++i) {
+ if (document_id_old_to_new[i] == kInvalidDocumentId) {
+ continue;
+ }
+ if (i % 64 == 2) {
+ // Section 2 is an exact section, so we should not see any hits in
+ // prefix search.
+ continue;
+ }
+ exp_prefix_hits.push_back(DocHitInfo(document_id_old_to_new[i]));
+ exp_prefix_hits.back().UpdateSection(/*section_id=*/i % 64);
+ }
+ std::reverse(exp_prefix_hits.begin(), exp_prefix_hits.end());
+
+ // Check that Optimize remaps hits correctly.
+ ICING_ASSERT_OK(
+ index_->Optimize(document_id_old_to_new, new_last_added_document_id));
+ EXPECT_EQ(index_->last_added_document_id(), new_last_added_document_id);
+ // Check prefix search.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<DocHitInfo> hits,
+ GetHits(prefix, /*term_start_index=*/0, /*unnormalized_term_length=*/0,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(hits, ContainerEq(exp_prefix_hits));
+ // Check exact search.
+ for (int i = 0; i < 2048; ++i) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ hits,
+ GetHits(query_terms[i], /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, TermMatchType::EXACT_ONLY));
+ if (document_id_old_to_new[i] == kInvalidDocumentId) {
+ EXPECT_THAT(hits, IsEmpty());
+ } else {
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
+ document_id_old_to_new[i],
+ std::vector<SectionId>{(SectionId)(i % 64)})));
+ }
+ }
+
+ // Check that optimize does not block merge.
+ ICING_ASSERT_OK(index_->Merge());
+ EXPECT_EQ(index_->last_added_document_id(), new_last_added_document_id);
+ // Check prefix search.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ hits, GetHits(prefix, /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, TermMatchType::PREFIX));
+ EXPECT_THAT(hits, ContainerEq(exp_prefix_hits));
+ // Check exact search.
+ for (int i = 0; i < 2048; ++i) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ hits,
+ GetHits(query_terms[i], /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, TermMatchType::EXACT_ONLY));
+ if (document_id_old_to_new[i] == kInvalidDocumentId) {
+ EXPECT_THAT(hits, IsEmpty());
+ } else {
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
+ document_id_old_to_new[i],
+ std::vector<SectionId>{(SectionId)(i % 64)})));
+ }
+ }
}
TEST_F(IndexTest, IndexCreateIOFailure) {
// Create the index with mock filesystem. By default, Mock will return false,
// so the first attempted file operation will fail.
- NiceMock<IcingMockFilesystem> mock_filesystem;
- Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
- EXPECT_THAT(Index::Create(options, &mock_filesystem),
+ NiceMock<IcingMockFilesystem> mock_icing_filesystem;
+ ON_CALL(mock_icing_filesystem, CreateDirectoryRecursively)
+ .WillByDefault(Return(false));
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ EXPECT_THAT(Index::Create(options, &filesystem_, &mock_icing_filesystem),
StatusIs(libtextclassifier3::StatusCode::INTERNAL));
}
@@ -468,28 +1640,32 @@ TEST_F(IndexTest, IndexCreateCorruptionFailure) {
// Add some content to the index
Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
TermMatchType::PREFIX, /*namespace_id=*/0);
- ASSERT_THAT(edit.AddHit("foo"), IsOk());
- ASSERT_THAT(edit.AddHit("bar"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
// Close the index.
index_.reset();
// Corrupt the index file.
std::string hit_buffer_filename = index_dir_ + "/idx/lite.hb";
- ScopedFd sfd(filesystem_.OpenForWrite(hit_buffer_filename.c_str()));
+ ScopedFd sfd(icing_filesystem_.OpenForWrite(hit_buffer_filename.c_str()));
ASSERT_THAT(sfd.is_valid(), IsTrue());
constexpr std::string_view kCorruptBytes = "ffffffffffffffffffffff";
// The first page of the hit_buffer is taken up by the header. Overwrite the
// first page of content.
- constexpr int kHitBufferStartOffset = 4096;
- ASSERT_THAT(filesystem_.PWrite(sfd.get(), kHitBufferStartOffset,
- kCorruptBytes.data(), kCorruptBytes.length()),
- IsTrue());
+ int hit_buffer_start_offset = GetBlockSize();
+ ASSERT_THAT(
+ icing_filesystem_.PWrite(sfd.get(), hit_buffer_start_offset,
+ kCorruptBytes.data(), kCorruptBytes.length()),
+ IsTrue());
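+ // E.g. with a 4096-byte block, GetBlockSize() returns 4096 and the corrupt
+ // bytes land at the start of the first content page, just past the header
+ // page (the exact value is platform-dependent, hence the helper).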
// Recreate the index.
- Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
- EXPECT_THAT(Index::Create(options, &filesystem_),
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ EXPECT_THAT(Index::Create(options, &filesystem_, &icing_filesystem_),
StatusIs(libtextclassifier3::StatusCode::DATA_LOSS));
}
@@ -497,217 +1673,1070 @@ TEST_F(IndexTest, IndexPersistence) {
// Add some content to the index
Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
TermMatchType::PREFIX, /*namespace_id=*/0);
- ASSERT_THAT(edit.AddHit("foo"), IsOk());
- ASSERT_THAT(edit.AddHit("bar"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
EXPECT_THAT(index_->PersistToDisk(), IsOk());
// Close the index.
index_.reset();
// Recreate the index.
- Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
- ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &filesystem_));
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_, Index::Create(options, &filesystem_, &icing_filesystem_));
// Check that the hits are present.
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<DocHitInfoIterator> itr,
- index_->GetIterator("f", kSectionIdMaskAll, TermMatchType::PREFIX));
+ index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
EXPECT_THAT(GetHits(std::move(itr)),
ElementsAre(EqualsDocHitInfo(
kDocumentId0, std::vector<SectionId>{kSectionId2})));
-
- EXPECT_THAT(index_->last_added_document_id(), Eq(kDocumentId0));
}
-TEST_F(IndexTest, InvalidHitBufferSize) {
- Index::Options options(
- index_dir_, /*index_merge_size=*/std::numeric_limits<uint32_t>::max());
- EXPECT_THAT(Index::Create(options, &filesystem_),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-TEST_F(IndexTest, ComputeChecksumSameBetweenCalls) {
- // Add some content to the index.
- Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
- TermMatchType::PREFIX, /*namespace_id=*/0);
- ASSERT_THAT(edit.AddHit("foo"), IsOk());
-
- Crc32 checksum = index_->ComputeChecksum();
- // Calling it again shouldn't change the checksum
- EXPECT_THAT(index_->ComputeChecksum(), Eq(checksum));
-}
-
-TEST_F(IndexTest, ComputeChecksumSameAcrossInstances) {
- // Add some content to the index.
+TEST_F(IndexTest, IndexPersistenceAfterMerge) {
+ // Add some content to the index
Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
TermMatchType::PREFIX, /*namespace_id=*/0);
- ASSERT_THAT(edit.AddHit("foo"), IsOk());
-
- Crc32 checksum = index_->ComputeChecksum();
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ ICING_ASSERT_OK(index_->Merge());
+ EXPECT_THAT(index_->PersistToDisk(), IsOk());
- // Recreate the index, checksum should still be the same across instances
+ // Close the index.
index_.reset();
- Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024);
- ICING_ASSERT_OK_AND_ASSIGN(index_, Index::Create(options, &filesystem_));
- EXPECT_THAT(index_->ComputeChecksum(), Eq(checksum));
-}
-
-TEST_F(IndexTest, ComputeChecksumChangesOnModification) {
- // Add some content to the index.
- Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
- TermMatchType::PREFIX, /*namespace_id=*/0);
- ASSERT_THAT(edit.AddHit("foo"), IsOk());
-
- Crc32 checksum = index_->ComputeChecksum();
+ // Recreate the index.
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_, Index::Create(options, &filesystem_, &icing_filesystem_));
- // Modifying the index changes the checksum;
- EXPECT_THAT(edit.AddHit("bar"), IsOk());
+ // Check that the hits are present.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
- EXPECT_THAT(index_->ComputeChecksum(), Not(Eq(checksum)));
+TEST_F(IndexTest, InvalidHitBufferSize) {
+ Index::Options options(
+ index_dir_, /*index_merge_size=*/std::numeric_limits<uint32_t>::max(),
+ /*lite_index_sort_at_indexing=*/true, /*lite_index_sort_size=*/1024 * 8);
+ EXPECT_THAT(Index::Create(options, &filesystem_, &icing_filesystem_),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnEmpty) {
Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
TermMatchType::PREFIX, /*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("fool"), IsOk());
+ AlwaysTrueSuggestionResultCheckerImpl impl;
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
- /*num_to_return=*/0),
- IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"foo", /*num_to_return=*/0, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"foo", /*num_to_return=*/-1, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(IsEmpty()));
+
+ ICING_ASSERT_OK(index_->Merge());
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"foo", /*namespace_ids=*/{0},
- /*num_to_return=*/-1),
- IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"foo", /*num_to_return=*/0, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"foo", /*num_to_return=*/-1, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(IsEmpty()));
}
TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectResult) {
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
- EXPECT_THAT(edit.AddHit("bar"), IsOk());
+ AlwaysTrueSuggestionResultCheckerImpl impl;
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("bar"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ // "b" should only match "bar" but not "foo".
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"b", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1))));
+
+ ICING_ASSERT_OK(index_->Merge());
// "b" should only match "bar" but not "foo".
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"b", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1))));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"b", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("bar", 1))));
}
TEST_F(IndexTest, FindTermByPrefixShouldRespectNumToReturn) {
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("fo"), IsOk());
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
- EXPECT_THAT(edit.AddHit("fool"), IsOk());
+ AlwaysTrueSuggestionResultCheckerImpl impl;
+ EXPECT_THAT(edit.BufferTerm("fo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
// We have 3 results but only 2 should be returned.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/2),
- IsOkAndHolds(SizeIs(2)));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/2, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(SizeIs(2)));
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ // We have 3 results but only 2 should be returned.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/2, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(SizeIs(2)));
}
-TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInOneNamespace) {
+TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
Index::Editor edit1 =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- EXPECT_THAT(edit1.AddHit("fo"), IsOk());
- EXPECT_THAT(edit1.AddHit("foo"), IsOk());
+ AlwaysTrueSuggestionResultCheckerImpl impl;
+ EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
+ EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
Index::Editor edit2 =
index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/1);
- EXPECT_THAT(edit2.AddHit("fool"), IsOk());
+ EXPECT_THAT(edit2.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
- // namespace with id 0 has 2 results.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1),
- EqualsTermMetadata("foo", 1))));
+ Index::Editor edit3 =
+ index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/2);
+ EXPECT_THAT(edit3.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
- // namespace with id 1 has 1 result.
+ // Should return "fo", "foo" and "fool" across all namespaces.
EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fool", 1))));
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1),
+ EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ // Should return "fo", "foo" and "fool" across all namespaces.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1),
+ EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
}
-TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInMultipleNamespaces) {
+TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) {
Index::Editor edit1 =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- EXPECT_THAT(edit1.AddHit("fo"), IsOk());
+ AlwaysTrueSuggestionResultCheckerImpl impl;
+ EXPECT_THAT(edit1.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit1.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
Index::Editor edit2 =
index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/1);
- EXPECT_THAT(edit2.AddHit("foo"), IsOk());
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit2.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
- Index::Editor edit3 =
- index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/2);
- EXPECT_THAT(edit3.AddHit("fool"), IsOk());
+ // 'foo' has 1 hit, 'fool' has 2 hits.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2),
+ EqualsTermMetadata("foo", 1))));
+
+ ICING_ASSERT_OK(index_->Merge());
- // Should return "foo" and "fool" which are in namespaces with ids 1 and 2.
EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{1, 2},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
- EqualsTermMetadata("fool", 1))));
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2),
+ EqualsTermMetadata("foo", 1))));
}
-TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsInAllNamespaces) {
+TEST_F(IndexTest, FindTermByPrefixMultipleHitBatch) {
+ AlwaysTrueSuggestionResultCheckerImpl impl;
+ // Create multiple hit batches.
+ for (int i = 0; i < 4000; i++) {
+ Index::Editor edit = index_->Edit(i, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ }
+
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 4000))));
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 4000))));
+}
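+
+// The 4000 identical hits above span multiple posting-list batches, so this
+// verifies FindTermsByPrefix accumulates a term's document count across
+// batch boundaries (the batch size itself is an internal detail).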
+
+TEST_F(IndexTest, FindTermByPrefixShouldReturnInOrder) {
+ // Push six hits for term-six, five for term-five, four for term-four,
+ // three for term-three, two for term-two and one for term-one into the
+ // lite index.
Index::Editor edit1 =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- EXPECT_THAT(edit1.AddHit("fo"), IsOk());
+ AlwaysTrueSuggestionResultCheckerImpl impl;
+ EXPECT_THAT(edit1.BufferTerm("term-one"), IsOk());
+ EXPECT_THAT(edit1.BufferTerm("term-two"), IsOk());
+ EXPECT_THAT(edit1.BufferTerm("term-three"), IsOk());
+ EXPECT_THAT(edit1.BufferTerm("term-four"), IsOk());
+ EXPECT_THAT(edit1.BufferTerm("term-five"), IsOk());
+ EXPECT_THAT(edit1.BufferTerm("term-six"), IsOk());
+ EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
Index::Editor edit2 =
- index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/1);
- EXPECT_THAT(edit2.AddHit("foo"), IsOk());
+ index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit2.BufferTerm("term-two"), IsOk());
+ EXPECT_THAT(edit2.BufferTerm("term-three"), IsOk());
+ EXPECT_THAT(edit2.BufferTerm("term-four"), IsOk());
+ EXPECT_THAT(edit2.BufferTerm("term-five"), IsOk());
+ EXPECT_THAT(edit2.BufferTerm("term-six"), IsOk());
+ EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
Index::Editor edit3 =
- index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
- /*namespace_id=*/2);
- EXPECT_THAT(edit3.AddHit("fool"), IsOk());
+ index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit3.BufferTerm("term-three"), IsOk());
+ EXPECT_THAT(edit3.BufferTerm("term-four"), IsOk());
+ EXPECT_THAT(edit3.BufferTerm("term-five"), IsOk());
+ EXPECT_THAT(edit3.BufferTerm("term-six"), IsOk());
+ EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
+
+ Index::Editor edit4 =
+ index_->Edit(kDocumentId4, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit4.BufferTerm("term-four"), IsOk());
+ EXPECT_THAT(edit4.BufferTerm("term-five"), IsOk());
+ EXPECT_THAT(edit4.BufferTerm("term-six"), IsOk());
+ EXPECT_THAT(edit4.IndexAllBufferedTerms(), IsOk());
- // Should return "fo", "foo" and "fool" across all namespaces.
- EXPECT_THAT(index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{},
- /*num_to_return=*/10),
- IsOkAndHolds(UnorderedElementsAre(
- EqualsTermMetadata("fo", 1), EqualsTermMetadata("foo", 1),
- EqualsTermMetadata("fool", 1))));
+ Index::Editor edit5 =
+ index_->Edit(kDocumentId5, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit5.BufferTerm("term-five"), IsOk());
+ EXPECT_THAT(edit5.BufferTerm("term-six"), IsOk());
+ EXPECT_THAT(edit5.IndexAllBufferedTerms(), IsOk());
+
+ Index::Editor edit6 =
+ index_->Edit(kDocumentId6, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit6.BufferTerm("term-six"), IsOk());
+ EXPECT_THAT(edit6.IndexAllBufferedTerms(), IsOk());
+
+ // Verify the order in the lite index is correct.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"t", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6),
+ EqualsTermMetadata("term-five", 5),
+ EqualsTermMetadata("term-four", 4),
+ EqualsTermMetadata("term-three", 3),
+ EqualsTermMetadata("term-two", 2),
+ EqualsTermMetadata("term-one", 1))));
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"t", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-six", 6),
+ EqualsTermMetadata("term-five", 5),
+ EqualsTermMetadata("term-four", 4),
+ EqualsTermMetadata("term-three", 3),
+ EqualsTermMetadata("term-two", 2),
+ EqualsTermMetadata("term-one", 1))));
+
+ // Keep pushing terms to the lite index. We will add 2 documents to
+ // term-five, term-three and term-one. The output order should be
+ // 5-6-3-4-1-2.
+ Index::Editor edit7 =
+ index_->Edit(kDocumentId7, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit7.BufferTerm("term-one"), IsOk());
+ EXPECT_THAT(edit7.BufferTerm("term-three"), IsOk());
+ EXPECT_THAT(edit7.BufferTerm("term-five"), IsOk());
+ EXPECT_THAT(edit7.IndexAllBufferedTerms(), IsOk());
+
+ Index::Editor edit8 =
+ index_->Edit(kDocumentId8, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit8.BufferTerm("term-one"), IsOk());
+ EXPECT_THAT(edit8.BufferTerm("term-three"), IsOk());
+ EXPECT_THAT(edit8.BufferTerm("term-five"), IsOk());
+ EXPECT_THAT(edit8.IndexAllBufferedTerms(), IsOk());
+
+ // Verify the combined results of the lite index and main index are in the
+ // correct order.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"t", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(ElementsAre(
+ EqualsTermMetadata("term-five", 7), EqualsTermMetadata("term-six", 6),
+ EqualsTermMetadata("term-three", 5),
+ EqualsTermMetadata("term-four", 4), EqualsTermMetadata("term-one", 3),
+ EqualsTermMetadata("term-two", 2))));
+
+ // Get the first three terms.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"t", /*num_to_return=*/3, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("term-five", 7),
+ EqualsTermMetadata("term-six", 6),
+ EqualsTermMetadata("term-three", 5))));
}
-TEST_F(IndexTest, FindTermByPrefixShouldReturnCorrectHitCount) {
+TEST_F(IndexTest, FindTermByPrefix_InTermMatchTypePrefix_ShouldReturnInOrder) {
Index::Editor edit1 =
+ index_->Edit(kDocumentId0, kSectionId2, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ AlwaysTrueSuggestionResultCheckerImpl impl;
+ EXPECT_THAT(edit1.BufferTerm("fo"), IsOk());
+ EXPECT_THAT(edit1.IndexAllBufferedTerms(), IsOk());
+
+ Index::Editor edit2 =
+ index_->Edit(kDocumentId2, kSectionId2, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit2.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
+
+ Index::Editor edit3 =
+ index_->Edit(kDocumentId3, kSectionId2, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit3.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit3.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+ // Verify the order in the posting lists (pls) is correct:
+ // "fo" { {doc0, exact_hit}, {doc1, prefix_hit}, {doc2, prefix_hit} }
+ // "foo" { {doc1, exact_hit}, {doc2, prefix_hit} }
+ // "fool" { {doc2, exact_hit} }
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f",
+ /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fo", 3),
+ EqualsTermMetadata("foo", 2),
+ EqualsTermMetadata("fool", 1))));
+ // Find by exact only; all terms should rank equally.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::EXACT_ONLY,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1),
+ EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
+}
+
+TEST_F(IndexTest, FindTermByPrefixShouldReturnHitCountForMain) {
+ Index::Editor edit =
+ index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ AlwaysTrueSuggestionResultCheckerImpl impl;
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId4, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId5, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId6, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId7, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ // 'foo' has 1 hit, 'fool' has 8 hits.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 8),
+ EqualsTermMetadata("foo", 1))));
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 8))));
+}
+
+TEST_F(IndexTest, FindTermByPrefixShouldReturnCombinedHitCount) {
+ Index::Editor edit =
index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
/*namespace_id=*/0);
- EXPECT_THAT(edit1.AddHit("foo"), IsOk());
- EXPECT_THAT(edit1.AddHit("fool"), IsOk());
+ AlwaysTrueSuggestionResultCheckerImpl impl;
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fool", 2),
+ EqualsTermMetadata("foo", 1))));
+}
+
+TEST_F(IndexTest, FindTermRankComparison) {
+ Index::Editor edit =
+ index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ AlwaysTrueSuggestionResultCheckerImpl impl;
+ EXPECT_THAT(edit.BufferTerm("fo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("fo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
Index::Editor edit2 =
- index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ index_->Edit(kDocumentId2, kSectionId2, TermMatchType::PREFIX,
/*namespace_id=*/0);
- EXPECT_THAT(edit2.AddHit("fool"), IsOk());
+ EXPECT_THAT(edit2.BufferTerm("fo"), IsOk());
+ EXPECT_THAT(edit2.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit2.IndexAllBufferedTerms(), IsOk());
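+
+ // Worked counts for the expectations below: "fo" is buffered twice in doc0
+ // and once in doc2, so TERM_FREQUENCY scores it 3 while DOCUMENT_COUNT
+ // scores it 2 (two distinct documents); NONE reports 1 for every match.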
- // 'foo' has 1 hit, 'fool' has 2 hits.
EXPECT_THAT(
- index_->FindTermsByPrefix(/*prefix=*/"f", /*namespace_ids=*/{0},
- /*num_to_return=*/10),
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::EXACT_ONLY,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::TERM_FREQUENCY,
+ &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fo", 3),
+ EqualsTermMetadata("foo", 2),
+ EqualsTermMetadata("fool", 1))));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::EXACT_ONLY,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 2),
+ EqualsTermMetadata("foo", 2),
+ EqualsTermMetadata("fool", 1))));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::EXACT_ONLY,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE, &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1),
+ EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::EXACT_ONLY,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::TERM_FREQUENCY,
+ &impl),
+ IsOkAndHolds(ElementsAre(EqualsTermMetadata("fo", 3),
+ EqualsTermMetadata("foo", 2),
+ EqualsTermMetadata("fool", 1))));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::EXACT_ONLY,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 2),
+ EqualsTermMetadata("foo", 2),
+ EqualsTermMetadata("fool", 1))));
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::EXACT_ONLY,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE, &impl),
+ IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("fo", 1),
+ EqualsTermMetadata("foo", 1),
+ EqualsTermMetadata("fool", 1))));
+}
+
+TEST_F(IndexTest, FindTermByPrefixShouldReturnTermsFromBothIndices) {
+ Index::Editor edit =
+ index_->Edit(kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ AlwaysTrueSuggestionResultCheckerImpl impl;
+
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ // 'foo' has 1 hit in the main index, 'fool' has 1 hit in the lite index.
+ EXPECT_THAT(
+ index_->FindTermsByPrefix(
+ /*prefix=*/"f", /*num_to_return=*/10, TermMatchType::PREFIX,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &impl),
IsOkAndHolds(UnorderedElementsAre(EqualsTermMetadata("foo", 1),
- EqualsTermMetadata("fool", 2))));
+ EqualsTermMetadata("fool", 1))));
}
TEST_F(IndexTest, GetElementsSize) {
// Check empty index.
- EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(Eq(0)));
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t size, index_->GetElementsSize());
+ EXPECT_THAT(size, Eq(0));
// Add an element.
Index::Editor edit = index_->Edit(
kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
- EXPECT_THAT(edit.AddHit("foo"), IsOk());
- EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(Gt(0)));
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(size, index_->GetElementsSize());
+ EXPECT_THAT(size, Gt(0));
+
+ ASSERT_THAT(index_->Merge(), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(size, index_->GetElementsSize());
+ EXPECT_THAT(size, Gt(0));
+}
+
+TEST_F(IndexTest, ExactResultsFromLiteAndMain) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foot"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ ICING_ASSERT_OK(index_->Merge());
+
+ edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("footer"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(
+ EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, PrefixResultsFromLiteAndMain) {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foot"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ ICING_ASSERT_OK(index_->Merge());
+
+ edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("footer"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ EXPECT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(
+ EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, GetDebugInfo) {
+ // Add two documents to the lite index, merge them into the main index and
+ // then add another doc to the lite index.
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ index_->set_last_added_document_id(kDocumentId1);
+ ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ ICING_ASSERT_OK(index_->Merge());
+
+ edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ index_->set_last_added_document_id(kDocumentId2);
+ ASSERT_THAT(edit.BufferTerm("footer"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ IndexDebugInfoProto out0 = index_->GetDebugInfo(DebugInfoVerbosity::BASIC);
+ ICING_LOG(DBG) << "main_index_info:\n" << out0.main_index_info();
+ ICING_LOG(DBG) << "lite_index_info:\n" << out0.lite_index_info();
+ EXPECT_THAT(out0.main_index_info(), Not(IsEmpty()));
+ EXPECT_THAT(out0.lite_index_info(), Not(IsEmpty()));
+
+ IndexDebugInfoProto out1 = index_->GetDebugInfo(DebugInfoVerbosity::DETAILED);
+ ICING_LOG(DBG) << "main_index_info:\n" << out1.main_index_info();
+ ICING_LOG(DBG) << "lite_index_info:\n" << out1.lite_index_info();
+ EXPECT_THAT(out1.main_index_info(),
+ SizeIs(Gt(out0.main_index_info().size())));
+ EXPECT_THAT(out1.lite_index_info(),
+ SizeIs(Gt(out0.lite_index_info().size())));
+
+ // Add one more doc to the lite index. Debug strings should change.
+ edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ index_->set_last_added_document_id(kDocumentId3);
+ ASSERT_THAT(edit.BufferTerm("far"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ IndexDebugInfoProto out2 = index_->GetDebugInfo(DebugInfoVerbosity::BASIC);
+ ICING_LOG(DBG) << "main_index_info:\n" << out2.main_index_info();
+ ICING_LOG(DBG) << "lite_index_info:\n" << out2.lite_index_info();
+ EXPECT_THAT(out2.main_index_info(), Not(IsEmpty()));
+ EXPECT_THAT(out2.lite_index_info(), Not(IsEmpty()));
+ EXPECT_THAT(out2.main_index_info(), StrEq(out0.main_index_info()));
+ EXPECT_THAT(out2.lite_index_info(), StrNe(out0.lite_index_info()));
+
+ // Merge into the main index. Debug strings should change again.
+ ICING_ASSERT_OK(index_->Merge());
+
+ IndexDebugInfoProto out3 = index_->GetDebugInfo(DebugInfoVerbosity::BASIC);
+ EXPECT_TRUE(out3.has_index_storage_info());
+ ICING_LOG(DBG) << "main_index_info:\n" << out3.main_index_info();
+ ICING_LOG(DBG) << "lite_index_info:\n" << out3.lite_index_info();
+ EXPECT_THAT(out3.main_index_info(), Not(IsEmpty()));
+ EXPECT_THAT(out3.lite_index_info(), Not(IsEmpty()));
+ EXPECT_THAT(out3.main_index_info(), StrNe(out2.main_index_info()));
+ EXPECT_THAT(out3.lite_index_info(), StrNe(out2.lite_index_info()));
+}
+
+TEST_F(IndexTest, BackfillingMultipleTermsSucceeds) {
+ // Add two documents to the lite index, merge them into the main index and
+ // then add another doc to the lite index.
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId0, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ // After this merge the index should have posting lists for
+ // "fool" {(doc0,sec3)},
+ // "foot" {(doc1,sec3)},
+ // "foo" {(doc1,sec3),(doc0,sec3),(doc0,sec2)}
+ ICING_ASSERT_OK(index_->Merge());
+
+ // Add one more doc to the lite index.
+ edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("far"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ // After this merge the index should add a posting list for "far" and a
+ // backfill branch point for "f". In addition to the posting lists described
+ // above, which are unaffected, the new posting lists should be
+ // "far" {(doc2,sec2)},
+ // "f" {(doc1,sec3),(doc0,sec3)}
+ // Multiple pre-existing hits should be added to the new backfill branch
+ // point.
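+ // Note the expected "f" list omits (doc0,sec2): that hit was indexed
+ // EXACT_ONLY, and only prefix-enabled hits get copied into a backfill
+ // branch point.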
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(
+ EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId3})));
+}
+
+TEST_F(IndexTest, BackfillingNewTermsSucceeds) {
+ // Add two documents to the lite index, merge them into the main index and
+ // then add another doc to the lite index.
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::EXACT_ONLY, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ ASSERT_THAT(edit.BufferTerm("fool"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ // After this merge the index should have posting lists for
+ // "fool" {(doc0,sec2)},
+ // "foot" {(doc1,sec3)},
+ // "foo" {(doc1,sec3),(doc0,sec2)}
+ ICING_ASSERT_OK(index_->Merge());
+
+ edit = index_->Edit(kDocumentId2, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("footer"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ // Add one more doc to the lite index.
+ edit = index_->Edit(kDocumentId3, kSectionId2, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("far"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ // After this merge the index should add posting lists for "far" and "footer"
+ // and a backfill branch point for "f". The posting lists should now be
+ // "fool" {(doc0,sec2)},
+ // "foot" {(doc1,sec3)},
+ // "foo" {(doc2,sec3),(doc1,sec3),(doc0,sec2)}
+ // "footer" {(doc2,sec2)},
+ // "far" {(doc3,sec2)},
+ // "f" {(doc2,sec3),(doc1,sec3)}
+ // Multiple pre-existing hits should be added to the new backfill branch
+ // point.
+ ICING_ASSERT_OK(index_->Merge());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(
+ EqualsDocHitInfo(kDocumentId2, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3})));
+}
+
+TEST_F(IndexTest, TruncateToInvalidDocumentIdHasNoEffect) {
+ ICING_EXPECT_OK(index_->TruncateTo(kInvalidDocumentId));
+ EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+
+ // Add one document to the lite index
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ // Clipping to invalid should have no effect.
+ ICING_EXPECT_OK(index_->TruncateTo(kInvalidDocumentId));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+ // Clipping to invalid should still have no effect even if hits are in main.
+ ICING_ASSERT_OK(index_->Merge());
+ ICING_EXPECT_OK(index_->TruncateTo(kInvalidDocumentId));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ // Clipping to invalid should still have no effect even if both indices have
+ // hits.
+ ICING_EXPECT_OK(index_->TruncateTo(kInvalidDocumentId));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(
+ EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, TruncateToLastAddedDocumentIdHasNoEffect) {
+ ICING_EXPECT_OK(index_->TruncateTo(index_->last_added_document_id()));
+ EXPECT_THAT(index_->GetElementsSize(), IsOkAndHolds(0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+
+ // Add one document to the lite index
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId0);
+ ICING_EXPECT_OK(index_->TruncateTo(index_->last_added_document_id()));
+ // Clipping to the last added document id should have no effect.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+ // Clipping to the last added document id should still have no effect even
+ // if hits are in main.
+ ICING_ASSERT_OK(index_->Merge());
+ ICING_EXPECT_OK(index_->TruncateTo(index_->last_added_document_id()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
+
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId1);
+
+ // Clipping to the last added document id should still have no effect even
+ // if both indices have hits.
+ ICING_EXPECT_OK(index_->TruncateTo(index_->last_added_document_id()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0,
+ kSectionIdMaskAll, TermMatchType::PREFIX));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(
+ EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId3}),
+ EqualsDocHitInfo(kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, TruncateToThrowsOutLiteIndex) {
+ // Add one document to the lite index and merge it into main.
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId0);
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ // Add another document to the lite index.
+ edit = index_->Edit(kDocumentId1, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId1);
+
+ EXPECT_THAT(index_->TruncateTo(kDocumentId0), IsOk());
+
+ // Clipping to document 0 should toss out the lite index, but keep the main.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId2})));
+}
+
+TEST_F(IndexTest, TruncateToThrowsOutBothIndices) {
+ // Add two documents to the lite index and merge them into main.
+ Index::Editor edit = index_->Edit(kDocumentId0, kSectionId2,
+ TermMatchType::PREFIX, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId0);
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foul"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId1);
+
+ ICING_ASSERT_OK(index_->Merge());
+
+ // Add another document to the lite index.
+ edit = index_->Edit(kDocumentId2, kSectionId3, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foot"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ index_->set_last_added_document_id(kDocumentId2);
+
+ EXPECT_THAT(index_->TruncateTo(kDocumentId0), IsOk());
+
+ // Clipping to document 0 should toss out both indices.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index_->GetIterator("f", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::PREFIX));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
+
+TEST_F(IndexTest, IndexStorageInfoProto) {
+ // Add two documents to the lite index and merge them into main.
+ {
+ Index::Editor edit = index_->Edit(
+ kDocumentId0, kSectionId2, TermMatchType::PREFIX, /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foo"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+ edit = index_->Edit(kDocumentId1, kSectionId2, TermMatchType::PREFIX,
+ /*namespace_id=*/0);
+ ASSERT_THAT(edit.BufferTerm("foul"), IsOk());
+ EXPECT_THAT(edit.IndexAllBufferedTerms(), IsOk());
+
+ ICING_ASSERT_OK(index_->Merge());
+ }
+
+ IndexStorageInfoProto storage_info = index_->GetStorageInfo();
+ EXPECT_THAT(storage_info.index_size(), Ge(0));
+ EXPECT_THAT(storage_info.lite_index_lexicon_size(), Ge(0));
+ EXPECT_THAT(storage_info.lite_index_hit_buffer_size(), Ge(0));
+ EXPECT_THAT(storage_info.main_index_lexicon_size(), Ge(0));
+ EXPECT_THAT(storage_info.main_index_storage_size(), Ge(0));
+ EXPECT_THAT(storage_info.main_index_block_size(), Ge(0));
+ // There should be 1 block for the header and 1 block for two posting lists.
+ EXPECT_THAT(storage_info.num_blocks(), Eq(2));
+ EXPECT_THAT(storage_info.min_free_fraction(), Ge(0));
}
} // namespace
diff --git a/icing/index/integer-section-indexing-handler.cc b/icing/index/integer-section-indexing-handler.cc
new file mode 100644
index 0000000..63b09df
--- /dev/null
+++ b/icing/index/integer-section-indexing-handler.cc
@@ -0,0 +1,112 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/integer-section-indexing-handler.h"
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/util/clock.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
+
+namespace icing {
+namespace lib {
+
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<IntegerSectionIndexingHandler>>
+IntegerSectionIndexingHandler::Create(const Clock* clock,
+ NumericIndex<int64_t>* integer_index) {
+ ICING_RETURN_ERROR_IF_NULL(clock);
+ ICING_RETURN_ERROR_IF_NULL(integer_index);
+
+ return std::unique_ptr<IntegerSectionIndexingHandler>(
+ new IntegerSectionIndexingHandler(clock, integer_index));
+}
+
+libtextclassifier3::Status IntegerSectionIndexingHandler::Handle(
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
+ bool recovery_mode, PutDocumentStatsProto* put_document_stats) {
+ std::unique_ptr<Timer> index_timer = clock_.GetNewTimer();
+
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(
+ IcingStringUtil::StringPrintf("Invalid DocumentId %d", document_id));
+ }
+
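+ // Document ids must be strictly increasing: a duplicate or smaller id means
+ // the caller is re-handling an already indexed document.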
+ if (integer_index_.last_added_document_id() != kInvalidDocumentId &&
+ document_id <= integer_index_.last_added_document_id()) {
+ if (recovery_mode) {
+ // Skip the document if document_id <= last_added_document_id in recovery
+ // mode without returning an error.
+ return libtextclassifier3::Status::OK;
+ }
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "DocumentId %d must be greater than last added document_id %d",
+ document_id, integer_index_.last_added_document_id()));
+ }
+ integer_index_.set_last_added_document_id(document_id);
+
+ libtextclassifier3::Status status;
+ // We have to add integer sections to the integer index in reverse order:
+ // sections are sorted by SectionId in ascending order, but BasicHits must be
+ // added in descending SectionId order (a posting list requirement).
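+ // For example, sections with ids {0, 1, 2} are visited as 2 -> 1 -> 0, so
+ // their hits reach the posting lists in descending SectionId order.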
+ for (auto riter = tokenized_document.integer_sections().rbegin();
+ riter != tokenized_document.integer_sections().rend(); ++riter) {
+ const Section<int64_t>& section = *riter;
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor = integer_index_.Edit(
+ section.metadata.path, document_id, section.metadata.id);
+
+ for (int64_t key : section.content) {
+ status = editor->BufferKey(key);
+ if (!status.ok()) {
+ ICING_LOG(WARNING)
+ << "Failed to buffer keys into integer index due to: "
+ << status.error_message();
+ break;
+ }
+ }
+ if (!status.ok()) {
+ break;
+ }
+
+ // Add all the seen keys to the integer index.
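+ // Note: std::move consumes the editor, so it must not be reused afterwards.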
+ status = std::move(*editor).IndexAllBufferedKeys();
+ if (!status.ok()) {
+ ICING_LOG(WARNING) << "Failed to add keys into integer index due to: "
+ << status.error_message();
+ break;
+ }
+ }
+
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_integer_index_latency_ms(
+ index_timer->GetElapsedMilliseconds());
+ }
+
+ return status;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/integer-section-indexing-handler.h b/icing/index/integer-section-indexing-handler.h
new file mode 100644
index 0000000..0a501aa
--- /dev/null
+++ b/icing/index/integer-section-indexing-handler.h
@@ -0,0 +1,71 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_INTEGER_SECTION_INDEXING_HANDLER_H_
+#define ICING_INDEX_INTEGER_SECTION_INDEXING_HANDLER_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/data-indexing-handler.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/store/document-id.h"
+#include "icing/util/clock.h"
+#include "icing/util/tokenized-document.h"
+
+namespace icing {
+namespace lib {
+
+class IntegerSectionIndexingHandler : public DataIndexingHandler {
+ public:
+ // Creates an IntegerSectionIndexingHandler instance which does not take
+ // ownership of any input components. All pointers must refer to valid objects
+ // that outlive the created IntegerSectionIndexingHandler instance.
+ //
+ // Returns:
+ // - An IntegerSectionIndexingHandler instance on success
+ // - FAILED_PRECONDITION_ERROR if any of the input pointers is null
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<IntegerSectionIndexingHandler>>
+ Create(const Clock* clock, NumericIndex<int64_t>* integer_index);
+
+ ~IntegerSectionIndexingHandler() override = default;
+
+ // Handles the integer indexing process: adds hits into the integer index for
+ // all content in tokenized_document.integer_sections.
+ //
+ // Returns:
+ // - OK on success.
+ // - INVALID_ARGUMENT_ERROR if document_id is invalid OR document_id is less
+ // than or equal to the document_id of a previously indexed document in
+ // non-recovery mode.
+ // - Any NumericIndex<int64_t>::Editor errors.
+ libtextclassifier3::Status Handle(
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
+ bool recovery_mode, PutDocumentStatsProto* put_document_stats) override;
+
+ private:
+ explicit IntegerSectionIndexingHandler(const Clock* clock,
+ NumericIndex<int64_t>* integer_index)
+ : DataIndexingHandler(clock), integer_index_(*integer_index) {}
+
+ NumericIndex<int64_t>& integer_index_; // Does not own.
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_INTEGER_SECTION_INDEXING_HANDLER_H_
diff --git a/icing/index/integer-section-indexing-handler_test.cc b/icing/index/integer-section-indexing-handler_test.cc
new file mode 100644
index 0000000..91cc06f
--- /dev/null
+++ b/icing/index/integer-section-indexing-handler_test.cc
@@ -0,0 +1,601 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/integer-section-indexing-handler.h"
+
+#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/integer-index.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/tokenized-document.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::IsTrue;
+
+// Indexable properties (sections) and section ids. Section ids are determined by
+// the lexicographical order of indexable property paths.
+// Schema type with indexable properties: FakeType
+// Section id = 0: "body"
+// Section id = 1: "timestamp"
+// Section id = 2: "title"
+static constexpr std::string_view kFakeType = "FakeType";
+static constexpr std::string_view kPropertyBody = "body";
+static constexpr std::string_view kPropertyTimestamp = "timestamp";
+static constexpr std::string_view kPropertyTitle = "title";
+
+static constexpr SectionId kSectionIdTimestamp = 1;
+
+// Schema type with nested indexable properties: NestedType
+// Section id = 0: "name"
+// Section id = 1: "nested.body"
+// Section id = 2: "nested.timestamp"
+// Section id = 3: "nested.title"
+// Section id = 4: "price"
+static constexpr std::string_view kNestedType = "NestedType";
+static constexpr std::string_view kPropertyName = "name";
+static constexpr std::string_view kPropertyNestedDoc = "nested";
+static constexpr std::string_view kPropertyPrice = "price";
+
+static constexpr SectionId kSectionIdNestedTimestamp = 2;
+static constexpr SectionId kSectionIdPrice = 4;
+
+class IntegerSectionIndexingHandlerTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ base_dir_ = GetTestTempDir() + "/icing_test";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ integer_index_working_path_ = base_dir_ + "/integer_index";
+ schema_store_dir_ = base_dir_ + "/schema_store";
+ document_store_dir_ = base_dir_ + "/document_store";
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ integer_index_,
+ IntegerIndex::Create(filesystem_, integer_index_working_path_,
+ /*num_data_threshold_for_bucket_split=*/65536,
+ /*pre_mapping_fbv=*/false));
+
+ language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
+
+ ASSERT_THAT(
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()),
+ IsTrue());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kFakeType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyTitle)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyBody)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyTimestamp)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kNestedType)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyNestedDoc)
+ .SetDataTypeDocument(
+ kFakeType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyPrice)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyName)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ASSERT_TRUE(
+ filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult doc_store_create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false,
+ /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(doc_store_create_result.document_store);
+ }
+
+ void TearDown() override {
+ document_store_.reset();
+ schema_store_.reset();
+ lang_segmenter_.reset();
+ integer_index_.reset();
+
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ FakeClock fake_clock_;
+ std::string base_dir_;
+ std::string integer_index_working_path_;
+ std::string schema_store_dir_;
+ std::string document_store_dir_;
+
+ std::unique_ptr<NumericIndex<int64_t>> integer_index_;
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> document_store_;
+};
+
+std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
+ std::vector<DocHitInfo> infos;
+ while (iterator->Advance().ok()) {
+ infos.push_back(iterator->doc_hit_info());
+ }
+ return infos;
+}
+
+TEST_F(IntegerSectionIndexingHandlerTest, CreationWithNullPointerShouldFail) {
+ EXPECT_THAT(IntegerSectionIndexingHandler::Create(/*clock=*/nullptr,
+ integer_index_.get()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+
+ EXPECT_THAT(IntegerSectionIndexingHandler::Create(&fake_clock_,
+ /*integer_index=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_F(IntegerSectionIndexingHandlerTest, HandleIntegerSection) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle), "title")
+ .AddStringProperty(std::string(kPropertyBody), "body")
+ .AddInt64Property(std::string(kPropertyTimestamp), 123)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(tokenized_document.document()));
+
+ ASSERT_THAT(integer_index_->last_added_document_id(), Eq(kInvalidDocumentId));
+ // Handle document.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerSectionIndexingHandler> handler,
+ IntegerSectionIndexingHandler::Create(&fake_clock_,
+ integer_index_.get()));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(document_id));
+
+ // Query "timestamp".
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ integer_index_->GetIterator(
+ kPropertyTimestamp, /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *document_store_,
+ *schema_store_, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ document_id, std::vector<SectionId>{kSectionIdTimestamp})));
+}
+
+TEST_F(IntegerSectionIndexingHandlerTest, HandleNestedIntegerSection) {
+ DocumentProto nested_document =
+ DocumentBuilder()
+ .SetKey("icing", "nested_type/1")
+ .SetSchema(std::string(kNestedType))
+ .AddDocumentProperty(
+ std::string(kPropertyNestedDoc),
+ DocumentBuilder()
+ .SetKey("icing", "nested_fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle),
+ "nested title")
+ .AddStringProperty(std::string(kPropertyBody), "nested body")
+ .AddInt64Property(std::string(kPropertyTimestamp), 123)
+ .Build())
+ .AddInt64Property(std::string(kPropertyPrice), 456)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(nested_document)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(tokenized_document.document()));
+
+ ASSERT_THAT(integer_index_->last_added_document_id(), Eq(kInvalidDocumentId));
+ // Handle nested_document.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerSectionIndexingHandler> handler,
+ IntegerSectionIndexingHandler::Create(&fake_clock_,
+ integer_index_.get()));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(document_id));
+
+ // Query "nested.timestamp".
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ integer_index_->GetIterator(
+ "nested.timestamp", /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *document_store_,
+ *schema_store_, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ document_id, std::vector<SectionId>{kSectionIdNestedTimestamp})));
+
+ // Query "price".
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr,
+ integer_index_->GetIterator(
+ kPropertyPrice, /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *document_store_,
+ *schema_store_, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ document_id, std::vector<SectionId>{kSectionIdPrice})));
+
+ // Query "timestamp". Should get empty result.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr,
+ integer_index_->GetIterator(
+ kPropertyTimestamp, /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *document_store_,
+ *schema_store_, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
+
+TEST_F(IntegerSectionIndexingHandlerTest, HandleShouldSkipEmptyIntegerSection) {
+ // Create a FakeType document without "timestamp".
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle), "title")
+ .AddStringProperty(std::string(kPropertyBody), "body")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(tokenized_document.document()));
+
+ ASSERT_THAT(integer_index_->last_added_document_id(), Eq(kInvalidDocumentId));
+ // Handle document. Index data should remain unchanged since there are no
+ // indexable integers, but last_added_document_id should be updated.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerSectionIndexingHandler> handler,
+ IntegerSectionIndexingHandler::Create(&fake_clock_,
+ integer_index_.get()));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(document_id));
+
+ // Query "timestamp". Should get empty result.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ integer_index_->GetIterator(
+ kPropertyTimestamp, /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *document_store_,
+ *schema_store_, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
+
+TEST_F(IntegerSectionIndexingHandlerTest,
+ HandleInvalidDocumentIdShouldReturnInvalidArgumentError) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle), "title")
+ .AddStringProperty(std::string(kPropertyBody), "body")
+ .AddInt64Property(std::string(kPropertyTimestamp), 123)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document)));
+ ICING_ASSERT_OK(document_store_->Put(tokenized_document.document()));
+
+ static constexpr DocumentId kCurrentDocumentId = 3;
+ integer_index_->set_last_added_document_id(kCurrentDocumentId);
+ ASSERT_THAT(integer_index_->last_added_document_id(), Eq(kCurrentDocumentId));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerSectionIndexingHandler> handler,
+ IntegerSectionIndexingHandler::Create(&fake_clock_,
+ integer_index_.get()));
+
+ // Handling document with kInvalidDocumentId should cause a failure, and both
+ // index data and last_added_document_id should remain unchanged.
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kInvalidDocumentId,
+ /*recovery_mode=*/false, /*put_document_stats=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kCurrentDocumentId));
+
+ // Query "timestamp". Should get empty result.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ integer_index_->GetIterator(
+ kPropertyTimestamp, /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *document_store_,
+ *schema_store_, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+
+ // Recovery mode should get the same result.
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kInvalidDocumentId,
+ /*recovery_mode=*/true, /*put_document_stats=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(kCurrentDocumentId));
+
+ // Query "timestamp". Should get empty result.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr,
+ integer_index_->GetIterator(
+ kPropertyTimestamp, /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *document_store_,
+ *schema_store_, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
+
+TEST_F(IntegerSectionIndexingHandlerTest,
+ HandleOutOfOrderDocumentIdShouldReturnInvalidArgumentError) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle), "title")
+ .AddStringProperty(std::string(kPropertyBody), "body")
+ .AddInt64Property(std::string(kPropertyTimestamp), 123)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(tokenized_document.document()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerSectionIndexingHandler> handler,
+ IntegerSectionIndexingHandler::Create(&fake_clock_,
+ integer_index_.get()));
+
+ // Handling document with document_id == last_added_document_id should cause a
+ // failure, and both index data and last_added_document_id should remain
+ // unchanged.
+ integer_index_->set_last_added_document_id(document_id);
+ ASSERT_THAT(integer_index_->last_added_document_id(), Eq(document_id));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(document_id));
+
+ // Query "timestamp". Should get empty result.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ integer_index_->GetIterator(
+ kPropertyTimestamp, /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *document_store_,
+ *schema_store_, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+
+ // Handling document with document_id < last_added_document_id should cause a
+ // failure, and both index data and last_added_document_id should remain
+ // unchanged.
+ integer_index_->set_last_added_document_id(document_id + 1);
+ ASSERT_THAT(integer_index_->last_added_document_id(), Eq(document_id + 1));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(document_id + 1));
+
+ // Query "timestamp". Should get empty result.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr,
+ integer_index_->GetIterator(
+ kPropertyTimestamp, /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *document_store_,
+ *schema_store_, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
+
+TEST_F(IntegerSectionIndexingHandlerTest,
+ HandleRecoveryModeShouldIgnoreDocsLELastAddedDocId) {
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle), "title one")
+ .AddStringProperty(std::string(kPropertyBody), "body one")
+ .AddInt64Property(std::string(kPropertyTimestamp), 123)
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/2")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle), "title two")
+ .AddStringProperty(std::string(kPropertyBody), "body two")
+ .AddInt64Property(std::string(kPropertyTimestamp), 456)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document1,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document2,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(tokenized_document1.document()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(tokenized_document2.document()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerSectionIndexingHandler> handler,
+ IntegerSectionIndexingHandler::Create(&fake_clock_,
+ integer_index_.get()));
+
+ // Handle document with document_id > last_added_document_id in recovery mode.
+ // The handler should index this document and update last_added_document_id.
+ EXPECT_THAT(
+ handler->Handle(tokenized_document1, document_id1, /*recovery_mode=*/true,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(document_id1));
+
+ // Query "timestamp".
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ integer_index_->GetIterator(
+ kPropertyTimestamp, /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *document_store_,
+ *schema_store_, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ document_id1, std::vector<SectionId>{kSectionIdTimestamp})));
+
+ // Handle document with document_id == last_added_document_id in recovery
+ // mode. We should not get any error, but the handler should ignore the
+ // document, so both index data and last_added_document_id should remain
+ // unchanged.
+ integer_index_->set_last_added_document_id(document_id2);
+ ASSERT_THAT(integer_index_->last_added_document_id(), Eq(document_id2));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document2, document_id2, /*recovery_mode=*/true,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(document_id2));
+
+ // Query "timestamp". Should not get hits for document2.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr,
+ integer_index_->GetIterator(
+ kPropertyTimestamp, /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *document_store_,
+ *schema_store_, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ document_id1, std::vector<SectionId>{kSectionIdTimestamp})));
+
+ // Handle document with document_id < last_added_document_id in recovery mode.
+ // We should not get any error, but the handler should ignore the document, so
+ // both index data and last_added_document_id should remain unchanged.
+ integer_index_->set_last_added_document_id(document_id2 + 1);
+ ASSERT_THAT(integer_index_->last_added_document_id(), Eq(document_id2 + 1));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document2, document_id2, /*recovery_mode=*/true,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(integer_index_->last_added_document_id(), Eq(document_id2 + 1));
+
+ // Query "timestamp". Should not get hits for document2.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr,
+ integer_index_->GetIterator(
+ kPropertyTimestamp, /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *document_store_,
+ *schema_store_, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(
+ document_id1, std::vector<SectionId>{kSectionIdTimestamp})));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-all-document-id.cc b/icing/index/iterator/doc-hit-info-iterator-all-document-id.cc
index e75ed87..1917fd0 100644
--- a/icing/index/iterator/doc-hit-info-iterator-all-document-id.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-all-document-id.cc
@@ -32,7 +32,6 @@ libtextclassifier3::Status DocHitInfoIteratorAllDocumentId::Advance() {
if (!IsDocumentIdValid(current_document_id_)) {
// Reached the end, set these to invalid values and return
doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
- hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
return absl_ports::ResourceExhaustedError(
"No more DocHitInfos in iterator");
}
@@ -40,5 +39,12 @@ libtextclassifier3::Status DocHitInfoIteratorAllDocumentId::Advance() {
return libtextclassifier3::Status::OK;
}
+libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
+DocHitInfoIteratorAllDocumentId::TrimRightMostNode() && {
+ // The all-document-id node is always trimmed away entirely.
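+ // Returning a null iterator_ signals to callers (e.g. DocHitInfoIteratorAnd)
+ // that the entire subtree was trimmed away.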
+ TrimmedNode node = {nullptr, /*term=*/"", /*term_start_index_=*/0,
+ /*unnormalized_term_length_=*/0};
+ return node;
+}
} // namespace lib
} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-all-document-id.h b/icing/index/iterator/doc-hit-info-iterator-all-document-id.h
index 0fa74f5..60c5e0c 100644
--- a/icing/index/iterator/doc-hit-info-iterator-all-document-id.h
+++ b/icing/index/iterator/doc-hit-info-iterator-all-document-id.h
@@ -35,10 +35,18 @@ class DocHitInfoIteratorAllDocumentId : public DocHitInfoIterator {
libtextclassifier3::Status Advance() override;
- int32_t GetNumBlocksInspected() const override { return 0; }
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
- int32_t GetNumLeafAdvanceCalls() const override {
- return document_id_limit_ - current_document_id_;
+ void MapChildren(const ChildrenMapper& mapper) override {}
+
+ CallStats GetCallStats() const override {
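+ // Every Advance() counts as one no-index leaf advance; this iterator
+ // synthesizes document ids without touching any index storage.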
+ return CallStats(
+ /*num_leaf_advance_calls_lite_index_in=*/0,
+ /*num_leaf_advance_calls_main_index_in=*/0,
+ /*num_leaf_advance_calls_integer_index_in=*/0,
+ /*num_leaf_advance_calls_no_index_in=*/document_id_limit_ -
+ current_document_id_,
+ /*num_blocks_inspected_in=*/0);
}
std::string ToString() const override {
diff --git a/icing/index/iterator/doc-hit-info-iterator-all-document-id_test.cc b/icing/index/iterator/doc-hit-info-iterator-all-document-id_test.cc
index 7366b97..379cb4d 100644
--- a/icing/index/iterator/doc-hit-info-iterator-all-document-id_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-all-document-id_test.cc
@@ -32,6 +32,7 @@ namespace {
using ::testing::ElementsAreArray;
using ::testing::Eq;
+using ::testing::IsNull;
using ::testing::Not;
TEST(DocHitInfoIteratorAllDocumentIdTest, Initialize) {
@@ -40,9 +41,8 @@ TEST(DocHitInfoIteratorAllDocumentIdTest, Initialize) {
// We'll always start with an invalid document_id, need to Advance before we
// get anything out of this.
- EXPECT_THAT(all_it.doc_hit_info().document_id(), Eq(kInvalidDocumentId));
- EXPECT_THAT(all_it.hit_intersect_section_ids_mask(),
- Eq(kSectionIdMaskNone));
+ EXPECT_THAT(all_it.doc_hit_info(),
+ EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>{}));
}
{
@@ -53,26 +53,25 @@ TEST(DocHitInfoIteratorAllDocumentIdTest, Initialize) {
}
}
-TEST(DocHitInfoIteratorAllDocumentIdTest, GetNumBlocksInspected) {
+TEST(DocHitInfoIteratorAllDocumentIdTest, GetCallStats) {
DocHitInfoIteratorAllDocumentId all_it(100);
- EXPECT_THAT(all_it.GetNumBlocksInspected(), Eq(0));
-
- // Number of iterations is chosen arbitrarily. Just meant to demonstrate that
- // no matter how many Advance calls are made, GetNumBlocksInspected should
- // always return 0.
- for (int i = 0; i < 5; ++i) {
- EXPECT_THAT(all_it.Advance(), IsOk());
- EXPECT_THAT(all_it.GetNumBlocksInspected(), Eq(0));
- }
-}
-
-TEST(DocHitInfoIteratorAllDocumentIdTest, GetNumLeafAdvanceCalls) {
- DocHitInfoIteratorAllDocumentId all_it(100);
- EXPECT_THAT(all_it.GetNumLeafAdvanceCalls(), Eq(0));
+ EXPECT_THAT(
+ all_it.GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/0));
for (int i = 1; i <= 5; ++i) {
EXPECT_THAT(all_it.Advance(), IsOk());
- EXPECT_THAT(all_it.GetNumLeafAdvanceCalls(), Eq(i));
+ EXPECT_THAT(
+ all_it.GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/i, /*num_blocks_inspected=*/0));
}
}
@@ -86,12 +85,8 @@ TEST(DocHitInfoIteratorAllDocumentIdTest, Advance) {
// Test one advance
DocHitInfoIteratorAllDocumentId all_it(5);
EXPECT_THAT(all_it.Advance(), IsOk());
- EXPECT_THAT(all_it.doc_hit_info().document_id(), Eq(5));
-
- // Advancing shouldn't affect the intersect section ids mask, since there's
- // no intersecting going on
- EXPECT_THAT(all_it.hit_intersect_section_ids_mask(),
- Eq(kSectionIdMaskNone));
+ EXPECT_THAT(all_it.doc_hit_info(),
+ EqualsDocHitInfo(5, std::vector<SectionId>{}));
}
{
@@ -108,6 +103,16 @@ TEST(DocHitInfoIteratorAllDocumentIdTest, Advance) {
}
}
+TEST(DocHitInfoIteratorAllDocumentIdTest, TrimAllDocumentIdIterator) {
+ DocHitInfoIteratorAllDocumentId all_it(100);
+ ICING_ASSERT_OK_AND_ASSIGN(DocHitInfoIterator::TrimmedNode trimmed_node,
+ std::move(all_it).TrimRightMostNode());
+ // The whole iterator is trimmed.
+ EXPECT_THAT(trimmed_node.term_, testing::IsEmpty());
+ EXPECT_THAT(trimmed_node.term_start_index_, Eq(0));
+ EXPECT_THAT(trimmed_node.iterator_, IsNull());
+}
+
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-and.cc b/icing/index/iterator/doc-hit-info-iterator-and.cc
index f224583..249bd0e 100644
--- a/icing/index/iterator/doc-hit-info-iterator-and.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-and.cc
@@ -14,8 +14,7 @@
#include "icing/index/iterator/doc-hit-info-iterator-and.h"
-#include <stddef.h>
-
+#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
@@ -38,8 +37,6 @@ namespace {
// When combining ANDed iterators, n-ary operator has better performance when
// number of operands > 3 according to benchmark cl/243720660
-// TODO (samzheng): Tune this number when it's necessary, e.g. implementation
-// changes.
inline constexpr int kBinaryAndIteratorPerformanceThreshold = 3;
// The minimum number of iterators needed to construct a And iterator. The And
@@ -58,11 +55,12 @@ std::unique_ptr<DocHitInfoIterator> CreateAndIterator(
if (iterators.size() <= kBinaryAndIteratorPerformanceThreshold &&
iterators.size() >= kMinBinaryIterators) {
// Accumulate the iterators that need to be ANDed together.
- iterator = std::move(iterators.at(0));
- for (size_t i = 1; i < iterators.size(); ++i) {
+ iterator = std::move(iterators.at(iterators.size() - 1));
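+ // Fold right to left: the last two iterators form the innermost AND, and
+ // each earlier iterator becomes the short_it of a new enclosing AND.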
+ for (int i = iterators.size() - 2; i >= 0; --i) {
std::unique_ptr<DocHitInfoIterator> temp_iterator = std::move(iterator);
iterator = std::make_unique<DocHitInfoIteratorAnd>(
- std::move(temp_iterator), std::move(iterators[i]));
+ /*short_it=*/std::move(iterators[i]),
+ /*long_it=*/std::move(temp_iterator));
}
} else {
// If the vector is too small, the AndNary iterator can handle it and return
@@ -85,7 +83,6 @@ libtextclassifier3::Status DocHitInfoIteratorAnd::Advance() {
// Didn't find anything for the first iterator, reset to invalid values and
// return.
doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
- hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
return absl_ports::ResourceExhaustedError(
"No more DocHitInfos in iterator");
}
@@ -107,18 +104,21 @@ libtextclassifier3::Status DocHitInfoIteratorAnd::Advance() {
// Guaranteed that short_doc_id and long_doc_id match now
doc_hit_info_ = short_->doc_hit_info();
- doc_hit_info_.MergeSectionsFrom(long_->doc_hit_info());
- hit_intersect_section_ids_mask_ = short_->hit_intersect_section_ids_mask() &
- long_->hit_intersect_section_ids_mask();
+ doc_hit_info_.MergeSectionsFrom(long_->doc_hit_info().hit_section_ids_mask());
return libtextclassifier3::Status::OK;
}
-int32_t DocHitInfoIteratorAnd::GetNumBlocksInspected() const {
- return short_->GetNumBlocksInspected() + long_->GetNumBlocksInspected();
-}
-
-int32_t DocHitInfoIteratorAnd::GetNumLeafAdvanceCalls() const {
- return short_->GetNumLeafAdvanceCalls() + long_->GetNumLeafAdvanceCalls();
+libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
+DocHitInfoIteratorAnd::TrimRightMostNode() && {
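+ // The right-most node lives under long_. If trimming consumes long_
+ // entirely, short_ alone remains; otherwise re-wrap both in a new AND.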
+ ICING_ASSIGN_OR_RETURN(TrimmedNode trimmed_long,
+ std::move(*long_).TrimRightMostNode());
+ if (trimmed_long.iterator_ == nullptr) {
+ trimmed_long.iterator_ = std::move(short_);
+ } else {
+ trimmed_long.iterator_ = std::make_unique<DocHitInfoIteratorAnd>(
+ std::move(short_), std::move(trimmed_long.iterator_));
+ }
+ return trimmed_long;
}
std::string DocHitInfoIteratorAnd::ToString() const {
@@ -141,7 +141,6 @@ libtextclassifier3::Status DocHitInfoIteratorAndNary::Advance() {
// Didn't find anything for the first iterator, reset to invalid values and
// return
doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
- hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
return absl_ports::ResourceExhaustedError(
"No more DocHitInfos in iterator");
}
@@ -164,6 +163,7 @@ libtextclassifier3::Status DocHitInfoIteratorAndNary::Advance() {
DocumentId unused;
ICING_ASSIGN_OR_RETURN(
unused, AdvanceTo(iterator.get(), potential_document_id));
+ (void)unused; // Silence unused warning.
}
if (iterator->doc_hit_info().document_id() == potential_document_id) {
@@ -184,31 +184,41 @@ libtextclassifier3::Status DocHitInfoIteratorAndNary::Advance() {
// Found a DocumentId which exists in all the iterators
doc_hit_info_ = iterators_.at(0)->doc_hit_info();
- hit_intersect_section_ids_mask_ =
- iterators_.at(0)->hit_intersect_section_ids_mask();
for (size_t i = 1; i < iterators_.size(); i++) {
- doc_hit_info_.MergeSectionsFrom(iterators_.at(i)->doc_hit_info());
- hit_intersect_section_ids_mask_ &=
- iterators_.at(i)->hit_intersect_section_ids_mask();
+ doc_hit_info_.MergeSectionsFrom(
+ iterators_.at(i)->doc_hit_info().hit_section_ids_mask());
}
return libtextclassifier3::Status::OK;
}
-int32_t DocHitInfoIteratorAndNary::GetNumBlocksInspected() const {
- int32_t blockCount = 0;
- for (const std::unique_ptr<DocHitInfoIterator>& iter : iterators_) {
- blockCount += iter->GetNumBlocksInspected();
+libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
+DocHitInfoIteratorAndNary::TrimRightMostNode() && {
+ ICING_ASSIGN_OR_RETURN(
+ TrimmedNode trimmed_right,
+ std::move(*iterators_.rbegin()->get()).TrimRightMostNode());
+ if (trimmed_right.iterator_ == nullptr) {
+ if (iterators_.size() > 2) {
+ iterators_.pop_back();
+ trimmed_right.iterator_ =
+ std::make_unique<DocHitInfoIteratorAndNary>(std::move(iterators_));
+ } else if (iterators_.size() == 2) {
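+ // Only one child remains once the trimmed child is dropped, so return it
+ // directly instead of wrapping it in a new n-ary AND.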
+ trimmed_right.iterator_ = std::move(iterators_.at(0));
+ }
+ } else {
+ iterators_.at(iterators_.size() - 1) = std::move(trimmed_right.iterator_);
+ trimmed_right.iterator_ =
+ std::make_unique<DocHitInfoIteratorAndNary>(std::move(iterators_));
}
- return blockCount;
+ return trimmed_right;
}
-int32_t DocHitInfoIteratorAndNary::GetNumLeafAdvanceCalls() const {
- int32_t leafCount = 0;
- for (const std::unique_ptr<DocHitInfoIterator>& iter : iterators_) {
- leafCount += iter->GetNumLeafAdvanceCalls();
+DocHitInfoIterator::CallStats DocHitInfoIteratorAndNary::GetCallStats() const {
+ CallStats call_stats;
+ for (const auto& iter : iterators_) {
+ call_stats += iter->GetCallStats();
}
- return leafCount;
+ return call_stats;
}
std::string DocHitInfoIteratorAndNary::ToString() const {
diff --git a/icing/index/iterator/doc-hit-info-iterator-and.h b/icing/index/iterator/doc-hit-info-iterator-and.h
index 4618fb9..8c52ac9 100644
--- a/icing/index/iterator/doc-hit-info-iterator-and.h
+++ b/icing/index/iterator/doc-hit-info-iterator-and.h
@@ -18,6 +18,7 @@
#include <cstdint>
#include <memory>
#include <string>
+#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
@@ -40,12 +41,32 @@ class DocHitInfoIteratorAnd : public DocHitInfoIterator {
std::unique_ptr<DocHitInfoIterator> long_it);
libtextclassifier3::Status Advance() override;
- int32_t GetNumBlocksInspected() const override;
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
- int32_t GetNumLeafAdvanceCalls() const override;
+ CallStats GetCallStats() const override {
+ return short_->GetCallStats() + long_->GetCallStats();
+ }
std::string ToString() const override;
+ void MapChildren(const ChildrenMapper& mapper) override {
+ short_ = mapper(std::move(short_));
+ long_ = mapper(std::move(long_));
+ }
+
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ short_->PopulateMatchedTermsStats(matched_terms_stats,
+ filtering_section_mask);
+ long_->PopulateMatchedTermsStats(matched_terms_stats,
+ filtering_section_mask);
+ }
+
private:
std::unique_ptr<DocHitInfoIterator> short_;
std::unique_ptr<DocHitInfoIterator> long_;
@@ -61,12 +82,31 @@ class DocHitInfoIteratorAndNary : public DocHitInfoIterator {
libtextclassifier3::Status Advance() override;
- int32_t GetNumBlocksInspected() const override;
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
- int32_t GetNumLeafAdvanceCalls() const override;
+ CallStats GetCallStats() const override;
std::string ToString() const override;
+ void MapChildren(const ChildrenMapper& mapper) override {
+ for (size_t i = 0; i < iterators_.size(); ++i) {
+ iterators_[i] = mapper(std::move(iterators_[i]));
+ }
+ }
+
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ for (size_t i = 0; i < iterators_.size(); ++i) {
+ iterators_.at(i)->PopulateMatchedTermsStats(matched_terms_stats,
+ filtering_section_mask);
+ }
+ }
+
private:
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators_;
};
diff --git a/icing/index/iterator/doc-hit-info-iterator-and_test.cc b/icing/index/iterator/doc-hit-info-iterator-and_test.cc
index 35574b7..f204ada 100644
--- a/icing/index/iterator/doc-hit-info-iterator-and_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-and_test.cc
@@ -74,39 +74,33 @@ TEST(DocHitInfoIteratorAndTest, Initialize) {
std::make_unique<DocHitInfoIteratorDummy>());
// We start out with invalid values
- EXPECT_THAT(and_iter.doc_hit_info(), Eq(DocHitInfo(kInvalidDocumentId)));
- EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(),
- Eq(kSectionIdMaskNone));
+ EXPECT_THAT(and_iter.doc_hit_info(),
+ EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>{}));
}
-TEST(DocHitInfoIteratorAndTest, GetNumBlocksInspected) {
- int first_iter_blocks = 4; // arbitrary value
+TEST(DocHitInfoIteratorAndTest, GetCallStats) {
+ DocHitInfoIterator::CallStats first_iter_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/2,
+ /*num_leaf_advance_calls_main_index_in=*/5,
+ /*num_leaf_advance_calls_integer_index_in=*/3,
+ /*num_leaf_advance_calls_no_index_in=*/1,
+ /*num_blocks_inspected_in=*/4); // arbitrary value
auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
- first_iter->SetNumBlocksInspected(first_iter_blocks);
-
- int second_iter_blocks = 7; // arbitrary value
+ first_iter->SetCallStats(first_iter_call_stats);
+
+ DocHitInfoIterator::CallStats second_iter_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/6,
+ /*num_leaf_advance_calls_main_index_in=*/2,
+ /*num_leaf_advance_calls_integer_index_in=*/10,
+ /*num_leaf_advance_calls_no_index_in=*/3,
+ /*num_blocks_inspected_in=*/7); // arbitrary value
auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
- second_iter->SetNumBlocksInspected(second_iter_blocks);
+ second_iter->SetCallStats(second_iter_call_stats);
DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter));
- EXPECT_THAT(and_iter.GetNumBlocksInspected(),
- Eq(first_iter_blocks + second_iter_blocks));
-}
-
-TEST(DocHitInfoIteratorAndTest, GetNumLeafAdvanceCalls) {
- int first_iter_leaves = 4; // arbitrary value
- auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
- first_iter->SetNumLeafAdvanceCalls(first_iter_leaves);
-
- int second_iter_leaves = 7; // arbitrary value
- auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
- second_iter->SetNumLeafAdvanceCalls(second_iter_leaves);
-
- DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter));
-
- EXPECT_THAT(and_iter.GetNumLeafAdvanceCalls(),
- Eq(first_iter_leaves + second_iter_leaves));
+ EXPECT_THAT(and_iter.GetCallStats(),
+ Eq(first_iter_call_stats + second_iter_call_stats));
}
TEST(DocHitInfoIteratorAndTest, AdvanceNoOverlap) {
@@ -171,29 +165,257 @@ TEST(DocHitInfoIteratorAndTest, AdvanceNestedIterators) {
EXPECT_THAT(GetDocumentIds(outer_iter.get()), ElementsAre(10, 6, 2));
}
+TEST(DocHitInfoIteratorAndTest, TrimAndIterator) {
+ std::vector<DocHitInfo> left_vector = {DocHitInfo(3), DocHitInfo(2)};
+ std::vector<DocHitInfo> right_vector = {DocHitInfo(1), DocHitInfo(0)};
+
+ std::unique_ptr<DocHitInfoIterator> left_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(left_vector);
+ std::unique_ptr<DocHitInfoIterator> right_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(right_vector, "term", 10);
+
+ std::unique_ptr<DocHitInfoIterator> iter =
+ std::make_unique<DocHitInfoIteratorAnd>(std::move(left_iter),
+ std::move(right_iter));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocHitInfoIterator::TrimmedNode trimmed_node,
+ std::move(*iter).TrimRightMostNode());
+ EXPECT_THAT(trimmed_node.term_, Eq("term"));
+ EXPECT_THAT(trimmed_node.term_start_index_, Eq(10));
+ EXPECT_THAT(GetDocumentIds(trimmed_node.iterator_.get()), ElementsAre(3, 2));
+}
+
+TEST(DocHitInfoIteratorAndTest, TrimAndIterator_TwoLayer) {
+ // Build an iterator tree like:
+ //
+ // AND
+ // / \
+ // first AND
+ // | / \
+ // {0, 1} second third
+ // | |
+ // {1} {0}
+ std::vector<DocHitInfo> first_vector = {DocHitInfo(1), DocHitInfo(0)};
+ std::vector<DocHitInfo> second_vector = {DocHitInfo(1)};
+ std::vector<DocHitInfo> third_vector = {DocHitInfo(0)};
+
+ std::unique_ptr<DocHitInfoIterator> first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector);
+ std::unique_ptr<DocHitInfoIterator> second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector);
+ std::unique_ptr<DocHitInfoIterator> third_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(third_vector, "term", 10);
+
+ std::unique_ptr<DocHitInfoIterator> nested_iter =
+ std::make_unique<DocHitInfoIteratorAnd>(std::move(second_iter),
+ std::move(third_iter));
+ std::unique_ptr<DocHitInfoIterator> iter =
+ std::make_unique<DocHitInfoIteratorAnd>(std::move(first_iter),
+ std::move(nested_iter));
+
+ // The third_iter is trimmed.
+ // AND
+ // / \
+ // first second
+ // | |
+ // {0, 1} {1}
+ ICING_ASSERT_OK_AND_ASSIGN(DocHitInfoIterator::TrimmedNode trimmed_node,
+ std::move(*iter).TrimRightMostNode());
+ EXPECT_THAT(GetDocumentIds(trimmed_node.iterator_.get()), ElementsAre(1));
+ EXPECT_THAT(trimmed_node.term_, Eq("term"));
+ EXPECT_THAT(trimmed_node.term_start_index_, Eq(10));
+}
+
+TEST(DocHitInfoIteratorAndNaryTest, TrimAndNaryIterator) {
+ std::vector<DocHitInfo> first_vector = {DocHitInfo(2), DocHitInfo(1),
+ DocHitInfo(0)};
+ std::vector<DocHitInfo> second_vector = {DocHitInfo(2), DocHitInfo(1)};
+ std::vector<DocHitInfo> third_vector = {DocHitInfo(2)};
+
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(first_vector));
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(second_vector));
+ iterators.push_back(
+ std::make_unique<DocHitInfoIteratorDummy>(third_vector, "term", 10));
+
+ std::unique_ptr<DocHitInfoIterator> iter =
+ std::make_unique<DocHitInfoIteratorAndNary>(std::move(iterators));
+
+ // The third iterator is trimmed.
+ ICING_ASSERT_OK_AND_ASSIGN(DocHitInfoIterator::TrimmedNode trimmed_node,
+ std::move(*iter).TrimRightMostNode());
+ EXPECT_THAT(trimmed_node.term_, Eq("term"));
+ EXPECT_THAT(trimmed_node.term_start_index_, Eq(10));
+ EXPECT_THAT(GetDocumentIds(trimmed_node.iterator_.get()), ElementsAre(2, 1));
+}
+
+TEST(DocHitInfoIteratorAndNaryTest, TrimAndNaryIterator_TwoLayer) {
+ std::vector<DocHitInfo> first_vector = {DocHitInfo(3), DocHitInfo(2),
+ DocHitInfo(1), DocHitInfo(0)};
+ std::vector<DocHitInfo> second_vector = {DocHitInfo(2), DocHitInfo(1),
+ DocHitInfo(0)};
+ std::vector<DocHitInfo> third_vector = {DocHitInfo(1), DocHitInfo(0)};
+ std::vector<DocHitInfo> fourth_vector = {DocHitInfo(0)};
+
+ // Build nested iterator
+ std::unique_ptr<DocHitInfoIterator> third_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(third_vector);
+ std::unique_ptr<DocHitInfoIterator> fourth_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(fourth_vector, "term", 10);
+ std::unique_ptr<DocHitInfoIterator> nested_iter =
+ std::make_unique<DocHitInfoIteratorAnd>(std::move(third_iter),
+ std::move(fourth_iter));
+
+ // Build outer iterator
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(first_vector));
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(second_vector));
+ iterators.push_back(std::move(nested_iter));
+ std::unique_ptr<DocHitInfoIterator> iter =
+ std::make_unique<DocHitInfoIteratorAndNary>(std::move(iterators));
+
+ // The fourth iterator is trimmed.
+ ICING_ASSERT_OK_AND_ASSIGN(DocHitInfoIterator::TrimmedNode trimmed_node,
+ std::move(*iter).TrimRightMostNode());
+ EXPECT_THAT(trimmed_node.term_, Eq("term"));
+ EXPECT_THAT(trimmed_node.term_start_index_, Eq(10));
+ EXPECT_THAT(GetDocumentIds(trimmed_node.iterator_.get()), ElementsAre(1, 0));
+}
+
TEST(DocHitInfoIteratorAndTest, SectionIdMask) {
// Arbitrary section ids for the documents in the DocHitInfoIterators.
// Created to test correct section_id_mask behavior.
SectionIdMask section_id_mask1 = 0b01010101; // hits in sections 0, 2, 4, 6
SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2
- SectionIdMask mask_anded_result = 0b00000100;
SectionIdMask mask_ored_result = 0b01010111;
std::vector<DocHitInfo> first_vector = {DocHitInfo(4, section_id_mask1)};
std::vector<DocHitInfo> second_vector = {DocHitInfo(4, section_id_mask2)};
auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector);
- first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+ first_iter->set_hit_section_ids_mask(section_id_mask1);
auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector);
- second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+ second_iter->set_hit_section_ids_mask(section_id_mask2);
DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter));
ICING_EXPECT_OK(and_iter.Advance());
EXPECT_THAT(and_iter.doc_hit_info().hit_section_ids_mask(),
Eq(mask_ored_result));
- EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
+}
+
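A quick standalone check of the mask arithmetic this test pins down: bit i of a SectionIdMask marks a hit in section i, and the AND iterator's doc_hit_info carries the OR (union) of its children's masks. The uint64_t width here is an assumption for illustration:

    #include <cassert>
    #include <cstdint>

    using SectionIdMask = uint64_t;  // width is an assumption

    int main() {
      SectionIdMask mask1 = 0b01010101;  // hits in sections 0, 2, 4, 6
      SectionIdMask mask2 = 0b00000110;  // hits in sections 1, 2
      // The combined doc_hit_info reports the union of the children's
      // per-document section masks, as the test above asserts.
      assert((mask1 | mask2) == 0b01010111);
      return 0;
    }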
+TEST(DocHitInfoIteratorAndTest, PopulateMatchedTermsStats) {
+ {
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(4);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+ doc_hit_info1.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/3);
+ doc_hit_info1.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+
+ SectionIdMask section_id_mask1 = 0b01010101; // hits in sections 0, 2, 4, 6
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1 = {{0, 1}, {2, 2}, {4, 3}, {6, 4}};
+
+ DocHitInfoTermFrequencyPair doc_hit_info2 = DocHitInfo(4);
+ doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+ doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+ SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map2 = {{1, 2}, {2, 6}};
+
+ std::vector<DocHitInfoTermFrequencyPair> first_vector = {doc_hit_info1};
+ std::vector<DocHitInfoTermFrequencyPair> second_vector = {doc_hit_info2};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ first_iter->set_hit_section_ids_mask(section_id_mask1);
+
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+ second_iter->set_hit_section_ids_mask(section_id_mask2);
+
+ DocHitInfoIteratorAnd and_iter(std::move(first_iter),
+ std::move(second_iter));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(and_iter.Advance());
+ EXPECT_THAT(and_iter.doc_hit_info().document_id(), Eq(4));
+
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(
+ EqualsTermMatchInfo("hi", expected_section_ids_tf_map1),
+ EqualsTermMatchInfo("hello", expected_section_ids_tf_map2)));
+
+ EXPECT_FALSE(and_iter.Advance().ok());
+ }
+ {
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(4);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+
+ SectionIdMask section_id_mask1 = 0b00000101; // hits in sections 0, 2
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1 = {{0, 1}, {2, 2}};
+
+ std::vector<DocHitInfoTermFrequencyPair> first_vector = {doc_hit_info1};
+ std::vector<DocHitInfoTermFrequencyPair> second_vector = {doc_hit_info1};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ first_iter->set_hit_section_ids_mask(section_id_mask1);
+
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hi");
+ second_iter->set_hit_section_ids_mask(section_id_mask1);
+
+ DocHitInfoIteratorAnd and_iter(std::move(first_iter),
+ std::move(second_iter));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(and_iter.Advance());
+ EXPECT_THAT(and_iter.doc_hit_info().document_id(), Eq(4));
+
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "hi", expected_section_ids_tf_map1)));
+
+ EXPECT_FALSE(and_iter.Advance().ok());
+ }
+}
+
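The PopulateMatchedTermsStats assertions above specify three things: the stats vector stays empty before the first successful Advance(), each child iterator then contributes its term with a section-id-to-term-frequency map, and a term shared by several children is reported once (the "hi" AND "hi" block expects a single entry). A rough stand-in model of that contract, with hypothetical names rather than the real icing API:

    #include <cstdint>
    #include <string>
    #include <unordered_map>
    #include <vector>

    using SectionId = int;          // stand-in
    using TermFrequency = uint8_t;  // stand-in

    struct TermMatchInfoSketch {
      std::string term;
      std::unordered_map<SectionId, TermFrequency> section_tf;
    };

    // Each child contributes (term, per-section frequencies) for the
    // current document; a term already present is not added again.
    void Populate(std::vector<TermMatchInfoSketch>* stats,
                  const std::string& term,
                  const std::unordered_map<SectionId, TermFrequency>& tf) {
      for (const TermMatchInfoSketch& s : *stats) {
        if (s.term == term) return;  // de-duplicate by term
      }
      stats->push_back({term, tf});
    }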
+TEST(DocHitInfoIteratorAndTest, PopulateMatchedTermsStats_NoMatchingDocument) {
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(4);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+
+ DocHitInfoTermFrequencyPair doc_hit_info2 = DocHitInfo(5);
+ doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+ doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+ std::vector<DocHitInfoTermFrequencyPair> first_vector = {doc_hit_info1};
+ std::vector<DocHitInfoTermFrequencyPair> second_vector = {doc_hit_info2};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+
+ DocHitInfoIteratorAnd and_iter(std::move(first_iter), std::move(second_iter));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+ EXPECT_FALSE(and_iter.Advance().ok());
}
TEST(DocHitInfoIteratorAndNaryTest, Initialize) {
@@ -205,9 +427,8 @@ TEST(DocHitInfoIteratorAndNaryTest, Initialize) {
DocHitInfoIteratorAndNary and_iter(std::move(iterators));
// We start out with invalid values
- EXPECT_THAT(and_iter.doc_hit_info(), Eq(DocHitInfo(kInvalidDocumentId)));
- EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(),
- Eq(kSectionIdMaskNone));
+ EXPECT_THAT(and_iter.doc_hit_info(),
+ EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>{}));
}
TEST(DocHitInfoIteratorAndNaryTest, InitializeEmpty) {
@@ -220,22 +441,42 @@ TEST(DocHitInfoIteratorAndNaryTest, InitializeEmpty) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST(DocHitInfoIteratorAndNaryTest, GetNumBlocksInspected) {
- int first_iter_blocks = 4; // arbitrary value
+TEST(DocHitInfoIteratorAndNaryTest, GetCallStats) {
+ DocHitInfoIterator::CallStats first_iter_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/2,
+ /*num_leaf_advance_calls_main_index_in=*/5,
+ /*num_leaf_advance_calls_integer_index_in=*/3,
+ /*num_leaf_advance_calls_no_index_in=*/1,
+ /*num_blocks_inspected_in=*/4); // arbitrary value
auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
- first_iter->SetNumBlocksInspected(first_iter_blocks);
-
- int second_iter_blocks = 7; // arbitrary value
+ first_iter->SetCallStats(first_iter_call_stats);
+
+ DocHitInfoIterator::CallStats second_iter_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/6,
+ /*num_leaf_advance_calls_main_index_in=*/2,
+ /*num_leaf_advance_calls_integer_index_in=*/10,
+ /*num_leaf_advance_calls_no_index_in=*/3,
+ /*num_blocks_inspected_in=*/7); // arbitrary value
auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
- second_iter->SetNumBlocksInspected(second_iter_blocks);
-
- int third_iter_blocks = 13; // arbitrary value
+ second_iter->SetCallStats(second_iter_call_stats);
+
+ DocHitInfoIterator::CallStats third_iter_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/1000,
+ /*num_leaf_advance_calls_main_index_in=*/2000,
+ /*num_leaf_advance_calls_integer_index_in=*/3000,
+ /*num_leaf_advance_calls_no_index_in=*/0,
+ /*num_blocks_inspected_in=*/200); // arbitrary value
auto third_iter = std::make_unique<DocHitInfoIteratorDummy>();
- third_iter->SetNumBlocksInspected(third_iter_blocks);
-
- int fourth_iter_blocks = 1; // arbitrary value
+ third_iter->SetCallStats(third_iter_call_stats);
+
+ DocHitInfoIterator::CallStats fourth_iter_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/200,
+ /*num_leaf_advance_calls_main_index_in=*/400,
+ /*num_leaf_advance_calls_integer_index_in=*/100,
+ /*num_leaf_advance_calls_no_index_in=*/20,
+ /*num_blocks_inspected_in=*/50); // arbitrary value
auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>();
- fourth_iter->SetNumBlocksInspected(fourth_iter_blocks);
+ fourth_iter->SetCallStats(fourth_iter_call_stats);
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
iterators.push_back(std::move(first_iter));
@@ -244,38 +485,9 @@ TEST(DocHitInfoIteratorAndNaryTest, GetNumBlocksInspected) {
iterators.push_back(std::move(fourth_iter));
DocHitInfoIteratorAndNary and_iter(std::move(iterators));
- EXPECT_THAT(and_iter.GetNumBlocksInspected(),
- Eq(first_iter_blocks + second_iter_blocks + third_iter_blocks +
- fourth_iter_blocks));
-}
-
-TEST(DocHitInfoIteratorAndNaryTest, GetNumLeafAdvanceCalls) {
- int first_iter_leaves = 4; // arbitrary value
- auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
- first_iter->SetNumLeafAdvanceCalls(first_iter_leaves);
-
- int second_iter_leaves = 7; // arbitrary value
- auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
- second_iter->SetNumLeafAdvanceCalls(second_iter_leaves);
-
- int third_iter_leaves = 13; // arbitrary value
- auto third_iter = std::make_unique<DocHitInfoIteratorDummy>();
- third_iter->SetNumLeafAdvanceCalls(third_iter_leaves);
-
- int fourth_iter_leaves = 13; // arbitrary value
- auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>();
- fourth_iter->SetNumLeafAdvanceCalls(fourth_iter_leaves);
-
- std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
- iterators.push_back(std::move(first_iter));
- iterators.push_back(std::move(second_iter));
- iterators.push_back(std::move(third_iter));
- iterators.push_back(std::move(fourth_iter));
- DocHitInfoIteratorAndNary and_iter(std::move(iterators));
-
- EXPECT_THAT(and_iter.GetNumLeafAdvanceCalls(),
- Eq(first_iter_leaves + second_iter_leaves + third_iter_leaves +
- fourth_iter_leaves));
+ EXPECT_THAT(and_iter.GetCallStats(),
+ Eq(first_iter_call_stats + second_iter_call_stats +
+ third_iter_call_stats + fourth_iter_call_stats));
}
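The single Eq(first + second + third + fourth) assertion presumes member-wise addition on CallStats. A hypothetical mirror of that shape, with fields following the constructor comments above (the operator itself is assumed, not copied from the real header):

    // Summed member-wise, so an n-ary iterator can report the total of
    // its children's stats.
    struct CallStatsSketch {
      int num_leaf_advance_calls_lite_index = 0;
      int num_leaf_advance_calls_main_index = 0;
      int num_leaf_advance_calls_integer_index = 0;
      int num_leaf_advance_calls_no_index = 0;
      int num_blocks_inspected = 0;

      CallStatsSketch operator+(const CallStatsSketch& o) const {
        return {num_leaf_advance_calls_lite_index +
                    o.num_leaf_advance_calls_lite_index,
                num_leaf_advance_calls_main_index +
                    o.num_leaf_advance_calls_main_index,
                num_leaf_advance_calls_integer_index +
                    o.num_leaf_advance_calls_integer_index,
                num_leaf_advance_calls_no_index +
                    o.num_leaf_advance_calls_no_index,
                num_blocks_inspected + o.num_blocks_inspected};
      }
    };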
TEST(DocHitInfoIteratorAndNaryTest, Advance) {
@@ -311,7 +523,6 @@ TEST(DocHitInfoIteratorAndNaryTest, SectionIdMask) {
SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2
SectionIdMask section_id_mask3 = 0b00001100; // hits in sections 2, 3
SectionIdMask section_id_mask4 = 0b00100100; // hits in sections 2, 5
- SectionIdMask mask_anded_result = 0b00000100;
SectionIdMask mask_ored_result = 0b01101111;
std::vector<DocHitInfo> first_vector = {DocHitInfo(4, section_id_mask1)};
@@ -320,16 +531,16 @@ TEST(DocHitInfoIteratorAndNaryTest, SectionIdMask) {
std::vector<DocHitInfo> fourth_vector = {DocHitInfo(4, section_id_mask4)};
auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector);
- first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+ first_iter->set_hit_section_ids_mask(section_id_mask1);
auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector);
- second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+ second_iter->set_hit_section_ids_mask(section_id_mask2);
auto third_iter = std::make_unique<DocHitInfoIteratorDummy>(third_vector);
- third_iter->set_hit_intersect_section_ids_mask(section_id_mask3);
+ third_iter->set_hit_section_ids_mask(section_id_mask3);
auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>(fourth_vector);
- fourth_iter->set_hit_intersect_section_ids_mask(section_id_mask4);
+ fourth_iter->set_hit_section_ids_mask(section_id_mask4);
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
iterators.push_back(std::move(first_iter));
@@ -342,7 +553,81 @@ TEST(DocHitInfoIteratorAndNaryTest, SectionIdMask) {
ICING_EXPECT_OK(and_iter.Advance());
EXPECT_THAT(and_iter.doc_hit_info().hit_section_ids_mask(),
Eq(mask_ored_result));
- EXPECT_THAT(and_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
+}
+
+TEST(DocHitInfoIteratorAndNaryTest, PopulateMatchedTermsStats) {
+ // Arbitrary section ids/term frequencies for the documents in the
+ // DocHitInfoIterators.
+ // For term "hi", document 10 and 8
+ DocHitInfoTermFrequencyPair doc_hit_info1_hi = DocHitInfo(10);
+ doc_hit_info1_hi.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+ doc_hit_info1_hi.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1_hi = {{0, 1}, {2, 2}, {6, 4}};
+
+ DocHitInfoTermFrequencyPair doc_hit_info2_hi = DocHitInfo(8);
+ doc_hit_info2_hi.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+ doc_hit_info2_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+
+ // For term "hello", document 10 and 9
+ DocHitInfoTermFrequencyPair doc_hit_info1_hello = DocHitInfo(10);
+ doc_hit_info1_hello.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2);
+ doc_hit_info1_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1_hello = {{0, 2}, {3, 3}};
+
+ DocHitInfoTermFrequencyPair doc_hit_info2_hello = DocHitInfo(9);
+ doc_hit_info2_hello.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/3);
+ doc_hit_info2_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/2);
+
+ // For term "ciao", document 10 and 9
+ DocHitInfoTermFrequencyPair doc_hit_info1_ciao = DocHitInfo(10);
+ doc_hit_info1_ciao.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2);
+ doc_hit_info1_ciao.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/3);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1_ciao = {{0, 2}, {1, 3}};
+
+ DocHitInfoTermFrequencyPair doc_hit_info2_ciao = DocHitInfo(9);
+ doc_hit_info2_ciao.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3);
+ doc_hit_info2_ciao.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/2);
+
+ std::vector<DocHitInfoTermFrequencyPair> first_vector = {doc_hit_info1_hi,
+ doc_hit_info2_hi};
+ std::vector<DocHitInfoTermFrequencyPair> second_vector = {
+ doc_hit_info1_hello, doc_hit_info2_hello};
+ std::vector<DocHitInfoTermFrequencyPair> third_vector = {doc_hit_info1_ciao,
+ doc_hit_info2_ciao};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+ auto third_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(third_vector, "ciao");
+
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::move(first_iter));
+ iterators.push_back(std::move(second_iter));
+ iterators.push_back(std::move(third_iter));
+
+ DocHitInfoIteratorAndNary and_iter(std::move(iterators));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(and_iter.Advance());
+ EXPECT_THAT(and_iter.doc_hit_info().document_id(), Eq(10));
+
+ and_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(
+ EqualsTermMatchInfo("hi", expected_section_ids_tf_map1_hi),
+ EqualsTermMatchInfo("hello", expected_section_ids_tf_map1_hello),
+ EqualsTermMatchInfo("ciao", expected_section_ids_tf_map1_ciao)));
+
+ EXPECT_FALSE(and_iter.Advance().ok());
}
} // namespace
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.cc b/icing/index/iterator/doc-hit-info-iterator-filter.cc
index 482a5ab..82d1ac7 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.cc
@@ -31,7 +31,6 @@
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
-#include "icing/util/clock.h"
namespace icing {
namespace lib {
@@ -39,12 +38,12 @@ namespace lib {
DocHitInfoIteratorFilter::DocHitInfoIteratorFilter(
std::unique_ptr<DocHitInfoIterator> delegate,
const DocumentStore* document_store, const SchemaStore* schema_store,
- const Clock* clock, const Options& options)
+ const Options& options, int64_t current_time_ms)
: delegate_(std::move(delegate)),
document_store_(*document_store),
schema_store_(*schema_store),
options_(options),
- current_time_milliseconds_(clock->GetSystemTimeMilliseconds()) {
+ current_time_ms_(current_time_ms) {
// Precompute all the NamespaceIds
for (std::string_view name_space : options_.namespaces) {
auto namespace_id_or = document_store_.GetNamespaceId(name_space);
@@ -57,81 +56,68 @@ DocHitInfoIteratorFilter::DocHitInfoIteratorFilter(
// Precompute all the SchemaTypeIds
for (std::string_view schema_type : options_.schema_types) {
- auto schema_type_id_or = schema_store_.GetSchemaTypeId(schema_type);
+ libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
+ schema_type_ids_or =
+ schema_store_.GetSchemaTypeIdsWithChildren(schema_type);
// If we can't find the SchemaTypeId, just throw it away
- if (schema_type_id_or.ok()) {
- target_schema_type_ids_.emplace(schema_type_id_or.ValueOrDie());
+ if (schema_type_ids_or.ok()) {
+ const std::unordered_set<SchemaTypeId>* schema_type_ids =
+ schema_type_ids_or.ValueOrDie();
+ target_schema_type_ids_.insert(schema_type_ids->begin(),
+ schema_type_ids->end());
}
}
}
libtextclassifier3::Status DocHitInfoIteratorFilter::Advance() {
- if (!delegate_->Advance().ok()) {
- // Didn't find anything on the delegate iterator.
- doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
- hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
- return absl_ports::ResourceExhaustedError(
- "No more DocHitInfos in iterator");
- }
-
- if (current_time_milliseconds_ < 0) {
- // This shouldn't happen, but we add a sanity check here for any unknown
- // errors.
- return absl_ports::InternalError(
- "Couldn't get current time. Try again in a bit");
- }
-
- if (options_.filter_deleted) {
- if (!document_store_.DoesDocumentExist(
- delegate_->doc_hit_info().document_id())) {
- // Document doesn't exist, keep searching
- return Advance();
+ while (delegate_->Advance().ok()) {
+ // Try to get the DocumentFilterData
+ auto document_filter_data_optional =
+ document_store_.GetAliveDocumentFilterData(
+ delegate_->doc_hit_info().document_id(), current_time_ms_);
+ if (!document_filter_data_optional) {
+      // Didn't find the DocumentFilterData in the filter cache. This could be
+      // because the Document doesn't exist, the DocumentId isn't valid, or the
+      // filter cache is in some invalid state. This is bad, but not the query's
+      // responsibility to fix, so just skip this result for now.
+ continue;
}
- }
-
- // Try to get the DocumentFilterData
- auto document_filter_data_or = document_store_.GetDocumentFilterData(
- delegate_->doc_hit_info().document_id());
- if (!document_filter_data_or.ok()) {
- // Didn't find the DocumentFilterData in the filter cache. This could be
- // because the DocumentId isn't valid or the filter cache is in some invalid
- // state. This is bad, but not the query's responsibility to fix, so just
- // skip this result for now.
- return Advance();
- }
- // We should be guaranteed that this exists now.
- DocumentFilterData data = std::move(document_filter_data_or).ValueOrDie();
+ // We should be guaranteed that this exists now.
+ DocumentFilterData data = document_filter_data_optional.value();
- if (!options_.namespaces.empty() &&
- target_namespace_ids_.count(data.namespace_id()) == 0) {
- // Doesn't match one of the specified namespaces. Keep searching
- return Advance();
- }
+ if (!options_.namespaces.empty() &&
+ target_namespace_ids_.count(data.namespace_id()) == 0) {
+ // Doesn't match one of the specified namespaces. Keep searching
+ continue;
+ }
- if (!options_.schema_types.empty() &&
- target_schema_type_ids_.count(data.schema_type_id()) == 0) {
- // Doesn't match one of the specified schema types. Keep searching
- return Advance();
- }
+ if (!options_.schema_types.empty() &&
+ target_schema_type_ids_.count(data.schema_type_id()) == 0) {
+ // Doesn't match one of the specified schema types. Keep searching
+ continue;
+ }
- if (current_time_milliseconds_ >= data.expiration_timestamp_ms()) {
- // Current time has exceeded the document's expiration time
- return Advance();
+ // Satisfied all our specified filters
+ doc_hit_info_ = delegate_->doc_hit_info();
+ return libtextclassifier3::Status::OK;
}
- // Satisfied all our specified filters
- doc_hit_info_ = delegate_->doc_hit_info();
- hit_intersect_section_ids_mask_ = delegate_->hit_intersect_section_ids_mask();
- return libtextclassifier3::Status::OK;
+ // Didn't find anything on the delegate iterator.
+ doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+ return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator");
}
-int32_t DocHitInfoIteratorFilter::GetNumBlocksInspected() const {
- return delegate_->GetNumBlocksInspected();
-}
-
-int32_t DocHitInfoIteratorFilter::GetNumLeafAdvanceCalls() const {
- return delegate_->GetNumLeafAdvanceCalls();
+libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
+DocHitInfoIteratorFilter::TrimRightMostNode() && {
+ ICING_ASSIGN_OR_RETURN(TrimmedNode trimmed_delegate,
+ std::move(*delegate_).TrimRightMostNode());
+ if (trimmed_delegate.iterator_ != nullptr) {
+ trimmed_delegate.iterator_ = std::make_unique<DocHitInfoIteratorFilter>(
+ std::move(trimmed_delegate.iterator_), &document_store_, &schema_store_,
+ options_, current_time_ms_);
+ }
+ return trimmed_delegate;
}
std::string DocHitInfoIteratorFilter::ToString() const {
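One structural point in the Advance() rewrite above: the old code called Advance() recursively for every rejected document, so a long run of deleted or expired hits grew the call stack, while the new code loops. A self-contained sketch of the pattern with stand-in types (the real method returns libtextclassifier3::Status and applies the liveness, namespace, and schema-type checks shown above):

    #include <optional>

    // Stand-ins for the real delegate/status machinery.
    struct Hit { int document_id; };
    struct Delegate {
      virtual ~Delegate() = default;
      virtual bool Advance() = 0;       // the real code returns a Status
      virtual Hit current() const = 0;  // the real code exposes doc_hit_info()
    };

    // Loop instead of self-recursion: every rejected hit costs one
    // iteration, not one stack frame.
    std::optional<Hit> NextPassing(Delegate& delegate,
                                   bool (*passes)(const Hit&)) {
      while (delegate.Advance()) {
        Hit h = delegate.current();
        if (!passes(h)) continue;  // deleted/expired/wrong namespace or type
        return h;                  // first hit satisfying every filter
      }
      return std::nullopt;  // exhausted, like the ResourceExhaustedError above
    }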
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter.h b/icing/index/iterator/doc-hit-info-iterator-filter.h
index bf027e4..608665e 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter.h
+++ b/icing/index/iterator/doc-hit-info-iterator-filter.h
@@ -20,6 +20,7 @@
#include <string>
#include <string_view>
#include <unordered_set>
+#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
@@ -27,7 +28,6 @@
#include "icing/schema/schema-store.h"
#include "icing/store/document-store.h"
#include "icing/store/namespace-id.h"
-#include "icing/util/clock.h"
namespace icing {
namespace lib {
@@ -37,10 +37,6 @@ namespace lib {
class DocHitInfoIteratorFilter : public DocHitInfoIterator {
public:
struct Options {
- // Filter out/don't return DocHitInfos that are associated with nonexistent
- // Documents.
- bool filter_deleted = true;
-
// List of namespaces that documents must have. An empty vector means that
// all namespaces are valid, and no documents will be filtered out.
//
@@ -61,16 +57,27 @@ class DocHitInfoIteratorFilter : public DocHitInfoIterator {
explicit DocHitInfoIteratorFilter(
std::unique_ptr<DocHitInfoIterator> delegate,
const DocumentStore* document_store, const SchemaStore* schema_store,
- const Clock* clock, const Options& options);
+ const Options& options, int64_t current_time_ms);
libtextclassifier3::Status Advance() override;
- int32_t GetNumBlocksInspected() const override;
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
+
+ void MapChildren(const ChildrenMapper& mapper) override {
+ delegate_ = mapper(std::move(delegate_));
+ }
- int32_t GetNumLeafAdvanceCalls() const override;
+ CallStats GetCallStats() const override { return delegate_->GetCallStats(); }
std::string ToString() const override;
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
+ delegate_->PopulateMatchedTermsStats(matched_terms_stats,
+ filtering_section_mask);
+ }
+
private:
std::unique_ptr<DocHitInfoIterator> delegate_;
const DocumentStore& document_store_;
@@ -78,7 +85,7 @@ class DocHitInfoIteratorFilter : public DocHitInfoIterator {
const Options options_;
std::unordered_set<NamespaceId> target_namespace_ids_;
std::unordered_set<SchemaTypeId> target_schema_type_ids_;
- const int64_t current_time_milliseconds_;
+ int64_t current_time_ms_;
};
} // namespace lib
diff --git a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
index e769013..0ed4d02 100644
--- a/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-filter_test.cc
@@ -17,6 +17,7 @@
#include <limits>
#include <memory>
#include <string>
+#include <string_view>
#include <utility>
#include <vector>
@@ -25,9 +26,12 @@
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator-and.h"
#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
@@ -45,6 +49,18 @@ using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::IsEmpty;
+libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore(
+ const Filesystem* filesystem, const std::string& base_dir,
+ const Clock* clock, const SchemaStore* schema_store) {
+ return DocumentStore::Create(
+ filesystem, base_dir, clock, schema_store,
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr);
+}
+
class DocHitInfoIteratorDeletedFilterTest : public ::testing::Test {
protected:
DocHitInfoIteratorDeletedFilterTest()
@@ -59,18 +75,22 @@ class DocHitInfoIteratorDeletedFilterTest : public ::testing::Test {
test_document3_ =
DocumentBuilder().SetKey("icing", "email/3").SetSchema("email").Build();
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ document_store_ = std::move(create_result.document_store);
}
void TearDown() override {
@@ -100,38 +120,11 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, EmptyOriginalIterator) {
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator_empty), document_store_.get(),
- schema_store_.get(), &fake_clock_, options_);
+ schema_store_.get(), options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
}
-TEST_F(DocHitInfoIteratorDeletedFilterTest, TurnOffDeletedFilterOk) {
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- document_store_->Put(test_document1_));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- document_store_->Put(test_document2_));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- document_store_->Put(test_document3_));
-
- // Deletes test document 2
- ICING_ASSERT_OK(document_store_->Delete(test_document2_.namespace_(),
- test_document2_.uri()));
-
- std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1),
- DocHitInfo(document_id2),
- DocHitInfo(document_id3)};
- std::unique_ptr<DocHitInfoIterator> original_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
-
- options_.filter_deleted = false;
- DocHitInfoIteratorFilter filtered_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
-
- EXPECT_THAT(GetDocumentIds(&filtered_iterator),
- ElementsAre(document_id1, document_id2, document_id3));
-}
-
TEST_F(DocHitInfoIteratorDeletedFilterTest, DeletedDocumentsAreFiltered) {
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(test_document1_));
@@ -140,8 +133,9 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, DeletedDocumentsAreFiltered) {
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
document_store_->Put(test_document3_));
// Deletes test document 2
- ICING_ASSERT_OK(document_store_->Delete(test_document2_.namespace_(),
- test_document2_.uri()));
+ ICING_ASSERT_OK(document_store_->Delete(
+ test_document2_.namespace_(), test_document2_.uri(),
+ fake_clock_.GetSystemTimeMilliseconds()));
std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1),
DocHitInfo(document_id2),
@@ -151,7 +145,7 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, DeletedDocumentsAreFiltered) {
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator),
ElementsAre(document_id1, document_id3));
@@ -177,7 +171,7 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, NonExistingDocumentsAreFiltered) {
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator),
ElementsAre(document_id1, document_id2, document_id3));
@@ -190,7 +184,7 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, NegativeDocumentIdIsIgnored) {
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(filtered_iterator.Advance(),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
@@ -204,7 +198,7 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, InvalidDocumentIdIsIgnored) {
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(filtered_iterator.Advance(),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
@@ -221,7 +215,7 @@ TEST_F(DocHitInfoIteratorDeletedFilterTest, GreaterThanMaxDocumentIdIsIgnored) {
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(filtered_iterator.Advance(),
StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
@@ -251,18 +245,22 @@ class DocHitInfoIteratorNamespaceFilterTest : public ::testing::Test {
.SetSchema("email")
.Build();
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ document_store_ = std::move(create_result.document_store);
}
void TearDown() override {
@@ -295,7 +293,7 @@ TEST_F(DocHitInfoIteratorNamespaceFilterTest, EmptyOriginalIterator) {
options_.namespaces = std::vector<std::string_view>{};
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator_empty), document_store_.get(),
- schema_store_.get(), &fake_clock_, options_);
+ schema_store_.get(), options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
}
@@ -313,7 +311,7 @@ TEST_F(DocHitInfoIteratorNamespaceFilterTest,
options_.namespaces = std::vector<std::string_view>{"nonexistent_namespace"};
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
}
@@ -330,7 +328,7 @@ TEST_F(DocHitInfoIteratorNamespaceFilterTest, NoNamespacesReturnsAll) {
options_.namespaces = std::vector<std::string_view>{};
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1));
}
@@ -354,7 +352,7 @@ TEST_F(DocHitInfoIteratorNamespaceFilterTest,
options_.namespaces = std::vector<std::string_view>{namespace1_};
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator),
ElementsAre(document_id1, document_id2));
@@ -380,7 +378,7 @@ TEST_F(DocHitInfoIteratorNamespaceFilterTest, FilterForMultipleNamespacesOk) {
options_.namespaces = std::vector<std::string_view>{namespace1_, namespace3_};
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator),
ElementsAre(document_id1, document_id2, document_id4));
@@ -388,36 +386,58 @@ TEST_F(DocHitInfoIteratorNamespaceFilterTest, FilterForMultipleNamespacesOk) {
class DocHitInfoIteratorSchemaTypeFilterTest : public ::testing::Test {
protected:
+ static constexpr std::string_view kSchema1 = "email";
+ static constexpr std::string_view kSchema2 = "message";
+ static constexpr std::string_view kSchema3 = "person";
+ static constexpr std::string_view kSchema4 = "artist";
+ static constexpr std::string_view kSchema5 = "emailMessage";
+
DocHitInfoIteratorSchemaTypeFilterTest()
: test_dir_(GetTestTempDir() + "/icing") {}
void SetUp() override {
filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
- document1_schema1_ =
- DocumentBuilder().SetKey("namespace", "1").SetSchema(schema1_).Build();
- document2_schema2_ =
- DocumentBuilder().SetKey("namespace", "2").SetSchema(schema2_).Build();
- document3_schema3_ =
- DocumentBuilder().SetKey("namespace", "3").SetSchema(schema3_).Build();
- document4_schema1_ =
- DocumentBuilder().SetKey("namespace", "4").SetSchema(schema1_).Build();
-
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type(schema1_);
- type_config = schema.add_types();
- type_config->set_schema_type(schema2_);
- type_config = schema.add_types();
- type_config->set_schema_type(schema3_);
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+ document1_schema1_ = DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema(std::string(kSchema1))
+ .Build();
+ document2_schema2_ = DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema(std::string(kSchema2))
+ .Build();
+ document3_schema3_ = DocumentBuilder()
+ .SetKey("namespace", "3")
+ .SetSchema(std::string(kSchema3))
+ .Build();
+ document4_schema1_ = DocumentBuilder()
+ .SetKey("namespace", "4")
+ .SetSchema(std::string(kSchema1))
+ .Build();
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType(kSchema1))
+ .AddType(SchemaTypeConfigBuilder().SetType(kSchema2))
+ .AddType(SchemaTypeConfigBuilder().SetType(kSchema3))
+ .AddType(SchemaTypeConfigBuilder().SetType(kSchema4).AddParentType(
+ kSchema3))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(std::string(kSchema5))
+ .AddParentType(kSchema1)
+ .AddParentType(kSchema2))
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ document_store_ = std::move(create_result.document_store);
}
void TearDown() override {
@@ -433,9 +453,6 @@ class DocHitInfoIteratorSchemaTypeFilterTest : public ::testing::Test {
FakeClock fake_clock_;
const Filesystem filesystem_;
const std::string test_dir_;
- const std::string schema1_ = "email";
- const std::string schema2_ = "message";
- const std::string schema3_ = "person";
DocumentProto document1_schema1_;
DocumentProto document2_schema2_;
DocumentProto document3_schema3_;
@@ -450,7 +467,7 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, EmptyOriginalIterator) {
options_.schema_types = std::vector<std::string_view>{};
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator_empty), document_store_.get(),
- schema_store_.get(), &fake_clock_, options_);
+ schema_store_.get(), options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
}
@@ -469,7 +486,7 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest,
std::vector<std::string_view>{"nonexistent_schema_type"};
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
}
@@ -486,7 +503,7 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, NoSchemaTypesReturnsAll) {
options_.schema_types = std::vector<std::string_view>{};
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1));
}
@@ -504,10 +521,10 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest,
std::unique_ptr<DocHitInfoIterator> original_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- options_.schema_types = std::vector<std::string_view>{schema1_};
+ options_.schema_types = std::vector<std::string_view>{kSchema1};
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1));
}
@@ -527,15 +544,119 @@ TEST_F(DocHitInfoIteratorSchemaTypeFilterTest, FilterForMultipleSchemaTypesOk) {
std::unique_ptr<DocHitInfoIterator> original_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- options_.schema_types = std::vector<std::string_view>{schema2_, schema3_};
+ options_.schema_types = std::vector<std::string_view>{kSchema2, kSchema3};
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator),
ElementsAre(document_id2, document_id3));
}
+TEST_F(DocHitInfoIteratorSchemaTypeFilterTest,
+ FilterForSchemaTypePolymorphismOk) {
+ // Add some irrelevant documents.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1_schema1_));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2_schema2_));
+
+  // Create a person document and an artist document, where the artist should
+  // also be interpretable as a person by polymorphism.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId person_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "person")
+ .SetSchema("person")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId artist_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "artist")
+ .SetSchema("artist")
+ .Build()));
+
+ std::vector<DocHitInfo> doc_hit_infos = {
+ DocHitInfo(document_id1), DocHitInfo(document_id2),
+ DocHitInfo(person_document_id), DocHitInfo(artist_document_id)};
+
+ // Filters for the "person" type should also include the "artist" type.
+ std::unique_ptr<DocHitInfoIterator> original_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ options_.schema_types = {"person"};
+ DocHitInfoIteratorFilter filtered_iterator_1(
+ std::move(original_iterator), document_store_.get(), schema_store_.get(),
+ options_, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(GetDocumentIds(&filtered_iterator_1),
+ ElementsAre(person_document_id, artist_document_id));
+
+ // Filters for the "artist" type should not include the "person" type.
+ original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ options_.schema_types = {"artist"};
+ DocHitInfoIteratorFilter filtered_iterator_2(
+ std::move(original_iterator), document_store_.get(), schema_store_.get(),
+ options_, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(GetDocumentIds(&filtered_iterator_2),
+ ElementsAre(artist_document_id));
+}
+
+TEST_F(DocHitInfoIteratorSchemaTypeFilterTest,
+ FilterForSchemaTypeMultipleParentPolymorphismOk) {
+ // Create an email and a message document.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId email_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "email")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId message_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "message")
+ .SetSchema("message")
+ .Build()));
+
+  // Create an emailMessage document, which should be interpretable as both an
+  // email and a message by polymorphism.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId email_message_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "emailMessage")
+ .SetSchema("emailMessage")
+ .Build()));
+
+ std::vector<DocHitInfo> doc_hit_infos = {
+ DocHitInfo(email_document_id), DocHitInfo(message_document_id),
+ DocHitInfo(email_message_document_id)};
+
+ // Filters for the "email" type should also include the "emailMessage" type.
+ std::unique_ptr<DocHitInfoIterator> original_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ options_.schema_types = std::vector<std::string_view>{"email"};
+ DocHitInfoIteratorFilter filtered_iterator_1(
+ std::move(original_iterator), document_store_.get(), schema_store_.get(),
+ options_, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(GetDocumentIds(&filtered_iterator_1),
+ ElementsAre(email_document_id, email_message_document_id));
+
+ // Filters for the "message" type should also include the "emailMessage" type.
+ original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ options_.schema_types = std::vector<std::string_view>{"message"};
+ DocHitInfoIteratorFilter filtered_iterator_2(
+ std::move(original_iterator), document_store_.get(), schema_store_.get(),
+ options_, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(GetDocumentIds(&filtered_iterator_2),
+ ElementsAre(message_document_id, email_message_document_id));
+
+  // Filters for an irrelevant type should return nothing.
+ original_iterator = std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+ options_.schema_types = std::vector<std::string_view>{"person"};
+ DocHitInfoIteratorFilter filtered_iterator_3(
+ std::move(original_iterator), document_store_.get(), schema_store_.get(),
+ options_, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(GetDocumentIds(&filtered_iterator_3), IsEmpty());
+}
+
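The two polymorphism tests above rely on the constructor change shown earlier: each requested schema type is expanded through SchemaStore::GetSchemaTypeIdsWithChildren into the type plus its transitive children, so filtering on "person" admits "artist" while the reverse does not. A self-contained sketch of such an expansion (stand-in types, not the real API):

    #include <unordered_map>
    #include <unordered_set>
    #include <vector>

    using TypeId = int;  // stand-in for SchemaTypeId

    // Expand a type into {type} plus all transitive child types.
    std::unordered_set<TypeId> ExpandWithChildren(
        TypeId root,
        const std::unordered_map<TypeId, std::vector<TypeId>>& children) {
      std::unordered_set<TypeId> out = {root};
      std::vector<TypeId> stack = {root};
      while (!stack.empty()) {
        TypeId t = stack.back();
        stack.pop_back();
        auto it = children.find(t);
        if (it == children.end()) continue;
        for (TypeId c : it->second) {
          if (out.insert(c).second) stack.push_back(c);  // first visit only
        }
      }
      return out;
    }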
class DocHitInfoIteratorExpirationFilterTest : public ::testing::Test {
protected:
DocHitInfoIteratorExpirationFilterTest()
@@ -544,18 +665,22 @@ class DocHitInfoIteratorExpirationFilterTest : public ::testing::Test {
void SetUp() override {
filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type(email_schema_);
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType(email_schema_))
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ document_store_ = std::move(create_result.document_store);
}
void TearDown() override {
@@ -576,6 +701,16 @@ class DocHitInfoIteratorExpirationFilterTest : public ::testing::Test {
};
TEST_F(DocHitInfoIteratorExpirationFilterTest, TtlZeroIsntFilteredOut) {
+ // Arbitrary value
+ fake_clock_.SetSystemTimeMilliseconds(100);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
// Insert a document
DocumentProto document = DocumentBuilder()
.SetKey("namespace", "1")
@@ -584,23 +719,30 @@ TEST_F(DocHitInfoIteratorExpirationFilterTest, TtlZeroIsntFilteredOut) {
.SetTtlMs(0)
.Build();
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- document_store_->Put(document));
+ document_store->Put(document));
std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)};
std::unique_ptr<DocHitInfoIterator> original_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- // Arbitrary value
- fake_clock_.SetSystemTimeMilliseconds(100);
-
DocHitInfoIteratorFilter filtered_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ std::move(original_iterator), document_store.get(), schema_store_.get(),
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1));
}
TEST_F(DocHitInfoIteratorExpirationFilterTest, BeforeTtlNotFilteredOut) {
+  // Arbitrary value, but must be less than the document's
+  // creation_timestamp + ttl
+ fake_clock_.SetSystemTimeMilliseconds(50);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
// Insert a document
DocumentProto document = DocumentBuilder()
.SetKey("namespace", "1")
@@ -609,92 +751,84 @@ TEST_F(DocHitInfoIteratorExpirationFilterTest, BeforeTtlNotFilteredOut) {
.SetTtlMs(100)
.Build();
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- document_store_->Put(document));
+ document_store->Put(document));
std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)};
std::unique_ptr<DocHitInfoIterator> original_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- // Arbitrary value, but must be less than document's creation_timestamp + ttl
- fake_clock_.SetSystemTimeMilliseconds(50);
-
DocHitInfoIteratorFilter filtered_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ std::move(original_iterator), document_store.get(), schema_store_.get(),
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1));
}
TEST_F(DocHitInfoIteratorExpirationFilterTest, EqualTtlFilteredOut) {
+ // Current time is exactly the document's creation_timestamp + ttl
+ fake_clock_.SetSystemTimeMilliseconds(150);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
// Insert a document
DocumentProto document = DocumentBuilder()
.SetKey("namespace", "1")
.SetSchema(email_schema_)
- .SetCreationTimestampMs(0)
+ .SetCreationTimestampMs(50)
.SetTtlMs(100)
.Build();
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- document_store_->Put(document));
+ document_store->Put(document));
std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)};
std::unique_ptr<DocHitInfoIterator> original_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- // Current time is exactly the document's creation_timestamp + ttl
- fake_clock_.SetSystemTimeMilliseconds(100);
-
DocHitInfoIteratorFilter filtered_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ std::move(original_iterator), document_store.get(), schema_store_.get(),
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
}
TEST_F(DocHitInfoIteratorExpirationFilterTest, PastTtlFilteredOut) {
+ // Arbitrary value, but must be greater than the document's
+ // creation_timestamp + ttl
+ fake_clock_.SetSystemTimeMilliseconds(151);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
// Insert a document
DocumentProto document = DocumentBuilder()
.SetKey("namespace", "1")
.SetSchema(email_schema_)
- .SetCreationTimestampMs(0)
+ .SetCreationTimestampMs(50)
.SetTtlMs(100)
.Build();
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- document_store_->Put(document));
+ document_store->Put(document));
std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(document_id1)};
std::unique_ptr<DocHitInfoIterator> original_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- // Arbitrary value, but must be greater than the document's
- // creation_timestamp + ttl
- fake_clock_.SetSystemTimeMilliseconds(101);
-
DocHitInfoIteratorFilter filtered_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
+ std::move(original_iterator), document_store.get(), schema_store_.get(),
+ options_, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
}
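Taken together, the four TTL tests pin down the expiration rule: a document is expired once current_time_ms >= creation_timestamp_ms + ttl_ms, and a ttl of 0 means the document never expires. As a tiny hypothetical predicate (the real check lives behind GetAliveDocumentFilterData):

    #include <cstdint>

    bool IsExpired(int64_t creation_timestamp_ms, int64_t ttl_ms,
                   int64_t current_time_ms) {
      if (ttl_ms == 0) return false;  // TtlZeroIsntFilteredOut
      return current_time_ms >= creation_timestamp_ms + ttl_ms;
    }
    // IsExpired(50, 100, 149) == false  (still within the ttl window)
    // IsExpired(50, 100, 150) == true   (EqualTtlFilteredOut)
    // IsExpired(50, 100, 151) == true   (PastTtlFilteredOut)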
-TEST_F(DocHitInfoIteratorExpirationFilterTest,
- InvalidTimeFiltersReturnsInternalError) {
- // Put something in the original iterator so we don't get a ResourceExhausted
- // error
- std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(/*document_id_in=*/0)};
- std::unique_ptr<DocHitInfoIterator> original_iterator =
- std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
-
- // -1 is an invalid timestamp
- fake_clock_.SetSystemTimeMilliseconds(-1);
-
- DocHitInfoIteratorFilter filtered_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options_);
-
- EXPECT_THAT(filtered_iterator.Advance(),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
-}
-
class DocHitInfoIteratorFilterTest : public ::testing::Test {
protected:
DocHitInfoIteratorFilterTest() : test_dir_(GetTestTempDir() + "/icing") {}
@@ -728,24 +862,27 @@ class DocHitInfoIteratorFilterTest : public ::testing::Test {
document5_namespace1_schema1_ = DocumentBuilder()
.SetKey(namespace1_, "5")
.SetSchema(schema1_)
- .SetCreationTimestampMs(0)
+ .SetCreationTimestampMs(1)
.SetTtlMs(100)
.Build();
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type(schema1_);
- type_config = schema.add_types();
- type_config->set_schema_type(schema2_);
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType(schema1_))
+ .AddType(SchemaTypeConfigBuilder().SetType(schema2_))
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ document_store_ = std::move(create_result.document_store);
}
void TearDown() override {
@@ -773,26 +910,37 @@ class DocHitInfoIteratorFilterTest : public ::testing::Test {
};
TEST_F(DocHitInfoIteratorFilterTest, CombineAllFiltersOk) {
+ // Filters out document5 since it's expired
+ fake_clock_.SetSystemTimeMilliseconds(199);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
ICING_ASSERT_OK_AND_ASSIGN(
DocumentId document_id1,
- document_store_->Put(document1_namespace1_schema1_));
+ document_store->Put(document1_namespace1_schema1_));
ICING_ASSERT_OK_AND_ASSIGN(
DocumentId document_id2,
- document_store_->Put(document2_namespace1_schema1_));
+ document_store->Put(document2_namespace1_schema1_));
ICING_ASSERT_OK_AND_ASSIGN(
DocumentId document_id3,
- document_store_->Put(document3_namespace2_schema1_));
+ document_store->Put(document3_namespace2_schema1_));
ICING_ASSERT_OK_AND_ASSIGN(
DocumentId document_id4,
- document_store_->Put(document4_namespace1_schema2_));
+ document_store->Put(document4_namespace1_schema2_));
ICING_ASSERT_OK_AND_ASSIGN(
DocumentId document_id5,
- document_store_->Put(document5_namespace1_schema1_));
+ document_store->Put(document5_namespace1_schema1_));
// Deletes document2, causing it to be filtered out
ICING_ASSERT_OK(
- document_store_->Delete(document2_namespace1_schema1_.namespace_(),
- document2_namespace1_schema1_.uri()));
+ document_store->Delete(document2_namespace1_schema1_.namespace_(),
+ document2_namespace1_schema1_.uri(),
+ fake_clock_.GetSystemTimeMilliseconds()));
std::vector<DocHitInfo> doc_hit_infos = {
DocHitInfo(document_id1), DocHitInfo(document_id2),
@@ -810,13 +958,9 @@ TEST_F(DocHitInfoIteratorFilterTest, CombineAllFiltersOk) {
// Filters out document4 by schema type
options.schema_types = std::vector<std::string_view>{schema1_};
- // Filters out document5 since it's expired
- FakeClock fake_clock;
- fake_clock.SetSystemTimeMilliseconds(199);
-
DocHitInfoIteratorFilter filtered_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock, options);
+ std::move(original_iterator), document_store.get(), schema_store_.get(),
+ options, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocumentIds(&filtered_iterator), ElementsAre(document_id1));
}
@@ -849,7 +993,7 @@ TEST_F(DocHitInfoIteratorFilterTest, SectionIdMasksArePopulatedCorrectly) {
DocHitInfoIteratorFilter::Options options;
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options);
+ options, fake_clock_.GetSystemTimeMilliseconds());
EXPECT_THAT(GetDocHitInfos(&filtered_iterator),
ElementsAre(EqualsDocHitInfo(document_id1, section_ids1),
@@ -857,28 +1001,71 @@ TEST_F(DocHitInfoIteratorFilterTest, SectionIdMasksArePopulatedCorrectly) {
EqualsDocHitInfo(document_id3, section_ids3)));
}
-TEST_F(DocHitInfoIteratorFilterTest, GetNumBlocksInspected) {
+TEST_F(DocHitInfoIteratorFilterTest, GetCallStats) {
+ DocHitInfoIterator::CallStats original_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/2,
+ /*num_leaf_advance_calls_main_index_in=*/5,
+ /*num_leaf_advance_calls_integer_index_in=*/3,
+ /*num_leaf_advance_calls_no_index_in=*/1,
+ /*num_blocks_inspected_in=*/4); // arbitrary value
auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>();
- original_iterator->SetNumBlocksInspected(5);
+ original_iterator->SetCallStats(original_call_stats);
DocHitInfoIteratorFilter::Options options;
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options);
+ options, fake_clock_.GetSystemTimeMilliseconds());
- EXPECT_THAT(filtered_iterator.GetNumBlocksInspected(), Eq(5));
+ EXPECT_THAT(filtered_iterator.GetCallStats(), Eq(original_call_stats));
}
-TEST_F(DocHitInfoIteratorFilterTest, GetNumLeafAdvanceCalls) {
- auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>();
- original_iterator->SetNumLeafAdvanceCalls(6);
+TEST_F(DocHitInfoIteratorFilterTest, TrimFilterIterator) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(document1_namespace1_schema1_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(document2_namespace1_schema1_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id3,
+ document_store_->Put(document3_namespace2_schema1_));
+
+ // Build an iterator tree like:
+ // Filter
+ // |
+ // AND
+ // / \
+ // {1, 3} {2}
+ std::vector<DocHitInfo> left_vector = {DocHitInfo(document_id1),
+ DocHitInfo(document_id3)};
+ std::vector<DocHitInfo> right_vector = {DocHitInfo(document_id2)};
+
+ std::unique_ptr<DocHitInfoIterator> left_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(left_vector);
+ std::unique_ptr<DocHitInfoIterator> right_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(right_vector, "term", 10);
+
+ std::unique_ptr<DocHitInfoIterator> original_iterator =
+ std::make_unique<DocHitInfoIteratorAnd>(std::move(left_iter),
+ std::move(right_iter));
DocHitInfoIteratorFilter::Options options;
+ // Filters out document3 by namespace
+ options.namespaces = std::vector<std::string_view>{namespace1_};
DocHitInfoIteratorFilter filtered_iterator(
std::move(original_iterator), document_store_.get(), schema_store_.get(),
- &fake_clock_, options);
-
- EXPECT_THAT(filtered_iterator.GetNumLeafAdvanceCalls(), Eq(6));
+ options, fake_clock_.GetSystemTimeMilliseconds());
+
+ // The trimmed tree.
+ // Filter
+ // |
+ // {1, 3}
+ ICING_ASSERT_OK_AND_ASSIGN(DocHitInfoIterator::TrimmedNode trimmed_node,
+ std::move(filtered_iterator).TrimRightMostNode());
+ EXPECT_THAT(trimmed_node.term_, Eq("term"));
+ EXPECT_THAT(trimmed_node.term_start_index_, Eq(10));
+ EXPECT_THAT(GetDocumentIds(trimmed_node.iterator_.get()),
+ ElementsAre(document_id1));
}
} // namespace
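The filter tests above now pin time explicitly via fake_clock_.SetSystemTimeMilliseconds(199) and hand the filter a plain timestamp (fake_clock_.GetSystemTimeMilliseconds()) instead of a clock pointer. Below is a minimal, self-contained sketch of the expiry rule those tests exercise; the helper and the numbers are illustrative assumptions (the authoritative check lives in DocumentStore), assuming a document with a positive TTL expires once creation_timestamp_ms + ttl_ms is no longer in the future:

#include <cstdint>
#include <iostream>

// Hypothetical helper, not icing API: a document with a positive TTL is
// expired once its creation time plus TTL is at or before the current time.
bool IsExpiredSketch(int64_t creation_timestamp_ms, int64_t ttl_ms,
                     int64_t current_time_ms) {
  return ttl_ms > 0 && creation_timestamp_ms + ttl_ms <= current_time_ms;
}

int main() {
  // With the clock pinned at 199ms, a document created at 50ms with a 100ms
  // TTL is expired; one with no TTL is kept. (Values are illustrative.)
  std::cout << IsExpiredSketch(/*creation_timestamp_ms=*/50, /*ttl_ms=*/100,
                               /*current_time_ms=*/199)
            << IsExpiredSketch(/*creation_timestamp_ms=*/50, /*ttl_ms=*/0,
                               /*current_time_ms=*/199)
            << "\n";  // prints "10"
  return 0;
}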
diff --git a/icing/index/iterator/doc-hit-info-iterator-none.h b/icing/index/iterator/doc-hit-info-iterator-none.h
new file mode 100644
index 0000000..c2853f1
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-none.h
@@ -0,0 +1,52 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_NONE_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_NONE_H_
+
+#include <cstdint>
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+
+namespace icing {
+namespace lib {
+
+// Iterator that will return no results.
+class DocHitInfoIteratorNone : public DocHitInfoIterator {
+ public:
+ libtextclassifier3::Status Advance() override {
+ return absl_ports::ResourceExhaustedError(
+ "DocHitInfoIterator NONE has no hits.");
+ }
+
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override {
+ TrimmedNode node = {nullptr, /*term=*/"", /*term_start_index_=*/0,
+ /*unnormalized_term_length_=*/0};
+ return node;
+ }
+
+ void MapChildren(const ChildrenMapper& mapper) override {}
+
+ CallStats GetCallStats() const override { return CallStats(); }
+
+ std::string ToString() const override { return "(NONE)"; }
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_NONE_H_
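A tiny stand-in sketch (bool in place of libtextclassifier3::Status, not the real class) of how callers consume DocHitInfoIteratorNone: because Advance() always reports RESOURCE_EXHAUSTED, the standard drain loop never executes its body.

#include <iostream>

struct NoneIteratorSketch {
  // Stand-in for Advance(): "NONE has no hits", so it never succeeds.
  bool Advance() { return false; }
};

int main() {
  NoneIteratorSketch it;
  int hits = 0;
  while (it.Advance()) {
    ++hits;  // never reached
  }
  std::cout << "hits: " << hits << "\n";  // prints "hits: 0"
  return 0;
}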
diff --git a/icing/index/iterator/doc-hit-info-iterator-not.cc b/icing/index/iterator/doc-hit-info-iterator-not.cc
index e1ece5c..10a8292 100644
--- a/icing/index/iterator/doc-hit-info-iterator-not.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-not.cc
@@ -15,13 +15,15 @@
#include "icing/index/iterator/doc-hit-info-iterator-not.h"
#include <cstdint>
+#include <memory>
+#include <utility>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h"
-#include "icing/schema/section.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/store/document-id.h"
namespace icing {
@@ -35,40 +37,40 @@ DocHitInfoIteratorNot::DocHitInfoIteratorNot(
DocHitInfoIteratorAllDocumentId(document_id_limit)) {}
libtextclassifier3::Status DocHitInfoIteratorNot::Advance() {
- if (!all_document_id_iterator_.Advance().ok()) {
- doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
- return absl_ports::ResourceExhaustedError(
- "No more DocHitInfos in iterator");
- }
+ while (all_document_id_iterator_.Advance().ok()) {
+ if (all_document_id_iterator_.doc_hit_info().document_id() <
+ to_be_excluded_->doc_hit_info().document_id()) {
+ // Since DocumentIds are returned from DocHitInfoIterators in decreasing
+ // order, we have passed the last NOT result if we're smaller than its
+ // DocumentId. Advance the NOT result if so.
+ to_be_excluded_->Advance().IgnoreError();
+ }
- if (all_document_id_iterator_.doc_hit_info().document_id() <
- to_be_excluded_->doc_hit_info().document_id()) {
- // Since DocumentIds are returned from DocHitInfoIterators in decreasing
- // order, we have passed the last NOT result if we're smaller than its
- // DocumentId. Advance the NOT result if so.
- to_be_excluded_->Advance().IgnoreError();
- }
+ if (all_document_id_iterator_.doc_hit_info().document_id() ==
+ to_be_excluded_->doc_hit_info().document_id()) {
+ // This is a NOT result, skip and Advance to the next result.
+ continue;
+ }
- if (all_document_id_iterator_.doc_hit_info().document_id() ==
- to_be_excluded_->doc_hit_info().document_id()) {
- // This is a NOT result, skip and Advance to the next result.
- return Advance();
+ // No errors, we've found a valid result
+ doc_hit_info_ = all_document_id_iterator_.doc_hit_info();
+ return libtextclassifier3::Status::OK;
}
- // No errors, we've found a valid result
- doc_hit_info_ = all_document_id_iterator_.doc_hit_info();
-
- return libtextclassifier3::Status::OK;
+ // Didn't find a hit; return an error
+ doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+ return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator");
}
-int32_t DocHitInfoIteratorNot::GetNumBlocksInspected() const {
- return to_be_excluded_->GetNumBlocksInspected() +
- all_document_id_iterator_.GetNumBlocksInspected();
+libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
+DocHitInfoIteratorNot::TrimRightMostNode() && {
+ // Don't generate a suggestion if the last operator is NOT.
+ return absl_ports::InvalidArgumentError(
+ "Cannot generate suggestion if the last term is NOT operator.");
}
-int32_t DocHitInfoIteratorNot::GetNumLeafAdvanceCalls() const {
- return to_be_excluded_->GetNumLeafAdvanceCalls() +
- all_document_id_iterator_.GetNumLeafAdvanceCalls();
+void DocHitInfoIteratorNot::MapChildren(const ChildrenMapper& mapper) {
+ to_be_excluded_ = mapper(std::move(to_be_excluded_));
}
std::string DocHitInfoIteratorNot::ToString() const {
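The rewritten Advance() above replaces the old tail call (`return Advance();`) with a loop, so a long run of excluded DocumentIds no longer deepens the stack. A self-contained sketch of the same exclusion walk over plain ints, assuming both streams yield DocumentIds in strictly decreasing order:

#include <cstddef>
#include <iostream>
#include <vector>

std::vector<int> NotSketch(int document_id_limit,
                           const std::vector<int>& to_be_excluded) {
  std::vector<int> results;
  size_t excluded_pos = 0;
  for (int id = document_id_limit; id >= 0; --id) {
    // Move the excluded stream past ids larger than the candidate; this
    // mirrors advancing to_be_excluded_ when the candidate is smaller.
    while (excluded_pos < to_be_excluded.size() &&
           to_be_excluded[excluded_pos] > id) {
      ++excluded_pos;
    }
    if (excluded_pos < to_be_excluded.size() &&
        to_be_excluded[excluded_pos] == id) {
      continue;  // a NOT result: skip it
    }
    results.push_back(id);  // a valid hit
  }
  return results;
}

int main() {
  // Excluding {4, 2} with document_id_limit = 5 leaves 5, 3, 1, 0.
  for (int id : NotSketch(5, {4, 2})) {
    std::cout << id << " ";
  }
  std::cout << "\n";
  return 0;
}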
diff --git a/icing/index/iterator/doc-hit-info-iterator-not.h b/icing/index/iterator/doc-hit-info-iterator-not.h
index 58e909d..11575fb 100644
--- a/icing/index/iterator/doc-hit-info-iterator-not.h
+++ b/icing/index/iterator/doc-hit-info-iterator-not.h
@@ -30,14 +30,12 @@ namespace lib {
// Iterator that will return all documents that are *not* specified by the
// to_be_excluded_iterator.
//
-// NOTE: The hit_intersect_section_ids_mask is meaningless for this iterator.
+// NOTE: doc_hit_info_.hit_section_ids_mask() is meaningless for this iterator.
// When this iterator produces a result, it's because the Document was not
// present in the to_be_excluded_iterator. There is no concept of the Document
// having been chosen because its term was in a specific section. Since we
// don't know anything about the sections for the Document, the
-// hit_intersect_section_ids_mask is always kSectionIdMaskNone. Correspondingly,
-// this means that the doc_hit_info.hit_section_ids_mask will also always be
-// kSectionIdMaskNone.
+// doc_hit_info.hit_section_ids_mask() is always kSectionIdMaskNone.
class DocHitInfoIteratorNot : public DocHitInfoIterator {
public:
// to_be_excluded_iterator: The results of this iterator will be excluded
@@ -50,9 +48,17 @@ class DocHitInfoIteratorNot : public DocHitInfoIterator {
libtextclassifier3::Status Advance() override;
- int32_t GetNumBlocksInspected() const override;
+ // The NOT operator is not supposed to be trimmed.
+ // We shouldn't generate a suggestion for the last term if the last term
+ // belongs to a NOT operator.
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
- int32_t GetNumLeafAdvanceCalls() const override;
+ void MapChildren(const ChildrenMapper& mapper) override;
+
+ CallStats GetCallStats() const override {
+ return to_be_excluded_->GetCallStats() +
+ all_document_id_iterator_.GetCallStats();
+ }
std::string ToString() const override;
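MapChildren() lets a later pass rewrite an iterator tree in place: each composite node hands every child to the mapper and keeps whatever comes back, exactly the shape of `to_be_excluded_ = mapper(std::move(to_be_excluded_))` above. A minimal sketch with stand-in node types (the real ChildrenMapper operates on std::unique_ptr<DocHitInfoIterator>):

#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <utility>

struct Node {
  virtual ~Node() = default;
  virtual std::string ToString() const = 0;
  using Mapper = std::function<std::unique_ptr<Node>(std::unique_ptr<Node>)>;
  virtual void MapChildren(const Mapper& mapper) = 0;
};

struct Leaf : Node {
  std::string ToString() const override { return "leaf"; }
  void MapChildren(const Mapper&) override {}  // leaves have no children
};

struct NotNode : Node {
  std::unique_ptr<Node> child = std::make_unique<Leaf>();
  std::string ToString() const override {
    return "(NOT " + child->ToString() + ")";
  }
  void MapChildren(const Mapper& mapper) override {
    child = mapper(std::move(child));  // same pattern as the NOT iterator
  }
};

struct Wrapper : Node {
  std::unique_ptr<Node> inner;
  explicit Wrapper(std::unique_ptr<Node> in) : inner(std::move(in)) {}
  std::string ToString() const override {
    return "[wrapped " + inner->ToString() + "]";
  }
  void MapChildren(const Mapper& mapper) override {
    inner = mapper(std::move(inner));
  }
};

int main() {
  NotNode root;
  // Wrap every direct child of the NOT node.
  root.MapChildren([](std::unique_ptr<Node> child) -> std::unique_ptr<Node> {
    return std::make_unique<Wrapper>(std::move(child));
  });
  std::cout << root.ToString() << "\n";  // prints "(NOT [wrapped leaf])"
  return 0;
}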
diff --git a/icing/index/iterator/doc-hit-info-iterator-not_test.cc b/icing/index/iterator/doc-hit-info-iterator-not_test.cc
index 5d0e4ac..a8c835f 100644
--- a/icing/index/iterator/doc-hit-info-iterator-not_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-not_test.cc
@@ -102,40 +102,39 @@ TEST(DocHitInfoIteratorNotTest, AllDocumentIdOverlapOk) {
EXPECT_THAT(GetDocumentIds(&not_iterator), IsEmpty());
}
-TEST(DocHitInfoIteratorNotTest, GetNumBlocksInspected) {
- int to_be_excluded_iterator_blocks = 4; // arbitrary value
+TEST(DocHitInfoIteratorNotTest, GetCallStats) {
+ DocHitInfoIterator::CallStats to_be_excluded_iterator_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/2,
+ /*num_leaf_advance_calls_main_index_in=*/5,
+ /*num_leaf_advance_calls_integer_index_in=*/3,
+ /*num_leaf_advance_calls_no_index_in=*/1,
+ /*num_blocks_inspected_in=*/4); // arbitrary value
auto to_be_excluded_iterator = std::make_unique<DocHitInfoIteratorDummy>();
- to_be_excluded_iterator->SetNumBlocksInspected(
- to_be_excluded_iterator_blocks);
-
- DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
- /*document_id_limit=*/5);
-
- // The AllDocumentId iterator doesn't count any blocks as being inspected
- // since it's just decrementing 1 from the document_id_limit.
- EXPECT_THAT(not_iterator.GetNumBlocksInspected(),
- Eq(to_be_excluded_iterator_blocks));
-}
-
-TEST(DocHitInfoIteratorNotTest, GetNumLeafAdvanceCalls) {
- int to_be_excluded_iterator_leaves = 4; // arbitrary value
- auto to_be_excluded_iterator = std::make_unique<DocHitInfoIteratorDummy>();
- to_be_excluded_iterator->SetNumLeafAdvanceCalls(
- to_be_excluded_iterator_leaves);
+ to_be_excluded_iterator->SetCallStats(to_be_excluded_iterator_call_stats);
int all_document_id_limit = 5;
// Since we iterate from [limit, 0] inclusive, add 1 for the 0th advance call
int all_leaf_advance_calls = all_document_id_limit + 1;
DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
- all_document_id_limit);
+ /*document_id_limit=*/5);
while (not_iterator.Advance().ok()) {
// Advance through the whole not iterator
}
- // The AllDocumentId iterator counts each DocumentId as a leaf advance call
- EXPECT_THAT(not_iterator.GetNumLeafAdvanceCalls(),
- Eq(to_be_excluded_iterator_leaves + all_leaf_advance_calls));
+ // The AllDocumentId iterator doesn't count lite/main/integer index
+ // advances or blocks inspected, since it's just decrementing 1 from the
+ // document_id_limit; its advances count only as no-index calls.
+ EXPECT_THAT(
+ not_iterator.GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ to_be_excluded_iterator_call_stats.num_leaf_advance_calls_lite_index,
+ to_be_excluded_iterator_call_stats.num_leaf_advance_calls_main_index,
+ to_be_excluded_iterator_call_stats
+ .num_leaf_advance_calls_integer_index,
+ to_be_excluded_iterator_call_stats.num_leaf_advance_calls_no_index +
+ all_leaf_advance_calls,
+ to_be_excluded_iterator_call_stats.num_blocks_inspected));
}
TEST(DocHitInfoIteratorNotTest, SectionIdsAlwaysNone) {
@@ -155,6 +154,17 @@ TEST(DocHitInfoIteratorNotTest, SectionIdsAlwaysNone) {
DocHitInfo(0, kSectionIdMaskNone)));
}
+TEST(DocHitInfoIteratorNotTest, TrimNotIterator) {
+ std::vector<DocHitInfo> exclude_doc_hit_infos = {DocHitInfo(0)};
+ std::unique_ptr<DocHitInfoIterator> to_be_excluded_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(exclude_doc_hit_infos);
+
+ DocHitInfoIteratorNot not_iterator(std::move(to_be_excluded_iterator),
+ /*document_id_limit=*/5);
+ EXPECT_THAT(std::move(not_iterator).TrimRightMostNode(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
} // namespace
} // namespace lib
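A minimal sketch (not icing's actual CallStats, whose definition lives in doc-hit-info-iterator.h) of the aggregation the GetCallStats test above asserts: field-wise addition, with the AllDocumentId iterator contributing only no-index leaf advance calls.

#include <cstdint>
#include <iostream>

struct CallStatsSketch {
  int32_t num_leaf_advance_calls_lite_index;
  int32_t num_leaf_advance_calls_main_index;
  int32_t num_leaf_advance_calls_integer_index;
  int32_t num_leaf_advance_calls_no_index;
  int32_t num_blocks_inspected;

  CallStatsSketch& operator+=(const CallStatsSketch& other) {
    num_leaf_advance_calls_lite_index +=
        other.num_leaf_advance_calls_lite_index;
    num_leaf_advance_calls_main_index +=
        other.num_leaf_advance_calls_main_index;
    num_leaf_advance_calls_integer_index +=
        other.num_leaf_advance_calls_integer_index;
    num_leaf_advance_calls_no_index += other.num_leaf_advance_calls_no_index;
    num_blocks_inspected += other.num_blocks_inspected;
    return *this;
  }
};

int main() {
  CallStatsSketch excluded = {2, 5, 3, 1, 4};  // the dummy iterator's stats
  // The AllDocumentId iterator makes one no-index leaf advance per id in
  // [document_id_limit, 0]; with document_id_limit = 5 that is 6 calls.
  CallStatsSketch all_document_id = {0, 0, 0, 6, 0};
  excluded += all_document_id;
  std::cout << excluded.num_leaf_advance_calls_no_index << " "  // prints 7
            << excluded.num_blocks_inspected << "\n";           // prints 4
  return 0;
}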
diff --git a/icing/index/iterator/doc-hit-info-iterator-or.cc b/icing/index/iterator/doc-hit-info-iterator-or.cc
index 9d18753..6251365 100644
--- a/icing/index/iterator/doc-hit-info-iterator-or.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-or.cc
@@ -20,7 +20,9 @@
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/store/document-id.h"
+#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
@@ -29,8 +31,6 @@ namespace {
// When combining Or iterators, n-ary operator has better performance when
// number of operands > 2 according to benchmark cl/243321264
-// TODO (samzheng): Tune this number when it's necessary, e.g. implementation
-// changes.
constexpr int kBinaryOrIteratorPerformanceThreshold = 2;
} // namespace
@@ -59,6 +59,26 @@ DocHitInfoIteratorOr::DocHitInfoIteratorOr(
std::unique_ptr<DocHitInfoIterator> right_it)
: left_(std::move(left_it)), right_(std::move(right_it)) {}
+libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
+DocHitInfoIteratorOr::TrimRightMostNode() && {
+ // Trim the whole OR iterator. Only keep the prefix of the right iterator.
+ //
+ // The OR operator has higher priority, so it is not possible for the
+ // right-most child of a nested iterator to hold an unfinished prefix that
+ // we need to search suggestions for.
+ //
+ // e.g. `foo OR (bar baz)` is not valid for search suggestion since there
+ // is no unfinished last term to be filled.
+ //
+ // If we need to trim an OR iterator for search suggestion, the right child
+ // must be the last term. We don't need left-side information to generate
+ // a suggestion for the right side.
+ ICING_ASSIGN_OR_RETURN(TrimmedNode trimmed_right,
+ std::move(*right_).TrimRightMostNode());
+ trimmed_right.iterator_ = nullptr;
+ return trimmed_right;
+}
+
libtextclassifier3::Status DocHitInfoIteratorOr::Advance() {
// Cache the document_id of the left iterator for comparison to the right.
DocumentId orig_left_document_id = left_document_id_;
@@ -94,7 +114,6 @@ libtextclassifier3::Status DocHitInfoIteratorOr::Advance() {
right_document_id_ == kInvalidDocumentId) {
// Reached the end, set these to invalid values and return
doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
- hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
return absl_ports::ResourceExhaustedError(
"No more DocHitInfos in iterator");
}
@@ -110,27 +129,19 @@ libtextclassifier3::Status DocHitInfoIteratorOr::Advance() {
} else {
chosen = left_.get();
}
+ current_ = chosen;
doc_hit_info_ = chosen->doc_hit_info();
- hit_intersect_section_ids_mask_ = chosen->hit_intersect_section_ids_mask();
// If equal, combine.
if (left_document_id_ == right_document_id_) {
- doc_hit_info_.MergeSectionsFrom(right_->doc_hit_info());
- hit_intersect_section_ids_mask_ &= right_->hit_intersect_section_ids_mask();
+ doc_hit_info_.MergeSectionsFrom(
+ right_->doc_hit_info().hit_section_ids_mask());
}
return libtextclassifier3::Status::OK;
}
-int32_t DocHitInfoIteratorOr::GetNumBlocksInspected() const {
- return left_->GetNumBlocksInspected() + right_->GetNumBlocksInspected();
-}
-
-int32_t DocHitInfoIteratorOr::GetNumLeafAdvanceCalls() const {
- return left_->GetNumLeafAdvanceCalls() + right_->GetNumLeafAdvanceCalls();
-}
-
std::string DocHitInfoIteratorOr::ToString() const {
return absl_ports::StrCat("(", left_->ToString(), " OR ", right_->ToString(),
")");
@@ -140,7 +151,28 @@ DocHitInfoIteratorOrNary::DocHitInfoIteratorOrNary(
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators)
: iterators_(std::move(iterators)) {}
+libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
+DocHitInfoIteratorOrNary::TrimRightMostNode() && {
+ // Trim the whole OR iterator.
+ //
+ // The OR operator has higher priority, so it is not possible for the
+ // right-most child of a nested iterator to hold an unfinished prefix that
+ // we need to search suggestions for.
+ //
+ // e.g. `foo OR (bar baz)` is not valid for search suggestion since there
+ // is no unfinished last term to be filled.
+ //
+ // If we need to trim an OR iterator for search suggestion, the right-most
+ // child must be the last term. We don't need left-side information to
+ // generate a suggestion for the right side.
+ ICING_ASSIGN_OR_RETURN(TrimmedNode trimmed_right,
+ std::move(*iterators_.back()).TrimRightMostNode());
+ trimmed_right.iterator_ = nullptr;
+ return trimmed_right;
+}
+
libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() {
+ current_iterators_.clear();
if (iterators_.size() < 2) {
return absl_ports::InvalidArgumentError(
"Not enough iterators to OR together");
@@ -150,7 +182,6 @@ libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() {
// 0 is the smallest (last) DocumentId, can't advance further. Reset to
// invalid values and return directly
doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
- hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
return absl_ports::ResourceExhaustedError(
"No more DocHitInfos in iterator");
}
@@ -180,43 +211,31 @@ libtextclassifier3::Status DocHitInfoIteratorOrNary::Advance() {
// None of the iterators had a next document_id, reset to invalid values and
// return
doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
- hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
return absl_ports::ResourceExhaustedError(
"No more DocHitInfos in iterator");
}
// Found the next hit DocumentId, now calculate the section info.
- hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
for (const auto& iterator : iterators_) {
if (iterator->doc_hit_info().document_id() == next_document_id) {
+ current_iterators_.push_back(iterator.get());
if (doc_hit_info_.document_id() == kInvalidDocumentId) {
doc_hit_info_ = iterator->doc_hit_info();
- hit_intersect_section_ids_mask_ =
- iterator->hit_intersect_section_ids_mask();
} else {
- doc_hit_info_.MergeSectionsFrom(iterator->doc_hit_info());
- hit_intersect_section_ids_mask_ &=
- iterator->hit_intersect_section_ids_mask();
+ doc_hit_info_.MergeSectionsFrom(
+ iterator->doc_hit_info().hit_section_ids_mask());
}
}
}
return libtextclassifier3::Status::OK;
}
-int32_t DocHitInfoIteratorOrNary::GetNumBlocksInspected() const {
- int32_t blockCount = 0;
- for (const auto& iter : iterators_) {
- blockCount += iter->GetNumBlocksInspected();
- }
- return blockCount;
-}
-
-int32_t DocHitInfoIteratorOrNary::GetNumLeafAdvanceCalls() const {
- int32_t leafCount = 0;
+DocHitInfoIterator::CallStats DocHitInfoIteratorOrNary::GetCallStats() const {
+ CallStats call_stats;
for (const auto& iter : iterators_) {
- leafCount += iter->GetNumLeafAdvanceCalls();
+ call_stats += iter->GetCallStats();
}
- return leafCount;
+ return call_stats;
}
std::string DocHitInfoIteratorOrNary::ToString() const {
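A tiny sketch of the section-mask bookkeeping in the rewritten Advance(): when both children land on the same DocumentId, the merged hit_section_ids_mask is the bitwise OR of the children's masks (the AND-style hit_intersect_section_ids_mask_ has been dropped). The masks below come from the SectionIdMask test in this change.

#include <cstdint>
#include <iostream>

int main() {
  uint64_t left_mask = 0b01010101;   // hits in sections 0, 2, 4, 6
  uint64_t right_mask = 0b00000110;  // hits in sections 1, 2
  uint64_t merged = left_mask | right_mask;
  std::cout << (merged == 0b01010111) << "\n";  // prints 1
  return 0;
}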
diff --git a/icing/index/iterator/doc-hit-info-iterator-or.h b/icing/index/iterator/doc-hit-info-iterator-or.h
index 4128e0f..8c0427b 100644
--- a/icing/index/iterator/doc-hit-info-iterator-or.h
+++ b/icing/index/iterator/doc-hit-info-iterator-or.h
@@ -16,7 +16,9 @@
#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_OR_H_
#include <cstdint>
+#include <memory>
#include <string>
+#include <utility>
#include "icing/index/iterator/doc-hit-info-iterator.h"
@@ -34,17 +36,44 @@ class DocHitInfoIteratorOr : public DocHitInfoIterator {
explicit DocHitInfoIteratorOr(std::unique_ptr<DocHitInfoIterator> left_it,
std::unique_ptr<DocHitInfoIterator> right_it);
- libtextclassifier3::Status Advance() override;
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
- int32_t GetNumBlocksInspected() const override;
+ libtextclassifier3::Status Advance() override;
- int32_t GetNumLeafAdvanceCalls() const override;
+ CallStats GetCallStats() const override {
+ return left_->GetCallStats() + right_->GetCallStats();
+ }
std::string ToString() const override;
+ void MapChildren(const ChildrenMapper &mapper) override {
+ left_ = mapper(std::move(left_));
+ right_ = mapper(std::move(right_));
+ }
+
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo> *matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ current_->PopulateMatchedTermsStats(matched_terms_stats,
+ filtering_section_mask);
+ // If equal, then current_ == left_. Combine with results from right_.
+ if (left_document_id_ == right_document_id_) {
+ right_->PopulateMatchedTermsStats(matched_terms_stats,
+ filtering_section_mask);
+ }
+ }
+
private:
std::unique_ptr<DocHitInfoIterator> left_;
std::unique_ptr<DocHitInfoIterator> right_;
+ // Pointer to the chosen iterator that points to the current doc_hit_info_.
+ // If both left_ and right_ point to the same docid, then current_ == left_.
+ // current_ does not own the iterator it points to.
+ DocHitInfoIterator *current_;
DocumentId left_document_id_ = kMaxDocumentId;
DocumentId right_document_id_ = kMaxDocumentId;
};
@@ -57,16 +86,38 @@ class DocHitInfoIteratorOrNary : public DocHitInfoIterator {
explicit DocHitInfoIteratorOrNary(
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators);
- libtextclassifier3::Status Advance() override;
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
- int32_t GetNumBlocksInspected() const override;
+ libtextclassifier3::Status Advance() override;
- int32_t GetNumLeafAdvanceCalls() const override;
+ CallStats GetCallStats() const override;
std::string ToString() const override;
+ void MapChildren(const ChildrenMapper &mapper) override {
+ for (int i = 0; i < iterators_.size(); ++i) {
+ iterators_[i] = mapper(std::move(iterators_[i]));
+ }
+ }
+
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo> *matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ for (size_t i = 0; i < current_iterators_.size(); i++) {
+ current_iterators_.at(i)->PopulateMatchedTermsStats(
+ matched_terms_stats, filtering_section_mask);
+ }
+ }
+
private:
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators_;
+ // Pointers to the iterators that point to the current doc_hit_info_.
+ // current_iterators_ does not own the iterators it points to.
+ std::vector<DocHitInfoIterator *> current_iterators_;
};
} // namespace lib
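A minimal sketch (stand-in types) of the current_iterators_ bookkeeping declared above: on each Advance(), DocHitInfoIteratorOrNary records which children matched the chosen DocumentId, and PopulateMatchedTermsStats() then consults only those children.

#include <iostream>
#include <string>
#include <vector>

struct ChildSketch {
  std::string term;
  int current_doc_id;
  void PopulateMatchedTermsStats(std::vector<std::string>* stats) const {
    stats->push_back(term);  // real code reports per-section frequencies too
  }
};

int main() {
  std::vector<ChildSketch> children = {{"hi", 10}, {"hello", 10}, {"ciao", 9}};

  // Advance(): DocumentIds come back in decreasing order, so the next doc id
  // is the largest among the children, here 10. Remember who matched it.
  int next_document_id = 10;
  std::vector<const ChildSketch*> current_iterators;
  for (const ChildSketch& child : children) {
    if (child.current_doc_id == next_document_id) {
      current_iterators.push_back(&child);
    }
  }

  // PopulateMatchedTermsStats(): delegate only to the matching children.
  std::vector<std::string> matched_terms_stats;
  for (const ChildSketch* child : current_iterators) {
    child->PopulateMatchedTermsStats(&matched_terms_stats);
  }
  for (const std::string& term : matched_terms_stats) {
    std::cout << term << " ";  // prints "hi hello"
  }
  std::cout << "\n";
  return 0;
}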
diff --git a/icing/index/iterator/doc-hit-info-iterator-or_test.cc b/icing/index/iterator/doc-hit-info-iterator-or_test.cc
index 3faa5ab..d198b53 100644
--- a/icing/index/iterator/doc-hit-info-iterator-or_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-or_test.cc
@@ -19,7 +19,6 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/index/iterator/doc-hit-info-iterator-and.h"
#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/schema/section.h"
@@ -33,6 +32,7 @@ namespace {
using ::testing::ElementsAre;
using ::testing::Eq;
+using ::testing::IsEmpty;
TEST(CreateAndIteratorTest, Or) {
// Basic test that we can create a working Or iterator. Further testing of
@@ -73,38 +73,33 @@ TEST(DocHitInfoIteratorOrTest, Initialize) {
std::make_unique<DocHitInfoIteratorDummy>());
// We start out with invalid values
- EXPECT_THAT(or_iter.doc_hit_info(), Eq(DocHitInfo(kInvalidDocumentId)));
- EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(kSectionIdMaskNone));
+ EXPECT_THAT(or_iter.doc_hit_info(),
+ EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>{}));
}
-TEST(DocHitInfoIteratorOrTest, GetNumBlocksInspected) {
- int first_iter_blocks = 4; // arbitrary value
+TEST(DocHitInfoIteratorOrTest, GetCallStats) {
+ DocHitInfoIterator::CallStats first_iter_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/2,
+ /*num_leaf_advance_calls_main_index_in=*/5,
+ /*num_leaf_advance_calls_integer_index_in=*/3,
+ /*num_leaf_advance_calls_no_index_in=*/1,
+ /*num_blocks_inspected_in=*/4); // arbitrary value
auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
- first_iter->SetNumBlocksInspected(first_iter_blocks);
-
- int second_iter_blocks = 7; // arbitrary value
- auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
- second_iter->SetNumBlocksInspected(second_iter_blocks);
-
- DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
-
- EXPECT_THAT(or_iter.GetNumBlocksInspected(),
- Eq(first_iter_blocks + second_iter_blocks));
-}
-
-TEST(DocHitInfoIteratorOrTest, GetNumLeafAdvanceCalls) {
- int first_iter_leaves = 4; // arbitrary value
- auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
- first_iter->SetNumLeafAdvanceCalls(first_iter_leaves);
-
- int second_iter_leaves = 7; // arbitrary value
+ first_iter->SetCallStats(first_iter_call_stats);
+
+ DocHitInfoIterator::CallStats second_iter_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/6,
+ /*num_leaf_advance_calls_main_index_in=*/2,
+ /*num_leaf_advance_calls_integer_index_in=*/10,
+ /*num_leaf_advance_calls_no_index_in=*/3,
+ /*num_blocks_inspected_in=*/7); // arbitrary value
auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
- second_iter->SetNumLeafAdvanceCalls(second_iter_leaves);
+ second_iter->SetCallStats(second_iter_call_stats);
DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
- EXPECT_THAT(or_iter.GetNumLeafAdvanceCalls(),
- Eq(first_iter_leaves + second_iter_leaves));
+ EXPECT_THAT(or_iter.GetCallStats(),
+ Eq(first_iter_call_stats + second_iter_call_stats));
}
TEST(DocHitInfoIteratorOrTest, Advance) {
@@ -155,24 +150,200 @@ TEST(DocHitInfoIteratorOrTest, SectionIdMask) {
// Created to test correct section_id_mask behavior.
SectionIdMask section_id_mask1 = 0b01010101; // hits in sections 0, 2, 4, 6
SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2
- SectionIdMask mask_anded_result = 0b00000100;
SectionIdMask mask_ored_result = 0b01010111;
std::vector<DocHitInfo> first_vector = {DocHitInfo(4, section_id_mask1)};
std::vector<DocHitInfo> second_vector = {DocHitInfo(4, section_id_mask2)};
auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector);
- first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+ first_iter->set_hit_section_ids_mask(section_id_mask1);
auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector);
- second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+ second_iter->set_hit_section_ids_mask(section_id_mask2);
DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
ICING_EXPECT_OK(or_iter.Advance());
EXPECT_THAT(or_iter.doc_hit_info().hit_section_ids_mask(),
Eq(mask_ored_result));
- EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
+}
+
+TEST(DocHitInfoIteratorOrTest, PopulateMatchedTermsStats) {
+ {
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(4);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+ doc_hit_info1.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/3);
+ doc_hit_info1.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+ SectionIdMask section_id_mask1 = 0b01010101; // hits in sections 0, 2, 4, 6
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1 = {{0, 1}, {2, 2}, {4, 3}, {6, 4}};
+
+ DocHitInfoTermFrequencyPair doc_hit_info2 = DocHitInfo(4);
+ doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+ doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+ SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map2 = {{1, 2}, {2, 6}};
+
+ std::vector<DocHitInfoTermFrequencyPair> first_vector = {doc_hit_info1};
+ std::vector<DocHitInfoTermFrequencyPair> second_vector = {doc_hit_info2};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ first_iter->set_hit_section_ids_mask(section_id_mask1);
+
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+ second_iter->set_hit_section_ids_mask(section_id_mask2);
+
+ DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(4));
+
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(
+ EqualsTermMatchInfo("hi", expected_section_ids_tf_map1),
+ EqualsTermMatchInfo("hello", expected_section_ids_tf_map2)));
+
+ EXPECT_FALSE(or_iter.Advance().ok());
+ }
+ {
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(4);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+ SectionIdMask section_id_mask1 = 0b00000101; // hits in sections 0, 2
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1 = {{0, 1}, {2, 2}};
+
+ std::vector<DocHitInfoTermFrequencyPair> first_vector = {doc_hit_info1};
+ std::vector<DocHitInfoTermFrequencyPair> second_vector = {doc_hit_info1};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ first_iter->set_hit_section_ids_mask(section_id_mask1);
+
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hi");
+ second_iter->set_hit_section_ids_mask(section_id_mask1);
+
+ DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(4));
+
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "hi", expected_section_ids_tf_map1)));
+ EXPECT_FALSE(or_iter.Advance().ok());
+ }
+ {
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(4);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+ doc_hit_info1.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/3);
+ doc_hit_info1.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+ SectionIdMask section_id_mask1 = 0b01010101; // hits in sections 0, 2, 4, 6
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1 = {{0, 1}, {2, 2}, {4, 3}, {6, 4}};
+
+ DocHitInfoTermFrequencyPair doc_hit_info2 = DocHitInfo(5);
+ doc_hit_info2.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+ doc_hit_info2.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+ SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map2 = {{1, 2}, {2, 6}};
+
+ std::vector<DocHitInfoTermFrequencyPair> first_vector = {doc_hit_info1};
+ std::vector<DocHitInfoTermFrequencyPair> second_vector = {doc_hit_info2};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ first_iter->set_hit_section_ids_mask(section_id_mask1);
+
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+ second_iter->set_hit_section_ids_mask(section_id_mask2);
+
+ DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(5));
+
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats,
+ ElementsAre(EqualsTermMatchInfo("hello",
+ expected_section_ids_tf_map2)));
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(4));
+
+ matched_terms_stats.clear();
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "hi", expected_section_ids_tf_map1)));
+
+ EXPECT_FALSE(or_iter.Advance().ok());
+ }
+}
+
+TEST(DocHitInfoIteratorOrTest, TrimOrIterator) {
+ std::vector<DocHitInfo> first_vector = {DocHitInfo(0)};
+ std::vector<DocHitInfo> second_vector = {DocHitInfo(1)};
+
+ std::unique_ptr<DocHitInfoIterator> first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector);
+ std::unique_ptr<DocHitInfoIterator> second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "term", 10);
+
+ DocHitInfoIteratorOr or_iter(std::move(first_iter), std::move(second_iter));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocHitInfoIterator::TrimmedNode trimmed_node,
+ std::move(or_iter).TrimRightMostNode());
+ // The whole iterator is trimmed
+ ASSERT_TRUE(trimmed_node.iterator_ == nullptr);
+ ASSERT_THAT(trimmed_node.term_, Eq("term"));
+ ASSERT_THAT(trimmed_node.term_start_index_, Eq(10));
+}
+
+TEST(DocHitInfoIteratorOrNaryTest, TrimOrNaryIterator) {
+ std::vector<DocHitInfo> first_vector = {DocHitInfo(0)};
+ std::vector<DocHitInfo> second_vector = {DocHitInfo(1)};
+ std::vector<DocHitInfo> third_vector = {DocHitInfo(2)};
+ std::vector<DocHitInfo> fourth_vector = {DocHitInfo(3)};
+
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(first_vector));
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(second_vector));
+ iterators.push_back(std::make_unique<DocHitInfoIteratorDummy>(third_vector));
+ iterators.push_back(
+ std::make_unique<DocHitInfoIteratorDummy>(fourth_vector, "term", 10));
+ DocHitInfoIteratorOrNary or_iter(std::move(iterators));
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocHitInfoIterator::TrimmedNode trimmed_node,
+ std::move(or_iter).TrimRightMostNode());
+ // The whole iterator is trimmed
+ ASSERT_TRUE(trimmed_node.iterator_ == nullptr);
+ ASSERT_THAT(trimmed_node.term_, Eq("term"));
+ ASSERT_THAT(trimmed_node.term_start_index_, Eq(10));
}
TEST(DocHitInfoIteratorOrNaryTest, Initialize) {
@@ -184,8 +355,8 @@ TEST(DocHitInfoIteratorOrNaryTest, Initialize) {
DocHitInfoIteratorOrNary or_iter(std::move(iterators));
// We start out with invalid values
- EXPECT_THAT(or_iter.doc_hit_info(), Eq(DocHitInfo(kInvalidDocumentId)));
- EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(kSectionIdMaskNone));
+ EXPECT_THAT(or_iter.doc_hit_info(),
+ EqualsDocHitInfo(kInvalidDocumentId, std::vector<SectionId>{}));
}
TEST(DocHitInfoIteratorOrNaryTest, InitializeEmpty) {
@@ -198,51 +369,42 @@ TEST(DocHitInfoIteratorOrNaryTest, InitializeEmpty) {
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST(DocHitInfoIteratorOrNaryTest, GetNumBlocksInspected) {
- int first_iter_blocks = 4; // arbitrary value
- auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
- first_iter->SetNumBlocksInspected(first_iter_blocks);
-
- int second_iter_blocks = 7; // arbitrary value
- auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
- second_iter->SetNumBlocksInspected(second_iter_blocks);
-
- int third_iter_blocks = 13; // arbitrary value
- auto third_iter = std::make_unique<DocHitInfoIteratorDummy>();
- third_iter->SetNumBlocksInspected(third_iter_blocks);
-
- int fourth_iter_blocks = 1; // arbitrary value
- auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>();
- fourth_iter->SetNumBlocksInspected(fourth_iter_blocks);
-
- std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
- iterators.push_back(std::move(first_iter));
- iterators.push_back(std::move(second_iter));
- iterators.push_back(std::move(third_iter));
- iterators.push_back(std::move(fourth_iter));
- DocHitInfoIteratorOrNary or_iter(std::move(iterators));
-
- EXPECT_THAT(or_iter.GetNumBlocksInspected(),
- Eq(first_iter_blocks + second_iter_blocks + third_iter_blocks +
- fourth_iter_blocks));
-}
-
-TEST(DocHitInfoIteratorOrNaryTest, GetNumLeafAdvanceCalls) {
- int first_iter_leaves = 4; // arbitrary value
+TEST(DocHitInfoIteratorOrNaryTest, GetCallStats) {
+ DocHitInfoIterator::CallStats first_iter_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/2,
+ /*num_leaf_advance_calls_main_index_in=*/5,
+ /*num_leaf_advance_calls_integer_index_in=*/3,
+ /*num_leaf_advance_calls_no_index_in=*/1,
+ /*num_blocks_inspected_in=*/4); // arbitrary value
auto first_iter = std::make_unique<DocHitInfoIteratorDummy>();
- first_iter->SetNumLeafAdvanceCalls(first_iter_leaves);
-
- int second_iter_leaves = 7; // arbitrary value
+ first_iter->SetCallStats(first_iter_call_stats);
+
+ DocHitInfoIterator::CallStats second_iter_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/6,
+ /*num_leaf_advance_calls_main_index_in=*/2,
+ /*num_leaf_advance_calls_integer_index_in=*/10,
+ /*num_leaf_advance_calls_no_index_in=*/3,
+ /*num_blocks_inspected_in=*/7); // arbitrary value
auto second_iter = std::make_unique<DocHitInfoIteratorDummy>();
- second_iter->SetNumLeafAdvanceCalls(second_iter_leaves);
-
- int third_iter_leaves = 13; // arbitrary value
+ second_iter->SetCallStats(second_iter_call_stats);
+
+ DocHitInfoIterator::CallStats third_iter_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/1000,
+ /*num_leaf_advance_calls_main_index_in=*/2000,
+ /*num_leaf_advance_calls_integer_index_in=*/3000,
+ /*num_leaf_advance_calls_no_index_in=*/0,
+ /*num_blocks_inspected_in=*/200); // arbitrary value
auto third_iter = std::make_unique<DocHitInfoIteratorDummy>();
- third_iter->SetNumLeafAdvanceCalls(third_iter_leaves);
-
- int fourth_iter_leaves = 13; // arbitrary value
+ third_iter->SetCallStats(third_iter_call_stats);
+
+ DocHitInfoIterator::CallStats fourth_iter_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/200,
+ /*num_leaf_advance_calls_main_index_in=*/400,
+ /*num_leaf_advance_calls_integer_index_in=*/100,
+ /*num_leaf_advance_calls_no_index_in=*/20,
+ /*num_blocks_inspected_in=*/50); // arbitrary value
auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>();
- fourth_iter->SetNumLeafAdvanceCalls(fourth_iter_leaves);
+ fourth_iter->SetCallStats(fourth_iter_call_stats);
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
iterators.push_back(std::move(first_iter));
@@ -251,9 +413,9 @@ TEST(DocHitInfoIteratorOrNaryTest, GetNumLeafAdvanceCalls) {
iterators.push_back(std::move(fourth_iter));
DocHitInfoIteratorOrNary or_iter(std::move(iterators));
- EXPECT_THAT(or_iter.GetNumLeafAdvanceCalls(),
- Eq(first_iter_leaves + second_iter_leaves + third_iter_leaves +
- fourth_iter_leaves));
+ EXPECT_THAT(or_iter.GetCallStats(),
+ Eq(first_iter_call_stats + second_iter_call_stats +
+ third_iter_call_stats + fourth_iter_call_stats));
}
TEST(DocHitInfoIteratorOrNaryTest, Advance) {
@@ -282,7 +444,6 @@ TEST(DocHitInfoIteratorOrNaryTest, SectionIdMask) {
SectionIdMask section_id_mask2 = 0b00000110; // hits in sections 1, 2
SectionIdMask section_id_mask3 = 0b00001100; // hits in sections 2, 3
SectionIdMask section_id_mask4 = 0b00100100; // hits in sections 2, 5
- SectionIdMask mask_anded_result = 0b00000100;
SectionIdMask mask_ored_result = 0b01101111;
std::vector<DocHitInfo> first_vector = {DocHitInfo(4, section_id_mask1)};
@@ -291,16 +452,16 @@ TEST(DocHitInfoIteratorOrNaryTest, SectionIdMask) {
std::vector<DocHitInfo> fourth_vector = {DocHitInfo(4, section_id_mask4)};
auto first_iter = std::make_unique<DocHitInfoIteratorDummy>(first_vector);
- first_iter->set_hit_intersect_section_ids_mask(section_id_mask1);
+ first_iter->set_hit_section_ids_mask(section_id_mask1);
auto second_iter = std::make_unique<DocHitInfoIteratorDummy>(second_vector);
- second_iter->set_hit_intersect_section_ids_mask(section_id_mask2);
+ second_iter->set_hit_section_ids_mask(section_id_mask2);
auto third_iter = std::make_unique<DocHitInfoIteratorDummy>(third_vector);
- third_iter->set_hit_intersect_section_ids_mask(section_id_mask3);
+ third_iter->set_hit_section_ids_mask(section_id_mask3);
auto fourth_iter = std::make_unique<DocHitInfoIteratorDummy>(fourth_vector);
- fourth_iter->set_hit_intersect_section_ids_mask(section_id_mask4);
+ fourth_iter->set_hit_section_ids_mask(section_id_mask4);
std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
iterators.push_back(std::move(first_iter));
@@ -313,7 +474,108 @@ TEST(DocHitInfoIteratorOrNaryTest, SectionIdMask) {
ICING_EXPECT_OK(or_iter.Advance());
EXPECT_THAT(or_iter.doc_hit_info().hit_section_ids_mask(),
Eq(mask_ored_result));
- EXPECT_THAT(or_iter.hit_intersect_section_ids_mask(), Eq(mask_anded_result));
+}
+
+TEST(DocHitInfoIteratorOrNaryTest, PopulateMatchedTermsStats) {
+ // Arbitrary section ids/term frequencies for the documents in the
+ // DocHitInfoIterators.
+ // For term "hi", document 10 and 8
+ DocHitInfoTermFrequencyPair doc_hit_info1_hi = DocHitInfo(10);
+ doc_hit_info1_hi.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+ doc_hit_info1_hi.UpdateSection(/*section_id=*/6, /*hit_term_frequency=*/4);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1_hi = {{0, 1}, {2, 2}, {6, 4}};
+
+ DocHitInfoTermFrequencyPair doc_hit_info2_hi = DocHitInfo(8);
+ doc_hit_info2_hi.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/2);
+ doc_hit_info2_hi.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/6);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map2_hi = {{1, 2}, {2, 6}};
+
+ // For term "hello", document 10 and 9
+ DocHitInfoTermFrequencyPair doc_hit_info1_hello = DocHitInfo(10);
+ doc_hit_info1_hello.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2);
+ doc_hit_info1_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1_hello = {{0, 2}, {3, 3}};
+
+ DocHitInfoTermFrequencyPair doc_hit_info2_hello = DocHitInfo(9);
+ doc_hit_info2_hello.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/3);
+ doc_hit_info2_hello.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/2);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map2_hello = {{2, 3}, {3, 2}};
+
+ // For term "ciao", document 9 and 8
+ DocHitInfoTermFrequencyPair doc_hit_info1_ciao = DocHitInfo(9);
+ doc_hit_info1_ciao.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/2);
+ doc_hit_info1_ciao.UpdateSection(/*section_id=*/1, /*hit_term_frequency=*/3);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1_ciao = {{0, 2}, {1, 3}};
+
+ DocHitInfoTermFrequencyPair doc_hit_info2_ciao = DocHitInfo(8);
+ doc_hit_info2_ciao.UpdateSection(/*section_id=*/3, /*hit_term_frequency=*/3);
+ doc_hit_info2_ciao.UpdateSection(/*section_id=*/4, /*hit_term_frequency=*/2);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map2_ciao = {{3, 3}, {4, 2}};
+
+ std::vector<DocHitInfoTermFrequencyPair> first_vector = {doc_hit_info1_hi,
+ doc_hit_info2_hi};
+ std::vector<DocHitInfoTermFrequencyPair> second_vector = {
+ doc_hit_info1_hello, doc_hit_info2_hello};
+ std::vector<DocHitInfoTermFrequencyPair> third_vector = {doc_hit_info1_ciao,
+ doc_hit_info2_ciao};
+
+ auto first_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(first_vector, "hi");
+ auto second_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(second_vector, "hello");
+ auto third_iter =
+ std::make_unique<DocHitInfoIteratorDummy>(third_vector, "ciao");
+
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ iterators.push_back(std::move(first_iter));
+ iterators.push_back(std::move(second_iter));
+ iterators.push_back(std::move(third_iter));
+
+ DocHitInfoIteratorOrNary or_iter(std::move(iterators));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(10));
+
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(
+ EqualsTermMatchInfo("hi", expected_section_ids_tf_map1_hi),
+ EqualsTermMatchInfo("hello", expected_section_ids_tf_map1_hello)));
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(9));
+
+ matched_terms_stats.clear();
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(
+ EqualsTermMatchInfo("hello", expected_section_ids_tf_map2_hello),
+ EqualsTermMatchInfo("ciao", expected_section_ids_tf_map1_ciao)));
+
+ ICING_EXPECT_OK(or_iter.Advance());
+ EXPECT_THAT(or_iter.doc_hit_info().document_id(), Eq(8));
+
+ matched_terms_stats.clear();
+ or_iter.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(
+ EqualsTermMatchInfo("hi", expected_section_ids_tf_map2_hi),
+ EqualsTermMatchInfo("ciao", expected_section_ids_tf_map2_ciao)));
+
+ EXPECT_FALSE(or_iter.Advance().ok());
}
} // namespace
diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-document.cc b/icing/index/iterator/doc-hit-info-iterator-property-in-document.cc
new file mode 100644
index 0000000..e6a1c67
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-property-in-document.cc
@@ -0,0 +1,65 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-property-in-document.h"
+
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+DocHitInfoIteratorPropertyInDocument::DocHitInfoIteratorPropertyInDocument(
+ std::unique_ptr<DocHitInfoIterator> meta_hit_iterator)
+ : meta_hit_iterator_(std::move(meta_hit_iterator)) {}
+
+libtextclassifier3::Status DocHitInfoIteratorPropertyInDocument::Advance() {
+ while (meta_hit_iterator_->Advance().ok()) {
+ // Currently, the metadata hits added by PropertyExistenceIndexingHandler
+ // can only have a section id of 0, so the section mask has to be 1 << 0.
+ if (meta_hit_iterator_->doc_hit_info().hit_section_ids_mask() == (1 << 0)) {
+ doc_hit_info_ = meta_hit_iterator_->doc_hit_info();
+ // Hits returned by "hasProperty" should not be associated with any
+ // section.
+ doc_hit_info_.set_hit_section_ids_mask(/*section_id_mask=*/0);
+ return libtextclassifier3::Status::OK;
+ }
+ }
+
+ doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+ return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator");
+}
+
+libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
+DocHitInfoIteratorPropertyInDocument::TrimRightMostNode() && {
+ // Don't generate a suggestion if the last operator is this custom function.
+ return absl_ports::InvalidArgumentError(
+ "Cannot generate suggestion if the last term is hasProperty().");
+}
+
+std::string DocHitInfoIteratorPropertyInDocument::ToString() const {
+ return meta_hit_iterator_->ToString();
+}
+
+} // namespace lib
+} // namespace icing
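A small sketch of the post-processing in Advance() above: keep only delegate hits whose section mask is exactly 1 << 0 (the section id PropertyExistenceIndexingHandler currently assigns to metadata hits) and surface the survivors with an empty mask, since hasProperty() results carry no section association. The hit values are illustrative.

#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  // (document_id, hit_section_ids_mask) pairs from the delegate iterator.
  std::vector<std::pair<int, uint64_t>> meta_hits = {
      {7, 1 << 0}, {6, 1 << 2}, {5, 1 << 0}};

  for (auto& [document_id, mask] : meta_hits) {
    if (mask != (1 << 0)) {
      continue;  // not a recognized metadata hit
    }
    mask = 0;  // result carries no section association
    std::cout << "doc " << document_id << " mask " << mask << "\n";
  }
  // Prints "doc 7 mask 0" and "doc 5 mask 0"; document 6 is filtered out.
  return 0;
}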
diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-document.h b/icing/index/iterator/doc-hit-info-iterator-property-in-document.h
new file mode 100644
index 0000000..bb2c97a
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-property-in-document.h
@@ -0,0 +1,73 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_PROPERTY_IN_DOCUMENT_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_PROPERTY_IN_DOCUMENT_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// The iterator returned by the "hasProperty" function in the advanced query
+// language. It post-processes metadata hits added by
+// PropertyExistenceIndexingHandler: specifically, it filters out hits that
+// are not recognized as metadata and always sets hit_section_ids_mask to 0.
+//
+// It is marked as a subclass of DocHitInfoLeafIterator because section
+// restrictions should not be passed down to meta_hit_iterator_.
+class DocHitInfoIteratorPropertyInDocument : public DocHitInfoLeafIterator {
+ public:
+ explicit DocHitInfoIteratorPropertyInDocument(
+ std::unique_ptr<DocHitInfoIterator> meta_hit_iterator);
+
+ libtextclassifier3::Status Advance() override;
+
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
+
+ CallStats GetCallStats() const override {
+ return meta_hit_iterator_->GetCallStats();
+ }
+
+ std::string ToString() const override;
+
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ meta_hit_iterator_->PopulateMatchedTermsStats(matched_terms_stats,
+ filtering_section_mask);
+ }
+
+ private:
+ std::unique_ptr<DocHitInfoIterator> meta_hit_iterator_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_PROPERTY_IN_DOCUMENT_H_
diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-schema.cc b/icing/index/iterator/doc-hit-info-iterator-property-in-schema.cc
new file mode 100644
index 0000000..8b98302
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-property-in-schema.cc
@@ -0,0 +1,103 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-property-in-schema.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+DocHitInfoIteratorPropertyInSchema::DocHitInfoIteratorPropertyInSchema(
+ std::unique_ptr<DocHitInfoIterator> delegate,
+ const DocumentStore* document_store, const SchemaStore* schema_store,
+ std::set<std::string> target_sections, int64_t current_time_ms)
+ : delegate_(std::move(delegate)),
+ document_store_(*document_store),
+ schema_store_(*schema_store),
+ target_properties_(std::move(target_sections)),
+ current_time_ms_(current_time_ms) {}
+
+libtextclassifier3::Status DocHitInfoIteratorPropertyInSchema::Advance() {
+ doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+
+ // Maps from SchemaTypeId to a bool indicating whether or not the type
+ // defines any of the requested properties.
+ std::unordered_map<SchemaTypeId, bool> property_defined_types;
+ while (delegate_->Advance().ok()) {
+ DocumentId document_id = delegate_->doc_hit_info().document_id();
+ auto data_optional = document_store_.GetAliveDocumentFilterData(
+ document_id, current_time_ms_);
+ if (!data_optional) {
+ // Ran into some error retrieving information on this hit, skip
+ continue;
+ }
+
+ // Guaranteed that the DocumentFilterData exists at this point
+ SchemaTypeId schema_type_id = data_optional.value().schema_type_id();
+ bool valid_match = false;
+ auto itr = property_defined_types.find(schema_type_id);
+ if (itr != property_defined_types.end()) {
+ valid_match = itr->second;
+ } else {
+ for (const auto& property : target_properties_) {
+ if (schema_store_.IsPropertyDefinedInSchema(schema_type_id, property)) {
+ valid_match = true;
+ break;
+ }
+ }
+ property_defined_types[schema_type_id] = valid_match;
+ }
+
+ if (valid_match) {
+ doc_hit_info_ = delegate_->doc_hit_info();
+ return libtextclassifier3::Status::OK;
+ }
+
+ // The document's schema does not define any properties listed in
+ // target_properties_. Continue.
+ }
+
+ // Didn't find anything on the delegate iterator.
+ return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator");
+}
+
+libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
+DocHitInfoIteratorPropertyInSchema::TrimRightMostNode() && {
+ // Don't generate suggestion if the last operator is this custom function.
+ return absl_ports::InvalidArgumentError(
+ "Cannot generate suggestion if the last term is hasPropertyDefined().");
+}
+
+std::string DocHitInfoIteratorPropertyInSchema::ToString() const {
+ return absl_ports::StrCat("(", absl_ports::StrJoin(target_properties_, ","),
+ "): ", delegate_->ToString());
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-schema.h b/icing/index/iterator/doc-hit-info-iterator-property-in-schema.h
new file mode 100644
index 0000000..c16a1c4
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-property-in-schema.h
@@ -0,0 +1,80 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_PROPERTY_IN_SCHEMA_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_PROPERTY_IN_SCHEMA_H_
+
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+// An iterator that filters for DocHitInfos whose schema types define at
+// least one of the properties named in target_properties_.
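+// Backs the hasPropertyDefined() query function (see TrimRightMostNode).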
+class DocHitInfoIteratorPropertyInSchema : public DocHitInfoIterator {
+ public:
+ // Does not take any ownership, and all pointers must refer to valid objects
+  // that outlive the one constructed. The delegate should at minimum be
+ // a DocHitInfoIteratorAllDocumentId, but other optimizations are possible,
+ // cf. go/icing-property-in-schema-existence.
+ explicit DocHitInfoIteratorPropertyInSchema(
+ std::unique_ptr<DocHitInfoIterator> delegate,
+ const DocumentStore* document_store, const SchemaStore* schema_store,
+ std::set<std::string> target_sections, int64_t current_time_ms);
+
+ libtextclassifier3::Status Advance() override;
+
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
+
+ void MapChildren(const ChildrenMapper& mapper) override {
+ delegate_ = mapper(std::move(delegate_));
+ }
+
+ CallStats GetCallStats() const override { return delegate_->GetCallStats(); }
+
+ std::string ToString() const override;
+
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ delegate_->PopulateMatchedTermsStats(matched_terms_stats,
+ filtering_section_mask);
+ }
+
+ private:
+ std::unique_ptr<DocHitInfoIterator> delegate_;
+ const DocumentStore& document_store_;
+ const SchemaStore& schema_store_;
+
+ std::set<std::string> target_properties_;
+ int64_t current_time_ms_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_PROPERTY_IN_SCHEMA_H_
diff --git a/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc b/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc
new file mode 100644
index 0000000..3f5a0a7
--- /dev/null
+++ b/icing/index/iterator/doc-hit-info-iterator-property-in-schema_test.cc
@@ -0,0 +1,269 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/doc-hit-info-iterator-property-in-schema.h"
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+class DocHitInfoIteratorPropertyInSchemaTest : public ::testing::Test {
+ protected:
+ DocHitInfoIteratorPropertyInSchemaTest()
+ : test_dir_(GetTestTempDir() + "/icing") {}
+
+ void SetUp() override {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ document1_ = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("email")
+ .Build();
+ document2_ =
+ DocumentBuilder().SetKey("namespace", "uri2").SetSchema("note").Build();
+
+ indexed_section_0 = "indexedSection0";
+ unindexed_section_1 = "unindexedSection1";
+ not_defined_section_2 = "notDefinedSection2";
+
+ schema_ =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ // Add an indexed property so we generate section
+ // metadata on it
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(indexed_section_0)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(unindexed_section_1)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("note").AddProperty(
+ PropertyConfigBuilder()
+ .SetName(unindexed_section_1)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, test_dir_, &fake_clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(create_result.document_store);
+ }
+
+ void TearDown() override {
+ document_store_.reset();
+ schema_store_.reset();
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> document_store_;
+ const Filesystem filesystem_;
+ const std::string test_dir_;
+ std::string indexed_section_0;
+ std::string unindexed_section_1;
+ std::string not_defined_section_2;
+ SchemaProto schema_;
+ DocumentProto document1_;
+ DocumentProto document2_;
+ FakeClock fake_clock_;
+};
+
+TEST_F(DocHitInfoIteratorPropertyInSchemaTest,
+ AdvanceToDocumentWithIndexedProperty) {
+ // Populate the DocumentStore's FilterCache with this document's data
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(document1_));
+
+ auto original_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>(
+ document_store_->num_documents());
+
+ DocHitInfoIteratorPropertyInSchema property_defined_iterator(
+ std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      /*target_sections=*/{indexed_section_0},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ EXPECT_THAT(GetDocumentIds(&property_defined_iterator),
+ ElementsAre(document_id));
+
+ EXPECT_FALSE(property_defined_iterator.Advance().ok());
+}
+
+TEST_F(DocHitInfoIteratorPropertyInSchemaTest,
+ AdvanceToDocumentWithUnindexedProperty) {
+ // Populate the DocumentStore's FilterCache with this document's data
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(document1_));
+
+ auto original_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>(
+ document_store_->num_documents());
+
+ DocHitInfoIteratorPropertyInSchema property_defined_iterator(
+ std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      /*target_sections=*/{unindexed_section_1},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ EXPECT_THAT(GetDocumentIds(&property_defined_iterator),
+ ElementsAre(document_id));
+
+ EXPECT_FALSE(property_defined_iterator.Advance().ok());
+}
+
+TEST_F(DocHitInfoIteratorPropertyInSchemaTest, NoMatchWithUndefinedProperty) {
+ ICING_EXPECT_OK(document_store_->Put(document1_));
+
+ auto original_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>(
+ document_store_->num_documents());
+
+ DocHitInfoIteratorPropertyInSchema property_defined_iterator(
+ std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      /*target_sections=*/{not_defined_section_2},
+ fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_FALSE(property_defined_iterator.Advance().ok());
+}
+
+TEST_F(DocHitInfoIteratorPropertyInSchemaTest,
+ CorrectlySetsSectionIdMasksAndPopulatesTermMatchInfo) {
+ // Populate the DocumentStore's FilterCache with this document's data
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(document1_));
+
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ SectionIdMask original_section_id_mask = 0b00000101; // hits in sections 0, 2
+
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(document_id);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+
+ // Create a hit that was found in the indexed section
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos = {doc_hit_info1};
+
+ auto original_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "hi");
+ original_iterator->set_hit_section_ids_mask(original_section_id_mask);
+
+ DocHitInfoIteratorPropertyInSchema property_defined_iterator(
+ std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      /*target_sections=*/{indexed_section_0},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ std::vector<TermMatchInfo> matched_terms_stats;
+ property_defined_iterator.PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(property_defined_iterator.Advance());
+ EXPECT_THAT(property_defined_iterator.doc_hit_info().document_id(),
+ Eq(document_id));
+
+ // The expected mask is the same as the original mask, since the iterator
+ // should treat it as a pass-through.
+ SectionIdMask expected_section_id_mask = original_section_id_mask;
+ EXPECT_EQ(property_defined_iterator.doc_hit_info().hit_section_ids_mask(),
+ expected_section_id_mask);
+
+ property_defined_iterator.PopulateMatchedTermsStats(&matched_terms_stats);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{0, 1}, {2, 2}};
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "hi", expected_section_ids_tf_map)));
+
+ EXPECT_FALSE(property_defined_iterator.Advance().ok());
+}
+
+TEST_F(DocHitInfoIteratorPropertyInSchemaTest,
+ TrimRightMostNodeResultsInError) {
+ auto original_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>(
+ document_store_->num_documents());
+
+ DocHitInfoIteratorPropertyInSchema property_defined_iterator(
+ std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      /*target_sections=*/{indexed_section_0},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ EXPECT_THAT(std::move(property_defined_iterator).TrimRightMostNode(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(DocHitInfoIteratorPropertyInSchemaTest,
+ FindPropertyDefinedByMultipleTypes) {
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1_));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2_));
+ auto original_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>(
+ document_store_->num_documents());
+
+ DocHitInfoIteratorPropertyInSchema property_defined_iterator(
+ std::move(original_iterator), document_store_.get(), schema_store_.get(),
+      /*target_sections=*/{unindexed_section_1},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ EXPECT_THAT(GetDocumentIds(&property_defined_iterator),
+ ElementsAre(document_id2, document_id1));
+
+ EXPECT_FALSE(property_defined_iterator.Advance().ok());
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc
index 8acb91a..35dc0b9 100644
--- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.cc
@@ -16,98 +16,231 @@
#include <cstdint>
#include <memory>
+#include <set>
#include <string>
#include <string_view>
+#include <unordered_map>
#include <utility>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/absl_ports/str_join.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/iterator/section-restrict-data.h"
+#include "icing/proto/search.pb.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
+#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
-DocHitInfoIteratorSectionRestrict::DocHitInfoIteratorSectionRestrict(
- std::unique_ptr<DocHitInfoIterator> delegate,
- const DocumentStore* document_store, const SchemaStore* schema_store,
- std::string_view target_section)
- : delegate_(std::move(delegate)),
- document_store_(*document_store),
- schema_store_(*schema_store),
- target_section_(target_section) {}
+// An iterator that simply takes ownership of SectionRestrictData.
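+// ApplyRestrictions() shares one SectionRestrictData instance across all leaf
+// iterators in the tree; parking that instance here, at the root, keeps it
+// alive exactly as long as the tree itself.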
+class SectionRestrictDataHolderIterator : public DocHitInfoIterator {
+ public:
+ explicit SectionRestrictDataHolderIterator(
+ std::unique_ptr<DocHitInfoIterator> delegate,
+ std::unique_ptr<SectionRestrictData> data)
+ : delegate_(std::move(delegate)), data_(std::move(data)) {}
+
+ libtextclassifier3::Status Advance() override {
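+    // Pure pass-through: mirror the delegate's current hit and status.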
+ auto result = delegate_->Advance();
+ doc_hit_info_ = delegate_->doc_hit_info();
+ return result;
+ }
-libtextclassifier3::Status DocHitInfoIteratorSectionRestrict::Advance() {
- if (!delegate_->Advance().ok()) {
- // Didn't find anything on the delegate iterator.
- doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
- hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
- return absl_ports::ResourceExhaustedError(
- "No more DocHitInfos in iterator");
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override {
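+    // Trim the delegate, then re-wrap the trimmed tree so that data_ keeps
+    // living with it.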
+ ICING_ASSIGN_OR_RETURN(TrimmedNode trimmed_delegate,
+ std::move(*delegate_).TrimRightMostNode());
+ if (trimmed_delegate.iterator_ != nullptr) {
+ trimmed_delegate.iterator_ =
+ std::make_unique<SectionRestrictDataHolderIterator>(
+ std::move(trimmed_delegate.iterator_), std::move(data_));
+ }
+ return trimmed_delegate;
+ }
+
+ void MapChildren(const ChildrenMapper& mapper) override {
+ delegate_ = mapper(std::move(delegate_));
}
- DocumentId document_id = delegate_->doc_hit_info().document_id();
+ CallStats GetCallStats() const override { return delegate_->GetCallStats(); }
- SectionIdMask section_id_mask =
- delegate_->doc_hit_info().hit_section_ids_mask();
+ std::string ToString() const override { return delegate_->ToString(); }
- auto data_or = document_store_.GetDocumentFilterData(document_id);
- if (!data_or.ok()) {
- // Ran into some error retrieving information on this hit, skip
- return Advance();
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats,
+ SectionIdMask filtering_section_mask) const override {
+ return delegate_->PopulateMatchedTermsStats(matched_terms_stats,
+ filtering_section_mask);
}
- // Guaranteed that the DocumentFilterData exists at this point
- DocumentFilterData data = std::move(data_or).ValueOrDie();
- SchemaTypeId schema_type_id = data.schema_type_id();
-
- // A hit can be in multiple sections at once, need to check that at least one
- // of the confirmed section ids match the name of the target section
- while (section_id_mask != 0) {
- // There was a hit in this section id
- SectionId section_id = __builtin_ctz(section_id_mask);
-
- auto section_metadata_or =
- schema_store_.GetSectionMetadata(schema_type_id, section_id);
-
- if (section_metadata_or.ok()) {
- const SectionMetadata* section_metadata =
- section_metadata_or.ValueOrDie();
-
- if (section_metadata->path == target_section_) {
- // The hit was in the target section name, return OK/found
- doc_hit_info_ = delegate_->doc_hit_info();
- hit_intersect_section_ids_mask_ =
- delegate_->hit_intersect_section_ids_mask();
- return libtextclassifier3::Status::OK;
- }
- }
+ private:
+ std::unique_ptr<DocHitInfoIterator> delegate_;
+ std::unique_ptr<SectionRestrictData> data_;
+};
+
+DocHitInfoIteratorSectionRestrict::DocHitInfoIteratorSectionRestrict(
+ std::unique_ptr<DocHitInfoIterator> delegate, SectionRestrictData* data)
+ : delegate_(std::move(delegate)), data_(data) {}
- // Mark this section as checked
- section_id_mask &= ~(1U << section_id);
+std::unique_ptr<DocHitInfoIterator>
+DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::unique_ptr<DocHitInfoIterator> iterator,
+ const DocumentStore* document_store, const SchemaStore* schema_store,
+ std::set<std::string> target_sections, int64_t current_time_ms) {
+ std::unordered_map<std::string, std::set<std::string>> type_property_filters;
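+  // A single wildcard entry applies the same target sections to every schema
+  // type.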
+ type_property_filters[std::string(SchemaStore::kSchemaTypeWildcard)] =
+ std::move(target_sections);
+ auto data = std::make_unique<SectionRestrictData>(
+ document_store, schema_store, current_time_ms, type_property_filters);
+ std::unique_ptr<DocHitInfoIterator> result =
+ ApplyRestrictions(std::move(iterator), data.get());
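+  // Wrap the tree in a holder so that data stays alive as long as the tree.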
+ return std::make_unique<SectionRestrictDataHolderIterator>(std::move(result),
+ std::move(data));
+}
+
+std::unique_ptr<DocHitInfoIterator>
+DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::unique_ptr<DocHitInfoIterator> iterator,
+ const DocumentStore* document_store, const SchemaStore* schema_store,
+ const SearchSpecProto& search_spec, int64_t current_time_ms) {
+ std::unordered_map<std::string, std::set<std::string>> type_property_filters;
+ // TODO(b/294274922): Add support for polymorphism in type property filters.
+ for (const TypePropertyMask& type_property_mask :
+ search_spec.type_property_filters()) {
+ type_property_filters[type_property_mask.schema_type()] =
+ std::set<std::string>(type_property_mask.paths().begin(),
+ type_property_mask.paths().end());
}
+ auto data = std::make_unique<SectionRestrictData>(
+ document_store, schema_store, current_time_ms, type_property_filters);
+ std::unique_ptr<DocHitInfoIterator> result =
+ ApplyRestrictions(std::move(iterator), data.get());
+ return std::make_unique<SectionRestrictDataHolderIterator>(std::move(result),
+ std::move(data));
+}
- // Didn't find a matching section name for this hit, go to the next hit
- return Advance();
+std::unique_ptr<DocHitInfoIterator>
+DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::unique_ptr<DocHitInfoIterator> iterator, SectionRestrictData* data) {
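+  // Recursively rewrite the tree: each leaf iterator is wrapped in a
+  // DocHitInfoIteratorSectionRestrict, while inner nodes only remap their
+  // children.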
+ ChildrenMapper mapper;
+ mapper = [&data, &mapper](std::unique_ptr<DocHitInfoIterator> iterator)
+ -> std::unique_ptr<DocHitInfoIterator> {
+ if (iterator->is_leaf()) {
+ return std::make_unique<DocHitInfoIteratorSectionRestrict>(
+ std::move(iterator), data);
+ } else {
+ iterator->MapChildren(mapper);
+ return iterator;
+ }
+ };
+ return mapper(std::move(iterator));
}
-int32_t DocHitInfoIteratorSectionRestrict::GetNumBlocksInspected() const {
- return delegate_->GetNumBlocksInspected();
+libtextclassifier3::Status DocHitInfoIteratorSectionRestrict::Advance() {
+ doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
+ while (delegate_->Advance().ok()) {
+ DocumentId document_id = delegate_->doc_hit_info().document_id();
+
+ auto data_optional = data_->document_store().GetAliveDocumentFilterData(
+ document_id, data_->current_time_ms());
+ if (!data_optional) {
+      // The document is no longer alive (deleted or expired); skip this hit.
+ continue;
+ }
+
+ // Guaranteed that the DocumentFilterData exists at this point
+ SchemaTypeId schema_type_id = data_optional.value().schema_type_id();
+ auto schema_type_or = data_->schema_store().GetSchemaType(schema_type_id);
+ if (!schema_type_or.ok()) {
+ // Ran into error retrieving schema type, skip
+ continue;
+ }
+ const std::string* schema_type = std::move(schema_type_or).ValueOrDie();
+ SectionIdMask allowed_sections_mask =
+ data_->ComputeAllowedSectionsMask(*schema_type);
+
+ // A hit can be in multiple sections at once, need to check which of the
+ // section ids match the sections allowed by type_property_masks_. This can
+ // be done by doing a bitwise and of the section_id_mask in the doc hit and
+ // the allowed_sections_mask.
+ SectionIdMask section_id_mask =
+ delegate_->doc_hit_info().hit_section_ids_mask() &
+ allowed_sections_mask;
+
+ // Return this document if:
+ // - the sectionIdMask is not empty after applying property filters, or
+ // - no property filters apply for its schema type (allowed_sections_mask
+ // == kSectionIdMaskAll). This is needed to ensure that in case of empty
+ // query (which uses doc-hit-info-iterator-all-document-id), where
+ // section_id_mask is kSectionIdMaskNone, doc hits with no property
+    //   restrictions don't get filtered out. Doc hits for schema types for
+    //   which property filters are specified will still get filtered out.
+ if (allowed_sections_mask == kSectionIdMaskAll ||
+ section_id_mask != kSectionIdMaskNone) {
+ doc_hit_info_ = delegate_->doc_hit_info();
+ doc_hit_info_.set_hit_section_ids_mask(section_id_mask);
+ return libtextclassifier3::Status::OK;
+ }
+ // Didn't find a matching section name for this hit. Continue.
+ }
+
+ // Didn't find anything on the delegate iterator.
+ return absl_ports::ResourceExhaustedError("No more DocHitInfos in iterator");
}
-int32_t DocHitInfoIteratorSectionRestrict::GetNumLeafAdvanceCalls() const {
- return delegate_->GetNumLeafAdvanceCalls();
+libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
+DocHitInfoIteratorSectionRestrict::TrimRightMostNode() && {
+ ICING_ASSIGN_OR_RETURN(TrimmedNode trimmed_delegate,
+ std::move(*delegate_).TrimRightMostNode());
+ // TrimRightMostNode is only used by suggestion processor to process query
+ // expression, so an entry for wildcard should always be present in
+ // type_property_filters_ when code flow reaches here. If the InternalError
+ // below is returned, that means TrimRightMostNode hasn't been called in the
+ // right context.
+ const auto it = data_->type_property_filters().find("*");
+ if (it == data_->type_property_filters().end()) {
+ return absl_ports::InternalError(
+ "A wildcard entry should always be present in type property filters "
+ "whenever TrimRightMostNode() is called for "
+ "DocHitInfoIteratorSectionRestrict");
+ }
+ const std::set<std::string>& target_sections = it->second;
+ if (target_sections.empty()) {
+ return absl_ports::InternalError(
+ "Target sections should not be empty whenever TrimRightMostNode() is "
+ "called for DocHitInfoIteratorSectionRestrict");
+ }
+ if (trimmed_delegate.iterator_ == nullptr) {
+ // TODO(b/228240987): Update TrimmedNode and downstream code to handle
+ // multiple section restricts.
+ trimmed_delegate.target_section_ = std::move(*target_sections.begin());
+ return trimmed_delegate;
+ }
+ trimmed_delegate.iterator_ =
+ std::unique_ptr<DocHitInfoIteratorSectionRestrict>(
+ new DocHitInfoIteratorSectionRestrict(
+ std::move(trimmed_delegate.iterator_), std::move(data_)));
+ return std::move(trimmed_delegate);
}
std::string DocHitInfoIteratorSectionRestrict::ToString() const {
- return absl_ports::StrCat(target_section_, ": ", delegate_->ToString());
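+  // Renders as "{type1:path1,path2; type2:...}: <delegate string>".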
+ std::string output = "";
+ for (auto it = data_->type_property_filters().cbegin();
+ it != data_->type_property_filters().cend(); it++) {
+ std::string paths = absl_ports::StrJoin(it->second, ",");
+ output += (it->first) + ":" + (paths) + "; ";
+ }
+ std::string result = "{" + output.substr(0, output.size() - 2) + "}: ";
+ return absl_ports::StrCat(result, delegate_->ToString());
}
} // namespace lib
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
index ae5a896..387ff52 100644
--- a/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict.h
@@ -17,12 +17,18 @@
#include <cstdint>
#include <memory>
+#include <set>
#include <string>
-#include <string_view>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/iterator/section-restrict-data.h"
+#include "icing/proto/search.pb.h"
#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
namespace icing {
@@ -35,30 +41,65 @@ namespace lib {
// That class is meant to be applied to the root of a query tree and filter over
// all results at the end. This class is more used in the limited scope of a
// term or a small group of terms.
-class DocHitInfoIteratorSectionRestrict : public DocHitInfoIterator {
+class DocHitInfoIteratorSectionRestrict : public DocHitInfoLeafIterator {
public:
// Does not take any ownership, and all pointers must refer to valid objects
// that outlive the one constructed.
explicit DocHitInfoIteratorSectionRestrict(
- std::unique_ptr<DocHitInfoIterator> delegate,
+ std::unique_ptr<DocHitInfoIterator> delegate, SectionRestrictData* data);
+
+ // Methods that apply section restrictions to all DocHitInfoLeafIterator nodes
+ // inside the provided iterator tree, and return the root of the tree
+ // afterwards. These methods do not take any ownership for the raw pointer
+ // parameters, which must refer to valid objects that outlive the iterator
+ // returned.
+ static std::unique_ptr<DocHitInfoIterator> ApplyRestrictions(
+ std::unique_ptr<DocHitInfoIterator> iterator,
+ const DocumentStore* document_store, const SchemaStore* schema_store,
+ std::set<std::string> target_sections, int64_t current_time_ms);
+ static std::unique_ptr<DocHitInfoIterator> ApplyRestrictions(
+ std::unique_ptr<DocHitInfoIterator> iterator,
const DocumentStore* document_store, const SchemaStore* schema_store,
- std::string_view target_section);
+ const SearchSpecProto& search_spec, int64_t current_time_ms);
+ static std::unique_ptr<DocHitInfoIterator> ApplyRestrictions(
+ std::unique_ptr<DocHitInfoIterator> iterator, SectionRestrictData* data);
libtextclassifier3::Status Advance() override;
- int32_t GetNumBlocksInspected() const override;
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
- int32_t GetNumLeafAdvanceCalls() const override;
+ CallStats GetCallStats() const override { return delegate_->GetCallStats(); }
std::string ToString() const override;
+ // Note that the DocHitInfoIteratorSectionRestrict can only be applied at
+ // DocHitInfoLeafIterator, which can be a term iterator or another
+ // DocHitInfoIteratorSectionRestrict.
+ //
+ // To filter the matching sections, filtering_section_mask should be set to
+ // doc_hit_info_.hit_section_ids_mask() held in the outermost
+ // DocHitInfoIteratorSectionRestrict, which is equal to the intersection of
+ // all hit_section_ids_mask in the DocHitInfoIteratorSectionRestrict chain,
+ // since for any two section restrict iterators chained together, the outer
+ // one's hit_section_ids_mask is always a subset of the inner one's
+ // hit_section_ids_mask.
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
+ if (doc_hit_info_.document_id() == kInvalidDocumentId) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ delegate_->PopulateMatchedTermsStats(
+ matched_terms_stats,
+ /*filtering_section_mask=*/filtering_section_mask &
+ doc_hit_info_.hit_section_ids_mask());
+ }
+
private:
std::unique_ptr<DocHitInfoIterator> delegate_;
- const DocumentStore& document_store_;
- const SchemaStore& schema_store_;
-
- // Ensure that this does not outlive the underlying string value.
- std::string_view target_section_;
+ // Does not own.
+ SectionRestrictData* data_;
};
} // namespace lib
diff --git a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
index df79c6d..ee65fe1 100644
--- a/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
+++ b/icing/index/iterator/doc-hit-info-iterator-section-restrict_test.cc
@@ -15,6 +15,7 @@
#include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h"
#include <memory>
+#include <set>
#include <string>
+#include <unordered_map>
#include <utility>
#include <vector>
@@ -24,11 +25,13 @@
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator-and.h"
#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
@@ -46,6 +49,9 @@ using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::IsEmpty;
+constexpr SectionId kIndexedSectionId0 = 0;
+constexpr SectionId kIndexedSectionId1 = 1;
+
class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test {
protected:
DocHitInfoIteratorSectionRestrictTest()
@@ -53,33 +59,58 @@ class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test {
void SetUp() override {
filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
- document_ =
- DocumentBuilder().SetKey("namespace", "uri").SetSchema("email").Build();
-
- auto type_config = schema_.add_types();
- type_config->set_schema_type("email");
-
- // Add an indexed property so we generate section metadata on it
- auto property = type_config->add_properties();
- property->set_property_name(indexed_property_);
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ document1_ = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("email")
+ .Build();
+ document2_ = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("email")
+ .Build();
+ document3_ = DocumentBuilder()
+ .SetKey("namespace", "uri3")
+ .SetSchema("email")
+ .Build();
+
+ indexed_section_0 = "indexedSection0";
+ indexed_section_1 = "indexedSection1";
+ schema_ =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ // Add an indexed property so we generate section
+ // metadata on it
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(indexed_section_0)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(indexed_section_1)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
- // First and only indexed property, so it gets the first id of 0
- indexed_section_id_ = 0;
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ICING_ASSERT_OK(schema_store_->SetSchema(schema_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, test_dir_, &fake_clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(create_result.document_store);
}
void TearDown() override {
@@ -92,30 +123,87 @@ class DocHitInfoIteratorSectionRestrictTest : public ::testing::Test {
std::unique_ptr<DocumentStore> document_store_;
const Filesystem filesystem_;
const std::string test_dir_;
+ std::string indexed_section_0;
+ std::string indexed_section_1;
SchemaProto schema_;
- DocumentProto document_;
- const std::string indexed_property_ = "subject";
- int indexed_section_id_;
+ DocumentProto document1_;
+ DocumentProto document2_;
+ DocumentProto document3_;
FakeClock fake_clock_;
};
+TEST_F(DocHitInfoIteratorSectionRestrictTest,
+ PopulateMatchedTermsStats_IncludesHitWithMatchingSection) {
+ // Populate the DocumentStore's FilterCache with this document's data
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(document1_));
+
+ // Arbitrary section ids for the documents in the DocHitInfoIterators.
+ // Created to test correct section_id_mask behavior.
+ SectionIdMask original_section_id_mask = 0b00000101; // hits in sections 0, 2
+
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(document_id);
+ doc_hit_info1.UpdateSection(/*section_id=*/0, /*hit_term_frequency=*/1);
+ doc_hit_info1.UpdateSection(/*section_id=*/2, /*hit_term_frequency=*/2);
+
+ // Create a hit that was found in the indexed section
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos = {doc_hit_info1};
+
+ auto original_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "hi");
+ original_iterator->set_hit_section_ids_mask(original_section_id_mask);
+
+ // Filtering for the indexed section name (which has a section id of 0) should
+ // get a result.
+ std::unique_ptr<DocHitInfoIterator> section_restrict_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator), document_store_.get(),
+ schema_store_.get(),
+ /*target_sections=*/{indexed_section_0},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ std::vector<TermMatchInfo> matched_terms_stats;
+ section_restrict_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
+
+ ICING_EXPECT_OK(section_restrict_iterator->Advance());
+ EXPECT_THAT(section_restrict_iterator->doc_hit_info().document_id(),
+ Eq(document_id));
+  SectionIdMask expected_section_id_mask = 0b00000001; // hit in section 0
+ EXPECT_EQ(section_restrict_iterator->doc_hit_info().hit_section_ids_mask(),
+ expected_section_id_mask);
+
+ section_restrict_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{0, 1}};
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "hi", expected_section_ids_tf_map)));
+
+ EXPECT_FALSE(section_restrict_iterator->Advance().ok());
+}
+
TEST_F(DocHitInfoIteratorSectionRestrictTest, EmptyOriginalIterator) {
std::unique_ptr<DocHitInfoIterator> original_iterator_empty =
std::make_unique<DocHitInfoIteratorDummy>();
- DocHitInfoIteratorSectionRestrict filtered_iterator(
- std::move(original_iterator_empty), document_store_.get(),
- schema_store_.get(), /*target_section=*/"");
+ std::unique_ptr<DocHitInfoIterator> filtered_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator_empty), document_store_.get(),
+ schema_store_.get(), /*target_sections=*/std::set<std::string>(),
+ fake_clock_.GetSystemTimeMilliseconds());
- EXPECT_THAT(GetDocumentIds(&filtered_iterator), IsEmpty());
+ EXPECT_THAT(GetDocumentIds(filtered_iterator.get()), IsEmpty());
+ std::vector<TermMatchInfo> matched_terms_stats;
+ filtered_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
}
TEST_F(DocHitInfoIteratorSectionRestrictTest, IncludesHitWithMatchingSection) {
// Populate the DocumentStore's FilterCache with this document's data
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
- document_store_->Put(document_));
+ document_store_->Put(document1_));
- SectionIdMask section_id_mask = 1U << indexed_section_id_;
+ SectionIdMask section_id_mask = 1U << kIndexedSectionId0;
// Create a hit that was found in the indexed section
std::vector<DocHitInfo> doc_hit_infos = {
@@ -125,14 +213,107 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, IncludesHitWithMatchingSection) {
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
// Filtering for the indexed section name should get a result
- DocHitInfoIteratorSectionRestrict section_restrict_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- indexed_property_);
-
- EXPECT_THAT(GetDocumentIds(&section_restrict_iterator),
+ std::unique_ptr<DocHitInfoIterator> section_restrict_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator), document_store_.get(),
+ schema_store_.get(),
+ /*target_sections=*/{indexed_section_0},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ EXPECT_THAT(GetDocumentIds(section_restrict_iterator.get()),
ElementsAre(document_id));
}
+TEST_F(DocHitInfoIteratorSectionRestrictTest,
+ IncludesHitWithMultipleMatchingSectionsWithMultipleSectionRestricts) {
+ // Populate the DocumentStore's FilterCache with this document's data
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(document1_));
+
+ SectionIdMask section_id_mask = 1U << kIndexedSectionId0;
+ section_id_mask |= 1U << kIndexedSectionId1;
+
+ // Create a hit that was found in the indexed section
+ std::vector<DocHitInfo> doc_hit_infos = {
+ DocHitInfo(document_id, section_id_mask)};
+
+ std::unique_ptr<DocHitInfoIterator> original_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ // Filter for both target_sections
+ std::unique_ptr<DocHitInfoIterator> section_restrict_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator), document_store_.get(),
+ schema_store_.get(),
+ /*target_sections=*/{indexed_section_0, indexed_section_1},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ ICING_ASSERT_OK(section_restrict_iterator->Advance());
+ std::vector<SectionId> expected_section_ids = {kIndexedSectionId0,
+ kIndexedSectionId1};
+ EXPECT_THAT(section_restrict_iterator->doc_hit_info(),
+ EqualsDocHitInfo(document_id, expected_section_ids));
+}
+
+TEST_F(DocHitInfoIteratorSectionRestrictTest,
+ IncludesHitWithMultipleMatchingSectionsWithSingleSectionRestrict) {
+ // Populate the DocumentStore's FilterCache with this document's data
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(document1_));
+
+ SectionIdMask section_id_mask = 1U << kIndexedSectionId0;
+ section_id_mask |= 1U << kIndexedSectionId1;
+
+ // Create a hit that was found in the indexed section
+ std::vector<DocHitInfo> doc_hit_infos = {
+ DocHitInfo(document_id, section_id_mask)};
+
+ std::unique_ptr<DocHitInfoIterator> original_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+  // Filter for only one of the two indexed sections
+ std::unique_ptr<DocHitInfoIterator> section_restrict_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator), document_store_.get(),
+ schema_store_.get(),
+ /*target_sections=*/{indexed_section_1},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ ICING_ASSERT_OK(section_restrict_iterator->Advance());
+ std::vector<SectionId> expected_section_ids = {kIndexedSectionId1};
+ EXPECT_THAT(section_restrict_iterator->doc_hit_info(),
+ EqualsDocHitInfo(document_id, expected_section_ids));
+}
+
+TEST_F(DocHitInfoIteratorSectionRestrictTest,
+ IncludesHitWithSingleMatchingSectionsWithMultiSectionRestrict) {
+ // Populate the DocumentStore's FilterCache with this document's data
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(document1_));
+
+ SectionIdMask section_id_mask = 1U << kIndexedSectionId1;
+
+ // Create a hit that was found in the indexed section
+ std::vector<DocHitInfo> doc_hit_infos = {
+ DocHitInfo(document_id, section_id_mask)};
+
+ std::unique_ptr<DocHitInfoIterator> original_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ // Filter for both target_sections
+ std::unique_ptr<DocHitInfoIterator> section_restrict_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator), document_store_.get(),
+ schema_store_.get(),
+ /*target_sections=*/{indexed_section_0, indexed_section_1},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ ICING_ASSERT_OK(section_restrict_iterator->Advance());
+ std::vector<SectionId> expected_section_ids = {kIndexedSectionId1};
+ EXPECT_THAT(section_restrict_iterator->doc_hit_info(),
+ EqualsDocHitInfo(document_id, expected_section_ids));
+}
+
TEST_F(DocHitInfoIteratorSectionRestrictTest, NoMatchingDocumentFilterData) {
// Create a hit with a document id that doesn't exist in the DocumentStore yet
std::vector<DocHitInfo> doc_hit_infos = {DocHitInfo(/*document_id_in=*/0)};
@@ -141,20 +322,25 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest, NoMatchingDocumentFilterData) {
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
// Filtering for the indexed section name should get a result
- DocHitInfoIteratorSectionRestrict section_restrict_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- /*target_section=*/"");
-
- EXPECT_THAT(GetDocumentIds(&section_restrict_iterator), IsEmpty());
+ std::unique_ptr<DocHitInfoIterator> section_restrict_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator), document_store_.get(),
+ schema_store_.get(),
+ /*target_sections=*/{""}, fake_clock_.GetSystemTimeMilliseconds());
+
+ EXPECT_THAT(GetDocumentIds(section_restrict_iterator.get()), IsEmpty());
+ std::vector<TermMatchInfo> matched_terms_stats;
+ section_restrict_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
}
TEST_F(DocHitInfoIteratorSectionRestrictTest,
DoesntIncludeHitWithWrongSectionName) {
// Populate the DocumentStore's FilterCache with this document's data
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
- document_store_->Put(document_));
+ document_store_->Put(document1_));
- SectionIdMask section_id_mask = 1U << indexed_section_id_;
+ SectionIdMask section_id_mask = 1U << kIndexedSectionId0;
// Create a hit that was found in the indexed section
std::vector<DocHitInfo> doc_hit_infos = {
@@ -164,18 +350,24 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest,
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
// Filtering for the indexed section name should get a result
- DocHitInfoIteratorSectionRestrict section_restrict_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- "some_section_name");
-
- EXPECT_THAT(GetDocumentIds(&section_restrict_iterator), IsEmpty());
+ std::unique_ptr<DocHitInfoIterator> section_restrict_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator), document_store_.get(),
+ schema_store_.get(),
+ /*target_sections=*/{"some_section_name"},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ EXPECT_THAT(GetDocumentIds(section_restrict_iterator.get()), IsEmpty());
+ std::vector<TermMatchInfo> matched_terms_stats;
+ section_restrict_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
}
TEST_F(DocHitInfoIteratorSectionRestrictTest,
DoesntIncludeHitWithNoSectionIds) {
// Populate the DocumentStore's FilterCache with this document's data
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
- document_store_->Put(document_));
+ document_store_->Put(document1_));
// Create a hit that doesn't exist in any sections, so it shouldn't match any
// section filters
@@ -185,18 +377,24 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest,
std::unique_ptr<DocHitInfoIterator> original_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- DocHitInfoIteratorSectionRestrict section_restrict_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- indexed_property_);
-
- EXPECT_THAT(GetDocumentIds(&section_restrict_iterator), IsEmpty());
+ std::unique_ptr<DocHitInfoIterator> section_restrict_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator), document_store_.get(),
+ schema_store_.get(),
+ /*target_sections=*/{indexed_section_0},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ EXPECT_THAT(GetDocumentIds(section_restrict_iterator.get()), IsEmpty());
+ std::vector<TermMatchInfo> matched_terms_stats;
+ section_restrict_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
}
TEST_F(DocHitInfoIteratorSectionRestrictTest,
DoesntIncludeHitWithDifferentSectionId) {
// Populate the DocumentStore's FilterCache with this document's data
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
- document_store_->Put(document_));
+ document_store_->Put(document1_));
// Anything that's not 0, which is the indexed property
SectionId not_matching_section_id = 2;
@@ -204,38 +402,140 @@ TEST_F(DocHitInfoIteratorSectionRestrictTest,
// Create a hit that exists in a different section, so it shouldn't match any
// section filters
std::vector<DocHitInfo> doc_hit_infos = {
- DocHitInfo(document_id, kSectionIdMaskNone << not_matching_section_id)};
+ DocHitInfo(document_id, UINT64_C(1) << not_matching_section_id)};
std::unique_ptr<DocHitInfoIterator> original_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- DocHitInfoIteratorSectionRestrict section_restrict_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- indexed_property_);
-
- EXPECT_THAT(GetDocumentIds(&section_restrict_iterator), IsEmpty());
+ std::unique_ptr<DocHitInfoIterator> section_restrict_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator), document_store_.get(),
+ schema_store_.get(),
+ /*target_sections=*/{indexed_section_0},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ EXPECT_THAT(GetDocumentIds(section_restrict_iterator.get()), IsEmpty());
+ std::vector<TermMatchInfo> matched_terms_stats;
+ section_restrict_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, IsEmpty());
}
-TEST_F(DocHitInfoIteratorSectionRestrictTest, GetNumBlocksInspected) {
+TEST_F(DocHitInfoIteratorSectionRestrictTest, GetCallStats) {
+ DocHitInfoIterator::CallStats original_call_stats(
+ /*num_leaf_advance_calls_lite_index_in=*/2,
+ /*num_leaf_advance_calls_main_index_in=*/5,
+ /*num_leaf_advance_calls_integer_index_in=*/3,
+ /*num_leaf_advance_calls_no_index_in=*/1,
+ /*num_blocks_inspected_in=*/4); // arbitrary value
auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>();
- original_iterator->SetNumBlocksInspected(5);
+ original_iterator->SetCallStats(original_call_stats);
- DocHitInfoIteratorSectionRestrict section_restrict_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- /*target_section=*/"");
+ std::unique_ptr<DocHitInfoIterator> section_restrict_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator), document_store_.get(),
+ schema_store_.get(),
+ /*target_sections=*/{""}, fake_clock_.GetSystemTimeMilliseconds());
- EXPECT_THAT(section_restrict_iterator.GetNumBlocksInspected(), Eq(5));
+ EXPECT_THAT(section_restrict_iterator->GetCallStats(),
+ Eq(original_call_stats));
}
-TEST_F(DocHitInfoIteratorSectionRestrictTest, GetNumLeafAdvanceCalls) {
- auto original_iterator = std::make_unique<DocHitInfoIteratorDummy>();
- original_iterator->SetNumLeafAdvanceCalls(6);
+TEST_F(DocHitInfoIteratorSectionRestrictTest,
+ TrimSectionRestrictIterator_TwoLayer) {
+ // Populate the DocumentStore's FilterCache with this document's data
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1_));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2_));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store_->Put(document3_));
+
+ // 0 is the indexed property
+ SectionId matching_section_id = 0;
+ // Anything that's not 0, which is the indexed property
+ SectionId not_matching_section_id = 2;
- DocHitInfoIteratorSectionRestrict section_restrict_iterator(
- std::move(original_iterator), document_store_.get(), schema_store_.get(),
- /*target_section=*/"");
+ // Build an iterator tree like:
+ // AND
+ // / \
+ // [1, 1],[2, 2] [3, 2]
+ std::vector<DocHitInfo> left_infos = {
+ DocHitInfo(document_id1, 1U << matching_section_id),
+ DocHitInfo(document_id2, 1U << not_matching_section_id)};
+ std::vector<DocHitInfo> right_infos = {
+ DocHitInfo(document_id3, 1U << not_matching_section_id)};
+
+ std::unique_ptr<DocHitInfoIterator> left_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(left_infos);
+ std::unique_ptr<DocHitInfoIterator> right_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(right_infos, "term", 10);
+ std::unique_ptr<DocHitInfoIterator> original_iterator =
+ std::make_unique<DocHitInfoIteratorAnd>(std::move(left_iterator),
+ std::move(right_iterator));
+
+ // After applying section restriction:
+ // AND
+ // / \
+ // Restrict Restrict
+ // | |
+ // [1, 1],[2, 2] [3, 2]
+ std::unique_ptr<DocHitInfoIterator> section_restrict_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator), document_store_.get(),
+ schema_store_.get(), {indexed_section_0},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ // The trimmed tree.
+ // Restrict
+ // |
+ // [1, 1],[2, 2]
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocHitInfoIterator::TrimmedNode node,
+ std::move(*section_restrict_iterator).TrimRightMostNode());
+
+ EXPECT_THAT(GetDocumentIds(node.iterator_.get()), ElementsAre(document_id1));
+ EXPECT_THAT(node.term_, Eq("term"));
+ EXPECT_THAT(node.term_start_index_, Eq(10));
+ EXPECT_THAT(node.target_section_, Eq(indexed_section_0));
+}
+
+TEST_F(DocHitInfoIteratorSectionRestrictTest, TrimSectionRestrictIterator) {
+ // Populate the DocumentStore's FilterCache with this document's data
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1_));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2_));
- EXPECT_THAT(section_restrict_iterator.GetNumLeafAdvanceCalls(), Eq(6));
+ // 0 is the indexed property
+ SectionId matching_section_id = 0;
+ // Anything that's not 0, which is the indexed property
+ SectionId not_matching_section_id = 2;
+
+  // Build an iterator tree like:
+ // Restrict
+ // |
+ // [1, 1],[2, 2]
+ std::vector<DocHitInfo> doc_infos = {
+ DocHitInfo(document_id1, 1U << matching_section_id),
+ DocHitInfo(document_id2, 1U << not_matching_section_id)};
+ std::unique_ptr<DocHitInfoIterator> original_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_infos, "term", 10);
+
+ std::unique_ptr<DocHitInfoIterator> section_restrict_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(original_iterator), document_store_.get(),
+ schema_store_.get(), {indexed_section_0},
+ fake_clock_.GetSystemTimeMilliseconds());
+
+ // The trimmed tree has null iterator but has target section.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocHitInfoIterator::TrimmedNode node,
+ std::move(*section_restrict_iterator).TrimRightMostNode());
+
+ EXPECT_THAT(node.iterator_, testing::IsNull());
+ EXPECT_THAT(node.term_, Eq("term"));
+ EXPECT_THAT(node.term_start_index_, Eq(10));
+ EXPECT_THAT(node.target_section_, Eq(indexed_section_0));
}
} // namespace
diff --git a/icing/index/iterator/doc-hit-info-iterator-term.cc b/icing/index/iterator/doc-hit-info-iterator-term.cc
deleted file mode 100644
index 97ca3c4..0000000
--- a/icing/index/iterator/doc-hit-info-iterator-term.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/index/iterator/doc-hit-info-iterator-term.h"
-
-#include <cstdint>
-
-#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/absl_ports/str_cat.h"
-#include "icing/index/hit/doc-hit-info.h"
-#include "icing/schema/section.h"
-#include "icing/util/status-macros.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-
-std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
- std::string mask(kMaxSectionId + 1, '0');
- for (SectionId i = kMaxSectionId; i >= 0; --i) {
- if (section_id_mask & (1U << i)) {
- mask[kMaxSectionId - i] = '1';
- }
- }
- return mask;
-}
-
-} // namespace
-
-libtextclassifier3::Status DocHitInfoIteratorTerm::Advance() {
- if (cached_hits_idx_ == -1) {
- ICING_RETURN_IF_ERROR(RetrieveMoreHits());
- } else {
- ++cached_hits_idx_;
- }
- if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) {
- // Nothing more for the iterator to return. Set these members to invalid
- // values.
- doc_hit_info_ = DocHitInfo();
- hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
- return absl_ports::ResourceExhaustedError(
- "No more DocHitInfos in iterator");
- }
- doc_hit_info_ = cached_hits_.at(cached_hits_idx_);
- hit_intersect_section_ids_mask_ = doc_hit_info_.hit_section_ids_mask();
- return libtextclassifier3::Status::OK;
-}
-
-libtextclassifier3::Status DocHitInfoIteratorTermExact::RetrieveMoreHits() {
- // Exact match only. All hits in lite lexicon are exact.
- ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->FindTerm(term_));
- ICING_ASSIGN_OR_RETURN(uint32_t term_id,
- term_id_codec_->EncodeTvi(tvi, TviType::LITE));
- lite_index_->AppendHits(term_id, section_restrict_mask_,
- /*only_from_prefix_sections=*/false, &cached_hits_);
- cached_hits_idx_ = 0;
- return libtextclassifier3::Status::OK;
-}
-
-std::string DocHitInfoIteratorTermExact::ToString() const {
- return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
- term_);
-}
-
-libtextclassifier3::Status DocHitInfoIteratorTermPrefix::RetrieveMoreHits() {
- // Take union of lite terms.
- int term_len = term_.length();
- int terms_matched = 0;
- for (LiteIndex::PrefixIterator it = lite_index_->FindTermPrefixes(term_);
- it.IsValid(); it.Advance()) {
- bool exact_match = strlen(it.GetKey()) == term_len;
- ICING_ASSIGN_OR_RETURN(
- uint32_t term_id,
- term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE));
- lite_index_->AppendHits(term_id, section_restrict_mask_,
- /*only_from_prefix_sections=*/!exact_match,
- &cached_hits_);
- ++terms_matched;
- }
- if (terms_matched > 1) {
- SortAndDedupeDocumentIds();
- }
- cached_hits_idx_ = 0;
- return libtextclassifier3::Status::OK;
-}
-
-void DocHitInfoIteratorTermPrefix::SortAndDedupeDocumentIds() {
- // Re-sort cached document_ids and merge sections.
- sort(cached_hits_.begin(), cached_hits_.end());
-
- int idx = 0;
- for (int i = 1; i < cached_hits_.size(); ++i) {
- const DocHitInfo& hit_info = cached_hits_.at(i);
- DocHitInfo& collapsed_hit_info = cached_hits_.at(idx);
- if (collapsed_hit_info.document_id() == hit_info.document_id()) {
- collapsed_hit_info.MergeSectionsFrom(hit_info);
- } else {
- // New document_id.
- cached_hits_.at(++idx) = hit_info;
- }
- }
- // idx points to last doc hit info.
- cached_hits_.resize(idx + 1);
-}
-
-std::string DocHitInfoIteratorTermPrefix::ToString() const {
- return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
- term_, "*");
-}
-
-} // namespace lib
-} // namespace icing
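
The SortAndDedupeDocumentIds() routine deleted above is the classic sort-then-collapse pattern: sort hits by document id, then merge adjacent entries that share an id. A minimal standalone sketch of the same pattern, using a simplified hit type and a plain uint64_t section mask instead of the icing types:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    struct SimpleHit {
      int32_t document_id;
      uint64_t section_mask;
      bool operator<(const SimpleHit& other) const {
        return document_id < other.document_id;
      }
    };

    // Sorts hits by document id, then collapses runs sharing a document id by
    // OR-ing their section masks together in place.
    void SortAndDedupe(std::vector<SimpleHit>& hits) {
      if (hits.empty()) return;
      std::sort(hits.begin(), hits.end());
      size_t idx = 0;
      for (size_t i = 1; i < hits.size(); ++i) {
        if (hits[idx].document_id == hits[i].document_id) {
          hits[idx].section_mask |= hits[i].section_mask;  // Merge sections.
        } else {
          hits[++idx] = hits[i];  // New document id; keep it.
        }
      }
      hits.resize(idx + 1);  // idx points at the last collapsed hit.
    }

The in-place collapse keeps the pass O(n log n) overall and allocation-free, which is why the original code resizes the vector rather than building a second one.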
diff --git a/icing/index/iterator/doc-hit-info-iterator-term.h b/icing/index/iterator/doc-hit-info-iterator-term.h
deleted file mode 100644
index 7d02fc2..0000000
--- a/icing/index/iterator/doc-hit-info-iterator-term.h
+++ /dev/null
@@ -1,108 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
-#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
-
-#include <cstdint>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/index/hit/doc-hit-info.h"
-#include "icing/index/iterator/doc-hit-info-iterator.h"
-#include "icing/index/lite-index.h"
-#include "icing/index/term-id-codec.h"
-#include "icing/schema/section.h"
-
-namespace icing {
-namespace lib {
-
-class DocHitInfoIteratorTerm : public DocHitInfoIterator {
- public:
- explicit DocHitInfoIteratorTerm(const TermIdCodec* term_id_codec,
- LiteIndex* lite_index, const std::string term,
- SectionIdMask section_restrict_mask)
- : term_(term),
- lite_index_(lite_index),
- cached_hits_idx_(-1),
- term_id_codec_(term_id_codec),
- num_advance_calls_(0),
- section_restrict_mask_(section_restrict_mask) {}
-
- libtextclassifier3::Status Advance() override;
-
- int32_t GetNumBlocksInspected() const override {
- // TODO(b/137862424): Implement this once the main index is added.
- return 0;
- }
- int32_t GetNumLeafAdvanceCalls() const override { return num_advance_calls_; }
-
- protected:
- // Add DocHitInfos corresponding to term_ to cached_hits_.
- virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
-
- const std::string term_;
- LiteIndex* const lite_index_;
- // Stores hits retrieved from the index. This may only be a subset of the hits
- // that are present in the index. Current value pointed to by the Iterator is
- // tracked by cached_hits_idx_.
- std::vector<DocHitInfo> cached_hits_;
- int cached_hits_idx_;
- const TermIdCodec* term_id_codec_;
- int num_advance_calls_;
- // Mask indicating which sections hits should be considered for.
- // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
- const SectionIdMask section_restrict_mask_;
-};
-
-class DocHitInfoIteratorTermExact : public DocHitInfoIteratorTerm {
- public:
- explicit DocHitInfoIteratorTermExact(const TermIdCodec* term_id_codec,
- LiteIndex* lite_index,
- const std::string& term,
- SectionIdMask section_id_mask)
- : DocHitInfoIteratorTerm(term_id_codec, lite_index, term,
- section_id_mask) {}
-
- std::string ToString() const override;
-
- protected:
- libtextclassifier3::Status RetrieveMoreHits() override;
-};
-
-class DocHitInfoIteratorTermPrefix : public DocHitInfoIteratorTerm {
- public:
- explicit DocHitInfoIteratorTermPrefix(const TermIdCodec* term_id_codec,
- LiteIndex* lite_index,
- const std::string& term,
- SectionIdMask section_id_mask)
- : DocHitInfoIteratorTerm(term_id_codec, lite_index, term,
- section_id_mask) {}
-
- std::string ToString() const override;
-
- protected:
- libtextclassifier3::Status RetrieveMoreHits() override;
-
- private:
-  // After retrieving DocHitInfos from the index, there may be a DocHitInfo
-  // for docid 1 and "foo" and a DocHitInfo for docid 1 and "fool". These
-  // DocHitInfos should be merged.
- void SortAndDedupeDocumentIds();
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_H_
diff --git a/icing/index/iterator/doc-hit-info-iterator-test-util.h b/icing/index/iterator/doc-hit-info-iterator-test-util.h
index c4d7aa7..c75fb33 100644
--- a/icing/index/iterator/doc-hit-info-iterator-test-util.h
+++ b/icing/index/iterator/doc-hit-info-iterator-test-util.h
@@ -15,7 +15,7 @@
#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TEST_UTIL_H_
#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TEST_UTIL_H_
-#include <cstdint>
+#include <cinttypes>
#include <string>
#include <utility>
#include <vector>
@@ -32,21 +32,69 @@
namespace icing {
namespace lib {
+class DocHitInfoTermFrequencyPair {
+ public:
+ DocHitInfoTermFrequencyPair(
+ const DocHitInfo& doc_hit_info,
+ const Hit::TermFrequencyArray& hit_term_frequency = {})
+ : doc_hit_info_(doc_hit_info), hit_term_frequency_(hit_term_frequency) {}
+
+ void UpdateSection(SectionId section_id,
+ Hit::TermFrequency hit_term_frequency) {
+ doc_hit_info_.UpdateSection(section_id);
+ hit_term_frequency_[section_id] = hit_term_frequency;
+ }
+
+ void MergeSectionsFrom(const DocHitInfoTermFrequencyPair& other) {
+ SectionIdMask other_mask = other.doc_hit_info_.hit_section_ids_mask();
+ doc_hit_info_.MergeSectionsFrom(other_mask);
+ while (other_mask) {
+ SectionId section_id = __builtin_ctzll(other_mask);
+ hit_term_frequency_[section_id] = other.hit_term_frequency_[section_id];
+ other_mask &= ~(UINT64_C(1) << section_id);
+ }
+ }
+
+ DocHitInfo doc_hit_info() const { return doc_hit_info_; }
+
+ Hit::TermFrequency hit_term_frequency(SectionId section_id) const {
+ return hit_term_frequency_[section_id];
+ }
+
+ private:
+ DocHitInfo doc_hit_info_;
+ Hit::TermFrequencyArray hit_term_frequency_;
+};
+
 // Dummy class to help with testing. It starts with a kInvalidDocumentId doc
 // hit info until Advance() is called (like normal DocHitInfoIterators). It
 // will then proceed to return the doc_hit_infos in order as Advance() is
 // called. After all doc_hit_infos are returned, Advance() will return a
 // RESOURCE_EXHAUSTED error (also like normal DocHitInfoIterators).
-class DocHitInfoIteratorDummy : public DocHitInfoIterator {
+class DocHitInfoIteratorDummy : public DocHitInfoLeafIterator {
public:
DocHitInfoIteratorDummy() = default;
- explicit DocHitInfoIteratorDummy(std::vector<DocHitInfo> doc_hit_infos)
- : doc_hit_infos_(std::move(doc_hit_infos)) {}
+ explicit DocHitInfoIteratorDummy(
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos,
+ std::string term = "")
+ : doc_hit_infos_(std::move(doc_hit_infos)), term_(std::move(term)) {}
+
+ explicit DocHitInfoIteratorDummy(const std::vector<DocHitInfo>& doc_hit_infos,
+ std::string term = "",
+ int term_start_index = 0,
+ int unnormalized_term_length = 0)
+ : term_(std::move(term)),
+ term_start_index_(term_start_index),
+ unnormalized_term_length_(unnormalized_term_length) {
+ for (auto& doc_hit_info : doc_hit_infos) {
+ doc_hit_infos_.push_back(DocHitInfoTermFrequencyPair(doc_hit_info));
+ }
+ }
libtextclassifier3::Status Advance() override {
+ ++index_;
if (index_ < doc_hit_infos_.size()) {
- doc_hit_info_ = doc_hit_infos_.at(index_);
- index_++;
+ doc_hit_info_ = doc_hit_infos_.at(index_).doc_hit_info();
return libtextclassifier3::Status::OK;
}
@@ -54,43 +102,74 @@ class DocHitInfoIteratorDummy : public DocHitInfoIterator {
"No more DocHitInfos in iterator");
}
- void set_hit_intersect_section_ids_mask(
- SectionIdMask hit_intersect_section_ids_mask) {
- hit_intersect_section_ids_mask_ = hit_intersect_section_ids_mask;
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override {
+ DocHitInfoIterator::TrimmedNode node = {nullptr, term_, term_start_index_,
+ unnormalized_term_length_};
+ return node;
}
- int32_t GetNumBlocksInspected() const override {
- return num_blocks_inspected_;
+ // Imitates behavior of DocHitInfoIteratorTermMain/DocHitInfoIteratorTermLite
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
+ if (index_ == -1 || index_ >= doc_hit_infos_.size()) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ SectionIdMask section_mask =
+ doc_hit_info_.hit_section_ids_mask() & filtering_section_mask;
+ SectionIdMask section_mask_copy = section_mask;
+ std::array<Hit::TermFrequency, kTotalNumSections> section_term_frequencies =
+ {Hit::kNoTermFrequency};
+ while (section_mask_copy) {
+ SectionId section_id = __builtin_ctzll(section_mask_copy);
+ section_term_frequencies.at(section_id) =
+ doc_hit_infos_.at(index_).hit_term_frequency(section_id);
+ section_mask_copy &= ~(UINT64_C(1) << section_id);
+ }
+ TermMatchInfo term_stats(term_, section_mask,
+ std::move(section_term_frequencies));
+
+ for (auto& cur_term_stats : *matched_terms_stats) {
+ if (cur_term_stats.term == term_stats.term) {
+        // Same docId and same term: we don't need to add the term again, and
+        // the term frequency should always be the same.
+ return;
+ }
+ }
+ matched_terms_stats->push_back(term_stats);
}
- void SetNumBlocksInspected(int32_t num_blocks_inspected) {
- num_blocks_inspected_ = num_blocks_inspected;
+ void set_hit_section_ids_mask(SectionIdMask hit_section_ids_mask) {
+ doc_hit_info_.set_hit_section_ids_mask(hit_section_ids_mask);
}
- int32_t GetNumLeafAdvanceCalls() const override {
- return num_leaf_advance_calls_;
- }
+ CallStats GetCallStats() const override { return call_stats_; }
- void SetNumLeafAdvanceCalls(int32_t num_leaf_advance_calls) {
- num_leaf_advance_calls_ = num_leaf_advance_calls;
+ void SetCallStats(CallStats call_stats) {
+ call_stats_ = std::move(call_stats);
}
std::string ToString() const override {
std::string ret = "<";
- for (auto& doc_hit_info : doc_hit_infos_) {
- absl_ports::StrAppend(&ret, IcingStringUtil::StringPrintf(
- "[%d,%d]", doc_hit_info.document_id(),
- doc_hit_info.hit_section_ids_mask()));
+ for (auto& doc_hit_info_pair : doc_hit_infos_) {
+ absl_ports::StrAppend(
+ &ret, IcingStringUtil::StringPrintf(
+ "[%d,%" PRIu64 "]",
+ doc_hit_info_pair.doc_hit_info().document_id(),
+ doc_hit_info_pair.doc_hit_info().hit_section_ids_mask()));
}
absl_ports::StrAppend(&ret, ">");
return ret;
}
private:
- int32_t index_ = 0;
- int32_t num_blocks_inspected_ = 0;
- int32_t num_leaf_advance_calls_ = 0;
- std::vector<DocHitInfo> doc_hit_infos_;
+ int32_t index_ = -1;
+ CallStats call_stats_;
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos_;
+ std::string term_;
+ int term_start_index_;
+ int unnormalized_term_length_;
};
inline std::vector<DocumentId> GetDocumentIds(DocHitInfoIterator* iterator) {
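
For orientation, a test typically seeds this dummy with canned hits and pulls them back through Advance(). A rough sketch against the interfaces above (DocHitInfo, DocumentId, and the dummy come from the surrounding icing headers; the commented argument names are illustrative):

    // Sketch only: assumes the icing test-util headers above are available.
    std::vector<DocHitInfo> infos = {DocHitInfo(/*document_id=*/2),
                                     DocHitInfo(/*document_id=*/1)};
    DocHitInfoIteratorDummy itr(infos, /*term=*/"foo");
    while (itr.Advance().ok()) {
      // Hits come back in the order supplied; real iterators return them in
      // decreasing document id order, so tests usually seed them that way.
      DocumentId id = itr.doc_hit_info().document_id();
      (void)id;  // A real test would assert on id here.
    }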
diff --git a/icing/index/iterator/doc-hit-info-iterator.h b/icing/index/iterator/doc-hit-info-iterator.h
index bcc2b6e..728f957 100644
--- a/icing/index/iterator/doc-hit-info-iterator.h
+++ b/icing/index/iterator/doc-hit-info-iterator.h
@@ -15,8 +15,14 @@
#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_H_
+#include <array>
#include <cstdint>
+#include <functional>
+#include <memory>
#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
@@ -28,10 +34,29 @@
namespace icing {
namespace lib {
+// Data structure that maps a single matched query term to its section mask
+// and the list of term frequencies.
+// TODO(b/158603837): add stat on whether the matched terms are prefix matched
+// or not. This information will be used to boost exact match.
+struct TermMatchInfo {
+ std::string_view term;
+ // SectionIdMask associated to the term.
+ SectionIdMask section_ids_mask;
+  // Array with fixed size kTotalNumSections. For every section id, i.e.
+  // array index, it stores the term frequency of the term.
+ std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies;
+
+ explicit TermMatchInfo(
+ std::string_view term, SectionIdMask section_ids_mask,
+ std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies)
+ : term(term),
+ section_ids_mask(section_ids_mask),
+ term_frequencies(std::move(term_frequencies)) {}
+};
+
// Iterator over DocHitInfos (collapsed Hits) in REVERSE document_id order.
//
-// NOTE: You must call Advance() before calling hit_info() or
-// hit_intersect_section_ids_mask().
+// NOTE: You must call Advance() before calling hit_info().
//
// Example:
// DocHitInfoIterator itr = GetIterator(...);
@@ -40,10 +65,161 @@ namespace lib {
// }
class DocHitInfoIterator {
public:
+ using ChildrenMapper = std::function<std::unique_ptr<DocHitInfoIterator>(
+ std::unique_ptr<DocHitInfoIterator>)>;
+
+  // CallStats is a wrapper class for all stats collected across all levels
+  // of the DocHitInfoIterator tree. Internal nodes aggregate the numbers from
+  // all of their leaf nodes, while leaf nodes report their own actual
+  // numbers.
+ struct CallStats {
+ // The number of times Advance() was called on the leaf node for term lite
+ // index.
+ // - Leaf nodes:
+ // - DocHitInfoIteratorTermLite should maintain and set it correctly.
+    //   - Others should set it to 0.
+ // - Internal nodes: should aggregate values from all children.
+ int32_t num_leaf_advance_calls_lite_index;
+
+ // The number of times Advance() was called on the leaf node for term main
+ // index.
+ // - Leaf nodes:
+ // - DocHitInfoIteratorTermMain should maintain and set it correctly.
+    //   - Others should set it to 0.
+ // - Internal nodes: should aggregate values from all children.
+ int32_t num_leaf_advance_calls_main_index;
+
+ // The number of times Advance() was called on the leaf node for integer
+ // index.
+ // - Leaf nodes:
+ // - DocHitInfoIteratorNumeric should maintain and set it correctly.
+    //   - Others should set it to 0.
+ // - Internal nodes: should aggregate values from all children.
+ int32_t num_leaf_advance_calls_integer_index;
+
+ // The number of times Advance() was called on the leaf node without reading
+    // any hits from the index. Usually this field is specific to
+    // DocHitInfoIteratorAllDocumentId.
+ // - Leaf nodes:
+ // - DocHitInfoIteratorAllDocumentId should maintain and set it correctly.
+    //   - Others should set it to 0.
+ // - Internal nodes: should aggregate values from all children.
+ int32_t num_leaf_advance_calls_no_index;
+
+ // The number of flash index blocks that have been read as a result of
+ // operations on this object.
+ // - Leaf nodes: should maintain and set it correctly for all child classes
+ // involving flash index block access.
+ // - Internal nodes: should aggregate values from all children.
+ int32_t num_blocks_inspected;
+
+ explicit CallStats()
+ : CallStats(/*num_leaf_advance_calls_lite_index_in=*/0,
+ /*num_leaf_advance_calls_main_index_in=*/0,
+ /*num_leaf_advance_calls_integer_index_in=*/0,
+ /*num_leaf_advance_calls_no_index_in=*/0,
+ /*num_blocks_inspected_in=*/0) {}
+
+ explicit CallStats(int32_t num_leaf_advance_calls_lite_index_in,
+ int32_t num_leaf_advance_calls_main_index_in,
+ int32_t num_leaf_advance_calls_integer_index_in,
+ int32_t num_leaf_advance_calls_no_index_in,
+ int32_t num_blocks_inspected_in)
+ : num_leaf_advance_calls_lite_index(
+ num_leaf_advance_calls_lite_index_in),
+ num_leaf_advance_calls_main_index(
+ num_leaf_advance_calls_main_index_in),
+ num_leaf_advance_calls_integer_index(
+ num_leaf_advance_calls_integer_index_in),
+ num_leaf_advance_calls_no_index(num_leaf_advance_calls_no_index_in),
+ num_blocks_inspected(num_blocks_inspected_in) {}
+
+ int32_t num_leaf_advance_calls() const {
+ return num_leaf_advance_calls_lite_index +
+ num_leaf_advance_calls_main_index +
+ num_leaf_advance_calls_integer_index +
+ num_leaf_advance_calls_no_index;
+ }
+
+ bool operator==(const CallStats& other) const {
+ return num_leaf_advance_calls_lite_index ==
+ other.num_leaf_advance_calls_lite_index &&
+ num_leaf_advance_calls_main_index ==
+ other.num_leaf_advance_calls_main_index &&
+ num_leaf_advance_calls_integer_index ==
+ other.num_leaf_advance_calls_integer_index &&
+ num_leaf_advance_calls_no_index ==
+ other.num_leaf_advance_calls_no_index &&
+ num_blocks_inspected == other.num_blocks_inspected;
+ }
+
+ CallStats operator+(const CallStats& other) const {
+ return CallStats(num_leaf_advance_calls_lite_index +
+ other.num_leaf_advance_calls_lite_index,
+ num_leaf_advance_calls_main_index +
+ other.num_leaf_advance_calls_main_index,
+ num_leaf_advance_calls_integer_index +
+ other.num_leaf_advance_calls_integer_index,
+ num_leaf_advance_calls_no_index +
+ other.num_leaf_advance_calls_no_index,
+ num_blocks_inspected + other.num_blocks_inspected);
+ }
+
+ CallStats& operator+=(const CallStats& other) {
+ *this = *this + other;
+ return *this;
+ }
+ };
+
+ struct TrimmedNode {
+    // Iterator over the query results; suggestions should only be searched
+    // for within these documents.
+    std::unique_ptr<DocHitInfoIterator> iterator_;
+    // Term of the trimmed node, from which suggested strings are generated.
+    std::string term_;
+    // The string in the query that indicates the target section in which to
+    // search for suggestions.
+    std::string target_section_;
+    // The start index of the current term in the given search query.
+    int term_start_index_;
+    // The length of the given unnormalized term in the search query.
+    int unnormalized_term_length_;
+
+ TrimmedNode(std::unique_ptr<DocHitInfoIterator> iterator, std::string term,
+ int term_start_index, int unnormalized_term_length)
+ : iterator_(std::move(iterator)),
+ term_(term),
+ target_section_(""),
+ term_start_index_(term_start_index),
+ unnormalized_term_length_(unnormalized_term_length) {}
+ };
+
+  // Trims the right-most node of the iterator tree.
+  // This is to support search suggestions for the last term, which is the
+  // right-most node of the root iterator tree. Trimming the right-most node
+  // is only supported on the AND, AND_NARY, OR, OR_NARY, OR_LEAF, Filter, and
+  // property-in-schema-check iterators.
+  //
+  // After calling this method, this iterator is no longer usable. Please use
+  // the returned iterator.
+  // Returns:
+  //   the new iterator without the right-most child, if it was able to trim
+  //   the right-most node.
+  //   nullptr if the current iterator itself should be trimmed.
+  //   INVALID_ARGUMENT if the right-most node is not supposed to be trimmed.
+ virtual libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && = 0;
+
+ // Map all direct children of this iterator according to the passed mapper.
+ virtual void MapChildren(const ChildrenMapper& mapper) = 0;
+
+ virtual bool is_leaf() { return false; }
+
virtual ~DocHitInfoIterator() = default;
// Returns:
// OK if was able to advance to a new document_id.
+  //   INVALID_ARGUMENT if there are fewer than 2 iterators for an AND/OR
+  //   iterator
   //   RESOURCE_EXHAUSTED if we've run out of document_ids to iterate over
virtual libtextclassifier3::Status Advance() = 0;
@@ -52,27 +228,25 @@ class DocHitInfoIterator {
// construction or if Advance returned an error.
const DocHitInfo& doc_hit_info() const { return doc_hit_info_; }
- // SectionIdMask representing which sections (if any) have matched *ALL* query
- // terms for the current document_id.
- SectionIdMask hit_intersect_section_ids_mask() const {
- return hit_intersect_section_ids_mask_;
- }
-
- // Gets the number of flash index blocks that have been read as a
- // result of operations on this object.
- virtual int32_t GetNumBlocksInspected() const = 0;
-
- // HitIterators may be constructed into trees. Internal nodes will return the
- // sum of the number of Advance() calls to all leaf nodes. Leaf nodes will
- // return the number of times Advance() was called on it.
- virtual int32_t GetNumLeafAdvanceCalls() const = 0;
+ // Returns CallStats of the DocHitInfoIterator tree.
+ virtual CallStats GetCallStats() const = 0;
// A string representing the iterator.
virtual std::string ToString() const = 0;
+  // For the last hit docid, retrieves all the matched query terms and other
+  // stats; see TermMatchInfo.
+  // filtering_section_mask filters the matching sections and should be set
+  // only by DocHitInfoIteratorSectionRestrict.
+  // If Advance() wasn't called after construction, if Advance() returned an
+  // error, or if the concrete HitIterator didn't override this method, the
+  // vector isn't populated.
+ virtual void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const {}
+
protected:
DocHitInfo doc_hit_info_;
- SectionIdMask hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
// Helper function to advance the given iterator to at most the given
// document_id.
@@ -87,11 +261,20 @@ class DocHitInfoIterator {
// Didn't find anything for the other iterator, reset to invalid values and
// return.
doc_hit_info_ = DocHitInfo(kInvalidDocumentId);
- hit_intersect_section_ids_mask_ = kSectionIdMaskNone;
return absl_ports::ResourceExhaustedError(
"No more DocHitInfos in iterator");
}
-}; // namespace DocHitInfoIterator
+};
+
+// A leaf node is a term node or a chain of section-restriction nodes applied
+// to a term node.
+class DocHitInfoLeafIterator : public DocHitInfoIterator {
+ public:
+ bool is_leaf() override { return true; }
+
+  // Calling MapChildren on a leaf node does not make sense and will do
+  // nothing.
+ void MapChildren(const ChildrenMapper& mapper) override {}
+};
} // namespace lib
} // namespace icing
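
To make the leaf/internal aggregation contract of GetCallStats() concrete, here is a small standalone sketch (a cut-down two-field stats struct and a generic tree node, not the real CallStats or iterator classes):

    #include <cstdint>
    #include <memory>
    #include <vector>

    struct Stats {
      int32_t num_leaf_advance_calls = 0;
      int32_t num_blocks_inspected = 0;
      Stats operator+(const Stats& other) const {
        return {num_leaf_advance_calls + other.num_leaf_advance_calls,
                num_blocks_inspected + other.num_blocks_inspected};
      }
    };

    struct Node {
      Stats own;  // Leaf nodes record real numbers; internal nodes leave 0.
      std::vector<std::unique_ptr<Node>> children;

      Stats GetStats() const {
        Stats total = own;
        for (const std::unique_ptr<Node>& child : children) {
          total = total + child->GetStats();  // Sum over the whole subtree.
        }
        return total;
      }
    };

Defining operator+ on the stats struct, as CallStats does, is what lets internal nodes aggregate without caring which individual counters exist.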
diff --git a/icing/index/iterator/doc-hit-info-iterator_benchmark.cc b/icing/index/iterator/doc-hit-info-iterator_benchmark.cc
index 90e4888..993c3b8 100644
--- a/icing/index/iterator/doc-hit-info-iterator_benchmark.cc
+++ b/icing/index/iterator/doc-hit-info-iterator_benchmark.cc
@@ -14,15 +14,15 @@
#include <vector>
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/iterator/doc-hit-info-iterator-and.h"
#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
-#include "testing/base/public/benchmark.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
namespace icing {
namespace lib {
@@ -35,7 +35,7 @@ namespace {
//
// $
// blaze-bin/icing/index/iterator/doc-hit-info-iterator_benchmark
-// --benchmarks=all
+// --benchmark_filter=all
//
// Run on an Android device:
// $ blaze build --config=android_arm64 -c opt --dynamic_mode=off
@@ -47,7 +47,7 @@ namespace {
// /data/local/tmp/
//
// $ adb shell /data/local/tmp/doc-hit-info-iterator_benchmark
-// --benchmarks=all
+// --benchmark_filter=all
// Functor to be used with std::generate to create a container of DocHitInfos.
// DocHitInfos are generated starting at docid starting_docid and continuing at
diff --git a/icing/index/iterator/section-restrict-data.cc b/icing/index/iterator/section-restrict-data.cc
new file mode 100644
index 0000000..085437d
--- /dev/null
+++ b/icing/index/iterator/section-restrict-data.cc
@@ -0,0 +1,82 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/iterator/section-restrict-data.h"
+
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+SectionIdMask SectionRestrictData::GenerateSectionMask(
+ const std::string& schema_type,
+ const std::set<std::string>& target_sections) const {
+ SectionIdMask section_mask = kSectionIdMaskNone;
+ auto section_metadata_list = schema_store_.GetSectionMetadata(schema_type);
+ if (!section_metadata_list.ok()) {
+ // The current schema doesn't have section metadata.
+ return kSectionIdMaskNone;
+ }
+ for (const SectionMetadata& section_metadata :
+ *section_metadata_list.ValueOrDie()) {
+ if (target_sections.find(section_metadata.path) != target_sections.end()) {
+ section_mask |= UINT64_C(1) << section_metadata.id;
+ }
+ }
+ return section_mask;
+}
+
+SectionIdMask SectionRestrictData::ComputeAllowedSectionsMask(
+ const std::string& schema_type) {
+ if (const auto type_property_mask_itr =
+ type_property_masks_.find(schema_type);
+ type_property_mask_itr != type_property_masks_.end()) {
+ return type_property_mask_itr->second;
+ }
+
+  // The section id mask of schema_type has not been calculated before, so
+  // calculate it here and cache it in type_property_masks_.
+ // - If type property filters of schema_type or wildcard (*) are
+ // specified, then create a mask according to the filters.
+ // - Otherwise, create a mask to match all properties.
+ SectionIdMask new_section_id_mask = kSectionIdMaskAll;
+ if (const auto itr = type_property_filters_.find(schema_type);
+ itr != type_property_filters_.end()) {
+ // Property filters defined for given schema type
+ new_section_id_mask = GenerateSectionMask(schema_type, itr->second);
+ } else if (const auto wildcard_itr = type_property_filters_.find(
+ std::string(SchemaStore::kSchemaTypeWildcard));
+ wildcard_itr != type_property_filters_.end()) {
+ // Property filters defined for wildcard entry
+ new_section_id_mask =
+ GenerateSectionMask(schema_type, wildcard_itr->second);
+ } else {
+ // Do not cache the section mask if no property filters apply to this schema
+ // type to avoid taking up unnecessary space.
+ return kSectionIdMaskAll;
+ }
+
+ type_property_masks_[schema_type] = new_section_id_mask;
+ return new_section_id_mask;
+}
+
+} // namespace lib
+} // namespace icing
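
The mask built by GenerateSectionMask() is a bitwise OR over the ids of sections whose paths appear in the filter. A standalone sketch with simplified stand-in types (SimpleSectionMetadata is illustrative, not the icing struct):

    #include <cstdint>
    #include <set>
    #include <string>
    #include <vector>

    using Mask = uint64_t;

    struct SimpleSectionMetadata {
      std::string path;
      int id;  // Bit position in the mask.
    };

    Mask GenerateMask(const std::vector<SimpleSectionMetadata>& sections,
                      const std::set<std::string>& targets) {
      Mask mask = 0;
      for (const SimpleSectionMetadata& section : sections) {
        if (targets.count(section.path) > 0) {
          mask |= UINT64_C(1) << section.id;  // Set this section's bit.
        }
      }
      return mask;
    }

    // Example: sections {"subject", id 0} and {"body", id 3} with targets
    // {"body"} yield mask 0b1000.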
diff --git a/icing/index/iterator/section-restrict-data.h b/icing/index/iterator/section-restrict-data.h
new file mode 100644
index 0000000..26ca597
--- /dev/null
+++ b/icing/index/iterator/section-restrict-data.h
@@ -0,0 +1,98 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_SECTION_RESTRICT_DATA_H_
+#define ICING_INDEX_ITERATOR_SECTION_RESTRICT_DATA_H_
+
+#include <cstdint>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+class SectionRestrictData {
+ public:
+ // Does not take any ownership, and all pointers must refer to valid objects
+ // that outlive the one constructed.
+ SectionRestrictData(const DocumentStore* document_store,
+ const SchemaStore* schema_store, int64_t current_time_ms,
+ std::unordered_map<std::string, std::set<std::string>>
+ type_property_filters)
+ : document_store_(*document_store),
+ schema_store_(*schema_store),
+ current_time_ms_(current_time_ms),
+ type_property_filters_(std::move(type_property_filters)) {}
+
+  // Calculates the section mask of allowed sections (determined by the
+  // property filters map) for the given schema type and caches the result
+  // for any future calls.
+ //
+ // Returns:
+ // - If type_property_filters_ has an entry for the given schema type or
+  //   - If type_property_filters_ has an entry for the given schema type or
+  //     the wildcard (*), return a bitwise OR of the section IDs in the
+  //     schema type that are also present in the relevant filter list.
+ SectionIdMask ComputeAllowedSectionsMask(const std::string& schema_type);
+
+ const DocumentStore& document_store() const { return document_store_; }
+
+ const SchemaStore& schema_store() const { return schema_store_; }
+
+ int64_t current_time_ms() const { return current_time_ms_; }
+
+ const std::unordered_map<std::string, std::set<std::string>>&
+ type_property_filters() const {
+ return type_property_filters_;
+ }
+
+ private:
+ const DocumentStore& document_store_;
+ const SchemaStore& schema_store_;
+ int64_t current_time_ms_;
+
+  // Map of property filters per schema type. Supports a wildcard (*) entry
+  // that applies to all schema types not otherwise specifically listed in
+  // the mapping.
+ std::unordered_map<std::string, std::set<std::string>> type_property_filters_;
+ // Mapping of schema type to the section mask of allowed sections for that
+ // schema type. This section mask is lazily calculated based on the
+ // specified property filters and cached for any future use.
+ std::unordered_map<std::string, SectionIdMask> type_property_masks_;
+
+ // Generates a section mask for the given schema type and the target
+ // sections.
+ //
+ // Returns:
+  //   - A bitwise OR of the section IDs in the schema_type that are also
+  //     present in the target_sections list.
+  //   - kSectionIdMaskNone if none of the sections in the schema_type are
+  //     present in the target_sections list.
+ SectionIdMask GenerateSectionMask(
+ const std::string& schema_type,
+ const std::set<std::string>& target_sections) const;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_SECTION_RESTRICT_DATA_H_
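
As a usage sketch, a caller builds the filter map once per query and then relies on the lazily cached masks (document_store, schema_store, and current_time_ms are assumed to be supplied by the caller's environment; the schema type and property names are made up):

    // Restrict "Email" documents to their "subject" property; schema types
    // without a filter entry keep kSectionIdMaskAll.
    std::unordered_map<std::string, std::set<std::string>> filters = {
        {"Email", {"subject"}}};
    SectionRestrictData data(document_store, schema_store, current_time_ms,
                             std::move(filters));
    // The first call computes and caches the mask; repeats are map lookups.
    SectionIdMask email_mask = data.ComputeAllowedSectionsMask("Email");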
diff --git a/icing/index/lite-index.cc b/icing/index/lite-index.cc
deleted file mode 100644
index 489c53d..0000000
--- a/icing/index/lite-index.cc
+++ /dev/null
@@ -1,457 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/index/lite-index.h"
-
-#include <inttypes.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <sys/mman.h>
-
-#include <algorithm>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <string_view>
-#include <utility>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/absl_ports/str_cat.h"
-#include "icing/file/filesystem.h"
-#include "icing/index/hit/doc-hit-info.h"
-#include "icing/index/hit/hit.h"
-#include "icing/index/term-property-id.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/legacy/core/icing-timer.h"
-#include "icing/legacy/index/icing-array-storage.h"
-#include "icing/legacy/index/icing-dynamic-trie.h"
-#include "icing/legacy/index/icing-filesystem.h"
-#include "icing/legacy/index/icing-lite-index-header.h"
-#include "icing/legacy/index/icing-mmapper.h"
-#include "icing/proto/term.pb.h"
-#include "icing/schema/section.h"
-#include "icing/store/document-id.h"
-#include "icing/util/crc32.h"
-#include "icing/util/logging.h"
-#include "icing/util/status-macros.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-
-// Point at which we declare the trie full.
-constexpr double kTrieFullFraction = 0.95;
-
-std::string MakeHitBufferFilename(const std::string& filename_base) {
- return filename_base + "hb";
-}
-
-size_t header_size() { return sizeof(IcingLiteIndex_HeaderImpl::HeaderData); }
-
-} // namespace
-
-const LiteIndex::Element::Value LiteIndex::Element::kInvalidValue =
- LiteIndex::Element(0, Hit()).value();
-
-libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create(
- const LiteIndex::Options& options, const IcingFilesystem* filesystem) {
- ICING_RETURN_ERROR_IF_NULL(filesystem);
-
- std::unique_ptr<LiteIndex> lite_index =
- std::unique_ptr<LiteIndex>(new LiteIndex(options, filesystem));
- ICING_RETURN_IF_ERROR(lite_index->Initialize());
- return std::move(lite_index);
-}
-
-// size is max size in elements. An appropriate lexicon and display
-// mapping size will be chosen based on hit buffer size.
-LiteIndex::LiteIndex(const LiteIndex::Options& options,
- const IcingFilesystem* filesystem)
- : hit_buffer_(*filesystem),
- hit_buffer_crc_(0),
- lexicon_(options.filename_base + "lexicon", MakeTrieRuntimeOptions(),
- filesystem),
- header_mmap_(false, MAP_SHARED),
- options_(options),
- filesystem_(filesystem) {}
-
-LiteIndex::~LiteIndex() {
- if (initialized()) {
- libtextclassifier3::Status unused = PersistToDisk();
- }
-}
-
-IcingDynamicTrie::RuntimeOptions LiteIndex::MakeTrieRuntimeOptions() {
- return IcingDynamicTrie::RuntimeOptions().set_storage_policy(
- IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc);
-}
-
-libtextclassifier3::Status LiteIndex::Initialize() {
- // Size of hit buffer's header struct, rounded up to the nearest number of
- // system memory pages.
- const size_t header_padded_size =
- IcingMMapper::page_aligned_size(header_size());
-
- // Variable declarations cannot cross goto jumps, so declare these up top.
- libtextclassifier3::Status status;
- uint64_t file_size;
- IcingTimer timer;
-
- if (!lexicon_.CreateIfNotExist(options_.lexicon_options) ||
- !lexicon_.Init()) {
- return absl_ports::InternalError("Failed to initialize lexicon trie");
- }
-
- hit_buffer_fd_.reset(filesystem_->OpenForWrite(
- MakeHitBufferFilename(options_.filename_base).c_str()));
- if (!hit_buffer_fd_.is_valid()) {
- status = absl_ports::InternalError("Failed to open hit buffer file");
- goto error;
- }
-
- file_size = filesystem_->GetFileSize(hit_buffer_fd_.get());
- if (file_size == IcingFilesystem::kBadFileSize) {
- status = absl_ports::InternalError("Failed to query hit buffer file size");
- goto error;
- }
-
- if (file_size < header_padded_size) {
- if (file_size != 0) {
- status = absl_ports::InternalError(IcingStringUtil::StringPrintf(
- "Hit buffer had unexpected size %" PRIu64, file_size));
- goto error;
- }
-
- ICING_VLOG(2) << "Creating new hit buffer";
- // Make sure files are fresh.
- if (!lexicon_.Remove() ||
- !lexicon_.CreateIfNotExist(options_.lexicon_options) ||
- !lexicon_.Init()) {
- status =
- absl_ports::InternalError("Failed to refresh lexicon during clear");
- goto error;
- }
-
- // Create fresh hit buffer by first emptying the hit buffer file and then
- // allocating header_padded_size of the cleared space.
- if (!filesystem_->Truncate(hit_buffer_fd_.get(), 0) ||
- !filesystem_->Truncate(hit_buffer_fd_.get(), header_padded_size)) {
- status = absl_ports::InternalError("Failed to truncate hit buffer file");
- goto error;
- }
-
- // Set up header.
- header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
- header_ = std::make_unique<IcingLiteIndex_HeaderImpl>(
- reinterpret_cast<IcingLiteIndex_HeaderImpl::HeaderData*>(
- header_mmap_.address()));
- header_->Reset();
-
- if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
- sizeof(Element::Value), header_->cur_size(),
- options_.hit_buffer_size, &hit_buffer_crc_, true)) {
- status = absl_ports::InternalError("Failed to initialize new hit buffer");
- goto error;
- }
-
- UpdateChecksum();
- } else {
- header_mmap_.Remap(hit_buffer_fd_.get(), 0, header_size());
- header_ = std::make_unique<IcingLiteIndex_HeaderImpl>(
- reinterpret_cast<IcingLiteIndex_HeaderImpl::HeaderData*>(
- header_mmap_.address()));
-
- if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
- sizeof(Element::Value), header_->cur_size(),
- options_.hit_buffer_size, &hit_buffer_crc_, true)) {
- status = absl_ports::InternalError(
- "Failed to re-initialize existing hit buffer");
- goto error;
- }
-
- // Check integrity.
- if (!header_->check_magic()) {
- status = absl_ports::InternalError("Lite index header magic mismatch");
- goto error;
- }
- Crc32 crc = ComputeChecksum();
- if (crc.Get() != header_->lite_index_crc()) {
- status = absl_ports::DataLossError(
- IcingStringUtil::StringPrintf("Lite index crc check failed: %u vs %u",
- crc.Get(), header_->lite_index_crc()));
- goto error;
- }
- }
-
- ICING_VLOG(2) << IcingStringUtil::StringPrintf("Lite index init ok in %.3fms",
- timer.Elapsed() * 1000);
- return status;
-
-error:
- header_ = nullptr;
- header_mmap_.Unmap();
- lexicon_.Close();
- hit_buffer_crc_ = 0;
- hit_buffer_.Reset();
- hit_buffer_fd_.reset();
- if (status.ok()) {
- return absl_ports::InternalError(
- "Error handling code ran but status was ok");
- }
- return status;
-}
-
-Crc32 LiteIndex::ComputeChecksum() {
- IcingTimer timer;
-
- // Update crcs.
- uint32_t dependent_crcs[2];
- hit_buffer_.UpdateCrc();
- dependent_crcs[0] = hit_buffer_crc_;
- dependent_crcs[1] = lexicon_.UpdateCrc();
-
- // Compute the master crc.
-
- // Header crc, excluding the actual crc field.
- Crc32 all_crc(header_->CalculateHeaderCrc());
- all_crc.Append(std::string_view(reinterpret_cast<const char*>(dependent_crcs),
- sizeof(dependent_crcs)));
- ICING_VLOG(2) << IcingStringUtil::StringPrintf(
- "Lite index crc computed in %.3fms", timer.Elapsed() * 1000);
-
- return all_crc;
-}
-
-libtextclassifier3::Status LiteIndex::Reset() {
- IcingTimer timer;
-
- // TODO(b/140436942): When these components have been changed to return errors
- // they should be propagated from here.
- lexicon_.Clear();
- hit_buffer_.Clear();
- header_->Reset();
- UpdateChecksum();
-
- ICING_VLOG(2) << IcingStringUtil::StringPrintf("Lite index clear in %.3fms",
- timer.Elapsed() * 1000);
- return libtextclassifier3::Status::OK;
-}
-
-void LiteIndex::Warm() {
- hit_buffer_.Warm();
- lexicon_.Warm();
-}
-
-libtextclassifier3::Status LiteIndex::PersistToDisk() {
- bool success = true;
- if (!lexicon_.Sync()) {
- ICING_VLOG(1) << "Failed to sync the lexicon.";
- success = false;
- }
- hit_buffer_.Sync();
- UpdateChecksum();
- header_mmap_.Sync();
-
- return (success) ? libtextclassifier3::Status::OK
- : absl_ports::InternalError(
- "Unable to sync lite index components.");
-}
-
-void LiteIndex::UpdateChecksum() {
- header_->set_lite_index_crc(ComputeChecksum().Get());
-}
-
-libtextclassifier3::StatusOr<uint32_t> LiteIndex::InsertTerm(
- const std::string& term, TermMatchType::Code term_match_type,
- NamespaceId namespace_id) {
- uint32_t tvi;
- if (!lexicon_.Insert(term.c_str(), "", &tvi, false)) {
- return absl_ports::ResourceExhaustedError(
- absl_ports::StrCat("Unable to add term ", term, " to lexicon!"));
- }
- ICING_RETURN_IF_ERROR(UpdateTermProperties(
- tvi, term_match_type == TermMatchType::PREFIX, namespace_id));
- return tvi;
-}
-
-libtextclassifier3::Status LiteIndex::UpdateTermProperties(
- uint32_t tvi, bool hasPrefixHits, NamespaceId namespace_id) {
- if (hasPrefixHits &&
- !lexicon_.SetProperty(tvi, GetHasHitsInPrefixSectionPropertyId())) {
- return absl_ports::ResourceExhaustedError(
- "Insufficient disk space to create prefix property!");
- }
-
- if (!lexicon_.SetProperty(tvi, GetNamespacePropertyId(namespace_id))) {
- return absl_ports::ResourceExhaustedError(
- "Insufficient disk space to create namespace property!");
- }
-
- return libtextclassifier3::Status::OK;
-}
-
-libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) {
- if (is_full()) {
- return absl_ports::ResourceExhaustedError("Hit buffer is full!");
- }
-
- header_->set_last_added_docid(hit.document_id());
-
- Element elt(term_id, hit);
- uint32_t cur_size = header_->cur_size();
- Element::Value* valp = hit_buffer_.GetMutableMem<Element::Value>(cur_size, 1);
- if (valp == nullptr) {
- return absl_ports::ResourceExhaustedError(
- "Allocating more space in hit buffer failed!");
- }
- *valp = elt.value();
- header_->set_cur_size(cur_size + 1);
-
- return libtextclassifier3::Status::OK;
-}
-
-libtextclassifier3::StatusOr<uint32_t> LiteIndex::FindTerm(
- const std::string& term) const {
- char dummy;
- uint32_t tvi;
- if (!lexicon_.Find(term.c_str(), &dummy, &tvi)) {
- return absl_ports::NotFoundError(
- absl_ports::StrCat("Could not find ", term, " in the lexicon."));
- }
- return tvi;
-}
-
-uint32_t LiteIndex::AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
- bool only_from_prefix_sections,
- std::vector<DocHitInfo>* hits_out) {
- uint32_t count = 0;
- DocumentId last_document_id = kInvalidDocumentId;
- for (uint32_t idx = Seek(term_id); idx < header_->cur_size(); idx++) {
- Element elt(hit_buffer_.array_cast<Element>()[idx]);
- if (elt.term_id() != term_id) break;
-
- const Hit& hit = elt.hit();
- // Check sections.
- if (((1u << hit.section_id()) & section_id_mask) == 0) {
- continue;
- }
- // Check prefix section only.
- if (only_from_prefix_sections && !hit.is_in_prefix_section()) {
- continue;
- }
- DocumentId document_id = hit.document_id();
- if (document_id != last_document_id) {
- count++;
- if (hits_out != nullptr) {
- hits_out->push_back(DocHitInfo(document_id));
- }
- last_document_id = document_id;
- }
- if (hits_out != nullptr) {
- hits_out->back().UpdateSection(hit.section_id(), hit.score());
- }
- }
- return count;
-}
-
-uint32_t LiteIndex::CountHits(uint32_t term_id) {
- return AppendHits(term_id, kSectionIdMaskAll,
- /*only_from_prefix_sections=*/false,
- /*hits_out=*/nullptr);
-}
-
-bool LiteIndex::is_full() const {
- return (header_->cur_size() == options_.hit_buffer_size ||
- lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
-}
-
-void LiteIndex::GetDebugInfo(int verbosity, std::string* out) const {
- absl_ports::StrAppend(
- out, IcingStringUtil::StringPrintf("Lite Index\nHit buffer %u/%u\n",
- header_->cur_size(),
- options_.hit_buffer_size));
-
- // Lexicon.
- out->append("Lexicon stats:\n");
- lexicon_.GetDebugInfo(verbosity, out);
-}
-
-libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
- int64_t header_and_hit_buffer_file_size =
- filesystem_->GetFileSize(hit_buffer_fd_.get());
-
- if (header_and_hit_buffer_file_size == Filesystem::kBadFileSize) {
- return absl_ports::InternalError(
- "Failed to get element size of the LiteIndex's header and hit buffer");
- }
-
- int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
- if (lexicon_disk_usage == IcingFilesystem::kBadFileSize) {
- return absl_ports::InternalError(
- "Failed to get element size of LiteIndex's lexicon");
- }
-
- // On initialization, we grow the file to a padded size first. So this size
- // won't count towards the size taken up by elements
- size_t header_padded_size = IcingMMapper::page_aligned_size(header_size());
-
- return header_and_hit_buffer_file_size - header_padded_size +
- lexicon_disk_usage;
-}
-
-uint32_t LiteIndex::Seek(uint32_t term_id) {
- // Make searchable by sorting by hit buffer.
- uint32_t sort_len = header_->cur_size() - header_->searchable_end();
- if (sort_len > 0) {
- IcingTimer timer;
-
- auto* array_start =
- hit_buffer_.GetMutableMem<Element::Value>(0, header_->cur_size());
- Element::Value* sort_start = array_start + header_->searchable_end();
- std::sort(sort_start, array_start + header_->cur_size());
-
- // Now merge with previous region. Since the previous region is already
- // sorted and deduplicated, optimize the merge by skipping everything less
- // than the new region's smallest value.
- if (header_->searchable_end() > 0) {
- std::inplace_merge(array_start, array_start + header_->searchable_end(),
- array_start + header_->cur_size());
- }
- ICING_VLOG(2) << IcingStringUtil::StringPrintf(
- "Lite index sort and merge %u into %u in %.3fms", sort_len,
- header_->searchable_end(), timer.Elapsed() * 1000);
-
- // Now the entire array is sorted.
- header_->set_searchable_end(header_->cur_size());
-
- // Update crc in-line.
- UpdateChecksum();
- }
-
- // Binary search for our term_id. Make sure we get the first
- // element. Using kBeginSortValue ensures this for the hit value.
- Element elt(term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kMaxHitScore));
-
- const Element::Value* array = hit_buffer_.array_cast<Element::Value>();
- const Element::Value* ptr =
- std::lower_bound(array, array + header_->cur_size(), elt.value());
- return ptr - array;
-}
-
-} // namespace lib
-} // namespace icing
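
The Seek() routine deleted above amortizes sorting: the buffer keeps an already-sorted prefix [0, searchable_end) and only sorts the newly appended tail before merging it in. A standalone sketch of that pattern over a plain vector of packed values (not the mmapped hit buffer, and without the checksum bookkeeping):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Sorts the unsorted tail and merges it into the sorted prefix, then
    // extends the searchable region to cover the whole buffer.
    void MakeSearchable(std::vector<uint64_t>& buffer, size_t& searchable_end) {
      if (searchable_end == buffer.size()) return;  // Already fully sorted.
      std::sort(buffer.begin() + searchable_end, buffer.end());
      if (searchable_end > 0) {
        std::inplace_merge(buffer.begin(), buffer.begin() + searchable_end,
                           buffer.end());
      }
      searchable_end = buffer.size();
    }

    // Returns the index of the first element >= value, as LiteIndex::Seek
    // does with its lower_bound over packed Element values.
    size_t Seek(std::vector<uint64_t>& buffer, size_t& searchable_end,
                uint64_t value) {
      MakeSearchable(buffer, searchable_end);
      return std::lower_bound(buffer.begin(), buffer.end(), value) -
             buffer.begin();
    }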
diff --git a/icing/index/lite-index.h b/icing/index/lite-index.h
deleted file mode 100644
index b60a947..0000000
--- a/icing/index/lite-index.h
+++ /dev/null
@@ -1,269 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// A small index with continuous updates (doesn't need explicit Flush
-// to persist) but has a higher possibility of corruption. It can always
-// detect corruption reliably.
-
-#ifndef ICING_INDEX_LITE_INDEX_H_
-#define ICING_INDEX_LITE_INDEX_H_
-
-#include <cstdint>
-#include <limits>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/file/filesystem.h"
-#include "icing/index/hit/doc-hit-info.h"
-#include "icing/index/hit/hit.h"
-#include "icing/legacy/index/icing-array-storage.h"
-#include "icing/legacy/index/icing-dynamic-trie.h"
-#include "icing/legacy/index/icing-filesystem.h"
-#include "icing/legacy/index/icing-lite-index-header.h"
-#include "icing/legacy/index/icing-lite-index-options.h"
-#include "icing/legacy/index/icing-mmapper.h"
-#include "icing/proto/term.pb.h"
-#include "icing/schema/section.h"
-#include "icing/store/document-id.h"
-#include "icing/store/namespace-id.h"
-#include "icing/util/bit-util.h"
-#include "icing/util/crc32.h"
-
-namespace icing {
-namespace lib {
-
-class LiteIndex {
- public:
- // An entry in the hit buffer.
- class Element {
- public:
- // Layout bits: 24 termid + 32 hit value + 8 hit score.
- using Value = uint64_t;
-
- static constexpr int kTermIdBits = 24;
- static constexpr int kHitValueBits = sizeof(Hit::Value) * 8;
- static constexpr int kHitScoreBits = sizeof(Hit::Score) * 8;
-
- static const Value kInvalidValue;
-
- explicit Element(Value v = kInvalidValue) : value_(v) {}
-
- Element(uint32_t term_id, const Hit& hit) {
- static_assert(
- kTermIdBits + kHitValueBits + kHitScoreBits <= sizeof(Value) * 8,
- "LiteIndexElementTooBig");
-
- value_ = 0;
- // Term id goes into the most significant bits because it takes
-      // precedence in sorts.
- bit_util::BitfieldSet(term_id, kHitValueBits + kHitScoreBits, kTermIdBits,
- &value_);
- bit_util::BitfieldSet(hit.value(), kHitScoreBits, kHitValueBits, &value_);
- bit_util::BitfieldSet(hit.score(), 0, kHitScoreBits, &value_);
- }
-
- uint32_t term_id() const {
- return bit_util::BitfieldGet(value_, kHitValueBits + kHitScoreBits,
- kTermIdBits);
- }
-
- Hit hit() const {
- return Hit(bit_util::BitfieldGet(value_, kHitScoreBits, kHitValueBits),
- bit_util::BitfieldGet(value_, 0, kHitScoreBits));
- }
-
- Value value() const { return value_; }
-
- private:
- Value value_;
- };
-
- using Options = IcingLiteIndexOptions;
-
- // Updates checksum of subcomponents.
- ~LiteIndex();
-
- // Creates lite index from storage. The files will be created if they do not
- // already exist.
- //
- // Returns:
- // OK on success
- // DATA_LOSS if the index was corrupted and cleared
- // INTERNAL on I/O error
- static libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> Create(
- const Options& options, const IcingFilesystem* filesystem);
-
- // Resets all internal members of the index. Returns OK if all operations were
- // successful.
- libtextclassifier3::Status Reset();
-
- // Advises the OS to cache pages in the index, which will be accessed for a
- // query soon.
- void Warm();
-
- // Syncs all modified files in the index to disk.
- //
- // Returns:
- // OK on success
- // INTERNAL on I/O error
- libtextclassifier3::Status PersistToDisk();
-
- // Calculate the checksum of all sub-components of the LiteIndex
- Crc32 ComputeChecksum();
-
- // Returns term_id if term found, NOT_FOUND otherwise.
- libtextclassifier3::StatusOr<uint32_t> FindTerm(
- const std::string& term) const;
-
- // Returns an iterator for all terms for which 'prefix' is a prefix.
- class PrefixIterator {
- public:
- explicit PrefixIterator(const IcingDynamicTrie::Iterator& delegate)
- : delegate_(delegate) {}
- bool IsValid() const { return delegate_.IsValid(); }
-
- void Advance() { delegate_.Advance(); }
-
- const char* GetKey() const { return delegate_.GetKey(); }
-
- uint32_t GetValueIndex() const { return delegate_.GetValueIndex(); }
-
- private:
- IcingDynamicTrie::Iterator delegate_;
- };
-
- PrefixIterator FindTermPrefixes(const std::string& prefix) const {
- return PrefixIterator(IcingDynamicTrie::Iterator(lexicon_, prefix.c_str()));
- }
-
- // Inserts a term with its properties.
- //
- // Returns:
- // A value index on success
- // RESOURCE_EXHAUSTED if lexicon is full or no disk space is available
- libtextclassifier3::StatusOr<uint32_t> InsertTerm(
- const std::string& term, TermMatchType::Code term_match_type,
- NamespaceId namespace_id);
-
- // Updates term properties by setting hasPrefixHits and namespace id of the
- // term.
- //
- // Returns:
- // OK on success
- // RESOURCE_EXHAUSTED if no disk space is available
- libtextclassifier3::Status UpdateTermProperties(uint32_t tvi,
- bool hasPrefixHits,
- NamespaceId namespace_id);
-
- // Append hit to buffer. term_id must be encoded using the same term_id_codec
- // supplied to the index constructor. Returns non-OK if hit cannot be added
-  // (because either the hit buffer or file system capacity was reached).
- libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit);
-
- // Add all hits with term_id from the sections specified in section_id_mask,
- // skipping hits in non-prefix sections if only_from_prefix_sections is true,
- // to hits_out.
- uint32_t AppendHits(uint32_t term_id, SectionIdMask section_id_mask,
- bool only_from_prefix_sections,
- std::vector<DocHitInfo>* hits_out);
-
- // Returns the hit count of the term.
- uint32_t CountHits(uint32_t term_id);
-
- // Check if buffer has reached its capacity.
- bool is_full() const;
-
- constexpr static uint32_t max_hit_buffer_size() {
- return std::numeric_limits<uint32_t>::max() / sizeof(LiteIndex::Element);
- }
-
- // We keep track of the last added document_id. This is always the largest
- // document_id that has been added because hits can only be added in order of
- // increasing document_id.
- DocumentId last_added_document_id() const {
- return header_->last_added_docid();
- }
-
- const IcingDynamicTrie& lexicon() const { return lexicon_; }
-
- // Returns debug information for the index in out.
- // verbosity <= 0, simplest debug information - size of lexicon, hit buffer
- // verbosity > 0, more detailed debug information from the lexicon.
- void GetDebugInfo(int verbosity, std::string* out) const;
-
- // Returns the byte size of all the elements held in the index. This excludes
- // the size of any internal metadata of the index, e.g. the index's header.
- //
- // Returns:
- // Byte size on success
- // INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;
-
- private:
- static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions();
-
- LiteIndex(const Options& options, const IcingFilesystem* filesystem);
-
- // Initializes lite index from storage. Must be called exactly once after
- // object construction.
- //
- // Returns:
- // OK on success
- // DATA_LOSS if the index was corrupted and cleared
- // INTERNAL on I/O error
- libtextclassifier3::Status Initialize();
-
- bool initialized() const { return header_ != nullptr; }
-
- // Sets the computed checksum in the header
- void UpdateChecksum();
-
- // Returns the position of the first element with term_id, or the size of the
- // hit buffer if term_id is not present.
- uint32_t Seek(uint32_t term_id);
-
- // File descriptor that points to where the header and hit buffer are written
- // to.
- ScopedFd hit_buffer_fd_;
-
- // Mmapped region past the header that stores the hits.
- IcingArrayStorage hit_buffer_;
-
- // Crc checksum of the hits, excludes the header.
- uint32_t hit_buffer_crc_;
-
- // Trie that maps indexed terms to their term id
- IcingDynamicTrie lexicon_;
-
- // TODO(b/140437260): Port over to MemoryMappedFile
- // Memory mapped region of the underlying file that reflects the header.
- IcingMMapper header_mmap_;
-
- // Wrapper around the mmapped header that contains stats on the lite index.
- std::unique_ptr<IcingLiteIndex_Header> header_;
-
- // Options used to initialize the LiteIndex.
- const Options options_;
-
- // TODO(b/139087650) Move to icing::Filesystem
- const IcingFilesystem* const filesystem_;
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_INDEX_LITE_INDEX_H_
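
The deleted Element layout (24-bit term id, 32-bit hit value, 8-bit score packed into one uint64_t, with the term id in the most significant bits so packed values sort by term id first) can be illustrated with plain shifts; the original code routes the same arithmetic through bit_util:

    #include <cstdint>

    constexpr int kScoreBits = 8;      // Mirrors kHitScoreBits above.
    constexpr int kHitValueBits = 32;  // Mirrors kHitValueBits above.

    // Packs [24-bit term id][32-bit hit value][8-bit score] into one value.
    // term_id must fit in 24 bits; any higher bits are shifted out and lost.
    uint64_t PackElement(uint32_t term_id, uint32_t hit_value, uint8_t score) {
      return (static_cast<uint64_t>(term_id) << (kHitValueBits + kScoreBits)) |
             (static_cast<uint64_t>(hit_value) << kScoreBits) |
             static_cast<uint64_t>(score);
    }

    uint32_t UnpackTermId(uint64_t value) {
      return static_cast<uint32_t>(value >> (kHitValueBits + kScoreBits));
    }

    uint8_t UnpackScore(uint64_t value) {
      return static_cast<uint8_t>(value & 0xFF);
    }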
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.cc b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
new file mode 100644
index 0000000..21eecb6
--- /dev/null
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.cc
@@ -0,0 +1,217 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <numeric>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/schema/section.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
+ std::string mask(kTotalNumSections, '0');
+ for (SectionId i = kMaxSectionId; i >= 0; --i) {
+ if (section_id_mask & (UINT64_C(1) << i)) {
+ mask[kMaxSectionId - i] = '1';
+ }
+ }
+ return mask;
+}
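
The helper above prints the mask most-significant section first: bit i lands at string position kMaxSectionId - i. Below is a minimal standalone sketch of that layout, assuming 16 sections with kMaxSectionId = 15 (the real constants live in schema/section.h):

#include <cstdint>
#include <iostream>
#include <string>

int main() {
  constexpr int kMaxSectionId = 15;
  constexpr int kTotalNumSections = 16;
  // Sections 1 and 15 have hits.
  uint64_t section_id_mask = (UINT64_C(1) << 1) | (UINT64_C(1) << 15);
  std::string mask(kTotalNumSections, '0');
  for (int i = kMaxSectionId; i >= 0; --i) {
    if (section_id_mask & (UINT64_C(1) << i)) {
      mask[kMaxSectionId - i] = '1';  // Highest section id is printed first.
    }
  }
  std::cout << mask << "\n";  // Prints "1000000000000010".
  return 0;
}
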
+
+} // namespace
+
+libtextclassifier3::Status DocHitInfoIteratorTermLite::Advance() {
+ if (cached_hits_idx_ == -1) {
+ libtextclassifier3::Status status = RetrieveMoreHits();
+ if (!status.ok()) {
+ if (!absl_ports::IsNotFound(status)) {
+        // NOT_FOUND is expected to happen (not every term will be in the
+        // lite index!). Other errors are worth logging.
+ ICING_LOG(ERROR)
+ << "Encountered unexpected failure while retrieving hits "
+ << status.error_message();
+ }
+ return absl_ports::ResourceExhaustedError(
+ "No more DocHitInfos in iterator");
+ }
+ } else {
+ ++cached_hits_idx_;
+ }
+ if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) {
+ // Nothing more for the iterator to return. Set these members to invalid
+ // values.
+ doc_hit_info_ = DocHitInfo();
+ return absl_ports::ResourceExhaustedError(
+ "No more DocHitInfos in iterator");
+ }
+ ++num_advance_calls_;
+ doc_hit_info_ = cached_hits_.at(cached_hits_idx_);
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
+DocHitInfoIteratorTermLite::TrimRightMostNode() && {
+ // Leaf iterator should trim itself.
+ DocHitInfoIterator::TrimmedNode node = {nullptr, term_, term_start_index_,
+ unnormalized_term_length_};
+ return node;
+}
+
+libtextclassifier3::Status DocHitInfoIteratorTermLiteExact::RetrieveMoreHits() {
+ // Exact match only. All hits in lite lexicon are exact.
+ ICING_ASSIGN_OR_RETURN(uint32_t tvi, lite_index_->GetTermId(term_));
+ ICING_ASSIGN_OR_RETURN(uint32_t term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ lite_index_->FetchHits(
+ term_id, section_restrict_mask_,
+ /*only_from_prefix_sections=*/false,
+ /*score_by=*/
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE,
+ /*namespace_checker=*/nullptr, &cached_hits_,
+ need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr);
+ cached_hits_idx_ = 0;
+ return libtextclassifier3::Status::OK;
+}
+
+std::string DocHitInfoIteratorTermLiteExact::ToString() const {
+ return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+ term_);
+}
+
+libtextclassifier3::Status
+DocHitInfoIteratorTermLitePrefix::RetrieveMoreHits() {
+ // Take union of lite terms.
+ int term_len = term_.length();
+ int terms_matched = 0;
+ for (LiteIndex::PrefixIterator it = lite_index_->FindTermPrefixes(term_);
+ it.IsValid(); it.Advance()) {
+ bool exact_match = strlen(it.GetKey()) == term_len;
+ ICING_ASSIGN_OR_RETURN(
+ uint32_t term_id,
+ term_id_codec_->EncodeTvi(it.GetValueIndex(), TviType::LITE));
+ lite_index_->FetchHits(
+ term_id, section_restrict_mask_,
+ /*only_from_prefix_sections=*/!exact_match,
+ /*score_by=*/
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE,
+ /*namespace_checker=*/nullptr, &cached_hits_,
+ need_hit_term_frequency_ ? &cached_hit_term_frequency_ : nullptr);
+ ++terms_matched;
+ }
+ if (terms_matched > 1) {
+ SortAndDedupeDocumentIds();
+ }
+ cached_hits_idx_ = 0;
+ return libtextclassifier3::Status::OK;
+}
+
+void DocHitInfoIteratorTermLitePrefix::SortDocumentIds() {
+ // Re-sort cached document_ids and merge sections.
+ if (!need_hit_term_frequency_) {
+ // If we don't need to also sort cached_hit_term_frequency_ along with
+ // cached_hits_, then just simply sort cached_hits_.
+    std::sort(cached_hits_.begin(), cached_hits_.end());
+ } else {
+ // Sort cached_hit_term_frequency_ along with cached_hits_.
+ std::vector<int> indices(cached_hits_.size());
+ std::iota(indices.begin(), indices.end(), 0);
+ std::sort(indices.begin(), indices.end(), [this](int i, int j) {
+ return cached_hits_[i] < cached_hits_[j];
+ });
+ // Now indices is a map from sorted index to current index. In other words,
+ // the sorted cached_hits_[i] should be the current cached_hits_[indices[i]]
+ // for every valid i.
+ std::vector<bool> done(indices.size());
+ // Apply permutation
+ for (int i = 0; i < indices.size(); ++i) {
+ if (done[i]) {
+ continue;
+ }
+ done[i] = true;
+ int curr = i;
+ int next = indices[i];
+ // Since every finite permutation is formed by disjoint cycles, we can
+ // start with the current element, at index i, and swap the element at
+ // this position with whatever element that *should* be here. Then,
+ // continue to swap the original element, at its updated positions, with
+ // the element that should be occupying that position until the original
+ // element has reached *its* correct position. This completes applying the
+ // single cycle in the permutation.
+ while (next != i) {
+ std::swap(cached_hits_[curr], cached_hits_[next]);
+ std::swap(cached_hit_term_frequency_[curr],
+ cached_hit_term_frequency_[next]);
+ done[next] = true;
+ curr = next;
+ next = indices[next];
+ }
+ }
+ }
+}
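
The cycle-walking loop above is the standard trick for applying a precomputed permutation in place while keeping two parallel arrays aligned. A minimal self-contained sketch of the same technique, using a plain int/char pair instead of DocHitInfo and Hit::TermFrequencyArray:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

int main() {
  std::vector<int> keys = {3, 1, 2};
  std::vector<char> payload = {'c', 'a', 'b'};

  // indices[i] names the current position of the element that belongs at
  // sorted position i.
  std::vector<int> indices(keys.size());
  std::iota(indices.begin(), indices.end(), 0);
  std::sort(indices.begin(), indices.end(),
            [&keys](int i, int j) { return keys[i] < keys[j]; });

  // Walk each disjoint cycle of the permutation, swapping both arrays in
  // lockstep so every payload stays attached to its key.
  std::vector<bool> done(indices.size(), false);
  for (int i = 0; i < static_cast<int>(indices.size()); ++i) {
    if (done[i]) continue;
    done[i] = true;
    int curr = i;
    int next = indices[i];
    while (next != i) {
      std::swap(keys[curr], keys[next]);
      std::swap(payload[curr], payload[next]);
      done[next] = true;
      curr = next;
      next = indices[next];
    }
  }
  for (std::size_t k = 0; k < keys.size(); ++k) {
    std::cout << keys[k] << payload[k] << " ";  // Prints "1a 2b 3c ".
  }
  return 0;
}
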
+
+void DocHitInfoIteratorTermLitePrefix::SortAndDedupeDocumentIds() {
+ SortDocumentIds();
+ int idx = 0;
+ for (int i = 1; i < cached_hits_.size(); ++i) {
+ const DocHitInfo& hit_info = cached_hits_[i];
+ DocHitInfo& collapsed_hit_info = cached_hits_[idx];
+ if (collapsed_hit_info.document_id() == hit_info.document_id()) {
+ SectionIdMask curr_mask = hit_info.hit_section_ids_mask();
+ collapsed_hit_info.MergeSectionsFrom(curr_mask);
+ if (need_hit_term_frequency_) {
+ Hit::TermFrequencyArray& collapsed_term_frequency =
+ cached_hit_term_frequency_[idx];
+ while (curr_mask) {
+ SectionId section_id = __builtin_ctzll(curr_mask);
+ collapsed_term_frequency[section_id] =
+ cached_hit_term_frequency_[i][section_id];
+ curr_mask &= ~(UINT64_C(1) << section_id);
+ }
+ }
+ } else {
+ // New document_id.
+ ++idx;
+ cached_hits_[idx] = hit_info;
+ if (need_hit_term_frequency_) {
+ cached_hit_term_frequency_[idx] = cached_hit_term_frequency_[i];
+ }
+ }
+ }
+ // idx points to last doc hit info.
+ cached_hits_.resize(idx + 1);
+ if (need_hit_term_frequency_) {
+ cached_hit_term_frequency_.resize(idx + 1);
+ }
+}
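
SortAndDedupeDocumentIds() compacts a sorted run in place: equal document ids collapse into one entry whose section mask is the union of the duplicates. A minimal sketch of that collapse, assuming hits are simple (doc_id, section_mask) pairs already sorted by doc_id (the real code also copies per-section term frequencies):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct HitInfo {
  int doc_id;
  uint64_t section_mask;
};

int main() {
  std::vector<HitInfo> hits = {{1, 0x1}, {1, 0x4}, {2, 0x2}};
  std::size_t idx = 0;
  for (std::size_t i = 1; i < hits.size(); ++i) {
    if (hits[idx].doc_id == hits[i].doc_id) {
      hits[idx].section_mask |= hits[i].section_mask;  // Merge sections.
    } else {
      hits[++idx] = hits[i];  // New doc_id; compact it forward.
    }
  }
  hits.resize(idx + 1);
  for (const HitInfo& h : hits) {
    std::cout << h.doc_id << ":" << h.section_mask << " ";  // "1:5 2:2 ".
  }
  return 0;
}
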
+
+std::string DocHitInfoIteratorTermLitePrefix::ToString() const {
+ return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+ term_, "*");
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/lite/doc-hit-info-iterator-term-lite.h b/icing/index/lite/doc-hit-info-iterator-term-lite.h
new file mode 100644
index 0000000..7facd88
--- /dev/null
+++ b/icing/index/lite/doc-hit-info-iterator-term-lite.h
@@ -0,0 +1,173 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
+
+#include <array>
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite/lite-index.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+class DocHitInfoIteratorTermLite : public DocHitInfoLeafIterator {
+ public:
+ explicit DocHitInfoIteratorTermLite(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ int term_start_index,
+ int unnormalized_term_length,
+ SectionIdMask section_restrict_mask,
+ bool need_hit_term_frequency)
+ : term_(term),
+ term_start_index_(term_start_index),
+ unnormalized_term_length_(unnormalized_term_length),
+ lite_index_(lite_index),
+ cached_hits_idx_(-1),
+ term_id_codec_(term_id_codec),
+ num_advance_calls_(0),
+ section_restrict_mask_(section_restrict_mask),
+ need_hit_term_frequency_(need_hit_term_frequency) {}
+
+ libtextclassifier3::Status Advance() override;
+
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
+
+ CallStats GetCallStats() const override {
+ return CallStats(
+ /*num_leaf_advance_calls_lite_index_in=*/num_advance_calls_,
+ /*num_leaf_advance_calls_main_index_in=*/0,
+ /*num_leaf_advance_calls_integer_index_in=*/0,
+ /*num_leaf_advance_calls_no_index_in=*/0,
+ /*num_blocks_inspected_in=*/0);
+ }
+
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
+ if (cached_hits_idx_ == -1 || cached_hits_idx_ >= cached_hits_.size()) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ SectionIdMask section_mask =
+ doc_hit_info_.hit_section_ids_mask() & filtering_section_mask;
+ SectionIdMask section_mask_copy = section_mask;
+ std::array<Hit::TermFrequency, kTotalNumSections> section_term_frequencies =
+ {Hit::kNoTermFrequency};
+ while (section_mask_copy) {
+ SectionId section_id = __builtin_ctzll(section_mask_copy);
+ if (need_hit_term_frequency_) {
+ section_term_frequencies.at(section_id) =
+ cached_hit_term_frequency_.at(cached_hits_idx_)[section_id];
+ }
+ section_mask_copy &= ~(UINT64_C(1) << section_id);
+ }
+ TermMatchInfo term_stats(term_, section_mask,
+ std::move(section_term_frequencies));
+
+ for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) {
+ if (cur_term_stats.term == term_stats.term) {
+ // Same docId and same term, we don't need to add the term and the term
+ // frequency should always be the same
+ return;
+ }
+ }
+ matched_terms_stats->push_back(std::move(term_stats));
+ }
+
+ protected:
+ // Add DocHitInfos corresponding to term_ to cached_hits_.
+ //
+ // Returns:
+ // - OK, on success
+ // - NOT_FOUND if no term matching term_ was found in the lexicon.
+ // - INVALID_ARGUMENT if unable to properly encode the termid
+ virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
+
+ const std::string term_;
+ // The start index of the given term in the search query
+ int term_start_index_;
+ // The length of the given unnormalized term in the search query
+ int unnormalized_term_length_;
+ LiteIndex* const lite_index_;
+ // Stores hits retrieved from the index. This may only be a subset of the hits
+ // that are present in the index. Current value pointed to by the Iterator is
+ // tracked by cached_hits_idx_.
+ std::vector<DocHitInfo> cached_hits_;
+ std::vector<Hit::TermFrequencyArray> cached_hit_term_frequency_;
+ int cached_hits_idx_;
+ const TermIdCodec* term_id_codec_;
+ int num_advance_calls_;
+ // Mask indicating which sections hits should be considered for.
+ // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
+ const SectionIdMask section_restrict_mask_;
+ const bool need_hit_term_frequency_;
+};
+
+class DocHitInfoIteratorTermLiteExact : public DocHitInfoIteratorTermLite {
+ public:
+ explicit DocHitInfoIteratorTermLiteExact(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ int term_start_index,
+ int unnormalized_term_length,
+ SectionIdMask section_id_mask,
+ bool need_hit_term_frequency)
+ : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
+ term_start_index, unnormalized_term_length,
+ section_id_mask, need_hit_term_frequency) {}
+
+ std::string ToString() const override;
+
+ protected:
+ libtextclassifier3::Status RetrieveMoreHits() override;
+};
+
+class DocHitInfoIteratorTermLitePrefix : public DocHitInfoIteratorTermLite {
+ public:
+ explicit DocHitInfoIteratorTermLitePrefix(const TermIdCodec* term_id_codec,
+ LiteIndex* lite_index,
+ const std::string& term,
+ int term_start_index,
+ int unnormalized_term_length,
+ SectionIdMask section_id_mask,
+ bool need_hit_term_frequency)
+ : DocHitInfoIteratorTermLite(term_id_codec, lite_index, term,
+ term_start_index, unnormalized_term_length,
+ section_id_mask, need_hit_term_frequency) {}
+
+ std::string ToString() const override;
+
+ protected:
+ libtextclassifier3::Status RetrieveMoreHits() override;
+
+ private:
+  // After retrieving DocHitInfos from the index, the results may contain
+  // multiple DocHitInfos for the same docid, e.g. a DocHitInfo for docid 1
+  // and "foo" and a DocHitInfo for docid 1 and "fool". These DocHitInfos
+  // should be merged.
+ void SortDocumentIds();
+ void SortAndDedupeDocumentIds();
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_LITE_H_
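
PopulateMatchedTermsStats() above visits only the set bits of a section mask: __builtin_ctzll (a GCC/Clang builtin, which the surrounding code already relies on) returns the index of the lowest set bit, which is processed and then cleared. A minimal sketch of that iteration pattern:

#include <cstdint>
#include <iostream>

int main() {
  uint64_t mask =
      (UINT64_C(1) << 2) | (UINT64_C(1) << 5) | (UINT64_C(1) << 63);
  while (mask) {
    int section_id = __builtin_ctzll(mask);  // Index of the lowest set bit.
    std::cout << "section " << section_id << "\n";  // Prints 2, 5, 63.
    mask &= ~(UINT64_C(1) << section_id);  // Clear that bit and continue.
  }
  return 0;
}

This loop runs once per set section rather than once per possible section, which matters when masks are sparse.
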
diff --git a/icing/legacy/index/icing-lite-index-header.h b/icing/index/lite/lite-index-header.h
index ac2d3c0..75de8fa 100644
--- a/icing/legacy/index/icing-lite-index-header.h
+++ b/icing/index/lite/lite-index-header.h
@@ -15,16 +15,19 @@
#ifndef ICING_LEGACY_INDEX_ICING_LITE_INDEX_HEADER_H_
#define ICING_LEGACY_INDEX_ICING_LITE_INDEX_HEADER_H_
+#include <cstddef>
+#include <cstdint>
+
#include "icing/legacy/core/icing-string-util.h"
-#include "icing/legacy/index/icing-common-types.h"
+#include "icing/store/document-id.h"
namespace icing {
namespace lib {
// A wrapper around the actual mmapped header data.
-class IcingLiteIndex_Header {
+class LiteIndex_Header {
public:
- virtual ~IcingLiteIndex_Header() = default;
+ virtual ~LiteIndex_Header() = default;
// Returns true if the magic of the header matches the hard-coded magic
// value associated with this header format.
@@ -47,10 +50,17 @@ class IcingLiteIndex_Header {
virtual void Reset() = 0;
};
-class IcingLiteIndex_HeaderImpl : public IcingLiteIndex_Header {
+class LiteIndex_HeaderImpl : public LiteIndex_Header {
public:
struct HeaderData {
- static const uint32_t kMagic = 0x6dfba6a0;
+ static uint32_t GetCurrentMagic(
+ bool include_property_existence_metadata_hits) {
+ if (!include_property_existence_metadata_hits) {
+ return 0x01c61418;
+ } else {
+ return 0x56e07d5b;
+ }
+ }
uint32_t lite_index_crc;
uint32_t magic;
@@ -66,10 +76,15 @@ class IcingLiteIndex_HeaderImpl : public IcingLiteIndex_Header {
uint32_t searchable_end;
};
- explicit IcingLiteIndex_HeaderImpl(HeaderData *hdr) : hdr_(hdr) {}
+ explicit LiteIndex_HeaderImpl(HeaderData *hdr,
+ bool include_property_existence_metadata_hits)
+ : hdr_(hdr),
+ include_property_existence_metadata_hits_(
+ include_property_existence_metadata_hits) {}
bool check_magic() const override {
- return hdr_->magic == HeaderData::kMagic;
+ return hdr_->magic == HeaderData::GetCurrentMagic(
+ include_property_existence_metadata_hits_);
}
uint32_t lite_index_crc() const override { return hdr_->lite_index_crc; }
@@ -96,16 +111,18 @@ class IcingLiteIndex_HeaderImpl : public IcingLiteIndex_Header {
void Reset() override {
hdr_->lite_index_crc = 0;
- hdr_->magic = HeaderData::kMagic;
- hdr_->last_added_docid = kIcingInvalidDocId;
+ hdr_->magic =
+ HeaderData::GetCurrentMagic(include_property_existence_metadata_hits_);
+ hdr_->last_added_docid = kInvalidDocumentId;
hdr_->cur_size = 0;
hdr_->searchable_end = 0;
}
private:
HeaderData *hdr_;
+ bool include_property_existence_metadata_hits_;
};
-static_assert(24 == sizeof(IcingLiteIndex_HeaderImpl::HeaderData),
+static_assert(24 == sizeof(LiteIndex_HeaderImpl::HeaderData),
"sizeof(HeaderData) != 24");
} // namespace lib
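
GetCurrentMagic() ties the on-disk header format to the property-existence feature flag: flipping the flag changes the expected magic, so check_magic() fails and the index is rebuilt rather than misread. A minimal sketch of that versioning scheme, reusing the two magic constants from this header:

#include <cstdint>
#include <iostream>

uint32_t GetCurrentMagic(bool include_metadata_hits) {
  return include_metadata_hits ? 0x56e07d5b : 0x01c61418;
}

struct Header {
  uint32_t magic;
};

bool CheckMagic(const Header& hdr, bool include_metadata_hits) {
  return hdr.magic == GetCurrentMagic(include_metadata_hits);
}

int main() {
  Header hdr{GetCurrentMagic(/*include_metadata_hits=*/false)};
  // Re-opening with the flag flipped fails the magic check, forcing a
  // rebuild instead of misreading the old format.
  std::cout << CheckMagic(hdr, /*include_metadata_hits=*/false) << " "
            << CheckMagic(hdr, /*include_metadata_hits=*/true) << "\n";  // 1 0
  return 0;
}
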
diff --git a/icing/legacy/index/icing-lite-index-options.cc b/icing/index/lite/lite-index-options.cc
index 4bf0d38..7e6c076 100644
--- a/icing/legacy/index/icing-lite-index-options.cc
+++ b/icing/index/lite/lite-index-options.cc
@@ -12,13 +12,31 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/legacy/index/icing-lite-index-options.h"
+#include "icing/index/lite/lite-index-options.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
namespace icing {
namespace lib {
namespace {
+constexpr int kIcingMaxVariantsPerToken = 10;  // Max index variants per token
+
+constexpr size_t kIcingMaxSearchableDocumentSize = (1u << 16) - 1; // 64K
+// Max num tokens per document. 64KB is our original maximum (searchable)
+// document size. We clip if document exceeds this.
+constexpr uint32_t kIcingMaxNumTokensPerDoc =
+ kIcingMaxSearchableDocumentSize / 5;
+constexpr uint32_t kIcingMaxNumHitsPerDocument =
+ kIcingMaxNumTokensPerDoc * kIcingMaxVariantsPerToken;
+
uint32_t CalculateHitBufferSize(uint32_t hit_buffer_want_merge_bytes) {
constexpr uint32_t kHitBufferSlopMult = 2;
@@ -27,7 +45,7 @@ uint32_t CalculateHitBufferSize(uint32_t hit_buffer_want_merge_bytes) {
// TODO(b/111690435) Move LiteIndex::Element to a separate file so that this
// can use sizeof(LiteIndex::Element)
uint32_t hit_capacity_elts_with_slop =
- hit_buffer_want_merge_bytes / sizeof(uint64_t);
+ hit_buffer_want_merge_bytes / sizeof(TermIdHitPair);
// Add some slop for index variants on top of max num tokens.
hit_capacity_elts_with_slop += kIcingMaxNumHitsPerDocument;
hit_capacity_elts_with_slop *= kHitBufferSlopMult;
@@ -51,10 +69,16 @@ IcingDynamicTrie::Options CalculateTrieOptions(uint32_t hit_buffer_size) {
} // namespace
-IcingLiteIndexOptions::IcingLiteIndexOptions(
- const std::string& filename_base, uint32_t hit_buffer_want_merge_bytes)
+LiteIndexOptions::LiteIndexOptions(
+ const std::string& filename_base, uint32_t hit_buffer_want_merge_bytes,
+ bool hit_buffer_sort_at_indexing, uint32_t hit_buffer_sort_threshold_bytes,
+ bool include_property_existence_metadata_hits)
: filename_base(filename_base),
- hit_buffer_want_merge_bytes(hit_buffer_want_merge_bytes) {
+ hit_buffer_want_merge_bytes(hit_buffer_want_merge_bytes),
+ hit_buffer_sort_at_indexing(hit_buffer_sort_at_indexing),
+ hit_buffer_sort_threshold_bytes(hit_buffer_sort_threshold_bytes),
+ include_property_existence_metadata_hits(
+ include_property_existence_metadata_hits) {
hit_buffer_size = CalculateHitBufferSize(hit_buffer_want_merge_bytes);
lexicon_options = CalculateTrieOptions(hit_buffer_size);
display_mappings_options = CalculateTrieOptions(hit_buffer_size);
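
CalculateHitBufferSize() converts the merge threshold from bytes to elements, adds slop for the worst-case hits of a single document, and doubles the result. A minimal sketch of that arithmetic, assuming a 1 MiB merge threshold and the 8-byte element size implied by sizeof(TermIdHitPair::Value):

#include <cstdint>
#include <iostream>

int main() {
  constexpr uint32_t kHitBufferSlopMult = 2;
  constexpr uint32_t kMaxVariantsPerToken = 10;
  constexpr uint32_t kMaxSearchableDocumentSize = (1u << 16) - 1;  // 64K
  constexpr uint32_t kMaxNumTokensPerDoc = kMaxSearchableDocumentSize / 5;
  constexpr uint32_t kMaxNumHitsPerDocument =
      kMaxNumTokensPerDoc * kMaxVariantsPerToken;
  constexpr uint32_t kSizeofElement = 8;  // assumed sizeof(TermIdHitPair)

  uint32_t hit_buffer_want_merge_bytes = 1024 * 1024;  // assumed 1 MiB
  uint32_t elts = hit_buffer_want_merge_bytes / kSizeofElement;  // 131072
  elts += kMaxNumHitsPerDocument;  // Slop: worst-case hits of one doc.
  elts *= kHitBufferSlopMult;
  std::cout << elts << "\n";  // 131072 + 131070 = 262142, doubled: 524284
  return 0;
}
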
diff --git a/icing/legacy/index/icing-lite-index-options.h b/icing/index/lite/lite-index-options.h
index 2922621..8b03449 100644
--- a/icing/legacy/index/icing-lite-index-options.h
+++ b/icing/index/lite/lite-index-options.h
@@ -15,20 +15,25 @@
#ifndef ICING_LEGACY_INDEX_ICING_LITE_INDEX_OPTIONS_H_
#define ICING_LEGACY_INDEX_ICING_LITE_INDEX_OPTIONS_H_
-#include "icing/legacy/index/icing-common-types.h"
+#include <cstdint>
+#include <string>
+
#include "icing/legacy/index/icing-dynamic-trie.h"
namespace icing {
namespace lib {
-struct IcingLiteIndexOptions {
- IcingLiteIndexOptions() = default;
- // Creates IcingLiteIndexOptions based off of the specified parameters. All
+struct LiteIndexOptions {
+ LiteIndexOptions() = default;
+ // Creates LiteIndexOptions based off of the specified parameters. All
// other fields are calculated based on the value of
// hit_buffer_want_merge_bytes and the logic in CalculateHitBufferSize and
// CalculateTrieOptions.
- IcingLiteIndexOptions(const std::string& filename_base,
- uint32_t hit_buffer_want_merge_bytes);
+ LiteIndexOptions(const std::string& filename_base,
+ uint32_t hit_buffer_want_merge_bytes,
+ bool hit_buffer_sort_at_indexing,
+ uint32_t hit_buffer_sort_threshold_bytes,
+ bool include_property_existence_metadata_hits = false);
IcingDynamicTrie::Options lexicon_options;
IcingDynamicTrie::Options display_mappings_options;
@@ -36,6 +41,9 @@ struct IcingLiteIndexOptions {
std::string filename_base;
uint32_t hit_buffer_want_merge_bytes = 0;
uint32_t hit_buffer_size = 0;
+ bool hit_buffer_sort_at_indexing = false;
+ uint32_t hit_buffer_sort_threshold_bytes = 0;
+ bool include_property_existence_metadata_hits = false;
};
} // namespace lib
diff --git a/icing/index/lite/lite-index.cc b/icing/index/lite/lite-index.cc
new file mode 100644
index 0000000..3f9cc93
--- /dev/null
+++ b/icing/index/lite/lite-index.cc
@@ -0,0 +1,716 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/lite/lite-index.h"
+
+#include <sys/mman.h>
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/mutex.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/lite/lite-index-header.h"
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/index/term-property-id.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/core/icing-timer.h"
+#include "icing/legacy/index/icing-array-storage.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/namespace-id.h"
+#include "icing/store/suggestion-result-checker.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Point at which we declare the trie full.
+constexpr double kTrieFullFraction = 0.95;
+
+std::string MakeHitBufferFilename(const std::string& filename_base) {
+ return filename_base + "hb";
+}
+
+size_t header_size() { return sizeof(LiteIndex_HeaderImpl::HeaderData); }
+
+} // namespace
+
+const TermIdHitPair::Value TermIdHitPair::kInvalidValue =
+ TermIdHitPair(0, Hit()).value();
+
+libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> LiteIndex::Create(
+ const LiteIndex::Options& options, const IcingFilesystem* filesystem) {
+ ICING_RETURN_ERROR_IF_NULL(filesystem);
+
+ std::unique_ptr<LiteIndex> lite_index =
+ std::unique_ptr<LiteIndex>(new LiteIndex(options, filesystem));
+ ICING_RETURN_IF_ERROR(lite_index->Initialize());
+ return std::move(lite_index);
+}
+
+// options.hit_buffer_size is the maximum size in elements. An appropriate
+// lexicon and display mapping size will be chosen based on the hit buffer
+// size.
+LiteIndex::LiteIndex(const LiteIndex::Options& options,
+ const IcingFilesystem* filesystem)
+ : hit_buffer_(*filesystem),
+ hit_buffer_crc_(0),
+ lexicon_(options.filename_base + "lexicon", MakeTrieRuntimeOptions(),
+ filesystem),
+ header_mmap_(false, MAP_SHARED),
+ options_(options),
+ filesystem_(filesystem) {}
+
+LiteIndex::~LiteIndex() {
+ if (initialized()) {
+ libtextclassifier3::Status unused = PersistToDisk();
+ }
+}
+
+IcingDynamicTrie::RuntimeOptions LiteIndex::MakeTrieRuntimeOptions() {
+ return IcingDynamicTrie::RuntimeOptions().set_storage_policy(
+ IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc);
+}
+
+libtextclassifier3::Status LiteIndex::Initialize() {
+ // Size of hit buffer's header struct, rounded up to the nearest number of
+ // system memory pages.
+ const size_t header_padded_size =
+ IcingMMapper::page_aligned_size(header_size());
+
+ // Variable declarations cannot cross goto jumps, so declare these up top.
+ libtextclassifier3::Status status;
+ uint64_t file_size;
+ IcingTimer timer;
+
+ absl_ports::unique_lock l(&mutex_);
+ if (!lexicon_.CreateIfNotExist(options_.lexicon_options) ||
+ !lexicon_.Init()) {
+ return absl_ports::InternalError("Failed to initialize lexicon trie");
+ }
+
+ hit_buffer_fd_.reset(filesystem_->OpenForWrite(
+ MakeHitBufferFilename(options_.filename_base).c_str()));
+ if (!hit_buffer_fd_.is_valid()) {
+ status = absl_ports::InternalError("Failed to open hit buffer file");
+ goto error;
+ }
+
+ file_size = filesystem_->GetFileSize(hit_buffer_fd_.get());
+ if (file_size == IcingFilesystem::kBadFileSize) {
+ status = absl_ports::InternalError("Failed to query hit buffer file size");
+ goto error;
+ }
+
+ if (file_size < header_padded_size) {
+ if (file_size != 0) {
+ status = absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Hit buffer had unexpected size %" PRIu64, file_size));
+ goto error;
+ }
+
+ ICING_VLOG(2) << "Creating new hit buffer";
+ // Make sure files are fresh.
+ if (!lexicon_.Remove() ||
+ !lexicon_.CreateIfNotExist(options_.lexicon_options) ||
+ !lexicon_.Init()) {
+ status =
+ absl_ports::InternalError("Failed to refresh lexicon during clear");
+ goto error;
+ }
+
+ // Create fresh hit buffer by first emptying the hit buffer file and then
+ // allocating header_padded_size of the cleared space.
+ if (!filesystem_->Truncate(hit_buffer_fd_.get(), 0) ||
+ !filesystem_->Truncate(hit_buffer_fd_.get(), header_padded_size)) {
+ status = absl_ports::InternalError("Failed to truncate hit buffer file");
+ goto error;
+ }
+
+ // Set up header.
+ header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size());
+ header_ = std::make_unique<LiteIndex_HeaderImpl>(
+ reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
+ header_mmap_.address()),
+ options_.include_property_existence_metadata_hits);
+ header_->Reset();
+
+ if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
+ sizeof(TermIdHitPair::Value), header_->cur_size(),
+ options_.hit_buffer_size, &hit_buffer_crc_, true)) {
+ status = absl_ports::InternalError("Failed to initialize new hit buffer");
+ goto error;
+ }
+
+ UpdateChecksum();
+ } else {
+ header_mmap_.Remap(hit_buffer_fd_.get(), kHeaderFileOffset, header_size());
+ header_ = std::make_unique<LiteIndex_HeaderImpl>(
+ reinterpret_cast<LiteIndex_HeaderImpl::HeaderData*>(
+ header_mmap_.address()),
+ options_.include_property_existence_metadata_hits);
+
+ if (!hit_buffer_.Init(hit_buffer_fd_.get(), header_padded_size, true,
+ sizeof(TermIdHitPair::Value), header_->cur_size(),
+ options_.hit_buffer_size, &hit_buffer_crc_, true)) {
+ status = absl_ports::InternalError(
+ "Failed to re-initialize existing hit buffer");
+ goto error;
+ }
+
+ // Check integrity.
+ if (!header_->check_magic()) {
+ status = absl_ports::InternalError("Lite index header magic mismatch");
+ goto error;
+ }
+ Crc32 crc = ComputeChecksum();
+ if (crc.Get() != header_->lite_index_crc()) {
+ status = absl_ports::DataLossError(
+ IcingStringUtil::StringPrintf("Lite index crc check failed: %u vs %u",
+ crc.Get(), header_->lite_index_crc()));
+ goto error;
+ }
+ }
+
+ ICING_VLOG(2) << "Lite index init ok in " << timer.Elapsed() * 1000 << "ms";
+ return status;
+
+error:
+ header_ = nullptr;
+ header_mmap_.Unmap();
+ lexicon_.Close();
+ hit_buffer_crc_ = 0;
+ hit_buffer_.Reset();
+ hit_buffer_fd_.reset();
+ if (status.ok()) {
+ return absl_ports::InternalError(
+ "Error handling code ran but status was ok");
+ }
+ return status;
+}
+
+Crc32 LiteIndex::ComputeChecksum() {
+ IcingTimer timer;
+
+ // Update crcs.
+ uint32_t dependent_crcs[2];
+ hit_buffer_.UpdateCrc();
+ dependent_crcs[0] = hit_buffer_crc_;
+ dependent_crcs[1] = lexicon_.UpdateCrc();
+
+ // Compute the master crc.
+
+ // Header crc, excluding the actual crc field.
+ Crc32 all_crc(header_->CalculateHeaderCrc());
+ all_crc.Append(std::string_view(reinterpret_cast<const char*>(dependent_crcs),
+ sizeof(dependent_crcs)));
+ ICING_VLOG(2) << "Lite index crc computed in " << timer.Elapsed() * 1000
+ << "ms";
+
+ return all_crc;
+}
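
ComputeChecksum() seeds a CRC with the header checksum and then appends the sub-component checksums as raw bytes, so a change in any component bubbles up into the master value. A minimal sketch of the composition, with a tiny bitwise CRC32 standing in for util/crc32 (the real Crc32 class and its Append() live elsewhere in this repo):

#include <cstddef>
#include <cstdint>
#include <iostream>

// Tiny reflected CRC32 (poly 0xEDB88320); continues from a prior crc value.
uint32_t Crc32Update(uint32_t crc, const void* data, std::size_t len) {
  const uint8_t* p = static_cast<const uint8_t*>(data);
  crc = ~crc;
  for (std::size_t i = 0; i < len; ++i) {
    crc ^= p[i];
    for (int k = 0; k < 8; ++k) {
      crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1u)));
    }
  }
  return ~crc;
}

int main() {
  // Sub-component crcs (e.g. hit buffer and lexicon), appended to the header
  // crc so a change in any component changes the master crc. Values assumed.
  uint32_t dependent_crcs[2] = {0x12345678u, 0x9abcdef0u};
  uint32_t header_crc = 0x0badf00du;  // Stand-in for CalculateHeaderCrc().
  uint32_t master =
      Crc32Update(header_crc, dependent_crcs, sizeof(dependent_crcs));
  std::cout << std::hex << master << "\n";
  return 0;
}
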
+
+libtextclassifier3::Status LiteIndex::Reset() {
+ IcingTimer timer;
+
+ absl_ports::unique_lock l(&mutex_);
+ // TODO(b/140436942): When these components have been changed to return errors
+ // they should be propagated from here.
+ lexicon_.Clear();
+ hit_buffer_.Clear();
+ header_->Reset();
+ UpdateChecksum();
+
+ ICING_VLOG(2) << "Lite index clear in " << timer.Elapsed() * 1000 << "ms";
+ return libtextclassifier3::Status::OK;
+}
+
+void LiteIndex::Warm() {
+ absl_ports::shared_lock l(&mutex_);
+ hit_buffer_.Warm();
+ lexicon_.Warm();
+}
+
+libtextclassifier3::Status LiteIndex::PersistToDisk() {
+ absl_ports::unique_lock l(&mutex_);
+ bool success = true;
+ if (!lexicon_.Sync()) {
+ ICING_VLOG(1) << "Failed to sync the lexicon.";
+ success = false;
+ }
+ hit_buffer_.Sync();
+ UpdateChecksum();
+ header_mmap_.Sync();
+
+ return (success) ? libtextclassifier3::Status::OK
+ : absl_ports::InternalError(
+ "Unable to sync lite index components.");
+}
+
+void LiteIndex::UpdateChecksum() {
+ header_->set_lite_index_crc(ComputeChecksum().Get());
+}
+
+libtextclassifier3::StatusOr<uint32_t> LiteIndex::InsertTerm(
+ const std::string& term, TermMatchType::Code term_match_type,
+ NamespaceId namespace_id) {
+ absl_ports::unique_lock l(&mutex_);
+ uint32_t tvi;
+ libtextclassifier3::Status status =
+ lexicon_.Insert(term.c_str(), "", &tvi, false);
+ if (!status.ok()) {
+ ICING_LOG(DBG) << "Unable to add term " << term << " to lexicon!\n"
+ << status.error_message();
+ return status;
+ }
+ ICING_RETURN_IF_ERROR(UpdateTermPropertiesImpl(
+ tvi, term_match_type == TermMatchType::PREFIX, namespace_id));
+ return tvi;
+}
+
+libtextclassifier3::Status LiteIndex::UpdateTermProperties(
+ uint32_t tvi, bool hasPrefixHits, NamespaceId namespace_id) {
+ absl_ports::unique_lock l(&mutex_);
+ return UpdateTermPropertiesImpl(tvi, hasPrefixHits, namespace_id);
+}
+
+libtextclassifier3::Status LiteIndex::UpdateTermPropertiesImpl(
+ uint32_t tvi, bool hasPrefixHits, NamespaceId namespace_id) {
+ if (hasPrefixHits &&
+ !lexicon_.SetProperty(tvi, GetHasHitsInPrefixSectionPropertyId())) {
+ return absl_ports::ResourceExhaustedError(
+ "Insufficient disk space to create prefix property!");
+ }
+
+ if (!lexicon_.SetProperty(tvi, GetNamespacePropertyId(namespace_id))) {
+ return absl_ports::ResourceExhaustedError(
+ "Insufficient disk space to create namespace property!");
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status LiteIndex::AddHit(uint32_t term_id, const Hit& hit) {
+ absl_ports::unique_lock l(&mutex_);
+ if (is_full()) {
+ return absl_ports::ResourceExhaustedError("Hit buffer is full!");
+ }
+
+ TermIdHitPair term_id_hit_pair(term_id, hit);
+ uint32_t cur_size = header_->cur_size();
+ TermIdHitPair::Value* valp =
+ hit_buffer_.GetMutableMem<TermIdHitPair::Value>(cur_size, 1);
+ if (valp == nullptr) {
+ return absl_ports::ResourceExhaustedError(
+ "Allocating more space in hit buffer failed!");
+ }
+ *valp = term_id_hit_pair.value();
+ header_->set_cur_size(cur_size + 1);
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<uint32_t> LiteIndex::GetTermId(
+ const std::string& term) const {
+ absl_ports::shared_lock l(&mutex_);
+ char dummy;
+ uint32_t tvi;
+ if (!lexicon_.Find(term.c_str(), &dummy, &tvi)) {
+ return absl_ports::NotFoundError(
+ absl_ports::StrCat("Could not find ", term, " in the lexicon."));
+ }
+ return tvi;
+}
+
+void LiteIndex::ScoreAndAppendFetchedHit(
+ const Hit& hit, SectionIdMask section_id_mask,
+ bool only_from_prefix_sections,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
+ const SuggestionResultChecker* suggestion_result_checker,
+ DocumentId& last_document_id, bool& is_last_document_desired,
+ int& total_score_out, std::vector<DocHitInfo>* hits_out,
+ std::vector<Hit::TermFrequencyArray>* term_frequency_out) const {
+ // Check sections.
+ if (((UINT64_C(1) << hit.section_id()) & section_id_mask) == 0) {
+ return;
+ }
+ // Check prefix section only.
+ if (only_from_prefix_sections && !hit.is_in_prefix_section()) {
+ return;
+ }
+ // Check whether this Hit is desired.
+ // TODO(b/230553264) Move common logic into helper function once we support
+ // score term by prefix_hit in lite_index.
+ DocumentId document_id = hit.document_id();
+ bool is_new_document = document_id != last_document_id;
+ if (is_new_document) {
+ last_document_id = document_id;
+ is_last_document_desired =
+ suggestion_result_checker == nullptr ||
+ suggestion_result_checker->BelongsToTargetResults(document_id,
+ hit.section_id());
+ }
+ if (!is_last_document_desired) {
+ // The document is removed or expired or not desired.
+ return;
+ }
+
+ // Score the hit by the strategy
+ switch (score_by) {
+ case SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE:
+ total_score_out = 1;
+ break;
+ case SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT:
+ if (is_new_document) {
+ ++total_score_out;
+ }
+ break;
+ case SuggestionScoringSpecProto::SuggestionRankingStrategy::TERM_FREQUENCY:
+ if (hit.has_term_frequency()) {
+ total_score_out += hit.term_frequency();
+ } else {
+ ++total_score_out;
+ }
+ break;
+ }
+
+  // Append the Hit, or update the last appended hit's section mask.
+ if (is_new_document && hits_out != nullptr) {
+ hits_out->push_back(DocHitInfo(document_id));
+ if (term_frequency_out != nullptr) {
+ term_frequency_out->push_back(Hit::TermFrequencyArray());
+ }
+ }
+ if (hits_out != nullptr) {
+ hits_out->back().UpdateSection(hit.section_id());
+ if (term_frequency_out != nullptr) {
+ term_frequency_out->back()[hit.section_id()] = hit.term_frequency();
+ }
+ }
+}
+
+int LiteIndex::FetchHits(
+ uint32_t term_id, SectionIdMask section_id_mask,
+ bool only_from_prefix_sections,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
+ const SuggestionResultChecker* suggestion_result_checker,
+ std::vector<DocHitInfo>* hits_out,
+ std::vector<Hit::TermFrequencyArray>* term_frequency_out) {
+ bool need_sort_at_querying = false;
+ {
+ absl_ports::shared_lock l(&mutex_);
+
+ // We sort here when:
+ // 1. We don't enable sorting at indexing time (i.e. we sort at querying
+ // time), and there is an unsorted tail portion. OR
+ // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold,
+ // regardless of whether or not hit_buffer_sort_at_indexing is enabled.
+ // This is more of a sanity check. We should not really be encountering
+ // this case.
+ need_sort_at_querying = NeedSortAtQuerying();
+ }
+ if (need_sort_at_querying) {
+ absl_ports::unique_lock l(&mutex_);
+ IcingTimer timer;
+
+ // Transition from shared_lock to unique_lock is safe here because it
+ // doesn't hurt to sort again if sorting was done already by another thread
+ // after need_sort_at_querying is evaluated.
+ // We check need_sort_at_querying to improve query concurrency as threads
+ // can avoid acquiring the unique lock if no sorting is needed.
+ SortHitsImpl();
+
+ if (options_.hit_buffer_sort_at_indexing) {
+ // This is the second case for sort. Log as this should be a very rare
+ // occasion.
+ ICING_LOG(WARNING) << "Sorting HitBuffer at querying time when "
+ "hit_buffer_sort_at_indexing is enabled. Sort and "
+ "merge HitBuffer in "
+ << timer.Elapsed() * 1000 << " ms.";
+ }
+ }
+
+  // This downgrade from a unique_lock to a shared_lock is safe because we're
+ // searching for the term in the searchable (sorted) section of the HitBuffer
+ // only in Seek().
+ // Any operations that might execute in between the transition of downgrading
+ // the lock here are guaranteed not to alter the searchable section (or the
+ // LiteIndex) due to a global lock in IcingSearchEngine.
+ absl_ports::shared_lock l(&mutex_);
+
+ // Search in the HitBuffer array for Hits with the corresponding term_id.
+ // Hits are added in increasing order of doc ids, so hits that get appended
+ // later have larger docIds. This means that:
+ // 1. Hits in the unsorted tail will have larger docIds than hits in the
+ // sorted portion.
+ // 2. Hits at the end of the unsorted tail will have larger docIds than hits
+ // in the front of the tail.
+ // We want to retrieve hits in descending order of docIds. Therefore we should
+ // search by doing:
+ // 1. Linear search first in reverse iteration order over the unsorted tail
+ // portion.
+ // 2. Followed by binary search on the sorted portion.
+ const TermIdHitPair* array = hit_buffer_.array_cast<TermIdHitPair>();
+
+ DocumentId last_document_id = kInvalidDocumentId;
+ // Record whether the last document belongs to the given namespaces.
+ bool is_last_document_desired = false;
+ int total_score = 0;
+
+ // Linear search over unsorted tail in reverse iteration order.
+ // This should only be performed when hit_buffer_sort_at_indexing is enabled.
+ // When disabled, the entire HitBuffer should be sorted already and only
+ // binary search is needed.
+ if (options_.hit_buffer_sort_at_indexing) {
+ uint32_t unsorted_length = header_->cur_size() - header_->searchable_end();
+ for (uint32_t i = 1; i <= unsorted_length; ++i) {
+ TermIdHitPair term_id_hit_pair = array[header_->cur_size() - i];
+ if (term_id_hit_pair.term_id() == term_id) {
+ // We've found a matched hit.
+ const Hit& matched_hit = term_id_hit_pair.hit();
+ // Score the hit and add to total_score. Also add the hits and its term
+ // frequency info to hits_out and term_frequency_out if the two vectors
+ // are non-null.
+ ScoreAndAppendFetchedHit(matched_hit, section_id_mask,
+ only_from_prefix_sections, score_by,
+ suggestion_result_checker, last_document_id,
+ is_last_document_desired, total_score,
+ hits_out, term_frequency_out);
+ }
+ }
+ }
+
+ // Do binary search over the sorted section and repeat the above steps.
+ TermIdHitPair target_term_id_hit_pair(
+ term_id, Hit(Hit::kMaxDocumentIdSortValue, Hit::kDefaultTermFrequency));
+ for (const TermIdHitPair* ptr = std::lower_bound(
+ array, array + header_->searchable_end(), target_term_id_hit_pair);
+ ptr < array + header_->searchable_end(); ++ptr) {
+ if (ptr->term_id() != term_id) {
+ // We've processed all matches. Stop iterating further.
+ break;
+ }
+
+ const Hit& matched_hit = ptr->hit();
+ // Score the hit and add to total_score. Also add the hits and its term
+ // frequency info to hits_out and term_frequency_out if the two vectors are
+ // non-null.
+ ScoreAndAppendFetchedHit(
+ matched_hit, section_id_mask, only_from_prefix_sections, score_by,
+ suggestion_result_checker, last_document_id, is_last_document_desired,
+ total_score, hits_out, term_frequency_out);
+ }
+ return total_score;
+}
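
FetchHits() therefore treats the hit buffer as a sorted prefix plus an append-only tail: the tail is scanned linearly in reverse (newest, largest doc ids first) and the prefix is binary searched. A minimal sketch of that two-phase lookup over assumed (term_id, key) pairs, where the real code seeks with a sentinel hit rather than key 0:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

using TermIdHitPair = std::pair<uint32_t, uint32_t>;  // (term_id, hit key)

int main() {
  // Prefix [0, 4) is sorted; the last two entries are the unsorted tail.
  std::vector<TermIdHitPair> buf = {{1, 9}, {2, 3}, {2, 7}, {5, 1},
                                    {9, 2}, {2, 8}};
  std::size_t searchable_end = 4;
  uint32_t term_id = 2;

  // Phase 1: reverse scan of the unsorted tail (newest hits first).
  for (std::size_t i = 1; i <= buf.size() - searchable_end; ++i) {
    const TermIdHitPair& p = buf[buf.size() - i];
    if (p.first == term_id) std::cout << "tail hit " << p.second << "\n";
  }

  // Phase 2: binary search for the first entry with this term_id, then walk
  // forward while the term_id still matches.
  auto it = std::lower_bound(buf.begin(), buf.begin() + searchable_end,
                             TermIdHitPair{term_id, 0});
  for (; it != buf.begin() + searchable_end && it->first == term_id; ++it) {
    std::cout << "sorted hit " << it->second << "\n";
  }
  return 0;  // Prints: tail hit 8, sorted hit 3, sorted hit 7.
}
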
+
+libtextclassifier3::StatusOr<int> LiteIndex::ScoreHits(
+ uint32_t term_id,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
+ const SuggestionResultChecker* suggestion_result_checker) {
+ return FetchHits(term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false, score_by,
+ suggestion_result_checker,
+ /*hits_out=*/nullptr);
+}
+
+bool LiteIndex::is_full() const {
+ return (header_->cur_size() == options_.hit_buffer_size ||
+ lexicon_.min_free_fraction() < (1.0 - kTrieFullFraction));
+}
+
+std::string LiteIndex::GetDebugInfo(DebugInfoVerbosity::Code verbosity) {
+ absl_ports::unique_lock l(&mutex_);
+ std::string res;
+ std::string lexicon_info;
+ lexicon_.GetDebugInfo(verbosity, &lexicon_info);
+ IcingStringUtil::SStringAppendF(
+ &res, 0,
+ "curr_size: %u\n"
+ "hit_buffer_size: %u\n"
+ "last_added_document_id %u\n"
+ "searchable_end: %u\n"
+ "index_crc: %u\n"
+ "\n"
+ "lite_lexicon_info:\n%s\n",
+ header_->cur_size(), options_.hit_buffer_size,
+ header_->last_added_docid(), header_->searchable_end(),
+ ComputeChecksum().Get(), lexicon_info.c_str());
+ return res;
+}
+
+libtextclassifier3::StatusOr<int64_t> LiteIndex::GetElementsSize() const {
+ IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto());
+ if (storage_info.lite_index_hit_buffer_size() == -1 ||
+ storage_info.lite_index_lexicon_size() == -1) {
+ return absl_ports::AbortedError(
+ "Failed to get size of LiteIndex's members.");
+ }
+ // On initialization, we grow the file to a padded size first. So this size
+  // won't count towards the size taken up by elements.
+ size_t header_padded_size = IcingMMapper::page_aligned_size(header_size());
+ return storage_info.lite_index_hit_buffer_size() - header_padded_size +
+ storage_info.lite_index_lexicon_size();
+}
+
+IndexStorageInfoProto LiteIndex::GetStorageInfo(
+ IndexStorageInfoProto storage_info) const {
+ absl_ports::shared_lock l(&mutex_);
+ int64_t header_and_hit_buffer_file_size =
+ filesystem_->GetFileSize(hit_buffer_fd_.get());
+ storage_info.set_lite_index_hit_buffer_size(
+ IcingFilesystem::SanitizeFileSize(header_and_hit_buffer_file_size));
+ int64_t lexicon_disk_usage = lexicon_.GetElementsSize();
+ if (lexicon_disk_usage != Filesystem::kBadFileSize) {
+ storage_info.set_lite_index_lexicon_size(lexicon_disk_usage);
+ } else {
+ storage_info.set_lite_index_lexicon_size(-1);
+ }
+ return storage_info;
+}
+
+void LiteIndex::SortHitsImpl() {
+  // Make the hit buffer searchable by sorting its unsorted tail.
+ uint32_t sort_len = header_->cur_size() - header_->searchable_end();
+ if (sort_len <= 0) {
+ return;
+ }
+ IcingTimer timer;
+
+ auto* array_start =
+ hit_buffer_.GetMutableMem<TermIdHitPair::Value>(0, header_->cur_size());
+ TermIdHitPair::Value* sort_start = array_start + header_->searchable_end();
+ std::sort(sort_start, array_start + header_->cur_size());
+
+ // Now merge with previous region. Since the previous region is already
+ // sorted and deduplicated, optimize the merge by skipping everything less
+ // than the new region's smallest value.
+ if (header_->searchable_end() > 0) {
+ std::inplace_merge(array_start, array_start + header_->searchable_end(),
+ array_start + header_->cur_size());
+ }
+ ICING_VLOG(2) << "Lite index sort and merge " << sort_len << " into "
+ << header_->searchable_end() << " in " << timer.Elapsed() * 1000
+ << "ms";
+
+ // Now the entire array is sorted.
+ header_->set_searchable_end(header_->cur_size());
+
+ // Update crc in-line.
+ UpdateChecksum();
+}
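
SortHitsImpl() only sorts the unsorted tail and then folds it into the already-sorted prefix with std::inplace_merge, which is cheaper than re-sorting the whole buffer on every call. A minimal sketch:

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  std::vector<int> buf = {1, 4, 9, 3, 2};  // [0, 3) sorted, tail {3, 2}
  int searchable_end = 3;
  // Sort only the tail, then merge the two sorted runs in place.
  std::sort(buf.begin() + searchable_end, buf.end());
  if (searchable_end > 0) {
    std::inplace_merge(buf.begin(), buf.begin() + searchable_end, buf.end());
  }
  for (int v : buf) std::cout << v << " ";  // Prints "1 2 3 4 9 ".
  return 0;
}
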
+
+libtextclassifier3::Status LiteIndex::Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id) {
+ absl_ports::unique_lock l(&mutex_);
+ header_->set_last_added_docid(new_last_added_document_id);
+ if (header_->cur_size() == 0) {
+ return libtextclassifier3::Status::OK;
+ }
+ // Sort the hits so that hits with the same term id will be grouped together,
+ // which helps later to determine which terms will be unused after compaction.
+ SortHitsImpl();
+ uint32_t new_size = 0;
+ uint32_t curr_term_id = 0;
+ uint32_t curr_tvi = 0;
+ std::unordered_set<uint32_t> tvi_to_delete;
+ for (uint32_t idx = 0; idx < header_->cur_size(); ++idx) {
+ TermIdHitPair term_id_hit_pair(
+ hit_buffer_.array_cast<TermIdHitPair>()[idx]);
+ if (idx == 0 || term_id_hit_pair.term_id() != curr_term_id) {
+ curr_term_id = term_id_hit_pair.term_id();
+ ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo term_info,
+ term_id_codec->DecodeTermInfo(curr_term_id));
+ curr_tvi = term_info.tvi;
+ // Mark the property of the current term as not having hits in prefix
+ // section. The property will be set below if there are any valid hits
+ // from a prefix section.
+ lexicon_.ClearProperty(curr_tvi, GetHasHitsInPrefixSectionPropertyId());
+ // Add curr_tvi to tvi_to_delete. It will be removed from tvi_to_delete
+ // below if there are any valid hits pointing to that termid.
+ tvi_to_delete.insert(curr_tvi);
+ }
+ DocumentId new_document_id =
+ document_id_old_to_new[term_id_hit_pair.hit().document_id()];
+ if (new_document_id == kInvalidDocumentId) {
+ continue;
+ }
+ if (term_id_hit_pair.hit().is_in_prefix_section()) {
+ lexicon_.SetProperty(curr_tvi, GetHasHitsInPrefixSectionPropertyId());
+ }
+ tvi_to_delete.erase(curr_tvi);
+ TermIdHitPair new_term_id_hit_pair(
+ term_id_hit_pair.term_id(),
+ Hit::TranslateHit(term_id_hit_pair.hit(), new_document_id));
+ // Rewriting the hit_buffer in place.
+ // new_size is weakly less than idx so we are okay to overwrite the entry at
+ // new_size, and valp should never be nullptr since it is within the already
+ // allocated region of hit_buffer_.
+ TermIdHitPair::Value* valp =
+ hit_buffer_.GetMutableMem<TermIdHitPair::Value>(new_size++, 1);
+ *valp = new_term_id_hit_pair.value();
+ }
+ header_->set_cur_size(new_size);
+ header_->set_searchable_end(new_size);
+
+ // Delete unused terms.
+ std::unordered_set<std::string> terms_to_delete;
+ for (IcingDynamicTrie::Iterator term_iter(lexicon_, /*prefix=*/"");
+ term_iter.IsValid(); term_iter.Advance()) {
+ if (tvi_to_delete.find(term_iter.GetValueIndex()) != tvi_to_delete.end()) {
+ terms_to_delete.insert(term_iter.GetKey());
+ }
+ }
+ for (const std::string& term : terms_to_delete) {
+ // Mark "term" as deleted. This won't actually free space in the lexicon. It
+ // will simply make it impossible to Find "term" in subsequent calls (which
+ // saves an unnecessary search through the hit buffer). This is acceptable
+ // because the free space will eventually be reclaimed the next time that
+ // the lite index is merged with the main index.
+ if (!lexicon_.Delete(term)) {
+ return absl_ports::InternalError(
+ "Could not delete invalid terms in lite lexicon during compaction.");
+ }
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
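
The compaction loop in Optimize() above rewrites the hit buffer in place: each hit's document id is remapped through document_id_old_to_new, hits whose documents were deleted (mapped to an invalid id) are dropped, and survivors are written back at a cursor that never overtakes the read index. A minimal sketch of that remap-and-compact pass, with a hypothetical old-to-new mapping:

#include <cstddef>
#include <iostream>
#include <vector>

constexpr int kInvalidDocumentId = -1;

int main() {
  std::vector<int> hit_doc_ids = {0, 1, 1, 2, 3};
  // Old doc id -> new doc id; document 1 was deleted (assumed mapping).
  std::vector<int> old_to_new = {0, kInvalidDocumentId, 1, 2};

  std::size_t new_size = 0;
  for (std::size_t idx = 0; idx < hit_doc_ids.size(); ++idx) {
    int new_id = old_to_new[hit_doc_ids[idx]];
    if (new_id == kInvalidDocumentId) continue;  // Hit's document is gone.
    hit_doc_ids[new_size++] = new_id;  // new_size <= idx: safe to overwrite.
  }
  hit_doc_ids.resize(new_size);
  for (int id : hit_doc_ids) std::cout << id << " ";  // Prints "0 1 2 ".
  return 0;
}
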
diff --git a/icing/index/lite/lite-index.h b/icing/index/lite/lite-index.h
new file mode 100644
index 0000000..288602a
--- /dev/null
+++ b/icing/index/lite/lite-index.h
@@ -0,0 +1,444 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// A small index with continuous updates (doesn't need an explicit Flush
+// to persist) but has a higher possibility of corruption. It can always
+// detect corruption reliably.
+
+#ifndef ICING_INDEX_LITE_INDEX_H_
+#define ICING_INDEX_LITE_INDEX_H_
+
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/mutex.h"
+#include "icing/absl_ports/thread_annotations.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/lite/lite-index-header.h"
+#include "icing/index/lite/lite-index-options.h"
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/index/icing-array-storage.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/namespace-id.h"
+#include "icing/store/suggestion-result-checker.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// The LiteIndex is go/thread-compatible. Operations on the same data member
+// object interfere with each other, unless they are guaranteed not to mutate
+// the object (in the case of LiteIndex, this means all const methods, as
+// well as FetchHits and ScoreHits).
+class LiteIndex {
+ public:
+  // Configuration options for the LiteIndex.
+ using Options = LiteIndexOptions;
+
+ // Offset for the LiteIndex_Header in the hit buffer mmap.
+ static constexpr uint32_t kHeaderFileOffset = 0;
+
+ // Updates checksum of subcomponents.
+ ~LiteIndex();
+
+ // Creates lite index from storage. The files will be created if they do not
+ // already exist.
+ //
+ // Returns:
+ // OK on success
+ // DATA_LOSS if the index was corrupted and cleared
+ // INTERNAL on I/O error
+ static libtextclassifier3::StatusOr<std::unique_ptr<LiteIndex>> Create(
+ const Options& options, const IcingFilesystem* filesystem);
+
+ // Resets all internal members of the index. Returns OK if all operations were
+ // successful.
+ libtextclassifier3::Status Reset() ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Advises the OS to cache pages in the index, which will be accessed for a
+ // query soon.
+ void Warm() ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Syncs all modified files in the index to disk.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL on I/O error
+ libtextclassifier3::Status PersistToDisk() ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Returns term_id if term found, NOT_FOUND otherwise.
+ libtextclassifier3::StatusOr<uint32_t> GetTermId(
+ const std::string& term) const ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Returns an iterator for all terms for which 'prefix' is a prefix.
+ class PrefixIterator {
+ public:
+ explicit PrefixIterator(const IcingDynamicTrie::Iterator& delegate)
+ : delegate_(delegate) {}
+ bool IsValid() const { return delegate_.IsValid(); }
+
+ void Advance() { delegate_.Advance(); }
+
+ const char* GetKey() const { return delegate_.GetKey(); }
+
+ uint32_t GetValueIndex() const { return delegate_.GetValueIndex(); }
+
+ private:
+ IcingDynamicTrie::Iterator delegate_;
+ };
+
+ // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate any
+ // previously returned PrefixIterator.
+ PrefixIterator FindTermPrefixes(const std::string& prefix) const
+ ICING_LOCKS_EXCLUDED(mutex_) {
+ absl_ports::shared_lock l(&mutex_);
+ return PrefixIterator(IcingDynamicTrie::Iterator(lexicon_, prefix.c_str()));
+ }
+
+ // Inserts a term with its properties.
+ //
+ // Returns:
+ // A value index on success
+ // RESOURCE_EXHAUSTED if lexicon is full or no disk space is available
+ libtextclassifier3::StatusOr<uint32_t> InsertTerm(
+ const std::string& term, TermMatchType::Code term_match_type,
+ NamespaceId namespace_id) ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Updates term properties by setting hasPrefixHits and namespace id of the
+ // term.
+ //
+ // Returns:
+ // OK on success
+ // RESOURCE_EXHAUSTED if no disk space is available
+ libtextclassifier3::Status UpdateTermProperties(uint32_t tvi,
+ bool hasPrefixHits,
+ NamespaceId namespace_id)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Append hit to buffer. term_id must be encoded using the same term_id_codec
+ // supplied to the index constructor.
+ // RETURNS:
+ // - OK if hit was successfully added
+  //   - RESOURCE_EXHAUSTED if hit could not be added (either the hit buffer
+  //     or the file system capacity was reached).
+ libtextclassifier3::Status AddHit(uint32_t term_id, const Hit& hit)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Add all hits with term_id from the sections specified in section_id_mask,
+ // skipping hits in non-prefix sections if only_from_prefix_sections is true,
+ // to hits_out. If hits_out is nullptr, no hits will be added. The
+ // corresponding hit term frequencies will also not be added if
+ // term_frequency_out is nullptr.
+ //
+  // Only those hits which belong to the given namespaces will be counted and
+ // fetched. A nullptr namespace checker will disable this check.
+ //
+  // Returns the score of hits that would be added to hits_out according to
+ // given score_by.
+ int FetchHits(
+ uint32_t term_id, SectionIdMask section_id_mask,
+ bool only_from_prefix_sections,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
+ const SuggestionResultChecker* suggestion_result_checker,
+ std::vector<DocHitInfo>* hits_out,
+ std::vector<Hit::TermFrequencyArray>* term_frequency_out = nullptr)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Returns the hit count of the term.
+  // Only those hits which belong to the given namespaces will be counted.
+ libtextclassifier3::StatusOr<int> ScoreHits(
+ uint32_t term_id,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
+ const SuggestionResultChecker* suggestion_result_checker)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ bool empty() const ICING_LOCKS_EXCLUDED(mutex_) { return size() == 0; }
+
+ uint32_t size() const ICING_LOCKS_EXCLUDED(mutex_) {
+ absl_ports::shared_lock l(&mutex_);
+ return size_impl();
+ }
+
+ bool WantsMerge() const ICING_LOCKS_EXCLUDED(mutex_) {
+ absl_ports::shared_lock l(&mutex_);
+ return is_full() || size_impl() >= (options_.hit_buffer_want_merge_bytes /
+ sizeof(TermIdHitPair::Value));
+ }
+
+ // Whether or not the HitBuffer's unsorted tail size exceeds the sort
+ // threshold.
+ bool HasUnsortedHitsExceedingSortThreshold() const
+ ICING_LOCKS_EXCLUDED(mutex_) {
+ absl_ports::shared_lock l(&mutex_);
+ return HasUnsortedHitsExceedingSortThresholdImpl();
+ }
+
+ // Sort hits stored in the index.
+ void SortHits() ICING_LOCKS_EXCLUDED(mutex_) {
+ absl_ports::unique_lock l(&mutex_);
+ SortHitsImpl();
+  }
+
+ class const_iterator {
+ friend class LiteIndex;
+
+ public:
+ using iterator_category = std::forward_iterator_tag;
+ using value_type = TermIdHitPair;
+ using reference = const value_type&;
+ using pointer = const value_type*;
+
+ const_iterator() : const_iterator(nullptr, -1, -1) {}
+
+ reference operator*() const { return start_[position_]; }
+
+ pointer operator->() const { return start_ + position_; }
+
+ const_iterator& operator++() {
+ if (++position_ >= end_position_) {
+ start_ = nullptr;
+ position_ = -1;
+ end_position_ = -1;
+ }
+ return *this;
+ }
+
+ const_iterator operator++(int) {
+ auto tmp = *this;
+ ++*this;
+ return tmp;
+ }
+
+ bool operator!=(const const_iterator& rhs) { return !(*this == rhs); }
+
+ bool operator==(const const_iterator& rhs) {
+ return start_ == rhs.start_ && position_ == rhs.position_;
+ }
+
+ private:
+ explicit const_iterator(const TermIdHitPair* start, int position,
+ int end_position)
+ : start_(start), position_(position), end_position_(end_position) {}
+
+ const TermIdHitPair* start_;
+ int position_;
+ int end_position_;
+ };
+
+ const_iterator begin() const ICING_LOCKS_EXCLUDED(mutex_) {
+ absl_ports::shared_lock l(&mutex_);
+ // If the LiteIndex is empty, just return end().
+ return empty_impl()
+ ? end()
+ : const_iterator(hit_buffer_.array_cast<TermIdHitPair>(), 0,
+ header_->cur_size());
+ }
+
+ const_iterator end() const { return const_iterator(); }
+
+ constexpr static uint32_t max_hit_buffer_size() {
+ return std::numeric_limits<uint32_t>::max() / sizeof(TermIdHitPair);
+ }
+
+ // We keep track of the last added document_id. This is always the largest
+ // document_id that has been added because hits can only be added in order of
+ // increasing document_id.
+ DocumentId last_added_document_id() const ICING_LOCKS_EXCLUDED(mutex_) {
+ absl_ports::shared_lock l(&mutex_);
+ return header_->last_added_docid();
+ }
+ void set_last_added_document_id(DocumentId document_id)
+ ICING_LOCKS_EXCLUDED(mutex_) {
+ absl_ports::unique_lock l(&mutex_);
+ header_->set_last_added_docid(document_id);
+ }
+
+ // WARNING: Subsequent calls to AddHit/InsertTerm may invalidate the reference
+ // returned here.
+ const IcingDynamicTrie& lexicon() const { return lexicon_; }
+
+  // Returns debug information for the index as a string.
+ // verbosity = BASIC, simplest debug information - size of lexicon, hit buffer
+ // verbosity = DETAILED, more detailed debug information from the lexicon.
+ std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity)
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Returns the byte size of all the elements held in the index. This excludes
+ // the size of any internal metadata of the index, e.g. the index's header.
+ //
+ // Returns:
+ // Byte size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Takes the provided storage_info, populates the fields related to the lite
+ // index and returns that storage_info.
+ //
+ // If an IO error occurs while trying to calculate the value for a field, then
+ // that field will be set to -1.
+ IndexStorageInfoProto GetStorageInfo(IndexStorageInfoProto storage_info) const
+ ICING_LOCKS_EXCLUDED(mutex_);
+
+ // Reduces internal file sizes by reclaiming space of deleted documents.
+ //
+ // This method also sets the last_added_docid of the index to
+ // new_last_added_document_id.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error, this indicates that the index may be in an
+ // invalid state and should be cleared.
+ libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const TermIdCodec* term_id_codec, DocumentId new_last_added_document_id)
+ ICING_LOCKS_EXCLUDED(mutex_);
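+
+  // Hypothetical call sketch (the variable names are illustrative
+  // assumptions, not defined in this header):
+  //
+  //   ICING_RETURN_IF_ERROR(lite_index->Optimize(
+  //       document_id_old_to_new, term_id_codec.get(),
+  //       /*new_last_added_document_id=*/new_last_docid));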
+
+ private:
+ static IcingDynamicTrie::RuntimeOptions MakeTrieRuntimeOptions();
+
+ LiteIndex(const Options& options, const IcingFilesystem* filesystem);
+
+ // Initializes lite index from storage. Must be called exactly once after
+ // object construction.
+ //
+ // Returns:
+ // OK on success
+ // DATA_LOSS if the index was corrupted and cleared
+ // INTERNAL on I/O error
+ libtextclassifier3::Status Initialize() ICING_LOCKS_EXCLUDED(mutex_);
+
+ bool initialized() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
+ return header_ != nullptr;
+ }
+
+ // Check if the hit buffer has reached its capacity.
+ bool is_full() const ICING_SHARED_LOCKS_REQUIRED(mutex_);
+
+ // Non-locking implementation for empty().
+ bool empty_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
+ return size_impl() == 0;
+ }
+
+ // Non-locking implementation for size().
+ uint32_t size_impl() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
+ return header_->cur_size();
+ }
+
+  // Calculates the checksum of all sub-components of the LiteIndex.
+ Crc32 ComputeChecksum() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Sets the computed checksum in the header
+ void UpdateChecksum() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Non-locking implementation for UpdateTermProperties.
+ libtextclassifier3::Status UpdateTermPropertiesImpl(uint32_t tvi,
+ bool hasPrefixHits,
+ NamespaceId namespace_id)
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // We need to sort during querying time when:
+ // 1. Sorting at indexing time is not enabled and there is an unsorted tail
+ // section in the HitBuffer.
+ // 2. The unsorted tail size exceeds the hit_buffer_sort_threshold, regardless
+ // of whether or not hit_buffer_sort_at_indexing is enabled. This is to
+  //    prevent performing a sequential search on a large unsorted tail section,
+ // which would result in bad query performance.
+ // This is more of a sanity check. We should not really be encountering
+ // this case.
+ bool NeedSortAtQuerying() const ICING_SHARED_LOCKS_REQUIRED(mutex_) {
+ return HasUnsortedHitsExceedingSortThresholdImpl() ||
+ (!options_.hit_buffer_sort_at_indexing &&
+ header_->cur_size() - header_->searchable_end() > 0);
+ }
+
+  // Non-locking implementation for HasUnsortedHitsExceedingSortThreshold().
+ bool HasUnsortedHitsExceedingSortThresholdImpl() const
+ ICING_SHARED_LOCKS_REQUIRED(mutex_) {
+ return header_->cur_size() - header_->searchable_end() >=
+ (options_.hit_buffer_sort_threshold_bytes /
+ sizeof(TermIdHitPair::Value));
+ }
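+
+  // Worked example (assuming the 64-byte threshold used in the tests): with
+  // hit_buffer_sort_threshold_bytes = 64 and sizeof(TermIdHitPair::Value) = 8,
+  // the unsorted tail exceeds the threshold once it holds 64 / 8 = 8 or more
+  // hits.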
+
+ // Non-locking implementation for SortHits().
+ void SortHitsImpl() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+  // Calculates and adds the score for a fetched hit to total_score_out, while
+  // updating last_document_id (which keeps track of the last added docId so
+  // far) and is_last_document_desired (which keeps track of whether that last
+  // added docId belongs to the query's desired namespace).
+ //
+ // Also appends the hit to hits_out and term_frequency_out if the vectors are
+ // not null.
+ void ScoreAndAppendFetchedHit(
+ const Hit& hit, SectionIdMask section_id_mask,
+ bool only_from_prefix_sections,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
+ const SuggestionResultChecker* suggestion_result_checker,
+ DocumentId& last_document_id, bool& is_last_document_desired,
+ int& total_score_out, std::vector<DocHitInfo>* hits_out,
+ std::vector<Hit::TermFrequencyArray>* term_frequency_out) const
+ ICING_SHARED_LOCKS_REQUIRED(mutex_);
+
+ // File descriptor that points to where the header and hit buffer are written
+ // to.
+ ScopedFd hit_buffer_fd_ ICING_GUARDED_BY(mutex_);
+
+ // Mmapped region past the header that stores the hits.
+ IcingArrayStorage hit_buffer_ ICING_GUARDED_BY(mutex_);
+
+ // Crc checksum of the hits, excludes the header.
+ uint32_t hit_buffer_crc_ ICING_GUARDED_BY(mutex_);
+
+ // Trie that maps indexed terms to their term id
+ IcingDynamicTrie lexicon_ ICING_GUARDED_BY(mutex_);
+
+ // TODO(b/140437260): Port over to MemoryMappedFile
+ // Memory mapped region of the underlying file that reflects the header.
+ IcingMMapper header_mmap_ ICING_GUARDED_BY(mutex_);
+
+ // Wrapper around the mmapped header that contains stats on the lite index.
+ std::unique_ptr<LiteIndex_Header> header_ ICING_GUARDED_BY(mutex_);
+
+ // Options used to initialize the LiteIndex.
+ const Options options_;
+
+ // TODO(b/139087650) Move to icing::Filesystem
+ const IcingFilesystem* const filesystem_;
+
+ // Used to provide reader and writer locks
+ mutable absl_ports::shared_mutex mutex_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_LITE_INDEX_H_
diff --git a/icing/index/lite/lite-index_test.cc b/icing/index/lite/lite-index_test.cc
new file mode 100644
index 0000000..9811fa2
--- /dev/null
+++ b/icing/index/lite/lite-index_test.cc
@@ -0,0 +1,741 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/lite/lite-index.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite/doc-hit-info-iterator-term-lite.h"
+#include "icing/index/lite/lite-index-header.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/section.h"
+#include "icing/store/namespace-id.h"
+#include "icing/testing/always-false-suggestion-result-checker-impl.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+using ::testing::SizeIs;
+
+class LiteIndexTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ index_dir_ = GetTestTempDir() + "/test_dir";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
+ }
+
+ void TearDown() override {
+ term_id_codec_.reset();
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
+ }
+
+ std::string index_dir_;
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::unique_ptr<TermIdCodec> term_id_codec_;
+};
+
+constexpr NamespaceId kNamespace0 = 0;
+
+TEST_F(LiteIndexTest,
+ LiteIndexFetchHits_sortAtQuerying_unsortedHitsBelowSortThreshold) {
+ // Set up LiteIndex and TermIdCodec
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+  // At 64 bytes the unsorted tail can contain a max of 8 TermIdHitPairs.
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/false,
+ /*hit_buffer_sort_threshold_bytes=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
+ LiteIndex::Create(options, &icing_filesystem_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+
+ // Add some hits
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_tvi,
+ lite_index->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
+ Hit foo_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit foo_hit1(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit1));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t bar_tvi,
+ lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id,
+ term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE));
+ Hit bar_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit bar_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit1));
+
+  // Check that unsorted hits do not exceed the sort threshold.
+ EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse());
+
+  // Check that hits are unsorted. Persist the data and pread the
+  // LiteIndexHeader.
+ ASSERT_THAT(lite_index->PersistToDisk(), IsOk());
+ LiteIndex_HeaderImpl::HeaderData header_data;
+ ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(),
+ &header_data, sizeof(header_data),
+ LiteIndex::kHeaderFileOffset));
+ EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(4));
+
+ // Query the LiteIndex
+ std::vector<DocHitInfo> hits1;
+ lite_index->FetchHits(
+ foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ /*namespace_checker=*/nullptr, &hits1);
+ EXPECT_THAT(hits1, SizeIs(1));
+ EXPECT_THAT(hits1.back().document_id(), Eq(1));
+ // Check that the hits are coming from section 0 and section 1.
+ EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11));
+
+ std::vector<DocHitInfo> hits2;
+ AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker;
+ lite_index->FetchHits(
+ foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &always_false_suggestion_result_checker, &hits2);
+ // Check that no hits are returned because they get skipped by the namespace
+ // checker.
+ EXPECT_THAT(hits2, IsEmpty());
+
+  // Check that hits are sorted after querying LiteIndex. Persist the data and
+  // pread the LiteIndexHeader.
+ ASSERT_THAT(lite_index->PersistToDisk(), IsOk());
+ ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(),
+ &header_data, sizeof(header_data),
+ LiteIndex::kHeaderFileOffset));
+ EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(0));
+}
+
+TEST_F(LiteIndexTest,
+ LiteIndexFetchHits_sortAtIndexing_unsortedHitsBelowSortThreshold) {
+ // Set up LiteIndex and TermIdCodec
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+  // At 64 bytes the unsorted tail can contain a max of 8 TermIdHitPairs.
+  // However, note that in these tests we're unable to sort hits after
+  // indexing, as sorting is performed by the string-section-indexing-handler
+  // after indexing all hits in an entire document, rather than after each
+  // AddHit() operation.
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/true,
+ /*hit_buffer_sort_threshold_bytes=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
+ LiteIndex::Create(options, &icing_filesystem_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+
+ // Add some hits
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_tvi,
+ lite_index->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
+ Hit foo_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit foo_hit1(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, foo_hit1));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t bar_tvi,
+ lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id,
+ term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE));
+ Hit bar_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit bar_hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, bar_hit1));
+
+  // Check that unsorted hits do not exceed the sort threshold.
+ EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse());
+
+  // Check that hits are unsorted. Persist the data and pread the
+  // LiteIndexHeader.
+ ASSERT_THAT(lite_index->PersistToDisk(), IsOk());
+ LiteIndex_HeaderImpl::HeaderData header_data;
+ ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(),
+ &header_data, sizeof(header_data),
+ LiteIndex::kHeaderFileOffset));
+ EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(4));
+
+ // Query the LiteIndex
+ std::vector<DocHitInfo> hits1;
+ lite_index->FetchHits(
+ foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ /*namespace_checker=*/nullptr, &hits1);
+ EXPECT_THAT(hits1, SizeIs(1));
+ EXPECT_THAT(hits1.back().document_id(), Eq(1));
+ // Check that the hits are coming from section 0 and section 1.
+ EXPECT_THAT(hits1.back().hit_section_ids_mask(), Eq(0b11));
+
+ std::vector<DocHitInfo> hits2;
+ AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker;
+ lite_index->FetchHits(
+ foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &always_false_suggestion_result_checker, &hits2);
+ // Check that no hits are returned because they get skipped by the namespace
+ // checker.
+ EXPECT_THAT(hits2, IsEmpty());
+
+ // Check that hits are still unsorted after querying LiteIndex because the
+ // HitBuffer unsorted size is still below the sort threshold, and we've
+ // enabled sort_at_indexing.
+  // Persist the data and perform a pread on the LiteIndexHeader.
+ ASSERT_THAT(lite_index->PersistToDisk(), IsOk());
+ ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(),
+ &header_data, sizeof(header_data),
+ LiteIndex::kHeaderFileOffset));
+ EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(4));
+}
+
+TEST_F(
+ LiteIndexTest,
+ LiteIndexFetchHits_sortAtQuerying_unsortedHitsExceedingSortAtIndexThreshold) {
+ // Set up LiteIndex and TermIdCodec
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+  // At 64 bytes the unsorted tail can contain a max of 8 TermIdHitPairs.
+  // However, note that in these tests we're unable to sort hits after
+  // indexing, as sorting is performed by the string-section-indexing-handler
+  // after indexing all hits in an entire document, rather than after each
+  // AddHit() operation.
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/false,
+ /*hit_buffer_sort_threshold_bytes=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
+ LiteIndex::Create(options, &icing_filesystem_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+
+ // Create 4 hits for docs 0-2, and 2 hits for doc 3 -- 14 in total
+ // Doc 0
+ Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc0_hit1(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc0_hit2(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc0_hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ // Doc 1
+ Hit doc1_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc1_hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc1_hit2(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc1_hit3(/*section_id=*/2, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ // Doc 2
+ Hit doc2_hit0(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc2_hit1(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc2_hit2(/*section_id=*/1, /*document_id=*/2, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc2_hit3(/*section_id=*/2, /*document_id=*/2, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ // Doc 3
+ Hit doc3_hit0(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc3_hit1(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+
+ // Create terms
+ // Foo
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_tvi,
+ lite_index->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
+ // Bar
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t bar_tvi,
+ lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id,
+ term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE));
+ // Baz
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t baz_tvi,
+ lite_index->InsertTerm("baz", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t baz_term_id,
+ term_id_codec_->EncodeTvi(baz_tvi, TviType::LITE));
+ // Qux
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t qux_tvi,
+ lite_index->InsertTerm("qux", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t qux_term_id,
+ term_id_codec_->EncodeTvi(qux_tvi, TviType::LITE));
+
+ // Add 14 hits and make sure that termIds are added in unsorted order.
+  // Documents should be inserted in order, as new incoming hits must have
+  // larger document ids.
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc0_hit1));
+ ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc0_hit2));
+ ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc0_hit3));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit1));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2));
+ ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit3));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc2_hit1));
+ ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc2_hit2));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit3));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc3_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc3_hit1));
+ // Verify that the HitBuffer has not been sorted.
+ EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsTrue());
+
+ // We now have the following in the hit buffer:
+ // <term>: {(docId, sectionId)...}
+ // foo: {(0, 0); (1, 0); (1, 1); (2, 0); (2, 2); (3, 0)}
+ // bar: {(0, 0); (1, 0); (1, 2)}
+ // baz: {(0, 1); (2, 0); (3, 0)}
+  // qux: {(0, 2); (2, 1)}
+
+ // Search over the HitBuffer.
+ std::vector<DocHitInfo> hits1;
+ lite_index->FetchHits(
+ foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ /*namespace_checker=*/nullptr, &hits1);
+ EXPECT_THAT(hits1, SizeIs(4));
+ // Check that hits are retrieved in descending order of docIds.
+ EXPECT_THAT(hits1[0].document_id(), Eq(3));
+ EXPECT_THAT(hits1[0].hit_section_ids_mask(), Eq(0b1));
+ EXPECT_THAT(hits1[1].document_id(), Eq(2));
+ EXPECT_THAT(hits1[1].hit_section_ids_mask(), Eq(0b101));
+ EXPECT_THAT(hits1[2].document_id(), Eq(1));
+ EXPECT_THAT(hits1[2].hit_section_ids_mask(), Eq(0b11));
+ EXPECT_THAT(hits1[3].document_id(), Eq(0));
+ EXPECT_THAT(hits1[3].hit_section_ids_mask(), Eq(0b1));
+
+ std::vector<DocHitInfo> hits2;
+ AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker;
+ lite_index->FetchHits(
+ foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &always_false_suggestion_result_checker, &hits2);
+ // Check that no hits are returned because they get skipped by the namespace
+ // checker.
+ EXPECT_THAT(hits2, IsEmpty());
+
+ std::vector<DocHitInfo> hits3;
+ lite_index->FetchHits(
+ bar_term_id, 0b1,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ /*namespace_checker=*/nullptr, &hits3);
+ EXPECT_THAT(hits3, SizeIs(2));
+ // Check fetching hits with SectionIdMask.
+ EXPECT_THAT(hits3[0].document_id(), Eq(1));
+  EXPECT_THAT(hits3[0].hit_section_ids_mask(), Eq(0b1));
+ EXPECT_THAT(hits3[1].document_id(), Eq(0));
+ EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1));
+
+ // Check that the HitBuffer is sorted after the query call.
+ EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse());
+}
+
+TEST_F(
+ LiteIndexTest,
+ LiteIndexFetchHits_sortAtIndexing_unsortedHitsExceedingSortAtIndexThreshold) {
+ // Set up LiteIndex and TermIdCodec
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+  // At 64 bytes the unsorted tail can contain a max of 8 TermIdHitPairs.
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/true,
+ /*hit_buffer_sort_threshold_bytes=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
+ LiteIndex::Create(options, &icing_filesystem_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+
+  // Create 4 hits for each of docs 0-4 -- 20 in total
+ // Doc 0
+ Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc0_hit1(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc0_hit2(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc0_hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ // Doc 1
+ Hit doc1_hit0(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc1_hit1(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc1_hit2(/*section_id=*/1, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc1_hit3(/*section_id=*/2, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ // Doc 2
+ Hit doc2_hit0(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc2_hit1(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc2_hit2(/*section_id=*/1, /*document_id=*/2, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc2_hit3(/*section_id=*/2, /*document_id=*/2, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ // Doc 3
+ Hit doc3_hit0(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc3_hit1(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc3_hit2(/*section_id=*/1, /*document_id=*/3, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc3_hit3(/*section_id=*/2, /*document_id=*/3, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ // Doc 4
+ Hit doc4_hit0(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc4_hit1(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc4_hit2(/*section_id=*/1, /*document_id=*/4, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ Hit doc4_hit3(/*section_id=*/2, /*document_id=*/4, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+
+ // Create terms
+ // Foo
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_tvi,
+ lite_index->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
+ // Bar
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t bar_tvi,
+ lite_index->InsertTerm("bar", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t bar_term_id,
+ term_id_codec_->EncodeTvi(bar_tvi, TviType::LITE));
+ // Baz
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t baz_tvi,
+ lite_index->InsertTerm("baz", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t baz_term_id,
+ term_id_codec_->EncodeTvi(baz_tvi, TviType::LITE));
+ // Qux
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t qux_tvi,
+ lite_index->InsertTerm("qux", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t qux_term_id,
+ term_id_codec_->EncodeTvi(qux_tvi, TviType::LITE));
+
+ // Add hits and make sure that termIds are added in unsorted order.
+  // Documents should be inserted in order, as new incoming hits must have
+  // larger document ids.
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc0_hit1));
+ ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc0_hit2));
+ ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc0_hit3));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit1));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2));
+ ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc1_hit3));
+  // Adding 8 hits exceeds the sort threshold. However, when sort_at_indexing
+  // is enabled, sorting is done in the string-section-indexing-handler rather
+  // than in AddHit() itself, so we need to invoke SortHits() manually here.
+ EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsTrue());
+ lite_index->SortHits();
+ // Check that the HitBuffer is sorted.
+ ASSERT_THAT(lite_index->PersistToDisk(), IsOk());
+ LiteIndex_HeaderImpl::HeaderData header_data;
+ ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(),
+ &header_data, sizeof(header_data),
+ LiteIndex::kHeaderFileOffset));
+ EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(0));
+
+  // Add 12 more hits so that the sort threshold is exceeded again.
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc2_hit1));
+ ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc2_hit2));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc2_hit3));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc3_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc3_hit1));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc3_hit2));
+ ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc3_hit3));
+ ICING_ASSERT_OK(lite_index->AddHit(baz_term_id, doc4_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(qux_term_id, doc4_hit1));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc4_hit2));
+ ICING_ASSERT_OK(lite_index->AddHit(bar_term_id, doc4_hit3));
+
+  // Adding these hits exceeds the sort threshold. However, when
+  // sort_at_indexing is enabled, sorting is done in the
+  // string-section-indexing-handler rather than in AddHit() itself.
+ EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsTrue());
+
+ // We now have the following in the hit buffer:
+ // <term>: {(docId, sectionId)...}
+ // foo: {(0, 0); (1, 0); (1, 1); (2, 0); (2, 2); (3, 0); (3, 1); (4, 1)}
+ // bar: {(0, 0); (1, 0); (1, 2); (3, 2); (4, 2)}
+ // baz: {(0, 1); (2, 0); (3, 0); (4, 0)}
+  // qux: {(0, 2); (2, 1); (4, 0)}
+
+ // Search over the HitBuffer.
+ std::vector<DocHitInfo> hits1;
+ lite_index->FetchHits(
+ foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ /*namespace_checker=*/nullptr, &hits1);
+ EXPECT_THAT(hits1, SizeIs(5));
+ // Check that hits are retrieved in descending order of docIds.
+ EXPECT_THAT(hits1[0].document_id(), Eq(4));
+ EXPECT_THAT(hits1[0].hit_section_ids_mask(), Eq(0b10));
+ EXPECT_THAT(hits1[1].document_id(), Eq(3));
+ EXPECT_THAT(hits1[1].hit_section_ids_mask(), Eq(0b11));
+ EXPECT_THAT(hits1[2].document_id(), Eq(2));
+ EXPECT_THAT(hits1[2].hit_section_ids_mask(), Eq(0b101));
+ EXPECT_THAT(hits1[3].document_id(), Eq(1));
+ EXPECT_THAT(hits1[3].hit_section_ids_mask(), Eq(0b11));
+ EXPECT_THAT(hits1[4].document_id(), Eq(0));
+ EXPECT_THAT(hits1[4].hit_section_ids_mask(), Eq(0b1));
+
+ std::vector<DocHitInfo> hits2;
+ AlwaysFalseSuggestionResultCheckerImpl always_false_suggestion_result_checker;
+ lite_index->FetchHits(
+ foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ &always_false_suggestion_result_checker, &hits2);
+ // Check that no hits are returned because they get skipped by the namespace
+ // checker.
+ EXPECT_THAT(hits2, IsEmpty());
+
+ std::vector<DocHitInfo> hits3;
+ lite_index->FetchHits(
+ bar_term_id, 0b1,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ /*namespace_checker=*/nullptr, &hits3);
+ EXPECT_THAT(hits3, SizeIs(2));
+ // Check fetching hits with SectionIdMask.
+ EXPECT_THAT(hits3[0].document_id(), Eq(1));
+  EXPECT_THAT(hits3[0].hit_section_ids_mask(), Eq(0b1));
+ EXPECT_THAT(hits3[1].document_id(), Eq(0));
+ EXPECT_THAT(hits3[1].hit_section_ids_mask(), Eq(0b1));
+
+  // Check that the HitBuffer is sorted after the query call. FetchHits should
+  // sort before performing binary search if the HitBuffer unsorted size
+  // exceeds the sort threshold, regardless of the sort_at_indexing config.
+ EXPECT_THAT(lite_index->HasUnsortedHitsExceedingSortThreshold(), IsFalse());
+ ASSERT_THAT(lite_index->PersistToDisk(), IsOk());
+ ASSERT_TRUE(filesystem_.PRead((lite_index_file_name + "hb").c_str(),
+ &header_data, sizeof(header_data),
+ LiteIndex::kHeaderFileOffset));
+ EXPECT_THAT(header_data.cur_size - header_data.searchable_end, Eq(0));
+}
+
+TEST_F(LiteIndexTest, LiteIndexIterator) {
+ // Set up LiteIndex and TermIdCodec
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+  // At 64 bytes the unsorted tail can contain a max of 8 TermIdHitPairs.
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/true,
+ /*hit_buffer_sort_threshold_bytes=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
+ LiteIndex::Create(options, &icing_filesystem_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+
+ const std::string term = "foo";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index->InsertTerm(term, TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3,
+ /*is_in_prefix_section=*/false);
+ Hit doc0_hit1(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5,
+ /*is_in_prefix_section=*/false);
+ SectionIdMask doc0_section_id_mask = 0b11;
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map0 = {{0, 3}, {1, 5}};
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit1));
+
+ Hit doc1_hit1(/*section_id=*/1, /*document_id=*/1, /*term_frequency=*/7,
+ /*is_in_prefix_section=*/false);
+ Hit doc1_hit2(/*section_id=*/2, /*document_id=*/1, /*term_frequency=*/11,
+ /*is_in_prefix_section=*/false);
+ SectionIdMask doc1_section_id_mask = 0b110;
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1 = {{1, 7}, {2, 11}};
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit1));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2));
+
+ std::unique_ptr<DocHitInfoIteratorTermLiteExact> iter =
+ std::make_unique<DocHitInfoIteratorTermLiteExact>(
+ term_id_codec_.get(), lite_index.get(), term, /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ /*need_hit_term_frequency=*/true);
+
+ ASSERT_THAT(iter->Advance(), IsOk());
+ EXPECT_THAT(iter->doc_hit_info().document_id(), Eq(1));
+ EXPECT_THAT(iter->doc_hit_info().hit_section_ids_mask(),
+ Eq(doc1_section_id_mask));
+
+ std::vector<TermMatchInfo> matched_terms_stats;
+ iter->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ term, expected_section_ids_tf_map1)));
+
+ ASSERT_THAT(iter->Advance(), IsOk());
+ EXPECT_THAT(iter->doc_hit_info().document_id(), Eq(0));
+ EXPECT_THAT(iter->doc_hit_info().hit_section_ids_mask(),
+ Eq(doc0_section_id_mask));
+ matched_terms_stats.clear();
+ iter->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ term, expected_section_ids_tf_map0)));
+}
+
+TEST_F(LiteIndexTest, LiteIndexIterator_sortAtIndexingDisabled) {
+ // Set up LiteIndex and TermIdCodec
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+  // At 64 bytes the unsorted tail can contain a max of 8 TermIdHitPairs.
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/false,
+ /*hit_buffer_sort_threshold_bytes=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LiteIndex> lite_index,
+ LiteIndex::Create(options, &icing_filesystem_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+
+ const std::string term = "foo";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index->InsertTerm(term, TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ Hit doc0_hit0(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/3,
+ /*is_in_prefix_section=*/false);
+ Hit doc0_hit1(/*section_id=*/1, /*document_id=*/0, /*term_frequency=*/5,
+ /*is_in_prefix_section=*/false);
+ SectionIdMask doc0_section_id_mask = 0b11;
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map0 = {{0, 3}, {1, 5}};
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit0));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc0_hit1));
+
+ Hit doc1_hit1(/*section_id=*/1, /*document_id=*/1, /*term_frequency=*/7,
+ /*is_in_prefix_section=*/false);
+ Hit doc1_hit2(/*section_id=*/2, /*document_id=*/1, /*term_frequency=*/11,
+ /*is_in_prefix_section=*/false);
+ SectionIdMask doc1_section_id_mask = 0b110;
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1 = {{1, 7}, {2, 11}};
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit1));
+ ICING_ASSERT_OK(lite_index->AddHit(foo_term_id, doc1_hit2));
+
+ std::unique_ptr<DocHitInfoIteratorTermLiteExact> iter =
+ std::make_unique<DocHitInfoIteratorTermLiteExact>(
+ term_id_codec_.get(), lite_index.get(), term, /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ /*need_hit_term_frequency=*/true);
+
+ ASSERT_THAT(iter->Advance(), IsOk());
+ EXPECT_THAT(iter->doc_hit_info().document_id(), Eq(1));
+ EXPECT_THAT(iter->doc_hit_info().hit_section_ids_mask(),
+ Eq(doc1_section_id_mask));
+
+ std::vector<TermMatchInfo> matched_terms_stats;
+ iter->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ term, expected_section_ids_tf_map1)));
+
+ ASSERT_THAT(iter->Advance(), IsOk());
+ EXPECT_THAT(iter->doc_hit_info().document_id(), Eq(0));
+ EXPECT_THAT(iter->doc_hit_info().hit_section_ids_mask(),
+ Eq(doc0_section_id_mask));
+ matched_terms_stats.clear();
+ iter->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ term, expected_section_ids_tf_map0)));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/lite/lite-index_thread-safety_test.cc b/icing/index/lite/lite-index_thread-safety_test.cc
new file mode 100644
index 0000000..53aa6cd
--- /dev/null
+++ b/icing/index/lite/lite-index_thread-safety_test.cc
@@ -0,0 +1,399 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <array>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <thread>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/lite/lite-index.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/namespace-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Ge;
+using ::testing::Le;
+using ::testing::SizeIs;
+
+// These tests cover concurrent FetchHits operations, as well as interleaved
+// AddHit and FetchHits operations. Usages of the LiteIndex outside these
+// scenarios are not guaranteed to be thread-safe, as the LiteIndex is only
+// go/thread-compatible.
+class LiteIndexThreadSafetyTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ index_dir_ = GetTestTempDir() + "/test_dir";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
+
+ std::string lite_index_file_name =
+ index_dir_ + "/test_file.lite-idx-thread-safety.index";
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/true,
+ /*hit_buffer_sort_threshold_bytes=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
+ LiteIndex::Create(options, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+ }
+
+ void TearDown() override {
+ term_id_codec_.reset();
+ lite_index_.reset();
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
+ }
+
+ std::string index_dir_;
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::unique_ptr<LiteIndex> lite_index_;
+ std::unique_ptr<TermIdCodec> term_id_codec_;
+};
+
+constexpr NamespaceId kNamespace0 = 0;
+constexpr DocumentId kDocumentId0 = 0;
+constexpr DocumentId kDocumentId1 = 1;
+constexpr SectionId kSectionId0 = 1;
+constexpr SectionId kSectionId1 = 0b11;
+
+static constexpr std::array<std::string_view, 100> kCommonWords = {
+ "the", "and", "for", "that", "this", "with",
+ "you", "not", "are", "from", "your", "all",
+ "have", "new", "more", "was", "will", "home",
+ "can", "about", "page", "has", "search", "free",
+ "but", "our", "one", "other", "information", "time",
+ "they", "site", "may", "what", "which", "their",
+ "news", "out", "use", "any", "there", "see",
+ "only", "his", "when", "contact", "here", "business",
+ "who", "web", "also", "now", "help", "get",
+ "view", "online", "first", "been", "would", "how",
+ "were", "services", "some", "these", "click", "its",
+ "like", "service", "than", "find", "price", "date",
+ "back", "top", "people", "had", "list", "name",
+ "just", "over", "state", "year", "day", "into",
+ "email", "two", "health", "world", "next", "used",
+ "work", "last", "most", "products", "music", "buy",
+ "data", "make", "them", "should"};
+
+TEST_F(LiteIndexThreadSafetyTest, SimultaneousFetchHits_singleTerm) {
+ // Add some hits
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
+ Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0,
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit doc_hit1(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId1,
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0));
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit1));
+
+ // Create kNumThreads threads to call lite_index_->FetchHits()
+ // simultaneously. Each thread should get a valid result of 2 hits for the
+ // term 'foo', and there should be no crash.
+ constexpr int kNumThreads = 50;
+ std::vector<std::vector<DocHitInfo>> hits(kNumThreads);
+ auto callable = [&](int thread_id) {
+ lite_index_->FetchHits(
+ foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ /*namespace_checker=*/nullptr, &hits[thread_id]);
+ };
+ // Spawn threads for FetchHits().
+ std::vector<std::thread> thread_objs;
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs.emplace_back(callable, /*thread_id=*/i);
+ }
+
+ // Join threads and verify results
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs[i].join();
+ EXPECT_THAT(
+ hits[i],
+ ElementsAre(
+ EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId0}),
+ EqualsDocHitInfo(kDocumentId0,
+ std::vector<SectionId>{kSectionId0})));
+ }
+}
+
+TEST_F(LiteIndexThreadSafetyTest, SimultaneousFetchHits_multipleTerms) {
+ // Add two hits for each of the first 50 terms in kCommonWords.
+ for (int i = 0; i < 50; ++i) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm(std::string(kCommonWords[i]),
+ TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0,
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit doc_hit1(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId1,
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit0));
+ ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit1));
+ }
+
+ // Create kNumThreads threads to call lite_index_->FetchHits()
+ // simultaneously. Each thread should get a valid result of 2 hits for each
+ // term, and there should be no crash.
+ constexpr int kNumThreads = 50;
+ std::vector<std::vector<DocHitInfo>> hits(kNumThreads);
+ auto callable = [&](int thread_id) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm(std::string(kCommonWords[thread_id]),
+ TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ lite_index_->FetchHits(
+ term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ /*namespace_checker=*/nullptr, &hits[thread_id]);
+ };
+
+ // Spawn threads for FetchHits().
+ std::vector<std::thread> thread_objs;
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs.emplace_back(callable, /*thread_id=*/i);
+ }
+
+ // Join threads and verify results
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs[i].join();
+ EXPECT_THAT(
+ hits[i],
+ ElementsAre(
+ EqualsDocHitInfo(kDocumentId1, std::vector<SectionId>{kSectionId0}),
+ EqualsDocHitInfo(kDocumentId0,
+ std::vector<SectionId>{kSectionId0})));
+ }
+}
+
+TEST_F(LiteIndexThreadSafetyTest, SimultaneousAddHitAndFetchHits_singleTerm) {
+ // Add some hits
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
+ Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0,
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0));
+
+ // Create kNumThreads threads. Every even-numbered thread calls FetchHits and
+  // every odd-numbered thread calls AddHit.
+ // Each AddHit operation adds the term 'foo' to a new section of the same doc.
+ // Each query result should contain one hit, and there should be no crash.
+ constexpr int kNumThreads = 50;
+ std::vector<std::vector<DocHitInfo>> hits(kNumThreads);
+ auto callable = [&](int thread_id) {
+ if (thread_id % 2 == 0) {
+ // Even-numbered thread calls FetchHits.
+ lite_index_->FetchHits(
+ foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ /*namespace_checker=*/nullptr, &hits[thread_id]);
+ } else {
+ // Odd-numbered thread calls AddHit.
+ Hit doc_hit(/*section_id=*/thread_id / 2, /*document_id=*/kDocumentId0,
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit));
+ }
+ };
+
+ // Spawn threads.
+ std::vector<std::thread> thread_objs;
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs.emplace_back(callable, /*thread_id=*/i);
+ }
+
+ // Join threads and verify results.
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs[i].join();
+ // All AddHit operations add 'foo' to the same document, so there should
+ // only be one DocHitInfo per run.
+ if (i % 2 == 0) {
+ EXPECT_THAT(hits[i], SizeIs(1));
+ EXPECT_THAT(hits[i].back().document_id(), Eq(0));
+ }
+ }
+
+ // After all threads have executed, hits should come from sections 0-24.
+ std::vector<DocHitInfo> final_hits;
+ lite_index_->FetchHits(
+ foo_term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ /*namespace_checker=*/nullptr, &final_hits);
+ EXPECT_THAT(final_hits, SizeIs(1));
+ EXPECT_THAT(final_hits.back().document_id(), Eq(0));
+ // Section mask of sections 0-24.
+ EXPECT_THAT(final_hits.back().hit_section_ids_mask(), Eq((1 << 25) - 1));
+}
+
+TEST_F(LiteIndexThreadSafetyTest,
+ SimultaneousAddHitAndFetchHits_multipleTerms) {
+ // Add the initial hit 'foo'.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
+ Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0,
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit0));
+
+ // Create kNumThreads threads. Every even-numbered thread calls FetchHits and
+  // every odd-numbered thread calls AddHit.
+ // Each AddHit operation adds a different term to a new doc.
+ // Queries always search for the term 'foo' added above so there will always
+ // be a hit. There should be no crash.
+ constexpr int kNumThreads = 50;
+ std::vector<std::vector<DocHitInfo>> hits(kNumThreads);
+ auto callable = [&](int thread_id) {
+ // Create new tvi and term_id for new term kCommonWords[thread_id].
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm(std::string(kCommonWords[thread_id]),
+ TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ if (thread_id % 2 == 0) {
+ // Even-numbered thread calls FetchHits.
+ lite_index_->FetchHits(
+ foo_term_id, kSectionIdMaskAll, /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ /*namespace_checker=*/nullptr, &hits[thread_id]);
+ } else {
+ // Odd-numbered thread calls AddHit.
+ // AddHit to section 0 of a new doc.
+ Hit doc_hit(/*section_id=*/kSectionId0, /*document_id=*/thread_id / 2,
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit));
+ }
+ };
+
+ // Spawn threads.
+ std::vector<std::thread> thread_objs;
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs.emplace_back(callable, /*thread_id=*/i);
+ }
+
+  // Join threads and verify results. Queries always search for the term 'foo'
+  // so there will always be a hit.
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs[i].join();
+ if (i % 2 == 0) {
+ EXPECT_THAT(hits[i],
+ ElementsAre(EqualsDocHitInfo(
+ kDocumentId0, std::vector<SectionId>{kSectionId0})));
+ }
+ }
+}
+
+TEST_F(LiteIndexThreadSafetyTest, ManyAddHitAndOneFetchHits_multipleTerms) {
+ // Add two hits for each of the first 20 terms in kCommonWords.
+ for (int i = 0; i < 20; ++i) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm(std::string(kCommonWords[i]),
+ TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ Hit doc_hit0(/*section_id=*/kSectionId0, /*document_id=*/kDocumentId0,
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ Hit doc_hit1(/*section_id=*/kSectionId1, /*document_id=*/kDocumentId0,
+ Hit::kDefaultTermFrequency, /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit0));
+ ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit1));
+ }
+
+  // Create kNumThreads threads. Call one FetchHits operation after every 5
+  // AddHit operations.
+  // Each AddHit operation adds a hit for one of the terms created above into a
+  // new section of doc 0.
+  // Each FetchHits operation queries one of those terms, so there will always
+  // be a hit. There should be no crash.
+ constexpr int kNumThreads = 100;
+ std::vector<std::vector<DocHitInfo>> hits(kNumThreads);
+ auto callable = [&](int thread_id) {
+ // Create new tvi and term_id for new term kCommonWords[thread_id].
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm(std::string(kCommonWords[thread_id / 5]),
+ TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ if (thread_id % 5 == 0) {
+ // Call FetchHits on term kCommonWords[thread_id / 5]
+ lite_index_->FetchHits(
+ term_id, kSectionIdMaskAll,
+ /*only_from_prefix_sections=*/false,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT,
+ /*namespace_checker=*/nullptr, &hits[thread_id]);
+ } else {
+      // Threads whose id is not divisible by 5 call AddHit.
+ // AddHit to section (thread_id % 5 + 1) of doc 0.
+ Hit doc_hit(/*section_id=*/thread_id % 5 + 1,
+ /*document_id=*/kDocumentId0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(term_id, doc_hit));
+ }
+ };
+ // Spawn threads.
+ std::vector<std::thread> thread_objs;
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs.emplace_back(callable, /*thread_id=*/i);
+ }
+
+ // Join threads and verify FetchHits results.
+  // Every query should see hits in doc 0 sections kSectionId0 and kSectionId1.
+  // Additional hits might also be found in sections 2-5 depending on thread
+  // execution order.
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs[i].join();
+ if (i % 5 == 0) {
+ EXPECT_THAT(hits[i], SizeIs(1));
+ EXPECT_THAT(hits[i].back().document_id(), Eq(0));
+ EXPECT_THAT(hits[i].back().hit_section_ids_mask(), Ge(0b11));
+ EXPECT_THAT(hits[i].back().hit_section_ids_mask(), Le(0b1111111));
+ }
+ }
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/lite/term-id-hit-pair.h b/icing/index/lite/term-id-hit-pair.h
new file mode 100644
index 0000000..82bd010
--- /dev/null
+++ b/icing/index/lite/term-id-hit-pair.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_TERM_ID_HIT_PAIR_H_
+#define ICING_INDEX_TERM_ID_HIT_PAIR_H_
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "icing/index/hit/hit.h"
+#include "icing/util/bit-util.h"
+
+namespace icing {
+namespace lib {
+
+class TermIdHitPair {
+ public:
+ // Layout bits: 24 termid + 32 hit value + 8 hit term frequency.
+ using Value = uint64_t;
+
+ static constexpr int kTermIdBits = 24;
+ static constexpr int kHitValueBits = sizeof(Hit::Value) * 8;
+ static constexpr int kHitTermFrequencyBits = sizeof(Hit::TermFrequency) * 8;
+
+ static const Value kInvalidValue;
+
+ explicit TermIdHitPair(Value v = kInvalidValue) : value_(v) {}
+
+ TermIdHitPair(uint32_t term_id, const Hit& hit) {
+ static_assert(kTermIdBits + kHitValueBits + kHitTermFrequencyBits <=
+ sizeof(Value) * 8,
+ "TermIdHitPairTooBig");
+
+ value_ = 0;
+    // Term id goes into the most significant bits because it takes
+    // precedence in sorts.
+ bit_util::BitfieldSet(term_id, kHitValueBits + kHitTermFrequencyBits,
+ kTermIdBits, &value_);
+ bit_util::BitfieldSet(hit.value(), kHitTermFrequencyBits, kHitValueBits,
+ &value_);
+ bit_util::BitfieldSet(hit.term_frequency(), 0, kHitTermFrequencyBits,
+ &value_);
+ }
+
+ uint32_t term_id() const {
+ return bit_util::BitfieldGet(value_, kHitValueBits + kHitTermFrequencyBits,
+ kTermIdBits);
+ }
+
+ Hit hit() const {
+ return Hit(
+ bit_util::BitfieldGet(value_, kHitTermFrequencyBits, kHitValueBits),
+ bit_util::BitfieldGet(value_, 0, kHitTermFrequencyBits));
+ }
+
+ Value value() const { return value_; }
+
+ bool operator==(const TermIdHitPair& rhs) const {
+ return value_ == rhs.value_;
+ }
+
+ bool operator<(const TermIdHitPair& rhs) const { return value_ < rhs.value_; }
+
+ private:
+ Value value_;
+};
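+
+// Illustrative round-trip sketch (comment-only; the values are assumptions
+// chosen for the example, not taken from this header):
+//
+//   Hit hit(/*section_id=*/1, /*document_id=*/7, /*term_frequency=*/3,
+//           /*is_in_prefix_section=*/false);
+//   TermIdHitPair pair(/*term_id=*/42, hit);
+//   // pair.term_id() == 42 and pair.hit().term_frequency() == 3. Comparing
+//   // two pairs with operator< orders first by term id, then by hit value,
+//   // because the term id occupies the most significant bits.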
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_TERM_ID_HIT_PAIR_H_
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.cc b/icing/index/main/doc-hit-info-iterator-term-main.cc
new file mode 100644
index 0000000..3e66858
--- /dev/null
+++ b/icing/index/main/doc-hit-info-iterator-term-main.cc
@@ -0,0 +1,218 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/doc-hit-info-iterator-term-main.h"
+
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/main/main-index.h"
+#include "icing/index/main/posting-list-hit-accessor.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+std::string SectionIdMaskToString(SectionIdMask section_id_mask) {
+ std::string mask(kTotalNumSections, '0');
+ for (SectionId i = kMaxSectionId; i >= 0; --i) {
+ if (section_id_mask & (UINT64_C(1) << i)) {
+ mask[kMaxSectionId - i] = '1';
+ }
+ }
+ return mask;
+}
+
+void MergeNewHitIntoCachedDocHitInfos(
+ const Hit& hit, bool need_hit_term_frequency,
+ std::vector<DocHitInfoIteratorTermMain::DocHitInfoAndTermFrequencyArray>&
+ cached_doc_hit_infos_out) {
+ if (cached_doc_hit_infos_out.empty() ||
+ hit.document_id() !=
+ cached_doc_hit_infos_out.back().doc_hit_info.document_id()) {
+ std::optional<Hit::TermFrequencyArray> tf_arr;
+ if (need_hit_term_frequency) {
+ tf_arr = std::make_optional<Hit::TermFrequencyArray>();
+ }
+
+ cached_doc_hit_infos_out.push_back(
+ DocHitInfoIteratorTermMain::DocHitInfoAndTermFrequencyArray(
+ DocHitInfo(hit.document_id()), std::move(tf_arr)));
+ }
+
+ cached_doc_hit_infos_out.back().doc_hit_info.UpdateSection(hit.section_id());
+ if (need_hit_term_frequency) {
+ (*cached_doc_hit_infos_out.back().term_frequency_array)[hit.section_id()] =
+ hit.term_frequency();
+ }
+}
+
+} // namespace
+
+libtextclassifier3::Status DocHitInfoIteratorTermMain::Advance() {
+ ++cached_doc_hit_infos_idx_;
+ while (posting_list_accessor_ == nullptr ||
+ (!all_pages_consumed_ && cached_doc_hit_info_count() == 1)) {
+ // If we haven't retrieved any hits before or we've already returned all but
+ // the last cached hit, then go get some more!
+ // We hold back the last cached hit because it could have more hits on the
+ // next posting list in the chain.
+ libtextclassifier3::Status status = RetrieveMoreHits();
+ if (!status.ok()) {
+ if (!absl_ports::IsNotFound(status)) {
+ // NOT_FOUND is expected to happen (not every term will be in the main
+ // index!). Other errors are worth logging.
+ ICING_LOG(ERROR)
+ << "Encountered unexpected failure while retrieving hits "
+ << status.error_message();
+ }
+ return absl_ports::ResourceExhaustedError(
+ "No more DocHitInfos in iterator");
+ }
+ }
+ if (cached_doc_hit_infos_idx_ == -1 ||
+ cached_doc_hit_infos_idx_ >= cached_doc_hit_infos_.size()) {
+ // Nothing more for the iterator to return. Set these members to invalid
+ // values.
+ doc_hit_info_ = DocHitInfo();
+ return absl_ports::ResourceExhaustedError(
+ "No more DocHitInfos in iterator");
+ }
+ ++num_advance_calls_;
+ doc_hit_info_ =
+ cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_).doc_hit_info;
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<DocHitInfoIterator::TrimmedNode>
+DocHitInfoIteratorTermMain::TrimRightMostNode() && {
+ // Leaf iterator should trim itself.
+ DocHitInfoIterator::TrimmedNode node = {nullptr, term_, term_start_index_,
+ unnormalized_term_length_};
+ return node;
+}
+
+libtextclassifier3::Status DocHitInfoIteratorTermMainExact::RetrieveMoreHits() {
+ DocHitInfoAndTermFrequencyArray last_doc_hit_info;
+ if (!cached_doc_hit_infos_.empty()) {
+ last_doc_hit_info = std::move(cached_doc_hit_infos_.back());
+ }
+ cached_doc_hit_infos_idx_ = 0;
+ cached_doc_hit_infos_.clear();
+ if (last_doc_hit_info.doc_hit_info.document_id() != kInvalidDocumentId) {
+    // Carry over the last hit. It might need to be merged with the first hit
+    // of the next posting list in the chain.
+ cached_doc_hit_infos_.push_back(std::move(last_doc_hit_info));
+ }
+ if (posting_list_accessor_ == nullptr) {
+ ICING_ASSIGN_OR_RETURN(posting_list_accessor_,
+ main_index_->GetAccessorForExactTerm(term_));
+ }
+
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
+ posting_list_accessor_->GetNextHitsBatch());
+ if (hits.empty()) {
+ all_pages_consumed_ = true;
+ return libtextclassifier3::Status::OK;
+ }
+
+ ++num_blocks_inspected_;
+ cached_doc_hit_infos_.reserve(cached_doc_hit_infos_.size() + hits.size());
+ for (const Hit& hit : hits) {
+ // Check sections.
+ if (((UINT64_C(1) << hit.section_id()) & section_restrict_mask_) == 0) {
+ continue;
+ }
+ // We want exact hits, skip prefix-only hits.
+ if (hit.is_prefix_hit()) {
+ continue;
+ }
+
+ MergeNewHitIntoCachedDocHitInfos(hit, need_hit_term_frequency_,
+ cached_doc_hit_infos_);
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+std::string DocHitInfoIteratorTermMainExact::ToString() const {
+ return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+ term_);
+}
+
+libtextclassifier3::Status
+DocHitInfoIteratorTermMainPrefix::RetrieveMoreHits() {
+ DocHitInfoAndTermFrequencyArray last_doc_hit_info;
+ if (!cached_doc_hit_infos_.empty()) {
+ last_doc_hit_info = std::move(cached_doc_hit_infos_.back());
+ }
+ cached_doc_hit_infos_idx_ = 0;
+ cached_doc_hit_infos_.clear();
+ if (last_doc_hit_info.doc_hit_info.document_id() != kInvalidDocumentId) {
+    // Carry over the last hit. It might need to be merged with the first hit
+    // of the next posting list in the chain.
+ cached_doc_hit_infos_.push_back(std::move(last_doc_hit_info));
+ }
+
+ if (posting_list_accessor_ == nullptr) {
+ ICING_ASSIGN_OR_RETURN(MainIndex::GetPrefixAccessorResult result,
+ main_index_->GetAccessorForPrefixTerm(term_));
+ posting_list_accessor_ = std::move(result.accessor);
+ exact_ = result.exact;
+ }
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
+ posting_list_accessor_->GetNextHitsBatch());
+ if (hits.empty()) {
+ all_pages_consumed_ = true;
+ return libtextclassifier3::Status::OK;
+ }
+
+ ++num_blocks_inspected_;
+ cached_doc_hit_infos_.reserve(cached_doc_hit_infos_.size() + hits.size());
+ for (const Hit& hit : hits) {
+ // Check sections.
+ if (((UINT64_C(1) << hit.section_id()) & section_restrict_mask_) == 0) {
+ continue;
+ }
+    // Only keep hits from prefix sections unless the term matched exactly.
+ if (!exact_ && !hit.is_in_prefix_section()) {
+ continue;
+ }
+
+ MergeNewHitIntoCachedDocHitInfos(hit, need_hit_term_frequency_,
+ cached_doc_hit_infos_);
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+std::string DocHitInfoIteratorTermMainPrefix::ToString() const {
+ return absl_ports::StrCat(SectionIdMaskToString(section_restrict_mask_), ":",
+ term_, "*");
+}
+
+} // namespace lib
+} // namespace icing
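
MergeNewHitIntoCachedDocHitInfos above depends on the posting list yielding
hits ordered by document id, so consecutive hits for one document fold into a
single entry. A stripped-down sketch of that grouping step, using simplified
stand-in types rather than the real DocHitInfo and Hit:

#include <cstdint>
#include <vector>

struct SimpleHit {
  int document_id;
  int section_id;  // assumed 0..63 so it fits a 64-bit mask
};

struct SimpleDocHitInfo {
  int document_id;
  uint64_t section_mask = 0;
};

// Folds a document-id-ordered stream of hits into one entry per document,
// OR-ing each hit's section into that entry's section mask.
std::vector<SimpleDocHitInfo> GroupByDocument(
    const std::vector<SimpleHit>& hits) {
  std::vector<SimpleDocHitInfo> out;
  for (const SimpleHit& hit : hits) {
    if (out.empty() || out.back().document_id != hit.document_id) {
      out.push_back({hit.document_id});  // new document: start a new entry
    }
    out.back().section_mask |= uint64_t{1} << hit.section_id;
  }
  return out;
}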
diff --git a/icing/index/main/doc-hit-info-iterator-term-main.h b/icing/index/main/doc-hit-info-iterator-term-main.h
new file mode 100644
index 0000000..e32db2a
--- /dev/null
+++ b/icing/index/main/doc-hit-info-iterator-term-main.h
@@ -0,0 +1,204 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_
+#define ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/main/main-index.h"
+#include "icing/index/main/posting-list-hit-accessor.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+class DocHitInfoIteratorTermMain : public DocHitInfoLeafIterator {
+ public:
+ struct DocHitInfoAndTermFrequencyArray {
+ DocHitInfo doc_hit_info;
+ std::optional<Hit::TermFrequencyArray> term_frequency_array;
+
+ explicit DocHitInfoAndTermFrequencyArray() = default;
+
+ explicit DocHitInfoAndTermFrequencyArray(
+ DocHitInfo doc_hit_info_in,
+ std::optional<Hit::TermFrequencyArray> term_frequency_array_in)
+ : doc_hit_info(std::move(doc_hit_info_in)),
+ term_frequency_array(std::move(term_frequency_array_in)) {}
+ };
+
+ explicit DocHitInfoIteratorTermMain(MainIndex* main_index,
+ const std::string& term,
+ int term_start_index,
+ int unnormalized_term_length,
+ SectionIdMask section_restrict_mask,
+ bool need_hit_term_frequency)
+ : term_(term),
+ term_start_index_(term_start_index),
+ unnormalized_term_length_(unnormalized_term_length),
+ posting_list_accessor_(nullptr),
+ main_index_(main_index),
+ cached_doc_hit_infos_idx_(-1),
+ num_advance_calls_(0),
+ num_blocks_inspected_(0),
+ all_pages_consumed_(false),
+ section_restrict_mask_(section_restrict_mask),
+ need_hit_term_frequency_(need_hit_term_frequency) {}
+
+ libtextclassifier3::Status Advance() override;
+
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override;
+
+ CallStats GetCallStats() const override {
+ return CallStats(
+ /*num_leaf_advance_calls_lite_index_in=*/0,
+ /*num_leaf_advance_calls_main_index_in=*/num_advance_calls_,
+ /*num_leaf_advance_calls_integer_index_in=*/0,
+ /*num_leaf_advance_calls_no_index_in=*/0,
+ /*num_blocks_inspected_in=*/num_blocks_inspected_);
+ }
+
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
+ if (cached_doc_hit_infos_idx_ == -1 ||
+ cached_doc_hit_infos_idx_ >= cached_doc_hit_infos_.size()) {
+ // Current hit isn't valid, return.
+ return;
+ }
+ SectionIdMask section_mask =
+ doc_hit_info_.hit_section_ids_mask() & filtering_section_mask;
+ SectionIdMask section_mask_copy = section_mask;
+ std::array<Hit::TermFrequency, kTotalNumSections> section_term_frequencies =
+ {Hit::kNoTermFrequency};
+ while (section_mask_copy) {
+ SectionId section_id = __builtin_ctzll(section_mask_copy);
+ if (need_hit_term_frequency_) {
+ section_term_frequencies.at(section_id) =
+ (*cached_doc_hit_infos_.at(cached_doc_hit_infos_idx_)
+ .term_frequency_array)[section_id];
+ }
+ section_mask_copy &= ~(UINT64_C(1) << section_id);
+ }
+ TermMatchInfo term_stats(term_, section_mask,
+ std::move(section_term_frequencies));
+
+ for (const TermMatchInfo& cur_term_stats : *matched_terms_stats) {
+ if (cur_term_stats.term == term_stats.term) {
+        // Same doc id and same term: the term was already added, and the
+        // term frequency should always be the same, so there is nothing to do.
+ return;
+ }
+ }
+ matched_terms_stats->push_back(std::move(term_stats));
+ }
+
+ protected:
+ // Add DocHitInfos corresponding to term_ to cached_doc_hit_infos_.
+ virtual libtextclassifier3::Status RetrieveMoreHits() = 0;
+
+ const std::string term_;
+
+ // The start index of the given term in the search query
+ int term_start_index_;
+ // The length of the given unnormalized term in the search query
+ int unnormalized_term_length_;
+ // The accessor of the posting list chain for the requested term.
+ std::unique_ptr<PostingListHitAccessor> posting_list_accessor_;
+
+ MainIndex* main_index_;
+ // Stores hits and optional term frequency arrays retrieved from the index.
+ // This may only be a subset of the hits that are present in the index.
+ // Current value pointed to by the Iterator is tracked by
+ // cached_doc_hit_infos_idx_.
+ std::vector<DocHitInfoAndTermFrequencyArray> cached_doc_hit_infos_;
+ int cached_doc_hit_infos_idx_;
+
+ int num_advance_calls_;
+ int num_blocks_inspected_;
+ bool all_pages_consumed_;
+ // Mask indicating which sections hits should be considered for.
+ // Ex. 0000 0000 0000 0010 means that only hits from section 1 are desired.
+ const SectionIdMask section_restrict_mask_;
+ const bool need_hit_term_frequency_;
+
+ private:
+ // Remaining number of hits including the current hit.
+ // Returns -1 if cached_doc_hit_infos_idx_ is invalid.
+ int cached_doc_hit_info_count() const {
+ if (cached_doc_hit_infos_idx_ == -1 ||
+ cached_doc_hit_infos_idx_ >= cached_doc_hit_infos_.size()) {
+ return -1;
+ }
+ return cached_doc_hit_infos_.size() - cached_doc_hit_infos_idx_;
+ }
+};
+
+class DocHitInfoIteratorTermMainExact : public DocHitInfoIteratorTermMain {
+ public:
+ explicit DocHitInfoIteratorTermMainExact(MainIndex* main_index,
+ const std::string& term,
+ int term_start_index,
+ int unnormalized_term_length,
+ SectionIdMask section_restrict_mask,
+ bool need_hit_term_frequency)
+ : DocHitInfoIteratorTermMain(
+ main_index, term, term_start_index, unnormalized_term_length,
+ section_restrict_mask, need_hit_term_frequency) {}
+
+ std::string ToString() const override;
+
+ protected:
+ libtextclassifier3::Status RetrieveMoreHits() override;
+};
+
+class DocHitInfoIteratorTermMainPrefix : public DocHitInfoIteratorTermMain {
+ public:
+ explicit DocHitInfoIteratorTermMainPrefix(MainIndex* main_index,
+ const std::string& term,
+ int term_start_index,
+ int unnormalized_term_length,
+ SectionIdMask section_restrict_mask,
+ bool need_hit_term_frequency)
+ : DocHitInfoIteratorTermMain(
+ main_index, term, term_start_index, unnormalized_term_length,
+ section_restrict_mask, need_hit_term_frequency) {}
+
+ std::string ToString() const override;
+
+ protected:
+ libtextclassifier3::Status RetrieveMoreHits() override;
+
+ private:
+ // Whether or not posting_list_accessor_ holds a posting list chain for
+ // 'term' or for a term for which 'term' is a prefix. This is necessary to
+ // determine whether to return hits that are not from a prefix section (hits
+ // not from a prefix section should only be returned if exact_ is true).
+ bool exact_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_ITERATOR_DOC_HIT_INFO_ITERATOR_TERM_MAIN_H_
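
The section_restrict_mask_ convention documented above (bit N set means hits
from section N are desired) can be exercised in isolation. A small sketch,
assuming SectionIdMask is the 64-bit mask the UINT64_C shifts imply;
MakeRestrictMask is an illustrative helper, not an icing API:

#include <cassert>
#include <cstdint>
#include <initializer_list>

using SectionIdMask = uint64_t;

// Builds a mask that admits only the listed sections.
SectionIdMask MakeRestrictMask(std::initializer_list<int> section_ids) {
  SectionIdMask mask = 0;
  for (int id : section_ids) {
    mask |= UINT64_C(1) << id;
  }
  return mask;
}

int main() {
  // Only hits from sections 1 and 3 are desired.
  SectionIdMask mask = MakeRestrictMask({1, 3});
  // The same test RetrieveMoreHits applies to each hit before caching it.
  assert(((UINT64_C(1) << 1) & mask) != 0);  // section 1 passes
  assert(((UINT64_C(1) << 2) & mask) == 0);  // section 2 is filtered out
  return 0;
}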
diff --git a/icing/index/main/main-index-merger.cc b/icing/index/main/main-index-merger.cc
new file mode 100644
index 0000000..c26a6d7
--- /dev/null
+++ b/icing/index/main/main-index-merger.cc
@@ -0,0 +1,305 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/main-index-merger.h"
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <unordered_map>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/posting_list/index-block.h"
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+class HitSelector {
+ public:
+ // Returns whether or not term_id_hit_pair has the same term_id, document_id
+ // and section_id as the previously selected hits.
+ bool IsEquivalentHit(const TermIdHitPair& term_id_hit_pair) {
+ return prev_.term_id() == term_id_hit_pair.term_id() &&
+ prev_.hit().document_id() == term_id_hit_pair.hit().document_id() &&
+ prev_.hit().section_id() == term_id_hit_pair.hit().section_id();
+ }
+
+ // Merges term_id_hit_pair with previously added hits.
+ void SelectIfBetter(const TermIdHitPair& term_id_hit_pair) {
+ if (term_id_hit_pair.hit().is_prefix_hit()) {
+ SelectPrefixHitIfBetter(term_id_hit_pair);
+ } else {
+ SelectExactHitIfBetter(term_id_hit_pair);
+ }
+ prev_ = term_id_hit_pair;
+ }
+
+ // Adds all valid, selected hits to hits starting at position pos in hits.
+ // Returns the offset in hits after the position of the last added hit.
+  // This function may add zero, one or two hits, depending on whether the
+  // HitSelector holds a valid exact hit, a valid prefix hit, both, or neither.
+ size_t InsertSelectedHits(size_t pos, std::vector<TermIdHitPair>* hits) {
+ // Given the prefix/exact hits for a given term+docid+sectionid, push needed
+ // hits into hits array at offset pos. Return new pos.
+ if (best_prefix_hit_.hit().is_valid() && best_exact_hit_.hit().is_valid()) {
+ (*hits)[pos++] = best_exact_hit_;
+ const Hit& prefix_hit = best_prefix_hit_.hit();
+      // The prefix hit's term frequency equals the sum of the two term
+      // frequencies, capped at kMaxTermFrequency.
+ Hit::TermFrequency final_term_frequency = std::min(
+ static_cast<int>(Hit::kMaxTermFrequency),
+ prefix_hit.term_frequency() + best_exact_hit_.hit().term_frequency());
+ best_prefix_hit_ = TermIdHitPair(
+ best_prefix_hit_.term_id(),
+ Hit(prefix_hit.section_id(), prefix_hit.document_id(),
+ final_term_frequency, prefix_hit.is_in_prefix_section(),
+ prefix_hit.is_prefix_hit()));
+ (*hits)[pos++] = best_prefix_hit_;
+ // Ensure sorted.
+ if (best_prefix_hit_.hit() < best_exact_hit_.hit()) {
+ std::swap((*hits)[pos - 1], (*hits)[pos - 2]);
+ }
+ } else if (best_prefix_hit_.hit().is_valid()) {
+ (*hits)[pos++] = best_prefix_hit_;
+ } else if (best_exact_hit_.hit().is_valid()) {
+ (*hits)[pos++] = best_exact_hit_;
+ }
+
+ return pos;
+ }
+
+ void Reset() {
+ best_prefix_hit_ = TermIdHitPair();
+ best_exact_hit_ = TermIdHitPair();
+ prev_ = TermIdHitPair();
+ }
+
+ private:
+ void SelectPrefixHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
+ if (!best_prefix_hit_.hit().is_valid()) {
+ best_prefix_hit_ = term_id_hit_pair;
+ } else {
+ const Hit& hit = term_id_hit_pair.hit();
+ // Create a new prefix hit with term_frequency as the sum of the term
+ // frequencies. The term frequency is capped at kMaxTermFrequency.
+ Hit::TermFrequency final_term_frequency = std::min(
+ static_cast<int>(Hit::kMaxTermFrequency),
+ hit.term_frequency() + best_prefix_hit_.hit().term_frequency());
+ best_prefix_hit_ = TermIdHitPair(
+ term_id_hit_pair.term_id(),
+ Hit(hit.section_id(), hit.document_id(), final_term_frequency,
+ best_prefix_hit_.hit().is_in_prefix_section(),
+ best_prefix_hit_.hit().is_prefix_hit()));
+ }
+ }
+
+ void SelectExactHitIfBetter(const TermIdHitPair& term_id_hit_pair) {
+ if (!best_exact_hit_.hit().is_valid()) {
+ best_exact_hit_ = term_id_hit_pair;
+ } else {
+ const Hit& hit = term_id_hit_pair.hit();
+      // Create a new exact hit with term_frequency as the sum of the term
+      // frequencies. The term frequency is capped at kMaxTermFrequency.
+ Hit::TermFrequency final_term_frequency = std::min(
+ static_cast<int>(Hit::kMaxTermFrequency),
+ hit.term_frequency() + best_exact_hit_.hit().term_frequency());
+ best_exact_hit_ = TermIdHitPair(
+ term_id_hit_pair.term_id(),
+ Hit(hit.section_id(), hit.document_id(), final_term_frequency,
+ best_exact_hit_.hit().is_in_prefix_section(),
+ best_exact_hit_.hit().is_prefix_hit()));
+ }
+ }
+
+ TermIdHitPair best_prefix_hit_;
+ TermIdHitPair best_exact_hit_;
+ TermIdHitPair prev_;
+};
+
+class HitComparator {
+ public:
+ explicit HitComparator(
+ const TermIdCodec& term_id_codec,
+ const std::unordered_map<uint32_t, int>& main_tvi_to_block_index)
+ : term_id_codec_(&term_id_codec),
+ main_tvi_to_block_index_(&main_tvi_to_block_index) {}
+
+ bool operator()(const TermIdHitPair& lhs, const TermIdHitPair& rhs) const {
+    // Primary sort by index block. This achieves two things:
+ // 1. It reduces the number of flash writes by grouping together new hits
+ // for terms whose posting lists might share the same index block.
+ // 2. More importantly, this ensures that newly added backfill branch points
+ // will be populated first (because all newly added terms have an invalid
+ // block index of 0) before any new hits are added to the postings lists
+ // that they backfill from.
+ int lhs_index_block = GetIndexBlock(lhs.term_id());
+ int rhs_index_block = GetIndexBlock(rhs.term_id());
+ if (lhs_index_block == rhs_index_block) {
+ // Secondary sort by term_id and hit.
+ return lhs.value() < rhs.value();
+ }
+ return lhs_index_block < rhs_index_block;
+ }
+
+ private:
+ int GetIndexBlock(uint32_t term_id) const {
+ auto term_info_or = term_id_codec_->DecodeTermInfo(term_id);
+ if (!term_info_or.ok()) {
+ ICING_LOG(WARNING)
+ << "Unable to decode term-info during merge. This shouldn't happen.";
+ return kInvalidBlockIndex;
+ }
+ TermIdCodec::DecodedTermInfo term_info =
+ std::move(term_info_or).ValueOrDie();
+ auto itr = main_tvi_to_block_index_->find(term_info.tvi);
+ if (itr == main_tvi_to_block_index_->end()) {
+ return kInvalidBlockIndex;
+ }
+ return itr->second;
+ }
+
+ const TermIdCodec* term_id_codec_;
+ const std::unordered_map<uint32_t, int>* main_tvi_to_block_index_;
+};
+
+// A helper function to dedupe hits stored in hits. Suppose that the lite index
+// contained a single document with two hits in a single prefix section: "foot"
+// and "fool". When expanded, there would be four hits:
+// {"fo", docid0, sectionid0}
+// {"fo", docid0, sectionid0}
+// {"foot", docid0, sectionid0}
+// {"fool", docid0, sectionid0}
+//
+// The first two are duplicates of each other. So, this function will dedupe
+// and shrink hits to be:
+// {"fo", docid0, sectionid0}
+// {"foot", docid0, sectionid0}
+// {"fool", docid0, sectionid0}
+//
+// When two or more prefix hits are duplicates, merge into one hit with term
+// frequency as the sum of the term frequencies. If there is both an exact and
+// prefix hit for the same term, keep the exact hit as is and update the prefix
+// hit so that its term frequency is the sum of the term frequencies.
+void DedupeHits(
+ std::vector<TermIdHitPair>* hits, const TermIdCodec& term_id_codec,
+ const std::unordered_map<uint32_t, int>& main_tvi_to_block_index) {
+  // Sort so that hits are grouped by index block and, within a block, ordered
+  // by term and hit. Then merge equivalent hits into one.
+ std::sort(hits->begin(), hits->end(),
+ HitComparator(term_id_codec, main_tvi_to_block_index));
+ size_t current_offset = 0;
+ HitSelector hit_selector;
+ for (const TermIdHitPair& term_id_hit_pair : *hits) {
+ if (!hit_selector.IsEquivalentHit(term_id_hit_pair)) {
+ // We've reached a new hit. Insert the previously selected hits that we
+ // had accumulated and reset to add this new hit.
+ current_offset = hit_selector.InsertSelectedHits(current_offset, hits);
+ hit_selector.Reset();
+ }
+ // Update best exact and prefix hit.
+ hit_selector.SelectIfBetter(term_id_hit_pair);
+ }
+
+ // Push last.
+ current_offset = hit_selector.InsertSelectedHits(current_offset, hits);
+
+ hits->resize(current_offset);
+}
+
+// Based on experiments with full prefix expansion, the multiplier
+// is ~4x.
+constexpr int kAvgPrefixesPerTerm = 4;
+
+} // namespace
+
+libtextclassifier3::StatusOr<std::vector<TermIdHitPair>>
+MainIndexMerger::TranslateAndExpandLiteHits(
+ const LiteIndex& lite_index, const TermIdCodec& term_id_codec,
+ const MainIndex::LexiconMergeOutputs& lexicon_merge_outputs) {
+ std::vector<TermIdHitPair> hits;
+ if (lite_index.empty()) {
+ return hits;
+ }
+ // Reserve enough space for the average number of prefixes per term and the
+ // terms themselves.
+ hits.reserve(lite_index.size() * (kAvgPrefixesPerTerm + 1));
+
+ // Translate lite tvis to main tvis.
+ for (const TermIdHitPair& term_id_hit_pair : lite_index) {
+ uint32_t cur_term_id = term_id_hit_pair.term_id();
+ ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term,
+ term_id_codec.DecodeTermInfo(cur_term_id));
+ Hit hit(term_id_hit_pair.hit());
+
+ // 1. Translate and push original.
+ auto itr =
+ lexicon_merge_outputs.other_tvi_to_main_tvi.find(cur_decoded_term.tvi);
+ if (itr == lexicon_merge_outputs.other_tvi_to_main_tvi.cend()) {
+ // b/37273773
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Trying to translate lite tvi %u that was never added to the lexicon",
+ cur_decoded_term.tvi));
+ }
+ ICING_ASSIGN_OR_RETURN(uint32_t term_id,
+ term_id_codec.EncodeTvi(itr->second, TviType::MAIN));
+ hits.emplace_back(term_id, hit);
+
+ // 2. Expand hits in prefix sections.
+ if (hit.is_in_prefix_section()) {
+ // Hit was in a prefix section. Push prefixes. Turn on prefix bit.
+ auto itr_prefixes =
+ lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.find(
+ cur_decoded_term.tvi);
+ if (itr_prefixes ==
+ lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.end()) {
+ ICING_VLOG(1) << "No necessary prefix expansion for "
+ << cur_decoded_term.tvi;
+ continue;
+ }
+ // The tvis of all prefixes of this hit's term that appear in the main
+ // lexicon are between [prefix_tvis_buf[offset],
+ // prefix_tvis_buf[offset+len]).
+ size_t offset = itr_prefixes->second.first;
+ size_t len = itr_prefixes->second.second;
+ size_t offset_end_exclusive = offset + len;
+ Hit prefix_hit(hit.section_id(), hit.document_id(), hit.term_frequency(),
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
+ for (; offset < offset_end_exclusive; ++offset) {
+ // Take the tvi (in the main lexicon) of each prefix term.
+ uint32_t prefix_main_tvi =
+ lexicon_merge_outputs.prefix_tvis_buf[offset];
+ // Convert it to a term_id.
+ ICING_ASSIGN_OR_RETURN(
+ uint32_t prefix_term_id,
+ term_id_codec.EncodeTvi(prefix_main_tvi, TviType::MAIN));
+        // Add an element for this prefix TermId and prefix Hit to hits.
+ hits.emplace_back(prefix_term_id, prefix_hit);
+ }
+ }
+ }
+ // 3. Remove any duplicate hits.
+ DedupeHits(&hits, term_id_codec,
+ lexicon_merge_outputs.main_tvi_to_block_index);
+ return hits;
+}
+
+} // namespace lib
+} // namespace icing
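
HitSelector widens term frequencies to int before summing because adding two
uint8_t values directly would wrap around. A standalone sketch of that
arithmetic; kMaxTermFrequency here assumes Hit::kMaxTermFrequency is 255,
consistent with the 8-bit term frequency field:

#include <algorithm>
#include <cassert>
#include <cstdint>

using TermFrequency = uint8_t;
constexpr int kMaxTermFrequency = 255;  // assumption: Hit::kMaxTermFrequency

// Sums two term frequencies the way HitSelector does: widen to int first,
// then clamp to the maximum before narrowing back to 8 bits.
TermFrequency CappedSum(TermFrequency a, TermFrequency b) {
  return static_cast<TermFrequency>(
      std::min(kMaxTermFrequency, static_cast<int>(a) + static_cast<int>(b)));
}

int main() {
  assert(CappedSum(57, 1) == 58);    // exact + prefix case from the tests
  assert(CappedSum(57, 57) == 114);  // equal-frequency case
  assert(CappedSum(255, 1) == 255);  // capped instead of wrapping to 0
  return 0;
}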
diff --git a/icing/index/main/main-index-merger.h b/icing/index/main/main-index-merger.h
new file mode 100644
index 0000000..1413a8f
--- /dev/null
+++ b/icing/index/main/main-index-merger.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_MAIN_INDEX_MERGER_H_
+#define ICING_INDEX_MAIN_MAIN_INDEX_MERGER_H_
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/lite/lite-index.h"
+#include "icing/index/main/main-index.h"
+#include "icing/index/term-id-codec.h"
+
+namespace icing {
+namespace lib {
+
+// Class used to merge hits from the lite_index and lite_lexicon into main_index
+// and main_lexicon.
+class MainIndexMerger {
+ public:
+ // Retrieves all hits in the lite index, translates the term ids of each
+ // LiteIndex::Element and expands prefix hits based on the mapping from
+ // lexicon_merge_outputs.other_tvi_to_prefix_main_tvis.
+ //
+ // RETURNS:
+ // - OK on success
+  //   - INVALID_ARGUMENT if one of the elements in the lite index has a
+  //     term_id that exceeds the max TermId
+  //   - INTERNAL if a lite term id was never added to the main lexicon and
+  //     therefore cannot be translated
+ static libtextclassifier3::StatusOr<std::vector<TermIdHitPair>>
+ TranslateAndExpandLiteHits(
+ const LiteIndex& lite_index, const TermIdCodec& term_id_codec,
+ const MainIndex::LexiconMergeOutputs& lexicon_merge_outputs);
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_MAIN_INDEX_MERGER_H_
diff --git a/icing/index/main/main-index-merger_test.cc b/icing/index/main/main-index-merger_test.cc
new file mode 100644
index 0000000..37e14fc
--- /dev/null
+++ b/icing/index/main/main-index-merger_test.cc
@@ -0,0 +1,382 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "icing/index/main/main-index-merger.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/main/doc-hit-info-iterator-term-main.h"
+#include "icing/index/main/main-index-merger.h"
+#include "icing/index/main/main-index.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/index/term-property-id.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/schema/section.h"
+#include "icing/store/namespace-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::UnorderedElementsAre;
+
+class MainIndexMergerTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ index_dir_ = GetTestTempDir() + "/test_dir";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
+
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/true,
+ /*hit_buffer_sort_threshold_bytes=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
+ LiteIndex::Create(options, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+ }
+
+ void TearDown() override {
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
+ }
+
+ std::string index_dir_;
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::unique_ptr<LiteIndex> lite_index_;
+ std::unique_ptr<TermIdCodec> term_id_codec_;
+};
+
+constexpr NamespaceId kNamespace0 = 0;
+
+TEST_F(MainIndexMergerTest, TranslateTermNotAdded) {
+ // 1. Index two docs in the Lite Index:
+ // - Doc0 {"foot" is_in_prefix_section=FALSE}
+ // - Doc1 {"fool", is_in_prefix_section=FALSE}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_tvi,
+ lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_term_id,
+ term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
+
+ // 2. Build up a fake LexiconMergeOutputs
+ // This is some made up number that doesn't matter for this test.
+ uint32_t foot_main_tvi = 5;
+
+ // Only create a mapping for 'foot'. Leave out the mapping for 'fool'
+ MainIndex::LexiconMergeOutputs lexicon_outputs;
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
+
+ // 3. TranslateAndExpand should fail because 'fool' doesn't have a main tvi
+ // mapping.
+ ASSERT_THAT(MainIndexMerger::TranslateAndExpandLiteHits(
+ *lite_index_, *term_id_codec_, lexicon_outputs),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(MainIndexMergerTest, PrefixExpansion) {
+ // 1. Index two docs in the Lite Index:
+ // - Doc0 {"foot" is_in_prefix_section=FALSE}
+ // - Doc1 {"fool", is_in_prefix_section=TRUE}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_tvi,
+ lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_term_id,
+ term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
+
+ // 2. Build up a fake LexiconMergeOutputs
+ // This is some made up number that doesn't matter for this test.
+ uint32_t foo_main_tvi = 12;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
+ Hit doc1_prefix_hit(/*section_id=*/0, /*document_id=*/1,
+ Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
+
+ uint32_t foot_main_tvi = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_main_term_id,
+ term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
+ uint32_t fool_main_tvi = 10;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_main_term_id,
+ term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN));
+
+ MainIndex::LexiconMergeOutputs lexicon_outputs;
+ // Map "fool" to it's prefix hit for "foo".
+ lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_tvi,
+ std::make_pair(0, 1));
+ lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(fool_tvi, fool_main_tvi);
+
+  // 3. TranslateAndExpand should:
+ // a. Translate lite term ids to main term ids based on the map
+ // b. Expand 'fool' to have a hit for 'foo'
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
+ MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
+ lexicon_outputs));
+ EXPECT_THAT(
+ expanded_term_id_hit_pairs,
+ UnorderedElementsAre(TermIdHitPair(foot_main_term_id, doc0_hit),
+ TermIdHitPair(fool_main_term_id, doc1_hit),
+ TermIdHitPair(foo_term_id, doc1_prefix_hit)));
+}
+
+TEST_F(MainIndexMergerTest, DedupePrefixAndExactWithDifferentTermFrequencies) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foot" "foo" is_in_prefix_section=TRUE}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
+
+ Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
+ Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0,
+ Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit));
+
+ // 2. Build up a fake LexiconMergeOutputs
+ // This is some made up number that doesn't matter for this test.
+ uint32_t foo_main_tvi = 12;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_main_term_id,
+ term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
+  // The prefix hit expanded from 'foot' is merged with the exact hit for
+  // 'foo', so its final term frequency is the sum 57 + 1 = 58.
+ Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0,
+ /*term_frequency=*/58,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
+
+ uint32_t foot_main_tvi = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_main_term_id,
+ term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
+
+ MainIndex::LexiconMergeOutputs lexicon_outputs;
+ // Map "foot" to it's prefix hit for "foo".
+ lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
+ std::make_pair(0, 1));
+ lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foo_tvi, foo_main_tvi);
+
+  // 3. TranslateAndExpand should:
+ // a. Translate lite term ids to main term ids based on the map
+ // b. Expand 'foot' to have a hit for 'foo'
+ // c. Keep both the exact hit for 'foo' and the prefix hit for 'foot', the
+ // latter with term frequency as the sum of the term frequencies.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
+ MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
+ lexicon_outputs));
+ EXPECT_THAT(
+ expanded_term_id_hit_pairs,
+ UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit),
+ TermIdHitPair(foo_main_term_id, foo_doc0_hit),
+ TermIdHitPair(foo_main_term_id, doc0_prefix_hit)));
+}
+
+TEST_F(MainIndexMergerTest, DedupeWithExactSameTermFrequencies) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foot" "foo" is_in_prefix_section=TRUE}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_tvi, TviType::LITE));
+
+ Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
+ Hit foo_doc0_hit(/*section_id=*/0, /*document_id=*/0, /*term_frequency=*/57,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, foo_doc0_hit));
+  // The prefix hit should take the sum of the term frequencies: 57 + 57 = 114.
+ Hit prefix_foo_doc0_hit(/*section_id=*/0, /*document_id=*/0,
+ /*term_frequency=*/114,
+ /*is_in_prefix_section=*/true,
+ /*is_prefix_hit=*/true);
+
+ // 2. Build up a fake LexiconMergeOutputs
+ // This is some made up number that doesn't matter for this test.
+ uint32_t foo_main_tvi = 12;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_main_term_id,
+ term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
+
+ uint32_t foot_main_tvi = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_main_term_id,
+ term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
+
+ MainIndex::LexiconMergeOutputs lexicon_outputs;
+ // Map "foot" to it's prefix hit for "foo".
+ lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
+ std::make_pair(0, 1));
+ lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foo_tvi, foo_main_tvi);
+
+  // 3. TranslateAndExpand should:
+ // a. Translate lite term ids to main term ids based on the map
+ // b. Expand 'foot' to have a hit for 'foo'
+ // c. Keep both the exact hit for 'foo' and the prefix hit for 'foot', the
+ // latter with term frequency as the sum of the term frequencies.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
+ MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
+ lexicon_outputs));
+ EXPECT_THAT(expanded_term_id_hit_pairs,
+ UnorderedElementsAre(
+ TermIdHitPair(foot_main_term_id, foot_doc0_hit),
+ TermIdHitPair(foo_main_term_id, foo_doc0_hit),
+ TermIdHitPair(foo_main_term_id, prefix_foo_doc0_hit)));
+}
+
+TEST_F(MainIndexMergerTest, DedupePrefixExpansion) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foot" "fool" is_in_prefix_section=TRUE}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(foot_tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_tvi,
+ lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_term_id,
+ term_id_codec_->EncodeTvi(fool_tvi, TviType::LITE));
+
+ Hit foot_doc0_hit(/*section_id=*/0, /*document_id=*/0,
+ /*term_frequency=*/Hit::kMaxTermFrequency,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, foot_doc0_hit));
+ Hit fool_doc0_hit(/*section_id=*/0, /*document_id=*/0,
+ Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, fool_doc0_hit));
+
+ // 2. Build up a fake LexiconMergeOutputs
+ // This is some made up number that doesn't matter for this test.
+ uint32_t foo_main_tvi = 12;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(foo_main_tvi, TviType::MAIN));
+  // The prefix hit should take the sum of the term frequencies (255 + 1 =
+  // 256), capped at kMaxTermFrequency.
+ Hit doc0_prefix_hit(/*section_id=*/0, /*document_id=*/0,
+ /*term_frequency=*/Hit::kMaxTermFrequency,
+ /*is_in_prefix_section=*/true, /*is_prefix_hit=*/true);
+
+ uint32_t foot_main_tvi = 5;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t foot_main_term_id,
+ term_id_codec_->EncodeTvi(foot_main_tvi, TviType::MAIN));
+ uint32_t fool_main_tvi = 10;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t fool_main_term_id,
+ term_id_codec_->EncodeTvi(fool_main_tvi, TviType::MAIN));
+
+ MainIndex::LexiconMergeOutputs lexicon_outputs;
+ // Map "fool" to it's prefix hit for "foo" and "foot" to it's prefix hit for
+ // "foo".
+ lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(fool_tvi,
+ std::make_pair(0, 1));
+ lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
+ lexicon_outputs.other_tvi_to_prefix_main_tvis.emplace(foot_tvi,
+ std::make_pair(1, 1));
+ lexicon_outputs.prefix_tvis_buf.push_back(foo_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(foot_tvi, foot_main_tvi);
+ lexicon_outputs.other_tvi_to_main_tvi.emplace(fool_tvi, fool_main_tvi);
+
+  // 3. TranslateAndExpand should:
+ // a. Translate lite term ids to main term ids based on the map
+ // b. Expand 'foot' and 'fool' to have hits for 'foo'
+ // c. Merge the prefix hits from 'foot' and 'fool', taking the sum as
+ // term frequency.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermIdHitPair> expanded_term_id_hit_pairs,
+ MainIndexMerger::TranslateAndExpandLiteHits(*lite_index_, *term_id_codec_,
+ lexicon_outputs));
+ EXPECT_THAT(
+ expanded_term_id_hit_pairs,
+ UnorderedElementsAre(TermIdHitPair(foot_main_term_id, foot_doc0_hit),
+ TermIdHitPair(fool_main_term_id, fool_doc0_hit),
+ TermIdHitPair(foo_term_id, doc0_prefix_hit)));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
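
The tests build other_tvi_to_prefix_main_tvis as a flattened multimap: each
lite tvi maps to an (offset, length) window into the shared prefix_tvis_buf
instead of owning its own vector. A sketch of reading one window back out
(PrefixTvisFor is an illustrative helper, not an icing API):

#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <utility>
#include <vector>

std::vector<uint32_t> PrefixTvisFor(
    uint32_t lite_tvi,
    const std::unordered_map<uint32_t, std::pair<size_t, size_t>>&
        tvi_to_window,
    const std::vector<uint32_t>& prefix_tvis_buf) {
  auto itr = tvi_to_window.find(lite_tvi);
  if (itr == tvi_to_window.end()) {
    return {};  // this term needs no prefix expansion
  }
  size_t offset = itr->second.first;
  size_t len = itr->second.second;
  // The tvis for this term's prefixes live in [offset, offset + len).
  return std::vector<uint32_t>(prefix_tvis_buf.begin() + offset,
                               prefix_tvis_buf.begin() + offset + len);
}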
diff --git a/icing/index/main/main-index.cc b/icing/index/main/main-index.cc
new file mode 100644
index 0000000..aae60c6
--- /dev/null
+++ b/icing/index/main/main-index.cc
@@ -0,0 +1,858 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "icing/index/main/main-index.h"
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <unordered_set>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/destructible-directory.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/index/main/posting-list-hit-serializer.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/index/term-property-id.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Finds the shortest valid prefix term with prefix hits in the lexicon for
+// which "prefix" is a prefix.
+// "prefix" is a prefix.
+// Returns a valid FindTermResult with found=true if either:
+// 1. prefix exists as a term in lexicon.
+// 2. the shortest, valid prefix in the lexicon exists and contains prefix
+// hits.
+// Returns a FindTermResult with found=false and undefined values of tvi and
+// exact if no term was found.
+struct FindTermResult {
+ // TVI of the term that was found. Undefined if found=false.
+ uint32_t tvi;
+ // Whether or not a valid term with prefix hits was found.
+ bool found;
+ // Whether or not that term is equal to 'prefix'
+ bool exact;
+};
+FindTermResult FindShortestValidTermWithPrefixHits(
+ const IcingDynamicTrie* lexicon, const std::string& prefix) {
+ // For prefix indexing: when we are doing a prefix match for "prefix", find
+ // the tvi to the equivalent posting list. prefix's own posting list might not
+ // exist but one of its children acts as a proxy.
+ IcingDynamicTrie::PropertyReader hits_in_prefix_section(
+ *lexicon, GetHasHitsInPrefixSectionPropertyId());
+ uint32_t tvi = 0;
+ bool found = false;
+ bool exact = false;
+ for (IcingDynamicTrie::Iterator it(*lexicon, prefix.c_str()); it.IsValid();
+ it.Advance()) {
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, it.GetValue(), sizeof(posting_list_id));
+
+ // Posting list id might be invalid if this is also a backfill term.
+ // Suppose that the main index has two pre-existing prefix hits "foot" and
+ // "fool" - it will have a branch point posting list for "foo". Then, let's
+ // suppose that the other index adds hits for "foul", "four" and "far". This
+ // will result in branch points for "fo" and "f".
+ // If "fo" was added before "f", then the iterator would first give us "fo".
+ // "fo" will have an invalid posting_list_id because it hasn't been
+ // backfilled yet, so we need to continue iterating to "foo".
+ if (posting_list_id.is_valid()) {
+ exact = (prefix.size() == strlen(it.GetKey()));
+ tvi = it.GetValueIndex();
+ // Found it. Does it have prefix hits?
+ found = exact || hits_in_prefix_section.HasProperty(tvi);
+ break;
+ }
+ }
+ FindTermResult result = {tvi, found, exact};
+ return result;
+}
+
+std::string MakeFlashIndexFilename(const std::string& base_dir) {
+ return base_dir + "/main_index";
+}
+
+} // namespace
+
+MainIndex::MainIndex(const std::string& index_directory,
+ const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem)
+ : base_dir_(index_directory),
+ filesystem_(filesystem),
+ icing_filesystem_(icing_filesystem),
+ posting_list_hit_serializer_(
+ std::make_unique<PostingListHitSerializer>()) {}
+
+libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> MainIndex::Create(
+ const std::string& index_directory, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem) {
+ ICING_RETURN_ERROR_IF_NULL(filesystem);
+ ICING_RETURN_ERROR_IF_NULL(icing_filesystem);
+ std::unique_ptr<MainIndex> main_index(
+ new MainIndex(index_directory, filesystem, icing_filesystem));
+ ICING_RETURN_IF_ERROR(main_index->Init());
+ return main_index;
+}
+
+/* static */ libtextclassifier3::StatusOr<int> MainIndex::ReadFlashIndexMagic(
+ const Filesystem* filesystem, const std::string& index_directory) {
+ return FlashIndexStorage::ReadHeaderMagic(
+ filesystem, MakeFlashIndexFilename(index_directory));
+}
+
+// TODO(b/139087650) : Migrate off of IcingFilesystem.
+libtextclassifier3::Status MainIndex::Init() {
+ if (!filesystem_->CreateDirectoryRecursively(base_dir_.c_str())) {
+ return absl_ports::InternalError("Unable to create main index directory.");
+ }
+ std::string flash_index_file = MakeFlashIndexFilename(base_dir_);
+ ICING_ASSIGN_OR_RETURN(
+ FlashIndexStorage flash_index,
+ FlashIndexStorage::Create(flash_index_file, filesystem_,
+ posting_list_hit_serializer_.get()));
+ flash_index_storage_ =
+ std::make_unique<FlashIndexStorage>(std::move(flash_index));
+
+ std::string lexicon_file = base_dir_ + "/main-lexicon";
+ IcingDynamicTrie::RuntimeOptions runtime_options;
+ main_lexicon_ = std::make_unique<IcingDynamicTrie>(
+ lexicon_file, runtime_options, icing_filesystem_);
+ IcingDynamicTrie::Options lexicon_options;
+ if (!main_lexicon_->CreateIfNotExist(lexicon_options) ||
+ !main_lexicon_->Init()) {
+ return absl_ports::InternalError("Failed to initialize lexicon trie");
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<int64_t> MainIndex::GetElementsSize() const {
+ IndexStorageInfoProto storage_info = GetStorageInfo(IndexStorageInfoProto());
+ if (storage_info.main_index_storage_size() == -1 ||
+ storage_info.main_index_lexicon_size() == -1) {
+ return absl_ports::AbortedError(
+ "Failed to get size of MainIndex's members.");
+ }
+ return storage_info.main_index_storage_size() +
+ storage_info.main_index_lexicon_size();
+}
+
+IndexStorageInfoProto MainIndex::GetStorageInfo(
+ IndexStorageInfoProto storage_info) const {
+ storage_info.set_main_index_lexicon_size(
+ IcingFilesystem::SanitizeFileSize(main_lexicon_->GetElementsSize()));
+ storage_info.set_main_index_storage_size(
+ Filesystem::SanitizeFileSize(flash_index_storage_->GetElementsSize()));
+ storage_info.set_main_index_block_size(flash_index_storage_->block_size());
+ storage_info.set_num_blocks(flash_index_storage_->num_blocks());
+ storage_info.set_min_free_fraction(flash_index_storage_->min_free_fraction());
+ return storage_info;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>>
+MainIndex::GetAccessorForExactTerm(const std::string& term) {
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ if (!main_lexicon_->Find(term.c_str(), &posting_list_id)) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Term %s is not present in main lexicon.", term.c_str()));
+ }
+ return PostingListHitAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_hit_serializer_.get(),
+ posting_list_id);
+}
+
+libtextclassifier3::StatusOr<MainIndex::GetPrefixAccessorResult>
+MainIndex::GetAccessorForPrefixTerm(const std::string& prefix) {
+ bool exact = false;
+ // For prefix indexing: when we are doing a prefix match for
+ // "prefix", find the tvi to the equivalent posting list. prefix's
+ // own posting list might not exist but its shortest child acts as a proxy.
+ //
+  // For example, if the only two hits in the index are prefix hits for
+ // "bar" and "bat", then both will appear on a posting list for "ba". "b"
+ // won't have a posting list, but "ba" will suffice.
+ IcingDynamicTrie::PropertyReader hits_in_prefix_section(
+ *main_lexicon_, GetHasHitsInPrefixSectionPropertyId());
+ IcingDynamicTrie::Iterator main_itr(*main_lexicon_, prefix.c_str());
+ if (!main_itr.IsValid()) {
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Term: %s is not present in the main lexicon.", prefix.c_str()));
+ }
+ exact = (prefix.length() == strlen(main_itr.GetKey()));
+
+ if (!exact && !hits_in_prefix_section.HasProperty(main_itr.GetValueIndex())) {
+ // Found it, but it doesn't have prefix hits. Exit early. No need to
+ // retrieve the posting list because there's nothing there for us.
+ return absl_ports::NotFoundError("The term doesn't have any prefix hits.");
+ }
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, main_itr.GetValue(), sizeof(posting_list_id));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ PostingListHitAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_hit_serializer_.get(),
+ posting_list_id));
+ return GetPrefixAccessorResult(std::move(pl_accessor), exact);
+}
+
+// TODO(tjbarron): Implement a method PropertyReadersAll.HasAnyProperty().
+bool IsTermInNamespaces(
+ const IcingDynamicTrie::PropertyReadersAll& property_reader,
+ uint32_t value_index, const std::vector<NamespaceId>& namespace_ids) {
+ if (namespace_ids.empty()) {
+ return true;
+ }
+ for (NamespaceId namespace_id : namespace_ids) {
+ if (property_reader.HasProperty(GetNamespacePropertyId(namespace_id),
+ value_index)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+libtextclassifier3::StatusOr<std::vector<TermMetadata>>
+MainIndex::FindTermsByPrefix(
+ const std::string& prefix, TermMatchType::Code scoring_match_type,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
+ const SuggestionResultChecker* suggestion_result_checker) {
+ // Finds all the terms that start with the given prefix in the lexicon.
+ IcingDynamicTrie::Iterator term_iterator(*main_lexicon_, prefix.c_str());
+
+ std::vector<TermMetadata> term_metadata_list;
+ while (term_iterator.IsValid()) {
+ int score = 0;
+ DocumentId last_document_id = kInvalidDocumentId;
+ bool is_last_document_in_desired = false;
+
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, term_iterator.GetValue(), sizeof(posting_list_id));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ PostingListHitAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_hit_serializer_.get(),
+ posting_list_id));
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
+ pl_accessor->GetNextHitsBatch());
+ while (!hits.empty()) {
+ for (const Hit& hit : hits) {
+ // Check whether this Hit is desired.
+ DocumentId document_id = hit.document_id();
+ bool is_new_document = document_id != last_document_id;
+ if (is_new_document) {
+ last_document_id = document_id;
+ is_last_document_in_desired =
+ suggestion_result_checker->BelongsToTargetResults(
+ document_id, hit.section_id());
+ }
+ if (!is_last_document_in_desired) {
+          // The document was removed or has expired, or it does not belong
+          // to the target namespaces.
+ continue;
+ }
+ if (scoring_match_type == TermMatchType::EXACT_ONLY &&
+ hit.is_prefix_hit()) {
+ continue;
+ }
+
+ // Score the hit by the strategy
+ if (score_by ==
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE) {
+          // Give 1 to all matched terms and return them in arbitrary order
+ score = 1;
+ break;
+ } else if (score_by == SuggestionScoringSpecProto::
+ SuggestionRankingStrategy::DOCUMENT_COUNT &&
+ is_new_document) {
+ ++score;
+ } else if (score_by == SuggestionScoringSpecProto::
+ SuggestionRankingStrategy::TERM_FREQUENCY) {
+ if (hit.has_term_frequency()) {
+ score += hit.term_frequency();
+ } else {
+ ++score;
+ }
+ }
+ }
+ if (score_by ==
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::NONE &&
+ score == 1) {
+ // The term is desired and no need to be scored.
+ break;
+ }
+ ICING_ASSIGN_OR_RETURN(hits, pl_accessor->GetNextHitsBatch());
+ }
+ if (score > 0) {
+ term_metadata_list.push_back(TermMetadata(term_iterator.GetKey(), score));
+ }
+
+ term_iterator.Advance();
+ }
+ return term_metadata_list;
+}
+
+libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
+MainIndex::AddBackfillBranchPoints(const IcingDynamicTrie& other_lexicon) {
+  // Maps each new branching point in the main lexicon to the term such that
+  // branching_point_term is a prefix of term and no other term is greater
+  // than branching_point_term but smaller than term.
+ std::string prefix;
+ LexiconMergeOutputs outputs;
+ for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
+ other_term_itr.IsValid(); other_term_itr.Advance()) {
+ // If term were inserted in the main lexicon, what new branching would it
+ // create? (It always creates at most one.)
+ int prefix_len = main_lexicon_->FindNewBranchingPrefixLength(
+ other_term_itr.GetKey(), /*utf8=*/true);
+ if (prefix_len <= 0) {
+ continue;
+ }
+ prefix.assign(other_term_itr.GetKey(), prefix_len);
+
+    // Figure out the backfill tvi. It might not exist, since all child terms
+    // might only contain hits from non-prefix sections.
+ //
+ // Ex. Suppose that the main lexicon contains "foot" and "fool" and that
+ // we're adding "foul". The new branching prefix will be "fo". The backfill
+ // prefix will be "foo" - all hits in prefix section on "foo" will need to
+ // be added to the new "fo" posting list later.
+ FindTermResult result =
+ FindShortestValidTermWithPrefixHits(main_lexicon_.get(), prefix);
+ if (!result.found || result.exact) {
+ continue;
+ }
+
+    // This is a new prefix that will need backfilling from its next-in-line
+    // posting list. This new prefix will eventually have to have a posting
+    // list, so insert PostingListIdentifier::kInvalid as a placeholder.
+ uint32_t branching_prefix_tvi;
+ bool new_key;
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ libtextclassifier3::Status status = main_lexicon_->Insert(
+ prefix.c_str(), &posting_list_id, &branching_prefix_tvi,
+ /*replace=*/false, &new_key);
+ if (!status.ok()) {
+ ICING_LOG(DBG) << "Could not insert branching prefix\n"
+ << status.error_message();
+ return status;
+ }
+
+    // Backfill posting lists contain only prefix hits by default, so set
+    // these properties here; they may be overridden when hits from the other
+    // index are added later.
+ if (!main_lexicon_->SetProperty(branching_prefix_tvi,
+ GetHasNoExactHitsPropertyId()) ||
+ !main_lexicon_->SetProperty(branching_prefix_tvi,
+ GetHasHitsInPrefixSectionPropertyId())) {
+ return absl_ports::InternalError("Setting prefix prop failed");
+ }
+
+ outputs.backfill_map[branching_prefix_tvi] = result.tvi;
+ }
+ return outputs;
+}
+
+libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
+MainIndex::AddTerms(const IcingDynamicTrie& other_lexicon,
+ LexiconMergeOutputs&& outputs) {
+ IcingDynamicTrie::PropertyReadersAll new_term_prop_readers(other_lexicon);
+ for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
+ other_term_itr.IsValid(); other_term_itr.Advance()) {
+ uint32_t new_main_tvi;
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ libtextclassifier3::Status status = main_lexicon_->Insert(
+ other_term_itr.GetKey(), &posting_list_id, &new_main_tvi,
+ /*replace=*/false);
+ if (!status.ok()) {
+ ICING_LOG(DBG) << "Could not insert term: " << other_term_itr.GetKey()
+ << "\n"
+ << status.error_message();
+ return status;
+ }
+
+ // Copy the properties from the other lexicon over to the main lexicon.
+ uint32_t other_tvi = other_term_itr.GetValueIndex();
+ if (!CopyProperties(new_term_prop_readers, other_lexicon, other_tvi,
+ new_main_tvi)) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Could not insert term: ", other_term_itr.GetKey()));
+ }
+
+ // Add other to main mapping.
+ outputs.other_tvi_to_main_tvi.emplace(other_tvi, new_main_tvi);
+
+ memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(new_main_tvi),
+ sizeof(posting_list_id));
+ if (posting_list_id.block_index() != kInvalidBlockIndex) {
+ outputs.main_tvi_to_block_index[new_main_tvi] =
+ posting_list_id.block_index();
+ }
+ }
+ return std::move(outputs);
+}
+
+libtextclassifier3::StatusOr<MainIndex::LexiconMergeOutputs>
+MainIndex::AddBranchPoints(const IcingDynamicTrie& other_lexicon,
+ LexiconMergeOutputs&& outputs) {
+ IcingDynamicTrie::PropertyReader has_prefix_prop_reader(
+ other_lexicon, GetHasHitsInPrefixSectionPropertyId());
+ if (!has_prefix_prop_reader.Exists()) {
+ return std::move(outputs);
+ }
+ std::string prefix;
+ for (IcingDynamicTrie::Iterator other_term_itr(other_lexicon, /*prefix=*/"");
+ other_term_itr.IsValid(); other_term_itr.Advance()) {
+ // Only expand terms that have hits in prefix sections.
+ if (!has_prefix_prop_reader.HasProperty(other_term_itr.GetValueIndex())) {
+ continue;
+ }
+
+ // Get prefixes where there is already a branching point in the main
+ // lexicon. We skip prefixes which don't already have a branching point.
+ std::vector<int> prefix_lengths = main_lexicon_->FindBranchingPrefixLengths(
+ other_term_itr.GetKey(), /*utf8=*/true);
+
+ int buf_start = outputs.prefix_tvis_buf.size();
+ // Add prefixes.
+ for (int prefix_length : prefix_lengths) {
+ if (prefix_length <= 0) {
+ continue;
+ }
+
+ prefix.assign(other_term_itr.GetKey(), prefix_length);
+ uint32_t prefix_tvi;
+ bool new_key;
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ libtextclassifier3::Status status =
+ main_lexicon_->Insert(prefix.c_str(), &posting_list_id, &prefix_tvi,
+ /*replace=*/false, &new_key);
+ if (!status.ok()) {
+ ICING_LOG(DBG) << "Could not insert prefix: " << prefix << "\n"
+ << status.error_message();
+ return status;
+ }
+
+ // Prefix tvi will have hits in prefix section.
+ if (!main_lexicon_->SetProperty(prefix_tvi,
+ GetHasHitsInPrefixSectionPropertyId())) {
+ return absl_ports::InternalError(
+ "Setting has hits in prefix section prop failed");
+ }
+
+ // If it hasn't been added by non-prefix term insertions in
+ // AddBackfillBranchPoints and AddTerms, it is a prefix-only term.
+ if (new_key && !main_lexicon_->SetProperty(
+ prefix_tvi, GetHasNoExactHitsPropertyId())) {
+ return absl_ports::InternalError("Setting no exact hits prop failed");
+ }
+
+ outputs.prefix_tvis_buf.push_back(prefix_tvi);
+
+ memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(prefix_tvi),
+ sizeof(posting_list_id));
+ if (posting_list_id.block_index() != kInvalidBlockIndex) {
+ outputs.main_tvi_to_block_index[prefix_tvi] =
+ posting_list_id.block_index();
+ }
+ }
+
+ // Any prefixes added? Then add to map.
+ if (buf_start < outputs.prefix_tvis_buf.size()) {
+ outputs.other_tvi_to_prefix_main_tvis[other_term_itr.GetValueIndex()] = {
+ buf_start, outputs.prefix_tvis_buf.size() - buf_start};
+ }
+ }
+ return std::move(outputs);
+}
+
+bool MainIndex::CopyProperties(
+ const IcingDynamicTrie::PropertyReadersAll& prop_reader,
+ const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
+ uint32_t new_main_tvi) {
+ for (uint32_t property_id = 0; property_id < prop_reader.size();
+ ++property_id) {
+ if (property_id == GetHasNoExactHitsPropertyId()) {
+ // HasNoExactHitsProperty is an inverse. If other_lexicon has exact hits
+ // for this term, then HasNoExactHits needs to be set to false in
+ // main_lexicon. If other_lexicon has no exact hits for this term, then
+ // HasNoExactHits in the main_lexicon should not be modified.
+ if (!prop_reader.HasProperty(property_id, other_tvi) &&
+ !main_lexicon_->ClearProperty(new_main_tvi, property_id)) {
+ ICING_LOG(ERROR) << "Clearing HasNoExactHitsProperty failed";
+ return false;
+ }
+ } else {
+ // If other_lexicon has this property set for this term, then that
+ // property needs to be set for the main_lexicon. If other_lexicon
+ // doesn't have this property set, then the property in the main lexicon
+ // should not be modified.
+ if (prop_reader.HasProperty(property_id, other_tvi) &&
+ !main_lexicon_->SetProperty(new_main_tvi, property_id)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+libtextclassifier3::Status MainIndex::AddHits(
+ const TermIdCodec& term_id_codec,
+ std::unordered_map<uint32_t, uint32_t>&& backfill_map,
+ std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id) {
+ if (hits.empty()) {
+ flash_index_storage_->set_last_indexed_docid(last_added_document_id);
+ return libtextclassifier3::Status::OK;
+ }
+ uint32_t cur_term_id = hits[0].term_id();
+ ICING_ASSIGN_OR_RETURN(TermIdCodec::DecodedTermInfo cur_decoded_term,
+ term_id_codec.DecodeTermInfo(cur_term_id));
+ // Iterate through all hits. If these hits are for a term that also needs
+ // backfill, then backfill first and then add the new hits.
+ size_t k_start = 0;
+ size_t k_end = 0;
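+  // Ex. (illustrative) If hits were [(termA, doc2), (termA, doc1),
+  // (termB, doc2)], the inner loop below would advance k_end to 2, so
+  // [k_start, k_end) covers both termA hits before they are flushed via
+  // AddHitsForTerm.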
+ while (k_start < hits.size()) {
+ uint32_t term_id = hits[k_end].term_id();
+ while (term_id == cur_term_id && ++k_end < hits.size()) {
+ term_id = hits[k_end].term_id();
+ }
+
+ // Look for backfill.
+ PostingListIdentifier backfill_posting_list_id =
+ PostingListIdentifier::kInvalid;
+ auto itr = backfill_map.find(cur_decoded_term.tvi);
+ if (itr != backfill_map.end()) {
+ const void* value = main_lexicon_->GetValueAtIndex(itr->second);
+ memcpy(&backfill_posting_list_id, value,
+ sizeof(backfill_posting_list_id));
+ backfill_map.erase(itr);
+ }
+ ICING_RETURN_IF_ERROR(AddHitsForTerm(cur_decoded_term.tvi,
+ backfill_posting_list_id,
+ &hits[k_start], k_end - k_start));
+ cur_term_id = term_id;
+ ICING_ASSIGN_OR_RETURN(cur_decoded_term,
+ term_id_codec.DecodeTermInfo(cur_term_id));
+ k_start = k_end;
+ }
+
+ // Now copy remaining backfills.
+ ICING_VLOG(1) << "Remaining backfills " << backfill_map.size();
+  for (const auto& other_tvi_main_tvi_pair : backfill_map) {
+ PostingListIdentifier backfill_posting_list_id =
+ PostingListIdentifier::kInvalid;
+ memcpy(&backfill_posting_list_id,
+ main_lexicon_->GetValueAtIndex(other_tvi_main_tvi_pair.second),
+ sizeof(backfill_posting_list_id));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListHitAccessor> hit_accum,
+ PostingListHitAccessor::Create(flash_index_storage_.get(),
+ posting_list_hit_serializer_.get()));
+ ICING_RETURN_IF_ERROR(
+ AddPrefixBackfillHits(backfill_posting_list_id, hit_accum.get()));
+ PostingListAccessor::FinalizeResult result =
+ std::move(*hit_accum).Finalize();
+ if (result.id.is_valid()) {
+ main_lexicon_->SetValueAtIndex(other_tvi_main_tvi_pair.first, &result.id);
+ }
+ }
+ flash_index_storage_->set_last_indexed_docid(last_added_document_id);
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status MainIndex::AddHitsForTerm(
+ uint32_t tvi, PostingListIdentifier backfill_posting_list_id,
+ const TermIdHitPair* hit_elements, size_t len) {
+ // 1. Create a PostingListHitAccessor - either from the pre-existing block, if
+ // one exists, or from scratch.
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, main_lexicon_->GetValueAtIndex(tvi),
+ sizeof(posting_list_id));
+ std::unique_ptr<PostingListHitAccessor> pl_accessor;
+ if (posting_list_id.is_valid()) {
+ if (posting_list_id.block_index() >= flash_index_storage_->num_blocks()) {
+ ICING_LOG(ERROR) << "Index dropped hits. Invalid block index "
+ << posting_list_id.block_index()
+ << " >= " << flash_index_storage_->num_blocks();
+ // TODO(b/159918304) : Consider revising the checksumming strategy in the
+ // main index. Providing some mechanism to check for corruption - either
+ // during initialization or some later time would allow us to avoid
+ // whack-a-mole with odd corruption issues like this one (b/62820689).
+ return absl_ports::InternalError(
+ "Valid posting list has an invalid block index!");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ pl_accessor, PostingListHitAccessor::CreateFromExisting(
+ flash_index_storage_.get(),
+ posting_list_hit_serializer_.get(), posting_list_id));
+ } else {
+ // New posting list.
+ ICING_ASSIGN_OR_RETURN(
+ pl_accessor,
+ PostingListHitAccessor::Create(flash_index_storage_.get(),
+ posting_list_hit_serializer_.get()));
+ }
+
+ // 2. Backfill any hits if necessary.
+ if (backfill_posting_list_id.is_valid()) {
+ ICING_RETURN_IF_ERROR(
+ AddPrefixBackfillHits(backfill_posting_list_id, pl_accessor.get()));
+ }
+
+ // 3. Add all the new hits.
+ for (int i = len - 1; i >= 0; --i) {
+ Hit hit = hit_elements[i].hit();
+ ICING_RETURN_IF_ERROR(pl_accessor->PrependHit(hit));
+ }
+
+ // 4. Finalize this posting list and put its identifier in the lexicon.
+ PostingListAccessor::FinalizeResult result =
+ std::move(*pl_accessor).Finalize();
+ if (result.id.is_valid()) {
+ main_lexicon_->SetValueAtIndex(tvi, &result.id);
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status MainIndex::AddPrefixBackfillHits(
+ PostingListIdentifier backfill_posting_list_id,
+ PostingListHitAccessor* hit_accum) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListHitAccessor> backfill_accessor,
+ PostingListHitAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_hit_serializer_.get(),
+ backfill_posting_list_id));
+ std::vector<Hit> backfill_hits;
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp,
+ backfill_accessor->GetNextHitsBatch());
+ while (!tmp.empty()) {
+ std::copy(tmp.begin(), tmp.end(), std::back_inserter(backfill_hits));
+ ICING_ASSIGN_OR_RETURN(tmp, backfill_accessor->GetNextHitsBatch());
+ }
+
+ Hit last_added_hit;
+ // The hits in backfill_hits are in the reverse order of how they were added.
+ // Iterate in reverse to add them to this new posting list in the correct
+ // order.
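+  // Ex. (illustrative) Hits (sec=1, doc=5, tf=1, is_prefix=false) and
+  // (sec=1, doc=5, tf=1, is_prefix=true) both become
+  // (sec=1, doc=5, tf=1, is_prefix=true) once the flag is overridden below,
+  // so the second copy would be skipped as a duplicate.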
+ for (auto itr = backfill_hits.rbegin(); itr != backfill_hits.rend(); ++itr) {
+ const Hit& hit = *itr;
+ // Skip hits from non-prefix-enabled sections.
+ if (!hit.is_in_prefix_section()) {
+ continue;
+ }
+
+ // A backfill hit is a prefix hit in a prefix section.
+ const Hit backfill_hit(hit.section_id(), hit.document_id(),
+ hit.term_frequency(),
+ /*is_in_prefix_section=*/true,
+ /*is_prefix_hit=*/true);
+ if (backfill_hit == last_added_hit) {
+ // Skip duplicate values due to overriding of the is_prefix flag.
+ continue;
+ }
+ last_added_hit = backfill_hit;
+ ICING_RETURN_IF_ERROR(hit_accum->PrependHit(backfill_hit));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+std::string MainIndex::GetDebugInfo(DebugInfoVerbosity::Code verbosity) const {
+ std::string res;
+
+ // Lexicon.
+ std::string lexicon_info;
+ main_lexicon_->GetDebugInfo(verbosity, &lexicon_info);
+
+ IcingStringUtil::SStringAppendF(&res, 0,
+ "last_added_document_id: %u\n"
+ "\n"
+ "main_lexicon_info:\n%s\n",
+ last_added_document_id(),
+ lexicon_info.c_str());
+
+ if (verbosity == DebugInfoVerbosity::BASIC) {
+ return res;
+ }
+
+ std::string flash_index_storage_info;
+ flash_index_storage_->GetDebugInfo(verbosity, &flash_index_storage_info);
+ IcingStringUtil::SStringAppendF(&res, 0, "flash_index_storage_info:\n%s\n",
+ flash_index_storage_info.c_str());
+ return res;
+}
+
+libtextclassifier3::Status MainIndex::Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new) {
+ std::string temporary_index_dir_path = base_dir_ + "_temp";
+ if (!filesystem_->DeleteDirectoryRecursively(
+ temporary_index_dir_path.c_str())) {
+ ICING_LOG(ERROR) << "Recursively deleting " << temporary_index_dir_path;
+ return absl_ports::InternalError(
+ "Unable to delete temp directory to prepare to build new index.");
+ }
+
+ DestructibleDirectory temporary_index_dir(
+ filesystem_, std::move(temporary_index_dir_path));
+ if (!temporary_index_dir.is_valid()) {
+ return absl_ports::InternalError(
+ "Unable to create temp directory to build new index.");
+ }
+
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<MainIndex> new_index,
+ MainIndex::Create(temporary_index_dir.dir(),
+ filesystem_, icing_filesystem_));
+ ICING_RETURN_IF_ERROR(TransferIndex(document_id_old_to_new, new_index.get()));
+ ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
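+  // Release the new and current index instances before swapping the
+  // directories; Init() below reopens the index from the swapped-in files.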
+ new_index = nullptr;
+ flash_index_storage_ = nullptr;
+ main_lexicon_ = nullptr;
+
+ if (!filesystem_->SwapFiles(temporary_index_dir.dir().c_str(),
+ base_dir_.c_str())) {
+ return absl_ports::InternalError(
+ "Unable to apply new index due to failed swap!");
+ }
+
+ // Reinitialize the index so that flash_index_storage_ and main_lexicon_ are
+ // properly updated.
+ return Init();
+}
+
+libtextclassifier3::StatusOr<DocumentId> MainIndex::TransferAndAddHits(
+ const std::vector<DocumentId>& document_id_old_to_new, const char* term,
+ PostingListHitAccessor& old_pl_accessor, MainIndex* new_index) {
+ std::vector<Hit> new_hits;
+ bool has_no_exact_hits = true;
+ bool has_hits_in_prefix_section = false;
+ // The largest document id after translating hits.
+ DocumentId largest_document_id = kInvalidDocumentId;
+ ICING_ASSIGN_OR_RETURN(std::vector<Hit> tmp,
+ old_pl_accessor.GetNextHitsBatch());
+ while (!tmp.empty()) {
+ for (const Hit& hit : tmp) {
+      // A safety check to make sure that we never access invalid memory, in
+      // case the hit from the posting list is corrupted.
+ if (hit.document_id() < 0 ||
+ hit.document_id() >= document_id_old_to_new.size()) {
+ continue;
+ }
+ DocumentId new_document_id = document_id_old_to_new[hit.document_id()];
+ // Transfer the document id of the hit, if the document is not deleted
+ // or outdated.
+ if (new_document_id != kInvalidDocumentId) {
+ if (hit.is_in_prefix_section()) {
+ has_hits_in_prefix_section = true;
+ }
+ if (!hit.is_prefix_hit()) {
+ has_no_exact_hits = false;
+ }
+ if (largest_document_id == kInvalidDocumentId ||
+ new_document_id > largest_document_id) {
+ largest_document_id = new_document_id;
+ }
+ new_hits.push_back(Hit::TranslateHit(hit, new_document_id));
+ }
+ }
+ ICING_ASSIGN_OR_RETURN(tmp, old_pl_accessor.GetNextHitsBatch());
+ }
+  // A term without exact hits indicates that it is a purely backfill term. If
+  // the term is not a branching term in the new trie, backfilling is no
+  // longer necessary, so we can skip it.
+ if (new_hits.empty() ||
+ (has_no_exact_hits && !new_index->main_lexicon_->IsBranchingTerm(term))) {
+ return largest_document_id;
+ }
+
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<PostingListHitAccessor> hit_accum,
+ PostingListHitAccessor::Create(
+ new_index->flash_index_storage_.get(),
+ new_index->posting_list_hit_serializer_.get()));
+ for (auto itr = new_hits.rbegin(); itr != new_hits.rend(); ++itr) {
+ ICING_RETURN_IF_ERROR(hit_accum->PrependHit(*itr));
+ }
+ PostingListAccessor::FinalizeResult result = std::move(*hit_accum).Finalize();
+ if (!result.id.is_valid()) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to add translated hits for term: ", term));
+ }
+ uint32_t tvi;
+ libtextclassifier3::Status status =
+ new_index->main_lexicon_->Insert(term, &result.id, &tvi,
+ /*replace=*/false);
+ if (!status.ok()) {
+ ICING_LOG(DBG) << "Could not transfer main index for term: " << term << "\n"
+ << status.error_message();
+ return status;
+ }
+ if (has_no_exact_hits && !new_index->main_lexicon_->SetProperty(
+ tvi, GetHasNoExactHitsPropertyId())) {
+ return absl_ports::InternalError("Setting prefix prop failed");
+ }
+ if (has_hits_in_prefix_section &&
+ !new_index->main_lexicon_->SetProperty(
+ tvi, GetHasHitsInPrefixSectionPropertyId())) {
+ return absl_ports::InternalError("Setting prefix prop failed");
+ }
+ return largest_document_id;
+}
+
+libtextclassifier3::Status MainIndex::TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ MainIndex* new_index) {
+ DocumentId largest_document_id = kInvalidDocumentId;
+ for (IcingDynamicTrie::Iterator term_itr(*main_lexicon_, /*prefix=*/"",
+ /*reverse=*/true);
+ term_itr.IsValid(); term_itr.Advance()) {
+ PostingListIdentifier posting_list_id = PostingListIdentifier::kInvalid;
+ memcpy(&posting_list_id, term_itr.GetValue(), sizeof(posting_list_id));
+ if (posting_list_id == PostingListIdentifier::kInvalid) {
+      // This is unexpected, but skip the term rather than failing the
+      // whole transfer.
+ ICING_LOG(ERROR)
+ << "Got invalid posting_list_id from previous main index";
+ continue;
+ }
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ PostingListHitAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_hit_serializer_.get(),
+ posting_list_id));
+ ICING_ASSIGN_OR_RETURN(
+ DocumentId curr_largest_document_id,
+ TransferAndAddHits(document_id_old_to_new, term_itr.GetKey(),
+ *pl_accessor, new_index));
+ if (curr_largest_document_id == kInvalidDocumentId) {
+ continue;
+ }
+ if (largest_document_id == kInvalidDocumentId ||
+ curr_largest_document_id > largest_document_id) {
+ largest_document_id = curr_largest_document_id;
+ }
+ }
+ new_index->flash_index_storage_->set_last_indexed_docid(largest_document_id);
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/main-index.h b/icing/index/main/main-index.h
new file mode 100644
index 0000000..9e570d5
--- /dev/null
+++ b/icing/index/main/main-index.h
@@ -0,0 +1,350 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_MAIN_INDEX_H_
+#define ICING_INDEX_MAIN_MAIN_INDEX_H_
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/main/posting-list-hit-accessor.h"
+#include "icing/index/main/posting-list-hit-serializer.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/index/term-metadata.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/store/namespace-id.h"
+#include "icing/store/suggestion-result-checker.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+class MainIndex {
+ public:
+ // RETURNS:
+ // - valid instance of MainIndex, on success.
+ // - INTERNAL error if unable to create the lexicon or flash storage.
+ static libtextclassifier3::StatusOr<std::unique_ptr<MainIndex>> Create(
+ const std::string& index_directory, const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem);
+
+  // Reads the magic from the existing flash index storage file header. We
+  // need this during the Icing initialization phase to determine the version.
+ //
+ // RETURNS:
+ // - On success, a valid magic.
+ // - NOT_FOUND if the flash index doesn't exist.
+ // - INTERNAL on I/O error.
+ static libtextclassifier3::StatusOr<int> ReadFlashIndexMagic(
+ const Filesystem* filesystem, const std::string& index_directory);
+
+ // Get a PostingListHitAccessor that holds the posting list chain for 'term'.
+ //
+ // RETURNS:
+ // - On success, a valid PostingListHitAccessor
+ // - NOT_FOUND if term is not present in the main index.
+ libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>>
+ GetAccessorForExactTerm(const std::string& term);
+
+ // Get a PostingListHitAccessor for 'prefix'.
+ //
+ // RETURNS:
+ // - On success, a result containing a valid PostingListHitAccessor.
+ // - NOT_FOUND if neither 'prefix' nor any terms for which 'prefix' is a
+ // prefix are present in the main index.
+ struct GetPrefixAccessorResult {
+ // A PostingListHitAccessor that holds the posting list chain for the term
+ // that best represents 'prefix' in the main index.
+ std::unique_ptr<PostingListHitAccessor> accessor;
+ // True if the returned posting list chain is for 'prefix' or false if the
+ // returned posting list chain is for a term for which 'prefix' is a prefix.
+ bool exact;
+
+ explicit GetPrefixAccessorResult(
+ std::unique_ptr<PostingListHitAccessor> accessor_in, bool exact_in)
+ : accessor(std::move(accessor_in)), exact(exact_in) {}
+ };
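+  // Ex. (illustrative) If the main lexicon contains the term "foo" itself,
+  // the returned chain is for "foo" and exact is true; if it only contains
+  // "foot", the returned chain is for "foot" and exact is false.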
+ libtextclassifier3::StatusOr<GetPrefixAccessorResult>
+ GetAccessorForPrefixTerm(const std::string& prefix);
+
+  // Finds all terms with the given prefix that are accepted by the given
+  // result checker. The input prefix must be normalized, otherwise inaccurate
+  // results may be returned. If scoring_match_type is EXACT, only exact hits
+  // are counted; if it is PREFIX, both prefix and exact hits are counted.
+  // Results are not sorted by score and are returned in lexicographical
+  // order.
+ //
+ // Returns:
+ // A list of TermMetadata on success
+ // INTERNAL_ERROR if failed to access term data.
+ libtextclassifier3::StatusOr<std::vector<TermMetadata>> FindTermsByPrefix(
+ const std::string& prefix, TermMatchType::Code scoring_match_type,
+ SuggestionScoringSpecProto::SuggestionRankingStrategy::Code score_by,
+ const SuggestionResultChecker* suggestion_result_checker);
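+
+  // Ex. (illustrative) If the main index contains "foot" and "fool",
+  // FindTermsByPrefix("fo", TermMatchType::PREFIX,
+  // SuggestionRankingStrategy::DOCUMENT_COUNT, checker) would return
+  // TermMetadata for both terms, each scored by the number of documents
+  // accepted by the checker.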
+
+ struct LexiconMergeOutputs {
+ // Maps from main_lexicon tvi for new branching point to the main_lexicon
+ // tvi for posting list whose hits must be backfilled.
+ std::unordered_map<uint32_t, uint32_t> backfill_map;
+
+ // Maps from lexicon tvis to main_lexicon tvis.
+ std::unordered_map<uint32_t, uint32_t> other_tvi_to_main_tvi;
+
+ // Maps from main lexicon tvi to the block index. Tvis with no entry do not
+ // have an allocated posting list.
+ std::unordered_map<uint32_t, int> main_tvi_to_block_index;
+
+ // Maps from the lexicon tvi to the beginning position in
+ // prefix_tvis_buf and the length.
+ std::unordered_map<uint32_t, std::pair<int, int>>
+ other_tvi_to_prefix_main_tvis;
+
+    // Stores the tvis that are mapped to by other_tvi_to_prefix_main_tvis.
+ std::vector<uint32_t> prefix_tvis_buf;
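+
+    // Ex. (illustrative) If other tvi 7 expanded to prefix main tvis 21 and
+    // 22, stored at positions 4 and 5 of prefix_tvis_buf, then
+    // other_tvi_to_prefix_main_tvis[7] == {4, 2} (start position, length).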
+ };
+
+  // Merge the other lexicon into the main lexicon and populate the data
+  // structures necessary to translate lite tvis to main tvis, track
+  // backfilling, and expand lite terms to prefix terms.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> MergeLexicon(
+ const IcingDynamicTrie& other_lexicon) {
+ // Backfill branch points need to be added first so that the backfill_map
+ // can be correctly populated.
+ ICING_ASSIGN_OR_RETURN(LexiconMergeOutputs outputs,
+ AddBackfillBranchPoints(other_lexicon));
+ ICING_ASSIGN_OR_RETURN(outputs,
+ AddTerms(other_lexicon, std::move(outputs)));
+ // Non-backfill branch points need to be added last so that the mapping of
+ // newly added terms to prefix terms can be correctly populated (prefix
+ // terms might be branch points between two new terms or between a
+ // pre-existing term and a new term).
+ ICING_ASSIGN_OR_RETURN(outputs,
+ AddBranchPoints(other_lexicon, std::move(outputs)));
+ return outputs;
+ }
+
+ // Add hits to the main index and backfill from existing posting lists to new
+ // backfill branch points.
+ //
+ // The backfill_map maps from main_lexicon tvi for a newly added branching
+ // point to the main_lexicon tvi for the posting list whose hits must be
+ // backfilled. backfill_map should be populated as part of LexiconMergeOutputs
+ // in MergeLexicon and be blindly passed to this function.
+ //
+ // RETURNS:
+ // - OK on success
+  // - INVALID_ARGUMENT if one of the elements in the lite index has a term_id
+  // that exceeds the max TermId, is not valid, or is not less than
+  // pre-existing hits in the main index.
+ // - INTERNAL_ERROR if unable to mmap necessary IndexBlocks
+ // - RESOURCE_EXHAUSTED error if unable to grow the index
+ libtextclassifier3::Status AddHits(
+ const TermIdCodec& term_id_codec,
+ std::unordered_map<uint32_t, uint32_t>&& backfill_map,
+ std::vector<TermIdHitPair>&& hits, DocumentId last_added_document_id);
+
+ libtextclassifier3::Status PersistToDisk() {
+ if (main_lexicon_->Sync() && flash_index_storage_->PersistToDisk()) {
+ return libtextclassifier3::Status::OK;
+ }
+ return absl_ports::InternalError("Unable to sync main index components.");
+ }
+
+ DocumentId last_added_document_id() const {
+ return flash_index_storage_->get_last_indexed_docid();
+ }
+
+ libtextclassifier3::Status Reset() {
+ ICING_RETURN_IF_ERROR(flash_index_storage_->Reset());
+ main_lexicon_->Clear();
+ return libtextclassifier3::Status::OK;
+ }
+
+ void Warm() { main_lexicon_->Warm(); }
+
+ // Returns:
+ // - elements size of lexicon and index, on success
+ // - INTERNAL on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;
+
+ // Takes the provided storage_info, populates the fields related to the main
+ // index and returns that storage_info.
+ //
+ // If an IO error occurs while trying to calculate the value for a field, then
+ // that field will be set to -1.
+ IndexStorageInfoProto GetStorageInfo(
+ IndexStorageInfoProto storage_info) const;
+
+  // Returns debug information for the main index.
+  // verbosity = BASIC: simplest debug information - just the lexicon.
+  // verbosity = DETAILED: more detailed debug information, including raw
+  // posting lists.
+ std::string GetDebugInfo(DebugInfoVerbosity::Code verbosity) const;
+
+ // Reduces internal file sizes by reclaiming space of deleted documents.
+ //
+ // This method will update the last_added_docid of the index to the largest
+ // document id that still appears in the index after compaction.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error, this indicates that the index may be in an
+ // invalid state and should be cleared.
+ libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new);
+
+ private:
+ explicit MainIndex(const std::string& index_directory,
+ const Filesystem* filesystem,
+ const IcingFilesystem* icing_filesystem);
+
+ libtextclassifier3::Status Init();
+
+  // Helpers for merging the lexicon.
+  // Add all 'backfill' branch points. Backfill branch points are prefix
+  // branch points that are a prefix of terms that existed in the lexicon
+  // prior to the merge.
+ //
+ // For example, if the main lexicon only contains "foot" and is then merged
+ // with a lite lexicon containing only "fool", then a backfill branch point
+ // for "foo" will be added to contain prefix hits from both the pre-existing
+ // posting list for "foot" and the new posting list for "fool".
+ //
+ // Populates LexiconMergeOutputs.backfill_map
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBackfillBranchPoints(
+ const IcingDynamicTrie& other_lexicon);
+
+  // Add all terms from the other lexicon.
+ //
+ // Populates LexiconMergeOutputs.other_tvi_to_main_tvi
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddTerms(
+ const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);
+
+  // Add all branch points for terms added from the other lexicon.
+ // For example, if the main lexicon is empty and is then merged with a
+ // lexicon containing only "foot" and "fool", then a branch point for "foo"
+ // will be added to contain prefix hits from both "foot" and "fool".
+ //
+ // Populates LexiconMergeOutputs.other_tvi_to_prefix_main_tvis and
+ // LexiconMergeOutputs.prefix_tvis_buf;
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL on IO error while writing to the main lexicon.
+ libtextclassifier3::StatusOr<LexiconMergeOutputs> AddBranchPoints(
+ const IcingDynamicTrie& other_lexicon, LexiconMergeOutputs&& outputs);
+
+  // Copies all properties from other_tvi in the other lexicon to new_main_tvi
+  // in the main lexicon.
+ // Returns true on success, false if an IO error is encountered.
+ bool CopyProperties(const IcingDynamicTrie::PropertyReadersAll& prop_reader,
+ const IcingDynamicTrie& other_lexicon, uint32_t other_tvi,
+ uint32_t new_main_tvi);
+
+ // Add all hits between [hit_elements, hit_elements + len) to main_index,
+ // updating the entry in the main lexicon at trie_value_index to point to the
+  // resulting posting list. Hits are sorted in descending document id order,
+  // so they should be added to posting lists in reverse, starting at
+  // hit_elements + len - 1 and working backwards. Therefore, hit_elements
+  // must be in sorted order.
+ //
+ // trie_value_index may point to a valid posting list id if there is a
+ // pre-existing posting list to append to.
+ //
+ // If backfill_posting_list_id is valid, then the hits from the posting list
+ // identified by backfill_posting_list_id should be added to the new posting
+ // list before the hits in hit_elements.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT if posting_list_id stored at trie_value_index is valid
+ // but points out of bounds in the IndexBlock referred to by
+ // id.block_index(), if one of the hits from [hit_elements,hit_elements+len)
+ // is not valid, or if one of the hits from [hit_elements,hit_elements+len)
+ // is not less than the previously added hits.
+ // - INTERNAL_ERROR if posting_list_id stored at trie_value_index is valid
+ // but points to an invalid block index or if unable to mmap the IndexBlock.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
+ // posting list.
+ libtextclassifier3::Status AddHitsForTerm(
+ uint32_t tvi, PostingListIdentifier backfill_posting_list_id,
+ const TermIdHitPair* hit_elements, size_t len);
+
+ // Adds all prefix hits or hits from prefix sections present on the posting
+ // list identified by backfill_posting_list_id to hit_accum.
+ //
+ // RETURNS:
+ // - OK, on success
+ // - INVALID_ARGUMENT if backfill_posting_list_id points out of bounds in the
+ // IndexBlock referred to by id.block_index()
+ // - INTERNAL_ERROR if unable to mmap the block identified by
+ // backfill_posting_list_id or if the posting list identified by
+ // backfill_posting_list_id has been corrupted.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
+ // posting list.
+ libtextclassifier3::Status AddPrefixBackfillHits(
+ PostingListIdentifier backfill_posting_list_id,
+ PostingListHitAccessor* hit_accum);
+
+ // Transfer hits from old_pl_accessor to new_index for term.
+ //
+ // Returns:
+ // largest document id added to the translated posting list, on success
+ // INTERNAL_ERROR on IO error
+ static libtextclassifier3::StatusOr<DocumentId> TransferAndAddHits(
+ const std::vector<DocumentId>& document_id_old_to_new, const char* term,
+ PostingListHitAccessor& old_pl_accessor, MainIndex* new_index);
+
+ // Transfer hits from the current main index to new_index.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ MainIndex* new_index);
+
+ std::string base_dir_;
+ const Filesystem* filesystem_;
+ const IcingFilesystem* icing_filesystem_;
+ std::unique_ptr<PostingListHitSerializer> posting_list_hit_serializer_;
+ std::unique_ptr<FlashIndexStorage> flash_index_storage_;
+ std::unique_ptr<IcingDynamicTrie> main_lexicon_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_MAIN_INDEX_H_
diff --git a/icing/index/main/main-index_test.cc b/icing/index/main/main-index_test.cc
new file mode 100644
index 0000000..fa96e6c
--- /dev/null
+++ b/icing/index/main/main-index_test.cc
@@ -0,0 +1,710 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/main-index.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/lite/term-id-hit-pair.h"
+#include "icing/index/main/doc-hit-info-iterator-term-main.h"
+#include "icing/index/main/main-index-merger.h"
+#include "icing/index/term-id-codec.h"
+#include "icing/index/term-property-id.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/legacy/index/icing-mock-filesystem.h"
+#include "icing/schema/section.h"
+#include "icing/store/namespace-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::NiceMock;
+using ::testing::Return;
+using ::testing::SizeIs;
+
+std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
+ std::vector<DocHitInfo> infos;
+ while (iterator->Advance().ok()) {
+ infos.push_back(iterator->doc_hit_info());
+ }
+ return infos;
+}
+
+std::vector<DocHitInfo> GetExactHits(
+ MainIndex* main_index, int term_start_index, int unnormalized_term_length,
+ const std::string& term, SectionIdMask section_mask = kSectionIdMaskAll) {
+ auto iterator = std::make_unique<DocHitInfoIteratorTermMainExact>(
+ main_index, term, term_start_index, unnormalized_term_length,
+ section_mask, /*need_hit_term_frequency=*/true);
+ return GetHits(std::move(iterator));
+}
+
+std::vector<DocHitInfo> GetPrefixHits(
+ MainIndex* main_index, int term_start_index, int unnormalized_term_length,
+ const std::string& term, SectionIdMask section_mask = kSectionIdMaskAll) {
+ auto iterator = std::make_unique<DocHitInfoIteratorTermMainPrefix>(
+ main_index, term, term_start_index, unnormalized_term_length,
+ section_mask, /*need_hit_term_frequency=*/true);
+ return GetHits(std::move(iterator));
+}
+
+libtextclassifier3::Status Merge(const LiteIndex& lite_index,
+ const TermIdCodec& term_id_codec,
+ MainIndex* main_index) {
+ ICING_ASSIGN_OR_RETURN(MainIndex::LexiconMergeOutputs outputs,
+ main_index->MergeLexicon(lite_index.lexicon()));
+ ICING_ASSIGN_OR_RETURN(std::vector<TermIdHitPair> term_id_hit_pairs,
+ MainIndexMerger::TranslateAndExpandLiteHits(
+ lite_index, term_id_codec, outputs));
+ return main_index->AddHits(term_id_codec, std::move(outputs.backfill_map),
+ std::move(term_id_hit_pairs),
+ lite_index.last_added_document_id());
+}
+
+class MainIndexTest : public testing::Test {
+ protected:
+ void SetUp() override {
+ index_dir_ = GetTestTempDir() + "/test_dir";
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(index_dir_.c_str()));
+
+ std::string lite_index_file_name = index_dir_ + "/test_file.lite-idx.index";
+ LiteIndex::Options options(lite_index_file_name,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/true,
+ /*hit_buffer_sort_threshold_bytes=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
+ LiteIndex::Create(options, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ term_id_codec_,
+ TermIdCodec::Create(
+ IcingDynamicTrie::max_value_index(IcingDynamicTrie::Options()),
+ IcingDynamicTrie::max_value_index(options.lexicon_options)));
+ }
+
+ void TearDown() override {
+ term_id_codec_.reset();
+ lite_index_.reset();
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(index_dir_.c_str()));
+ }
+
+ std::string index_dir_;
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::unique_ptr<LiteIndex> lite_index_;
+ std::unique_ptr<TermIdCodec> term_id_codec_;
+};
+
+constexpr NamespaceId kNamespace0 = 0;
+
+TEST_F(MainIndexTest, MainIndexCreateIOFailure) {
+ // Create the index with mock filesystem. By default, Mock will return false,
+ // so the first attempted file operation will fail.
+ NiceMock<IcingMockFilesystem> mock_icing_filesystem;
+ ON_CALL(mock_icing_filesystem, CreateDirectoryRecursively)
+ .WillByDefault(Return(false));
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ EXPECT_THAT(MainIndex::Create(main_index_file_name, &filesystem_,
+ &mock_icing_filesystem),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixTermNotFound) {
+ // Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+ EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsValidAccessor) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foot" is_in_prefix_section=true}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+  // 3. Merge the index. The main index should contain "foot".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+ // GetAccessorForPrefixTerm should return a valid accessor for "foo".
+ EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"), IsOk());
+}
+
+TEST_F(MainIndexTest, MainIndexGetAccessorForPrefixReturnsNotFound) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foot" is_in_prefix_section=false}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+  // 3. Merge the index. The main index should return NOT_FOUND when we
+  // search for the prefix "foo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+  // GetAccessorForPrefixTerm should return NOT_FOUND for "foo".
+ EXPECT_THAT(main_index->GetAccessorForPrefixTerm("foo"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(MainIndexTest, MainIndexGetAccessorForExactTermNotFound) {
+ // Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+ EXPECT_THAT(main_index->GetAccessorForExactTerm("foo"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(MainIndexTest, MainIndexGetAccessorForExactReturnsValidAccessor) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"foo" is_in_prefix_section=false}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the index. The main index should contain "foo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+  // GetAccessorForExactTerm should return a valid accessor for "foo".
+ EXPECT_THAT(main_index->GetAccessorForExactTerm("foo"), IsOk());
+}
+
+TEST_F(MainIndexTest, MergeIndexToEmpty) {
+ // 1. Index three docs in the Lite Index:
+ // - Doc0 {"foot", "fool", "far" is_in_prefix_section=false}
+ // - Doc1 {"foot", "fool" is_in_prefix_section=true}
+ // - Doc2 {"fool", "far" is_in_prefix_section=false}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi, lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi,
+ lite_index_->InsertTerm("far", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t far_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit));
+
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
+
+ Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ std::vector<DocHitInfo> hits =
+ GetExactHits(main_index.get(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, "foot");
+ EXPECT_THAT(hits, IsEmpty());
+ hits = GetPrefixHits(main_index.get(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, "fo");
+ EXPECT_THAT(hits, IsEmpty());
+
+ // 3. Merge the index. The main index should contain "fool", "foot"
+ // and "far" as well as a branch points for "foo" and "f". "fa" and "fo"
+ // should not be present because it is not a branch point.
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+ // Get hits from an exact posting list.
+ hits = GetExactHits(main_index.get(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, "foot");
+ // We should get hits for "foot" in doc1 and doc0
+ EXPECT_THAT(
+ hits,
+ ElementsAre(
+ EqualsDocHitInfo(doc1_hit.document_id(),
+ std::vector<SectionId>{doc1_hit.section_id()}),
+ EqualsDocHitInfo(doc0_hit.document_id(),
+ std::vector<SectionId>{doc0_hit.section_id()})));
+
+ // Get hits from a branching point posting list. "fo" should redirect to "foo"
+ hits = GetPrefixHits(main_index.get(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, "fo");
+ // We should get hits for "foot" in doc1 and "fool" in doc1. We shouldn't get
+ // the hits for "foot" in doc0 and "fool" in doc0 and doc2 because they
+ // weren't hits in prefix sections.
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
+ doc1_hit.document_id(),
+ std::vector<SectionId>{doc1_hit.section_id()})));
+}
+
+TEST_F(MainIndexTest, MergeIndexToPreexisting) {
+ // 1. Index three docs in the Lite Index:
+ // - Doc0 {"foot", "fool", "far" is_in_prefix_section=false}
+ // - Doc1 {"foot", "fool" is_in_prefix_section=true}
+ // - Doc2 {"fool", "far" is_in_prefix_section=false}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi, lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi,
+ lite_index_->InsertTerm("far", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t far_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc0_hit));
+
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc1_hit));
+
+ Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc2_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(far_term_id, doc2_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the index. The main index should contain "fool", "foot"
+ // and "far" as well as a branch points for "foo" and "f". "fa" and "fo"
+ // should not be present because it is not a branch point.
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+
+ // 4. Index two docs in a new Lite Index:
+ // - Doc3 {"foot", "four", "foul", "fall" is_in_prefix_section=false}
+ // - Doc4 {"four", "foul" is_in_prefix_section=true}
+ std::string lite_index_file_name2 = index_dir_ + "/test_file.lite-idx.index2";
+ LiteIndex::Options options(lite_index_file_name2,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/true,
+ /*hit_buffer_sort_threshold_bytes=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
+ LiteIndex::Create(options, &icing_filesystem_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi, lite_index_->InsertTerm("four", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t four_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi, lite_index_->InsertTerm("foul", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foul_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi,
+ lite_index_->InsertTerm("fall", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t fall_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc3_hit(/*section_id=*/0, /*document_id=*/3, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc3_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc3_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc3_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(fall_term_id, doc3_hit));
+
+ Hit doc4_hit(/*section_id=*/0, /*document_id=*/4, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(four_term_id, doc4_hit));
+ ICING_ASSERT_OK(lite_index_->AddHit(foul_term_id, doc4_hit));
+
+  // 5. Merge the index. The main index should now contain "foul", "four"
+  // and "fall", a branch point for "fou" and a backfill point for "fo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+  // Get hits from an exact posting list that existed before the merge.
+ std::vector<DocHitInfo> hits =
+ GetExactHits(main_index.get(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, "foot");
+
+ // We should get hits for "foot" in doc3, doc1 and doc0
+ EXPECT_THAT(
+ hits,
+ ElementsAre(
+ EqualsDocHitInfo(doc3_hit.document_id(),
+ std::vector<SectionId>{doc3_hit.section_id()}),
+ EqualsDocHitInfo(doc1_hit.document_id(),
+ std::vector<SectionId>{doc1_hit.section_id()}),
+ EqualsDocHitInfo(doc0_hit.document_id(),
+ std::vector<SectionId>{doc0_hit.section_id()})));
+ // Get hits from backfill posting list.
+ hits = GetPrefixHits(main_index.get(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, "fo");
+ // We should get hits for "four" and "foul" in doc4 and hits for "foot" and
+ // "fool" in doc1. We shouldn't get the hits for "foot" in doc0 and doc3,
+ // "fool" in doc0 and doc2 or the hits for "four" and "foul" in doc4 because
+ // they weren't hits in prefix sections.
+ EXPECT_THAT(
+ hits,
+ ElementsAre(
+ EqualsDocHitInfo(doc4_hit.document_id(),
+ std::vector<SectionId>{doc4_hit.section_id()}),
+ EqualsDocHitInfo(doc1_hit.document_id(),
+ std::vector<SectionId>{doc1_hit.section_id()})));
+}
+
+TEST_F(MainIndexTest, ExactRetrievedInPrefixSearch) {
+  // 1. Index three docs in the Lite Index:
+ // - Doc0 {"foot" is_in_prefix_section=true}
+ // - Doc1 {"foo" is_in_prefix_section=false}
+ // - Doc2 {"foot" is_in_prefix_section=false}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit));
+
+ Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc2_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the lite lexicon. The main lexicon should contain "foot" and
+ // "foo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+ std::vector<DocHitInfo> hits =
+ GetPrefixHits(main_index.get(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, "foo");
+ // We should get hits for "foo" in doc1 and doc0, but not in doc2 because it
+ // is not a prefix hit.
+ EXPECT_THAT(
+ hits,
+ ElementsAre(
+ EqualsDocHitInfo(doc1_hit.document_id(),
+ std::vector<SectionId>{doc1_hit.section_id()}),
+ EqualsDocHitInfo(doc0_hit.document_id(),
+ std::vector<SectionId>{doc0_hit.section_id()})));
+}
+
+TEST_F(MainIndexTest, PrefixNotRetrievedInExactSearch) {
+  // 1. Index three docs in the Lite Index:
+  // - Doc0 {"foot" is_in_prefix_section=true}
+  // - Doc1 {"foo" is_in_prefix_section=false}
+  // - Doc2 {"foo" is_in_prefix_section=true}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi, lite_index_->InsertTerm("foo", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc0_hit));
+
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc1_hit));
+
+ Hit doc2_hit(/*section_id=*/0, /*document_id=*/2, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc2_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the lite lexicon. The main lexicon should contain "foot" and
+ // "foo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+ std::vector<DocHitInfo> hits =
+ GetExactHits(main_index.get(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, "foo");
+
+ // We should get hits for "foo" in doc2 and doc1, but not in doc0 because it
+ // is not an exact hit.
+ EXPECT_THAT(
+ hits,
+ ElementsAre(
+ EqualsDocHitInfo(doc2_hit.document_id(),
+ std::vector<SectionId>{doc2_hit.section_id()}),
+ EqualsDocHitInfo(doc1_hit.document_id(),
+ std::vector<SectionId>{doc1_hit.section_id()})));
+}
+
+TEST_F(MainIndexTest,
+ SearchChainedPostingListsShouldMergeSectionsAndTermFrequency) {
+  // Index 2048 documents with 3 hits in each document. When merged into the main
+ // index, this will 1) lead to a chained posting list and 2) split at least
+ // one document's hits across multiple posting lists.
+ const std::string term = "foot";
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm(term, TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ for (DocumentId document_id = 0; document_id < 2048; ++document_id) {
+ Hit::TermFrequency term_frequency = static_cast<Hit::TermFrequency>(
+ document_id % Hit::kMaxTermFrequency + 1);
+ Hit doc_hit0(
+ /*section_id=*/0, /*document_id=*/document_id, term_frequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit0));
+
+ Hit doc_hit1(
+ /*section_id=*/1, /*document_id=*/document_id, term_frequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit1));
+
+ Hit doc_hit2(
+ /*section_id=*/2, /*document_id=*/document_id, term_frequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc_hit2));
+ }
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the lite index.
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+ // Get hits for all documents containing "foot" - which should be all of them.
+
+ auto iterator = std::make_unique<DocHitInfoIteratorTermMainExact>(
+ main_index.get(), term, /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ /*need_hit_term_frequency=*/true);
+
+ DocumentId expected_document_id = 2047;
+ while (iterator->Advance().ok()) {
+ EXPECT_THAT(iterator->doc_hit_info(),
+ EqualsDocHitInfo(expected_document_id,
+ std::vector<SectionId>{0, 1, 2}));
+
+ std::vector<TermMatchInfo> matched_terms_stats;
+ iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+
+ Hit::TermFrequency expected_term_frequency =
+ static_cast<Hit::TermFrequency>(
+ expected_document_id % Hit::kMaxTermFrequency + 1);
+ ASSERT_THAT(matched_terms_stats, SizeIs(1));
+ EXPECT_THAT(matched_terms_stats[0].term, Eq(term));
+ EXPECT_THAT(matched_terms_stats[0].term_frequencies[0],
+ Eq(expected_term_frequency));
+ EXPECT_THAT(matched_terms_stats[0].term_frequencies[1],
+ Eq(expected_term_frequency));
+ EXPECT_THAT(matched_terms_stats[0].term_frequencies[2],
+ Eq(expected_term_frequency));
+ --expected_document_id;
+ }
+ EXPECT_THAT(expected_document_id, Eq(-1));
+}
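+
+// A quick check of the modulo arithmetic used in the test above, assuming
+// Hit::kMaxTermFrequency == 255: for document_id 2047 the expected term
+// frequency is 2047 % 255 + 1 = 7 + 1 = 8, reported identically for all three
+// sections of that document.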
+
+TEST_F(MainIndexTest, MergeIndexBackfilling) {
+ // 1. Index one doc in the Lite Index:
+ // - Doc0 {"fool" is_in_prefix_section=true}
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("fool", TermMatchType::PREFIX, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t fool_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc0_hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/true);
+ ICING_ASSERT_OK(lite_index_->AddHit(fool_term_id, doc0_hit));
+
+ // 2. Create the main index. It should have no entries in its lexicon.
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ // 3. Merge the index. The main index should contain "fool".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+
+ // 4. Index two docs in a new Lite Index:
+ // - Doc1 {"foot" is_in_prefix_section=false}
+ std::string lite_index_file_name2 = index_dir_ + "/test_file.lite-idx.index2";
+ LiteIndex::Options options(lite_index_file_name2,
+ /*hit_buffer_want_merge_bytes=*/1024 * 1024,
+ /*hit_buffer_sort_at_indexing=*/true,
+ /*hit_buffer_sort_threshold_bytes=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(lite_index_,
+ LiteIndex::Create(options, &icing_filesystem_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ tvi,
+ lite_index_->InsertTerm("foot", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foot_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+
+ Hit doc1_hit(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foot_term_id, doc1_hit));
+
+ // 5. Merge the index. The main index should now contain "fool", "foot"
+ // and a backfill point for "foo".
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+  // Look up the exact posting list for "foo". It should be empty: "foo" was
+  // only ever added as a prefix branch point, never as an exact term.
+ std::vector<DocHitInfo> hits =
+ GetExactHits(main_index.get(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, "foo");
+ EXPECT_THAT(hits, IsEmpty());
+
+ // Get hits from backfill posting list.
+ hits = GetPrefixHits(main_index.get(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, "foo");
+ // We should get a hit for "fool" in doc0.
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfo(
+ doc0_hit.document_id(),
+ std::vector<SectionId>{doc0_hit.section_id()})));
+}
+
+TEST_F(MainIndexTest, OneHitInTheFirstPageForTwoPagesMainIndex) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t tvi,
+ lite_index_->InsertTerm("foo", TermMatchType::EXACT_ONLY, kNamespace0));
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t foo_term_id,
+ term_id_codec_->EncodeTvi(tvi, TviType::LITE));
+ SectionId section_id = 0;
+ // Based on debugging logs, 2038 documents in the following setting will
+ // result in two pages in the posting list chain, and the first page only
+ // contains one hit.
+ uint32_t num_docs = 2038;
+ for (DocumentId document_id = 0; document_id < num_docs; ++document_id) {
+ Hit doc_hit(section_id, document_id, Hit::kDefaultTermFrequency,
+ /*is_in_prefix_section=*/false);
+ ICING_ASSERT_OK(lite_index_->AddHit(foo_term_id, doc_hit));
+ }
+
+ std::string main_index_file_name = index_dir_ + "/test_file.idx.index";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<MainIndex> main_index,
+ MainIndex::Create(main_index_file_name, &filesystem_,
+ &icing_filesystem_));
+
+ ICING_ASSERT_OK(Merge(*lite_index_, *term_id_codec_, main_index.get()));
+ std::vector<DocHitInfo> hits =
+ GetExactHits(main_index.get(), /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, "foo");
+ ASSERT_THAT(hits, SizeIs(num_docs));
+ for (DocumentId document_id = num_docs - 1; document_id >= 0; --document_id) {
+ ASSERT_THAT(
+ hits[num_docs - 1 - document_id],
+ EqualsDocHitInfo(document_id, std::vector<SectionId>{section_id}));
+ }
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-hit-accessor.cc b/icing/index/main/posting-list-hit-accessor.cc
new file mode 100644
index 0000000..3d5476b
--- /dev/null
+++ b/icing/index/main/posting-list-hit-accessor.cc
@@ -0,0 +1,123 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-hit-accessor.h"
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/main/posting-list-hit-serializer.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>>
+PostingListHitAccessor::Create(FlashIndexStorage *storage,
+ PostingListHitSerializer *serializer) {
+ uint32_t max_posting_list_bytes = storage->max_posting_list_bytes();
+ ICING_ASSIGN_OR_RETURN(PostingListUsed in_memory_posting_list,
+ PostingListUsed::CreateFromUnitializedRegion(
+ serializer, max_posting_list_bytes));
+ return std::unique_ptr<PostingListHitAccessor>(new PostingListHitAccessor(
+ storage, serializer, std::move(in_memory_posting_list)));
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>>
+PostingListHitAccessor::CreateFromExisting(
+ FlashIndexStorage *storage, PostingListHitSerializer *serializer,
+ PostingListIdentifier existing_posting_list_id) {
+ // Our in_memory_posting_list_ will start as empty.
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ Create(storage, serializer));
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage->GetPostingList(existing_posting_list_id));
+ pl_accessor->preexisting_posting_list_ =
+ std::make_unique<PostingListHolder>(std::move(holder));
+ return pl_accessor;
+}
+
+// Returns the next batch of hits for the provided posting list.
+libtextclassifier3::StatusOr<std::vector<Hit>>
+PostingListHitAccessor::GetNextHitsBatch() {
+ if (preexisting_posting_list_ == nullptr) {
+ if (has_reached_posting_list_chain_end_) {
+ return std::vector<Hit>();
+ }
+ return absl_ports::FailedPreconditionError(
+ "Cannot retrieve hits from a PostingListHitAccessor that was not "
+ "created from a preexisting posting list.");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<Hit> batch,
+ serializer_->GetHits(&preexisting_posting_list_->posting_list));
+ uint32_t next_block_index = kInvalidBlockIndex;
+ // Posting lists will only be chained when they are max-sized, in which case
+ // next_block_index will point to the next block for the next posting list.
+ // Otherwise, next_block_index can be kInvalidBlockIndex or be used to point
+ // to the next free list block, which is not relevant here.
+ if (preexisting_posting_list_->posting_list.size_in_bytes() ==
+ storage_->max_posting_list_bytes()) {
+ next_block_index = preexisting_posting_list_->next_block_index;
+ }
+
+ if (next_block_index != kInvalidBlockIndex) {
+    // Since we only have to deal with the next block of a max-sized posting
+    // list, max_num_posting_lists is 1 and posting_list_index_bits is
+ // BitsToStore(1).
+ PostingListIdentifier next_posting_list_id(
+ next_block_index, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/BitsToStore(1));
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage_->GetPostingList(next_posting_list_id));
+ preexisting_posting_list_ =
+ std::make_unique<PostingListHolder>(std::move(holder));
+ } else {
+ has_reached_posting_list_chain_end_ = true;
+ preexisting_posting_list_.reset();
+ }
+ return batch;
+}
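+
+// A minimal usage sketch for the function above, assuming pl_accessor was
+// obtained from CreateFromExisting(); GetNextHitsBatch() returns an empty
+// vector once the end of the chain has been reached:
+//
+//   std::vector<Hit> all_hits;
+//   ICING_ASSIGN_OR_RETURN(std::vector<Hit> batch,
+//                          pl_accessor->GetNextHitsBatch());
+//   while (!batch.empty()) {
+//     all_hits.insert(all_hits.end(), batch.begin(), batch.end());
+//     ICING_ASSIGN_OR_RETURN(batch, pl_accessor->GetNextHitsBatch());
+//   }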
+
+libtextclassifier3::Status PostingListHitAccessor::PrependHit(const Hit &hit) {
+ PostingListUsed &active_pl = (preexisting_posting_list_ != nullptr)
+ ? preexisting_posting_list_->posting_list
+ : in_memory_posting_list_;
+ libtextclassifier3::Status status = serializer_->PrependHit(&active_pl, hit);
+ if (!absl_ports::IsResourceExhausted(status)) {
+ return status;
+ }
+ // There is no more room to add hits to this current posting list! Therefore,
+ // we need to either move those hits to a larger posting list or flush this
+ // posting list and create another max-sized posting list in the chain.
+ if (preexisting_posting_list_ != nullptr) {
+ ICING_RETURN_IF_ERROR(FlushPreexistingPostingList());
+ } else {
+ ICING_RETURN_IF_ERROR(FlushInMemoryPostingList());
+ }
+
+ // Re-add hit. Should always fit since we just cleared
+ // in_memory_posting_list_. It's fine to explicitly reference
+ // in_memory_posting_list_ here because there's no way of reaching this line
+ // while preexisting_posting_list_ is still in use.
+ return serializer_->PrependHit(&in_memory_posting_list_, hit);
+}
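+
+// A minimal sketch of the ordering contract enforced above (hit values shrink
+// as document ids grow, so prepends happen in increasing document order):
+//
+//   Hit older(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency);
+//   Hit newer(/*section_id=*/0, /*document_id=*/1, Hit::kDefaultTermFrequency);
+//   ICING_RETURN_IF_ERROR(accessor->PrependHit(older));
+//   ICING_RETURN_IF_ERROR(accessor->PrependHit(newer));
+//   // Prepending 'older' again now would return INVALID_ARGUMENT.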
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-hit-accessor.h b/icing/index/main/posting-list-hit-accessor.h
new file mode 100644
index 0000000..7b72437
--- /dev/null
+++ b/icing/index/main/posting-list-hit-accessor.h
@@ -0,0 +1,101 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_POSTING_LIST_HIT_ACCESSOR_H_
+#define ICING_INDEX_POSTING_LIST_HIT_ACCESSOR_H_
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-accessor.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/posting-list-hit-serializer.h"
+
+namespace icing {
+namespace lib {
+
+// This class provides a simple abstraction for adding hits to posting
+// lists. PostingListHitAccessor handles 1) selection of properly-sized posting
+// lists for the accumulated hits during Finalize() and 2) chaining of max-sized
+// posting lists.
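+//
+// A minimal usage sketch (assuming a valid FlashIndexStorage and
+// PostingListHitSerializer that outlive the accessor):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<PostingListHitAccessor> accessor,
+//       PostingListHitAccessor::Create(storage, serializer));
+//   ICING_RETURN_IF_ERROR(accessor->PrependHit(hit));
+//   PostingListAccessor::FinalizeResult result =
+//       std::move(*accessor).Finalize();
+//   ICING_RETURN_IF_ERROR(result.status);
+//   // result.id now identifies the flushed posting list on disk.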
+class PostingListHitAccessor : public PostingListAccessor {
+ public:
+ // Creates an empty PostingListHitAccessor.
+ //
+ // RETURNS:
+ // - On success, a valid unique_ptr instance of PostingListHitAccessor
+ // - INVALID_ARGUMENT error if storage has an invalid block_size.
+ static libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>>
+ Create(FlashIndexStorage* storage, PostingListHitSerializer* serializer);
+
+  // Creates a PostingListHitAccessor with an existing posting list identified by
+ // existing_posting_list_id.
+ //
+ // The PostingListHitAccessor will add hits to this posting list until it is
+ // necessary either to 1) chain the posting list (if it is max-sized) or 2)
+ // move its hits to a larger posting list.
+ //
+ // RETURNS:
+ // - On success, a valid unique_ptr instance of PostingListHitAccessor
+ // - INVALID_ARGUMENT if storage has an invalid block_size.
+ static libtextclassifier3::StatusOr<std::unique_ptr<PostingListHitAccessor>>
+ CreateFromExisting(FlashIndexStorage* storage,
+ PostingListHitSerializer* serializer,
+ PostingListIdentifier existing_posting_list_id);
+
+ PostingListSerializer* GetSerializer() override { return serializer_; }
+
+  // Retrieves the next batch of hits for the posting list chain.
+  //
+  // RETURNS:
+  //   - On success, a vector of hits in the posting list chain
+  //   - FAILED_PRECONDITION if called on an instance of
+  //     PostingListHitAccessor that was created via
+  //     PostingListHitAccessor::Create
+  //   - INTERNAL if unable to read the next posting list in the chain or if
+  //     the posting list has been corrupted somehow.
+ libtextclassifier3::StatusOr<std::vector<Hit>> GetNextHitsBatch();
+
+ // Prepend one hit. This may result in flushing the posting list to disk (if
+ // the PostingListHitAccessor holds a max-sized posting list that is full) or
+ // freeing a pre-existing posting list if it is too small to fit all hits
+ // necessary.
+ //
+ // RETURNS:
+ // - OK, on success
+ // - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
+ // previously added hit.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
+ // posting list.
+ libtextclassifier3::Status PrependHit(const Hit& hit);
+
+ private:
+ explicit PostingListHitAccessor(FlashIndexStorage* storage,
+ PostingListHitSerializer* serializer,
+ PostingListUsed in_memory_posting_list)
+ : PostingListAccessor(storage, std::move(in_memory_posting_list)),
+ serializer_(serializer) {}
+
+ PostingListHitSerializer* serializer_; // Does not own.
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_POSTING_LIST_HIT_ACCESSOR_H_
diff --git a/icing/index/main/posting-list-hit-accessor_test.cc b/icing/index/main/posting-list-hit-accessor_test.cc
new file mode 100644
index 0000000..1127814
--- /dev/null
+++ b/icing/index/main/posting-list-hit-accessor_test.cc
@@ -0,0 +1,366 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-hit-accessor.h"
+
+#include <cstdint>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/index-block.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/main/posting-list-hit-serializer.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/hit-test-utils.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::Lt;
+using ::testing::SizeIs;
+
+class PostingListHitAccessorTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/test_dir";
+ file_name_ = test_dir_ + "/test_file.idx.index";
+
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
+
+ serializer_ = std::make_unique<PostingListHitSerializer>();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ flash_index_storage_ =
+ std::make_unique<FlashIndexStorage>(std::move(flash_index_storage));
+ }
+
+ void TearDown() override {
+ flash_index_storage_.reset();
+ serializer_.reset();
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ Filesystem filesystem_;
+ std::string test_dir_;
+ std::string file_name_;
+ std::unique_ptr<PostingListHitSerializer> serializer_;
+ std::unique_ptr<FlashIndexStorage> flash_index_storage_;
+};
+
+TEST_F(PostingListHitAccessorTest, HitsAddAndRetrieveProperly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ PostingListHitAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Add some hits! Any hits!
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor->PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result =
+ std::move(*pl_accessor).Finalize();
+ ICING_EXPECT_OK(result.status);
+ EXPECT_THAT(result.id.block_index(), Eq(1));
+ EXPECT_THAT(result.id.posting_list_index(), Eq(0));
+
+ // Retrieve some hits.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result.id));
+ EXPECT_THAT(serializer_->GetHits(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+ EXPECT_THAT(pl_holder.next_block_index, Eq(kInvalidBlockIndex));
+}
+
+TEST_F(PostingListHitAccessorTest, PreexistingPLKeepOnSameBlock) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ PostingListHitAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Add a single hit. This will fit in a min-sized posting list.
+ Hit hit1(/*section_id=*/1, /*document_id=*/0, Hit::kDefaultTermFrequency);
+ ICING_ASSERT_OK(pl_accessor->PrependHit(hit1));
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_EXPECT_OK(result1.status);
+ // Should have been allocated to the first block.
+ EXPECT_THAT(result1.id.block_index(), Eq(1));
+ EXPECT_THAT(result1.id.posting_list_index(), Eq(0));
+
+ // Add one more hit. The minimum size for a posting list must be able to fit
+ // at least two hits, so this should NOT cause the previous pl to be
+ // reallocated.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor,
+ PostingListHitAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+ Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/1);
+ ICING_ASSERT_OK(pl_accessor->PrependHit(hit2));
+ PostingListAccessor::FinalizeResult result2 =
+ std::move(*pl_accessor).Finalize();
+ ICING_EXPECT_OK(result2.status);
+ // Should have been allocated to the same posting list as the first hit.
+ EXPECT_THAT(result2.id, Eq(result1.id));
+
+ // The posting list at result2.id should hold all of the hits that have been
+ // added.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result2.id));
+ EXPECT_THAT(serializer_->GetHits(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAre(hit2, hit1)));
+}
+
+TEST_F(PostingListHitAccessorTest, PreexistingPLReallocateToLargerPL) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ PostingListHitAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // The smallest posting list size is 15 bytes. The first four hits will be
+ // compressed to one byte each and will be able to fit in the 5 byte padded
+ // region. The last hit will fit in one of the special hits. The posting list
+ // will be ALMOST_FULL and can fit at most 2 more hits.
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor->PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_EXPECT_OK(result1.status);
+ // Should have been allocated to the first block.
+ EXPECT_THAT(result1.id.block_index(), Eq(1));
+ EXPECT_THAT(result1.id.posting_list_index(), Eq(0));
+
+ // Now let's add some more hits!
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor,
+ PostingListHitAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+ // The current posting list can fit at most 2 more hits. Adding 12 more hits
+ // should result in these hits being moved to a larger posting list.
+ std::vector<Hit> hits2 = CreateHits(
+ /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/12,
+ /*desired_byte_length=*/1);
+
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(pl_accessor->PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result2 =
+ std::move(*pl_accessor).Finalize();
+ ICING_EXPECT_OK(result2.status);
+ // Should have been allocated to the second (new) block because the posting
+ // list should have grown beyond the size that the first block maintains.
+ EXPECT_THAT(result2.id.block_index(), Eq(2));
+ EXPECT_THAT(result2.id.posting_list_index(), Eq(0));
+
+ // The posting list at result2.id should hold all of the hits that have been
+ // added.
+ for (const Hit& hit : hits2) {
+ hits1.push_back(hit);
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result2.id));
+ EXPECT_THAT(serializer_->GetHits(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+}
+
+TEST_F(PostingListHitAccessorTest, MultiBlockChainsBlocksProperly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ PostingListHitAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Add some hits! Any hits!
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor->PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_EXPECT_OK(result1.status);
+ PostingListIdentifier second_block_id = result1.id;
+ // Should have been allocated to the second block, which holds a max-sized
+ // posting list.
+ EXPECT_THAT(second_block_id, Eq(PostingListIdentifier(
+ /*block_index=*/2, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0)));
+
+ // Now let's retrieve them!
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(second_block_id));
+ // This pl_holder will only hold a posting list with the hits that didn't fit
+ // on the first block.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits,
+ serializer_->GetHits(&pl_holder.posting_list));
+ ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size())));
+ auto first_block_hits_start = hits1.rbegin() + second_block_hits.size();
+ EXPECT_THAT(second_block_hits,
+ ElementsAreArray(hits1.rbegin(), first_block_hits_start));
+
+ // Now retrieve all of the hits that were on the first block.
+ uint32_t first_block_id = pl_holder.next_block_index;
+ EXPECT_THAT(first_block_id, Eq(1));
+
+ PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0);
+ ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
+ flash_index_storage_->GetPostingList(pl_id));
+ EXPECT_THAT(
+ serializer_->GetHits(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend())));
+}
+
+TEST_F(PostingListHitAccessorTest, PreexistingMultiBlockReusesBlocksProperly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ PostingListHitAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Add some hits! Any hits!
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5000, /*desired_byte_length=*/1);
+ for (const Hit& hit : hits1) {
+ ICING_ASSERT_OK(pl_accessor->PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_EXPECT_OK(result1.status);
+ PostingListIdentifier first_add_id = result1.id;
+ EXPECT_THAT(first_add_id, Eq(PostingListIdentifier(
+ /*block_index=*/2, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0)));
+
+ // Now add a couple more hits. These should fit on the existing, not full
+ // second block.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor,
+ PostingListHitAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), first_add_id));
+ std::vector<Hit> hits2 = CreateHits(
+ /*start_docid=*/hits1.back().document_id() + 1, /*num_hits=*/50,
+ /*desired_byte_length=*/1);
+
+ for (const Hit& hit : hits2) {
+ ICING_ASSERT_OK(pl_accessor->PrependHit(hit));
+ }
+ PostingListAccessor::FinalizeResult result2 =
+ std::move(*pl_accessor).Finalize();
+ ICING_EXPECT_OK(result2.status);
+ PostingListIdentifier second_add_id = result2.id;
+ EXPECT_THAT(second_add_id, Eq(first_add_id));
+
+ // We should be able to retrieve all 5050 hits.
+ for (const Hit& hit : hits2) {
+ hits1.push_back(hit);
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(second_add_id));
+ // This pl_holder will only hold a posting list with the hits that didn't fit
+ // on the first block.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> second_block_hits,
+ serializer_->GetHits(&pl_holder.posting_list));
+ ASSERT_THAT(second_block_hits, SizeIs(Lt(hits1.size())));
+ auto first_block_hits_start = hits1.rbegin() + second_block_hits.size();
+ EXPECT_THAT(second_block_hits,
+ ElementsAreArray(hits1.rbegin(), first_block_hits_start));
+
+ // Now retrieve all of the hits that were on the first block.
+ uint32_t first_block_id = pl_holder.next_block_index;
+ EXPECT_THAT(first_block_id, Eq(1));
+
+ PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0);
+ ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
+ flash_index_storage_->GetPostingList(pl_id));
+ EXPECT_THAT(
+ serializer_->GetHits(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAreArray(first_block_hits_start, hits1.rend())));
+}
+
+TEST_F(PostingListHitAccessorTest, InvalidHitReturnsInvalidArgument) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ PostingListHitAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ Hit invalid_hit;
+ EXPECT_THAT(pl_accessor->PrependHit(invalid_hit),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PostingListHitAccessorTest, HitsNotDecreasingReturnsInvalidArgument) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ PostingListHitAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency);
+ ICING_ASSERT_OK(pl_accessor->PrependHit(hit1));
+
+ Hit hit2(/*section_id=*/6, /*document_id=*/1, Hit::kDefaultTermFrequency);
+ EXPECT_THAT(pl_accessor->PrependHit(hit2),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ Hit hit3(/*section_id=*/2, /*document_id=*/0, Hit::kDefaultTermFrequency);
+ EXPECT_THAT(pl_accessor->PrependHit(hit3),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PostingListHitAccessorTest, NewPostingListNoHitsAdded) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ PostingListHitAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ EXPECT_THAT(result1.status,
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PostingListHitAccessorTest, PreexistingPostingListNoHitsAdded) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor,
+ PostingListHitAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ Hit hit1(/*section_id=*/3, /*document_id=*/1, Hit::kDefaultTermFrequency);
+ ICING_ASSERT_OK(pl_accessor->PrependHit(hit1));
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result1.status);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListHitAccessor> pl_accessor2,
+ PostingListHitAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+ PostingListAccessor::FinalizeResult result2 =
+ std::move(*pl_accessor2).Finalize();
+ ICING_ASSERT_OK(result2.status);
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-hit-serializer.cc b/icing/index/main/posting-list-hit-serializer.cc
new file mode 100644
index 0000000..e14a0c0
--- /dev/null
+++ b/icing/index/main/posting-list-hit-serializer.cc
@@ -0,0 +1,714 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-hit-serializer.h"
+
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <vector>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/index/icing-bit-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+uint32_t GetTermFrequencyByteSize(const Hit& hit) {
+ return hit.has_term_frequency() ? sizeof(Hit::TermFrequency) : 0;
+}
+
+} // namespace
+
+uint32_t PostingListHitSerializer::GetBytesUsed(
+ const PostingListUsed* posting_list_used) const {
+ // The special hits will be included if they represent actual hits. If they
+ // represent the hit offset or the invalid hit sentinel, they are not
+ // included.
+ return posting_list_used->size_in_bytes() -
+ GetStartByteOffset(posting_list_used);
+}
+
+uint32_t PostingListHitSerializer::GetMinPostingListSizeToFit(
+ const PostingListUsed* posting_list_used) const {
+ if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) {
+ // If in either the FULL state or ALMOST_FULL state, this posting list *is*
+ // the minimum size posting list that can fit these hits. So just return the
+ // size of the posting list.
+ return posting_list_used->size_in_bytes();
+ }
+
+ // In NOT_FULL status BytesUsed contains no special hits. The minimum sized
+ // posting list that would be guaranteed to fit these hits would be
+ // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in
+ // special_hit(1) and the n compressed hits in the compressed region.
+ // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore,
+ // fitting these hits into a posting list would require BytesUsed plus one
+ // extra hit.
+ return GetBytesUsed(posting_list_used) + sizeof(Hit);
+}
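+
+// A worked example of the NOT_FULL case above, using sizeof(Hit) == 5 as
+// noted elsewhere in this file: if GetBytesUsed() reports 12 bytes (one
+// uncompressed hit plus some compressed hits), the minimum posting list
+// guaranteed to fit them is 12 + sizeof(Hit) = 17 bytes.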
+
+void PostingListHitSerializer::Clear(PostingListUsed* posting_list_used) const {
+ // Safe to ignore return value because posting_list_used->size_in_bytes() is
+ // a valid argument.
+ SetStartByteOffset(posting_list_used,
+ /*offset=*/posting_list_used->size_in_bytes());
+}
+
+libtextclassifier3::Status PostingListHitSerializer::MoveFrom(
+ PostingListUsed* dst, PostingListUsed* src) const {
+ ICING_RETURN_ERROR_IF_NULL(dst);
+ ICING_RETURN_ERROR_IF_NULL(src);
+ if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "src MinPostingListSizeToFit %d must be larger than size %d.",
+ GetMinPostingListSizeToFit(src), dst->size_in_bytes()));
+ }
+
+ if (!IsPostingListValid(dst)) {
+ return absl_ports::FailedPreconditionError(
+ "Dst posting list is in an invalid state and can't be used!");
+ }
+ if (!IsPostingListValid(src)) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot MoveFrom an invalid src posting list!");
+ }
+
+ // Pop just enough hits that all of src's compressed hits fit in
+ // dst posting_list's compressed area. Then we can memcpy that area.
+ std::vector<Hit> hits;
+ while (IsFull(src) || IsAlmostFull(src) ||
+ (dst->size_in_bytes() - kSpecialHitsSize < GetBytesUsed(src))) {
+ if (!GetHitsInternal(src, /*limit=*/1, /*pop=*/true, &hits).ok()) {
+ return absl_ports::AbortedError(
+ "Unable to retrieve hits from src posting list.");
+ }
+ }
+
+ // memcpy the area and set up start byte offset.
+ Clear(dst);
+ memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src),
+ src->posting_list_buffer() + GetStartByteOffset(src),
+ GetBytesUsed(src));
+  // Because we popped all hits from src outside of the compressed area and
+  // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() -
+  // kSpecialHitsSize, this is guaranteed to be a valid byte offset for the
+  // NOT_FULL state, so ignoring the value is safe.
+ SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src));
+
+ // Put back remaining hits.
+ for (size_t i = 0; i < hits.size(); i++) {
+ const Hit& hit = hits[hits.size() - i - 1];
+ // PrependHit can return either INVALID_ARGUMENT - if hit is invalid or not
+ // less than the previous hit - or RESOURCE_EXHAUSTED. RESOURCE_EXHAUSTED
+ // should be impossible because we've already assured that there is enough
+ // room above.
+ ICING_RETURN_IF_ERROR(PrependHit(dst, hit));
+ }
+
+ Clear(src);
+ return libtextclassifier3::Status::OK;
+}
+
+uint32_t PostingListHitSerializer::GetPadEnd(
+ const PostingListUsed* posting_list_used, uint32_t offset) const {
+ Hit::Value pad;
+ uint32_t pad_end = offset;
+ while (pad_end < posting_list_used->size_in_bytes()) {
+ size_t pad_len = VarInt::Decode(
+ posting_list_used->posting_list_buffer() + pad_end, &pad);
+ if (pad != 0) {
+ // No longer a pad.
+ break;
+ }
+ pad_end += pad_len;
+ }
+ return pad_end;
+}
+
+bool PostingListHitSerializer::PadToEnd(PostingListUsed* posting_list_used,
+ uint32_t start, uint32_t end) const {
+ if (end > posting_list_used->size_in_bytes()) {
+ ICING_LOG(ERROR) << "Cannot pad a region that ends after size!";
+ return false;
+ }
+ // In VarInt a value of 0 encodes to 0.
+ memset(posting_list_used->posting_list_buffer() + start, 0, end - start);
+ return true;
+}
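+
+// Padding sketch: VarInt encodes the value 0 as a single 0x00 byte, so a
+// padded region is just a run of zero bytes, and GetPadEnd() above walks it
+// by decoding until the first nonzero lead byte. For example:
+//
+//   bytes:  [0x00][0x00][0x00][0x8A 0x01]...  // 3 pad bytes, then a varint
+//   GetPadEnd(posting_list_used, offset) == offset + 3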
+
+libtextclassifier3::Status PostingListHitSerializer::PrependHitToAlmostFull(
+ PostingListUsed* posting_list_used, const Hit& hit) const {
+ // Get delta between first hit and the new hit. Try to fit delta
+ // in the padded area and put new hit at the special position 1.
+ // Calling ValueOrDie is safe here because 1 < kNumSpecialData.
+ Hit cur = GetSpecialHit(posting_list_used, /*index=*/1).ValueOrDie();
+ if (cur.value() <= hit.value()) {
+ return absl_ports::InvalidArgumentError(
+ "Hit being prepended must be strictly less than the most recent Hit");
+ }
+ uint64_t delta = cur.value() - hit.value();
+ uint8_t delta_buf[VarInt::kMaxEncodedLen64];
+ size_t delta_len = VarInt::Encode(delta, delta_buf);
+ uint32_t cur_term_frequency_bytes = GetTermFrequencyByteSize(cur);
+
+ uint32_t pad_end = GetPadEnd(posting_list_used,
+ /*offset=*/kSpecialHitsSize);
+
+ if (pad_end >= kSpecialHitsSize + delta_len + cur_term_frequency_bytes) {
+ // Pad area has enough space for delta and term_frequency of existing hit
+ // (cur). Write delta at pad_end - delta_len - cur_term_frequency_bytes.
+ uint8_t* delta_offset = posting_list_used->posting_list_buffer() + pad_end -
+ delta_len - cur_term_frequency_bytes;
+ memcpy(delta_offset, delta_buf, delta_len);
+ // Now copy term_frequency.
+ Hit::TermFrequency term_frequency = cur.term_frequency();
+ uint8_t* term_frequency_offset = delta_offset + delta_len;
+ memcpy(term_frequency_offset, &term_frequency, cur_term_frequency_bytes);
+
+ // Now first hit is the new hit, at special position 1. Safe to ignore the
+ // return value because 1 < kNumSpecialData.
+ SetSpecialHit(posting_list_used, /*index=*/1, hit);
+ // Safe to ignore the return value because sizeof(Hit) is a valid argument.
+ SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
+ } else {
+ // No space for delta. We put the new hit at special position 0
+ // and go to the full state. Safe to ignore the return value because 1 <
+ // kNumSpecialData.
+ SetSpecialHit(posting_list_used, /*index=*/0, hit);
+ }
+ return libtextclassifier3::Status::OK;
+}
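+
+// A worked delta example for the ALMOST_FULL path above (values are
+// illustrative): if the current first hit has value 100 and the new hit has
+// value 97, then delta = 100 - 97 = 3, which VarInt-encodes into a single
+// byte written at the end of the pad area; the new hit then takes over
+// special position 1.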
+
+void PostingListHitSerializer::PrependHitToEmpty(
+ PostingListUsed* posting_list_used, const Hit& hit) const {
+ // First hit to be added. Just add verbatim, no compression.
+ if (posting_list_used->size_in_bytes() == kSpecialHitsSize) {
+ // Safe to ignore the return value because 1 < kNumSpecialData
+ SetSpecialHit(posting_list_used, /*index=*/1, hit);
+ // Safe to ignore the return value because sizeof(Hit) is a valid argument.
+ SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
+ } else {
+ // Since this is the first hit, size != kSpecialHitsSize and
+ // size % sizeof(Hit) == 0, we know that there is room to fit 'hit' into
+ // the compressed region, so ValueOrDie is safe.
+ uint32_t offset =
+ PrependHitUncompressed(posting_list_used, hit,
+ /*offset=*/posting_list_used->size_in_bytes())
+ .ValueOrDie();
+ // Safe to ignore the return value because PrependHitUncompressed is
+ // guaranteed to return a valid offset.
+ SetStartByteOffset(posting_list_used, offset);
+ }
+}
+
+libtextclassifier3::Status PostingListHitSerializer::PrependHitToNotFull(
+ PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
+ // First hit in compressed area. It is uncompressed. See if delta
+ // between the first hit and new hit will still fit in the
+ // compressed area.
+ if (offset + sizeof(Hit::Value) > posting_list_used->size_in_bytes()) {
+ // The first hit in the compressed region *should* be uncompressed, but
+ // somehow there isn't enough room between offset and the end of the
+ // compressed area to fit an uncompressed hit. This should NEVER happen.
+ return absl_ports::FailedPreconditionError(
+ "Posting list is in an invalid state.");
+ }
+ Hit::Value cur_value;
+ memcpy(&cur_value, posting_list_used->posting_list_buffer() + offset,
+ sizeof(Hit::Value));
+ if (cur_value <= hit.value()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Hit %d being prepended must be strictly less than the most recent "
+ "Hit %d",
+ hit.value(), cur_value));
+ }
+ uint64_t delta = cur_value - hit.value();
+ uint8_t delta_buf[VarInt::kMaxEncodedLen64];
+ size_t delta_len = VarInt::Encode(delta, delta_buf);
+ uint32_t hit_term_frequency_bytes = GetTermFrequencyByteSize(hit);
+
+ // offset now points to one past the end of the first hit.
+ offset += sizeof(Hit::Value);
+ if (kSpecialHitsSize + sizeof(Hit::Value) + delta_len +
+ hit_term_frequency_bytes <=
+ offset) {
+ // Enough space for delta in compressed area.
+
+ // Prepend delta.
+ offset -= delta_len;
+ memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
+ delta_len);
+
+ // Prepend new hit with (possibly) its term_frequency. We know that there is
+ // room for 'hit' because of the if statement above, so calling ValueOrDie
+ // is safe.
+ offset =
+ PrependHitUncompressed(posting_list_used, hit, offset).ValueOrDie();
+    // offset is guaranteed to be valid here. So it's safe to ignore the return
+    // value. The if above will guarantee that offset >= kSpecialHitsSize and <
+    // posting_list_used->size_in_bytes() because the if ensures that there is
+    // enough room between offset and kSpecialHitsSize to fit the delta of the
+    // previous hit, any term_frequency and the uncompressed hit.
+ SetStartByteOffset(posting_list_used, offset);
+ } else if (kSpecialHitsSize + delta_len <= offset) {
+ // Only have space for delta. The new hit must be put in special
+ // position 1.
+
+ // Prepend delta.
+ offset -= delta_len;
+ memcpy(posting_list_used->posting_list_buffer() + offset, delta_buf,
+ delta_len);
+
+ // Prepend pad. Safe to ignore the return value of PadToEnd because offset
+ // must be less than posting_list_used->size_in_bytes(). Otherwise, this
+ // function already would have returned FAILED_PRECONDITION.
+ PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize,
+ /*end=*/offset);
+
+ // Put new hit in special position 1. Safe to ignore return value because 1
+ // < kNumSpecialData.
+ SetSpecialHit(posting_list_used, /*index=*/1, hit);
+
+ // State almost_full. Safe to ignore the return value because sizeof(Hit) is
+ // a valid argument.
+ SetStartByteOffset(posting_list_used, /*offset=*/sizeof(Hit));
+ } else {
+ // Very rare case where delta is larger than sizeof(Hit::Value)
+ // (i.e. varint delta encoding expanded required storage). We
+ // move first hit to special position 1 and put new hit in
+ // special position 0.
+ Hit cur(cur_value);
+    // offset is < kSpecialHitsSize + delta_len. delta_len is at most 5 bytes.
+    // Therefore, offset must be less than kSpecialHitsSize + 5. Since posting
+    // list size must be divisible by sizeof(Hit) (5), it is guaranteed that
+    // offset < size_in_bytes.
+ ICING_RETURN_IF_ERROR(
+ ConsumeTermFrequencyIfPresent(posting_list_used, &cur, &offset));
+ // Safe to ignore the return value of PadToEnd because offset must be less
+ // than posting_list_used->size_in_bytes(). Otherwise, this function
+ // already would have returned FAILED_PRECONDITION.
+ PadToEnd(posting_list_used, /*start=*/kSpecialHitsSize,
+ /*end=*/offset);
+ // Safe to ignore the return value here because 0 and 1 < kNumSpecialData.
+ SetSpecialHit(posting_list_used, /*index=*/1, cur);
+ SetSpecialHit(posting_list_used, /*index=*/0, hit);
+ }
+ return libtextclassifier3::Status::OK;
+}
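+
+// A sketch of the three NOT_FULL outcomes implemented above:
+//
+//   room for delta + uncompressed new hit -> stays NOT_FULL
+//   room for delta only                   -> new hit goes to special
+//                                            position 1, becomes ALMOST_FULL
+//   no room even for the delta            -> old first hit moves to special
+//                                            position 1, new hit to special
+//                                            position 0, becomes FULL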
+
+libtextclassifier3::Status PostingListHitSerializer::PrependHit(
+ PostingListUsed* posting_list_used, const Hit& hit) const {
+ static_assert(sizeof(Hit::Value) <= sizeof(uint64_t),
+ "Hit::Value cannot be larger than 8 bytes because the delta "
+ "must be able to fit in 8 bytes.");
+ if (!hit.is_valid()) {
+ return absl_ports::InvalidArgumentError("Cannot prepend an invalid hit!");
+ }
+ if (!IsPostingListValid(posting_list_used)) {
+ return absl_ports::FailedPreconditionError(
+ "This PostingListUsed is in an invalid state and can't add any hits!");
+ }
+
+ if (IsFull(posting_list_used)) {
+ // State full: no space left.
+ return absl_ports::ResourceExhaustedError("No more room for hits");
+ } else if (IsAlmostFull(posting_list_used)) {
+ return PrependHitToAlmostFull(posting_list_used, hit);
+ } else if (IsEmpty(posting_list_used)) {
+ PrependHitToEmpty(posting_list_used, hit);
+ return libtextclassifier3::Status::OK;
+ } else {
+ uint32_t offset = GetStartByteOffset(posting_list_used);
+ return PrependHitToNotFull(posting_list_used, hit, offset);
+ }
+}
+
+libtextclassifier3::StatusOr<std::vector<Hit>>
+PostingListHitSerializer::GetHits(
+ const PostingListUsed* posting_list_used) const {
+ std::vector<Hit> hits_out;
+ ICING_RETURN_IF_ERROR(GetHits(posting_list_used, &hits_out));
+ return hits_out;
+}
+
+libtextclassifier3::Status PostingListHitSerializer::GetHits(
+ const PostingListUsed* posting_list_used,
+ std::vector<Hit>* hits_out) const {
+ return GetHitsInternal(posting_list_used,
+ /*limit=*/std::numeric_limits<uint32_t>::max(),
+ /*pop=*/false, hits_out);
+}
+
+libtextclassifier3::Status PostingListHitSerializer::PopFrontHits(
+ PostingListUsed* posting_list_used, uint32_t num_hits) const {
+ if (num_hits == 1 && IsFull(posting_list_used)) {
+    // The PL is in the full state, which means that we store 2 uncompressed
+    // hits in the 2 special positions. But the full state may be reached from
+    // 2 different states.
+ // (1) In "almost full" status
+ // +-----------------+----------------+-------+-----------------+
+ // |Hit::kInvalidVal |1st hit |(pad) |(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ // When we prepend another hit, we can only put it at the special
+ // position 0. And we get a full PL
+ // +-----------------+----------------+-------+-----------------+
+ // |new 1st hit |original 1st hit|(pad) |(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ // (2) In "not full" status
+ // +-----------------+----------------+------+-------+------------------+
+ // |hits-start-offset|Hit::kInvalidVal|(pad) |1st hit|(compressed) hits |
+ // +-----------------+----------------+------+-------+------------------+
+ // When we prepend another hit, we can reach any of the 3 following
+ // scenarios:
+ // (2.1) not full
+ // if the space of pad and original 1st hit can accommodate the new 1st hit
+ // and the encoded delta value.
+ // +-----------------+----------------+------+-----------+-----------------+
+ // |hits-start-offset|Hit::kInvalidVal|(pad) |new 1st hit|(compressed) hits|
+ // +-----------------+----------------+------+-----------+-----------------+
+ // (2.2) almost full
+ // If the space of pad and original 1st hit cannot accommodate the new 1st
+ // hit and the encoded delta value but can accommodate the encoded delta
+ // value only. We can put the new 1st hit at special position 1.
+ // +-----------------+----------------+-------+-----------------+
+ // |Hit::kInvalidVal |new 1st hit |(pad) |(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ // (2.3) full
+    // In a very rare case, it cannot accommodate even the encoded delta
+    // value alone. We can move the original 1st hit into special position 1
+    // and the new 1st hit into special position 0. This may happen because
+    // VarInt encoding stores 7 payload bits per byte, so the encoded value
+    // may be longer than the original (up to 8/7 times as long).
+ // +-----------------+----------------+-------+-----------------+
+ // |new 1st hit |original 1st hit|(pad) |(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+    // Suppose the PL is now full, but we don't know whether it arrived at
+    // this state from "not full" like (2.3) or from "almost full" like (1).
+    // If we simply popped the new 1st hit we would always return to "almost
+    // full" like (1), but we want the prepending operation to be reversible,
+    // so there should be some way to return to "not full" when possible. A
+    // simple way to do it is to pop 2 hits out of the PL (reaching "almost
+    // full" or "not full") and then prepend the original 1st hit back. This
+    // returns us to the correct original state of (2.1) or (1), making the
+    // prepending operation reversible.
+ std::vector<Hit> out;
+
+ // Popping 2 hits should never fail because we've just ensured that the
+ // posting list is in the FULL state.
+ ICING_RETURN_IF_ERROR(
+ GetHitsInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out));
+
+ // PrependHit should never fail because out[1] is a valid hit less than
+ // previous hits in the posting list and because there's no way that the
+ // posting list could run out of room because it previously stored this hit
+ // AND another hit.
+ ICING_RETURN_IF_ERROR(PrependHit(posting_list_used, out[1]));
+ } else if (num_hits > 0) {
+ return GetHitsInternal(posting_list_used, /*limit=*/num_hits, /*pop=*/true,
+ nullptr);
+ }
+ return libtextclassifier3::Status::OK;
+}
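+
+// A minimal sketch of the reversibility contract described above: prepending
+// a hit and then popping one hit should return the posting list to its
+// original state.
+//
+//   uint32_t bytes_before = GetBytesUsed(pl);
+//   ICING_RETURN_IF_ERROR(PrependHit(pl, hit));
+//   ICING_RETURN_IF_ERROR(PopFrontHits(pl, /*num_hits=*/1));
+//   // GetBytesUsed(pl) == bytes_before and GetHits(pl) is unchanged.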
+
+libtextclassifier3::Status PostingListHitSerializer::GetHitsInternal(
+ const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
+ std::vector<Hit>* out) const {
+ // Put current uncompressed val here.
+ Hit::Value val = Hit::kInvalidValue;
+ uint32_t offset = GetStartByteOffset(posting_list_used);
+ uint32_t count = 0;
+
+ // First traverse the first two special positions.
+ while (count < limit && offset < kSpecialHitsSize) {
+ // Calling ValueOrDie is safe here because offset / sizeof(Hit) <
+ // kNumSpecialData because of the check above.
+ Hit hit = GetSpecialHit(posting_list_used, /*index=*/offset / sizeof(Hit))
+ .ValueOrDie();
+ val = hit.value();
+ if (out != nullptr) {
+ out->push_back(hit);
+ }
+ offset += sizeof(Hit);
+ count++;
+ }
+
+ // If special position 1 was set then we need to skip padding.
+ if (val != Hit::kInvalidValue && offset == kSpecialHitsSize) {
+ offset = GetPadEnd(posting_list_used, offset);
+ }
+
+ while (count < limit && offset < posting_list_used->size_in_bytes()) {
+ if (val == Hit::kInvalidValue) {
+ // First hit is in compressed area. Put that in val.
+ memcpy(&val, posting_list_used->posting_list_buffer() + offset,
+ sizeof(Hit::Value));
+ offset += sizeof(Hit::Value);
+ } else {
+ // Now we have delta encoded subsequent hits. Decode and push.
+ uint64_t delta;
+ offset += VarInt::Decode(
+ posting_list_used->posting_list_buffer() + offset, &delta);
+ val += delta;
+ }
+ Hit hit(val);
+ libtextclassifier3::Status status =
+ ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset);
+ if (!status.ok()) {
+ // This posting list has been corrupted somehow. The first hit of the
+ // posting list claims to have a term frequency, but there's no more room
+      // in the posting list for that term frequency to exist. Clear any hits
+      // gathered so far and return an error.
+ if (out != nullptr) {
+ out->clear();
+ }
+ return absl_ports::InternalError("Posting list has been corrupted!");
+ }
+ if (out != nullptr) {
+ out->push_back(hit);
+ }
+ count++;
+ }
+
+ if (pop) {
+ PostingListUsed* mutable_posting_list_used =
+ const_cast<PostingListUsed*>(posting_list_used);
+ // Modify the posting list so that we pop all hits actually
+ // traversed.
+ if (offset >= kSpecialHitsSize &&
+ offset < posting_list_used->size_in_bytes()) {
+ // In the compressed area. Pop and reconstruct. offset/val is
+ // the last traversed hit, which we must discard. So move one
+ // more forward.
+ uint64_t delta;
+ offset += VarInt::Decode(
+ posting_list_used->posting_list_buffer() + offset, &delta);
+ val += delta;
+
+ // Now val is the first hit of the new posting list.
+ if (kSpecialHitsSize + sizeof(Hit::Value) <= offset) {
+ // val fits in compressed area. Simply copy.
+ offset -= sizeof(Hit::Value);
+ memcpy(mutable_posting_list_used->posting_list_buffer() + offset, &val,
+ sizeof(Hit::Value));
+ } else {
+ // val won't fit in compressed area. Also see if there is a
+ // term_frequency.
+ Hit hit(val);
+ libtextclassifier3::Status status =
+ ConsumeTermFrequencyIfPresent(posting_list_used, &hit, &offset);
+ if (!status.ok()) {
+ // This posting list has been corrupted somehow. The first hit of
+ // the posting list claims to have a term frequency, but there's no
+ // more room in the posting list for that term frequency to exist.
+          // Clear any hits gathered so far and return an error. Do not pop
+          // anything.
+ if (out != nullptr) {
+ out->clear();
+ }
+ return absl_ports::InternalError("Posting list has been corrupted!");
+ }
+ // Okay to ignore the return value here because 1 < kNumSpecialData.
+ SetSpecialHit(mutable_posting_list_used, /*index=*/1, hit);
+
+ // Prepend pad. Safe to ignore the return value of PadToEnd because
+ // offset must be less than posting_list_used->size_in_bytes() thanks to
+ // the if above.
+ PadToEnd(mutable_posting_list_used,
+ /*start=*/kSpecialHitsSize,
+ /*end=*/offset);
+ offset = sizeof(Hit);
+ }
+ }
+ // offset is guaranteed to be valid so ignoring the return value of
+    // SetStartByteOffset is safe. It falls into one of four scenarios:
+ // Scenario 1: the above if was false because offset is not <
+ // posting_list_used->size_in_bytes()
+ // In this case, offset must be == posting_list_used->size_in_bytes()
+ // because we reached offset by unwinding hits on the posting list.
+    // Scenario 2: offset is < kSpecialHitsSize
+    //   In this case, offset is guaranteed to be either 0 or sizeof(Hit)
+    //   because offset is incremented by sizeof(Hit) within the first while
+    //   loop.
+    // Scenario 3: offset is within the compressed region and the new first hit
+    //   in the posting list (the value that 'val' holds) will fit as an
+    //   uncompressed hit in the compressed region. The resulting offset from
+    //   decompressing val must be >= kSpecialHitsSize because otherwise we'd
+    //   be in Scenario 4.
+    // Scenario 4: offset is within the compressed region, but the new first
+    //   hit in the posting list is too large to fit as an uncompressed hit in
+    //   the compressed region. Therefore, it must be stored in a special hit
+    //   and offset will be sizeof(Hit).
+ SetStartByteOffset(mutable_posting_list_used, offset);
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<Hit> PostingListHitSerializer::GetSpecialHit(
+ const PostingListUsed* posting_list_used, uint32_t index) const {
+ static_assert(sizeof(Hit::Value) >= sizeof(uint32_t), "HitTooSmall");
+  if (index >= kNumSpecialData) {
+ return absl_ports::InvalidArgumentError(
+ "Special hits only exist at indices 0 and 1");
+ }
+ Hit val;
+ memcpy(&val, posting_list_used->posting_list_buffer() + index * sizeof(val),
+ sizeof(val));
+ return val;
+}
+
+bool PostingListHitSerializer::SetSpecialHit(PostingListUsed* posting_list_used,
+ uint32_t index,
+ const Hit& val) const {
+  if (index >= kNumSpecialData) {
+ ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1";
+ return false;
+ }
+ memcpy(posting_list_used->posting_list_buffer() + index * sizeof(val), &val,
+ sizeof(val));
+ return true;
+}
+
+bool PostingListHitSerializer::IsPostingListValid(
+ const PostingListUsed* posting_list_used) const {
+ if (IsAlmostFull(posting_list_used)) {
+ // Special Hit 1 should hold a Hit. Calling ValueOrDie is safe because we
+ // know that 1 < kNumSpecialData.
+ if (!GetSpecialHit(posting_list_used, /*index=*/1)
+ .ValueOrDie()
+ .is_valid()) {
+ ICING_LOG(ERROR)
+ << "Both special hits cannot be invalid at the same time.";
+ return false;
+ }
+ } else if (!IsFull(posting_list_used)) {
+ // NOT_FULL. Special Hit 0 should hold a valid offset. Calling ValueOrDie is
+ // safe because we know that 0 < kNumSpecialData.
+ if (GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() >
+ posting_list_used->size_in_bytes() ||
+ GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() <
+ kSpecialHitsSize) {
+ ICING_LOG(ERROR)
+ << "Hit: "
+ << GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value()
+ << " size: " << posting_list_used->size_in_bytes()
+ << " sp size: " << kSpecialHitsSize;
+ return false;
+ }
+ }
+ return true;
+}
+
+uint32_t PostingListHitSerializer::GetStartByteOffset(
+ const PostingListUsed* posting_list_used) const {
+ if (IsFull(posting_list_used)) {
+ return 0;
+ } else if (IsAlmostFull(posting_list_used)) {
+ return sizeof(Hit);
+ } else {
+ // NOT_FULL, calling ValueOrDie is safe because we know that 0 <
+ // kNumSpecialData.
+ return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value();
+ }
+}
+
+bool PostingListHitSerializer::SetStartByteOffset(
+ PostingListUsed* posting_list_used, uint32_t offset) const {
+ if (offset > posting_list_used->size_in_bytes()) {
+ ICING_LOG(ERROR) << "offset cannot be a value greater than size "
+ << posting_list_used->size_in_bytes() << ". offset is "
+ << offset << ".";
+ return false;
+ }
+ if (offset < kSpecialHitsSize && offset > sizeof(Hit)) {
+ ICING_LOG(ERROR) << "offset cannot be a value between (" << sizeof(Hit)
+ << ", " << kSpecialHitsSize << "). offset is " << offset
+ << ".";
+ return false;
+ }
+ if (offset < sizeof(Hit) && offset != 0) {
+ ICING_LOG(ERROR) << "offset cannot be a value between (0, " << sizeof(Hit)
+ << "). offset is " << offset << ".";
+ return false;
+ }
+ if (offset >= kSpecialHitsSize) {
+ // not_full state. Safe to ignore the return value because 0 and 1 are both
+ // < kNumSpecialData.
+ SetSpecialHit(posting_list_used, /*index=*/0, Hit(offset));
+ SetSpecialHit(posting_list_used, /*index=*/1, Hit());
+ } else if (offset == sizeof(Hit)) {
+    // almost_full state. Safe to ignore the return value because 0 <
+    // kNumSpecialData.
+ SetSpecialHit(posting_list_used, /*index=*/0, Hit());
+ }
+ // Nothing to do for the FULL state - the offset isn't actually stored
+ // anywhere and both special hits hold valid hits.
+ return true;
+}
+
+libtextclassifier3::StatusOr<uint32_t>
+PostingListHitSerializer::PrependHitUncompressed(
+ PostingListUsed* posting_list_used, const Hit& hit, uint32_t offset) const {
+ if (hit.has_term_frequency()) {
+ if (offset < kSpecialHitsSize + sizeof(Hit)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Not enough room to prepend Hit at offset %d.", offset));
+ }
+ offset -= sizeof(Hit);
+ memcpy(posting_list_used->posting_list_buffer() + offset, &hit,
+ sizeof(Hit));
+ } else {
+ if (offset < kSpecialHitsSize + sizeof(Hit::Value)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Not enough room to prepend Hit::Value at offset %d.", offset));
+ }
+ offset -= sizeof(Hit::Value);
+ Hit::Value val = hit.value();
+ memcpy(posting_list_used->posting_list_buffer() + offset, &val,
+ sizeof(Hit::Value));
+ }
+ return offset;
+}
+
+libtextclassifier3::Status
+PostingListHitSerializer::ConsumeTermFrequencyIfPresent(
+ const PostingListUsed* posting_list_used, Hit* hit,
+ uint32_t* offset) const {
+ if (!hit->has_term_frequency()) {
+ // No term frequency to consume. Everything is fine.
+ return libtextclassifier3::Status::OK;
+ }
+ if (*offset + sizeof(Hit::TermFrequency) >
+ posting_list_used->size_in_bytes()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "offset %d must not point past the end of the posting list of size %d.",
+ *offset, posting_list_used->size_in_bytes()));
+ }
+ Hit::TermFrequency term_frequency;
+ memcpy(&term_frequency, posting_list_used->posting_list_buffer() + *offset,
+ sizeof(Hit::TermFrequency));
+ *hit = Hit(hit->value(), term_frequency);
+ *offset += sizeof(Hit::TermFrequency);
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/main/posting-list-hit-serializer.h b/icing/index/main/posting-list-hit-serializer.h
new file mode 100644
index 0000000..2986d9c
--- /dev/null
+++ b/icing/index/main/posting-list-hit-serializer.h
@@ -0,0 +1,345 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
+#define ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
+
+#include <cstdint>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/hit/hit.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+// A serializer class to serialize hits to PostingListUsed. The layout is
+// described in the comments below.
+class PostingListHitSerializer : public PostingListSerializer {
+ public:
+ static constexpr uint32_t kSpecialHitsSize = kNumSpecialData * sizeof(Hit);
+
+ uint32_t GetDataTypeBytes() const override { return sizeof(Hit); }
+
+ uint32_t GetMinPostingListSize() const override {
+ static constexpr uint32_t kMinPostingListSize = kSpecialHitsSize;
+ static_assert(sizeof(PostingListIndex) <= kMinPostingListSize,
+ "PostingListIndex must be small enough to fit in a "
+ "minimum-sized Posting List.");
+
+ return kMinPostingListSize;
+ }
+
+ uint32_t GetMinPostingListSizeToFit(
+ const PostingListUsed* posting_list_used) const override;
+
+ uint32_t GetBytesUsed(
+ const PostingListUsed* posting_list_used) const override;
+
+ void Clear(PostingListUsed* posting_list_used) const override;
+
+ libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
+ PostingListUsed* src) const override;
+
+ // Prepend a hit to the posting list.
+ //
+ // RETURNS:
+ // - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
+ // previously added hit.
+ // - RESOURCE_EXHAUSTED if there is no more room to add hit to the posting
+ // list.
+ libtextclassifier3::Status PrependHit(PostingListUsed* posting_list_used,
+ const Hit& hit) const;
+
+  // Prepend hits to the posting list. Hits should be sorted in descending
+  // order (as defined by the less than operator for Hit).
+ //
+ // Returns the number of hits that could be prepended to the posting list. If
+ // keep_prepended is true, whatever could be prepended is kept, otherwise the
+ // posting list is left in its original state.
+ template <class T, Hit (*GetHit)(const T&)>
+ libtextclassifier3::StatusOr<uint32_t> PrependHitArray(
+ PostingListUsed* posting_list_used, const T* array, uint32_t num_hits,
+ bool keep_prepended) const;
+
+ // Retrieves the hits stored in the posting list.
+ //
+ // RETURNS:
+ // - On success, a vector of hits sorted by the reverse order of prepending.
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::StatusOr<std::vector<Hit>> GetHits(
+ const PostingListUsed* posting_list_used) const;
+
+ // Same as GetHits but appends hits to hits_out.
+ //
+ // RETURNS:
+  //   - OK on success; hits are appended to hits_out in the reverse order
+  //     of prepending.
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::Status GetHits(const PostingListUsed* posting_list_used,
+ std::vector<Hit>* hits_out) const;
+
+  // Undo the last num_hits hits prepended. If num_hits exceeds the number of
+  // hits on the posting list, all hits are cleared.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::Status PopFrontHits(PostingListUsed* posting_list_used,
+ uint32_t num_hits) const;
+
+ private:
+ // Posting list layout formats:
+ //
+ // not_full
+ //
+ // +-----------------+----------------+-------+-----------------+
+ // |hits-start-offset|Hit::kInvalidVal|xxxxxxx|(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ //
+ // almost_full
+ //
+ // +-----------------+----------------+-------+-----------------+
+ // |Hit::kInvalidVal |1st hit |(pad) |(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ //
+  // full
+ //
+ // +-----------------+----------------+-------+-----------------+
+ // |1st hit |2nd hit |(pad) |(compressed) hits|
+ // +-----------------+----------------+-------+-----------------+
+ //
+ // The first two uncompressed hits also implicitly encode information about
+ // the size of the compressed hits region.
+ //
+ // 1. If the posting list is NOT_FULL, then
+ // posting_list_buffer_[0] contains the byte offset of the start of the
+ // compressed hits - and, thus, the size of the compressed hits region is
+ // size_in_bytes - posting_list_buffer_[0].
+ //
+ // 2. If posting list is ALMOST_FULL or FULL, then the compressed hits region
+ // starts somewhere between [kSpecialHitsSize, kSpecialHitsSize + sizeof(Hit)
+ // - 1] and ends at size_in_bytes - 1.
+ //
+  // Hit term frequencies are stored after the hit value, compressed or
+  // uncompressed. For the first two special hits, there is always space
+  // reserved for the term frequency. For hits in the compressed area, the term
+  // frequency follows the hit value only if hit.has_term_frequency() is true.
+  // This allows good compression in the common case where hits don't have a
+  // valid term frequency.
+ //
+ // EXAMPLE
+ // Posting list storage. Posting list size: 20 bytes
+ // EMPTY!
+ // +--bytes 0-4--+----- 5-9 ------+---------------- 10-19 -----------------+
+ // | 20 |Hit::kInvalidVal| 0x000 |
+  // +-------------+----------------+----------------------------------------+
+ //
+ // Add Hit 0x07FFF998 (DocumentId = 12, SectionId = 3, Flags = 0)
+ // NOT FULL!
+ // +--bytes 0-4--+----- 5-9 ------+----- 10-15 -----+-------- 16-19 -------+
+ // | 16 |Hit::kInvalidVal| 0x000 | 0x07FFF998 |
+ // +-------------+----------------+-----------------+----------------------+
+ //
+ // Add Hit 0x07FFF684 (DocumentId = 18, SectionId = 0, Flags = 4,
+ // TermFrequency=125)
+ // (Hit 0x07FFF998 - Hit 0x07FFF684 = 788)
+ // +--bytes 0-4--+----- 5-9 ------+-- 10-12 --+-- 13-16 --+- 17 -+-- 18-19 --+
+ // | 13 |Hit::kInvalidVal| 0x000 | 0x07FFF684| 125 | 788 |
+ // +-------------+----------------+-----------+-----------+------+-----------+
+ //
+ // Add Hit 0x07FFF4D2 (DocumentId = 22, SectionId = 10, Flags = 2)
+ // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 434)
+ // +--bytes 0-4--+--- 5-9 ----+-- 10 --+-- 11-14 -+- 15-16 -+- 17 -+- 18-19 -+
+ // | 9 |Hit::kInvVal| 0x00 |0x07FFF4D2| 434 | 125 | 788 |
+ // +-------------+------------+--------+----------+---------+------+---------+
+ //
+ // Add Hit 0x07FFF40E (DocumentId = 23, SectionId = 1, Flags = 6,
+ // TermFrequency = 87)
+  // (Hit 0x07FFF4D2 - Hit 0x07FFF40E = 196)
+  // ALMOST FULL!
+  // +--bytes 0-4-+---- 5-9 ----+- 10-12 -+- 13-14 -+- 15-16 -+- 17 -+- 18-19 -+
+  // |Hit::kInvVal|0x07FFF40E,87|  0x000  |   196   |   434   | 125  |   788   |
+  // +------------+-------------+---------+---------+---------+------+---------+
+ //
+ // Add Hit 0x07FFF320 (DocumentId = 27, SectionId = 4, Flags = 0)
+ // FULL!
+  // +--bytes 0-4--+---- 5-9 ----+- 10-12 -+- 13-14 -+- 15-16 -+- 17 -+- 18-19 -+
+  // | 0x07FFF320  |0x07FFF40E,87|  0x000  |   196   |   434   | 125  |   788   |
+  // +-------------+-------------+---------+---------+---------+------+---------+
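+
+  // Illustrative sketch of how a compressed delta is produced when prepending
+  // (VarInt::Encode and VarInt::kMaxEncodedLen64 are assumed counterparts of
+  // the VarInt::Decode helper this serializer uses; treat them as
+  // placeholders). Hits are prepended in decreasing order, so the stored
+  // delta is the previously prepended (larger) hit value minus the new one:
+  //
+  //   uint64_t delta = prev_hit.value() - new_hit.value();
+  //   uint8_t buf[VarInt::kMaxEncodedLen64];
+  //   size_t len = VarInt::Encode(delta, buf);  // len bytes go into the pl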
+
+ // Helpers to determine what state the posting list is in.
+ bool IsFull(const PostingListUsed* posting_list_used) const {
+ return GetSpecialHit(posting_list_used, /*index=*/0)
+ .ValueOrDie()
+ .is_valid() &&
+ GetSpecialHit(posting_list_used, /*index=*/1)
+ .ValueOrDie()
+ .is_valid();
+ }
+
+ bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
+ return !GetSpecialHit(posting_list_used, /*index=*/0)
+ .ValueOrDie()
+ .is_valid();
+ }
+
+ bool IsEmpty(const PostingListUsed* posting_list_used) const {
+ return GetSpecialHit(posting_list_used, /*index=*/0).ValueOrDie().value() ==
+ posting_list_used->size_in_bytes() &&
+ !GetSpecialHit(posting_list_used, /*index=*/1)
+ .ValueOrDie()
+ .is_valid();
+ }
+
+  // Returns false if both special hits are invalid or if the offset value
+  // stored in the special hit is less than kSpecialHitsSize or greater than
+  // posting_list_used->size_in_bytes(). Returns true otherwise.
+ bool IsPostingListValid(const PostingListUsed* posting_list_used) const;
+
+ // Prepend hit to a posting list that is in the ALMOST_FULL state.
+ // RETURNS:
+ // - OK, if successful
+ // - INVALID_ARGUMENT if hit is not less than the previously added hit.
+ libtextclassifier3::Status PrependHitToAlmostFull(
+ PostingListUsed* posting_list_used, const Hit& hit) const;
+
+ // Prepend hit to a posting list that is in the EMPTY state. This will always
+ // succeed because there are no pre-existing hits and no validly constructed
+ // posting list could fail to fit one hit.
+ void PrependHitToEmpty(PostingListUsed* posting_list_used,
+ const Hit& hit) const;
+
+ // Prepend hit to a posting list that is in the NOT_FULL state.
+ // RETURNS:
+ // - OK, if successful
+ // - INVALID_ARGUMENT if hit is not less than the previously added hit.
+ libtextclassifier3::Status PrependHitToNotFull(
+ PostingListUsed* posting_list_used, const Hit& hit,
+ uint32_t offset) const;
+
+ // Returns either 0 (full state), sizeof(Hit) (almost_full state) or
+ // a byte offset between kSpecialHitsSize and
+ // posting_list_used->size_in_bytes() (inclusive) (not_full state).
+ uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const;
+
+ // Sets the special hits to properly reflect what offset is (see layout
+ // comment for further details).
+ //
+  // Returns false if offset > posting_list_used->size_in_bytes(), or if offset
+  // falls strictly between sizeof(Hit) and kSpecialHitsSize, or strictly
+  // between 0 and sizeof(Hit). Returns true otherwise.
+ bool SetStartByteOffset(PostingListUsed* posting_list_used,
+ uint32_t offset) const;
+
+ // Manipulate padded areas. We never store the same hit value twice
+ // so a delta of 0 is a pad byte.
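+  //
+  // Since VarInt encodes a zero delta as a single 0x00 byte, a sketch of the
+  // pad scan (an illustration of the intent, not the exact loop) is:
+  //
+  //   while (offset < size_in_bytes && buffer[offset] == 0) { ++offset; }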
+
+ // Returns offset of first non-pad byte.
+ uint32_t GetPadEnd(const PostingListUsed* posting_list_used,
+ uint32_t offset) const;
+
+  // Fills the padding between offsets start and end with 0s.
+  // Returns false if end > posting_list_used->size_in_bytes(); returns true
+  // otherwise.
+ bool PadToEnd(PostingListUsed* posting_list_used, uint32_t start,
+ uint32_t end) const;
+
+  // Helper for GetHits/PopFrontHits. Adds at most limit hits to out, or all
+  // hits in the posting list if it contains fewer than limit hits. out may be
+  // nullptr.
+ //
+ // NOTE: If called with limit=1, pop=true on a posting list that transitioned
+ // from NOT_FULL directly to FULL, GetHitsInternal will not return the posting
+ // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
+ // ALMOST_FULL.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::Status GetHitsInternal(
+ const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
+ std::vector<Hit>* out) const;
+
+ // Retrieves the value stored in the index-th special hit.
+ //
+ // RETURNS:
+ // - A valid Hit, on success
+ // - INVALID_ARGUMENT if index is not less than kNumSpecialData
+ libtextclassifier3::StatusOr<Hit> GetSpecialHit(
+ const PostingListUsed* posting_list_used, uint32_t index) const;
+
+  // Sets the value stored in the index-th special hit to val. Returns false
+  // and has no effect if index is not less than kNumSpecialData.
+ bool SetSpecialHit(PostingListUsed* posting_list_used, uint32_t index,
+ const Hit& val) const;
+
+ // Prepends hit to the memory region [offset - sizeof(Hit), offset] and
+ // returns the new beginning of the padded region.
+ //
+ // RETURNS:
+ // - The new beginning of the padded region, if successful.
+ // - INVALID_ARGUMENT if hit will not fit (uncompressed) between offset and
+ // kSpecialHitsSize
+ libtextclassifier3::StatusOr<uint32_t> PrependHitUncompressed(
+ PostingListUsed* posting_list_used, const Hit& hit,
+ uint32_t offset) const;
+
+ // If hit has a term frequency, consumes the term frequency at offset, updates
+ // hit to include the term frequency and updates offset to reflect that the
+ // term frequency has been consumed.
+ //
+ // RETURNS:
+ // - OK, if successful
+ // - INVALID_ARGUMENT if hit has a term frequency and offset +
+  //     sizeof(Hit::TermFrequency) > posting_list_used->size_in_bytes()
+ libtextclassifier3::Status ConsumeTermFrequencyIfPresent(
+ const PostingListUsed* posting_list_used, Hit* hit,
+ uint32_t* offset) const;
+};
+
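+// Example usage (a sketch mirroring the accompanying unit tests, not a
+// required calling convention):
+//
+//   PostingListHitSerializer serializer;
+//   ICING_ASSIGN_OR_RETURN(
+//       PostingListUsed pl_used,
+//       PostingListUsed::CreateFromUnitializedRegion(
+//           &serializer, serializer.GetMinPostingListSize()));
+//   Hit hit(/*section_id=*/0, /*document_id=*/0, Hit::kDefaultTermFrequency);
+//   ICING_RETURN_IF_ERROR(serializer.PrependHit(&pl_used, hit));
+//   ICING_ASSIGN_OR_RETURN(std::vector<Hit> hits,
+//                          serializer.GetHits(&pl_used));
+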
+// Inlined functions. Implementation details below. Avert eyes!
+template <class T, Hit (*GetHit)(const T&)>
+libtextclassifier3::StatusOr<uint32_t>
+PostingListHitSerializer::PrependHitArray(PostingListUsed* posting_list_used,
+ const T* array, uint32_t num_hits,
+ bool keep_prepended) const {
+ if (!IsPostingListValid(posting_list_used)) {
+ return 0;
+ }
+
+ // Prepend hits working backwards from array[num_hits - 1].
+ uint32_t i;
+ for (i = 0; i < num_hits; ++i) {
+ if (!PrependHit(posting_list_used, GetHit(array[num_hits - i - 1])).ok()) {
+ break;
+ }
+ }
+ if (i != num_hits && !keep_prepended) {
+ // Didn't fit. Undo everything and check that we have the same offset as
+ // before. PopFrontHits guarantees that it will remove all 'i' hits so long
+ // as there are at least 'i' hits in the posting list, which we know there
+ // are.
+ ICING_RETURN_IF_ERROR(PopFrontHits(posting_list_used, /*num_hits=*/i));
+ }
+ return i;
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_MAIN_POSTING_LIST_HIT_SERIALIZER_H_
diff --git a/icing/index/main/posting-list-hit-serializer_test.cc b/icing/index/main/posting-list-hit-serializer_test.cc
new file mode 100644
index 0000000..7f0b945
--- /dev/null
+++ b/icing/index/main/posting-list-hit-serializer_test.cc
@@ -0,0 +1,731 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/main/posting-list-hit-serializer.h"
+
+#include <cstdint>
+#include <deque>
+#include <memory>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/hit-test-utils.h"
+
+using testing::ElementsAre;
+using testing::ElementsAreArray;
+using testing::Eq;
+using testing::IsEmpty;
+using testing::Le;
+using testing::Lt;
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+struct HitElt {
+ HitElt() = default;
+ explicit HitElt(const Hit &hit_in) : hit(hit_in) {}
+
+ static Hit get_hit(const HitElt &hit_elt) { return hit_elt.hit; }
+
+ Hit hit;
+};
+
+TEST(PostingListHitSerializerTest, PostingListUsedPrependHitNotFull) {
+ PostingListHitSerializer serializer;
+
+ static const int kNumHits = 2551;
+ static const size_t kHitsSize = kNumHits * sizeof(Hit);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, kHitsSize));
+
+ // Make used.
+ Hit hit0(/*section_id=*/0, 0, /*term_frequency=*/56);
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit0));
+ // Size = sizeof(uncompressed hit0)
+ int expected_size = sizeof(Hit);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit0)));
+
+ Hit hit1(/*section_id=*/0, 1, Hit::kDefaultTermFrequency);
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit1));
+ // Size = sizeof(uncompressed hit1)
+ // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency)
+ expected_size += 2 + sizeof(Hit::TermFrequency);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit1, hit0)));
+
+ Hit hit2(/*section_id=*/0, 2, /*term_frequency=*/56);
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit2));
+ // Size = sizeof(uncompressed hit2)
+ // + sizeof(hit1-hit2)
+ // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency)
+ expected_size += 2;
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit2, hit1, hit0)));
+
+ Hit hit3(/*section_id=*/0, 3, Hit::kDefaultTermFrequency);
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit3));
+ // Size = sizeof(uncompressed hit3)
+ // + sizeof(hit2-hit3) + sizeof(hit2::term_frequency)
+ // + sizeof(hit1-hit2)
+ // + sizeof(hit0-hit1) + sizeof(hit0::term_frequency)
+ expected_size += 2 + sizeof(Hit::TermFrequency);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0)));
+}
+
+TEST(PostingListHitSerializerTest, PostingListUsedPrependHitAlmostFull) {
+ PostingListHitSerializer serializer;
+
+ int size = 2 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ // Fill up the compressed region.
+ // Transitions:
+ // Adding hit0: EMPTY -> NOT_FULL
+ // Adding hit1: NOT_FULL -> NOT_FULL
+ // Adding hit2: NOT_FULL -> NOT_FULL
+ Hit hit0(/*section_id=*/0, 0, Hit::kDefaultTermFrequency);
+ Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2);
+ Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2);
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit2));
+ // Size used will be 2+2+4=8 bytes
+ int expected_size = sizeof(Hit::Value) + 2 + 2;
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit2, hit1, hit0)));
+
+ // Add one more hit to transition NOT_FULL -> ALMOST_FULL
+ Hit hit3 = CreateHit(hit2, /*desired_byte_length=*/3);
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit3));
+ // Compressed region would be 2+2+3+4=11 bytes, but the compressed region is
+ // only 10 bytes. So instead, the posting list will transition to ALMOST_FULL.
+ // The in-use compressed region will actually shrink from 8 bytes to 7 bytes
+ // because the uncompressed version of hit2 will be overwritten with the
+ // compressed delta of hit2. hit3 will be written to one of the special hits.
+ // Because we're in ALMOST_FULL, the expected size is the size of the pl minus
+ // the one hit used to mark the posting list as ALMOST_FULL.
+ expected_size = size - sizeof(Hit);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit3, hit2, hit1, hit0)));
+
+ // Add one more hit to transition ALMOST_FULL -> ALMOST_FULL
+ Hit hit4 = CreateHit(hit3, /*desired_byte_length=*/2);
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit4));
+ // There are currently 7 bytes in use in the compressed region. hit3 will have
+ // a 2-byte delta. That delta will fit in the compressed region (which will
+ // now have 9 bytes in use), hit4 will be placed in one of the special hits
+ // and the posting list will remain in ALMOST_FULL.
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit4, hit3, hit2, hit1, hit0)));
+
+ // Add one more hit to transition ALMOST_FULL -> FULL
+ Hit hit5 = CreateHit(hit4, /*desired_byte_length=*/2);
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit5));
+ // There are currently 9 bytes in use in the compressed region. hit4 will have
+ // a 2-byte delta which will not fit in the compressed region. So hit4 will
+ // remain in one of the special hits and hit5 will occupy the other, making
+ // the posting list FULL.
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit5, hit4, hit3, hit2, hit1, hit0)));
+
+ // The posting list is FULL. Adding another hit should fail.
+ Hit hit6 = CreateHit(hit5, /*desired_byte_length=*/1);
+ EXPECT_THAT(serializer.PrependHit(&pl_used, hit6),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST(PostingListHitSerializerTest, PostingListUsedMinSize) {
+ PostingListHitSerializer serializer;
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, serializer.GetMinPostingListSize()));
+ // PL State: EMPTY
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0));
+ EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty()));
+
+ // Add a hit, PL should shift to ALMOST_FULL state
+ Hit hit0(/*section_id=*/0, 0, /*term_frequency=*/0,
+ /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/true);
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit0));
+ // Size = sizeof(uncompressed hit0)
+ int expected_size = sizeof(Hit);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(ElementsAre(hit0)));
+
+ // Add the smallest hit possible - no term_frequency and a delta of 1. PL
+ // should shift to FULL state.
+ Hit hit1(/*section_id=*/0, 0, /*term_frequency=*/0,
+ /*is_in_prefix_section=*/true,
+ /*is_prefix_hit=*/false);
+ ICING_EXPECT_OK(serializer.PrependHit(&pl_used, hit1));
+ // Size = sizeof(uncompressed hit1) + sizeof(uncompressed hit0)
+ expected_size += sizeof(Hit);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit1, hit0)));
+
+ // Try to add the smallest hit possible. Should fail
+ Hit hit2(/*section_id=*/0, 0, /*term_frequency=*/0,
+ /*is_in_prefix_section=*/false,
+ /*is_prefix_hit=*/false);
+ EXPECT_THAT(serializer.PrependHit(&pl_used, hit2),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Le(expected_size));
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAre(hit1, hit0)));
+}
+
+TEST(PostingListHitSerializerTest,
+ PostingListPrependHitArrayMinSizePostingList) {
+ PostingListHitSerializer serializer;
+
+ // Min Size = 10
+ int size = serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ std::vector<HitElt> hits_in;
+ hits_in.emplace_back(Hit(1, 0, Hit::kDefaultTermFrequency));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ std::reverse(hits_in.begin(), hits_in.end());
+
+  // Add five hits. The PL is in the empty state and an empty min-size PL can
+  // only fit two hits, so PrependHitArray should report that only two hits
+  // could be prepended.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t num_can_prepend,
+ (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), false)));
+ EXPECT_THAT(num_can_prepend, Eq(2));
+
+ int can_fit_hits = num_can_prepend;
+ // The PL has room for 2 hits. We should be able to add them without any
+ // problem, transitioning the PL from EMPTY -> ALMOST_FULL -> FULL
+ const HitElt *hits_in_ptr = hits_in.data() + (hits_in.size() - 2);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ num_can_prepend, (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, hits_in_ptr, can_fit_hits, false)));
+ EXPECT_THAT(num_can_prepend, Eq(can_fit_hits));
+ EXPECT_THAT(size, Eq(serializer.GetBytesUsed(&pl_used)));
+ std::deque<Hit> hits_pushed;
+ std::transform(hits_in.rbegin(),
+ hits_in.rend() - hits_in.size() + can_fit_hits,
+ std::front_inserter(hits_pushed), HitElt::get_hit);
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
+}
+
+TEST(PostingListHitSerializerTest, PostingListPrependHitArrayPostingList) {
+ PostingListHitSerializer serializer;
+
+ // Size = 30
+ int size = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ std::vector<HitElt> hits_in;
+ hits_in.emplace_back(Hit(1, 0, Hit::kDefaultTermFrequency));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ std::reverse(hits_in.begin(), hits_in.end());
+ // The last hit is uncompressed and the four before it should only take one
+ // byte. Total use = 8 bytes.
+ // ----------------------
+ // 29 delta(Hit #1)
+ // 28 delta(Hit #2)
+ // 27 delta(Hit #3)
+ // 26 delta(Hit #4)
+ // 25-22 Hit #5
+ // 21-10 <unused>
+ // 9-5 kSpecialHit
+ // 4-0 Offset=22
+ // ----------------------
+ int byte_size = sizeof(Hit::Value) + hits_in.size() - 1;
+
+ // Add five hits. The PL is in the empty state and should be able to fit all
+ // five hits without issue, transitioning the PL from EMPTY -> NOT_FULL.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t num_could_fit,
+ (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), false)));
+ EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
+ EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used)));
+ std::deque<Hit> hits_pushed;
+ std::transform(hits_in.rbegin(), hits_in.rend(),
+ std::front_inserter(hits_pushed), HitElt::get_hit);
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
+
+ Hit first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/1);
+ hits_in.clear();
+ hits_in.emplace_back(first_hit);
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/3));
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
+ std::reverse(hits_in.begin(), hits_in.end());
+ // Size increased by the deltas of these hits (1+2+1+2+3+2) = 11 bytes
+ // ----------------------
+ // 29 delta(Hit #1)
+ // 28 delta(Hit #2)
+ // 27 delta(Hit #3)
+ // 26 delta(Hit #4)
+ // 25 delta(Hit #5)
+ // 24-23 delta(Hit #6)
+ // 22 delta(Hit #7)
+ // 21-20 delta(Hit #8)
+ // 19-17 delta(Hit #9)
+ // 16-15 delta(Hit #10)
+ // 14-11 Hit #11
+ // 10 <unused>
+ // 9-5 kSpecialHit
+ // 4-0 Offset=11
+ // ----------------------
+ byte_size += 11;
+
+ // Add these 6 hits. The PL is currently in the NOT_FULL state and should
+ // remain in the NOT_FULL state.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), false)));
+ EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
+ EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used)));
+ // All hits from hits_in were added.
+ std::transform(hits_in.rbegin(), hits_in.rend(),
+ std::front_inserter(hits_pushed), HitElt::get_hit);
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
+
+ first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/3);
+ hits_in.clear();
+ hits_in.emplace_back(first_hit);
+ // ----------------------
+ // 29 delta(Hit #1)
+ // 28 delta(Hit #2)
+ // 27 delta(Hit #3)
+ // 26 delta(Hit #4)
+ // 25 delta(Hit #5)
+ // 24-23 delta(Hit #6)
+ // 22 delta(Hit #7)
+ // 21-20 delta(Hit #8)
+ // 19-17 delta(Hit #9)
+ // 16-15 delta(Hit #10)
+ // 14-12 delta(Hit #11)
+ // 11-10 <unused>
+ // 9-5 Hit #12
+ // 4-0 kSpecialHit
+ // ----------------------
+ byte_size = 25;
+
+ // Add this 1 hit. The PL is currently in the NOT_FULL state and should
+ // transition to the ALMOST_FULL state - even though there is still some
+ // unused space.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), false)));
+ EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
+ EXPECT_THAT(byte_size, Eq(serializer.GetBytesUsed(&pl_used)));
+ // All hits from hits_in were added.
+ std::transform(hits_in.rbegin(), hits_in.rend(),
+ std::front_inserter(hits_pushed), HitElt::get_hit);
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
+
+ first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/1);
+ hits_in.clear();
+ hits_in.emplace_back(first_hit);
+ hits_in.emplace_back(
+ CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
+ std::reverse(hits_in.begin(), hits_in.end());
+ // ----------------------
+ // 29 delta(Hit #1)
+ // 28 delta(Hit #2)
+ // 27 delta(Hit #3)
+ // 26 delta(Hit #4)
+ // 25 delta(Hit #5)
+ // 24-23 delta(Hit #6)
+ // 22 delta(Hit #7)
+ // 21-20 delta(Hit #8)
+ // 19-17 delta(Hit #9)
+ // 16-15 delta(Hit #10)
+ // 14-12 delta(Hit #11)
+ // 11 delta(Hit #12)
+ // 10 <unused>
+ // 9-5 Hit #13
+ // 4-0 Hit #14
+ // ----------------------
+
+ // Add these 2 hits. The PL is currently in the ALMOST_FULL state. Adding the
+ // first hit should keep the PL in ALMOST_FULL because the delta between Hit
+ // #12 and Hit #13 (1 byte) can fit in the unused area (2 bytes). Adding the
+  // second hit should transition to the FULL state because the delta between
+ // Hit #13 and Hit #14 (2 bytes) is larger than the remaining unused area
+ // (1 byte).
+ ICING_ASSERT_OK_AND_ASSIGN(
+ num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hits_in[0], hits_in.size(), false)));
+ EXPECT_THAT(num_could_fit, Eq(hits_in.size()));
+ EXPECT_THAT(size, Eq(serializer.GetBytesUsed(&pl_used)));
+ // All hits from hits_in were added.
+ std::transform(hits_in.rbegin(), hits_in.rend(),
+ std::front_inserter(hits_pushed), HitElt::get_hit);
+ EXPECT_THAT(serializer.GetHits(&pl_used),
+ IsOkAndHolds(ElementsAreArray(hits_pushed)));
+}
+
+TEST(PostingListHitSerializerTest, PostingListPrependHitArrayTooManyHits) {
+ PostingListHitSerializer serializer;
+
+ static constexpr int kNumHits = 128;
+ static constexpr int kDeltaSize = 1;
+ static constexpr int kTermFrequencySize = 1;
+ static constexpr size_t kHitsSize =
+ ((kNumHits * (kDeltaSize + kTermFrequencySize)) / 5) * 5;
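+  // i.e. 128 * (1 + 1) = 256, rounded down to a multiple of 5 = 255 bytes.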
+
+ // Create an array with one too many hits
+ std::vector<Hit> hits_in_too_many =
+ CreateHits(kNumHits + 1, /*desired_byte_length=*/1);
+ std::vector<HitElt> hit_elts_in_too_many;
+ for (const Hit &hit : hits_in_too_many) {
+ hit_elts_in_too_many.emplace_back(hit);
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, serializer.GetMinPostingListSize()));
+
+  // PrependHitArray should not be able to fit all of hit_elts_in_too_many;
+  // it is far too large for the minimum size pl.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ uint32_t num_could_fit,
+ (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hit_elts_in_too_many[0], hit_elts_in_too_many.size(),
+ false)));
+ ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size()));
+ ASSERT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0));
+ ASSERT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, kHitsSize));
+  // PrependHitArray should not be able to fit all of hit_elts_in_too_many;
+  // it is one hit too large for this pl.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ num_could_fit, (serializer.PrependHitArray<HitElt, HitElt::get_hit>(
+ &pl_used, &hit_elts_in_too_many[0],
+ hit_elts_in_too_many.size(), false)));
+ ASSERT_THAT(num_could_fit, Lt(hit_elts_in_too_many.size()));
+ ASSERT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0));
+ ASSERT_THAT(serializer.GetHits(&pl_used), IsOkAndHolds(IsEmpty()));
+}
+
+TEST(PostingListHitSerializerTest,
+ PostingListStatusJumpFromNotFullToFullAndBack) {
+ PostingListHitSerializer serializer;
+
+ const uint32_t pl_size = 3 * sizeof(Hit);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl, Hit(Hit::kInvalidValue - 1, 0)));
+ uint32_t bytes_used = serializer.GetBytesUsed(&pl);
+ // Status not full.
+ ASSERT_THAT(bytes_used,
+ Le(pl_size - PostingListHitSerializer::kSpecialHitsSize));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl, Hit(Hit::kInvalidValue >> 2, 0)));
+ // Status should jump to full directly.
+ ASSERT_THAT(serializer.GetBytesUsed(&pl), Eq(pl_size));
+ ICING_ASSERT_OK(serializer.PopFrontHits(&pl, 1));
+ // Status should return to not full as before.
+ ASSERT_THAT(serializer.GetBytesUsed(&pl), Eq(bytes_used));
+}
+
+TEST(PostingListHitSerializerTest, DeltaOverflow) {
+ PostingListHitSerializer serializer;
+
+ const uint32_t pl_size = 4 * sizeof(Hit);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
+
+ static const Hit::Value kOverflow[4] = {
+ Hit::kInvalidValue >> 2,
+ (Hit::kInvalidValue >> 2) * 2,
+ (Hit::kInvalidValue >> 2) * 3,
+ Hit::kInvalidValue - 1,
+ };
+
+ // Fit at least 4 ordinary values.
+ for (Hit::Value v = 0; v < 4; v++) {
+ ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(4 - v)));
+ }
+
+ // Cannot fit 4 overflow values.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl, PostingListUsed::CreateFromUnitializedRegion(&serializer, pl_size));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[3])));
+ ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[2])));
+
+ // Can fit only one more.
+ ICING_EXPECT_OK(serializer.PrependHit(&pl, Hit(kOverflow[1])));
+ EXPECT_THAT(serializer.PrependHit(&pl, Hit(kOverflow[0])),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST(PostingListHitSerializerTest, MoveFrom) {
+ PostingListHitSerializer serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit &hit : hits1) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ std::vector<Hit> hits2 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2);
+ for (const Hit &hit : hits2) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used2, hit));
+ }
+
+ ICING_ASSERT_OK(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1));
+ EXPECT_THAT(serializer.GetHits(&pl_used2),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+ EXPECT_THAT(serializer.GetHits(&pl_used1), IsOkAndHolds(IsEmpty()));
+}
+
+TEST(PostingListHitSerializerTest, MoveFromNullArgumentReturnsInvalidArgument) {
+ PostingListHitSerializer serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ std::vector<Hit> hits = CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit &hit : hits) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit));
+ }
+
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used1, /*src=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(serializer.GetHits(&pl_used1),
+ IsOkAndHolds(ElementsAreArray(hits.rbegin(), hits.rend())));
+}
+
+TEST(PostingListHitSerializerTest,
+ MoveFromInvalidPostingListReturnsInvalidArgument) {
+ PostingListHitSerializer serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit &hit : hits1) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ std::vector<Hit> hits2 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2);
+ for (const Hit &hit : hits2) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used2, hit));
+ }
+
+ // Write invalid hits to the beginning of pl_used1 to make it invalid.
+ Hit invalid_hit;
+ Hit *first_hit = reinterpret_cast<Hit *>(pl_used1.posting_list_buffer());
+ *first_hit = invalid_hit;
+ ++first_hit;
+ *first_hit = invalid_hit;
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(serializer.GetHits(&pl_used2),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+}
+
+TEST(PostingListHitSerializerTest,
+ MoveToInvalidPostingListReturnsFailedPrecondition) {
+ PostingListHitSerializer serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit &hit : hits1) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ std::vector<Hit> hits2 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/2);
+ for (const Hit &hit : hits2) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used2, hit));
+ }
+
+ // Write invalid hits to the beginning of pl_used2 to make it invalid.
+ Hit invalid_hit;
+ Hit *first_hit = reinterpret_cast<Hit *>(pl_used2.posting_list_buffer());
+ *first_hit = invalid_hit;
+ ++first_hit;
+ *first_hit = invalid_hit;
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(serializer.GetHits(&pl_used1),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+}
+
+TEST(PostingListHitSerializerTest, MoveToPostingListTooSmall) {
+ PostingListHitSerializer serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ std::vector<Hit> hits1 =
+ CreateHits(/*num_hits=*/5, /*desired_byte_length=*/1);
+ for (const Hit &hit : hits1) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used1, hit));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(
+ &serializer, serializer.GetMinPostingListSize()));
+ std::vector<Hit> hits2 =
+ CreateHits(/*num_hits=*/1, /*desired_byte_length=*/2);
+ for (const Hit &hit : hits2) {
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used2, hit));
+ }
+
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(serializer.GetHits(&pl_used1),
+ IsOkAndHolds(ElementsAreArray(hits1.rbegin(), hits1.rend())));
+ EXPECT_THAT(serializer.GetHits(&pl_used2),
+ IsOkAndHolds(ElementsAreArray(hits2.rbegin(), hits2.rend())));
+}
+
+TEST(PostingListHitSerializerTest, PopHitsWithScores) {
+ PostingListHitSerializer serializer;
+
+ int size = 2 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+  // This posting list is 20 bytes. Create four hits that will have deltas of
+  // two bytes each, all of which will have a non-default score. This posting
+  // list will be almost_full.
+ //
+ // ----------------------
+ // 19 score(Hit #0)
+ // 18-17 delta(Hit #0)
+ // 16 score(Hit #1)
+ // 15-14 delta(Hit #1)
+ // 13 score(Hit #2)
+ // 12-11 delta(Hit #2)
+ // 10 <unused>
+ // 9-5 Hit #3
+ // 4-0 kInvalidHitVal
+ // ----------------------
+ Hit hit0(/*section_id=*/0, /*document_id=*/0, /*score=*/5);
+ Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2);
+ Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2);
+ Hit hit3 = CreateHit(hit2, /*desired_byte_length=*/2);
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit0));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit1));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit2));
+ ICING_ASSERT_OK(serializer.PrependHit(&pl_used, hit3));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Hit> hits_out,
+ serializer.GetHits(&pl_used));
+ EXPECT_THAT(hits_out, ElementsAre(hit3, hit2, hit1, hit0));
+
+ // Now, pop the last hit. The posting list should contain the first three
+ // hits.
+ //
+ // ----------------------
+ // 19 score(Hit #0)
+ // 18-17 delta(Hit #0)
+ // 16 score(Hit #1)
+ // 15-14 delta(Hit #1)
+ // 13-10 <unused>
+ // 9-5 Hit #2
+ // 4-0 kInvalidHitVal
+ // ----------------------
+ ICING_ASSERT_OK(serializer.PopFrontHits(&pl_used, 1));
+ ICING_ASSERT_OK_AND_ASSIGN(hits_out, serializer.GetHits(&pl_used));
+ EXPECT_THAT(hits_out, ElementsAre(hit2, hit1, hit0));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/doc-hit-info-iterator-numeric.h b/icing/index/numeric/doc-hit-info-iterator-numeric.h
new file mode 100644
index 0000000..7cdb230
--- /dev/null
+++ b/icing/index/numeric/doc-hit-info-iterator-numeric.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_NUMERIC_DOC_HIT_INFO_ITERATOR_NUMERIC_H_
+#define ICING_INDEX_NUMERIC_DOC_HIT_INFO_ITERATOR_NUMERIC_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+template <typename T>
+class DocHitInfoIteratorNumeric : public DocHitInfoLeafIterator {
+ public:
+ explicit DocHitInfoIteratorNumeric(
+ std::unique_ptr<typename NumericIndex<T>::Iterator> numeric_index_iter)
+ : numeric_index_iter_(std::move(numeric_index_iter)) {}
+
+ libtextclassifier3::Status Advance() override {
+ // If the query property path doesn't exist (i.e. the storage doesn't
+ // exist), then numeric_index_iter_ will be nullptr.
+ if (numeric_index_iter_ == nullptr) {
+ return absl_ports::ResourceExhaustedError("End of iterator");
+ }
+
+ ICING_RETURN_IF_ERROR(numeric_index_iter_->Advance());
+
+ doc_hit_info_ = numeric_index_iter_->GetDocHitInfo();
+ return libtextclassifier3::Status::OK;
+ }
+
+ libtextclassifier3::StatusOr<TrimmedNode> TrimRightMostNode() && override {
+    return absl_ports::InvalidArgumentError(
+        "Cannot generate suggestion if the last term is a numeric operator.");
+ }
+
+ CallStats GetCallStats() const override {
+ if (numeric_index_iter_ == nullptr) {
+ return CallStats();
+ }
+
+ return CallStats(/*num_leaf_advance_calls_lite_index_in=*/0,
+ /*num_leaf_advance_calls_main_index_in=*/0,
+ /*num_leaf_advance_calls_integer_index_in=*/
+ numeric_index_iter_->GetNumAdvanceCalls(),
+ /*num_leaf_advance_calls_no_index_in=*/0,
+ /*num_blocks_inspected_in=*/
+ numeric_index_iter_->GetNumBlocksInspected());
+ }
+
+ std::string ToString() const override { return "test"; }
+
+ void PopulateMatchedTermsStats(
+ std::vector<TermMatchInfo>* matched_terms_stats,
+ SectionIdMask filtering_section_mask = kSectionIdMaskAll) const override {
+ // For numeric hit iterator, this should do nothing since there is no term.
+ }
+
+ private:
+ std::unique_ptr<typename NumericIndex<T>::Iterator> numeric_index_iter_;
+};
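+
+// Usage sketch (assumes the doc_hit_info() accessor inherited from
+// DocHitInfoIterator; adapt to the actual base-class API):
+//
+//   while (iterator->Advance().ok()) {
+//     const DocHitInfo& info = iterator->doc_hit_info();
+//     // ... consume info ...
+//   }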
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_NUMERIC_DOC_HIT_INFO_ITERATOR_NUMERIC_H_
diff --git a/icing/index/numeric/dummy-numeric-index.h b/icing/index/numeric/dummy-numeric-index.h
new file mode 100644
index 0000000..d18f2aa
--- /dev/null
+++ b/icing/index/numeric/dummy-numeric-index.h
@@ -0,0 +1,351 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_NUMERIC_DUMMY_NUMERIC_INDEX_H_
+#define ICING_INDEX_NUMERIC_DUMMY_NUMERIC_INDEX_H_
+
+#include <cstdint>
+#include <functional>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/doc-hit-info-iterator-numeric.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/util/crc32.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+// DummyNumericIndex: dummy class to help with testing and unblock e2e
+// integration for numeric search. It stores all numeric index data (keys and
+// hits) in memory, without any actual persistent storage. PersistentStorage
+// features are effectively no-ops: nothing is persisted to disk, so all data
+// is volatile.
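+//
+// Indexing usage sketch (the property path and ids are made-up examples;
+// IndexAllBufferedKeys is rvalue-qualified, so the editor is consumed):
+//
+//   std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+//       index->Edit("price", /*document_id=*/0, /*section_id=*/0);
+//   ICING_RETURN_IF_ERROR(editor->BufferKey(42));
+//   ICING_RETURN_IF_ERROR(std::move(*editor).IndexAllBufferedKeys());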
+template <typename T>
+class DummyNumericIndex : public NumericIndex<T> {
+ public:
+ static libtextclassifier3::StatusOr<std::unique_ptr<DummyNumericIndex<T>>>
+ Create(const Filesystem& filesystem, std::string working_path) {
+ auto dummy_numeric_index = std::unique_ptr<DummyNumericIndex<T>>(
+ new DummyNumericIndex<T>(filesystem, std::move(working_path)));
+ ICING_RETURN_IF_ERROR(dummy_numeric_index->InitializeNewStorage());
+ return dummy_numeric_index;
+ }
+
+ ~DummyNumericIndex() override = default;
+
+ std::unique_ptr<typename NumericIndex<T>::Editor> Edit(
+ std::string_view property_path, DocumentId document_id,
+ SectionId section_id) override {
+ return std::make_unique<Editor>(property_path, document_id, section_id,
+ storage_);
+ }
+
+ libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
+ std::string_view property_path, T key_lower, T key_upper,
+ const DocumentStore&, const SchemaStore&, int64_t) const override;
+
+ libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ DocumentId new_last_added_document_id) override;
+
+ libtextclassifier3::Status Clear() override {
+ storage_.clear();
+ last_added_document_id_ = kInvalidDocumentId;
+ return libtextclassifier3::Status::OK;
+ }
+
+ DocumentId last_added_document_id() const override {
+ return last_added_document_id_;
+ }
+
+ void set_last_added_document_id(DocumentId document_id) override {
+ if (last_added_document_id_ == kInvalidDocumentId ||
+ document_id > last_added_document_id_) {
+ last_added_document_id_ = document_id;
+ }
+ }
+
+ int num_property_indices() const override { return storage_.size(); }
+
+ private:
+ class Editor : public NumericIndex<T>::Editor {
+ public:
+ explicit Editor(
+ std::string_view property_path, DocumentId document_id,
+ SectionId section_id,
+ std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>>&
+ storage)
+ : NumericIndex<T>::Editor(property_path, document_id, section_id),
+ storage_(storage) {}
+
+ ~Editor() override = default;
+
+ libtextclassifier3::Status BufferKey(T key) override {
+ seen_keys_.insert(key);
+ return libtextclassifier3::Status::OK;
+ }
+
+ libtextclassifier3::Status IndexAllBufferedKeys() && override;
+
+ private:
+ std::unordered_set<T> seen_keys_;
+ std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>>&
+ storage_; // Does not own.
+ };
+
+ class Iterator : public NumericIndex<T>::Iterator {
+ public:
+    // We group the BasicHits (sorted by document_id) of a key into a Bucket
+    // (stored as a std::vector) and store key -> vector in a std::map. When
+    // doing a range query, we may access vectors from multiple keys and want
+    // to return BasicHits to callers sorted by document_id. Therefore, this is
+    // really the "merge K sorted vectors" problem.
+    // To implement this algorithm via a priority_queue, we create this wrapper
+    // class to store iterators of the map and the vector.
+ class BucketInfo {
+ public:
+ explicit BucketInfo(
+ typename std::map<T, std::vector<BasicHit>>::const_iterator
+ bucket_iter)
+ : bucket_iter_(bucket_iter),
+ vec_iter_(bucket_iter_->second.rbegin()) {}
+
+ bool Advance() { return ++vec_iter_ != bucket_iter_->second.rend(); }
+
+ const BasicHit& GetCurrentBasicHit() const { return *vec_iter_; }
+
+ bool operator<(const BucketInfo& other) const {
+      // std::priority_queue is a max heap, and we should return BasicHits in
+      // DocumentId descending order.
+      // - BucketInfo::operator< should have the same order as DocumentId.
+      // - BasicHit encodes an inverted document id, and its operator<
+      //   compares the encoded raw value directly.
+      // - Therefore, BucketInfo::operator< should compare BasicHits in
+      //   reverse.
+      // - This makes priority_queue return buckets in DocumentId descending
+      //   and SectionId ascending order.
+      // - The direction we sort SectionId by (or pop from priority_queue)
+      //   doesn't matter, because all hits for the same DocumentId will be
+      //   merged into a single DocHitInfo.
+ return other.GetCurrentBasicHit() < GetCurrentBasicHit();
+ }
+
+ private:
+ typename std::map<T, std::vector<BasicHit>>::const_iterator bucket_iter_;
+ std::vector<BasicHit>::const_reverse_iterator vec_iter_;
+ };
+
+ explicit Iterator(T key_lower, T key_upper,
+ std::vector<BucketInfo>&& bucket_info_vec)
+ : NumericIndex<T>::Iterator(key_lower, key_upper),
+ pq_(std::less<BucketInfo>(), std::move(bucket_info_vec)),
+ num_advance_calls_(0) {}
+
+ ~Iterator() override = default;
+
+ libtextclassifier3::Status Advance() override;
+
+ DocHitInfo GetDocHitInfo() const override { return doc_hit_info_; }
+
+ int32_t GetNumAdvanceCalls() const override { return num_advance_calls_; }
+
+ int32_t GetNumBlocksInspected() const override { return 0; }
+
+ private:
+ std::priority_queue<BucketInfo> pq_;
+ DocHitInfo doc_hit_info_;
+
+ int32_t num_advance_calls_;
+ };
+
+ explicit DummyNumericIndex(const Filesystem& filesystem,
+ std::string&& working_path)
+ : NumericIndex<T>(filesystem, std::move(working_path),
+ PersistentStorage::WorkingPathType::kDummy),
+ dummy_crcs_buffer_(
+ std::make_unique<uint8_t[]>(sizeof(PersistentStorage::Crcs))),
+ last_added_document_id_(kInvalidDocumentId) {
+ memset(dummy_crcs_buffer_.get(), 0, sizeof(PersistentStorage::Crcs));
+ }
+
+ libtextclassifier3::Status PersistStoragesToDisk(bool force) override {
+ return libtextclassifier3::Status::OK;
+ }
+
+ libtextclassifier3::Status PersistMetadataToDisk(bool force) override {
+ return libtextclassifier3::Status::OK;
+ }
+
+ libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override {
+ return Crc32(0);
+ }
+
+ libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
+ bool force) override {
+ return Crc32(0);
+ }
+
+ PersistentStorage::Crcs& crcs() override {
+ return *reinterpret_cast<PersistentStorage::Crcs*>(
+ dummy_crcs_buffer_.get());
+ }
+ const PersistentStorage::Crcs& crcs() const override {
+ return *reinterpret_cast<const PersistentStorage::Crcs*>(
+ dummy_crcs_buffer_.get());
+ }
+
+ std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>> storage_;
+ std::unique_ptr<uint8_t[]> dummy_crcs_buffer_;
+ DocumentId last_added_document_id_;
+};
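+
+// Illustrative usage sketch (hypothetical caller; the property path, key, and
+// variables below are assumptions, not part of this change). A caller would
+// create the dummy index, buffer keys through an Editor, and then commit them:
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<DummyNumericIndex<int64_t>> index,
+//       DummyNumericIndex<int64_t>::Create(filesystem, "/tmp/dummy-index"));
+//   std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+//       index->Edit("price", /*document_id=*/0, /*section_id=*/0);
+//   ICING_RETURN_IF_ERROR(editor->BufferKey(25));
+//   ICING_RETURN_IF_ERROR(std::move(*editor).IndexAllBufferedKeys());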
+
+template <typename T>
+libtextclassifier3::Status
+DummyNumericIndex<T>::Editor::IndexAllBufferedKeys() && {
+ auto property_map_iter = storage_.find(this->property_path_);
+ if (property_map_iter == storage_.end()) {
+ const auto& [inserted_iter, insert_result] =
+ storage_.insert({this->property_path_, {}});
+ if (!insert_result) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to create a new map for property \"",
+ this->property_path_, "\""));
+ }
+ property_map_iter = inserted_iter;
+ }
+
+ for (const T& key : seen_keys_) {
+ auto key_map_iter = property_map_iter->second.find(key);
+ if (key_map_iter == property_map_iter->second.end()) {
+ const auto& [inserted_iter, insert_result] =
+ property_map_iter->second.insert({key, {}});
+ if (!insert_result) {
+ return absl_ports::InternalError("Failed to create a new map for key");
+ }
+ key_map_iter = inserted_iter;
+ }
+ key_map_iter->second.push_back(
+ BasicHit(this->section_id_, this->document_id_));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename T>
+libtextclassifier3::Status DummyNumericIndex<T>::Iterator::Advance() {
+ if (pq_.empty()) {
+ return absl_ports::ResourceExhaustedError("End of iterator");
+ }
+
+ DocumentId document_id = pq_.top().GetCurrentBasicHit().document_id();
+ doc_hit_info_ = DocHitInfo(document_id);
+  // Merge all sections with the same document_id into a single DocHitInfo.
+ while (!pq_.empty() &&
+ pq_.top().GetCurrentBasicHit().document_id() == document_id) {
+ ++num_advance_calls_;
+ doc_hit_info_.UpdateSection(pq_.top().GetCurrentBasicHit().section_id());
+
+ BucketInfo info = pq_.top();
+ pq_.pop();
+
+ if (info.Advance()) {
+ pq_.push(std::move(info));
+ }
+ }
+
+ return libtextclassifier3::Status::OK;
+}
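+
+// Illustrative trace (assumed data, not from this change): suppose the map
+// holds {key=1: hits for documents 3 and 5} and {key=2: a hit for document 5}.
+// The first Advance() pops both document 5 hits from the queue and merges
+// their section ids into a single DocHitInfo; the next Advance() returns
+// document 3.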
+
+template <typename T>
+libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
+DummyNumericIndex<T>::GetIterator(std::string_view property_path, T key_lower,
+ T key_upper, const DocumentStore&,
+ const SchemaStore&, int64_t) const {
+ if (key_lower > key_upper) {
+ return absl_ports::InvalidArgumentError(
+ "key_lower should not be greater than key_upper");
+ }
+
+ auto property_map_iter = storage_.find(std::string(property_path));
+ if (property_map_iter == storage_.end()) {
+ // Return an empty iterator.
+ return std::make_unique<DocHitInfoIteratorNumeric<T>>(nullptr);
+ }
+
+ std::vector<typename Iterator::BucketInfo> bucket_info_vec;
+ for (auto key_map_iter = property_map_iter->second.lower_bound(key_lower);
+ key_map_iter != property_map_iter->second.cend() &&
+ key_map_iter->first <= key_upper;
+ ++key_map_iter) {
+ bucket_info_vec.push_back(typename Iterator::BucketInfo(key_map_iter));
+ }
+
+ return std::make_unique<DocHitInfoIteratorNumeric<T>>(
+ std::make_unique<Iterator>(key_lower, key_upper,
+ std::move(bucket_info_vec)));
+}
+
+template <typename T>
+libtextclassifier3::Status DummyNumericIndex<T>::Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ DocumentId new_last_added_document_id) {
+ std::unordered_map<std::string, std::map<T, std::vector<BasicHit>>>
+ new_storage;
+
+ for (const auto& [property_path, old_property_map] : storage_) {
+ std::map<T, std::vector<BasicHit>> new_property_map;
+ for (const auto& [key, hits] : old_property_map) {
+ for (const BasicHit& hit : hits) {
+ DocumentId old_doc_id = hit.document_id();
+ if (old_doc_id >= document_id_old_to_new.size() ||
+ document_id_old_to_new[old_doc_id] == kInvalidDocumentId) {
+ continue;
+ }
+
+ new_property_map[key].push_back(
+ BasicHit(hit.section_id(), document_id_old_to_new[old_doc_id]));
+ }
+ }
+
+ if (!new_property_map.empty()) {
+ new_storage[property_path] = std::move(new_property_map);
+ }
+ }
+
+ storage_ = std::move(new_storage);
+ last_added_document_id_ = new_last_added_document_id;
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_NUMERIC_DUMMY_NUMERIC_INDEX_H_
diff --git a/icing/index/numeric/integer-index-bucket-util.cc b/icing/index/numeric/integer-index-bucket-util.cc
new file mode 100644
index 0000000..a05baab
--- /dev/null
+++ b/icing/index/numeric/integer-index-bucket-util.cc
@@ -0,0 +1,205 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/integer-index-bucket-util.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#include "icing/index/numeric/integer-index-data.h"
+
+namespace icing {
+namespace lib {
+
+namespace integer_index_bucket_util {
+
+namespace {
+
+// Helper function to determine if the data slice [start, end) forms a "full
+// single-range bucket".
+//
+// Full single-range bucket: keys of all data are identical and # of them
+// exceeds num_data_threshold.
+//
+// REQUIRES: the data slice [start, end) is sorted by key.
+inline bool WouldBeFullSingleRangeBucket(
+ const std::vector<IntegerIndexData>::iterator& start,
+ const std::vector<IntegerIndexData>::iterator& end,
+ int32_t num_data_threshold) {
+ return std::distance(start, end) > num_data_threshold &&
+ start->key() == (end - 1)->key();
+}
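+
+// For example (illustrative): with num_data_threshold = 3, a slice of four
+// data that all have key = 7 forms a full single-range bucket, while a slice
+// of four data with keys [7, 7, 7, 8] does not.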
+
+// Helper function to determine if a bucket is full single-range.
+//
+// REQUIRES:
+//   keys of all data in [bucket.start, bucket.end) are within
+//   [bucket.key_lower, bucket.key_upper]
+inline bool IsFullSingleRangeBucket(const DataRangeAndBucketInfo& bucket,
+ int32_t num_data_threshold) {
+ return bucket.key_lower == bucket.key_upper &&
+ WouldBeFullSingleRangeBucket(bucket.start, bucket.end,
+ num_data_threshold);
+}
+
+// Helper function to append new bucket(s) with the corresponding data slice
+// for the range [curr_key_lower, last_key], where
+// last_key = (it_end - 1)->key().
+//
+// It also handles an edge case:
+// If the data slice [it_start, it_end) forms a "full single-range bucket" (see
+// WouldBeFullSingleRangeBucket for the definition), then we have to put it
+// into a single-range bucket [last_key, last_key] instead of [curr_key_lower,
+// last_key]. We also have to deal with the range
+// [curr_key_lower, last_key - 1]:
+// - If the previous bucket exists and is not a "full single-range bucket",
+//   then merge [curr_key_lower, last_key - 1] into the previous bucket, i.e.
+//   change the previous bucket's key_upper to (last_key - 1). Then we end up
+//   having:
+//   - [prev_bucket.key_lower, last_key - 1]
+//   - [last_key, last_key]
+// - Otherwise, we have to create [curr_key_lower, last_key - 1] with
+//   empty data. Then we end up having (note: prev_bucket.key_upper ==
+//   curr_key_lower - 1):
+//   - [prev_bucket.key_lower, curr_key_lower - 1]
+//   - [curr_key_lower, last_key - 1]
+//   - [last_key, last_key]
+// This avoids invoking bucket splitting too frequently.
+// For example, original_key_lower = 0, original_key_upper = 50. If we have
+// (num_data_threshold + 1) data with key = 20 and another data with key = 40:
+// - Without this handling, we would split them into [[0, 20], [21, 50]]. Then
+//   when adding data with key = 10 in the next round, we would invoke split
+//   again and split [0, 20] into [[0, 10], [11, 20]].
+// - With this handling, we split them into [[0, 19], [20, 20], [21, 50]],
+//   which avoids splitting in the next round for key = 20.
+//
+// REQUIRES: it_start < it_end
+void AppendNewBuckets(const std::vector<IntegerIndexData>::iterator& it_start,
+ const std::vector<IntegerIndexData>::iterator& it_end,
+ int64_t curr_key_lower, int32_t num_data_threshold,
+ std::vector<DataRangeAndBucketInfo>& results) {
+ int64_t last_key = (it_end - 1)->key();
+ if (curr_key_lower < last_key &&
+ WouldBeFullSingleRangeBucket(it_start, it_end, num_data_threshold)) {
+ if (!results.empty() &&
+ !IsFullSingleRangeBucket(results.back(), num_data_threshold)) {
+ // Previous bucket is not full single-range, so merge it to now hold the
+ // range [prev_bucket.key_lower, last_key - 1].
+ results.back().key_upper = last_key - 1;
+ } else {
+ // There is either no previous bucket or the previous bucket is full
+ // single-range. So add an empty bucket for the range [curr_key_lower,
+ // last_key - 1].
+ results.push_back(DataRangeAndBucketInfo(it_start, it_start,
+ curr_key_lower, last_key - 1));
+ }
+ curr_key_lower = last_key;
+ }
+ results.push_back(
+ DataRangeAndBucketInfo(it_start, it_end, curr_key_lower, last_key));
+}
+
+} // namespace
+
+std::vector<DataRangeAndBucketInfo> Split(std::vector<IntegerIndexData>& data,
+ int64_t original_key_lower,
+ int64_t original_key_upper,
+ int32_t num_data_threshold) {
+ // Early return if there is no need to split.
+ if (data.size() <= num_data_threshold) {
+ return {DataRangeAndBucketInfo(data.begin(), data.end(), original_key_lower,
+ original_key_upper)};
+ }
+
+ // Sort data by key.
+ std::sort(
+ data.begin(), data.end(),
+ [](const IntegerIndexData& lhs, const IntegerIndexData& rhs) -> bool {
+ return lhs.key() < rhs.key();
+ });
+
+ std::vector<DataRangeAndBucketInfo> results;
+ int64_t curr_key_lower = original_key_lower;
+ // Sliding window [it_start, it_end) to separate data into different buckets.
+ auto it_start = data.begin();
+ auto it_end = data.begin();
+ while (it_end != data.end()) {
+    // Attempt to extend it_end by 1, but we have to include all data with the
+    // same key, since equal keys cannot be separated into different buckets.
+    // Use extend_it_end to avoid modifying it_end directly: in some edge
+    // cases the extension in a single round is extremely large (i.e. a lot of
+    // data have the same key), and we want to separate them. For example:
+    // - key = 0: 5 data
+    // - key = 1: num_data_threshold - 1 data
+    // In the second round, # of data in the sliding window will exceed the
+    // threshold. We want to separate all data with key = 0 into a single
+    // bucket instead of putting key = 0 and key = 1 together. Using
+    // extend_it_end allows us to preserve it_end of the previous round and
+    // handle this case.
+ auto extend_it_end = it_end + 1;
+ while (extend_it_end != data.end() &&
+ it_end->key() == extend_it_end->key()) {
+ ++extend_it_end;
+ }
+
+ if (std::distance(it_start, extend_it_end) > num_data_threshold &&
+ it_start != it_end) {
+ // Split data between [it_start, it_end) into range [curr_key_lower,
+ // (it_end - 1)->key()].
+ AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold,
+ results);
+
+ // it_end at this moment won't be data.end(), so the last element of the
+ // new bucket can't have key == INT64_MAX. Therefore, it is safe to set
+ // curr_key_lower as ((it_end - 1)->key() + 1).
+ curr_key_lower = (it_end - 1)->key() + 1;
+ it_start = it_end;
+ }
+ it_end = extend_it_end;
+ }
+
+ // Handle the final range [curr_key_lower, original_key_upper].
+ if (curr_key_lower <= original_key_upper) {
+ if (it_start != it_end) {
+ AppendNewBuckets(it_start, it_end, curr_key_lower, num_data_threshold,
+ results);
+
+ // AppendNewBuckets only handles range [curr_key_lower, (it_end -
+ // 1)->key()], so we have to handle range [(it_end - 1)->key() + 1,
+ // original_key_upper] if needed.
+ int64_t last_key = (it_end - 1)->key();
+ if (last_key != std::numeric_limits<int64_t>::max() &&
+ last_key + 1 <= original_key_upper) {
+ if (!results.empty() &&
+ !IsFullSingleRangeBucket(results.back(), num_data_threshold)) {
+ results.back().key_upper = original_key_upper;
+ } else {
+ results.push_back(DataRangeAndBucketInfo(
+ it_start, it_start, last_key + 1, original_key_upper));
+ }
+ }
+ } else {
+ results.push_back(DataRangeAndBucketInfo(it_start, it_end, curr_key_lower,
+ original_key_upper));
+ }
+ }
+
+ return results;
+}
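+
+// Worked example (illustrative, assumed inputs): for keys [0, 0, 1, 1, 2, 2]
+// with original range [0, 10] and num_data_threshold = 3, each window
+// extension past a key pair would exceed the threshold, so Split produces
+// buckets [0, 0], [1, 1], and [2, 10] with data slices [0, 0], [1, 1], and
+// [2, 2] respectively (the final bucket's key_upper is extended to
+// original_key_upper).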
+
+} // namespace integer_index_bucket_util
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/integer-index-bucket-util.h b/icing/index/numeric/integer-index-bucket-util.h
new file mode 100644
index 0000000..d6fc245
--- /dev/null
+++ b/icing/index/numeric/integer-index-bucket-util.h
@@ -0,0 +1,81 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_
+#define ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "icing/index/numeric/integer-index-data.h"
+
+namespace icing {
+namespace lib {
+
+namespace integer_index_bucket_util {
+
+// A wrapper struct that contains information of a bucket.
+// - The bucket contains data within the iterator [start, end).
+// - Bucket range is [key_lower, key_upper], and all data within [start, end)
+// should have keys in the bucket range.
+//
+// Note: the caller should make sure the data vector outlives instances of
+// this wrapper struct.
+struct DataRangeAndBucketInfo {
+ std::vector<IntegerIndexData>::iterator start;
+ std::vector<IntegerIndexData>::iterator end;
+ int64_t key_lower;
+ int64_t key_upper;
+
+ explicit DataRangeAndBucketInfo(
+ std::vector<IntegerIndexData>::iterator start_in,
+ std::vector<IntegerIndexData>::iterator end_in, int64_t key_lower_in,
+ int64_t key_upper_in)
+ : start(std::move(start_in)),
+ end(std::move(end_in)),
+ key_lower(key_lower_in),
+ key_upper(key_upper_in) {}
+};
+
+// Helper function to split data (that were originally in a bucket with range
+// [original_key_lower, original_key_upper]) into different buckets according
+// to num_data_threshold.
+// - The input vector `data` will be sorted by key in ascending order (unless
+//   there's no need to split, in which case data is returned unmodified).
+// - Data with the same key will be in the same bucket even if # of them
+//   exceeds num_data_threshold.
+// - Ranges of all buckets will be disjoint, and their union will be
+//   [original_key_lower, original_key_upper].
+// - Data slices (i.e. [start, end)) can be empty.
+//
+// REQUIRES:
+// - original_key_lower < original_key_upper
+// - num_data_threshold > 0
+// - Keys of all data are in range [original_key_lower, original_key_upper]
+//
+// Returns: a vector of DataRangeAndBucketInfo containing all bucket info
+// after splitting. The returned vector contains at least one bucket; an
+// empty result is considered an error.
+std::vector<DataRangeAndBucketInfo> Split(std::vector<IntegerIndexData>& data,
+ int64_t original_key_lower,
+ int64_t original_key_upper,
+ int32_t num_data_threshold);
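+
+// Illustrative usage sketch (hypothetical caller; the threshold value below
+// is an assumption, not a constant defined by this change):
+//
+//   std::vector<IntegerIndexData> data = ...;  // data of an overflowing bucket
+//   std::vector<DataRangeAndBucketInfo> buckets =
+//       Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+//             /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+//             /*num_data_threshold=*/341);
+//   for (const DataRangeAndBucketInfo& bucket : buckets) {
+//     // Persist [bucket.start, bucket.end) into new storage for the key
+//     // range [bucket.key_lower, bucket.key_upper].
+//   }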
+
+} // namespace integer_index_bucket_util
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_NUMERIC_INTEGER_INDEX_BUCKET_UTIL_H_
diff --git a/icing/index/numeric/integer-index-bucket-util_test.cc b/icing/index/numeric/integer-index-bucket-util_test.cc
new file mode 100644
index 0000000..82c593e
--- /dev/null
+++ b/icing/index/numeric/integer-index-bucket-util_test.cc
@@ -0,0 +1,1112 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/integer-index-bucket-util.h"
+
+#include <limits>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/index/numeric/integer-index-data.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+namespace integer_index_bucket_util {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::Ne;
+using ::testing::SizeIs;
+
+static constexpr DocumentId kDefaultDocumentId = 123;
+static constexpr SectionId kDefaultSectionId = 31;
+
+TEST(IntegerIndexBucketUtilTest, Split_numDataNotDivisibleByThreshold) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)};
+ int64_t key_lower = -10;
+ int64_t key_upper = 10;
+ int32_t num_data_threshold = 3;
+ ASSERT_THAT(data.size() % num_data_threshold, Ne(0));
+
+ // Keys = [-10, -3, -2, 0, 1, 2, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, key_lower, key_upper, num_data_threshold);
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -10, key upper = -2, keys = [-10, -3, -2].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)));
+ // Bucket 1: key lower = -1, key upper = 2, keys = [0, 1, 2].
+ EXPECT_THAT(results[1].key_lower, Eq(-1));
+ EXPECT_THAT(results[1].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 2: key lower = 3, key upper = 10, keys = [10].
+ EXPECT_THAT(results[2].key_lower, Eq(3));
+ EXPECT_THAT(results[2].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest, Split_numDataDivisibleByThreshold) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)};
+ int64_t key_lower = -10;
+ int64_t key_upper = 10;
+ int32_t num_data_threshold = 3;
+ ASSERT_THAT(data.size() % num_data_threshold, Eq(0));
+
+ // Keys = [-10, -3, -2, 0, 2, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, key_lower, key_upper, num_data_threshold);
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = -10, key upper = -2, keys = [-10, -3, -2].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)));
+  // Bucket 1: key lower = -1, key upper = 10, keys = [0, 2, 10].
+ EXPECT_THAT(results[1].key_lower, Eq(-1));
+ EXPECT_THAT(results[1].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest, Split_shouldIncludeOriginalKeyRange) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)};
+ int64_t key_lower = -1000;
+ int64_t key_upper = 1000;
+ int32_t num_data_threshold = 3;
+
+ // Keys = [-10, -3, -2, 0, 1, 2, 10].
+ // Split should include the original key_lower and key_upper even if there is
+ // no key at boundary.
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, key_lower, key_upper, num_data_threshold);
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -1000, key upper = -2, keys = [-10, -3, -2].
+ EXPECT_THAT(results[0].key_lower, Eq(-1000));
+ EXPECT_THAT(results[0].key_upper, Eq(-2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)));
+ // Bucket 1: key lower = -1, key upper = 2, keys = [0, 1, 2].
+ EXPECT_THAT(results[1].key_lower, Eq(-1));
+ EXPECT_THAT(results[1].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 2: key lower = 3, key upper = 1000, keys = [10].
+ EXPECT_THAT(results[2].key_lower, Eq(3));
+ EXPECT_THAT(results[2].key_upper, Eq(1000));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest, Split_singleBucketWithoutSplitting) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)};
+ int64_t key_lower = -1000;
+ int64_t key_upper = 1000;
+ int32_t num_data_threshold = 100;
+
+ // Keys = [-10, -3, -2, 0, 1, 2, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, key_lower, key_upper, num_data_threshold);
+ ASSERT_THAT(results, SizeIs(1));
+ // Bucket 0: key lower = -1000, key upper = 1000, keys = [-10, -3, -2, 0, 1,
+  // 2, 10]. Since # of data <= threshold, the data vector won't be sorted, and
+  // thus [start, end) contains the data in their original order.
+ EXPECT_THAT(results[0].key_lower, Eq(-1000));
+ EXPECT_THAT(results[0].key_upper, Eq(1000));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)));
+}
+
+TEST(IntegerIndexBucketUtilTest, Split_emptyData) {
+ std::vector<IntegerIndexData> empty_data;
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(empty_data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(1));
+ // Bucket 0: key lower = -10, key upper = 10, keys = [].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(10));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ IsEmpty());
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_firstBucket_keyEqualsKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -10, -10, -10, -10, 0, 3, 5, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = -10 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = 0, 3, ....
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -10, key upper = -10, keys = [-10, -10, -10, -10,
+ // -10].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10)));
+ // Bucket 1: key lower = -9, key upper = 5, keys = [0, 3, 5].
+ EXPECT_THAT(results[1].key_lower, Eq(-9));
+ EXPECT_THAT(results[1].key_upper, Eq(5));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5)));
+ // Bucket 2: key lower = 6, key upper = 10, keys = [10].
+ EXPECT_THAT(results[2].key_lower, Eq(6));
+ EXPECT_THAT(results[2].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_firstBucket_keyGreaterThanKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-7, -7, -7, -7, -7, 0, 3, 5, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = -7 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = 0, 3, ....
+ // - They should be in a single range bucket [-7, -7], and another bucket
+ // [-10, -8] with empty data should be created before it.
+ ASSERT_THAT(results, SizeIs(4));
+ // Bucket 0: key lower = -10, key upper = -8, keys = [].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-8));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ IsEmpty());
+ // Bucket 1: key lower = -7, key upper = -7, keys = [-7, -7, -7, -7, -7].
+ EXPECT_THAT(results[1].key_lower, Eq(-7));
+ EXPECT_THAT(results[1].key_upper, Eq(-7));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -7)));
+ // Bucket 2: key lower = -6, key upper = 5, keys = [0, 3, 5].
+ EXPECT_THAT(results[2].key_lower, Eq(-6));
+ EXPECT_THAT(results[2].key_upper, Eq(5));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5)));
+ // Bucket 3: key lower = 6, key upper = 10, keys = [10].
+ EXPECT_THAT(results[3].key_lower, Eq(6));
+ EXPECT_THAT(results[3].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[3].start, results[3].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_midBucket_keyEqualsKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -5, -4, -4, -4, -4, -4, 5, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = -4 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = -10, -5, 5, 10.
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -10, key upper = -5, keys = [-10, -5].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-5));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5)));
+ // Bucket 1: key lower = -4, key upper = -4, keys = [-4, -4, -4, -4, -4].
+ EXPECT_THAT(results[1].key_lower, Eq(-4));
+ EXPECT_THAT(results[1].key_upper, Eq(-4));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -4)));
+ // Bucket 2: key lower = -3, key upper = 10, keys = [5, 10].
+ EXPECT_THAT(results[2].key_lower, Eq(-3));
+ EXPECT_THAT(results[2].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_midBucket_keyGreaterThanKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -5, -1, -1, -1, -1, -1, 5, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = -1 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = -10, -5, 5, 10.
+ // - They should be in a single range bucket [-1, -1], and range [-4, -2]
+ // should be merged into the previous bucket.
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -10, key upper = -2, keys = [-10, -5].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -5)));
+ // Bucket 1: key lower = -1, key upper = -1, keys = [-1, -1, -1, -1, -1].
+ EXPECT_THAT(results[1].key_lower, Eq(-1));
+ EXPECT_THAT(results[1].key_upper, Eq(-1));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1)));
+ // Bucket 2: key lower = 0, key upper = 10, keys = [5, 10].
+ EXPECT_THAT(results[2].key_lower, Eq(0));
+ EXPECT_THAT(results[2].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_lastBucket_keyEqualsKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3)};
+
+ // Keys = [-10, -3, 0, 2, 3, 3, 3, 3, 3].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = 3 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = -10, -3, 0, 2.
+ // - They should be in a single range bucket [3, 3], and another bucket
+ // [4, 10] with empty data should be created after it.
+ ASSERT_THAT(results, SizeIs(4));
+ // Bucket 0: key lower = -10, key upper = 0, keys = [-10, -3, 0].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(0));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0)));
+ // Bucket 1: key lower = 1, key upper = 2, keys = [2].
+ EXPECT_THAT(results[1].key_lower, Eq(1));
+ EXPECT_THAT(results[1].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+  // Bucket 2: key lower = 3, key upper = 3, keys = [3, 3, 3, 3, 3].
+ EXPECT_THAT(results[2].key_lower, Eq(3));
+ EXPECT_THAT(results[2].key_upper, Eq(3));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 3)));
+ // Bucket 3: key lower = 4, key upper = 10, keys = [].
+ EXPECT_THAT(results[3].key_lower, Eq(4));
+ EXPECT_THAT(results[3].key_upper, Eq(10));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[3].start, results[3].end),
+ IsEmpty());
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_lastBucket_keyWithinKeyLowerAndUpper) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6)};
+
+ // Keys = [-10, -3, 0, 2, 6, 6, 6, 6, 6].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = 6 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = -10, -3, 0, 2.
+  // - They should be in a single range bucket [6, 6]. Range [3, 5] should be
+  //   merged into the previous bucket, and another bucket [7, 10] with empty
+  //   data should be created after it.
+ ASSERT_THAT(results, SizeIs(4));
+ // Bucket 0: key lower = -10, key upper = 0, keys = [-10, -3, 0].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(0));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0)));
+ // Bucket 1: key lower = 1, key upper = 5, keys = [2].
+ EXPECT_THAT(results[1].key_lower, Eq(1));
+ EXPECT_THAT(results[1].key_upper, Eq(5));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 2: key lower = 6, key upper = 6, keys = [6, 6, 6, 6, 6].
+ EXPECT_THAT(results[2].key_lower, Eq(6));
+ EXPECT_THAT(results[2].key_upper, Eq(6));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 6)));
+ // Bucket 3: key lower = 7, key upper = 10, keys = [].
+ EXPECT_THAT(results[3].key_lower, Eq(7));
+ EXPECT_THAT(results[3].key_upper, Eq(10));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[3].start, results[3].end),
+ IsEmpty());
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_lastBucket_keyEqualsKeyUpper) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -3, 0, 2, 10, 10, 10, 10, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = 10 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be separated from key = -10, -3, 0, 2.
+ // - They should be in a single range bucket [10, 10], and range [3, 9] should
+ // be merged into the previous bucket.
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -10, key upper = 0, keys = [-10, -3, 0].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(0));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0)));
+ // Bucket 1: key lower = 1, key upper = 9, keys = [2].
+ EXPECT_THAT(results[1].key_lower, Eq(1));
+ EXPECT_THAT(results[1].key_upper, Eq(9));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 2: key lower = 10, key upper = 10, keys = [10, 10, 10, 10, 10].
+ EXPECT_THAT(results[2].key_lower, Eq(10));
+ EXPECT_THAT(results[2].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_shouldNotMergeIntoPreviousBucket) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -2, -2, -2, -2, -2, 5, 5, 5, 5, 5, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+  // - Data with key = -2 and key = 5 should each be put into their own single
+  //   range bucket.
+  // - When dealing with key = 5, range [-1, 4] should not be merged into the
+  //   previous bucket [-2, -2], because [-2, -2] also contains single-key data
+  //   exceeding the threshold. Instead, we should create bucket [-1, 4] with
+  //   empty data.
+ ASSERT_THAT(results, SizeIs(5));
+ // Bucket 0: key lower = -10, key upper = -3, keys = [-10].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-3));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId,
+ kDefaultDocumentId, -10)));
+ // Bucket 1: key lower = -2, key upper = -2, keys = [-2, -2, -2, -2, -2].
+ EXPECT_THAT(results[1].key_lower, Eq(-2));
+ EXPECT_THAT(results[1].key_upper, Eq(-2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)));
+ // Bucket 2: key lower = -1, key upper = 4, keys = [].
+ EXPECT_THAT(results[2].key_lower, Eq(-1));
+ EXPECT_THAT(results[2].key_upper, Eq(4));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ IsEmpty());
+ // Bucket 3: key lower = 5, key upper = 5, keys = [5, 5, 5, 5, 5].
+ EXPECT_THAT(results[3].key_lower, Eq(5));
+ EXPECT_THAT(results[3].key_upper, Eq(5));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[3].start, results[3].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5)));
+ // Bucket 4: key lower = 6, key upper = 10, keys = [10].
+ EXPECT_THAT(results[4].key_lower, Eq(6));
+ EXPECT_THAT(results[4].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[4].start, results[4].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_shouldMergeIntoPreviousBucket) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -8),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -8, -3, -2, -2, -2, 5, 5, 5, 5, 5, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Data with key = 5 should be put into a single bucket.
+ // - When dealing with key = 5, range [-1, 4] should be merged into the
+ // previous bucket [-2, -2] because # of data in [-2, -2] doesn't exceed the
+ // threshold.
+ ASSERT_THAT(results, SizeIs(4));
+ // Bucket 0: key lower = -10, key upper = -3, keys = [-10, -8, -3].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-3));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -8),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -3)));
+ // Bucket 1: key lower = -2, key upper = 4, keys = [-2, -2, -2].
+ EXPECT_THAT(results[1].key_lower, Eq(-2));
+ EXPECT_THAT(results[1].key_upper, Eq(4));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -2)));
+ // Bucket 2: key lower = 5, key upper = 5, keys = [5, 5, 5, 5, 5].
+ EXPECT_THAT(results[2].key_lower, Eq(5));
+ EXPECT_THAT(results[2].key_upper, Eq(5));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 5)));
+ // Bucket 3: key lower = 6, key upper = 10, keys = [10].
+ EXPECT_THAT(results[3].key_lower, Eq(6));
+ EXPECT_THAT(results[3].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[3].start, results[3].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_singleBucket_keyEqualsKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10)};
+
+ // Keys = [-10, -10, -10, -10, -10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = -10 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be in a single range bucket [-10, -10], and another bucket
+ // [-9, 10] with empty data should be created after it.
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = -10, key upper = -10, keys = [-10, -10, -10, -10,
+ // -10].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10)));
+ // Bucket 1: key lower = -9, key upper = 10, keys = [].
+ EXPECT_THAT(results[1].key_lower, Eq(-9));
+ EXPECT_THAT(results[1].key_upper, Eq(10));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ IsEmpty());
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_singleBucket_keyWithinKeyLowerAndUpper) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0)};
+
+ // Keys = [0, 0, 0, 0, 0].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = 0 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be in a single range bucket [0, 0]. Another bucket [-10, -1]
+ // with empty data should be created before it, and another bucket [1, 10]
+ // with empty data should be created after it.
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = -10, key upper = -1, keys = [].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-1));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ IsEmpty());
+ // Bucket 1: key lower = 0, key upper = 0, keys = [0, 0, 0, 0, 0].
+ EXPECT_THAT(results[1].key_lower, Eq(0));
+ EXPECT_THAT(results[1].key_upper, Eq(0));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 0)));
+ // Bucket 2: key lower = 1, key upper = 10, keys = [].
+ EXPECT_THAT(results[2].key_lower, Eq(1));
+ EXPECT_THAT(results[2].key_upper, Eq(10));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ IsEmpty());
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_sameKeysExceedingThreshold_singleBucket_keyEqualsKeyUpper) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [10, 10, 10, 10, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+ // - Even though # of data with key = 10 exceeds the threshold, they should
+ // still be in the same bucket.
+ // - They should be in a single range bucket [10, 10], and another bucket
+ // [-10, 9] with empty data should be created before it.
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = -10, key upper = 9, keys = [].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(9));
+ EXPECT_THAT(std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ IsEmpty());
+  // Bucket 1: key lower = 10, key upper = 10, keys = [10, 10, 10, 10, 10].
+ EXPECT_THAT(results[1].key_lower, Eq(10));
+ EXPECT_THAT(results[1].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_adjacentKeysTotalNumDataExceedThreshold) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [-10, -10, -1, -1, 2, 2, 10, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/-10, /*original_key_upper=*/10,
+ /*num_data_threshold=*/3);
+  // Even though the # of data with the same key is within the threshold, the
+  // total # of data of adjacent keys exceeds the threshold, so they should be
+  // separated into different buckets.
+ ASSERT_THAT(results, SizeIs(4));
+ // Bucket 0: key lower = -10, key upper = -10, keys = [-10, -10].
+ EXPECT_THAT(results[0].key_lower, Eq(-10));
+ EXPECT_THAT(results[0].key_upper, Eq(-10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10)));
+ // Bucket 1: key lower = -9, key upper = -1, keys = [-1, -1].
+ EXPECT_THAT(results[1].key_lower, Eq(-9));
+ EXPECT_THAT(results[1].key_upper, Eq(-1));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1)));
+ // Bucket 2: key lower = 0, key upper = 2, keys = [2, 2].
+ EXPECT_THAT(results[2].key_lower, Eq(0));
+ EXPECT_THAT(results[2].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 3: key lower = 3, key upper = 10, keys = [10, 10].
+ EXPECT_THAT(results[3].key_lower, Eq(3));
+ EXPECT_THAT(results[3].key_upper, Eq(10));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[3].start, results[3].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_keyLowerEqualsIntMin_smallestKeyGreaterThanKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min() + 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [INT64_MIN + 1, -10, -1, 2, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = INT64_MIN, key upper = -1, keys = [INT64_MIN + 1,
+ // -10, -1].
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(results[0].key_upper, Eq(-1));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min() + 1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1)));
+ // Bucket 1: key lower = 0, key upper = INT64_MAX, keys = [2, 10].
+ EXPECT_THAT(results[1].key_lower, Eq(0));
+ EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_keyLowerEqualsIntMin_smallestKeyEqualsKeyLower) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [INT64_MIN, -10, -1, 2, 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = INT64_MIN, key upper = -1, keys = [INT64_MIN, -10,
+ // -1].
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(results[0].key_upper, Eq(-1));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1)));
+ // Bucket 1: key lower = 0, key upper = INT64_MAX, keys = [2, 10].
+ EXPECT_THAT(results[1].key_lower, Eq(0));
+ EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_keyLowerEqualsIntMin_keyIntMinExceedingThreshold) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)};
+
+ // Keys = [INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN, -10, -1, 2,
+ // 10].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = INT64_MIN, key upper = INT64_MIN, keys = [INT64_MIN,
+ // INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN].
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(results[0].key_upper, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::min())));
+ // Bucket 1: key lower = INT64_MIN + 1, key upper = 2, keys = [-10, -1, 2].
+ EXPECT_THAT(results[1].key_lower,
+ Eq(std::numeric_limits<int64_t>::min() + 1));
+ EXPECT_THAT(results[1].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 2: key lower = 3, key upper = INT64_MAX, keys = [10].
+ EXPECT_THAT(results[2].key_lower, Eq(3));
+ EXPECT_THAT(results[2].key_upper, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_keyUpperEqualsIntMax_largestKeySmallerThanKeyUpper) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max() - 1),
+ };
+
+ // Keys = [-10, -1, 2, 10, INT64_MAX - 1].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = INT64_MIN, key upper = 2, keys = [-10, -1, 2].
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(results[0].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 1: key lower = 3, key upper = INT64_MAX, keys = [10, INT64_MAX - 1].
+ EXPECT_THAT(results[1].key_lower, Eq(3));
+ EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max() - 1)));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_keyUpperEqualsIntMax_largestKeyEqualsKeyUpper) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ };
+
+ // Keys = [-10, -1, 2, 10, INT64_MAX].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(2));
+ // Bucket 0: key lower = INT64_MIN, key upper = 2, keys = [-10, -1, 2].
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(results[0].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+ // Bucket 1: key lower = 3, key upper = INT64_MAX, keys = [10, INT64_MAX].
+ EXPECT_THAT(results[1].key_lower, Eq(3));
+ EXPECT_THAT(results[1].key_upper, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max())));
+}
+
+TEST(IntegerIndexBucketUtilTest,
+ Split_keyUpperEqualsIntMax_keyIntMaxExceedingThreshold) {
+ std::vector<IntegerIndexData> data = {
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max())};
+
+ // Keys = [-10, -1, 2, 10, INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX,
+ // INT64_MAX].
+ std::vector<DataRangeAndBucketInfo> results =
+ Split(data, /*original_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*original_key_upper=*/std::numeric_limits<int64_t>::max(),
+ /*num_data_threshold=*/3);
+ ASSERT_THAT(results, SizeIs(3));
+ // Bucket 0: key lower = INT64_MIN, key upper = 2, keys = [-10, -1, 2].
+ EXPECT_THAT(results[0].key_lower, Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(results[0].key_upper, Eq(2));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[0].start, results[0].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -10),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, -1),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 2)));
+  // Bucket 1: key lower = 3, key upper = INT64_MAX - 1, keys = [10].
+ EXPECT_THAT(results[1].key_lower, Eq(3));
+ EXPECT_THAT(results[1].key_upper,
+ Eq(std::numeric_limits<int64_t>::max() - 1));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[1].start, results[1].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId, 10)));
+ // Bucket 2: key lower = INT64_MAX, key upper = INT64_MAX, keys = [INT64_MAX,
+ // INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX].
+ EXPECT_THAT(results[2].key_lower, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(results[2].key_upper, Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(
+ std::vector<IntegerIndexData>(results[2].start, results[2].end),
+ ElementsAre(IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max()),
+ IntegerIndexData(kDefaultSectionId, kDefaultDocumentId,
+ std::numeric_limits<int64_t>::max())));
+}
+
+} // namespace
+
+} // namespace integer_index_bucket_util
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/integer-index-data.h b/icing/index/numeric/integer-index-data.h
new file mode 100644
index 0000000..92653fa
--- /dev/null
+++ b/icing/index/numeric/integer-index-data.h
@@ -0,0 +1,59 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_DATA_H_
+#define ICING_INDEX_NUMERIC_INTEGER_INDEX_DATA_H_
+
+#include <cstdint>
+
+#include "icing/index/hit/hit.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// Data wrapper to store BasicHit and key for integer index.
+class IntegerIndexData {
+ public:
+ explicit IntegerIndexData(SectionId section_id, DocumentId document_id,
+ int64_t key)
+ : basic_hit_(section_id, document_id), key_(key) {}
+
+ explicit IntegerIndexData() : basic_hit_(), key_(0) {}
+
+ const BasicHit& basic_hit() const { return basic_hit_; }
+
+ int64_t key() const { return key_; }
+
+ bool is_valid() const { return basic_hit_.is_valid(); }
+
+ bool operator<(const IntegerIndexData& other) const {
+ return basic_hit_ < other.basic_hit_;
+ }
+
+ bool operator==(const IntegerIndexData& other) const {
+ return basic_hit_ == other.basic_hit_ && key_ == other.key_;
+ }
+
+ private:
+ BasicHit basic_hit_;
+ int64_t key_;
+} __attribute__((packed));
+static_assert(sizeof(IntegerIndexData) == 12, "");
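+// Note: BasicHit stores one 4-byte encoded value, so with
+// __attribute__((packed)) the struct occupies 4 + 8 = 12 bytes rather than the
+// 16 bytes that natural int64_t alignment would require; the static_assert
+// above guards this layout.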
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_NUMERIC_INTEGER_INDEX_DATA_H_
diff --git a/icing/index/numeric/integer-index-storage.cc b/icing/index/numeric/integer-index-storage.cc
new file mode 100644
index 0000000..72e0266
--- /dev/null
+++ b/icing/index/numeric/integer-index-storage.cc
@@ -0,0 +1,1180 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/integer-index-storage.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <queue>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/doc-hit-info-iterator-numeric.h"
+#include "icing/index/numeric/integer-index-bucket-util.h"
+#include "icing/index/numeric/integer-index-data.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/index/numeric/posting-list-integer-index-accessor.h"
+#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/util/crc32.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Helper function to flush data between [it_start, it_end) into posting list(s)
+// and return posting list id.
+// Note: it will sort data between [it_start, it_end) by basic hit value, so the
+// caller should be aware that the data order will be changed after calling this
+// function.
+libtextclassifier3::StatusOr<PostingListIdentifier> FlushDataIntoPostingLists(
+ FlashIndexStorage* flash_index_storage,
+ PostingListIntegerIndexSerializer* posting_list_serializer,
+ const std::vector<IntegerIndexData>::iterator& it_start,
+ const std::vector<IntegerIndexData>::iterator& it_end) {
+ if (it_start == it_end) {
+ return PostingListIdentifier::kInvalid;
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> new_pl_accessor,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage,
+ posting_list_serializer));
+
+ std::sort(it_start, it_end);
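+  // The accessor only supports prepending, so iterate in reverse: prepending
+  // from the largest element down leaves the posting list contents in the same
+  // ascending order produced by std::sort above.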
+  for (auto it = it_end; it != it_start; --it) {
+    ICING_RETURN_IF_ERROR(new_pl_accessor->PrependData(*(it - 1)));
+  }
+
+ PostingListAccessor::FinalizeResult result =
+ std::move(*new_pl_accessor).Finalize();
+ if (!result.status.ok()) {
+ return result.status;
+ }
+ if (!result.id.is_valid()) {
+ return absl_ports::InternalError("Fail to flush data into posting list(s)");
+ }
+ return result.id;
+}
+
+// The following 4 helper functions return the correct file path of
+// metadata/sorted_buckets/unsorted_buckets/flash_index_storage, according to
+// the given working directory.
+std::string GetMetadataFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/", IntegerIndexStorage::kFilePrefix,
+ ".m");
+}
+
+std::string GetSortedBucketsFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/", IntegerIndexStorage::kFilePrefix,
+ ".s");
+}
+
+std::string GetUnsortedBucketsFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/", IntegerIndexStorage::kFilePrefix,
+ ".u");
+}
+
+std::string GetFlashIndexStorageFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/", IntegerIndexStorage::kFilePrefix,
+ ".f");
+}
+
+} // namespace
+
+// We add (BasicHit, key) pairs into a bucket in DocumentId descending and
+// SectionId ascending order. When doing a range query, we may access several
+// buckets and want to return BasicHits to callers sorted by DocumentId.
+// Therefore, this problem is actually "merge K sorted lists".
+// To implement this algorithm via priority_queue, we create this wrapper class
+// to store PostingListIntegerIndexAccessor for iterating through the posting
+// list chain.
+// - Non-relevant data (i.e. keys not in range [key_lower, key_upper]) will be
+//   skipped.
+// - Relevant BasicHits will be returned.
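+// For example, if two buckets both contain hits for DocumentIds {3, 7}, the
+// priority queue pops both DocumentId 7 iterators before either DocumentId 3
+// hit, so callers see DocumentIds in descending order across buckets.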
+class BucketPostingListIterator {
+ public:
+ class Comparator {
+ public:
+ // REQUIRES: 2 BucketPostingListIterator* instances (lhs, rhs) should be
+ // valid, i.e. the preceding AdvanceAndFilter() succeeded.
+ bool operator()(const BucketPostingListIterator* lhs,
+ const BucketPostingListIterator* rhs) const {
+ // std::priority_queue is a max heap and we should return BasicHits in
+ // DocumentId descending order.
+ // - BucketPostingListIterator::operator< should have the same order as
+ // DocumentId.
+ // - BasicHit encodes inverted document id and BasicHit::operator<
+ // compares the encoded raw value directly.
+ // - Therefore, BucketPostingListIterator::operator< should compare
+ // BasicHit reversely.
+ // - This will make priority_queue return buckets in DocumentId
+ // descending and SectionId ascending order.
+ // - Whatever direction we sort SectionId by (or pop by priority_queue)
+ // doesn't matter because all hits for the same DocumentId will be
+ // merged into a single DocHitInfo.
+ return rhs->GetCurrentBasicHit() < lhs->GetCurrentBasicHit();
+ }
+ };
+
+ explicit BucketPostingListIterator(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor)
+ : pl_accessor_(std::move(pl_accessor)),
+ should_retrieve_next_batch_(true) {}
+
+ struct AdvanceAndFilterResult {
+ libtextclassifier3::Status status = libtextclassifier3::Status::OK;
+ int32_t num_advance_calls = 0;
+ int32_t num_blocks_inspected = 0;
+ };
+ // Advances to the next relevant data. The posting list of a bucket contains
+ // keys within range [bucket.key_lower, bucket.key_upper], but some of them
+ // may be out of [query_key_lower, query_key_upper], so when advancing we have
+ // to filter out those non-relevant keys.
+ //
+ // Returns:
+ // AdvanceAndFilterResult. status will be:
+ // - OK on success
+ // - RESOURCE_EXHAUSTED_ERROR if reaching the end (i.e. no more relevant
+ // data)
+ // - Any other PostingListIntegerIndexAccessor errors
+ AdvanceAndFilterResult AdvanceAndFilter(int64_t query_key_lower,
+ int64_t query_key_upper) {
+ AdvanceAndFilterResult result;
+    // Advance curr_ until reaching relevant data (i.e. a key in range
+    // [query_key_lower, query_key_upper]).
+ do {
+ if (!should_retrieve_next_batch_) {
+ ++curr_;
+ should_retrieve_next_batch_ =
+ curr_ >= cached_batch_integer_index_data_.cend();
+ }
+ if (should_retrieve_next_batch_) {
+ auto status = GetNextDataBatch();
+ if (!status.ok()) {
+ result.status = std::move(status);
+ return result;
+ }
+ ++result.num_blocks_inspected;
+ should_retrieve_next_batch_ = false;
+ }
+ ++result.num_advance_calls;
+ } while (curr_->key() < query_key_lower || curr_->key() > query_key_upper);
+
+ return result;
+ }
+
+ const BasicHit& GetCurrentBasicHit() const { return curr_->basic_hit(); }
+
+ private:
+  // Gets the next batch of data from the posting list chain, caches it in
+  // cached_batch_integer_index_data_, and sets curr_ to the beginning of the
+  // cache.
+ libtextclassifier3::Status GetNextDataBatch() {
+ auto cached_batch_integer_index_data_or = pl_accessor_->GetNextDataBatch();
+ if (!cached_batch_integer_index_data_or.ok()) {
+ ICING_LOG(WARNING)
+ << "Fail to get next batch data from posting list due to: "
+ << cached_batch_integer_index_data_or.status().error_message();
+ return std::move(cached_batch_integer_index_data_or).status();
+ }
+
+ cached_batch_integer_index_data_ =
+ std::move(cached_batch_integer_index_data_or).ValueOrDie();
+ curr_ = cached_batch_integer_index_data_.cbegin();
+
+ if (cached_batch_integer_index_data_.empty()) {
+ return absl_ports::ResourceExhaustedError("End of iterator");
+ }
+
+ return libtextclassifier3::Status::OK;
+ }
+
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor_;
+ std::vector<IntegerIndexData> cached_batch_integer_index_data_;
+ std::vector<IntegerIndexData>::const_iterator curr_;
+ bool should_retrieve_next_batch_;
+};
+
+// Wrapper class to iterate through IntegerIndexStorage to get relevant data.
+// It uses multiple BucketPostingListIterator instances from different candidate
+// buckets and merges all relevant BasicHits from these buckets by
+// std::priority_queue in DocumentId descending order. Also different SectionIds
+// of the same DocumentId will be merged into SectionIdMask and returned as a
+// single DocHitInfo.
+class IntegerIndexStorageIterator : public NumericIndex<int64_t>::Iterator {
+ public:
+ explicit IntegerIndexStorageIterator(
+ int64_t query_key_lower, int64_t query_key_upper,
+ std::vector<std::unique_ptr<BucketPostingListIterator>>&& bucket_pl_iters)
+ : NumericIndex<int64_t>::Iterator(query_key_lower, query_key_upper),
+ num_advance_calls_(0),
+ num_blocks_inspected_(0) {
+ std::vector<BucketPostingListIterator*> bucket_pl_iters_raw_ptrs;
+ for (std::unique_ptr<BucketPostingListIterator>& bucket_pl_itr :
+ bucket_pl_iters) {
+ // Before adding BucketPostingListIterator* into the priority queue, we
+ // have to advance the bucket iterator to the first valid data since the
+ // priority queue needs valid data to compare the order.
+      // Note: it is possible that the bucket iterator fails to advance on the
+      // first round because all of its data is filtered out by
+      // [query_key_lower, query_key_upper]. In this case, just discard the
+      // iterator.
+ BucketPostingListIterator::AdvanceAndFilterResult
+ advance_and_filter_result =
+ bucket_pl_itr->AdvanceAndFilter(query_key_lower, query_key_upper);
+ if (advance_and_filter_result.status.ok()) {
+ bucket_pl_iters_raw_ptrs.push_back(bucket_pl_itr.get());
+ bucket_pl_iters_.push_back(std::move(bucket_pl_itr));
+ }
+ num_advance_calls_ += advance_and_filter_result.num_advance_calls;
+ num_blocks_inspected_ += advance_and_filter_result.num_blocks_inspected;
+ }
+
+ pq_ = std::priority_queue<BucketPostingListIterator*,
+ std::vector<BucketPostingListIterator*>,
+ BucketPostingListIterator::Comparator>(
+ comparator_, std::move(bucket_pl_iters_raw_ptrs));
+ }
+
+ ~IntegerIndexStorageIterator() override = default;
+
+ // Advances to the next DocHitInfo. Note: several BucketPostingListIterator
+ // instances may be advanced if they point to data with the same DocumentId.
+ //
+ // Returns:
+ // - OK on success
+ // - RESOURCE_EXHAUSTED_ERROR if reaching the end (i.e. no more relevant
+ // data)
+ // - Any BucketPostingListIterator errors
+ libtextclassifier3::Status Advance() override;
+
+ DocHitInfo GetDocHitInfo() const override { return doc_hit_info_; }
+
+ int32_t GetNumAdvanceCalls() const override { return num_advance_calls_; }
+
+ int32_t GetNumBlocksInspected() const override {
+ return num_blocks_inspected_;
+ }
+
+ private:
+ BucketPostingListIterator::Comparator comparator_;
+
+  // We have to fetch and pop the top BucketPostingListIterator from
+  // std::priority_queue to perform the "merge K sorted lists" algorithm.
+  // - Since std::priority_queue::pop() doesn't return the top element, we have
+  //   to call top() and pop() together.
+  // - std::moving the top() element via const_cast is not appropriate because
+  //   it introduces a transient unstable state for std::priority_queue.
+  // - We don't want to copy BucketPostingListIterator, either.
+  // - Therefore, bucket_pl_iters_ owns all BucketPostingListIterator instances
+  //   and std::priority_queue holds raw pointers, so when calling top() we can
+  //   simply copy the raw pointer and avoid any transient unstable state.
+ std::vector<std::unique_ptr<BucketPostingListIterator>> bucket_pl_iters_;
+ std::priority_queue<BucketPostingListIterator*,
+ std::vector<BucketPostingListIterator*>,
+ BucketPostingListIterator::Comparator>
+ pq_;
+
+ DocHitInfo doc_hit_info_;
+
+ int32_t num_advance_calls_;
+ int32_t num_blocks_inspected_;
+};
+
+libtextclassifier3::Status IntegerIndexStorageIterator::Advance() {
+ if (pq_.empty()) {
+ return absl_ports::ResourceExhaustedError("End of iterator");
+ }
+
+ DocumentId document_id = pq_.top()->GetCurrentBasicHit().document_id();
+ doc_hit_info_ = DocHitInfo(document_id);
+ // Merge sections with same document_id into a single DocHitInfo
+ while (!pq_.empty() &&
+ pq_.top()->GetCurrentBasicHit().document_id() == document_id) {
+ BucketPostingListIterator* bucket_itr = pq_.top();
+ pq_.pop();
+
+ libtextclassifier3::Status advance_status;
+ do {
+ doc_hit_info_.UpdateSection(
+ bucket_itr->GetCurrentBasicHit().section_id());
+ BucketPostingListIterator::AdvanceAndFilterResult
+ advance_and_filter_result =
+ bucket_itr->AdvanceAndFilter(key_lower_, key_upper_);
+ advance_status = std::move(advance_and_filter_result.status);
+ num_advance_calls_ += advance_and_filter_result.num_advance_calls;
+ num_blocks_inspected_ += advance_and_filter_result.num_blocks_inspected;
+ } while (advance_status.ok() &&
+ bucket_itr->GetCurrentBasicHit().document_id() == document_id);
+ if (advance_status.ok()) {
+ pq_.push(bucket_itr);
+ }
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+bool IntegerIndexStorage::Options::IsValid() const {
+ if (num_data_threshold_for_bucket_split <=
+ kMinNumDataThresholdForBucketSplit) {
+ return false;
+ }
+
+ if (!HasCustomInitBuckets()) {
+ return true;
+ }
+
+  // Verify that the bucket ranges are disjoint and their union is
+  // [INT64_MIN, INT64_MAX].
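+  // For example, sorted buckets {[INT64_MIN, -1]} with unsorted buckets
+  // {[0, 100], [101, INT64_MAX]} are valid, while leaving a hole (e.g.
+  // dropping [101, INT64_MAX]) or overlapping ranges (e.g. [-1, 100]) would
+  // fail the checks below.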
+ std::vector<Bucket> buckets;
+ buckets.reserve(custom_init_sorted_buckets.size() +
+ custom_init_unsorted_buckets.size());
+ buckets.insert(buckets.end(), custom_init_sorted_buckets.begin(),
+ custom_init_sorted_buckets.end());
+ buckets.insert(buckets.end(), custom_init_unsorted_buckets.begin(),
+ custom_init_unsorted_buckets.end());
+ if (buckets.empty()) {
+ return false;
+ }
+ std::sort(buckets.begin(), buckets.end());
+ int64_t prev_upper = std::numeric_limits<int64_t>::min();
+ for (int i = 0; i < buckets.size(); ++i) {
+    // key_lower should not be greater than key_upper, and an init bucket
+    // should have an invalid posting list identifier.
+ if (buckets[i].key_lower() > buckets[i].key_upper() ||
+ buckets[i].posting_list_identifier().is_valid()) {
+ return false;
+ }
+
+ // Previous upper bound should not be INT64_MAX since it is not the last
+ // bucket.
+ if (prev_upper == std::numeric_limits<int64_t>::max()) {
+ return false;
+ }
+
+ int64_t expected_lower =
+ (i == 0 ? std::numeric_limits<int64_t>::min() : prev_upper + 1);
+ if (buckets[i].key_lower() != expected_lower) {
+ return false;
+ }
+
+ prev_upper = buckets[i].key_upper();
+ }
+
+ return prev_upper == std::numeric_limits<int64_t>::max();
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+IntegerIndexStorage::Create(
+ const Filesystem& filesystem, std::string working_path, Options options,
+ PostingListIntegerIndexSerializer* posting_list_serializer) {
+ if (!options.IsValid()) {
+ return absl_ports::InvalidArgumentError(
+ "Invalid IntegerIndexStorage options");
+ }
+
+ if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) ||
+ !filesystem.FileExists(GetSortedBucketsFilePath(working_path).c_str()) ||
+ !filesystem.FileExists(
+ GetUnsortedBucketsFilePath(working_path).c_str()) ||
+ !filesystem.FileExists(
+ GetFlashIndexStorageFilePath(working_path).c_str())) {
+    // Discard working_path if any of these files is missing, and reinitialize.
+ if (filesystem.DirectoryExists(working_path.c_str())) {
+ ICING_RETURN_IF_ERROR(Discard(filesystem, working_path));
+ }
+ return InitializeNewFiles(filesystem, std::move(working_path),
+ std::move(options), posting_list_serializer);
+ }
+ return InitializeExistingFiles(filesystem, std::move(working_path),
+ std::move(options), posting_list_serializer);
+}
+
+IntegerIndexStorage::~IntegerIndexStorage() {
+ if (!PersistToDisk().ok()) {
+ ICING_LOG(WARNING)
+ << "Failed to persist hash map to disk while destructing "
+ << working_path_;
+ }
+}
+
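+// Comparator for std::lower_bound over a bucket array: finds the first bucket
+// whose key_upper is not smaller than (>=) the given key.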
+class IntegerIndexStorageComparator {
+ public:
+ bool operator()(const IntegerIndexStorage::Bucket& lhs, int64_t rhs) const {
+ return lhs.key_upper() < rhs;
+ }
+} kComparator;
+
+libtextclassifier3::Status IntegerIndexStorage::AddKeys(
+ DocumentId document_id, SectionId section_id,
+ std::vector<int64_t>&& new_keys) {
+ if (new_keys.empty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ SetDirty();
+
+ std::sort(new_keys.begin(), new_keys.end());
+
+ // Dedupe
+ auto last = std::unique(new_keys.begin(), new_keys.end());
+ new_keys.erase(last, new_keys.end());
+
+ if (static_cast<int32_t>(new_keys.size()) >
+ std::numeric_limits<int32_t>::max() - info().num_data) {
+ return absl_ports::ResourceExhaustedError(
+ "# of keys in this integer index storage exceed the limit");
+ }
+
+ // When adding keys into a bucket, we potentially split it into 2 new buckets
+ // and one of them will be added into the unsorted bucket array.
+ // When handling keys belonging to buckets in the unsorted bucket array, we
+ // don't have to (and must not) handle these newly split buckets. Therefore,
+ // collect all newly split buckets in another vector and append them into the
+ // unsorted bucket array after adding all keys.
+ std::vector<Bucket> new_buckets;
+
+ // Binary search range of the sorted bucket array.
+ const Bucket* sorted_bucket_arr_begin = sorted_buckets_->array();
+ const Bucket* sorted_bucket_arr_end =
+ sorted_buckets_->array() + sorted_buckets_->num_elements();
+
+ // Step 1: handle keys belonging to buckets in the sorted bucket array. Skip
+ // keys belonging to the unsorted bucket array and deal with them in
+ // the next step.
+ // - Iterate through new_keys by it_start.
+  // - Binary search (std::lower_bound comparing key with bucket.key_upper())
+  //   to find the first bucket in the sorted bucket array whose key_upper is
+  //   not smaller than (>=) the key.
+ // - Skip (and advance it_start) all keys smaller than the target bucket's
+ // key_lower. It means these keys belong to buckets in the unsorted bucket
+ // array and we will deal with them later.
+ // - Find it_end such that all keys within range [it_start, it_end) belong to
+ // the target bucket.
+ // - Batch add keys within range [it_start, it_end) into the target bucket.
+ auto it_start = new_keys.cbegin();
+ while (it_start != new_keys.cend() &&
+ sorted_bucket_arr_begin < sorted_bucket_arr_end) {
+ // Use std::lower_bound to find the first bucket in the sorted bucket array
+ // with key_upper >= *it_start.
+ const Bucket* target_bucket = std::lower_bound(
+ sorted_bucket_arr_begin, sorted_bucket_arr_end, *it_start, kComparator);
+ if (target_bucket >= sorted_bucket_arr_end) {
+ // Keys in range [it_start, new_keys.cend()) are greater than all sorted
+ // buckets' key_upper, so we can end step 1. In fact, they belong to
+ // buckets in the unsorted bucket array and we will deal with them in
+ // step 2.
+ break;
+ }
+
+    // Advance it_start and it_end by sequential search instead of binary
+    // search, for several reasons:
+    // - Eventually we have to iterate through all keys within range [it_start,
+    //   it_end) and add them into the posting list, so binary search doesn't
+    //   improve the overall time complexity.
+    // - Binary search may jump to far-away indices, which potentially
+    //   degrades the cache performance.
+
+    // After binary search, we've ensured *it_start <=
+    // target_bucket->key_upper(), but it is still possible that *it_start (and
+    // the next several keys) is smaller than target_bucket->key_lower(), so we
+    // have to skip them. In fact, they belong to buckets in the unsorted
+    // bucket array.
+    //
+    // For example:
+    // - sorted bucket array: [(INT64_MIN, 0), (1, 5), (100, 300), (301, 550)]
+    // - unsorted bucket array: [(551, INT64_MAX), (6, 99)]
+    // - new_keys: [10, 20, 40, 102, 150, 200, 500, 600]
+    // std::lower_bound (target = 10) will get target_bucket = (100, 300), but
+    // we have to skip 10, 20, 40 because they are smaller than 100 (the
+    // bucket's key_lower). We should move it_start to point to key 102.
+ while (it_start != new_keys.cend() &&
+ *it_start < target_bucket->key_lower()) {
+ ++it_start;
+ }
+
+ // Locate it_end such that all keys within range [it_start, it_end) belong
+ // to target_bucket and all keys outside this range don't belong to
+ // target_bucket.
+ //
+    // For example (continuing from above), we should locate it_end to point
+    // to key 500.
+ auto it_end = it_start;
+ while (it_end != new_keys.cend() && *it_end <= target_bucket->key_upper()) {
+ ++it_end;
+ }
+
+ // Now, keys within range [it_start, it_end) belong to target_bucket, so
+ // construct IntegerIndexData and add them into the bucket's posting list.
+ if (it_start != it_end) {
+ ICING_ASSIGN_OR_RETURN(
+ FileBackedVector<Bucket>::MutableView mutable_bucket,
+ sorted_buckets_->GetMutable(target_bucket -
+ sorted_buckets_->array()));
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<Bucket> round_new_buckets,
+ AddKeysIntoBucketAndSplitIfNecessary(
+ document_id, section_id, it_start, it_end, mutable_bucket));
+ new_buckets.insert(new_buckets.end(), round_new_buckets.begin(),
+ round_new_buckets.end());
+ }
+
+ it_start = it_end;
+ sorted_bucket_arr_begin = target_bucket + 1;
+ }
+
+ // Step 2: handle keys belonging to buckets in the unsorted bucket array. They
+ // were skipped in step 1.
+ // For each bucket in the unsorted bucket array, find [it_start, it_end) such
+ // that all keys within this range belong to the bucket and add them.
+ // - Binary search (std::lower_bound comparing bucket.key_lower() with key) to
+ // find it_start.
+ // - Sequential advance (start from it_start) to find it_end. Same reason as
+ // above for choosing sequential advance instead of binary search.
+ // - Add keys within range [it_start, it_end) into the bucket.
+ for (int32_t i = 0; i < unsorted_buckets_->num_elements(); ++i) {
+ ICING_ASSIGN_OR_RETURN(FileBackedVector<Bucket>::MutableView mutable_bucket,
+ unsorted_buckets_->GetMutable(i));
+ auto it_start = std::lower_bound(new_keys.cbegin(), new_keys.cend(),
+ mutable_bucket.Get().key_lower());
+ if (it_start == new_keys.cend()) {
+ continue;
+ }
+
+ // Sequential advance instead of binary search to find the correct position
+ // of it_end for the same reasons mentioned above in step 1.
+ auto it_end = it_start;
+ while (it_end != new_keys.cend() &&
+ *it_end <= mutable_bucket.Get().key_upper()) {
+ ++it_end;
+ }
+
+    // Now, keys within range [it_start, it_end) belong to the bucket, so
+ // construct IntegerIndexData and add them into the bucket's posting list.
+ if (it_start != it_end) {
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<Bucket> round_new_buckets,
+ AddKeysIntoBucketAndSplitIfNecessary(
+ document_id, section_id, it_start, it_end, mutable_bucket));
+ new_buckets.insert(new_buckets.end(), round_new_buckets.begin(),
+ round_new_buckets.end());
+ }
+ }
+
+ // Step 3: append new buckets into the unsorted bucket array.
+ if (!new_buckets.empty()) {
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<Bucket>::MutableArrayView mutable_new_arr,
+ unsorted_buckets_->Allocate(new_buckets.size()));
+ mutable_new_arr.SetArray(/*idx=*/0, new_buckets.data(), new_buckets.size());
+ }
+
+ // Step 4: sort and merge the unsorted bucket array into the sorted bucket
+ // array if the length of the unsorted bucket array exceeds the
+ // threshold.
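+  //         Keeping the unsorted bucket array short bounds the linear scans
+  //         over it in AddKeys and GetIterator while amortizing the sort cost.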
+ if (unsorted_buckets_->num_elements() > kUnsortedBucketsLengthThreshold) {
+ ICING_RETURN_IF_ERROR(SortBuckets());
+ }
+
+ info().num_data += new_keys.size();
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
+IntegerIndexStorage::GetIterator(int64_t query_key_lower,
+ int64_t query_key_upper) const {
+ if (query_key_lower > query_key_upper) {
+ return absl_ports::InvalidArgumentError(
+ "key_lower should not be greater than key_upper");
+ }
+
+ std::vector<std::unique_ptr<BucketPostingListIterator>> bucket_pl_iters;
+
+ // Sorted bucket array
+ const Bucket* sorted_bucket_arr_begin = sorted_buckets_->array();
+ const Bucket* sorted_bucket_arr_end =
+ sorted_buckets_->array() + sorted_buckets_->num_elements();
+ for (const Bucket* bucket =
+ std::lower_bound(sorted_bucket_arr_begin, sorted_bucket_arr_end,
+ query_key_lower, kComparator);
+ bucket < sorted_bucket_arr_end && bucket->key_lower() <= query_key_upper;
+ ++bucket) {
+ if (!bucket->posting_list_identifier().is_valid()) {
+ continue;
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_serializer_,
+ bucket->posting_list_identifier()));
+ bucket_pl_iters.push_back(
+ std::make_unique<BucketPostingListIterator>(std::move(pl_accessor)));
+ }
+
+ // Unsorted bucket array
+ for (int32_t i = 0; i < unsorted_buckets_->num_elements(); ++i) {
+ ICING_ASSIGN_OR_RETURN(const Bucket* bucket, unsorted_buckets_->Get(i));
+ if (query_key_upper < bucket->key_lower() ||
+ query_key_lower > bucket->key_upper() ||
+ !bucket->posting_list_identifier().is_valid()) {
+      // Skip the bucket if its range doesn't overlap with [query_key_lower,
+      // query_key_upper] or its posting_list_identifier is invalid.
+ continue;
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_serializer_,
+ bucket->posting_list_identifier()));
+ bucket_pl_iters.push_back(
+ std::make_unique<BucketPostingListIterator>(std::move(pl_accessor)));
+ }
+
+ return std::make_unique<DocHitInfoIteratorNumeric<int64_t>>(
+ std::make_unique<IntegerIndexStorageIterator>(
+ query_key_lower, query_key_upper, std::move(bucket_pl_iters)));
+}
+
+libtextclassifier3::Status IntegerIndexStorage::TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ IntegerIndexStorage* new_storage) const {
+ // Discard all pre-existing buckets in new_storage since we will append newly
+ // merged buckets gradually into new_storage.
+ if (new_storage->sorted_buckets_->num_elements() > 0) {
+ ICING_RETURN_IF_ERROR(new_storage->sorted_buckets_->TruncateTo(0));
+ }
+ if (new_storage->unsorted_buckets_->num_elements() > 0) {
+ ICING_RETURN_IF_ERROR(new_storage->unsorted_buckets_->TruncateTo(0));
+ }
+
+ // "Reference sort" the original storage buckets.
+ std::vector<std::reference_wrapper<const Bucket>> temp_buckets;
+ temp_buckets.reserve(sorted_buckets_->num_elements() +
+ unsorted_buckets_->num_elements());
+ temp_buckets.insert(
+ temp_buckets.end(), sorted_buckets_->array(),
+ sorted_buckets_->array() + sorted_buckets_->num_elements());
+ temp_buckets.insert(
+ temp_buckets.end(), unsorted_buckets_->array(),
+ unsorted_buckets_->array() + unsorted_buckets_->num_elements());
+ std::sort(temp_buckets.begin(), temp_buckets.end(),
+ [](const std::reference_wrapper<const Bucket>& lhs,
+ const std::reference_wrapper<const Bucket>& rhs) -> bool {
+ return lhs.get() < rhs.get();
+ });
+
+ const int32_t num_data_threshold_for_bucket_merge =
+ kNumDataThresholdRatioForBucketMerge *
+ new_storage->options_.num_data_threshold_for_bucket_split;
+ int64_t curr_key_lower = std::numeric_limits<int64_t>::min();
+ int64_t curr_key_upper = std::numeric_limits<int64_t>::min();
+ std::vector<IntegerIndexData> accumulated_data;
+ for (const std::reference_wrapper<const Bucket>& bucket_ref : temp_buckets) {
+ // Read all data from the bucket.
+ std::vector<IntegerIndexData> new_data;
+ if (bucket_ref.get().posting_list_identifier().is_valid()) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> old_pl_accessor,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_serializer_,
+ bucket_ref.get().posting_list_identifier()));
+
+ ICING_ASSIGN_OR_RETURN(std::vector<IntegerIndexData> batch_old_data,
+ old_pl_accessor->GetNextDataBatch());
+ while (!batch_old_data.empty()) {
+ for (const IntegerIndexData& old_data : batch_old_data) {
+ DocumentId new_document_id =
+ old_data.basic_hit().document_id() < document_id_old_to_new.size()
+ ? document_id_old_to_new[old_data.basic_hit().document_id()]
+ : kInvalidDocumentId;
+ // Transfer the document id of the hit if the document is not deleted
+ // or outdated.
+ if (new_document_id != kInvalidDocumentId) {
+ new_data.push_back(
+ IntegerIndexData(old_data.basic_hit().section_id(),
+ new_document_id, old_data.key()));
+ }
+ }
+ ICING_ASSIGN_OR_RETURN(batch_old_data,
+ old_pl_accessor->GetNextDataBatch());
+ }
+ }
+
+ // Decide whether:
+ // - Flush accumulated_data and create a new bucket for them.
+ // - OR merge new_data into accumulated_data and go to the next round.
+ if (!accumulated_data.empty() && accumulated_data.size() + new_data.size() >
+ num_data_threshold_for_bucket_merge) {
+ // TODO(b/259743562): [Optimization 3] adjust upper bound to fit more data
+ // from new_data to accumulated_data.
+ ICING_RETURN_IF_ERROR(FlushDataIntoNewSortedBucket(
+ curr_key_lower, curr_key_upper, std::move(accumulated_data),
+ new_storage));
+
+ curr_key_lower = bucket_ref.get().key_lower();
+ accumulated_data = std::move(new_data);
+ } else {
+      // We can just append to accumulated_data because
+      // FlushDataIntoNewSortedBucket will take care of sorting the contents.
+ std::move(new_data.begin(), new_data.end(),
+ std::back_inserter(accumulated_data));
+ }
+ curr_key_upper = bucket_ref.get().key_upper();
+ }
+
+  // Flush the last round of accumulated data into a bucket.
+ ICING_RETURN_IF_ERROR(
+ FlushDataIntoNewSortedBucket(curr_key_lower, curr_key_upper,
+ std::move(accumulated_data), new_storage));
+
+ return libtextclassifier3::Status::OK;
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+IntegerIndexStorage::InitializeNewFiles(
+ const Filesystem& filesystem, std::string&& working_path, Options&& options,
+ PostingListIntegerIndexSerializer* posting_list_serializer) {
+ // IntegerIndexStorage uses working_path as working directory path.
+ // Create working directory.
+ if (!filesystem.CreateDirectory(working_path.c_str())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to create directory: ", working_path));
+ }
+
+ // Initialize sorted_buckets
+ int32_t pre_mapping_mmap_size = sizeof(Bucket) * (1 << 10);
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem, GetSortedBucketsFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<Bucket>::kMaxFileSize,
+ options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
+
+ // Initialize unsorted_buckets
+ pre_mapping_mmap_size = sizeof(Bucket) * kUnsortedBucketsLengthThreshold;
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem, GetUnsortedBucketsFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<Bucket>::kMaxFileSize,
+ options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
+
+ // Initialize flash_index_storage
+ ICING_ASSIGN_OR_RETURN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path),
+ &filesystem, posting_list_serializer));
+
+ if (options.HasCustomInitBuckets()) {
+ // Insert custom init buckets.
+ std::sort(options.custom_init_sorted_buckets.begin(),
+ options.custom_init_sorted_buckets.end());
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<Bucket>::MutableArrayView
+ mutable_new_sorted_bucket_arr,
+ sorted_buckets->Allocate(options.custom_init_sorted_buckets.size()));
+ mutable_new_sorted_bucket_arr.SetArray(
+ /*idx=*/0, options.custom_init_sorted_buckets.data(),
+ options.custom_init_sorted_buckets.size());
+
+ ICING_ASSIGN_OR_RETURN(typename FileBackedVector<Bucket>::MutableArrayView
+ mutable_new_unsorted_bucket_arr,
+ unsorted_buckets->Allocate(
+ options.custom_init_unsorted_buckets.size()));
+ mutable_new_unsorted_bucket_arr.SetArray(
+ /*idx=*/0, options.custom_init_unsorted_buckets.data(),
+ options.custom_init_unsorted_buckets.size());
+
+ // After inserting buckets, we can clear vectors since there is no need to
+ // cache them.
+ options.custom_init_sorted_buckets.clear();
+ options.custom_init_unsorted_buckets.clear();
+ } else {
+ // Insert one bucket with range [INT64_MIN, INT64_MAX].
+ ICING_RETURN_IF_ERROR(sorted_buckets->Append(Bucket(
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max())));
+ }
+ ICING_RETURN_IF_ERROR(sorted_buckets->PersistToDisk());
+
+ // Initialize metadata file. Create MemoryMappedFile with pre-mapping, and
+ // call GrowAndRemapIfNecessary to grow the underlying file.
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile metadata_mmapped_file,
+ MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/kMetadataFileSize,
+ /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/kMetadataFileSize));
+ ICING_RETURN_IF_ERROR(metadata_mmapped_file.GrowAndRemapIfNecessary(
+ /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
+
+ // Create instance.
+ auto new_integer_index_storage =
+ std::unique_ptr<IntegerIndexStorage>(new IntegerIndexStorage(
+ filesystem, std::move(working_path), std::move(options),
+ posting_list_serializer,
+ std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)),
+ std::move(sorted_buckets), std::move(unsorted_buckets),
+ std::make_unique<FlashIndexStorage>(std::move(flash_index_storage))));
+ // Initialize info content by writing mapped memory directly.
+ Info& info_ref = new_integer_index_storage->info();
+ info_ref.magic = Info::kMagic;
+ info_ref.num_data = 0;
+ // Initialize new PersistentStorage. The initial checksums will be computed
+ // and set via InitializeNewStorage.
+ ICING_RETURN_IF_ERROR(new_integer_index_storage->InitializeNewStorage());
+
+ return new_integer_index_storage;
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+IntegerIndexStorage::InitializeExistingFiles(
+ const Filesystem& filesystem, std::string&& working_path, Options&& options,
+ PostingListIntegerIndexSerializer* posting_list_serializer) {
+ // Mmap the content of the crcs and info.
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile metadata_mmapped_file,
+ MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/kMetadataFileSize,
+ /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/kMetadataFileSize));
+ if (metadata_mmapped_file.available_size() != kMetadataFileSize) {
+ return absl_ports::FailedPreconditionError("Incorrect metadata file size");
+ }
+
+ // Initialize sorted_buckets
+ int32_t pre_mapping_mmap_size = sizeof(Bucket) * (1 << 10);
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem, GetSortedBucketsFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<Bucket>::kMaxFileSize,
+ options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
+
+ // Initialize unsorted_buckets
+ pre_mapping_mmap_size = sizeof(Bucket) * kUnsortedBucketsLengthThreshold;
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem, GetUnsortedBucketsFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<Bucket>::kMaxFileSize,
+ options.pre_mapping_fbv ? pre_mapping_mmap_size : 0));
+
+ // Initialize flash_index_storage
+ ICING_ASSIGN_OR_RETURN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path),
+ &filesystem, posting_list_serializer));
+
+ // Create instance.
+ auto integer_index_storage =
+ std::unique_ptr<IntegerIndexStorage>(new IntegerIndexStorage(
+ filesystem, std::move(working_path), std::move(options),
+ posting_list_serializer,
+ std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)),
+ std::move(sorted_buckets), std::move(unsorted_buckets),
+ std::make_unique<FlashIndexStorage>(std::move(flash_index_storage))));
+ // Initialize existing PersistentStorage. Checksums will be validated.
+ ICING_RETURN_IF_ERROR(integer_index_storage->InitializeExistingStorage());
+
+ // Validate other values of info and options.
+ // Magic should be consistent with the codebase.
+ if (integer_index_storage->info().magic != Info::kMagic) {
+ return absl_ports::FailedPreconditionError("Incorrect magic value");
+ }
+
+ return integer_index_storage;
+}
+
+/* static */ libtextclassifier3::Status
+IntegerIndexStorage::FlushDataIntoNewSortedBucket(
+ int64_t key_lower, int64_t key_upper, std::vector<IntegerIndexData>&& data,
+ IntegerIndexStorage* storage) {
+ storage->SetDirty();
+
+ if (data.empty()) {
+ return storage->sorted_buckets_->Append(Bucket(
+ key_lower, key_upper, PostingListIdentifier::kInvalid, /*num_data=*/0));
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ PostingListIdentifier pl_id,
+ FlushDataIntoPostingLists(storage->flash_index_storage_.get(),
+ storage->posting_list_serializer_, data.begin(),
+ data.end()));
+
+ storage->info().num_data += data.size();
+ return storage->sorted_buckets_->Append(
+ Bucket(key_lower, key_upper, pl_id, data.size()));
+}
+
+libtextclassifier3::Status IntegerIndexStorage::PersistStoragesToDisk(
+ bool force) {
+ if (!force && !is_storage_dirty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ ICING_RETURN_IF_ERROR(sorted_buckets_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(unsorted_buckets_->PersistToDisk());
+ if (!flash_index_storage_->PersistToDisk()) {
+ return absl_ports::InternalError(
+        "Failed to persist FlashIndexStorage to disk");
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status IntegerIndexStorage::PersistMetadataToDisk(
+ bool force) {
+ // We can skip persisting metadata to disk only if both info and storage are
+ // clean.
+ if (!force && !is_info_dirty() && !is_storage_dirty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ // Changes should have been applied to the underlying file when using
+ // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an
+ // extra safety step to ensure they are written out.
+ return metadata_mmapped_file_->PersistToDisk();
+}
+
+libtextclassifier3::StatusOr<Crc32> IntegerIndexStorage::ComputeInfoChecksum(
+ bool force) {
+ if (!force && !is_info_dirty()) {
+ return Crc32(crcs().component_crcs.info_crc);
+ }
+
+ return info().ComputeChecksum();
+}
+
+libtextclassifier3::StatusOr<Crc32>
+IntegerIndexStorage::ComputeStoragesChecksum(bool force) {
+ if (!force && !is_storage_dirty()) {
+ return Crc32(crcs().component_crcs.storages_crc);
+ }
+
+ // Compute crcs
+ ICING_ASSIGN_OR_RETURN(Crc32 sorted_buckets_crc,
+ sorted_buckets_->ComputeChecksum());
+ ICING_ASSIGN_OR_RETURN(Crc32 unsorted_buckets_crc,
+ unsorted_buckets_->ComputeChecksum());
+
+ // TODO(b/259744228): implement and include flash_index_storage checksum
+ return Crc32(sorted_buckets_crc.Get() ^ unsorted_buckets_crc.Get());
+}
+
+libtextclassifier3::StatusOr<std::vector<IntegerIndexStorage::Bucket>>
+IntegerIndexStorage::AddKeysIntoBucketAndSplitIfNecessary(
+ DocumentId document_id, SectionId section_id,
+ const std::vector<int64_t>::const_iterator& it_start,
+ const std::vector<int64_t>::const_iterator& it_end,
+ FileBackedVector<Bucket>::MutableView& mutable_bucket) {
+ int32_t num_data_in_bucket = mutable_bucket.Get().num_data();
+ int32_t num_new_data = std::distance(it_start, it_end);
+ if (mutable_bucket.Get().key_lower() < mutable_bucket.Get().key_upper() &&
+ num_new_data + num_data_in_bucket >
+ options_.num_data_threshold_for_bucket_split) {
+ // Split bucket.
+
+ // 1. Read all data and free all posting lists.
+ std::vector<IntegerIndexData> all_data;
+ if (mutable_bucket.Get().posting_list_identifier().is_valid()) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_serializer_,
+ mutable_bucket.Get().posting_list_identifier()));
+ ICING_ASSIGN_OR_RETURN(all_data, pl_accessor->GetAllDataAndFree());
+ }
+
+ // 2. Append all new data.
+ all_data.reserve(all_data.size() + num_new_data);
+ for (auto it = it_start; it != it_end; ++it) {
+ all_data.push_back(IntegerIndexData(section_id, document_id, *it));
+ }
+
+ // 3. Run bucket splitting algorithm to decide new buckets and dispatch
+ // data.
+ // - # of data in a full bucket =
+ // options_.num_data_threshold_for_bucket_split.
+    // - Bucket splitting logic will be invoked when adding new data
+    //   (num_new_data >= 1) into a full bucket.
+    // - In order to achieve good (amortized) time complexity, we want # of
+    //   data in new buckets to be around half_of_threshold (i.e.
+    //   options_.num_data_threshold_for_bucket_split / 2).
+    // - Using half_of_threshold as the cutoff threshold would split into
+    //   buckets with [half_of_threshold, half_of_threshold, num_new_data]
+    //   data, which is not ideal because num_new_data is usually small.
+    // - Thus, we pick (half_of_threshold + kNumDataAfterSplitAdjustment) as
+    //   the cutoff threshold to avoid over-splitting. It can tolerate
+    //   num_new_data up to (2 * kNumDataAfterSplitAdjustment) and split into
+    //   only 2 buckets (instead of 3) with
+    //   [half_of_threshold + kNumDataAfterSplitAdjustment,
+    //    half_of_threshold + (kNumDataAfterSplitAdjustment - num_new_data)].
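+    // Worked example (hypothetical numbers): with
+    // options_.num_data_threshold_for_bucket_split = 64 (the minimum) and
+    // kNumDataAfterSplitAdjustment = 5, cutoff_threshold = 64 / 2 + 5 = 37.
+    // Adding 1 new key to a full bucket of 64 data splits the 65 data into
+    // roughly [37, 28] instead of [32, 32, 1].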
+ int32_t cutoff_threshold =
+ options_.num_data_threshold_for_bucket_split / 2 +
+ kNumDataAfterSplitAdjustment;
+ std::vector<integer_index_bucket_util::DataRangeAndBucketInfo>
+ new_bucket_infos = integer_index_bucket_util::Split(
+ all_data, mutable_bucket.Get().key_lower(),
+ mutable_bucket.Get().key_upper(), cutoff_threshold);
+ if (new_bucket_infos.empty()) {
+ ICING_LOG(WARNING)
+ << "No buckets after splitting. This should not happen.";
+ return absl_ports::InternalError("Split error");
+ }
+
+ // 4. Flush data and create new buckets.
+ std::vector<Bucket> new_buckets;
+ for (int i = 0; i < new_bucket_infos.size(); ++i) {
+ int32_t num_data_in_new_bucket =
+ std::distance(new_bucket_infos[i].start, new_bucket_infos[i].end);
+ ICING_ASSIGN_OR_RETURN(
+ PostingListIdentifier pl_id,
+ FlushDataIntoPostingLists(
+ flash_index_storage_.get(), posting_list_serializer_,
+ new_bucket_infos[i].start, new_bucket_infos[i].end));
+ if (i == 0) {
+ // Reuse mutable_bucket
+ mutable_bucket.Get().set_key_lower(new_bucket_infos[i].key_lower);
+ mutable_bucket.Get().set_key_upper(new_bucket_infos[i].key_upper);
+ mutable_bucket.Get().set_posting_list_identifier(pl_id);
+ mutable_bucket.Get().set_num_data(num_data_in_new_bucket);
+ } else {
+ new_buckets.push_back(Bucket(new_bucket_infos[i].key_lower,
+ new_bucket_infos[i].key_upper, pl_id,
+ num_data_in_new_bucket));
+ }
+ }
+
+ return new_buckets;
+ }
+
+  // Otherwise, we don't need to split the bucket. Simply add all new data
+  // into the bucket.
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor;
+ if (mutable_bucket.Get().posting_list_identifier().is_valid()) {
+ ICING_ASSIGN_OR_RETURN(
+ pl_accessor, PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_serializer_,
+ mutable_bucket.Get().posting_list_identifier()));
+ } else {
+ ICING_ASSIGN_OR_RETURN(
+ pl_accessor, PostingListIntegerIndexAccessor::Create(
+ flash_index_storage_.get(), posting_list_serializer_));
+ }
+
+ for (auto it = it_start; it != it_end; ++it) {
+ ICING_RETURN_IF_ERROR(pl_accessor->PrependData(
+ IntegerIndexData(section_id, document_id, *it)));
+ }
+
+ PostingListAccessor::FinalizeResult result =
+ std::move(*pl_accessor).Finalize();
+ if (!result.status.ok()) {
+ return result.status;
+ }
+ if (!result.id.is_valid()) {
+    return absl_ports::InternalError(
+        "Failed to flush data into posting list(s)");
+ }
+
+ mutable_bucket.Get().set_posting_list_identifier(result.id);
+ // We've already verified num_new_data won't exceed the limit of the entire
+ // storage, so it is safe to add to the counter of the bucket.
+ mutable_bucket.Get().set_num_data(num_data_in_bucket + num_new_data);
+
+ return std::vector<Bucket>();
+}
+
+libtextclassifier3::Status IntegerIndexStorage::SortBuckets() {
+ if (unsorted_buckets_->num_elements() == 0) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ int32_t sorted_len = sorted_buckets_->num_elements();
+ int32_t unsorted_len = unsorted_buckets_->num_elements();
+ if (sorted_len > FileBackedVector<Bucket>::kMaxNumElements - unsorted_len) {
+ return absl_ports::OutOfRangeError(
+ "Sorted buckets length exceeds the limit after merging");
+ }
+
+ ICING_RETURN_IF_ERROR(sorted_buckets_->Allocate(unsorted_len));
+
+ // Sort unsorted_buckets_.
+ ICING_RETURN_IF_ERROR(
+ unsorted_buckets_->Sort(/*begin_idx=*/0, /*end_idx=*/unsorted_len));
+
+  // Merge unsorted_buckets_ into sorted_buckets_ and clear unsorted_buckets_.
+  // Note that we could have used std::sort + std::inplace_merge, but dealing
+  // with the FileBackedVector SetDirty logic would be more complicated, so we
+  // implement our own merge with FileBackedVector methods.
+  //
+  // Merge buckets from the back. This saves some iterations and avoids
+  // setting dirty for unchanged elements of the original sorted segment.
+  // For example, we can avoid setting dirty for elements [1, 2, 3, 5] with
+  // the following sorted/unsorted data:
+  // - sorted: [1, 2, 3, 5, 8, 13, _, _, _, _]
+  // - unsorted: [6, 10, 14, 15]
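+  // Continuing the example above (sorted_len = 6, unsorted_len = 4): merging
+  // starts at sorted_write_idx = 9 and writes 15, 14, 13, 10, 8, 6 from the
+  // back, yielding [1, 2, 3, 5, 6, 8, 10, 13, 14, 15]; elements [1, 2, 3, 5]
+  // are never rewritten, so they stay clean.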
+ int32_t sorted_write_idx = sorted_len + unsorted_len - 1;
+ int32_t sorted_curr_idx = sorted_len - 1;
+ int32_t unsorted_curr_idx = unsorted_len - 1;
+ while (unsorted_curr_idx >= 0) {
+ if (sorted_curr_idx >= 0 && unsorted_buckets_->array()[unsorted_curr_idx] <
+ sorted_buckets_->array()[sorted_curr_idx]) {
+ ICING_RETURN_IF_ERROR(sorted_buckets_->Set(
+ sorted_write_idx, sorted_buckets_->array()[sorted_curr_idx]));
+      --sorted_curr_idx;
+    } else {
+ ICING_RETURN_IF_ERROR(sorted_buckets_->Set(
+ sorted_write_idx, unsorted_buckets_->array()[unsorted_curr_idx]));
+ --unsorted_curr_idx;
+ }
+ --sorted_write_idx;
+ }
+
+ ICING_RETURN_IF_ERROR(unsorted_buckets_->TruncateTo(0));
+
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/integer-index-storage.h b/icing/index/numeric/integer-index-storage.h
new file mode 100644
index 0000000..0c1afbb
--- /dev/null
+++ b/icing/index/numeric/integer-index-storage.h
@@ -0,0 +1,506 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_STORAGE_H_
+#define ICING_INDEX_NUMERIC_INTEGER_INDEX_STORAGE_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/integer-index-data.h"
+#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// IntegerIndexStorage: a class for indexing (persistent storage) and
+// searching contents of integer type sections in documents.
+// - Accepts new integer contents (a.k.a. keys) and adds records (BasicHit,
+//   key) into the integer index.
+// - Stores records (BasicHit, key) in posting lists and compresses them.
+// - Bucketizes these records by key to make range queries more efficient and
+//   manages them with the corresponding posting lists.
+// - When a posting list reaches the max size and is full, the
+//   PostingListAccessor mechanism creates another (max-size) posting list and
+//   chains them together.
+// - Storing all records in the same PL chain would be inefficient. E.g. a
+//   small range query would need to iterate through the whole PL chain while
+//   skipping many non-relevant records (whose keys don't belong to the query
+//   range).
+// - Therefore, we implement a splitting mechanism for full max-size posting
+//   lists: split the posting list, adjust the range of the original bucket,
+//   and add new buckets.
+// - Ranges of all buckets are disjoint, and their union is [INT64_MIN,
+//   INT64_MAX].
+// - Buckets should be sorted so we can binary search for the desired
+//   bucket(s). However, we may split a bucket into several buckets, and the
+//   cost of inserting newly created buckets is high.
+// - Thus, we introduce an unsorted bucket array for newly created buckets and
+//   merge unsorted buckets into the sorted bucket array only when the length
+//   of the unsorted bucket array exceeds a threshold. This reduces the # of
+//   merge events and amortizes the overall cost of bucket order maintenance.
+//   Note: some tree data structures (e.g. segment tree, B+ tree) maintain
+//   bucket order more efficiently than the sorted/unsorted bucket array
+//   mechanism, but the implementation is more complicated and doesn't improve
+//   performance much according to our analysis, so currently we choose the
+//   sorted/unsorted bucket array.
+// - We then binary search the sorted bucket array and sequentially search the
+//   unsorted bucket array.
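+//
+// Example usage (a minimal sketch; error handling elided, and `filesystem`,
+// `working_path`, and `serializer` are assumed to be created by the caller):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<IntegerIndexStorage> storage,
+//       IntegerIndexStorage::Create(
+//           filesystem, working_path,
+//           IntegerIndexStorage::Options(
+//               IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+//               /*pre_mapping_fbv_in=*/false),
+//           serializer.get()));
+//   ICING_RETURN_IF_ERROR(storage->AddKeys(/*document_id=*/0,
+//                                          /*section_id=*/1,
+//                                          /*new_keys=*/{10, 20, 30}));
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<DocHitInfoIterator> iter,
+//       storage->GetIterator(/*query_key_lower=*/0, /*query_key_upper=*/100));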
+class IntegerIndexStorage : public PersistentStorage {
+ public:
+ struct Info {
+ static constexpr int32_t kMagic = 0x6470e547;
+
+ int32_t magic;
+ int32_t num_data;
+
+ Crc32 ComputeChecksum() const {
+ return Crc32(
+ std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
+ }
+ } __attribute__((packed));
+ static_assert(sizeof(Info) == 8, "");
+
+ // Bucket
+ class Bucket {
+ public:
+    // Absolute max # of buckets allowed. Since the absolute max file size of
+    // FileBackedVector on a 32-bit platform is ~2^28, we can have at most
+    // ~13.4M buckets. To make it a power of 2, round it down to 2^23. Also,
+    // since we're using FileBackedVector to store buckets, add some
+    // static_asserts to ensure the numbers here are compatible with
+    // FileBackedVector.
+ static constexpr int32_t kMaxNumBuckets = 1 << 23;
+
+ explicit Bucket(int64_t key_lower, int64_t key_upper,
+ PostingListIdentifier posting_list_identifier =
+ PostingListIdentifier::kInvalid,
+ int32_t num_data = 0)
+ : key_lower_(key_lower),
+ key_upper_(key_upper),
+ posting_list_identifier_(posting_list_identifier),
+ num_data_(num_data) {}
+
+ bool operator<(const Bucket& other) const {
+ return key_lower_ < other.key_lower_;
+ }
+
+ // For FileBackedVector
+ bool operator==(const Bucket& other) const {
+ return key_lower_ == other.key_lower_ && key_upper_ == other.key_upper_ &&
+ posting_list_identifier_ == other.posting_list_identifier_;
+ }
+
+ int64_t key_lower() const { return key_lower_; }
+
+ int64_t key_upper() const { return key_upper_; }
+
+ void set_key_lower(int64_t key_lower) { key_lower_ = key_lower; }
+
+ void set_key_upper(int64_t key_upper) { key_upper_ = key_upper; }
+
+ PostingListIdentifier posting_list_identifier() const {
+ return posting_list_identifier_;
+ }
+ void set_posting_list_identifier(
+ PostingListIdentifier posting_list_identifier) {
+ posting_list_identifier_ = posting_list_identifier;
+ }
+
+ int32_t num_data() const { return num_data_; }
+ void set_num_data(int32_t num_data) { num_data_ = num_data; }
+
+ private:
+ int64_t key_lower_;
+ int64_t key_upper_;
+ PostingListIdentifier posting_list_identifier_;
+ int32_t num_data_;
+ } __attribute__((packed));
+ static_assert(sizeof(Bucket) == 24, "");
+ static_assert(sizeof(Bucket) == FileBackedVector<Bucket>::kElementTypeSize,
+ "Bucket type size is inconsistent with FileBackedVector "
+ "element type size");
+ static_assert(Bucket::kMaxNumBuckets <=
+ (FileBackedVector<Bucket>::kMaxFileSize -
+ FileBackedVector<Bucket>::Header::kHeaderSize) /
+ FileBackedVector<Bucket>::kElementTypeSize,
+ "Max # of buckets cannot fit into FileBackedVector");
+
+ struct Options {
+    // - According to the benchmark results, the more buckets there are, the
+    //   higher the latency for range queries. Therefore, this number should
+    //   not be too small, to avoid splitting buckets too aggressively.
+    // - We use `num_data_threshold_for_bucket_split / 2 + 5` as the cutoff
+    //   threshold after splitting. This number cannot be too small (e.g. 10),
+    //   because then a single bucket would have a similar # of data before
+    //   and after splitting, which defeats the purpose of splitting.
+    // - For convenience, let's set 64 as the minimum value.
+ static constexpr int32_t kMinNumDataThresholdForBucketSplit = 64;
+
+ explicit Options(int32_t num_data_threshold_for_bucket_split_in,
+ bool pre_mapping_fbv_in)
+ : num_data_threshold_for_bucket_split(
+ num_data_threshold_for_bucket_split_in),
+ pre_mapping_fbv(pre_mapping_fbv_in) {}
+
+ explicit Options(std::vector<Bucket> custom_init_sorted_buckets_in,
+ std::vector<Bucket> custom_init_unsorted_buckets_in,
+ int32_t num_data_threshold_for_bucket_split_in,
+ bool pre_mapping_fbv_in)
+ : custom_init_sorted_buckets(std::move(custom_init_sorted_buckets_in)),
+ custom_init_unsorted_buckets(
+ std::move(custom_init_unsorted_buckets_in)),
+ num_data_threshold_for_bucket_split(
+ num_data_threshold_for_bucket_split_in),
+ pre_mapping_fbv(pre_mapping_fbv_in) {}
+
+ bool IsValid() const;
+
+ bool HasCustomInitBuckets() const {
+ return !custom_init_sorted_buckets.empty() ||
+ !custom_init_unsorted_buckets.empty();
+ }
+
+    // Custom buckets used when initializing new files. If both are empty,
+    // then the initial bucket is [INT64_MIN, INT64_MAX]. Usually we only set
+    // them in unit tests. Note that all buckets in custom_init_sorted_buckets
+    // and custom_init_unsorted_buckets should be disjoint and their union
+    // should be [INT64_MIN, INT64_MAX].
+ std::vector<Bucket> custom_init_sorted_buckets;
+ std::vector<Bucket> custom_init_unsorted_buckets;
+
+    // Threshold for invoking bucket splitting. If the # of data in a bucket
+    // exceeds this number after adding new data, then bucket splitting logic
+    // will be invoked.
+ //
+ // Note: num_data_threshold_for_bucket_split should be >=
+ // kMinNumDataThresholdForBucketSplit.
+ int32_t num_data_threshold_for_bucket_split;
+
+    // Flag indicating whether to memory-map the max possible file size for
+    // the underlying FileBackedVector before growing the actual file size.
+    bool pre_mapping_fbv;
+ };
+
+ // Metadata file layout: <Crcs><Info>
+ static constexpr int32_t kCrcsMetadataFileOffset = 0;
+ static constexpr int32_t kInfoMetadataFileOffset =
+ static_cast<int32_t>(sizeof(Crcs));
+ static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
+ static_assert(kMetadataFileSize == 20, "");
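+  // I.e. bytes [0, sizeof(Crcs)) hold Crcs and bytes [sizeof(Crcs), 20) hold
+  // Info; together with sizeof(Info) == 8 above, this implies
+  // sizeof(Crcs) == 12.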
+
+ static constexpr WorkingPathType kWorkingPathType =
+ WorkingPathType::kDirectory;
+ static constexpr std::string_view kFilePrefix = "integer_index_storage";
+
+ // Default # of data threshold for bucket splitting during indexing (AddKeys).
+ // When # of data in a bucket reaches this number, we will try to split data
+ // into multiple buckets according to their keys.
+ static constexpr int32_t kDefaultNumDataThresholdForBucketSplit = 65536;
+
+ // # of data threshold for bucket merging during optimization (TransferIndex)
+ // = kNumDataThresholdRatioForBucketMerge *
+ // options.num_data_threshold_for_bucket_split
+ //
+  // If the total # of data of adjacent buckets exceeds this threshold, then
+  // flush the accumulated data. Otherwise, merge the buckets and their data.
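+  // E.g. with the default split threshold (65536), the merge threshold is
+  // 0.7 * 65536 = ~45875 data.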
+ static constexpr double kNumDataThresholdRatioForBucketMerge = 0.7;
+
+  // Length threshold to sort and merge unsorted buckets into sorted buckets.
+  // If the length of unsorted_buckets exceeds the threshold, then call
+  // SortBuckets().
+  // TODO(b/259743562): decide whether to remove unsorted buckets, given that
+  // we changed the bucket splitting threshold and the # of buckets is small
+  // now.
+ static constexpr int32_t kUnsortedBucketsLengthThreshold = 5;
+
+ // Creates a new IntegerIndexStorage instance to index integers (for a single
+  // property). If any of the underlying files is missing, then delete the
+  // whole working_path and (re)initialize with new files. Otherwise,
+  // initialize and create the instance from existing files.
+ //
+ // filesystem: Object to make system level calls
+  // working_path: Specifies the working path for PersistentStorage.
+  //               IntegerIndexStorage uses the working path as its working
+  //               directory, and all related files will be stored under this
+  //               directory. It takes full ownership of working_path_,
+  //               including creation/deletion. It is the caller's
+  //               responsibility to specify a correct working path and to
+  //               avoid mixing different persistent storages together under
+  //               the same path. Also, the caller owns the parent directory
+  //               of working_path_ and is responsible for its
+  //               creation/deletion. See PersistentStorage for more details
+  //               about the concept of working_path.
+ // options: Options instance.
+ // posting_list_serializer: a PostingListIntegerIndexSerializer instance to
+ // serialize/deserialize integer index data to/from
+ // posting lists.
+ //
+ // Returns:
+ // - INVALID_ARGUMENT_ERROR if any value in options is invalid.
+ // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
+ // checksum.
+ // - INTERNAL_ERROR on I/O errors.
+ // - Any FileBackedVector/FlashIndexStorage errors.
+ static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+ Create(const Filesystem& filesystem, std::string working_path,
+ Options options,
+ PostingListIntegerIndexSerializer* posting_list_serializer);
+
+ // Deletes IntegerIndexStorage under working_path.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ static libtextclassifier3::Status Discard(const Filesystem& filesystem,
+ const std::string& working_path) {
+ return PersistentStorage::Discard(filesystem, working_path,
+ kWorkingPathType);
+ }
+
+ // Delete copy and move constructor/assignment operator.
+ IntegerIndexStorage(const IntegerIndexStorage&) = delete;
+ IntegerIndexStorage& operator=(const IntegerIndexStorage&) = delete;
+
+ IntegerIndexStorage(IntegerIndexStorage&&) = delete;
+ IntegerIndexStorage& operator=(IntegerIndexStorage&&) = delete;
+
+ ~IntegerIndexStorage() override;
+
+ // Batch adds new keys (of the same DocumentId and SectionId) into the integer
+ // index storage.
+  // Note that since we separate different property names into different
+  // integer index storages, keys from multiple sections of a single document
+  // will never be added into the same integer index storage.
+ //
+ // Returns:
+ // - OK on success
+  //   - RESOURCE_EXHAUSTED_ERROR if # of integers in this storage exceeds
+  //     INT_MAX after adding new_keys
+ // - Any FileBackedVector or PostingList errors
+ libtextclassifier3::Status AddKeys(DocumentId document_id,
+ SectionId section_id,
+ std::vector<int64_t>&& new_keys);
+
+  // Returns a DocHitInfoIteratorNumeric<int64_t> (as the DocHitInfoIterator
+  // interface type) for iterating through all docs which have the specified
+  // (integer) property contents in range [query_key_lower, query_key_upper].
+  // When iterating through all relevant doc hits, it:
+  // - Merges multiple SectionIds of doc hits with the same DocumentId into a
+  //   single SectionIdMask and constructs DocHitInfo.
+  // - Returns DocHitInfo in descending DocumentId order.
+ //
+ // Returns:
+ // - On success: a DocHitInfoIterator(Numeric)
+ // - INVALID_ARGUMENT_ERROR if query_key_lower > query_key_upper
+ // - Any FileBackedVector or PostingList errors
+ libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
+ int64_t query_key_lower, int64_t query_key_upper) const;
+
+  // Transfers integer index data from the current storage to new_storage and
+  // optimizes buckets (for new_storage only), i.e. merges adjacent buckets if
+  // the total # of data among them is less than or equal to the bucket merge
+  // threshold (kNumDataThresholdRatioForBucketMerge *
+  // options.num_data_threshold_for_bucket_split).
+  //
+  // REQUIRES: new_storage should be a newly created storage instance, i.e. it
+  // should not contain any data. Otherwise, existing data and posting lists
+  // won't be freed and space will be wasted.
+ //
+ // Returns:
+ // - OK on success
+ // - OUT_OF_RANGE_ERROR if sorted buckets length exceeds the limit after
+ // merging
+  //   - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ IntegerIndexStorage* new_storage) const;
+
+ int32_t num_data() const { return info().num_data; }
+
+ private:
+ static constexpr int8_t kNumDataAfterSplitAdjustment = 5;
+
+ explicit IntegerIndexStorage(
+ const Filesystem& filesystem, std::string&& working_path,
+ Options&& options,
+ PostingListIntegerIndexSerializer* posting_list_serializer,
+ std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,
+ std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
+ std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,
+ std::unique_ptr<FlashIndexStorage> flash_index_storage)
+ : PersistentStorage(filesystem, std::move(working_path),
+ kWorkingPathType),
+ options_(std::move(options)),
+ posting_list_serializer_(posting_list_serializer),
+ metadata_mmapped_file_(std::move(metadata_mmapped_file)),
+ sorted_buckets_(std::move(sorted_buckets)),
+ unsorted_buckets_(std::move(unsorted_buckets)),
+ flash_index_storage_(std::move(flash_index_storage)),
+ is_info_dirty_(false),
+ is_storage_dirty_(false) {}
+
+ static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+ InitializeNewFiles(
+ const Filesystem& filesystem, std::string&& working_path,
+ Options&& options,
+ PostingListIntegerIndexSerializer* posting_list_serializer);
+
+ static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+ InitializeExistingFiles(
+ const Filesystem& filesystem, std::string&& working_path,
+ Options&& options,
+ PostingListIntegerIndexSerializer* posting_list_serializer);
+
+  // Flushes data into posting list(s), creates a new bucket with range
+  // [key_lower, key_upper], and appends it to storage's sorted buckets. It is
+  // a helper function for TransferIndex.
+ //
+ // Returns:
+ // - OK on success
+  //   - INTERNAL_ERROR if it fails to write existing data into posting
+  //     list(s)
+ // - Any FileBackedVector or PostingList errors
+ static libtextclassifier3::Status FlushDataIntoNewSortedBucket(
+ int64_t key_lower, int64_t key_upper,
+ std::vector<IntegerIndexData>&& data, IntegerIndexStorage* storage);
+
+ // Flushes contents of all storages to underlying files.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status PersistStoragesToDisk(bool force) override;
+
+ // Flushes contents of metadata file.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status PersistMetadataToDisk(bool force) override;
+
+ // Computes and returns Info checksum.
+ //
+ // Returns:
+ // - Crc of the Info on success
+ libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override;
+
+  // Computes and returns the checksum of all storages. Checksums of
+  // sorted_buckets_ and unsorted_buckets_ will be combined together by XOR.
+ // TODO(b/259744228): implement and include flash_index_storage checksum
+ //
+ // Returns:
+ // - Crc of all storages on success
+ // - INTERNAL_ERROR if any data inconsistency
+ libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
+ bool force) override;
+
+ // Helper function to add keys in range [it_start, it_end) into the given
+ // bucket. It handles the bucket and its corresponding posting list(s) to make
+ // searching and indexing efficient.
+ //
+  // When the (single) posting list of the bucket is full:
+  // - If the posting list hasn't reached the max size, then simply add the
+  //   new key into it, and the PostingListAccessor mechanism will
+  //   automatically double the size of the posting list.
+  // - Else:
+  //   - If the bucket is splittable (i.e. key_lower < key_upper), then split
+  //     it into several new buckets with new ranges, and split the data
+  //     (according to their keys and the ranges of the new buckets) of the
+  //     original posting list into several new posting lists.
+  //   - Otherwise, simply add the new key into it, and the
+  //     PostingListAccessor mechanism will automatically create a new
+  //     max-size posting list and chain them.
+ //
+ // Returns:
+ // - On success: a vector of new Buckets (to add into the unsorted bucket
+ // array later)
+ // - Any FileBackedVector or PostingList errors
+ libtextclassifier3::StatusOr<std::vector<Bucket>>
+ AddKeysIntoBucketAndSplitIfNecessary(
+ DocumentId document_id, SectionId section_id,
+ const std::vector<int64_t>::const_iterator& it_start,
+ const std::vector<int64_t>::const_iterator& it_end,
+ FileBackedVector<Bucket>::MutableView& mutable_bucket);
+
+ // Merges all unsorted buckets into sorted buckets and clears unsorted
+ // buckets.
+ //
+ // Returns:
+ // - OK on success
+ // - OUT_OF_RANGE_ERROR if sorted buckets length exceeds the limit after
+ // merging
+ // - Any FileBackedVector errors
+ libtextclassifier3::Status SortBuckets();
+
+ Crcs& crcs() override {
+ return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() +
+ kCrcsMetadataFileOffset);
+ }
+
+ const Crcs& crcs() const override {
+ return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() +
+ kCrcsMetadataFileOffset);
+ }
+
+ Info& info() {
+ return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() +
+ kInfoMetadataFileOffset);
+ }
+
+ const Info& info() const {
+ return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() +
+ kInfoMetadataFileOffset);
+ }
+
+ void SetInfoDirty() { is_info_dirty_ = true; }
+  // When the storage is dirty, we have to set the info dirty as well, so just
+  // expose SetDirty to set both.
+ void SetDirty() {
+ is_info_dirty_ = true;
+ is_storage_dirty_ = true;
+ }
+
+ bool is_info_dirty() const { return is_info_dirty_; }
+ bool is_storage_dirty() const { return is_storage_dirty_; }
+
+ Options options_;
+
+ PostingListIntegerIndexSerializer* posting_list_serializer_; // Does not own.
+
+ std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_;
+ std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets_;
+ std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets_;
+ std::unique_ptr<FlashIndexStorage> flash_index_storage_;
+
+ bool is_info_dirty_;
+ bool is_storage_dirty_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_NUMERIC_INTEGER_INDEX_STORAGE_H_
diff --git a/icing/index/numeric/integer-index-storage_benchmark.cc b/icing/index/numeric/integer-index-storage_benchmark.cc
new file mode 100644
index 0000000..85d381d
--- /dev/null
+++ b/icing/index/numeric/integer-index-storage_benchmark.cc
@@ -0,0 +1,407 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/destructible-directory.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/integer-index-storage.h"
+#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/numeric/normal-distribution-number-generator.h"
+#include "icing/testing/numeric/number-generator.h"
+#include "icing/testing/numeric/uniform-distribution-integer-generator.h"
+#include "icing/testing/tmp-directory.h"
+
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/index/numeric:integer-index-storage_benchmark
+//
+// $ blaze-bin/icing/index/numeric/integer-index-storage_benchmark
+// --benchmark_filter=all --benchmark_memory_usage
+//
+// Run on an Android device:
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/index/numeric:integer-index-storage_benchmark
+//
+// $ adb push
+// blaze-bin/icing/index/numeric/integer-index-storage_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/integer-index-storage_benchmark
+// --benchmark_filter=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+
+static constexpr int32_t kNumDataThresholdForBucketSplit =
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit;
+static constexpr bool kPreMappingFbv = true;
+
+static constexpr SectionId kDefaultSectionId = 12;
+static constexpr int kDefaultSeed = 12345;
+
+enum DistributionTypeEnum {
+ kUniformDistribution,
+ kNormalDistribution,
+};
+
+class IntegerIndexStorageBenchmark {
+ public:
+ Filesystem filesystem;
+ std::string working_path;
+
+ PostingListIntegerIndexSerializer posting_list_serializer;
+
+ explicit IntegerIndexStorageBenchmark()
+ : working_path(GetTestTempDir() + "/integer_index_benchmark") {}
+
+ ~IntegerIndexStorageBenchmark() {
+ filesystem.DeleteDirectoryRecursively(working_path.c_str());
+ }
+};
+
+libtextclassifier3::StatusOr<std::unique_ptr<NumberGenerator<int64_t>>>
+CreateIntegerGenerator(DistributionTypeEnum distribution_type, int seed,
+ int num_keys) {
+ switch (distribution_type) {
+ case DistributionTypeEnum::kUniformDistribution:
+      // Since the collision # follows a Poisson distribution with lambda =
+      // (num_keys / range), we set the range to 10x num_keys (lambda = 0.1)
+      // to avoid too many collisions.
+      //
+      // Distribution:
+      // - keys in range picked 0 times: 90.5%
+      // - keys in range picked 1 time: 9%
+      // - keys in range picked 2 times: 0.45%
+      // - keys in range picked 3 times: 0.015%
+      //
+      // For example, num_keys = 1M, range = 10M. Then ~904837 keys will be
+      // picked exactly once, 45242 keys twice, and 1508 keys thrice ...
+ return std::make_unique<UniformDistributionIntegerGenerator<int64_t>>(
+ seed, /*range_lower=*/0,
+ /*range_upper=*/static_cast<int64_t>(num_keys) * 10 - 1);
+ case DistributionTypeEnum::kNormalDistribution:
+ // Normal distribution with mean = 0 and stddev = num_keys / 1024.
+ // - keys in range [-1 * stddev, 1 * stddev]: 68.2%
+ // - keys in range [-2 * stddev, 2 * stddev]: 95.4%
+ // - keys in range [-3 * stddev, 3 * stddev]: 99.7%
+ //
+      // - When generating num_keys integers, 68.2% of them will be in the
+      //   range [-num_keys / 1024, num_keys / 1024].
+      // - Each number in this range will be sampled (num_keys * 0.682) /
+      //   ((num_keys / 1024) * 2) = 349 times on average and become a
+      //   "single-range bucket".
+ return std::make_unique<NormalDistributionNumberGenerator<int64_t>>(
+ seed, /*mean=*/0.0, /*stddev=*/num_keys / 1024.0);
+ default:
+ return absl_ports::InvalidArgumentError("Unknown type");
+ }
+}
+
+void BM_Index(benchmark::State& state) {
+ DistributionTypeEnum distribution_type =
+ static_cast<DistributionTypeEnum>(state.range(0));
+ int num_keys = state.range(1);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumberGenerator<int64_t>> generator,
+ CreateIntegerGenerator(distribution_type, kDefaultSeed, num_keys));
+ std::vector<int64_t> keys(num_keys);
+ for (int i = 0; i < num_keys; ++i) {
+ keys[i] = generator->Generate();
+ }
+
+ IntegerIndexStorageBenchmark benchmark;
+ for (auto _ : state) {
+ state.PauseTiming();
+ benchmark.filesystem.DeleteDirectoryRecursively(
+ benchmark.working_path.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ benchmark.filesystem, benchmark.working_path,
+ IntegerIndexStorage::Options(kNumDataThresholdForBucketSplit,
+ kPreMappingFbv),
+ &benchmark.posting_list_serializer));
+ state.ResumeTiming();
+
+ for (int i = 0; i < num_keys; ++i) {
+ ICING_ASSERT_OK(storage->AddKeys(static_cast<DocumentId>(i),
+ kDefaultSectionId, {keys[i]}));
+ }
+ ICING_ASSERT_OK(storage->PersistToDisk());
+
+ state.PauseTiming();
+ storage.reset();
+ state.ResumeTiming();
+ }
+}
+BENCHMARK(BM_Index)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20);
+
+void BM_BatchIndex(benchmark::State& state) {
+ DistributionTypeEnum distribution_type =
+ static_cast<DistributionTypeEnum>(state.range(0));
+ int num_keys = state.range(1);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumberGenerator<int64_t>> generator,
+ CreateIntegerGenerator(distribution_type, kDefaultSeed, num_keys));
+ std::vector<int64_t> keys(num_keys);
+ for (int i = 0; i < num_keys; ++i) {
+ keys[i] = generator->Generate();
+ }
+
+ IntegerIndexStorageBenchmark benchmark;
+ for (auto _ : state) {
+ state.PauseTiming();
+ benchmark.filesystem.DeleteDirectoryRecursively(
+ benchmark.working_path.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ benchmark.filesystem, benchmark.working_path,
+ IntegerIndexStorage::Options(kNumDataThresholdForBucketSplit,
+ kPreMappingFbv),
+ &benchmark.posting_list_serializer));
+ std::vector<int64_t> keys_copy(keys);
+ state.ResumeTiming();
+
+ ICING_ASSERT_OK(storage->AddKeys(static_cast<DocumentId>(0),
+ kDefaultSectionId, std::move(keys_copy)));
+ ICING_ASSERT_OK(storage->PersistToDisk());
+
+ state.PauseTiming();
+ storage.reset();
+ state.ResumeTiming();
+ }
+}
+BENCHMARK(BM_BatchIndex)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20);
+
+void BM_ExactQuery(benchmark::State& state) {
+ DistributionTypeEnum distribution_type =
+ static_cast<DistributionTypeEnum>(state.range(0));
+ int num_keys = state.range(1);
+
+ IntegerIndexStorageBenchmark benchmark;
+ benchmark.filesystem.DeleteDirectoryRecursively(
+ benchmark.working_path.c_str());
+ DestructibleDirectory ddir(&benchmark.filesystem, benchmark.working_path);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ benchmark.filesystem, benchmark.working_path,
+ IntegerIndexStorage::Options(kNumDataThresholdForBucketSplit,
+ kPreMappingFbv),
+ &benchmark.posting_list_serializer));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumberGenerator<int64_t>> generator,
+ CreateIntegerGenerator(distribution_type, kDefaultSeed, num_keys));
+ std::unordered_map<int64_t, std::vector<DocumentId>> keys;
+ for (int i = 0; i < num_keys; ++i) {
+ int64_t key = generator->Generate();
+ keys[key].push_back(static_cast<DocumentId>(i));
+ ICING_ASSERT_OK(
+ storage->AddKeys(static_cast<DocumentId>(i), kDefaultSectionId, {key}));
+ }
+ ICING_ASSERT_OK(storage->PersistToDisk());
+
+ for (auto _ : state) {
+ int64_t exact_query_key = generator->Generate();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> iterator,
+ storage->GetIterator(/*query_key_lower=*/exact_query_key,
+ /*query_key_upper=*/exact_query_key));
+ std::vector<DocHitInfo> data;
+ while (iterator->Advance().ok()) {
+ data.push_back(iterator->doc_hit_info());
+ }
+
+ state.PauseTiming();
+ const auto it = keys.find(exact_query_key);
+ if (it == keys.end()) {
+ ASSERT_THAT(data, IsEmpty());
+ } else {
+ ASSERT_THAT(data, SizeIs(it->second.size()));
+ std::reverse(data.begin(), data.end());
+ for (int i = 0; i < data.size(); ++i) {
+ ASSERT_THAT(data[i].document_id(), Eq(it->second[i]));
+ ASSERT_THAT(data[i].hit_section_ids_mask(), Eq(1 << kDefaultSectionId));
+ }
+ }
+ state.ResumeTiming();
+ }
+}
+BENCHMARK(BM_ExactQuery)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20);
+
+void BM_RangeQueryAll(benchmark::State& state) {
+ DistributionTypeEnum distribution_type =
+ static_cast<DistributionTypeEnum>(state.range(0));
+ int num_keys = state.range(1);
+
+ IntegerIndexStorageBenchmark benchmark;
+ benchmark.filesystem.DeleteDirectoryRecursively(
+ benchmark.working_path.c_str());
+ DestructibleDirectory ddir(&benchmark.filesystem, benchmark.working_path);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ benchmark.filesystem, benchmark.working_path,
+ IntegerIndexStorage::Options(kNumDataThresholdForBucketSplit,
+ kPreMappingFbv),
+ &benchmark.posting_list_serializer));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumberGenerator<int64_t>> generator,
+ CreateIntegerGenerator(distribution_type, kDefaultSeed, num_keys));
+ for (int i = 0; i < num_keys; ++i) {
+ ICING_ASSERT_OK(storage->AddKeys(static_cast<DocumentId>(i),
+ kDefaultSectionId,
+ {generator->Generate()}));
+ }
+ ICING_ASSERT_OK(storage->PersistToDisk());
+
+ for (auto _ : state) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> iterator,
+ storage->GetIterator(
+ /*query_key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*query_key_upper=*/std::numeric_limits<int64_t>::max()));
+ std::vector<DocHitInfo> data;
+ while (iterator->Advance().ok()) {
+ data.push_back(iterator->doc_hit_info());
+ }
+
+ ASSERT_THAT(data, SizeIs(num_keys));
+ }
+}
+BENCHMARK(BM_RangeQueryAll)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kUniformDistribution, 1 << 20)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 10)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 11)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 12)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 13)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 14)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 15)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 16)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 17)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 18)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 19)
+ ->ArgPair(DistributionTypeEnum::kNormalDistribution, 1 << 20);
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/integer-index-storage_test.cc b/icing/index/numeric/integer-index-storage_test.cc
new file mode 100644
index 0000000..a632bc8
--- /dev/null
+++ b/icing/index/numeric/integer-index-storage_test.cc
@@ -0,0 +1,2161 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/integer-index-storage.h"
+
+#include <unistd.h>
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Contains;
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::Ge;
+using ::testing::Gt;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+using ::testing::Key;
+using ::testing::Le;
+using ::testing::Lt;
+using ::testing::Ne;
+using ::testing::Not;
+
+using Bucket = IntegerIndexStorage::Bucket;
+using Crcs = PersistentStorage::Crcs;
+using Info = IntegerIndexStorage::Info;
+using Options = IntegerIndexStorage::Options;
+
+static constexpr int32_t kCorruptedValueOffset = 3;
+static constexpr DocumentId kDefaultDocumentId = 123;
+static constexpr SectionId kDefaultSectionId = 31;
+
+class IntegerIndexStorageTest : public ::testing::TestWithParam<bool> {
+ protected:
+ void SetUp() override {
+ base_dir_ = GetTestTempDir() + "/icing";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ working_path_ = base_dir_ + "/integer_index_storage_test";
+
+ serializer_ = std::make_unique<PostingListIntegerIndexSerializer>();
+ }
+
+ void TearDown() override {
+ serializer_.reset();
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ std::string base_dir_;
+ std::string working_path_;
+ std::unique_ptr<PostingListIntegerIndexSerializer> serializer_;
+};
+
+libtextclassifier3::StatusOr<std::vector<DocHitInfo>> Query(
+ const IntegerIndexStorage* storage, int64_t key_lower, int64_t key_upper) {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iter,
+ storage->GetIterator(key_lower, key_upper));
+ std::vector<DocHitInfo> hits;
+ while (iter->Advance().ok()) {
+ hits.push_back(iter->doc_hit_info());
+ }
+ return hits;
+}
+
+TEST_P(IntegerIndexStorageTest, OptionsEmptyCustomInitBucketsShouldBeValid) {
+ EXPECT_THAT(
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsTrue());
+}
+
+TEST_P(IntegerIndexStorageTest, OptionsInvalidNumDataThresholdForBucketSplit) {
+ EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{},
+ /*custom_init_unsorted_buckets_in=*/{},
+ /*num_data_threshold_for_bucket_split=*/-1,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+ EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{},
+ /*custom_init_unsorted_buckets_in=*/{},
+ /*num_data_threshold_for_bucket_split=*/0,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+ EXPECT_THAT(Options(/*custom_init_sorted_buckets_in=*/{},
+ /*custom_init_unsorted_buckets_in=*/{},
+ /*num_data_threshold_for_bucket_split=*/63,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+}
+
+TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsRange) {
+ // Invalid custom init sorted bucket
+ EXPECT_THAT(
+ Options(/*custom_init_sorted_buckets_in=*/
+ {Bucket(std::numeric_limits<int64_t>::min(), 5), Bucket(9, 6)},
+ /*custom_init_unsorted_buckets_in=*/
+ {Bucket(10, std::numeric_limits<int64_t>::max())},
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+
+ // Invalid custom init unsorted bucket
+ EXPECT_THAT(
+ Options(/*custom_init_sorted_buckets_in=*/
+ {Bucket(10, std::numeric_limits<int64_t>::max())},
+ /*custom_init_unsorted_buckets_in=*/
+ {Bucket(std::numeric_limits<int64_t>::min(), 5), Bucket(9, 6)},
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+}
+
+TEST_P(IntegerIndexStorageTest,
+ OptionsInvalidCustomInitBucketsPostingListIdentifier) {
+  // Custom init buckets must have an invalid posting list identifier, so
+  // options containing a bucket with a valid identifier should be invalid.
+ PostingListIdentifier valid_posting_list_identifier(0, 0, 0);
+ ASSERT_THAT(valid_posting_list_identifier.is_valid(), IsTrue());
+
+ // Invalid custom init sorted bucket
+ EXPECT_THAT(
+ Options(/*custom_init_sorted_buckets_in=*/
+ {Bucket(std::numeric_limits<int64_t>::min(),
+ std::numeric_limits<int64_t>::max(),
+ valid_posting_list_identifier)},
+ /*custom_init_unsorted_buckets_in=*/{},
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+
+ // Invalid custom init unsorted bucket
+ EXPECT_THAT(
+ Options(/*custom_init_sorted_buckets_in=*/{},
+ /*custom_init_unsorted_buckets_in=*/
+ {Bucket(std::numeric_limits<int64_t>::min(),
+ std::numeric_limits<int64_t>::max(),
+ valid_posting_list_identifier)},
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+}
+
+TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsOverlapping) {
+ // sorted buckets overlap
+ EXPECT_THAT(
+ Options(/*custom_init_sorted_buckets_in=*/
+ {Bucket(std::numeric_limits<int64_t>::min(), -100),
+ Bucket(-100, std::numeric_limits<int64_t>::max())},
+ /*custom_init_unsorted_buckets_in=*/{},
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+
+ // unsorted buckets overlap
+ EXPECT_THAT(
+ Options(/*custom_init_sorted_buckets_in=*/{},
+ /*custom_init_unsorted_buckets_in=*/
+ {Bucket(-100, std::numeric_limits<int64_t>::max()),
+ Bucket(std::numeric_limits<int64_t>::min(), -100)},
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+
+ // Cross buckets overlap
+ EXPECT_THAT(
+ Options(/*custom_init_sorted_buckets_in=*/
+ {Bucket(std::numeric_limits<int64_t>::min(), -100),
+ Bucket(-99, 0)},
+ /*custom_init_unsorted_buckets_in=*/
+ {Bucket(200, std::numeric_limits<int64_t>::max()), Bucket(0, 50),
+ Bucket(51, 199)},
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+}
+
+TEST_P(IntegerIndexStorageTest, OptionsInvalidCustomInitBucketsUnion) {
+ // Missing INT64_MAX
+ EXPECT_THAT(
+ Options(/*custom_init_sorted_buckets_in=*/
+ {Bucket(std::numeric_limits<int64_t>::min(), -100),
+ Bucket(-99, 0)},
+ /*custom_init_unsorted_buckets_in=*/{Bucket(1, 1000)},
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+
+ // Missing INT64_MIN
+ EXPECT_THAT(
+ Options(/*custom_init_sorted_buckets_in=*/
+ {Bucket(-200, -100), Bucket(-99, 0)},
+ /*custom_init_unsorted_buckets_in=*/
+ {Bucket(1, std::numeric_limits<int64_t>::max())},
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+
+ // Missing some intermediate ranges
+ EXPECT_THAT(
+ Options(/*custom_init_sorted_buckets_in=*/
+ {Bucket(std::numeric_limits<int64_t>::min(), -100)},
+ /*custom_init_unsorted_buckets_in=*/
+ {Bucket(1, std::numeric_limits<int64_t>::max())},
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam())
+ .IsValid(),
+ IsFalse());
+}
+
+TEST_P(IntegerIndexStorageTest, InvalidWorkingPath) {
+ EXPECT_THAT(
+ IntegerIndexStorage::Create(
+ filesystem_, "/dev/null/integer_index_storage_test",
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_P(IntegerIndexStorageTest, CreateWithInvalidOptionsShouldFail) {
+ Options invalid_options(
+ /*custom_init_sorted_buckets_in=*/{},
+ /*custom_init_unsorted_buckets_in=*/
+ {Bucket(-100, std::numeric_limits<int64_t>::max()),
+ Bucket(std::numeric_limits<int64_t>::min(), -100)},
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam());
+ ASSERT_THAT(invalid_options.IsValid(), IsFalse());
+
+ EXPECT_THAT(IntegerIndexStorage::Create(filesystem_, working_path_,
+ invalid_options, serializer_.get()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(IntegerIndexStorageTest, InitializeNewFiles) {
+ {
+ // Create new integer index storage
+ ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ ICING_ASSERT_OK(storage->PersistToDisk());
+ }
+
+ // Metadata file should be initialized correctly for both info and crcs
+ // sections.
+ const std::string metadata_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ // Check info section
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ IntegerIndexStorage::kInfoMetadataFileOffset));
+ EXPECT_THAT(info.magic, Eq(Info::kMagic));
+ EXPECT_THAT(info.num_data, Eq(0));
+
+ // Check crcs section
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ IntegerIndexStorage::kCrcsMetadataFileOffset));
+  // sorted_buckets should contain exactly 1 element, so the storages crc
+  // value should be non-zero.
+ EXPECT_THAT(crcs.component_crcs.storages_crc, Ne(0));
+ EXPECT_THAT(crcs.component_crcs.info_crc,
+ Eq(Crc32(std::string_view(reinterpret_cast<const char*>(&info),
+ sizeof(Info)))
+ .Get()));
+ EXPECT_THAT(crcs.all_crc,
+ Eq(Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs.component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get()));
+}
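+
+// A sketch of the checksum hierarchy asserted above (hedged: storages_crc is
+// only checked to be non-zero here; the authoritative definitions live in
+// IntegerIndexStorage's metadata classes):
+//   info_crc     = Crc32(raw bytes of Info)
+//   storages_crc = combined crc over the bucket vectors and flash index
+//                  storage (non-zero here since sorted_buckets starts with
+//                  one initial bucket)
+//   all_crc      = Crc32(raw bytes of Crcs::ComponentCrcs)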
+
+TEST_P(IntegerIndexStorageTest,
+ InitializationShouldFailWithoutPersistToDiskOrDestruction) {
+ // Create new integer index storage
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Insert some data.
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/0, /*section_id=*/20,
+ /*new_keys=*/{0, 100, -100}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/1, /*section_id=*/2,
+ /*new_keys=*/{3, -1000, 500}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/2, /*section_id=*/15,
+ /*new_keys=*/{-6, 321, 98}));
+
+ // Without calling PersistToDisk, checksums will not be recomputed or synced
+ // to disk, so initializing another instance on the same files should fail.
+ EXPECT_THAT(
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_P(IntegerIndexStorageTest, InitializationShouldSucceedWithPersistToDisk) {
+ // Create new integer index storage
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage1,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Insert some data.
+ ICING_ASSERT_OK(storage1->AddKeys(/*document_id=*/0, /*section_id=*/20,
+ /*new_keys=*/{0, 100, -100}));
+ ICING_ASSERT_OK(storage1->AddKeys(/*document_id=*/1, /*section_id=*/2,
+ /*new_keys=*/{3, -1000, 500}));
+ ICING_ASSERT_OK(storage1->AddKeys(/*document_id=*/2, /*section_id=*/15,
+ /*new_keys=*/{-6, 321, 98}));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<DocHitInfo> doc_hit_info_vec,
+ Query(storage1.get(),
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()));
+
+ // After calling PersistToDisk, all checksums should be recomputed and synced
+ // correctly to disk, so initializing another instance on the same files
+ // should succeed, and we should be able to get the same contents.
+ ICING_EXPECT_OK(storage1->PersistToDisk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage2,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ EXPECT_THAT(
+ Query(storage2.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(
+ ElementsAreArray(doc_hit_info_vec.begin(), doc_hit_info_vec.end())));
+}
+
+TEST_P(IntegerIndexStorageTest, InitializationShouldSucceedAfterDestruction) {
+ std::vector<DocHitInfo> doc_hit_info_vec;
+ {
+ // Create new integer index storage
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Insert some data.
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/0, /*section_id=*/20,
+ /*new_keys=*/{0, 100, -100}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/1, /*section_id=*/2,
+ /*new_keys=*/{3, -1000, 500}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/2, /*section_id=*/15,
+ /*new_keys=*/{-6, 321, 98}));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ doc_hit_info_vec,
+ Query(storage.get(),
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()));
+ }
+
+ {
+    // The previous instance went out of scope and was destroyed. Although we
+    // didn't call PersistToDisk explicitly, the destructor should invoke it,
+    // so initializing another instance on the same files should succeed and
+    // return the same contents.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ EXPECT_THAT(
+ Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(ElementsAreArray(doc_hit_info_vec.begin(),
+ doc_hit_info_vec.end())));
+ }
+}
+
+TEST_P(IntegerIndexStorageTest,
+ InitializeExistingFilesWithWrongAllCrcShouldFail) {
+ {
+ // Create new integer index storage
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId,
+ /*new_keys=*/{0, 100, -100}));
+
+ ICING_ASSERT_OK(storage->PersistToDisk());
+ }
+
+ const std::string metadata_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ IntegerIndexStorage::kCrcsMetadataFileOffset));
+
+ // Manually corrupt all_crc
+ crcs.all_crc += kCorruptedValueOffset;
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(),
+ IntegerIndexStorage::kCrcsMetadataFileOffset,
+ &crcs, sizeof(Crcs)));
+ metadata_sfd.reset();
+
+ {
+ // Attempt to create the integer index storage with metadata containing
+ // corrupted all_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+ storage_or = IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get());
+ EXPECT_THAT(storage_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(storage_or.status().error_message(),
+ HasSubstr("Invalid all crc"));
+ }
+}
+
+TEST_P(IntegerIndexStorageTest,
+ InitializeExistingFilesWithCorruptedInfoShouldFail) {
+ {
+ // Create new integer index storage
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId,
+ /*new_keys=*/{0, 100, -100}));
+
+ ICING_ASSERT_OK(storage->PersistToDisk());
+ }
+
+ const std::string metadata_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ IntegerIndexStorage::kInfoMetadataFileOffset));
+
+  // Modify info without updating its checksum, simulating corruption of the
+  // info section.
+ info.num_data += kCorruptedValueOffset;
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(),
+ IntegerIndexStorage::kInfoMetadataFileOffset,
+ &info, sizeof(Info)));
+ metadata_sfd.reset();
+
+ {
+ // Attempt to create the integer index storage with info that doesn't match
+ // its checksum and confirm that it fails.
+ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+ storage_or = IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get());
+ EXPECT_THAT(storage_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(storage_or.status().error_message(),
+ HasSubstr("Invalid info crc"));
+ }
+}
+
+TEST_P(IntegerIndexStorageTest,
+ InitializeExistingFilesWithCorruptedSortedBucketsShouldFail) {
+ {
+ // Create new integer index storage
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId,
+ /*new_keys=*/{0, 100, -100}));
+
+ ICING_ASSERT_OK(storage->PersistToDisk());
+ }
+
+ {
+ // Corrupt sorted buckets manually.
+ const std::string sorted_buckets_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".s");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, sorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc,
+ sorted_buckets->ComputeChecksum());
+ ICING_ASSERT_OK(sorted_buckets->Append(Bucket(
+ /*key_lower=*/0, /*key_upper=*/std::numeric_limits<int64_t>::max())));
+ ICING_ASSERT_OK(sorted_buckets->PersistToDisk());
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc,
+ sorted_buckets->ComputeChecksum());
+ ASSERT_THAT(old_crc, Not(Eq(new_crc)));
+ }
+
+ {
+ // Attempt to create the integer index storage with metadata containing
+ // corrupted sorted_buckets_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+ storage_or = IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get());
+ EXPECT_THAT(storage_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(storage_or.status().error_message(),
+ HasSubstr("Invalid storages crc"));
+ }
+}
+
+TEST_P(IntegerIndexStorageTest,
+ InitializeExistingFilesWithCorruptedUnsortedBucketsShouldFail) {
+ {
+ // Create new integer index storage
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ ICING_ASSERT_OK(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId,
+ /*new_keys=*/{0, 100, -100}));
+
+ ICING_ASSERT_OK(storage->PersistToDisk());
+ }
+
+ {
+ // Corrupt unsorted buckets manually.
+ const std::string unsorted_buckets_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".u");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, unsorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/sizeof(Bucket) * 100 +
+ FileBackedVector<Bucket>::Header::kHeaderSize));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc,
+ unsorted_buckets->ComputeChecksum());
+ ICING_ASSERT_OK(unsorted_buckets->Append(Bucket(
+ /*key_lower=*/0, /*key_upper=*/std::numeric_limits<int64_t>::max())));
+ ICING_ASSERT_OK(unsorted_buckets->PersistToDisk());
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc,
+ unsorted_buckets->ComputeChecksum());
+ ASSERT_THAT(old_crc, Not(Eq(new_crc)));
+ }
+
+ {
+ // Attempt to create the integer index storage with metadata containing
+ // corrupted unsorted_buckets_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+ storage_or = IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get());
+ EXPECT_THAT(storage_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(storage_or.status().error_message(),
+ HasSubstr("Invalid storages crc"));
+ }
+}
+
+// TODO(b/259744228): add test for corrupted flash_index_storage
+
+TEST_P(IntegerIndexStorageTest, InvalidQuery) {
+ // Create new integer index storage
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
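+
+  // key_lower > key_upper is an invalid range, so GetIterator should reject
+  // it with INVALID_ARGUMENT.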
+ EXPECT_THAT(
+ storage->GetIterator(/*query_key_lower=*/0, /*query_key_upper=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(IntegerIndexStorageTest, AddKeysShouldUpdateNumData) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add some keys into buckets [(-1000,-100), (200,300), (-99,-1)].
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId,
+ /*new_keys=*/{-51, -500}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{201, 209, -149}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/2, kDefaultSectionId,
+ /*new_keys=*/{208}),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(6));
+
+ ICING_ASSERT_OK(storage->PersistToDisk());
+ }
+
+ // Check sorted_buckets manually.
+ const std::string sorted_buckets_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".s");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, sorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ EXPECT_THAT(sorted_buckets->num_elements(), Eq(5));
+
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk1,
+ sorted_buckets->Get(/*idx=*/0));
+ EXPECT_THAT(sbk1->key_lower(), Eq(-1000));
+ EXPECT_THAT(sbk1->key_upper(), Eq(-100));
+ EXPECT_THAT(sbk1->num_data(), Eq(2));
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk2,
+ sorted_buckets->Get(/*idx=*/1));
+ EXPECT_THAT(sbk2->key_lower(), Eq(0));
+ EXPECT_THAT(sbk2->key_upper(), Eq(100));
+ EXPECT_THAT(sbk2->num_data(), Eq(0));
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk3,
+ sorted_buckets->Get(/*idx=*/2));
+ EXPECT_THAT(sbk3->key_lower(), Eq(150));
+ EXPECT_THAT(sbk3->key_upper(), Eq(199));
+ EXPECT_THAT(sbk3->num_data(), Eq(0));
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk4,
+ sorted_buckets->Get(/*idx=*/3));
+ EXPECT_THAT(sbk4->key_lower(), Eq(200));
+ EXPECT_THAT(sbk4->key_upper(), Eq(300));
+ EXPECT_THAT(sbk4->num_data(), Eq(3));
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* sbk5,
+ sorted_buckets->Get(/*idx=*/4));
+ EXPECT_THAT(sbk5->key_lower(), Eq(301));
+ EXPECT_THAT(sbk5->key_upper(), Eq(999));
+ EXPECT_THAT(sbk5->num_data(), Eq(0));
+
+  // Check unsorted_buckets manually.
+ const std::string unsorted_buckets_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".u");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, unsorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ EXPECT_THAT(unsorted_buckets->num_elements(), Eq(4));
+
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* ubk1,
+ unsorted_buckets->Get(/*idx=*/0));
+ EXPECT_THAT(ubk1->key_lower(), Eq(1000));
+ EXPECT_THAT(ubk1->key_upper(), Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(ubk1->num_data(), Eq(0));
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* ubk2,
+ unsorted_buckets->Get(/*idx=*/1));
+ EXPECT_THAT(ubk2->key_lower(), Eq(-99));
+ EXPECT_THAT(ubk2->key_upper(), Eq(-1));
+ EXPECT_THAT(ubk2->num_data(), Eq(1));
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* ubk3,
+ unsorted_buckets->Get(/*idx=*/2));
+ EXPECT_THAT(ubk3->key_lower(), Eq(101));
+ EXPECT_THAT(ubk3->key_upper(), Eq(149));
+ EXPECT_THAT(ubk3->num_data(), Eq(0));
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* ubk4,
+ unsorted_buckets->Get(/*idx=*/3));
+ EXPECT_THAT(ubk4->key_lower(), Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(ubk4->key_upper(), Eq(-1001));
+ EXPECT_THAT(ubk4->num_data(), Eq(0));
+}
+
+TEST_P(IntegerIndexStorageTest, ExactQuerySortedBuckets) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add some keys into sorted buckets [(-1000,-100), (200,300)].
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId,
+ /*new_keys=*/{-500}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{208}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/2, kDefaultSectionId,
+ /*new_keys=*/{-200}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/3, kDefaultSectionId,
+ /*new_keys=*/{-1000}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/4, kDefaultSectionId,
+ /*new_keys=*/{300}),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(5));
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ // Exact query on key in each sorted bucket should get the correct result.
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-500, /*key_upper=*/-500),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/208, /*key_upper=*/208),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-200, /*key_upper=*/-200),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-1000, /*key_upper=*/-1000),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/3, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/300, /*key_upper=*/300),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/4, expected_sections))));
+}
+
+TEST_P(IntegerIndexStorageTest, ExactQueryUnsortedBuckets) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add some keys into unsorted buckets [(1000,INT64_MAX), (INT64_MIN,-1001)].
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId,
+ /*new_keys=*/{1024}),
+ IsOk());
+ EXPECT_THAT(
+ storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{std::numeric_limits<int64_t>::max()}),
+ IsOk());
+ EXPECT_THAT(
+ storage->AddKeys(/*document_id=*/2, kDefaultSectionId,
+ /*new_keys=*/{std::numeric_limits<int64_t>::min()}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/3, kDefaultSectionId,
+ /*new_keys=*/{-1500}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/4, kDefaultSectionId,
+ /*new_keys=*/{2000}),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(5));
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ // Exact query on key in each unsorted bucket should get the correct result.
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/1024, /*key_upper=*/1024),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(
+ Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::max(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(
+ ElementsAre(EqualsDocHitInfo(/*document_id=*/1, expected_sections))));
+ EXPECT_THAT(
+ Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::min()),
+ IsOkAndHolds(
+ ElementsAre(EqualsDocHitInfo(/*document_id=*/2, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-1500, /*key_upper=*/-1500),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/3, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/2000, /*key_upper=*/2000),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/4, expected_sections))));
+}
+
+TEST_P(IntegerIndexStorageTest, ExactQueryIdenticalKeys) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add some keys into buckets [(0,100), (1000,INT64_MAX)].
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId,
+ /*new_keys=*/{1024}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{1024}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/2, kDefaultSectionId,
+ /*new_keys=*/{20}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/3, kDefaultSectionId,
+ /*new_keys=*/{20}),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(4));
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ // Exact query on key with multiple hits should get the correct result.
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/1024, /*key_upper=*/1024),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/20, /*key_upper=*/20),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/3, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections))));
+}
+
+TEST_P(IntegerIndexStorageTest, RangeQueryEmptyIntegerIndexStorage) {
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ EXPECT_THAT(
+ Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(IntegerIndexStorageTest, RangeQuerySingleEntireSortedBucket) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add some keys into sorted buckets [(-1000,-100), (200,300)].
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId,
+ /*new_keys=*/{-500}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{208}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/2, kDefaultSectionId,
+ /*new_keys=*/{-200}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/3, kDefaultSectionId,
+ /*new_keys=*/{-1000}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/4, kDefaultSectionId,
+ /*new_keys=*/{300}),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(5));
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ // Range query on each sorted bucket boundary should get the correct result.
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-1000, /*key_upper=*/-100),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/3, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/0, /*key_upper=*/100),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/150, /*key_upper=*/199),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/200, /*key_upper=*/300),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/4, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/301, /*key_upper=*/999),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(IntegerIndexStorageTest, RangeQuerySingleEntireUnsortedBucket) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add some keys into unsorted buckets [(1000,INT64_MAX), (INT64_MIN,-1001)].
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId,
+ /*new_keys=*/{1024}),
+ IsOk());
+ EXPECT_THAT(
+ storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{std::numeric_limits<int64_t>::max()}),
+ IsOk());
+ EXPECT_THAT(
+ storage->AddKeys(/*document_id=*/2, kDefaultSectionId,
+ /*new_keys=*/{std::numeric_limits<int64_t>::min()}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/3, kDefaultSectionId,
+ /*new_keys=*/{-1500}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/4, kDefaultSectionId,
+ /*new_keys=*/{2000}),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(5));
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ // Range query on each unsorted bucket boundary should get the correct result.
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/1000,
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/4, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-99, /*key_upper=*/-1),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/101, /*key_upper=*/149),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(
+ Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/-1001),
+ IsOkAndHolds(
+ ElementsAre(EqualsDocHitInfo(/*document_id=*/3, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections))));
+}
+
+TEST_P(IntegerIndexStorageTest, RangeQuerySinglePartialSortedBucket) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add some keys into sorted bucket (0,100).
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId,
+ /*new_keys=*/{43}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{30}),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(2));
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ // Range query on partial range of each sorted bucket should get the correct
+ // result.
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/25, /*key_upper=*/200),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-1000, /*key_upper=*/49),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/25, /*key_upper=*/49),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/31, /*key_upper=*/49),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/25, /*key_upper=*/31),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/3, /*key_upper=*/5),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(IntegerIndexStorageTest, RangeQuerySinglePartialUnsortedBucket) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add some keys into unsorted buckets (-99,-1).
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId,
+ /*new_keys=*/{-19}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{-72}),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(2));
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ // Range query on partial range of each unsorted bucket should get the correct
+ // result.
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-1000, /*key_upper=*/-15),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-80, /*key_upper=*/149),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-80, /*key_upper=*/-15),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-38, /*key_upper=*/-15),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-80, /*key_upper=*/-38),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-95, /*key_upper=*/-92),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(IntegerIndexStorageTest, RangeQueryMultipleBuckets) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add some keys into buckets [(-1000,-100), (200,300), (1000,INT64_MAX),
+ // (INT64_MIN,-1001)]
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/0, kDefaultSectionId,
+ /*new_keys=*/{-500}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{1024}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/2, kDefaultSectionId,
+ /*new_keys=*/{-200}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/3, kDefaultSectionId,
+ /*new_keys=*/{208}),
+ IsOk());
+ EXPECT_THAT(
+ storage->AddKeys(/*document_id=*/4, kDefaultSectionId,
+ /*new_keys=*/{std::numeric_limits<int64_t>::max()}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/5, kDefaultSectionId,
+ /*new_keys=*/{-1000}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/6, kDefaultSectionId,
+ /*new_keys=*/{300}),
+ IsOk());
+ EXPECT_THAT(
+ storage->AddKeys(/*document_id=*/7, kDefaultSectionId,
+ /*new_keys=*/{std::numeric_limits<int64_t>::min()}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/8, kDefaultSectionId,
+ /*new_keys=*/{-1500}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(/*document_id=*/9, kDefaultSectionId,
+ /*new_keys=*/{2000}),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(10));
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ // Range query should get the correct result.
+ EXPECT_THAT(
+ Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(
+ ElementsAre(EqualsDocHitInfo(/*document_id=*/9, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/8, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/7, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/6, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/4, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/3, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/-199,
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/9, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/6, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/4, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/3, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections))));
+ EXPECT_THAT(
+ Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/-200),
+ IsOkAndHolds(
+ ElementsAre(EqualsDocHitInfo(/*document_id=*/8, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/7, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+}
+
+TEST_P(IntegerIndexStorageTest, BatchAdd) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Batch add the following keys (including some edge cases) to test the
+ // correctness of the sort and binary search logic in AddKeys().
+ // clang-format off
+ std::vector<int64_t> keys = {4000, 3000, 2000, 300, 201, 200, 106, 104,
+ 100, 3, 2, 1, 0, -97, -98, -99,
+ -100, -200, -1000, -1001, -1500, -2000,
+ std::numeric_limits<int64_t>::max(),
+ std::numeric_limits<int64_t>::min()};
+ // clang-format on
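+  // The edge cases include custom bucket boundary values (0, 100, 200, 300,
+  // -99, -100, -1000, -1001) plus INT64_MIN and INT64_MAX.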
+ EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, kDefaultSectionId,
+ std::vector<int64_t>(keys)),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(keys.size()));
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ for (int64_t key : keys) {
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/key, /*key_upper=*/key),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(kDefaultDocumentId, expected_sections))));
+ }
+}
+
+TEST_P(IntegerIndexStorageTest, BatchAddShouldDedupeKeys) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ std::vector<int64_t> keys = {2, 3, 1, 2, 4, -1, -1, 100, 3};
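+  // 2, -1, and 3 each appear twice above, so only 6 unique keys should be
+  // indexed.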
+ EXPECT_THAT(
+ storage->AddKeys(kDefaultDocumentId, kDefaultSectionId, std::move(keys)),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(6));
+}
+
+TEST_P(IntegerIndexStorageTest, MultipleKeysShouldMergeAndDedupeDocHitInfo) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add some keys with same document id and section id.
+ EXPECT_THAT(
+ storage->AddKeys(
+ /*document_id=*/0, kDefaultSectionId, /*new_keys=*/
+ {-500, 1024, -200, 208, std::numeric_limits<int64_t>::max(), -1000,
+ 300, std::numeric_limits<int64_t>::min(), -1500, 2000}),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(10));
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ EXPECT_THAT(
+ Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(
+ ElementsAre(EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+}
+
+TEST_P(IntegerIndexStorageTest,
+ MultipleSectionsShouldMergeSectionsAndDedupeDocHitInfo) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add some keys with same document id but different section ids.
+ EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/63,
+ /*new_keys=*/{-500}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/62,
+ /*new_keys=*/{1024}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/61,
+ /*new_keys=*/{-200}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/60,
+ /*new_keys=*/{208}),
+ IsOk());
+ EXPECT_THAT(
+ storage->AddKeys(kDefaultDocumentId, /*section_id=*/59,
+ /*new_keys=*/{std::numeric_limits<int64_t>::max()}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/58,
+ /*new_keys=*/{-1000}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/57,
+ /*new_keys=*/{300}),
+ IsOk());
+ EXPECT_THAT(
+ storage->AddKeys(kDefaultDocumentId, /*section_id=*/56,
+ /*new_keys=*/{std::numeric_limits<int64_t>::min()}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/55,
+ /*new_keys=*/{-1500}),
+ IsOk());
+ EXPECT_THAT(storage->AddKeys(kDefaultDocumentId, /*section_id=*/54,
+ /*new_keys=*/{2000}),
+ IsOk());
+ EXPECT_THAT(storage->num_data(), Eq(10));
+
+ std::vector<SectionId> expected_sections = {63, 62, 61, 60, 59,
+ 58, 57, 56, 55, 54};
+ EXPECT_THAT(
+ Query(storage.get(), /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(kDefaultDocumentId, expected_sections))));
+}
+
+TEST_P(IntegerIndexStorageTest, IteratorCallStatsMultipleBuckets) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add some keys into sorted buckets [(-1000,-100), (200,300)].
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/0, kDefaultSectionId,
+ /*new_keys=*/{-500}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{208}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/2, kDefaultSectionId,
+ /*new_keys=*/{-200}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/3, kDefaultSectionId,
+ /*new_keys=*/{-1000}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/4, kDefaultSectionId,
+ /*new_keys=*/{300}));
+ ASSERT_THAT(storage->num_data(), Eq(5));
+
+  // GetIterator for range [INT64_MIN, INT64_MAX] and Advance all. Those 5
+  // keys are in 2 buckets, so we will be inspecting 2 posting lists in 2
+  // blocks.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> iter1,
+ storage->GetIterator(/*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()));
+ while (iter1->Advance().ok()) {
+ // Advance all hits.
+ }
+ EXPECT_THAT(
+ iter1->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/5,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/2));
+
+ // GetIterator for range [-1000, -100] and Advance all. Since we only have to
+ // read bucket (-1000,-100), there will be 3 advance calls and 1 block
+ // inspected.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> iter2,
+ storage->GetIterator(/*key_lower=*/-1000, /*key_upper=*/-100));
+ while (iter2->Advance().ok()) {
+ // Advance all hits.
+ }
+ EXPECT_THAT(
+ iter2->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/3,
+ /*num_leaf_advance_calls_no_index=*/0, /*num_blocks_inspected=*/1));
+}
+
+TEST_P(IntegerIndexStorageTest, IteratorCallStatsSingleBucketChainedBlocks) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ int32_t num_keys_to_add = 800;
+ ASSERT_THAT(num_keys_to_add,
+ Lt(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit));
+ for (int i = 0; i < num_keys_to_add; ++i) {
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/i, kDefaultSectionId,
+ /*new_keys=*/{i}));
+ }
+
+  // Those 800 keys are in a single bucket with 3 chained posting lists, so we
+ // will be inspecting 3 blocks.
+ int32_t expected_num_blocks_inspected = 3;
+
+  // GetIterator for range [INT64_MIN, INT64_MAX] and Advance all.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> iter1,
+ storage->GetIterator(/*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()));
+ while (iter1->Advance().ok()) {
+ // Advance all hits.
+ }
+ EXPECT_THAT(iter1->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/num_keys_to_add,
+ /*num_leaf_advance_calls_no_index=*/0,
+ expected_num_blocks_inspected));
+
+  // GetIterator for range [1, 1] and Advance all. Although only 1 hit is
+  // relevant, we still have to inspect the entire bucket and its posting
+  // list chain (which contains 3 blocks and 800 hits).
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> iter2,
+ storage->GetIterator(/*key_lower=*/1, /*key_upper=*/1));
+ while (iter2->Advance().ok()) {
+ // Advance all hits.
+ }
+ EXPECT_THAT(iter2->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/num_keys_to_add,
+ /*num_leaf_advance_calls_no_index=*/0,
+ expected_num_blocks_inspected));
+}
+
+TEST_P(IntegerIndexStorageTest, SplitBuckets) {
+ int32_t custom_num_data_threshold_for_bucket_split = 300;
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(/*custom_init_sorted_buckets_in=*/{},
+ /*custom_init_unsorted_buckets_in=*/{},
+ custom_num_data_threshold_for_bucket_split,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ // Add custom_num_data_threshold_for_bucket_split + 1 keys to invoke bucket
+ // splitting.
+  // - Keys: custom_num_data_threshold_for_bucket_split down to 0
+  // - Document ids: 0 to custom_num_data_threshold_for_bucket_split
+ std::unordered_map<int64_t, DocumentId> data;
+ int64_t key = custom_num_data_threshold_for_bucket_split;
+ DocumentId document_id = 0;
+ for (int i = 0; i < custom_num_data_threshold_for_bucket_split + 1; ++i) {
+ data[key] = document_id;
+ ICING_ASSERT_OK(
+ storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{key}));
+ ++document_id;
+ --key;
+ }
+ ICING_ASSERT_OK(storage->PersistToDisk());
+
+ // Manually check sorted and unsorted buckets.
+ {
+ // Check sorted buckets.
+ const std::string sorted_buckets_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".s");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, sorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ EXPECT_THAT(sorted_buckets->num_elements(), Eq(1));
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bucket1,
+ sorted_buckets->Get(/*idx=*/0));
+ EXPECT_THAT(bucket1->key_lower(), Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(bucket1->key_upper(), Ne(std::numeric_limits<int64_t>::max()));
+
+ int64_t sorted_bucket_key_upper = bucket1->key_upper();
+
+ // Check unsorted buckets.
+ const std::string unsorted_buckets_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".u");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, unsorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+
+ EXPECT_THAT(unsorted_buckets->num_elements(), Ge(1));
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bucket2,
+ unsorted_buckets->Get(/*idx=*/0));
+ EXPECT_THAT(bucket2->key_lower(), Eq(sorted_bucket_key_upper + 1));
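+    // I.e. the initial [INT64_MIN, INT64_MAX] bucket was split: the sorted
+    // file keeps [INT64_MIN, k] and a new bucket starting at k + 1 was
+    // appended to the unsorted file.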
+ }
+
+ // Ensure that search works normally.
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ for (int64_t key = custom_num_data_threshold_for_bucket_split; key >= 0;
+ key--) {
+ ASSERT_THAT(data, Contains(Key(key)));
+ DocumentId expected_document_id = data[key];
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/key, /*key_upper=*/key),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(expected_document_id,
+ expected_sections))));
+ }
+}
+
+TEST_P(IntegerIndexStorageTest, SplitBucketsTriggerSortBuckets) {
+ int32_t custom_num_data_threshold_for_bucket_split = 300;
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(/*custom_init_sorted_buckets_in=*/{},
+ /*custom_init_unsorted_buckets_in=*/{},
+ custom_num_data_threshold_for_bucket_split,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+  // Add IntegerIndexStorage::kUnsortedBucketsLengthThreshold keys. For each
+  // key, add custom_num_data_threshold_for_bucket_split + 1 data entries. Then
+  // we will get:
+ // - Bucket splitting will create kUnsortedBucketsLengthThreshold + 1 unsorted
+ // buckets [[50, 50], [49, 49], ..., [1, 1], [51, INT64_MAX]].
+ // - Since there are kUnsortedBucketsLengthThreshold + 1 unsorted buckets, we
+ // should sort and merge buckets.
+ std::unordered_map<int64_t, std::vector<DocumentId>> data;
+ int64_t key = IntegerIndexStorage::kUnsortedBucketsLengthThreshold;
+ DocumentId document_id = 0;
+ for (int i = 0; i < IntegerIndexStorage::kUnsortedBucketsLengthThreshold;
+ ++i) {
+ for (int j = 0; j < custom_num_data_threshold_for_bucket_split + 1; ++j) {
+ data[key].push_back(document_id);
+ ICING_ASSERT_OK(
+ storage->AddKeys(document_id, kDefaultSectionId, /*new_keys=*/{key}));
+ ++document_id;
+ }
+ --key;
+ }
+ ICING_ASSERT_OK(storage->PersistToDisk());
+
+ // Manually check sorted and unsorted buckets.
+ {
+ // Check unsorted buckets.
+ const std::string unsorted_buckets_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".u");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> unsorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, unsorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ EXPECT_THAT(unsorted_buckets->num_elements(), Eq(0));
+
+ // Check sorted buckets.
+ const std::string sorted_buckets_file_path = absl_ports::StrCat(
+ working_path_, "/", IntegerIndexStorage::kFilePrefix, ".s");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, sorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ EXPECT_THAT(sorted_buckets->num_elements(), Gt(1));
+ }
+
+ // Ensure that search works normally.
+ for (key = 1; key <= IntegerIndexStorage::kUnsortedBucketsLengthThreshold;
+ ++key) {
+ ASSERT_THAT(data, Contains(Key(key)));
+
+ std::vector<DocHitInfo> expected_doc_hit_infos;
+ for (DocumentId doc_id : data[key]) {
+ expected_doc_hit_infos.push_back(DocHitInfo(
+ doc_id, /*hit_section_ids_mask=*/UINT64_C(1) << kDefaultSectionId));
+ }
+ EXPECT_THAT(Query(storage.get(), /*key_lower=*/key, /*key_upper=*/key),
+ IsOkAndHolds(ElementsAreArray(expected_doc_hit_infos.rbegin(),
+ expected_doc_hit_infos.rend())));
+ }
+}
+
+TEST_P(IntegerIndexStorageTest, TransferIndex) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{-500}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/2, kDefaultSectionId,
+ /*new_keys=*/{1024}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/3, kDefaultSectionId,
+ /*new_keys=*/{-200}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/5, kDefaultSectionId,
+ /*new_keys=*/{-60}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/8, kDefaultSectionId,
+ /*new_keys=*/{-60}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/13, kDefaultSectionId,
+ /*new_keys=*/{-500}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/21, kDefaultSectionId,
+ /*new_keys=*/{2048}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/34, kDefaultSectionId,
+ /*new_keys=*/{156}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/55, kDefaultSectionId,
+ /*new_keys=*/{20}));
+ ASSERT_THAT(storage->num_data(), Eq(9));
+
+ // Delete doc id = 5, 34, compress and keep the rest.
+ std::vector<DocumentId> document_id_old_to_new(56, kInvalidDocumentId);
+ document_id_old_to_new[1] = 8;
+ document_id_old_to_new[2] = 3;
+ document_id_old_to_new[3] = 0;
+ document_id_old_to_new[8] = 2;
+ document_id_old_to_new[13] = 6;
+ document_id_old_to_new[21] = 1;
+ document_id_old_to_new[55] = 4;
+
+ // Transfer to new storage.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> new_storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_ + "_temp",
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ EXPECT_THAT(
+ storage->TransferIndex(document_id_old_to_new, new_storage.get()),
+ IsOk());
+ ICING_ASSERT_OK(new_storage->PersistToDisk());
+ }
+
+ // Verify after transferring and reinitializing the instance.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> new_storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_ + "_temp",
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ EXPECT_THAT(new_storage->num_data(), Eq(7));
+
+  // -500 had hits for old_docids 1 and 13, which are now 8 and 6 respectively.
+ EXPECT_THAT(Query(new_storage.get(), /*key_lower=*/-500, /*key_upper=*/-500),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/8, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/6, expected_sections))));
+
+ // 1024 had a hit for old_docid 2, which is now 3.
+ EXPECT_THAT(Query(new_storage.get(), /*key_lower=*/1024, /*key_upper=*/1024),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/3, expected_sections))));
+
+ // -200 had a hit for old_docid 3, which is now 0.
+ EXPECT_THAT(Query(new_storage.get(), /*key_lower=*/-200, /*key_upper=*/-200),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+
+  // -60 had hits for old_docids 5 and 8; only old_docid 8 remains, now mapped
+  // to 2 (because doc 5 has been deleted).
+ EXPECT_THAT(Query(new_storage.get(), /*key_lower=*/-60, /*key_upper=*/-60),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections))));
+
+ // 2048 had a hit for old_docid 21, which is now 1.
+ EXPECT_THAT(Query(new_storage.get(), /*key_lower=*/2048, /*key_upper=*/2048),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections))));
+
+  // 156 had a hit for old_docid 34, which is no longer found (because doc 34
+  // has been deleted).
+ EXPECT_THAT(Query(new_storage.get(), /*key_lower=*/156, /*key_upper=*/156),
+ IsOkAndHolds(IsEmpty()));
+
+ // 20 had a hit for old_docid 55, which is now 4.
+ EXPECT_THAT(Query(new_storage.get(), /*key_lower=*/20, /*key_upper=*/20),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/4, expected_sections))));
+}
+
+TEST_P(IntegerIndexStorageTest, TransferIndexOutOfRangeDocumentId) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{120}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/2, kDefaultSectionId,
+ /*new_keys=*/{-2000}));
+ ASSERT_THAT(storage->num_data(), Eq(2));
+
+ // Create document_id_old_to_new with size = 2. TransferIndex should handle
+ // out of range DocumentId properly.
+ std::vector<DocumentId> document_id_old_to_new = {kInvalidDocumentId, 0};
+
+ // Transfer to new storage.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> new_storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_ + "_temp",
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ EXPECT_THAT(storage->TransferIndex(document_id_old_to_new, new_storage.get()),
+ IsOk());
+
+ // Verify after transferring.
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ EXPECT_THAT(new_storage->num_data(), Eq(1));
+ EXPECT_THAT(Query(new_storage.get(), /*key_lower=*/120, /*key_upper=*/120),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(
+ Query(new_storage.get(), /*key_lower=*/-2000, /*key_upper=*/-2000),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(IntegerIndexStorageTest, TransferEmptyIndex) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ ASSERT_THAT(storage->num_data(), Eq(0));
+
+ std::vector<DocumentId> document_id_old_to_new = {kInvalidDocumentId, 0, 1,
+ kInvalidDocumentId, 2};
+
+ // Transfer to new storage.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> new_storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_ + "_temp",
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ EXPECT_THAT(storage->TransferIndex(document_id_old_to_new, new_storage.get()),
+ IsOk());
+
+ // Verify after transferring.
+ EXPECT_THAT(new_storage->num_data(), Eq(0));
+ EXPECT_THAT(Query(new_storage.get(),
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(IntegerIndexStorageTest, TransferIndexDeleteAll) {
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{-500}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/2, kDefaultSectionId,
+ /*new_keys=*/{1024}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/3, kDefaultSectionId,
+ /*new_keys=*/{-200}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/5, kDefaultSectionId,
+ /*new_keys=*/{-60}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/8, kDefaultSectionId,
+ /*new_keys=*/{-60}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/13, kDefaultSectionId,
+ /*new_keys=*/{-500}));
+ ASSERT_THAT(storage->num_data(), Eq(6));
+
+ // Delete all documents.
+ std::vector<DocumentId> document_id_old_to_new(14, kInvalidDocumentId);
+
+ // Transfer to new storage.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> new_storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_ + "_temp",
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ EXPECT_THAT(
+ storage->TransferIndex(document_id_old_to_new, new_storage.get()),
+ IsOk());
+ ICING_ASSERT_OK(new_storage->PersistToDisk());
+ }
+
+ // Verify after transferring and reinitializing the instance.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> new_storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_ + "_temp",
+ Options(IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ EXPECT_THAT(new_storage->num_data(), Eq(0));
+ EXPECT_THAT(Query(new_storage.get(),
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_P(IntegerIndexStorageTest, TransferIndexShouldInvokeMergeBuckets) {
+ int32_t custom_num_data_threshold_for_bucket_split = 300;
+ int32_t custom_num_data_threshold_for_bucket_merge =
+ IntegerIndexStorage::kNumDataThresholdRatioForBucketMerge *
+ custom_num_data_threshold_for_bucket_split;
+
+  // This test verifies that TransferIndex invokes the bucket merging logic, so
+  // that we avoid ending up with mostly empty buckets after many rounds of
+  // inserting and deleting data.
+
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ custom_num_data_threshold_for_bucket_split,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/0, kDefaultSectionId,
+ /*new_keys=*/{-500}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/1, kDefaultSectionId,
+ /*new_keys=*/{1024}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/2, kDefaultSectionId,
+ /*new_keys=*/{-200}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/3, kDefaultSectionId,
+ /*new_keys=*/{-60}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/4, kDefaultSectionId,
+ /*new_keys=*/{-60}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/5, kDefaultSectionId,
+ /*new_keys=*/{-500}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/6, kDefaultSectionId,
+ /*new_keys=*/{2048}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/7, kDefaultSectionId,
+ /*new_keys=*/{156}));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/8, kDefaultSectionId,
+ /*new_keys=*/{20}));
+ ASSERT_THAT(storage->num_data(), Eq(9));
+ ASSERT_THAT(storage->num_data(),
+ Le(custom_num_data_threshold_for_bucket_merge));
+
+ // Create document_id_old_to_new that keeps all existing documents.
+ std::vector<DocumentId> document_id_old_to_new(9);
+ std::iota(document_id_old_to_new.begin(), document_id_old_to_new.end(), 0);
+
+ // Transfer to new storage. It should result in 1 bucket: [INT64_MIN,
+ // INT64_MAX] after transferring.
+ const std::string new_storage_working_path = working_path_ + "_temp";
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> new_storage,
+ IntegerIndexStorage::Create(
+ filesystem_, new_storage_working_path,
+ Options(/*custom_init_sorted_buckets_in=*/{},
+ /*custom_init_unsorted_buckets_in=*/{},
+ custom_num_data_threshold_for_bucket_split,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ EXPECT_THAT(
+ storage->TransferIndex(document_id_old_to_new, new_storage.get()),
+ IsOk());
+ EXPECT_THAT(new_storage->num_data(), Eq(9));
+ }
+
+ // Check new_storage->sorted_bucket_ manually.
+ const std::string sorted_buckets_file_path = absl_ports::StrCat(
+ new_storage_working_path, "/", IntegerIndexStorage::kFilePrefix, ".s");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, sorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ EXPECT_THAT(sorted_buckets->num_elements(), Eq(1));
+
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bk1, sorted_buckets->Get(/*idx=*/0));
+ EXPECT_THAT(bk1->key_lower(), Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(bk1->key_upper(), Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(bk1->num_data(), Eq(9));
+}
+
+TEST_P(IntegerIndexStorageTest, TransferIndexExceedsMergeThreshold) {
+ int32_t custom_num_data_threshold_for_bucket_split = 300;
+ int32_t custom_num_data_threshold_for_bucket_merge =
+ IntegerIndexStorage::kNumDataThresholdRatioForBucketMerge *
+ custom_num_data_threshold_for_bucket_split;
+
+  // This test verifies that TransferIndex invokes the bucket merging logic but
+  // doesn't merge buckets too aggressively, to ensure we won't end up with a
+  // bucket containing too much data.
+
+ // We use predefined custom buckets to initialize new integer index storage
+ // and create some test keys accordingly.
+ std::vector<Bucket> custom_init_sorted_buckets = {
+ Bucket(-1000, -100), Bucket(0, 100), Bucket(150, 199), Bucket(200, 300),
+ Bucket(301, 999)};
+ std::vector<Bucket> custom_init_unsorted_buckets = {
+ Bucket(1000, std::numeric_limits<int64_t>::max()), Bucket(-99, -1),
+ Bucket(101, 149), Bucket(std::numeric_limits<int64_t>::min(), -1001)};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, working_path_,
+ Options(std::move(custom_init_sorted_buckets),
+ std::move(custom_init_unsorted_buckets),
+ custom_num_data_threshold_for_bucket_split,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+
+  // Insert data into 2 buckets so that the total # of data in these 2 buckets
+  // exceeds custom_num_data_threshold_for_bucket_merge.
+ // - Bucket 1: [-1000, -100]
+ // - Bucket 2: [101, 149]
+ DocumentId document_id = 0;
+ int num_data_for_bucket1 = custom_num_data_threshold_for_bucket_merge - 50;
+ for (int i = 0; i < num_data_for_bucket1; ++i) {
+ ICING_ASSERT_OK(storage->AddKeys(document_id, kDefaultSectionId,
+ /*new_keys=*/{-200}));
+ ++document_id;
+ }
+
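+  // Bucket 1 alone stays below custom_num_data_threshold_for_bucket_merge;
+  // adding bucket 2's 150 data entries pushes the combined total over it, so
+  // the merge should stop after bucket 1 and keep bucket 2 separate.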
+ int num_data_for_bucket2 = 150;
+ for (int i = 0; i < num_data_for_bucket2; ++i) {
+ ICING_ASSERT_OK(storage->AddKeys(document_id, kDefaultSectionId,
+ /*new_keys=*/{120}));
+ ++document_id;
+ }
+
+ ASSERT_THAT(storage->num_data(),
+ Eq(num_data_for_bucket1 + num_data_for_bucket2));
+ ASSERT_THAT(num_data_for_bucket1 + num_data_for_bucket2,
+ Gt(custom_num_data_threshold_for_bucket_merge));
+
+ // Create document_id_old_to_new that keeps all existing documents.
+ std::vector<DocumentId> document_id_old_to_new(document_id);
+ std::iota(document_id_old_to_new.begin(), document_id_old_to_new.end(), 0);
+
+ // Transfer to new storage. This should result in 2 buckets: [INT64_MIN, 100]
+ // and [101, INT64_MAX]
+ const std::string new_storage_working_path = working_path_ + "_temp";
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> new_storage,
+ IntegerIndexStorage::Create(
+ filesystem_, new_storage_working_path,
+ Options(/*custom_init_sorted_buckets_in=*/{},
+ /*custom_init_unsorted_buckets_in=*/{},
+ custom_num_data_threshold_for_bucket_split,
+ /*pre_mapping_fbv_in=*/GetParam()),
+ serializer_.get()));
+ EXPECT_THAT(
+ storage->TransferIndex(document_id_old_to_new, new_storage.get()),
+ IsOk());
+ EXPECT_THAT(new_storage->num_data(),
+ Eq(num_data_for_bucket1 + num_data_for_bucket2));
+ }
+
+ // Check new_storage->sorted_bucket_ manually.
+ const std::string sorted_buckets_file_path = absl_ports::StrCat(
+ new_storage_working_path, "/", IntegerIndexStorage::kFilePrefix, ".s");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<Bucket>> sorted_buckets,
+ FileBackedVector<Bucket>::Create(
+ filesystem_, sorted_buckets_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ EXPECT_THAT(sorted_buckets->num_elements(), Eq(2));
+
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bk1, sorted_buckets->Get(/*idx=*/0));
+ EXPECT_THAT(bk1->key_lower(), Eq(std::numeric_limits<int64_t>::min()));
+ EXPECT_THAT(bk1->key_upper(), Eq(100));
+ EXPECT_THAT(bk1->num_data(), Eq(num_data_for_bucket1));
+ ICING_ASSERT_OK_AND_ASSIGN(const Bucket* bk2, sorted_buckets->Get(/*idx=*/1));
+ EXPECT_THAT(bk2->key_lower(), Eq(101));
+ EXPECT_THAT(bk2->key_upper(), Eq(std::numeric_limits<int64_t>::max()));
+ EXPECT_THAT(bk2->num_data(), Eq(num_data_for_bucket2));
+}
+
+INSTANTIATE_TEST_SUITE_P(IntegerIndexStorageTest, IntegerIndexStorageTest,
+ testing::Values(true, false));
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/integer-index.cc b/icing/index/numeric/integer-index.cc
new file mode 100644
index 0000000..8c80698
--- /dev/null
+++ b/icing/index/numeric/integer-index.cc
@@ -0,0 +1,651 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/integer-index.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/destructible-directory.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h"
+#include "icing/index/numeric/doc-hit-info-iterator-numeric.h"
+#include "icing/index/numeric/integer-index-storage.h"
+#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+#include "icing/store/document-id.h"
+#include "icing/util/crc32.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Helper function to get the file name of metadata.
+std::string GetMetadataFileName() {
+ return absl_ports::StrCat(IntegerIndex::kFilePrefix, ".m");
+}
+
+// Helper function to get the file path of metadata according to the given
+// working directory.
+std::string GetMetadataFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/", GetMetadataFileName());
+}
+
+constexpr std::string_view kWildcardPropertyIndexFileName =
+ "wildcard_property_index";
+
+constexpr std::string_view kWildcardPropertyStorageFileName =
+ "wildcard_property_storage";
+
+std::string GetWildcardPropertyStorageFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/",
+ kWildcardPropertyStorageFileName);
+}
+
+// Helper function to get the sub working (directory) path of
+// IntegerIndexStorage according to the given working directory and property
+// path.
+std::string GetPropertyIndexStoragePath(std::string_view working_path,
+ std::string_view property_path) {
+ return absl_ports::StrCat(working_path, "/", property_path);
+}
+
+// Helper function to get all existing property paths by listing all
+// directories.
+libtextclassifier3::StatusOr<std::vector<std::string>>
+GetAllExistingPropertyPaths(const Filesystem& filesystem,
+ const std::string& working_path) {
+ std::vector<std::string> property_paths;
+ std::unordered_set<std::string> excludes = {
+ GetMetadataFileName(), std::string(kWildcardPropertyStorageFileName)};
+ if (!filesystem.ListDirectory(working_path.c_str(), excludes,
+ /*recursive=*/false, &property_paths)) {
+ return absl_ports::InternalError("Failed to list directory");
+ }
+ return property_paths;
+}
+
+libtextclassifier3::StatusOr<IntegerIndex::PropertyToStorageMapType>
+GetPropertyIntegerIndexStorageMap(
+ const Filesystem& filesystem, const std::string& working_path,
+ PostingListIntegerIndexSerializer* posting_list_serializer,
+ int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) {
+ ICING_ASSIGN_OR_RETURN(std::vector<std::string> property_paths,
+ GetAllExistingPropertyPaths(filesystem, working_path));
+
+ IntegerIndex::PropertyToStorageMapType property_to_storage_map;
+ for (const std::string& property_path : property_paths) {
+ if (property_path == kWildcardPropertyIndexFileName) {
+ continue;
+ }
+ std::string storage_working_path =
+ GetPropertyIndexStoragePath(working_path, property_path);
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem, storage_working_path,
+ IntegerIndexStorage::Options(num_data_threshold_for_bucket_split,
+ pre_mapping_fbv),
+ posting_list_serializer));
+ property_to_storage_map.insert(
+ std::make_pair(property_path, std::move(storage)));
+ }
+
+ return property_to_storage_map;
+}
+
+// RETURNS:
+// - On success, an unordered_set representing the list of property paths
+// stored in the WildcardPropertyStorage managed by property_storage
+// - INTERNAL_ERROR on any failure to successfully read the underlying proto.
+libtextclassifier3::StatusOr<std::unordered_set<std::string>> CreatePropertySet(
+ const FileBackedProto<WildcardPropertyStorage>& property_storage) {
+ std::unordered_set<std::string> wildcard_properties_set;
+ auto wildcard_properties_or = property_storage.Read();
+ if (!wildcard_properties_or.ok()) {
+ if (absl_ports::IsNotFound(wildcard_properties_or.status())) {
+ return wildcard_properties_set;
+ }
+ return wildcard_properties_or.status();
+ }
+
+ const WildcardPropertyStorage* wildcard_properties =
+ wildcard_properties_or.ValueOrDie();
+ wildcard_properties_set.reserve(wildcard_properties->property_entries_size());
+ for (const std::string& property : wildcard_properties->property_entries()) {
+ wildcard_properties_set.insert(property);
+ }
+ return wildcard_properties_set;
+}
+
+} // namespace
+
+libtextclassifier3::Status IntegerIndex::Editor::IndexAllBufferedKeys() && {
+ integer_index_.SetDirty();
+
+ auto iter = integer_index_.property_to_storage_map_.find(property_path_);
+ IntegerIndexStorage* target_storage = nullptr;
+ // 1. Check if this property already has its own individual index.
+ if (iter != integer_index_.property_to_storage_map_.end()) {
+ target_storage = iter->second.get();
+ // 2. Check if this property was added to wildcard storage.
+ } else if (integer_index_.wildcard_properties_set_.find(property_path_) !=
+ integer_index_.wildcard_properties_set_.end()) {
+ target_storage = integer_index_.wildcard_index_storage_.get();
+    // 3. Check if we've reached the limit of individual property storages.
+ } else if (integer_index_.property_to_storage_map_.size() >=
+ kMaxPropertyStorages) {
+ // 3a. Create the wildcard storage if it doesn't exist.
+ if (integer_index_.wildcard_index_storage_ == nullptr) {
+ ICING_ASSIGN_OR_RETURN(
+ integer_index_.wildcard_index_storage_,
+ IntegerIndexStorage::Create(
+ integer_index_.filesystem_,
+ GetPropertyIndexStoragePath(integer_index_.working_path_,
+ kWildcardPropertyIndexFileName),
+ IntegerIndexStorage::Options(num_data_threshold_for_bucket_split_,
+ pre_mapping_fbv_),
+ integer_index_.posting_list_serializer_.get()));
+ }
+ ICING_RETURN_IF_ERROR(
+ integer_index_.AddPropertyToWildcardStorage(property_path_));
+ target_storage = integer_index_.wildcard_index_storage_.get();
+ // 4. Create a new individual storage for this new property.
+ } else {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<IntegerIndexStorage> new_storage,
+ IntegerIndexStorage::Create(
+ integer_index_.filesystem_,
+ GetPropertyIndexStoragePath(integer_index_.working_path_,
+ property_path_),
+ IntegerIndexStorage::Options(num_data_threshold_for_bucket_split_,
+ pre_mapping_fbv_),
+ integer_index_.posting_list_serializer_.get()));
+ target_storage = new_storage.get();
+ integer_index_.property_to_storage_map_.insert(
+ std::make_pair(property_path_, std::move(new_storage)));
+ }
+
+ return target_storage->AddKeys(document_id_, section_id_,
+ std::move(seen_keys_));
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
+IntegerIndex::Create(const Filesystem& filesystem, std::string working_path,
+ int32_t num_data_threshold_for_bucket_split,
+ bool pre_mapping_fbv) {
+ if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str())) {
+ // Discard working_path if metadata file is missing, and reinitialize.
+ if (filesystem.DirectoryExists(working_path.c_str())) {
+ ICING_RETURN_IF_ERROR(Discard(filesystem, working_path));
+ }
+ return InitializeNewFiles(filesystem, std::move(working_path),
+ num_data_threshold_for_bucket_split,
+ pre_mapping_fbv);
+ }
+ return InitializeExistingFiles(filesystem, std::move(working_path),
+ num_data_threshold_for_bucket_split,
+ pre_mapping_fbv);
+}
+
+IntegerIndex::~IntegerIndex() {
+ if (!PersistToDisk().ok()) {
+ ICING_LOG(WARNING)
+ << "Failed to persist integer index to disk while destructing "
+ << working_path_;
+ }
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
+IntegerIndex::GetIterator(std::string_view property_path, int64_t key_lower,
+ int64_t key_upper,
+ const DocumentStore& document_store,
+ const SchemaStore& schema_store,
+ int64_t current_time_ms) const {
+ std::string property_path_str(property_path);
+ auto iter = property_to_storage_map_.find(property_path_str);
+ if (iter != property_to_storage_map_.end()) {
+ return iter->second->GetIterator(key_lower, key_upper);
+ }
+
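+  // Hits in the wildcard storage are mixed across many properties, so wrap
+  // the delegate iterator with a restriction to the requested property path.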
+ if (wildcard_properties_set_.find(property_path_str) !=
+ wildcard_properties_set_.end()) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<DocHitInfoIterator> delegate,
+ wildcard_index_storage_->GetIterator(key_lower, key_upper));
+ std::set<std::string> property_paths = {std::move(property_path_str)};
+ return DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(delegate), &document_store, &schema_store,
+ std::move(property_paths), current_time_ms);
+ }
+
+ // Return an empty iterator.
+ return std::make_unique<DocHitInfoIteratorNumeric<int64_t>>(
+ /*numeric_index_iter=*/nullptr);
+}
+
+libtextclassifier3::Status IntegerIndex::AddPropertyToWildcardStorage(
+ const std::string& property_path) {
+ SetDirty();
+
+ WildcardPropertyStorage wildcard_properties;
+ wildcard_properties.mutable_property_entries()->Reserve(
+ wildcard_properties_set_.size());
+ for (const std::string& property_path : wildcard_properties_set_) {
+ wildcard_properties.add_property_entries(property_path);
+ }
+ ICING_RETURN_IF_ERROR(wildcard_property_storage_->Write(
+ std::make_unique<WildcardPropertyStorage>(
+ std::move(wildcard_properties))));
+
+ wildcard_properties_set_.insert(property_path);
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status IntegerIndex::Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ DocumentId new_last_added_document_id) {
+ std::string temp_working_path = working_path_ + "_temp";
+ ICING_RETURN_IF_ERROR(Discard(filesystem_, temp_working_path));
+
+ DestructibleDirectory temp_working_path_ddir(&filesystem_,
+ std::move(temp_working_path));
+ if (!temp_working_path_ddir.is_valid()) {
+ return absl_ports::InternalError(
+ "Unable to create temp directory to build new integer index");
+ }
+
+ {
+ // Transfer all indexed data from current integer index to new integer
+ // index. Also PersistToDisk and destruct the instance after finishing, so
+ // we can safely swap directories later.
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<IntegerIndex> new_integer_index,
+ Create(filesystem_, temp_working_path_ddir.dir(),
+ num_data_threshold_for_bucket_split_, pre_mapping_fbv_));
+ ICING_RETURN_IF_ERROR(
+ TransferIndex(document_id_old_to_new, new_integer_index.get()));
+ new_integer_index->set_last_added_document_id(new_last_added_document_id);
+ ICING_RETURN_IF_ERROR(new_integer_index->PersistToDisk());
+ }
+
+ // Destruct current storage instances to safely swap directories.
+ metadata_mmapped_file_.reset();
+ property_to_storage_map_.clear();
+ wildcard_index_storage_.reset();
+ wildcard_property_storage_.reset();
+ if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(),
+ working_path_.c_str())) {
+ return absl_ports::InternalError(
+ "Unable to apply new integer index due to failed swap");
+ }
+
+ // Reinitialize the integer index.
+ std::string metadata_file_path = GetMetadataFilePath(working_path_);
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile metadata_mmapped_file,
+ MemoryMappedFile::Create(filesystem_, metadata_file_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/kMetadataFileSize,
+ /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/kMetadataFileSize));
+ if (metadata_mmapped_file.available_size() != kMetadataFileSize) {
+ return absl_ports::InternalError(
+ "Invalid metadata file size after Optimize");
+ }
+ metadata_mmapped_file_ =
+ std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file));
+
+ // Recreate all of the data structures tracking the wildcard storage.
+ std::string wildcard_property_path =
+ GetWildcardPropertyStorageFilePath(working_path_);
+ wildcard_property_storage_ =
+ std::make_unique<FileBackedProto<WildcardPropertyStorage>>(
+ filesystem_, wildcard_property_path);
+
+ ICING_ASSIGN_OR_RETURN(wildcard_properties_set_,
+ CreatePropertySet(*wildcard_property_storage_));
+ if (!wildcard_properties_set_.empty()) {
+ ICING_ASSIGN_OR_RETURN(
+ wildcard_index_storage_,
+ IntegerIndexStorage::Create(
+ filesystem_,
+ GetPropertyIndexStoragePath(working_path_,
+ kWildcardPropertyIndexFileName),
+ IntegerIndexStorage::Options(num_data_threshold_for_bucket_split_,
+ pre_mapping_fbv_),
+ posting_list_serializer_.get()));
+ }
+
+ // Initialize all existing integer index storages.
+ ICING_ASSIGN_OR_RETURN(
+ property_to_storage_map_,
+ GetPropertyIntegerIndexStorageMap(
+ filesystem_, working_path_, posting_list_serializer_.get(),
+ num_data_threshold_for_bucket_split_, pre_mapping_fbv_));
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status IntegerIndex::Clear() {
+ SetDirty();
+
+ // Step 1: clear property_to_storage_map_.
+ property_to_storage_map_.clear();
+ wildcard_index_storage_.reset();
+
+ // Step 2: delete all IntegerIndexStorages. It is safe because there is no
+ // active IntegerIndexStorage after clearing the map.
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<std::string> property_paths,
+ GetAllExistingPropertyPaths(filesystem_, working_path_));
+ for (const std::string& property_path : property_paths) {
+ ICING_RETURN_IF_ERROR(IntegerIndexStorage::Discard(
+ filesystem_,
+ GetPropertyIndexStoragePath(working_path_, property_path)));
+ }
+
+  // Step 3: Delete the wildcard property storage, if it exists. It is only an
+  // error if the file exists but cannot be deleted.
+  std::string wildcard_property_path =
+      GetWildcardPropertyStorageFilePath(working_path_);
+  if (filesystem_.FileExists(wildcard_property_path.c_str()) &&
+      !filesystem_.DeleteFile(wildcard_property_path.c_str())) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Unable to delete file at path ", wildcard_property_path));
+ }
+
+ info().last_added_document_id = kInvalidDocumentId;
+ return libtextclassifier3::Status::OK;
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
+IntegerIndex::InitializeNewFiles(const Filesystem& filesystem,
+ std::string&& working_path,
+ int32_t num_data_threshold_for_bucket_split,
+ bool pre_mapping_fbv) {
+ // Create working directory.
+ if (!filesystem.CreateDirectoryRecursively(working_path.c_str())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to create directory: ", working_path));
+ }
+
+ // Initialize metadata file. Create MemoryMappedFile with pre-mapping, and
+ // call GrowAndRemapIfNecessary to grow the underlying file.
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile metadata_mmapped_file,
+ MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/kMetadataFileSize,
+ /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/kMetadataFileSize));
+ ICING_RETURN_IF_ERROR(metadata_mmapped_file.GrowAndRemapIfNecessary(
+ /*file_offset=*/0, /*mmap_size=*/kMetadataFileSize));
+
+ std::string wildcard_property_path =
+ GetWildcardPropertyStorageFilePath(working_path);
+ auto wildcard_property_storage =
+ std::make_unique<FileBackedProto<WildcardPropertyStorage>>(
+ filesystem, wildcard_property_path);
+
+ // Create instance.
+ auto new_integer_index = std::unique_ptr<IntegerIndex>(new IntegerIndex(
+ filesystem, std::move(working_path),
+ std::make_unique<PostingListIntegerIndexSerializer>(),
+ std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)),
+ /*property_to_storage_map=*/{}, std::move(wildcard_property_storage),
+ /*wildcard_properties_set=*/{}, /*wildcard_index_storage=*/nullptr,
+ num_data_threshold_for_bucket_split, pre_mapping_fbv));
+
+ // Initialize info content by writing mapped memory directly.
+ Info& info_ref = new_integer_index->info();
+ info_ref.magic = Info::kMagic;
+ info_ref.last_added_document_id = kInvalidDocumentId;
+ info_ref.num_data_threshold_for_bucket_split =
+ num_data_threshold_for_bucket_split;
+ // Initialize new PersistentStorage. The initial checksums will be computed
+ // and set via InitializeNewStorage.
+ ICING_RETURN_IF_ERROR(new_integer_index->InitializeNewStorage());
+
+ return new_integer_index;
+}
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
+IntegerIndex::InitializeExistingFiles(
+ const Filesystem& filesystem, std::string&& working_path,
+ int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv) {
+ // Mmap the content of the crcs and info.
+ ICING_ASSIGN_OR_RETURN(
+ MemoryMappedFile metadata_mmapped_file,
+ MemoryMappedFile::Create(filesystem, GetMetadataFilePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ /*max_file_size=*/kMetadataFileSize,
+ /*pre_mapping_file_offset=*/0,
+ /*pre_mapping_mmap_size=*/kMetadataFileSize));
+ if (metadata_mmapped_file.available_size() != kMetadataFileSize) {
+ return absl_ports::FailedPreconditionError("Incorrect metadata file size");
+ }
+
+ auto posting_list_serializer =
+ std::make_unique<PostingListIntegerIndexSerializer>();
+
+ // Initialize all existing integer index storages.
+ ICING_ASSIGN_OR_RETURN(
+ PropertyToStorageMapType property_to_storage_map,
+ GetPropertyIntegerIndexStorageMap(
+ filesystem, working_path, posting_list_serializer.get(),
+ num_data_threshold_for_bucket_split, pre_mapping_fbv));
+
+ std::string wildcard_property_path =
+ GetWildcardPropertyStorageFilePath(working_path);
+ auto wildcard_property_storage =
+ std::make_unique<FileBackedProto<WildcardPropertyStorage>>(
+ filesystem, wildcard_property_path);
+
+ ICING_ASSIGN_OR_RETURN(
+ std::unordered_set<std::string> wildcard_properties_set,
+ CreatePropertySet(*wildcard_property_storage));
+
+ std::unique_ptr<IntegerIndexStorage> wildcard_index_storage;
+ if (!wildcard_properties_set.empty()) {
+ ICING_ASSIGN_OR_RETURN(
+ wildcard_index_storage,
+ IntegerIndexStorage::Create(
+ filesystem,
+ GetPropertyIndexStoragePath(working_path,
+ kWildcardPropertyIndexFileName),
+ IntegerIndexStorage::Options(num_data_threshold_for_bucket_split,
+ pre_mapping_fbv),
+ posting_list_serializer.get()));
+ }
+
+ // Create instance.
+ auto integer_index = std::unique_ptr<IntegerIndex>(new IntegerIndex(
+ filesystem, std::move(working_path), std::move(posting_list_serializer),
+ std::make_unique<MemoryMappedFile>(std::move(metadata_mmapped_file)),
+ std::move(property_to_storage_map), std::move(wildcard_property_storage),
+ std::move(wildcard_properties_set), std::move(wildcard_index_storage),
+ num_data_threshold_for_bucket_split, pre_mapping_fbv));
+ // Initialize existing PersistentStorage. Checksums will be validated.
+ ICING_RETURN_IF_ERROR(integer_index->InitializeExistingStorage());
+
+ // Validate magic.
+ if (integer_index->info().magic != Info::kMagic) {
+ return absl_ports::FailedPreconditionError("Incorrect magic value");
+ }
+
+  // If num_data_threshold_for_bucket_split mismatches, then return an error to
+  // let the caller rebuild.
+ if (integer_index->info().num_data_threshold_for_bucket_split !=
+ num_data_threshold_for_bucket_split) {
+ return absl_ports::FailedPreconditionError(
+ "Mismatch num_data_threshold_for_bucket_split");
+ }
+
+ return integer_index;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+IntegerIndex::TransferIntegerIndexStorage(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const IntegerIndexStorage* old_storage, const std::string& property_path,
+ IntegerIndex* new_integer_index) const {
+ std::string new_storage_working_path = GetPropertyIndexStoragePath(
+ new_integer_index->working_path_, property_path);
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<IntegerIndexStorage> new_storage,
+ IntegerIndexStorage::Create(
+ new_integer_index->filesystem_, new_storage_working_path,
+ IntegerIndexStorage::Options(num_data_threshold_for_bucket_split_,
+ pre_mapping_fbv_),
+ new_integer_index->posting_list_serializer_.get()));
+
+ ICING_RETURN_IF_ERROR(
+ old_storage->TransferIndex(document_id_old_to_new, new_storage.get()));
+
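+  // If no data survived the document id translation, discard the new storage
+  // so that the new integer index doesn't keep an empty per-property
+  // directory around.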
+ if (new_storage->num_data() == 0) {
+ new_storage.reset();
+ ICING_RETURN_IF_ERROR(
+ IntegerIndexStorage::Discard(filesystem_, new_storage_working_path));
+ }
+ return new_storage;
+}
+
+libtextclassifier3::Status IntegerIndex::TransferWildcardStorage(
+ IntegerIndex* new_integer_index) const {
+ auto property_storage = std::make_unique<WildcardPropertyStorage>();
+ property_storage->mutable_property_entries()->Reserve(
+ wildcard_properties_set_.size());
+ for (const std::string& property : wildcard_properties_set_) {
+ property_storage->add_property_entries(property);
+ }
+
+ ICING_RETURN_IF_ERROR(new_integer_index->wildcard_property_storage_->Write(
+ std::move(property_storage)));
+ new_integer_index->wildcard_properties_set_ = wildcard_properties_set_;
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status IntegerIndex::TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ IntegerIndex* new_integer_index) const {
+ // Transfer over the integer index storages
+ std::unique_ptr<IntegerIndexStorage> new_storage;
+ for (const auto& [property_path, old_storage] : property_to_storage_map_) {
+ ICING_ASSIGN_OR_RETURN(
+ new_storage,
+ TransferIntegerIndexStorage(document_id_old_to_new, old_storage.get(),
+ property_path, new_integer_index));
+ if (new_storage != nullptr) {
+ new_integer_index->property_to_storage_map_.insert(
+ {property_path, std::move(new_storage)});
+ }
+ }
+ if (wildcard_index_storage_ != nullptr) {
+ ICING_ASSIGN_OR_RETURN(
+ new_storage,
+ TransferIntegerIndexStorage(
+ document_id_old_to_new, wildcard_index_storage_.get(),
+ std::string(kWildcardPropertyIndexFileName), new_integer_index));
+ if (new_storage != nullptr) {
+ new_integer_index->wildcard_index_storage_ = std::move(new_storage);
+
+ // The only time we need to copy over the list of properties using
+ // wildcard storage is if wildcard_index_storage and new_storage are both
+ // non-null. Otherwise, the new wildcard index storage won't have any
+ // data.
+ ICING_RETURN_IF_ERROR(TransferWildcardStorage(new_integer_index));
+ }
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status IntegerIndex::PersistStoragesToDisk(bool force) {
+ if (!force && !is_storage_dirty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ for (auto& [_, storage] : property_to_storage_map_) {
+ ICING_RETURN_IF_ERROR(storage->PersistToDisk());
+ }
+ // No need to persist wildcard_properties_storage_. All calls to
+ // FileBackedProto::Write are fully written through at the time of the call.
+ if (wildcard_index_storage_) {
+ ICING_RETURN_IF_ERROR(wildcard_index_storage_->PersistToDisk());
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status IntegerIndex::PersistMetadataToDisk(bool force) {
+ if (!force && !is_info_dirty() && !is_storage_dirty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ // Changes should have been applied to the underlying file when using
+ // MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC, but call msync() as an
+ // extra safety step to ensure they are written out.
+ return metadata_mmapped_file_->PersistToDisk();
+}
+
+libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeInfoChecksum(
+ bool force) {
+ if (!force && !is_info_dirty()) {
+ return Crc32(crcs().component_crcs.info_crc);
+ }
+
+ return info().ComputeChecksum();
+}
+
+libtextclassifier3::StatusOr<Crc32> IntegerIndex::ComputeStoragesChecksum(
+ bool force) {
+ if (!force && !is_storage_dirty()) {
+ return Crc32(crcs().component_crcs.storages_crc);
+ }
+
+ // XOR all crcs of all storages. Since XOR is commutative and associative,
+ // the order doesn't matter.
+ uint32_t storages_checksum = 0;
+ for (auto& [property_path, storage] : property_to_storage_map_) {
+ ICING_ASSIGN_OR_RETURN(Crc32 storage_crc, storage->UpdateChecksums());
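+    // Mix the property path into each storage's crc so that two properties
+    // with identical storage contents still contribute distinct values.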
+ storage_crc.Append(property_path);
+
+ storages_checksum ^= storage_crc.Get();
+ }
+
+ if (wildcard_index_storage_ != nullptr) {
+ ICING_ASSIGN_OR_RETURN(Crc32 storage_crc,
+ wildcard_index_storage_->UpdateChecksums());
+ storages_checksum ^= storage_crc.Get();
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 wildcard_properties_crc,
+ wildcard_property_storage_->ComputeChecksum());
+ storages_checksum ^= wildcard_properties_crc.Get();
+
+ return Crc32(storages_checksum);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/integer-index.h b/icing/index/numeric/integer-index.h
new file mode 100644
index 0000000..e7a3127
--- /dev/null
+++ b/icing/index/numeric/integer-index.h
@@ -0,0 +1,409 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_NUMERIC_INTEGER_INDEX_H_
+#define ICING_INDEX_NUMERIC_INTEGER_INDEX_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/file-backed-proto.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/index/numeric/integer-index-storage.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+#include "icing/index/numeric/wildcard-property-storage.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// IntegerIndex: a wrapper class for managing IntegerIndexStorage (a lower level
+// persistent storage class for indexing and searching contents of integer type
+// sections in documents) instances for different property paths.
+// We separate indexable integer data from different properties into different
+// storages, and IntegerIndex manages and handles indexable integer data
+// appropriately to their corresponding IntegerIndexStorage instance according
+// to the given property path.
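+//
+// A minimal usage sketch (the working path, property path, and ids below are
+// hypothetical):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<IntegerIndex> index,
+//       IntegerIndex::Create(
+//           filesystem, "/tmp/integer_index_demo",
+//           IntegerIndex::kDefaultNumDataThresholdForBucketSplit,
+//           /*pre_mapping_fbv=*/false));
+//   std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+//       index->Edit("price", /*document_id=*/0, /*section_id=*/0);
+//   ICING_RETURN_IF_ERROR(editor->BufferKey(42));
+//   ICING_RETURN_IF_ERROR(std::move(*editor).IndexAllBufferedKeys());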
+class IntegerIndex : public NumericIndex<int64_t> {
+ public:
+ using PropertyToStorageMapType =
+ std::unordered_map<std::string, std::unique_ptr<IntegerIndexStorage>>;
+
+ // Maximum number of individual property storages that this index will allow
+ // before falling back to placing hits for any new properties into the
+ // 'wildcard' storage.
+ static constexpr int kMaxPropertyStorages = 32;
+
+ static constexpr int32_t kDefaultNumDataThresholdForBucketSplit =
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit;
+
+ struct Info {
+ static constexpr int32_t kMagic = 0x5d8a1e8a;
+
+ int32_t magic;
+ DocumentId last_added_document_id;
+ int32_t num_data_threshold_for_bucket_split;
+
+ Crc32 ComputeChecksum() const {
+ return Crc32(
+ std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
+ }
+ } __attribute__((packed));
+ static_assert(sizeof(Info) == 12, "");
+
+ // Metadata file layout: <Crcs><Info>
+ static constexpr int32_t kCrcsMetadataFileOffset = 0;
+ static constexpr int32_t kInfoMetadataFileOffset =
+ static_cast<int32_t>(sizeof(Crcs));
+ static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
+ static_assert(kMetadataFileSize == 24, "");
+
+ static constexpr WorkingPathType kWorkingPathType =
+ WorkingPathType::kDirectory;
+ static constexpr std::string_view kFilePrefix = "integer_index";
+
+  // Creates a new IntegerIndex instance to index integers. If any of the
+  // underlying files is missing, then delete the whole working_path and
+  // (re)initialize with new ones. Otherwise initialize and create the instance
+  // from the existing files.
+ //
+ // filesystem: Object to make system level calls
+ // working_path: Specifies the working path for PersistentStorage.
+ // IntegerIndex uses working path as working directory and all
+ // related files will be stored under this directory. See
+ // PersistentStorage for more details about the concept of
+ // working_path.
+ // num_data_threshold_for_bucket_split: see IntegerIndexStorage::Options for
+ // more details.
+  // pre_mapping_fbv: flag indicating whether to memory-map the max possible
+  //                  file size for the underlying FileBackedVector before
+  //                  growing the actual file size.
+ //
+ // Returns:
+ // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
+ // checksum.
+ // - INTERNAL_ERROR on I/O errors.
+ // - Any FileBackedVector/MemoryMappedFile errors.
+ static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>> Create(
+ const Filesystem& filesystem, std::string working_path,
+ int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv);
+
+ // Deletes IntegerIndex under working_path.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ static libtextclassifier3::Status Discard(const Filesystem& filesystem,
+ const std::string& working_path) {
+ return PersistentStorage::Discard(filesystem, working_path,
+ kWorkingPathType);
+ }
+
+ ~IntegerIndex() override;
+
+ // Returns an Editor instance for adding new records into integer index for a
+ // given property, DocumentId and SectionId. See Editor for more details.
+ std::unique_ptr<typename NumericIndex<int64_t>::Editor> Edit(
+ std::string_view property_path, DocumentId document_id,
+ SectionId section_id) override {
+ return std::make_unique<Editor>(property_path, document_id, section_id,
+ *this, num_data_threshold_for_bucket_split_,
+ pre_mapping_fbv_);
+ }
+
+ // Returns a DocHitInfoIterator for iterating through all docs which have the
+ // specified (integer) property contents in range [query_key_lower,
+ // query_key_upper].
+ // When iterating through all relevant doc hits, it:
+ // - Merges multiple SectionIds of doc hits with same DocumentId into a single
+ // SectionIdMask and constructs DocHitInfo.
+ // - Returns DocHitInfo in descending DocumentId order.
+ //
+ // Returns:
+ // - On success: a DocHitInfoIterator instance
+ // - NOT_FOUND_ERROR if the given property_path doesn't exist
+ // - Any IntegerIndexStorage errors
+ libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> GetIterator(
+ std::string_view property_path, int64_t key_lower, int64_t key_upper,
+ const DocumentStore& document_store, const SchemaStore& schema_store,
+ int64_t current_time_ms) const override;
+
+ // Reduces internal file sizes by reclaiming space and ids of deleted
+ // documents. Integer index will convert all data (hits) to the new document
+ // ids and regenerate all index files. If all data in a property path are
+ // completely deleted, then the underlying storage will be discarded as well.
+ //
+ // - document_id_old_to_new: a map for converting old document id to new
+ // document id.
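+  //   For example, {0, kInvalidDocumentId, 1} keeps old document 0 as 0,
+  //   deletes old document 1, and renumbers old document 2 to 1.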
+ // - new_last_added_document_id: will be used to update the last added
+ // document id in the integer index.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on IO error
+ libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ DocumentId new_last_added_document_id) override;
+
+  // Clears all integer index data by discarding all existing storages, and
+  // sets last_added_document_id to kInvalidDocumentId.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status Clear() override;
+
+ DocumentId last_added_document_id() const override {
+ return info().last_added_document_id;
+ }
+
+ void set_last_added_document_id(DocumentId document_id) override {
+ SetInfoDirty();
+
+ Info& info_ref = info();
+ if (info_ref.last_added_document_id == kInvalidDocumentId ||
+ document_id > info_ref.last_added_document_id) {
+ info_ref.last_added_document_id = document_id;
+ }
+ }
+
+ int num_property_indices() const override {
+ return property_to_storage_map_.size() +
+ ((wildcard_index_storage_ == nullptr) ? 0 : 1);
+ }
+
+ private:
+ class Editor : public NumericIndex<int64_t>::Editor {
+ public:
+ explicit Editor(std::string_view property_path, DocumentId document_id,
+ SectionId section_id, IntegerIndex& integer_index,
+ int32_t num_data_threshold_for_bucket_split,
+ bool pre_mapping_fbv)
+ : NumericIndex<int64_t>::Editor(property_path, document_id, section_id),
+ integer_index_(integer_index),
+ num_data_threshold_for_bucket_split_(
+ num_data_threshold_for_bucket_split),
+ pre_mapping_fbv_(pre_mapping_fbv) {}
+
+ ~Editor() override = default;
+
+ libtextclassifier3::Status BufferKey(int64_t key) override {
+ seen_keys_.push_back(key);
+ return libtextclassifier3::Status::OK;
+ }
+
+ libtextclassifier3::Status IndexAllBufferedKeys() && override;
+
+ private:
+    // Vector for caching all seen keys. Since IntegerIndexStorage::AddKeys
+    // sorts and dedupes keys, we can simply use a vector here and move it into
+    // AddKeys().
+ std::vector<int64_t> seen_keys_;
+
+ IntegerIndex& integer_index_; // Does not own.
+
+ int32_t num_data_threshold_for_bucket_split_;
+
+    // Flag indicating whether to memory-map the max possible file size for the
+    // underlying FileBackedVector before growing the actual file size.
+ bool pre_mapping_fbv_;
+ };
+
+ explicit IntegerIndex(
+ const Filesystem& filesystem, std::string&& working_path,
+ std::unique_ptr<PostingListIntegerIndexSerializer>
+ posting_list_serializer,
+ std::unique_ptr<MemoryMappedFile> metadata_mmapped_file,
+ PropertyToStorageMapType&& property_to_storage_map,
+ std::unique_ptr<FileBackedProto<WildcardPropertyStorage>>
+ wildcard_property_storage,
+ std::unordered_set<std::string> wildcard_properties_set,
+ std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage,
+ int32_t num_data_threshold_for_bucket_split, bool pre_mapping_fbv)
+ : NumericIndex<int64_t>(filesystem, std::move(working_path),
+ kWorkingPathType),
+ posting_list_serializer_(std::move(posting_list_serializer)),
+ metadata_mmapped_file_(std::move(metadata_mmapped_file)),
+ property_to_storage_map_(std::move(property_to_storage_map)),
+ wildcard_property_storage_(std::move(wildcard_property_storage)),
+ wildcard_properties_set_(std::move(wildcard_properties_set)),
+ wildcard_index_storage_(std::move(wildcard_index_storage)),
+ num_data_threshold_for_bucket_split_(
+ num_data_threshold_for_bucket_split),
+ pre_mapping_fbv_(pre_mapping_fbv),
+ is_info_dirty_(false),
+ is_storage_dirty_(false) {}
+
+ static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
+ InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path,
+ int32_t num_data_threshold_for_bucket_split,
+ bool pre_mapping_fbv);
+
+ static libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
+ InitializeExistingFiles(const Filesystem& filesystem,
+ std::string&& working_path,
+ int32_t num_data_threshold_for_bucket_split,
+ bool pre_mapping_fbv);
+
+ // Adds the property path to the list of properties using wildcard storage.
+ // This updates both the in-memory list (wildcard_properties_set_) and the
+ // persistent list (wildcard_property_storage_).
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR if unable to successfully persist updated properties
+ // list in wildcard_property_storage_.
+ libtextclassifier3::Status AddPropertyToWildcardStorage(
+ const std::string& property_path);
+
+ // Transfers integer index data from the current integer index to
+ // new_integer_index.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error. This could potentially leave the storages
+ // in an invalid state and the caller should handle it properly (e.g.
+ // discard and rebuild)
+ libtextclassifier3::Status TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ IntegerIndex* new_integer_index) const;
+
+ // Transfers integer index data from old_storage to new_integer_index.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error. This could potentially leave the storages
+ // in an invalid state and the caller should handle it properly (e.g.
+ // discard and rebuild)
+ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndexStorage>>
+ TransferIntegerIndexStorage(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const IntegerIndexStorage* old_storage, const std::string& property_path,
+ IntegerIndex* new_integer_index) const;
+
+ // Transfers the persistent and in-memory lists of properties using the
+ // wildcard storage from the current integer index to new_integer_index.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR if unable to successfully persist updated properties
+ // list in new_integer_index.
+ libtextclassifier3::Status TransferWildcardStorage(
+ IntegerIndex* new_integer_index) const;
+
+ // Flushes contents of all storages to underlying files.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status PersistStoragesToDisk(bool force) override;
+
+ // Flushes contents of metadata file.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status PersistMetadataToDisk(bool force) override;
+
+ // Computes and returns Info checksum.
+ //
+ // Returns:
+ // - Crc of the Info on success
+ libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override;
+
+ // Computes and returns the combined checksum of all storages. The checksums
+ // of (storage_crc, property_path) pairs for all existing property paths are
+ // combined together by XOR.
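+ //
+ // Illustrative pseudo-code only (not the actual implementation):
+ //   uint32_t total = 0;
+ //   for (const auto& [property_path, storage] : storages)
+ //     total ^= crc_of(property_path, storage_crc);
+ // XOR keeps the combined checksum independent of iteration order.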
+ //
+ // Returns:
+ // - Crc of all storages on success
+ // - INTERNAL_ERROR if any data inconsistency
+ libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
+ bool force) override;
+
+ Crcs& crcs() override {
+ return *reinterpret_cast<Crcs*>(metadata_mmapped_file_->mutable_region() +
+ kCrcsMetadataFileOffset);
+ }
+
+ const Crcs& crcs() const override {
+ return *reinterpret_cast<const Crcs*>(metadata_mmapped_file_->region() +
+ kCrcsMetadataFileOffset);
+ }
+
+ Info& info() {
+ return *reinterpret_cast<Info*>(metadata_mmapped_file_->mutable_region() +
+ kInfoMetadataFileOffset);
+ }
+
+ const Info& info() const {
+ return *reinterpret_cast<const Info*>(metadata_mmapped_file_->region() +
+ kInfoMetadataFileOffset);
+ }
+
+ void SetInfoDirty() { is_info_dirty_ = true; }
+ // When the storage is dirty, the info must be marked dirty as well, so we
+ // expose a single SetDirty that sets both.
+ void SetDirty() {
+ is_info_dirty_ = true;
+ is_storage_dirty_ = true;
+ }
+
+ bool is_info_dirty() const { return is_info_dirty_; }
+ bool is_storage_dirty() const { return is_storage_dirty_; }
+
+ std::unique_ptr<PostingListIntegerIndexSerializer> posting_list_serializer_;
+
+ std::unique_ptr<MemoryMappedFile> metadata_mmapped_file_;
+
+ // Property path to integer index storage map.
+ PropertyToStorageMapType property_to_storage_map_;
+
+ // Persistent list of properties that have added content to
+ // wildcard_index_storage_.
+ std::unique_ptr<FileBackedProto<WildcardPropertyStorage>>
+ wildcard_property_storage_;
+
+ // In-memory list of properties that have added content to
+ // wildcard_index_storage_.
+ std::unordered_set<std::string> wildcard_properties_set_;
+
+ // The index storage used once kMaxPropertyStorages individual storages have
+ // already been created in property_to_storage_map_.
+ std::unique_ptr<icing::lib::IntegerIndexStorage> wildcard_index_storage_;
+
+ int32_t num_data_threshold_for_bucket_split_;
+
+ // Flag indicating whether to memory-map the maximum possible file size for
+ // the underlying FileBackedVector before growing the actual file size.
+
+ bool is_info_dirty_;
+ bool is_storage_dirty_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_NUMERIC_INTEGER_INDEX_H_
diff --git a/icing/index/numeric/integer-index_test.cc b/icing/index/numeric/integer-index_test.cc
new file mode 100644
index 0000000..3b60001
--- /dev/null
+++ b/icing/index/numeric/integer-index_test.cc
@@ -0,0 +1,2598 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/integer-index.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <type_traits>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/dummy-numeric-index.h"
+#include "icing/index/numeric/integer-index-storage.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+using ::testing::Lt;
+
+using Crcs = PersistentStorage::Crcs;
+using Info = IntegerIndex::Info;
+
+static constexpr int32_t kCorruptedValueOffset = 3;
+static constexpr std::string_view kDefaultTestPropertyPath = "test.property";
+
+constexpr SectionId kDefaultSectionId = 0;
+
+template <typename T>
+class NumericIndexIntegerTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ base_dir_ = GetTestTempDir() + "/icing";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ working_path_ = base_dir_ + "/numeric_index_integer_test";
+ std::string schema_dir = base_dir_ + "/schema_test";
+
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(schema_dir.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_, SchemaStore::Create(&filesystem_, schema_dir, &clock_));
+
+ std::string document_store_dir = base_dir_ + "/doc_store_test";
+ ASSERT_TRUE(
+ filesystem_.CreateDirectoryRecursively(document_store_dir.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult doc_store_create_result,
+ DocumentStore::Create(
+ &filesystem_, document_store_dir, &clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ doc_store_ = std::move(doc_store_create_result.document_store);
+ }
+
+ void TearDown() override {
+ doc_store_.reset();
+ schema_store_.reset();
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ template <typename UnknownIntegerIndexType>
+ libtextclassifier3::StatusOr<std::unique_ptr<NumericIndex<int64_t>>>
+ CreateIntegerIndex() {
+ return absl_ports::InvalidArgumentError("Unknown type");
+ }
+
+ template <>
+ libtextclassifier3::StatusOr<std::unique_ptr<NumericIndex<int64_t>>>
+ CreateIntegerIndex<DummyNumericIndex<int64_t>>() {
+ return DummyNumericIndex<int64_t>::Create(filesystem_, working_path_);
+ }
+
+ template <>
+ libtextclassifier3::StatusOr<std::unique_ptr<NumericIndex<int64_t>>>
+ CreateIntegerIndex<IntegerIndex>() {
+ return IntegerIndex::Create(
+ filesystem_, working_path_, /*num_data_threshold_for_bucket_split=*/
+ IntegerIndexStorage::kDefaultNumDataThresholdForBucketSplit,
+ /*pre_mapping_fbv=*/false);
+ }
+
+ template <typename NotIntegerIndexType>
+ bool is_integer_index() const {
+ return false;
+ }
+
+ template <>
+ bool is_integer_index<IntegerIndex>() const {
+ return true;
+ }
+
+ libtextclassifier3::StatusOr<std::vector<DocumentId>> CompactDocStore() {
+ std::string document_store_dir = base_dir_ + "/doc_store_test";
+ std::string document_store_compact_dir =
+ base_dir_ + "/doc_store_compact_test";
+ if (!filesystem_.CreateDirectoryRecursively(
+ document_store_compact_dir.c_str())) {
+ return absl_ports::InternalError("Unable to create compact directory");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ DocumentStore::OptimizeResult doc_store_optimize_result,
+ doc_store_->OptimizeInto(document_store_compact_dir, nullptr));
+
+ doc_store_.reset();
+ if (!filesystem_.SwapFiles(document_store_dir.c_str(),
+ document_store_compact_dir.c_str())) {
+ return absl_ports::InternalError("Unable to swap directories.");
+ }
+ if (!filesystem_.DeleteDirectoryRecursively(
+ document_store_compact_dir.c_str())) {
+ return absl_ports::InternalError("Unable to delete compact directory");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ DocumentStore::CreateResult doc_store_create_result,
+ DocumentStore::Create(
+ &filesystem_, document_store_dir, &clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ doc_store_ = std::move(doc_store_create_result.document_store);
+ return std::move(doc_store_optimize_result.document_id_old_to_new);
+ }
+
+ libtextclassifier3::StatusOr<std::vector<DocHitInfo>> Query(
+ const NumericIndex<int64_t>* integer_index,
+ std::string_view property_path, int64_t key_lower, int64_t key_upper) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<DocHitInfoIterator> iter,
+ integer_index->GetIterator(property_path, key_lower, key_upper,
+ *doc_store_, *schema_store_,
+ clock_.GetSystemTimeMilliseconds()));
+
+ std::vector<DocHitInfo> result;
+ while (iter->Advance().ok()) {
+ result.push_back(iter->doc_hit_info());
+ }
+ return result;
+ }
+
+ Filesystem filesystem_;
+ std::string base_dir_;
+ std::string working_path_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> doc_store_;
+ Clock clock_;
+};
+
+void Index(NumericIndex<int64_t>* integer_index, std::string_view property_path,
+ DocumentId document_id, SectionId section_id,
+ std::vector<int64_t> keys) {
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ integer_index->Edit(property_path, document_id, section_id);
+
+ for (const auto& key : keys) {
+ ICING_EXPECT_OK(editor->BufferKey(key));
+ }
+ ICING_EXPECT_OK(std::move(*editor).IndexAllBufferedKeys());
+}
+
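+// Each typed test below runs against both the in-memory DummyNumericIndex and
+// the persistent IntegerIndex.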
+using TestTypes = ::testing::Types<DummyNumericIndex<int64_t>, IntegerIndex>;
+TYPED_TEST_SUITE(NumericIndexIntegerTest, TestTypes);
+
+TYPED_TEST(NumericIndexIntegerTest, SetLastAddedDocumentId) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ constexpr DocumentId kDocumentId = 100;
+ integer_index->set_last_added_document_id(kDocumentId);
+ EXPECT_THAT(integer_index->last_added_document_id(), Eq(kDocumentId));
+
+ constexpr DocumentId kNextDocumentId = 123;
+ integer_index->set_last_added_document_id(kNextDocumentId);
+ EXPECT_THAT(integer_index->last_added_document_id(), Eq(kNextDocumentId));
+}
+
+TYPED_TEST(
+ NumericIndexIntegerTest,
+ SetLastAddedDocumentIdShouldIgnoreNewDocumentIdNotGreaterThanTheCurrent) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ constexpr DocumentId kDocumentId = 123;
+ integer_index->set_last_added_document_id(kDocumentId);
+ ASSERT_THAT(integer_index->last_added_document_id(), Eq(kDocumentId));
+
+ constexpr DocumentId kNextDocumentId = 100;
+ ASSERT_THAT(kNextDocumentId, Lt(kDocumentId));
+ integer_index->set_last_added_document_id(kNextDocumentId);
+ // last_added_document_id() should remain unchanged.
+ EXPECT_THAT(integer_index->last_added_document_id(), Eq(kDocumentId));
+}
+
+TYPED_TEST(NumericIndexIntegerTest, SingleKeyExactQuery) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ kDefaultSectionId, /*keys=*/{1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{3});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ kDefaultSectionId, /*keys=*/{2});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3,
+ kDefaultSectionId, /*keys=*/{0});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/4,
+ kDefaultSectionId, /*keys=*/{4});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
+ kDefaultSectionId, /*keys=*/{2});
+
+ int64_t query_key = 2;
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/query_key, /*key_upper=*/query_key),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections))));
+}
+
+TYPED_TEST(NumericIndexIntegerTest, SingleKeyRangeQuery) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ kDefaultSectionId, /*keys=*/{1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{3});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ kDefaultSectionId, /*keys=*/{2});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3,
+ kDefaultSectionId, /*keys=*/{0});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/4,
+ kDefaultSectionId, /*keys=*/{4});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
+ kDefaultSectionId, /*keys=*/{2});
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+}
+
+TYPED_TEST(NumericIndexIntegerTest, WildcardStorageQuery) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ // This test sets its schema assuming that max property storages == 32.
+ ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32));
+
+ PropertyConfigProto int_property_config =
+ PropertyConfigBuilder()
+ .SetName("otherProperty1")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .Build();
+ // Create a schema with two types:
+ // - TypeA has 34 properties:
+ // 'desiredProperty', 'otherProperty'*, 'undesiredProperty'
+ // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty'
+ // 1. The 32 'otherProperty's will consume all of the individual storages
+ // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned
+ // SectionId = 0 for their respective types.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(int_property_config)
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty2"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty3"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty4"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty5"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty6"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty7"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty8"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty9"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty10"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty11"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty12"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty13"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty14"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty15"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty16"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty17"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty18"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty19"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty20"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty21"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty22"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty23"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty24"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty25"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty26"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty27"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty28"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty29"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty30"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty31"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty32"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("undesiredProperty")))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeB")
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("anotherProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty")))
+ .Build();
+ ICING_ASSERT_OK(this->schema_store_->SetSchema(
+ schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Put 11 docs of "TypeA" into the document store.
+ DocumentProto doc =
+ DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build()));
+
+ // Put 10 docs of "TypeB" into the document store.
+ doc = DocumentBuilder(doc).SetUri("uri11").SetSchema("TypeB").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri14").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri15").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri16").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri17").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri18").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri19").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri20").Build()));
+
+ // Section ids are assigned alphabetically within each type, so the property
+ // ids are:
+ // TypeA.desiredProperty = 0
+ // TypeA.otherPropertyN = N
+ // TypeA.undesiredProperty = 33
+ // TypeB.anotherProperty = 0
+ // TypeB.desiredProperty = 1
+ SectionId typea_desired_prop_id = 0;
+ SectionId typea_undesired_prop_id = 33;
+ SectionId typeb_another_prop_id = 0;
+ SectionId typeb_desired_prop_id = 1;
+
+ // Index numeric content for other properties to force our property into the
+ // wildcard storage.
+ std::string other_property_path = "otherProperty";
+ for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) {
+ Index(integer_index.get(),
+ absl_ports::StrCat(other_property_path, std::to_string(i)),
+ /*document_id=*/0, /*section_id=*/i, /*keys=*/{i});
+ }
+
+ // Index numeric content for TypeA.desiredProperty
+ std::string desired_property = "desiredProperty";
+ Index(integer_index.get(), desired_property, /*document_id=*/0,
+ typea_desired_prop_id, /*keys=*/{1});
+ Index(integer_index.get(), desired_property, /*document_id=*/1,
+ typea_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/2,
+ typea_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/3,
+ typea_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/4,
+ typea_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/5,
+ typea_desired_prop_id, /*keys=*/{2});
+
+ // Index the same numeric content for TypeA.undesiredProperty
+ std::string undesired_property = "undesiredProperty";
+ Index(integer_index.get(), undesired_property, /*document_id=*/6,
+ typea_undesired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), undesired_property, /*document_id=*/7,
+ typea_undesired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), undesired_property, /*document_id=*/8,
+ typea_undesired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), undesired_property, /*document_id=*/9,
+ typea_undesired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), undesired_property, /*document_id=*/10,
+ typea_undesired_prop_id, /*keys=*/{2});
+
+ // Index the same numeric content for TypeB.anotherProperty
+ std::string another_property = "anotherProperty";
+ Index(integer_index.get(), another_property, /*document_id=*/11,
+ typeb_another_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), another_property, /*document_id=*/12,
+ typeb_another_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), another_property, /*document_id=*/13,
+ typeb_another_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), another_property, /*document_id=*/14,
+ typeb_another_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), another_property, /*document_id=*/15,
+ typeb_another_prop_id, /*keys=*/{2});
+
+ // Finally, index the same numeric content for TypeB.desiredProperty
+ Index(integer_index.get(), desired_property, /*document_id=*/16,
+ typeb_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/17,
+ typeb_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/18,
+ typeb_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/19,
+ typeb_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/20,
+ typeb_desired_prop_id, /*keys=*/{2});
+
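+ // The expected count differs by implementation: the persistent IntegerIndex
+ // caps individual property storages at kMaxPropertyStorages (32) and places
+ // the remaining three properties into the single shared wildcard storage
+ // (32 + 1 = 33), while the dummy index keeps one in-memory map per property
+ // path (32 + 3 = 35).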
+ if (this->template is_integer_index<TypeParam>()) {
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(33));
+ } else {
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(35));
+ }
+
+ // Only the hits for 'desired_prop_id' should be returned.
+ std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id};
+ std::vector<SectionId> expected_sections_typeb = {typeb_desired_prop_id};
+ EXPECT_THAT(
+ this->Query(integer_index.get(), desired_property,
+ /*key_lower=*/2, /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea))));
+
+ EXPECT_THAT(
+ this->Query(integer_index.get(), desired_property,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/16, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea))));
+}
+
+TYPED_TEST(NumericIndexIntegerTest, EmptyResult) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ kDefaultSectionId, /*keys=*/{1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{3});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ kDefaultSectionId, /*keys=*/{2});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3,
+ kDefaultSectionId, /*keys=*/{0});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/4,
+ kDefaultSectionId, /*keys=*/{4});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
+ kDefaultSectionId, /*keys=*/{2});
+
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/10, /*key_upper=*/10),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/100, /*key_upper=*/200),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TYPED_TEST(NumericIndexIntegerTest,
+ NonExistingPropertyPathShouldReturnEmptyResult) {
+ constexpr std::string_view kAnotherPropertyPath = "another_property";
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ kDefaultSectionId, /*keys=*/{1});
+
+ EXPECT_THAT(this->Query(integer_index.get(), kAnotherPropertyPath,
+ /*key_lower=*/100, /*key_upper=*/200),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TYPED_TEST(NumericIndexIntegerTest,
+ MultipleKeysShouldMergeAndDedupeDocHitInfo) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ // Construct several documents with multiple keys under the same section.
+ // Range query [1, 3] will find hits with the same (DocumentId, SectionId)
+ // multiple times. For example, (2, kDefaultSectionId) will be found twice
+ // (once for key = 1 and once for key = 3).
+ // Test if the iterator dedupes correctly.
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ kDefaultSectionId, /*keys=*/{-1000, 0});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{-100, 0, 1, 2, 3, 4, 5});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ kDefaultSectionId, /*keys=*/{3, 1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3,
+ kDefaultSectionId, /*keys=*/{4, 1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/4,
+ kDefaultSectionId, /*keys=*/{1, 6});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
+ kDefaultSectionId, /*keys=*/{2, 100});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/6,
+ kDefaultSectionId, /*keys=*/{1000, 2});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/7,
+ kDefaultSectionId, /*keys=*/{4, -1000});
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/6, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/4, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/3, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections))));
+}
+
+TYPED_TEST(NumericIndexIntegerTest, EdgeNumericValues) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ kDefaultSectionId, /*keys=*/{0});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{-100});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ kDefaultSectionId, /*keys=*/{-80});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3,
+ kDefaultSectionId, /*keys=*/{std::numeric_limits<int64_t>::max()});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/4,
+ kDefaultSectionId, /*keys=*/{std::numeric_limits<int64_t>::min()});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
+ kDefaultSectionId, /*keys=*/{200});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/6,
+ kDefaultSectionId, /*keys=*/{100});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/7,
+ kDefaultSectionId, /*keys=*/{std::numeric_limits<int64_t>::max()});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/8,
+ kDefaultSectionId, /*keys=*/{0});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/9,
+ kDefaultSectionId, /*keys=*/{std::numeric_limits<int64_t>::min()});
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+
+ // Negative key
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/-100, /*key_upper=*/-70),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections))));
+
+ // INT64_MAX key
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::max(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/7, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/3, expected_sections))));
+
+ // INT64_MIN key
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::min()),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/9, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/4, expected_sections))));
+
+ // Key = 0
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/0, /*key_upper=*/0),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/8, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+
+ // All keys from INT64_MIN to INT64_MAX
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/9, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/8, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/7, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/6, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/4, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/3, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+}
+
+TYPED_TEST(NumericIndexIntegerTest,
+ MultipleSectionsShouldMergeSectionsAndDedupeDocHitInfo) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ // Construct several documents with multiple numeric sections.
+ // Range query [1, 3] will find hits with the same DocumentId but multiple
+ // different SectionIds. For example, there will be 2 hits (1, 0), (1, 1) for
+ // DocumentId=1.
+ // Test if the iterator merges multiple sections into a single SectionIdMask
+ // correctly.
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ /*section_id=*/2, /*keys=*/{0});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ /*section_id=*/1, /*keys=*/{1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ /*section_id=*/0, /*keys=*/{-1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ /*section_id=*/2, /*keys=*/{2});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ /*section_id=*/1, /*keys=*/{1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ /*section_id=*/0, /*keys=*/{4});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ /*section_id=*/5, /*keys=*/{3});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ /*section_id=*/4, /*keys=*/{2});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ /*section_id=*/3, /*keys=*/{5});
+
+ EXPECT_THAT(
+ this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/1,
+ /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/2, std::vector<SectionId>{4, 5}),
+ EqualsDocHitInfo(/*document_id=*/1, std::vector<SectionId>{1, 2}),
+ EqualsDocHitInfo(/*document_id=*/0, std::vector<SectionId>{1}))));
+}
+
+TYPED_TEST(NumericIndexIntegerTest, NonRelevantPropertyShouldNotBeIncluded) {
+ constexpr std::string_view kNonRelevantProperty = "non_relevant_property";
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ kDefaultSectionId, /*keys=*/{1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{3});
+ Index(integer_index.get(), kNonRelevantProperty, /*document_id=*/2,
+ kDefaultSectionId, /*keys=*/{2});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3,
+ kDefaultSectionId, /*keys=*/{0});
+ Index(integer_index.get(), kNonRelevantProperty, /*document_id=*/4,
+ kDefaultSectionId, /*keys=*/{4});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
+ kDefaultSectionId, /*keys=*/{2});
+
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+}
+
+TYPED_TEST(NumericIndexIntegerTest,
+ RangeQueryKeyLowerGreaterThanKeyUpperShouldReturnError) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ kDefaultSectionId, /*keys=*/{1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{3});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ kDefaultSectionId, /*keys=*/{2});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3,
+ kDefaultSectionId, /*keys=*/{0});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/4,
+ kDefaultSectionId, /*keys=*/{4});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
+ kDefaultSectionId, /*keys=*/{2});
+
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/3, /*key_upper=*/1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TYPED_TEST(NumericIndexIntegerTest, Optimize) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ kDefaultSectionId, /*keys=*/{3});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3,
+ kDefaultSectionId, /*keys=*/{2});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
+ kDefaultSectionId, /*keys=*/{0});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/8,
+ kDefaultSectionId, /*keys=*/{4});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/13,
+ kDefaultSectionId, /*keys=*/{2});
+
+ // Delete doc id = 3, 5, compress and keep the rest.
+ std::vector<DocumentId> document_id_old_to_new(14, kInvalidDocumentId);
+ document_id_old_to_new[1] = 0;
+ document_id_old_to_new[2] = 1;
+ document_id_old_to_new[8] = 2;
+ document_id_old_to_new[13] = 3;
+
+ DocumentId new_last_added_document_id = 3;
+ EXPECT_THAT(integer_index->Optimize(document_id_old_to_new,
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(integer_index->last_added_document_id(),
+ Eq(new_last_added_document_id));
+
+ // Verify index and query API still work normally after Optimize().
+ std::vector<SectionId> expected_sections = {kDefaultSectionId};
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/1, /*key_upper=*/1),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections))));
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/3, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections))));
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/0, /*key_upper=*/0),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/4, /*key_upper=*/4),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections))));
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/2, /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/3, expected_sections))));
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
+ kDefaultSectionId, /*keys=*/{123});
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/123, /*key_upper=*/123),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections))));
+}
+
+TYPED_TEST(NumericIndexIntegerTest, OptimizeMultiplePropertyPaths) {
+ constexpr std::string_view kPropertyPath1 = "prop1";
+ constexpr SectionId kSectionId1 = 0;
+ constexpr std::string_view kPropertyPath2 = "prop2";
+ constexpr SectionId kSectionId2 = 1;
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ // Doc id = 1: insert 2 data for "prop1", "prop2"
+ Index(integer_index.get(), kPropertyPath2, /*document_id=*/1, kSectionId2,
+ /*keys=*/{1});
+ Index(integer_index.get(), kPropertyPath1, /*document_id=*/1, kSectionId1,
+ /*keys=*/{2});
+
+ // Doc id = 2: insert 1 data for "prop1".
+ Index(integer_index.get(), kPropertyPath1, /*document_id=*/2, kSectionId1,
+ /*keys=*/{3});
+
+ // Doc id = 3: insert 2 data for "prop2"
+ Index(integer_index.get(), kPropertyPath2, /*document_id=*/3, kSectionId2,
+ /*keys=*/{4});
+
+ // Doc id = 5: insert 3 data for "prop1", "prop2"
+ Index(integer_index.get(), kPropertyPath2, /*document_id=*/5, kSectionId2,
+ /*keys=*/{1});
+ Index(integer_index.get(), kPropertyPath1, /*document_id=*/5, kSectionId1,
+ /*keys=*/{2});
+
+ // Doc id = 8: insert 1 data for "prop2".
+ Index(integer_index.get(), kPropertyPath2, /*document_id=*/8, kSectionId2,
+ /*keys=*/{3});
+
+ // Doc id = 13: insert 1 data for "prop1".
+ Index(integer_index.get(), kPropertyPath1, /*document_id=*/13, kSectionId1,
+ /*keys=*/{4});
+
+ // Delete doc id = 3, 5, compress and keep the rest.
+ std::vector<DocumentId> document_id_old_to_new(14, kInvalidDocumentId);
+ document_id_old_to_new[1] = 0;
+ document_id_old_to_new[2] = 1;
+ document_id_old_to_new[8] = 2;
+ document_id_old_to_new[13] = 3;
+
+ DocumentId new_last_added_document_id = 3;
+ EXPECT_THAT(integer_index->Optimize(document_id_old_to_new,
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(integer_index->last_added_document_id(),
+ Eq(new_last_added_document_id));
+
+ // Verify index and query API still work normally after Optimize().
+ // Key = 1
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/1,
+ /*key_upper=*/1),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/1,
+ /*key_upper=*/1),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/0, std::vector<SectionId>{kSectionId2}))));
+
+ // key = 2
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/2,
+ /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/0, std::vector<SectionId>{kSectionId1}))));
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/2,
+ /*key_upper=*/2),
+ IsOkAndHolds(IsEmpty()));
+
+ // key = 3
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/3,
+ /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/1, std::vector<SectionId>{kSectionId1}))));
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/3,
+ /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/2, std::vector<SectionId>{kSectionId2}))));
+
+ // key = 4
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath1, /*key_lower=*/4,
+ /*key_upper=*/4),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/3, std::vector<SectionId>{kSectionId1}))));
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2, /*key_lower=*/4,
+ /*key_upper=*/4),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TYPED_TEST(NumericIndexIntegerTest, OptimizeShouldDiscardEmptyPropertyStorage) {
+ constexpr std::string_view kPropertyPath1 = "prop1";
+ constexpr SectionId kSectionId1 = 0;
+ constexpr std::string_view kPropertyPath2 = "prop2";
+ constexpr SectionId kSectionId2 = 1;
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ // Doc id = 1: insert 2 data for "prop1", "prop2"
+ Index(integer_index.get(), kPropertyPath2, /*document_id=*/1, kSectionId2,
+ /*keys=*/{1});
+ Index(integer_index.get(), kPropertyPath1, /*document_id=*/1, kSectionId1,
+ /*keys=*/{2});
+
+ // Doc id = 2: insert 1 data for "prop1".
+ Index(integer_index.get(), kPropertyPath1, /*document_id=*/2, kSectionId1,
+ /*keys=*/{3});
+
+ // Doc id = 3: insert 2 data for "prop2"
+ Index(integer_index.get(), kPropertyPath2, /*document_id=*/3, kSectionId2,
+ /*keys=*/{4});
+
+ // Delete doc id = 1, 3, compress and keep the rest.
+ std::vector<DocumentId> document_id_old_to_new(4, kInvalidDocumentId);
+ document_id_old_to_new[2] = 0;
+
+ DocumentId new_last_added_document_id = 0;
+ EXPECT_THAT(integer_index->Optimize(document_id_old_to_new,
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(integer_index->last_added_document_id(),
+ Eq(new_last_added_document_id));
+
+ // All data in "prop2" as well as the underlying storage should be deleted,
+ // so querying "prop2" should return an empty result.
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(IsEmpty()));
+ if (std::is_same_v<IntegerIndex, TypeParam>) {
+ std::string prop2_storage_working_path =
+ absl_ports::StrCat(this->working_path_, "/", kPropertyPath2);
+ EXPECT_THAT(
+ this->filesystem_.DirectoryExists(prop2_storage_working_path.c_str()),
+ IsFalse());
+ }
+
+ // Verify we can still index and query for "prop2".
+ Index(integer_index.get(), kPropertyPath2, /*document_id=*/100, kSectionId2,
+ /*keys=*/{123});
+ EXPECT_THAT(this->Query(integer_index.get(), kPropertyPath2,
+ /*key_lower=*/123, /*key_upper=*/123),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/100, std::vector<SectionId>{kSectionId2}))));
+}
+
+TYPED_TEST(NumericIndexIntegerTest, OptimizeOutOfRangeDocumentId) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ kDefaultSectionId, /*keys=*/{3});
+
+ // Create document_id_old_to_new with size = 2. Optimize should handle
+ // out-of-range DocumentIds properly.
+ std::vector<DocumentId> document_id_old_to_new(2, kInvalidDocumentId);
+
+ EXPECT_THAT(integer_index->Optimize(
+ document_id_old_to_new,
+ /*new_last_added_document_id=*/kInvalidDocumentId),
+ IsOk());
+ EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ // Verify all data are discarded after Optimize().
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TYPED_TEST(NumericIndexIntegerTest, OptimizeDeleteAll) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ kDefaultSectionId, /*keys=*/{3});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3,
+ kDefaultSectionId, /*keys=*/{2});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/5,
+ kDefaultSectionId, /*keys=*/{0});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/8,
+ kDefaultSectionId, /*keys=*/{4});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/13,
+ kDefaultSectionId, /*keys=*/{2});
+
+ // Delete all documents.
+ std::vector<DocumentId> document_id_old_to_new(14, kInvalidDocumentId);
+
+ EXPECT_THAT(integer_index->Optimize(
+ document_id_old_to_new,
+ /*new_last_added_document_id=*/kInvalidDocumentId),
+ IsOk());
+ EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ // Verify all data are discarded after Optimize().
+ EXPECT_THAT(this->Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TYPED_TEST(NumericIndexIntegerTest, Clear) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<NumericIndex<int64_t>> integer_index,
+ this->template CreateIntegerIndex<TypeParam>());
+
+ Index(integer_index.get(), /*property_path=*/"A", /*document_id=*/0,
+ kDefaultSectionId, /*keys=*/{1});
+ Index(integer_index.get(), /*property_path=*/"B", /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{3});
+ integer_index->set_last_added_document_id(1);
+
+ ASSERT_THAT(integer_index->last_added_document_id(), Eq(1));
+ ASSERT_THAT(
+ this->Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1,
+ /*key_upper=*/1),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/0, std::vector<SectionId>{kDefaultSectionId}))));
+ ASSERT_THAT(
+ this->Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3,
+ /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/1, std::vector<SectionId>{kDefaultSectionId}))));
+
+ // After resetting, last_added_document_id should be set to
+ // kInvalidDocumentId, and the previously added keys should be deleted.
+ ICING_ASSERT_OK(integer_index->Clear());
+ EXPECT_THAT(integer_index->last_added_document_id(), Eq(kInvalidDocumentId));
+ EXPECT_THAT(
+ this->Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/1,
+ /*key_upper=*/1),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(
+ this->Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/3,
+ /*key_upper=*/3),
+ IsOkAndHolds(IsEmpty()));
+
+ // Integer index should be able to work normally after Clear().
+ Index(integer_index.get(), /*property_path=*/"A", /*document_id=*/3,
+ kDefaultSectionId, /*keys=*/{123});
+ Index(integer_index.get(), /*property_path=*/"B", /*document_id=*/4,
+ kDefaultSectionId, /*keys=*/{456});
+ integer_index->set_last_added_document_id(4);
+
+ EXPECT_THAT(integer_index->last_added_document_id(), Eq(4));
+ EXPECT_THAT(
+ this->Query(integer_index.get(), /*property_path=*/"A", /*key_lower=*/123,
+ /*key_upper=*/123),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/3, std::vector<SectionId>{kDefaultSectionId}))));
+ EXPECT_THAT(
+ this->Query(integer_index.get(), /*property_path=*/"B", /*key_lower=*/456,
+ /*key_upper=*/456),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/4, std::vector<SectionId>{kDefaultSectionId}))));
+}
+
+struct IntegerIndexTestParam {
+ int32_t num_data_threshold_for_bucket_split;
+ bool pre_mapping_fbv;
+
+ explicit IntegerIndexTestParam(int32_t num_data_threshold_for_bucket_split_in,
+ bool pre_mapping_fbv_in)
+ : num_data_threshold_for_bucket_split(
+ num_data_threshold_for_bucket_split_in),
+ pre_mapping_fbv(pre_mapping_fbv_in) {}
+};
+
+// Tests for persistent integer index only
+class IntegerIndexTest
+ : public NumericIndexIntegerTest<IntegerIndex>,
+ public ::testing::WithParamInterface<IntegerIndexTestParam> {};
+
+TEST_P(IntegerIndexTest, InvalidWorkingPath) {
+ EXPECT_THAT(
+ IntegerIndex::Create(filesystem_, "/dev/null/integer_index_test",
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_P(IntegerIndexTest, InitializeNewFiles) {
+ {
+ ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ ICING_ASSERT_OK(integer_index->PersistToDisk());
+ }
+
+ // Metadata file should be initialized correctly for both info and crcs
+ // sections.
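+ // (Per the accessors in integer-index.h, Crcs live at
+ // kCrcsMetadataFileOffset and Info at kInfoMetadataFileOffset within this
+ // file.)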
+ const std::string metadata_file_path =
+ absl_ports::StrCat(working_path_, "/", IntegerIndex::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ // Check info section
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ IntegerIndex::kInfoMetadataFileOffset));
+ EXPECT_THAT(info.magic, Eq(Info::kMagic));
+ EXPECT_THAT(info.last_added_document_id, Eq(kInvalidDocumentId));
+ EXPECT_THAT(info.num_data_threshold_for_bucket_split,
+ Eq(GetParam().num_data_threshold_for_bucket_split));
+
+ // Check crcs section
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ IntegerIndex::kCrcsMetadataFileOffset));
+ // There are no storages initially, so storages_crc should be 0.
+ EXPECT_THAT(crcs.component_crcs.storages_crc, Eq(0));
+ EXPECT_THAT(crcs.component_crcs.info_crc,
+ Eq(Crc32(std::string_view(reinterpret_cast<const char*>(&info),
+ sizeof(Info)))
+ .Get()));
+ EXPECT_THAT(crcs.all_crc,
+ Eq(Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs.component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get()));
+}
+
+TEST_P(IntegerIndexTest,
+ InitializationShouldFailWithoutPersistToDiskOrDestruction) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ // Insert some data.
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ /*section_id=*/20, /*keys=*/{0, 100, -100});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ /*section_id=*/2, /*keys=*/{3, -1000, 500});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ /*section_id=*/15, /*keys=*/{-6, 321, 98});
+
+ // Without calling PersistToDisk, checksums will not be recomputed or synced
+ // to disk, so initializing another instance on the same files should fail.
+ EXPECT_THAT(
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_P(IntegerIndexTest, InitializationShouldSucceedWithPersistToDisk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index1,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ // Insert some data.
+ Index(integer_index1.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ /*section_id=*/20, /*keys=*/{0, 100, -100});
+ Index(integer_index1.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ /*section_id=*/2, /*keys=*/{3, -1000, 500});
+ Index(integer_index1.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ /*section_id=*/15, /*keys=*/{-6, 321, 98});
+ integer_index1->set_last_added_document_id(2);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<DocHitInfo> doc_hit_info_vec,
+ Query(integer_index1.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()));
+
+ // After calling PersistToDisk, all checksums should be recomputed and synced
+ // correctly to disk, so initializing another instance on the same files
+ // should succeed, and we should be able to get the same contents.
+ ICING_EXPECT_OK(integer_index1->PersistToDisk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index2,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+ EXPECT_THAT(integer_index2->last_added_document_id(), Eq(2));
+ EXPECT_THAT(Query(integer_index2.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(ElementsAreArray(doc_hit_info_vec.begin(),
+ doc_hit_info_vec.end())));
+}
+
+TEST_P(IntegerIndexTest, InitializationShouldSucceedAfterDestruction) {
+ std::vector<DocHitInfo> doc_hit_info_vec;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ // Insert some data.
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ /*section_id=*/20, /*keys=*/{0, 100, -100});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ /*section_id=*/2, /*keys=*/{3, -1000, 500});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ /*section_id=*/15, /*keys=*/{-6, 321, 98});
+ integer_index->set_last_added_document_id(2);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ doc_hit_info_vec,
+ Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()));
+ }
+
+ {
+    // The previous instance went out of scope and was destroyed. Although we
+    // didn't call PersistToDisk explicitly, the destructor should invoke it,
+    // so initializing another instance on the same files should succeed and
+    // we should be able to get the same contents.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+ EXPECT_THAT(integer_index->last_added_document_id(), Eq(2));
+ EXPECT_THAT(Query(integer_index.get(), kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max()),
+ IsOkAndHolds(ElementsAreArray(doc_hit_info_vec.begin(),
+ doc_hit_info_vec.end())));
+ }
+}
+
+TEST_P(IntegerIndexTest, InitializeExistingFilesWithWrongAllCrcShouldFail) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+ // Insert some data.
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ /*section_id=*/20, /*keys=*/{0, 100, -100});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ /*section_id=*/2, /*keys=*/{3, -1000, 500});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ /*section_id=*/15, /*keys=*/{-6, 321, 98});
+
+ ICING_ASSERT_OK(integer_index->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(working_path_, "/", IntegerIndex::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Crcs crcs;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &crcs, sizeof(Crcs),
+ IntegerIndex::kCrcsMetadataFileOffset));
+
+ // Manually corrupt all_crc
+ crcs.all_crc += kCorruptedValueOffset;
+  ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(),
+                                 IntegerIndex::kCrcsMetadataFileOffset,
+                                 &crcs, sizeof(Crcs)));
+ metadata_sfd.reset();
+
+ {
+ // Attempt to create the integer index with metadata containing corrupted
+ // all_crc. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
+ integer_index_or =
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv);
+ EXPECT_THAT(integer_index_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(integer_index_or.status().error_message(),
+ HasSubstr("Invalid all crc"));
+ }
+}
+
+TEST_P(IntegerIndexTest, InitializeExistingFilesWithCorruptedInfoShouldFail) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+ // Insert some data.
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ /*section_id=*/20, /*keys=*/{0, 100, -100});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ /*section_id=*/2, /*keys=*/{3, -1000, 500});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ /*section_id=*/15, /*keys=*/{-6, 321, 98});
+
+ ICING_ASSERT_OK(integer_index->PersistToDisk());
+ }
+
+ const std::string metadata_file_path =
+ absl_ports::StrCat(working_path_, "/", IntegerIndex::kFilePrefix, ".m");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_TRUE(metadata_sfd.is_valid());
+
+ Info info;
+ ASSERT_TRUE(filesystem_.PRead(metadata_sfd.get(), &info, sizeof(Info),
+ IntegerIndex::kInfoMetadataFileOffset));
+
+  // Modify info without updating its checksum. This simulates corruption of
+  // the info section.
+ info.last_added_document_id += kCorruptedValueOffset;
+ ASSERT_TRUE(filesystem_.PWrite(metadata_sfd.get(),
+ IntegerIndex::kInfoMetadataFileOffset, &info,
+ sizeof(Info)));
+ metadata_sfd.reset();
+
+ {
+ // Attempt to create the integer index with info that doesn't match its
+ // checksum and confirm that it fails.
+ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
+ integer_index_or =
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv);
+ EXPECT_THAT(integer_index_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(integer_index_or.status().error_message(),
+ HasSubstr("Invalid info crc"));
+ }
+}
+
+TEST_P(IntegerIndexTest,
+ InitializeExistingFilesWithCorruptedStoragesShouldFail) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+ // Insert some data.
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ /*section_id=*/20, /*keys=*/{0, 100, -100});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ /*section_id=*/2, /*keys=*/{3, -1000, 500});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ /*section_id=*/15, /*keys=*/{-6, 321, 98});
+
+ ICING_ASSERT_OK(integer_index->PersistToDisk());
+ }
+
+ {
+ // Corrupt integer index storage for kDefaultTestPropertyPath manually.
+ PostingListIntegerIndexSerializer posting_list_integer_index_serializer;
+ std::string storage_working_path =
+ absl_ports::StrCat(working_path_, "/", kDefaultTestPropertyPath);
+ ASSERT_TRUE(filesystem_.DirectoryExists(storage_working_path.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndexStorage> storage,
+ IntegerIndexStorage::Create(
+ filesystem_, std::move(storage_working_path),
+ IntegerIndexStorage::Options(
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv),
+ &posting_list_integer_index_serializer));
+ ICING_ASSERT_OK(storage->AddKeys(/*document_id=*/3, /*section_id=*/4,
+ /*new_keys=*/{3, 4, 5}));
+
+ ICING_ASSERT_OK(storage->PersistToDisk());
+ }
+
+ {
+ // Attempt to create the integer index with corrupted storages. This should
+ // fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
+ integer_index_or =
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv);
+ EXPECT_THAT(integer_index_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(integer_index_or.status().error_message(),
+ HasSubstr("Invalid storages crc"));
+ }
+}
+
+TEST_P(
+ IntegerIndexTest,
+ InitializeExistingFilesWithMismatchNumDataThresholdForBucketSplitShouldFail) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+ // Insert some data.
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ /*section_id=*/20, /*keys=*/{0, 100, -100});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ /*section_id=*/2, /*keys=*/{3, -1000, 500});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ /*section_id=*/15, /*keys=*/{-6, 321, 98});
+
+ ICING_ASSERT_OK(integer_index->PersistToDisk());
+ }
+
+ {
+ // Attempt to create the integer index with different
+ // num_data_threshold_for_bucket_split. This should fail.
+ libtextclassifier3::StatusOr<std::unique_ptr<IntegerIndex>>
+ integer_index_or = IntegerIndex::Create(
+ filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split + 1,
+ GetParam().pre_mapping_fbv);
+ EXPECT_THAT(integer_index_or,
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(integer_index_or.status().error_message(),
+ HasSubstr("Mismatch num_data_threshold_for_bucket_split"));
+ }
+}
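+
+// A sketch of the option check exercised above (assumed shape with
+// hypothetical names existing_info/options; the actual validation happens
+// inside IntegerIndex initialization):
+//
+//   if (existing_info.num_data_threshold_for_bucket_split !=
+//       options.num_data_threshold_for_bucket_split) {
+//     return absl_ports::FailedPreconditionError(
+//         "Mismatch num_data_threshold_for_bucket_split");
+//   }
+//
+// The threshold determines the on-disk bucket layout, so reusing existing
+// files with a different threshold is rejected rather than silently
+// reinterpreted.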
+
+TEST_P(IntegerIndexTest, WildcardStoragePersistenceQuery) {
+ // This test sets its schema assuming that max property storages == 32.
+ ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32));
+
+ PropertyConfigProto int_property_config =
+ PropertyConfigBuilder()
+ .SetName("otherProperty1")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .Build();
+ // Create a schema with two types:
+ // - TypeA has 34 properties:
+ // 'desiredProperty', 'otherProperty'*, 'undesiredProperty'
+ // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty'
+ // 1. The 32 'otherProperty's will consume all of the individual storages
+ // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned
+ // SectionId = 0 for their respective types.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(int_property_config)
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty2"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty3"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty4"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty5"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty6"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty7"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty8"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty9"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty10"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty11"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty12"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty13"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty14"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty15"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty16"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty17"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty18"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty19"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty20"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty21"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty22"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty23"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty24"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty25"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty26"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty27"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty28"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty29"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty30"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty31"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty32"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("undesiredProperty")))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeB")
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("anotherProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty")))
+ .Build();
+ ICING_ASSERT_OK(this->schema_store_->SetSchema(
+ schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Ids are assigned alphabetically, so the property ids are:
+ // TypeA.desiredProperty = 0
+ // TypeA.otherPropertyN = N
+ // TypeA.undesiredProperty = 33
+ // TypeB.anotherProperty = 0
+ // TypeB.desiredProperty = 1
+ SectionId typea_desired_prop_id = 0;
+ SectionId typea_undesired_prop_id = 33;
+ SectionId typeb_another_prop_id = 0;
+ SectionId typeb_desired_prop_id = 1;
+ std::string desired_property = "desiredProperty";
+ std::string undesired_property = "undesiredProperty";
+ std::string another_property = "anotherProperty";
+
+ // Put 11 docs of "TypeA" into the document store.
+ DocumentProto doc =
+ DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build()));
+
+ // Put 10 docs of "TypeB" into the document store.
+ doc = DocumentBuilder(doc).SetUri("uri11").SetSchema("TypeB").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri14").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri15").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri16").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri17").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri18").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri19").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri20").Build()));
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ // Index numeric content for other properties to force our property into the
+ // wildcard storage.
+ std::string other_property_path = "otherProperty";
+ for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) {
+ Index(integer_index.get(),
+ absl_ports::StrCat(other_property_path, std::to_string(i)),
+ /*document_id=*/0, /*section_id=*/i, /*keys=*/{i});
+ }
+
+ // Index numeric content for TypeA.desiredProperty
+ Index(integer_index.get(), desired_property, /*document_id=*/0,
+ typea_desired_prop_id, /*keys=*/{1});
+ Index(integer_index.get(), desired_property, /*document_id=*/1,
+ typea_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/2,
+ typea_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/3,
+ typea_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/4,
+ typea_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/5,
+ typea_desired_prop_id, /*keys=*/{2});
+
+ // Index the same numeric content for TypeA.undesiredProperty
+ Index(integer_index.get(), undesired_property, /*document_id=*/6,
+ typea_undesired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), undesired_property, /*document_id=*/7,
+ typea_undesired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), undesired_property, /*document_id=*/8,
+ typea_undesired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), undesired_property, /*document_id=*/9,
+ typea_undesired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), undesired_property, /*document_id=*/10,
+ typea_undesired_prop_id, /*keys=*/{2});
+
+    // Index the same numeric content for TypeB.anotherProperty
+ Index(integer_index.get(), another_property, /*document_id=*/11,
+ typeb_another_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), another_property, /*document_id=*/12,
+ typeb_another_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), another_property, /*document_id=*/13,
+ typeb_another_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), another_property, /*document_id=*/14,
+ typeb_another_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), another_property, /*document_id=*/15,
+ typeb_another_prop_id, /*keys=*/{2});
+
+ // Finally, index the same numeric content for TypeB.desiredProperty
+ Index(integer_index.get(), desired_property, /*document_id=*/16,
+ typeb_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/17,
+ typeb_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/18,
+ typeb_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/19,
+ typeb_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/20,
+ typeb_desired_prop_id, /*keys=*/{2});
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(33));
+
+  // Only the hits for 'desiredProperty' should be returned.
+ std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id};
+ std::vector<SectionId> expected_sections_typeb = {typeb_desired_prop_id};
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/2, /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea))));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/16, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/5, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea))));
+}
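+
+// The wildcard behavior verified above follows from a capped per-property
+// layout: the first kMaxPropertyStorages (32) distinct property paths each
+// get an individual storage, and every later path shares a single "wildcard"
+// storage. A rough sketch of the assignment (assumed shape with hypothetical
+// member names, not the actual implementation):
+//
+//   IntegerIndexStorage* storage;
+//   auto iter = property_to_storage_map.find(property_path);
+//   if (iter != property_to_storage_map.end()) {
+//     storage = iter->second.get();
+//   } else if (property_to_storage_map.size() < kMaxPropertyStorages) {
+//     storage = CreateNewStorage(property_path);  // individual storage
+//   } else {
+//     storage = wildcard_storage.get();  // shared fallback
+//   }
+//
+// This is why num_property_indices() is 33 above: 32 individual storages
+// plus the single wildcard storage.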
+
+TEST_P(IntegerIndexTest,
+ IntegerIndexShouldWorkAfterOptimizeAndReinitialization) {
+ constexpr std::string_view kPropertyPath1 = "prop1";
+ constexpr SectionId kSectionId1 = 0;
+ constexpr std::string_view kPropertyPath2 = "prop2";
+ constexpr SectionId kSectionId2 = 1;
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ // Doc id = 1: insert 2 data for "prop1", "prop2"
+ Index(integer_index.get(), kPropertyPath2, /*document_id=*/1, kSectionId2,
+ /*keys=*/{1});
+ Index(integer_index.get(), kPropertyPath1, /*document_id=*/1, kSectionId1,
+ /*keys=*/{2});
+
+ // Doc id = 2: insert 1 data for "prop1".
+ Index(integer_index.get(), kPropertyPath1, /*document_id=*/2, kSectionId1,
+ /*keys=*/{3});
+
+    // Doc id = 3: insert 1 data for "prop2".
+ Index(integer_index.get(), kPropertyPath2, /*document_id=*/3, kSectionId2,
+ /*keys=*/{4});
+
+    // Doc id = 5: insert 2 data for "prop1", "prop2".
+ Index(integer_index.get(), kPropertyPath2, /*document_id=*/5, kSectionId2,
+ /*keys=*/{1});
+ Index(integer_index.get(), kPropertyPath1, /*document_id=*/5, kSectionId1,
+ /*keys=*/{2});
+
+ // Doc id = 8: insert 1 data for "prop2".
+ Index(integer_index.get(), kPropertyPath2, /*document_id=*/8, kSectionId2,
+ /*keys=*/{3});
+
+ // Doc id = 13: insert 1 data for "prop1".
+ Index(integer_index.get(), kPropertyPath1, /*document_id=*/13, kSectionId1,
+ /*keys=*/{4});
+
+ // Delete doc id = 3, 5, compress and keep the rest.
+ std::vector<DocumentId> document_id_old_to_new(14, kInvalidDocumentId);
+ document_id_old_to_new[1] = 0;
+ document_id_old_to_new[2] = 1;
+ document_id_old_to_new[8] = 2;
+ document_id_old_to_new[13] = 3;
+
+ DocumentId new_last_added_document_id = 3;
+ EXPECT_THAT(integer_index->Optimize(document_id_old_to_new,
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(integer_index->last_added_document_id(),
+ Eq(new_last_added_document_id));
+ }
+
+ {
+ // Reinitialize IntegerIndex and verify index and query API still work
+ // normally.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ // Key = 1
+ EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/1,
+ /*key_upper=*/1),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/1,
+ /*key_upper=*/1),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/0, std::vector<SectionId>{kSectionId2}))));
+
+ // key = 2
+ EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/2,
+ /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/0, std::vector<SectionId>{kSectionId1}))));
+ EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/2,
+ /*key_upper=*/2),
+ IsOkAndHolds(IsEmpty()));
+
+ // key = 3
+ EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/3,
+ /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/1, std::vector<SectionId>{kSectionId1}))));
+ EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/3,
+ /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/2, std::vector<SectionId>{kSectionId2}))));
+
+ // key = 4
+ EXPECT_THAT(Query(integer_index.get(), kPropertyPath1, /*key_lower=*/4,
+ /*key_upper=*/4),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/3, std::vector<SectionId>{kSectionId1}))));
+ EXPECT_THAT(Query(integer_index.get(), kPropertyPath2, /*key_lower=*/4,
+ /*key_upper=*/4),
+ IsOkAndHolds(IsEmpty()));
+
+ // Index new data.
+ Index(integer_index.get(), kPropertyPath2, /*document_id=*/100, kSectionId2,
+ /*keys=*/{123});
+ Index(integer_index.get(), kPropertyPath1, /*document_id=*/100, kSectionId1,
+ /*keys=*/{456});
+ EXPECT_THAT(
+ Query(integer_index.get(), kPropertyPath2, /*key_lower=*/123,
+ /*key_upper=*/456),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/100, std::vector<SectionId>{kSectionId2}))));
+ EXPECT_THAT(
+ Query(integer_index.get(), kPropertyPath1, /*key_lower=*/123,
+ /*key_upper=*/456),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/100, std::vector<SectionId>{kSectionId1}))));
+ }
+}
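+
+// The document_id_old_to_new vector is the whole Optimize contract: index i
+// holds the new DocumentId for old DocumentId i, or kInvalidDocumentId if
+// that document was deleted. A minimal sketch of translating one hit
+// (TranslateDocId is a hypothetical helper; the real Optimize rewrites whole
+// posting lists):
+//
+//   libtextclassifier3::StatusOr<DocumentId> TranslateDocId(
+//       const std::vector<DocumentId>& document_id_old_to_new,
+//       DocumentId old_document_id) {
+//     if (static_cast<size_t>(old_document_id) >=
+//             document_id_old_to_new.size() ||
+//         document_id_old_to_new[old_document_id] == kInvalidDocumentId) {
+//       return absl_ports::NotFoundError("Hit belongs to a deleted doc");
+//     }
+//     return document_id_old_to_new[old_document_id];
+//   }
+//
+// Hits whose translation fails are dropped, which is why doc ids 3 and 5
+// disappear above while the remaining hits shift to their new ids.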
+
+TEST_P(IntegerIndexTest, WildcardStorageWorksAfterOptimize) {
+ // This test sets its schema assuming that max property storages == 32.
+ ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32));
+
+ PropertyConfigProto int_property_config =
+ PropertyConfigBuilder()
+ .SetName("otherProperty1")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .Build();
+ // Create a schema with two types:
+ // - TypeA has 34 properties:
+ // 'desiredProperty', 'otherProperty'*, 'undesiredProperty'
+ // - TypeB has 2 properties: 'anotherProperty', 'desiredProperty'
+ // 1. The 32 'otherProperty's will consume all of the individual storages
+ // 2. TypeA.desiredProperty and TypeB.anotherProperty will both be assigned
+ // SectionId = 0 for their respective types.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(int_property_config)
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty2"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty3"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty4"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty5"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty6"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty7"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty8"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty9"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty10"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty11"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty12"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty13"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty14"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty15"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty16"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty17"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty18"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty19"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty20"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty21"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty22"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty23"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty24"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty25"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty26"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty27"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty28"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty29"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty30"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty31"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty32"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("undesiredProperty")))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeB")
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("anotherProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty")))
+ .Build();
+ ICING_ASSERT_OK(this->schema_store_->SetSchema(
+ schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Ids are assigned alphabetically, so the property ids are:
+ // TypeA.desiredProperty = 0
+ // TypeA.otherPropertyN = N
+ // TypeA.undesiredProperty = 33
+ // TypeB.anotherProperty = 0
+ // TypeB.desiredProperty = 1
+ SectionId typea_desired_prop_id = 0;
+ SectionId typea_undesired_prop_id = 33;
+ SectionId typeb_another_prop_id = 0;
+ SectionId typeb_desired_prop_id = 1;
+ std::string desired_property = "desiredProperty";
+ std::string undesired_property = "undesiredProperty";
+ std::string another_property = "anotherProperty";
+
+  // Only the hits for 'desiredProperty' should be returned.
+ std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id};
+ std::vector<SectionId> expected_sections_typeb = {typeb_desired_prop_id};
+
+ // Put 11 docs of "TypeA" into the document store.
+ DocumentProto doc =
+ DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build()));
+
+ // Put 10 docs of "TypeB" into the document store.
+ doc = DocumentBuilder(doc).SetUri("uri11").SetSchema("TypeB").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri14").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri15").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri16").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri17").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri18").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri19").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri20").Build()));
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ // Index numeric content for other properties to force our property into the
+ // wildcard storage.
+ std::string other_property_path = "otherProperty";
+ for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) {
+ Index(integer_index.get(),
+ absl_ports::StrCat(other_property_path, std::to_string(i)),
+ /*document_id=*/0, /*section_id=*/i, /*keys=*/{i});
+ }
+
+ // Index numeric content for TypeA.desiredProperty
+ Index(integer_index.get(), desired_property, /*document_id=*/0,
+ typea_desired_prop_id, /*keys=*/{1});
+ Index(integer_index.get(), desired_property, /*document_id=*/1,
+ typea_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/2,
+ typea_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/3,
+ typea_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/4,
+ typea_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/5,
+ typea_desired_prop_id, /*keys=*/{2});
+
+ // Index the same numeric content for TypeA.undesiredProperty
+ Index(integer_index.get(), undesired_property, /*document_id=*/6,
+ typea_undesired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), undesired_property, /*document_id=*/7,
+ typea_undesired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), undesired_property, /*document_id=*/8,
+ typea_undesired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), undesired_property, /*document_id=*/9,
+ typea_undesired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), undesired_property, /*document_id=*/10,
+ typea_undesired_prop_id, /*keys=*/{2});
+
+    // Index the same numeric content for TypeB.anotherProperty
+ Index(integer_index.get(), another_property, /*document_id=*/11,
+ typeb_another_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), another_property, /*document_id=*/12,
+ typeb_another_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), another_property, /*document_id=*/13,
+ typeb_another_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), another_property, /*document_id=*/14,
+ typeb_another_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), another_property, /*document_id=*/15,
+ typeb_another_prop_id, /*keys=*/{2});
+
+ // Finally, index the same numeric content for TypeB.desiredProperty
+ Index(integer_index.get(), desired_property, /*document_id=*/16,
+ typeb_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/17,
+ typeb_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/18,
+ typeb_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/19,
+ typeb_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/20,
+ typeb_desired_prop_id, /*keys=*/{2});
+
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/3,
+ clock_.GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/5,
+ clock_.GetSystemTimeMilliseconds()));
+ // Delete doc id = 3, 5, compress and keep the rest.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<DocumentId> document_id_old_to_new,
+ CompactDocStore());
+
+ DocumentId new_last_added_document_id = 18;
+ EXPECT_THAT(integer_index->Optimize(document_id_old_to_new,
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(integer_index->last_added_document_id(),
+ Eq(new_last_added_document_id));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/2, /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea))));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/16 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea))));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(33));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/2, /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea))));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/20 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/17 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/16 - 2, expected_sections_typeb),
+ EqualsDocHitInfo(/*document_id=*/2, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/0, expected_sections_typea))));
+}
+
+// This test covers the situation where Optimize causes us to throw out some of
+// the individual index storages (because they don't have any hits anymore).
+// In this case, any properties that added content to the wildcard storage (even
+// if all of their content was also deleted) should still be placed in the
+// wildcard storage.
+TEST_P(IntegerIndexTest, WildcardStorageAvailableIndicesAfterOptimize) {
+ // This test sets its schema assuming that max property storages == 32.
+ ASSERT_THAT(IntegerIndex::kMaxPropertyStorages, Eq(32));
+
+ PropertyConfigProto int_property_config =
+ PropertyConfigBuilder()
+ .SetName("otherProperty1")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .Build();
+  // Create a schema with a single type:
+  // - TypeA has 34 properties:
+  //   'desiredProperty', 'otherProperty1' ... 'otherProperty32',
+  //   'undesiredProperty'
+  // The 32 'otherProperty's will consume all of the individual storages, so
+  // 'desiredProperty' and 'undesiredProperty' will land in the wildcard
+  // storage.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(int_property_config)
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty2"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty3"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty4"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty5"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty6"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty7"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty8"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty9"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty10"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty11"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty12"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty13"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty14"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty15"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty16"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty17"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty18"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty19"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty20"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty21"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty22"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty23"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty24"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty25"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty26"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty27"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty28"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty29"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty30"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty31"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("otherProperty32"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("desiredProperty"))
+ .AddProperty(PropertyConfigBuilder(int_property_config)
+ .SetName("undesiredProperty")))
+ .Build();
+ ICING_ASSERT_OK(this->schema_store_->SetSchema(
+ schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+  // Ids are assigned alphabetically, so the property ids are:
+  // TypeA.desiredProperty = 0
+  // TypeA.otherPropertyN = N
+  // TypeA.undesiredProperty = 33
+ SectionId typea_desired_prop_id = 0;
+ SectionId typea_undesired_prop_id = 33;
+ SectionId typea_other1_prop_id = 1;
+ std::string desired_property = "desiredProperty";
+ std::string undesired_property = "undesiredProperty";
+ std::string another_property = "anotherProperty";
+ std::string other_property_1 = "otherProperty1";
+
+  // Only the hits for 'desiredProperty' should be returned.
+ std::vector<SectionId> expected_sections_typea = {typea_desired_prop_id};
+
+ // Put 11 docs of "TypeA" into the document store.
+ DocumentProto doc =
+ DocumentBuilder().SetKey("ns1", "uri0").SetSchema("TypeA").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri8").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri9").Build()));
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri10").Build()));
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ // Index numeric content for other properties to force our property into the
+ // wildcard storage.
+ std::string other_property_path = "otherProperty";
+ for (int i = 1; i <= IntegerIndex::kMaxPropertyStorages; ++i) {
+ Index(integer_index.get(),
+ absl_ports::StrCat(other_property_path, std::to_string(i)),
+ /*document_id=*/0, /*section_id=*/i, /*keys=*/{i});
+ }
+
+ // Index numeric content for TypeA.desiredProperty
+ Index(integer_index.get(), desired_property, /*document_id=*/0,
+ typea_desired_prop_id, /*keys=*/{1});
+ Index(integer_index.get(), desired_property, /*document_id=*/1,
+ typea_desired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), desired_property, /*document_id=*/2,
+ typea_desired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), desired_property, /*document_id=*/3,
+ typea_desired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), desired_property, /*document_id=*/4,
+ typea_desired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), desired_property, /*document_id=*/5,
+ typea_desired_prop_id, /*keys=*/{2});
+
+ // Index the same numeric content for TypeA.undesiredProperty
+ Index(integer_index.get(), undesired_property, /*document_id=*/6,
+ typea_undesired_prop_id, /*keys=*/{3});
+ Index(integer_index.get(), undesired_property, /*document_id=*/7,
+ typea_undesired_prop_id, /*keys=*/{2});
+ Index(integer_index.get(), undesired_property, /*document_id=*/8,
+ typea_undesired_prop_id, /*keys=*/{0});
+ Index(integer_index.get(), undesired_property, /*document_id=*/9,
+ typea_undesired_prop_id, /*keys=*/{4});
+ Index(integer_index.get(), undesired_property, /*document_id=*/10,
+ typea_undesired_prop_id, /*keys=*/{2});
+
+ // Delete all the docs that had hits in otherProperty* and
+ // undesiredProperty.
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/0,
+ clock_.GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/6,
+ clock_.GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/7,
+ clock_.GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/8,
+ clock_.GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/9,
+ clock_.GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK(doc_store_->Delete(/*document_id=*/10,
+ clock_.GetSystemTimeMilliseconds()));
+ // Delete doc id = 0, 6, 7, 8, 9, 10. Compress and keep the rest.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<DocumentId> document_id_old_to_new,
+ CompactDocStore());
+
+ DocumentId new_last_added_document_id = 5 - 1;
+ EXPECT_THAT(integer_index->Optimize(document_id_old_to_new,
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(integer_index->last_added_document_id(),
+ Eq(new_last_added_document_id));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/2, /*key_upper=*/2),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/5 - 1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/2 - 1, expected_sections_typea))));
+
+ EXPECT_THAT(
+ Query(integer_index.get(), desired_property,
+ /*key_lower=*/1, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(
+ EqualsDocHitInfo(/*document_id=*/5 - 1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/2 - 1, expected_sections_typea),
+ EqualsDocHitInfo(/*document_id=*/1 - 1, expected_sections_typea))));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(1));
+
+ // Add a new doc (docid==5) and a hit for desiredProperty. This should still
+ // be placed into the wildcard integer storage.
+ doc = DocumentBuilder().SetKey("ns1", "uri11").SetSchema("TypeA").Build();
+ ICING_ASSERT_OK(this->doc_store_->Put(doc));
+ Index(integer_index.get(), desired_property, /*document_id=*/5,
+ typea_desired_prop_id, /*keys=*/{12});
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(1));
+
+ EXPECT_THAT(Query(integer_index.get(), desired_property,
+ /*key_lower=*/12, /*key_upper=*/12),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/5, expected_sections_typea))));
+
+ // Add a new doc (docid==6) and a hit for undesiredProperty. This should still
+ // be placed into the wildcard integer storage.
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri12").Build()));
+ Index(integer_index.get(), undesired_property, /*document_id=*/6,
+ typea_undesired_prop_id, /*keys=*/{3});
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(1));
+
+ expected_sections_typea = {typea_undesired_prop_id};
+ EXPECT_THAT(Query(integer_index.get(), undesired_property,
+ /*key_lower=*/3, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/6, expected_sections_typea))));
+
+ // Add a new doc (docid==7) and a hit for otherProperty1. This should be given
+ // its own individual storage.
+ ICING_ASSERT_OK(
+ this->doc_store_->Put(DocumentBuilder(doc).SetUri("uri13").Build()));
+ Index(integer_index.get(), other_property_1, /*document_id=*/7,
+ typea_other1_prop_id, /*keys=*/{3});
+ EXPECT_THAT(integer_index->num_property_indices(), Eq(2));
+
+ expected_sections_typea = {typea_other1_prop_id};
+ EXPECT_THAT(Query(integer_index.get(), other_property_1,
+ /*key_lower=*/3, /*key_upper=*/3),
+ IsOkAndHolds(ElementsAre(EqualsDocHitInfo(
+ /*document_id=*/7, expected_sections_typea))));
+}
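+
+// Note on the behavior above: even after Optimize throws out every
+// individual storage, a new hit for 'undesiredProperty' stays in the
+// wildcard storage while 'otherProperty1' gets a fresh individual storage.
+// This implies the index persists the set of property paths that were ever
+// routed to the wildcard storage, roughly (assumed shape):
+//
+//   if (wildcard_properties_set.contains(property_path)) {
+//     return wildcard_storage.get();  // once wildcard, always wildcard
+//   }
+//
+// Without that memory, a reopened index could route old wildcard hits and
+// new hits for the same property to two different storages.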
+
+TEST_P(IntegerIndexTest, IteratorCallStats) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ kDefaultSectionId, /*keys=*/{1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{3});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ kDefaultSectionId, /*keys=*/{2});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3,
+ kDefaultSectionId, /*keys=*/{0});
+
+  // GetIterator for range [INT64_MIN, INT64_MAX] and Advance through all
+  // hits. These 4 keys are in a single bucket, so there will be only 1
+  // posting list (and 1 block).
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> iter,
+ integer_index->GetIterator(
+ kDefaultTestPropertyPath,
+ /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *doc_store_,
+ *schema_store_, clock_.GetSystemTimeMilliseconds()));
+
+  // 1 block should be read even without calling Advance(), since we read the
+  // posting list and put the bucket into the priority queue in the
+  // constructor.
+ EXPECT_THAT(iter->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/1,
+ /*num_leaf_advance_calls_no_index=*/0,
+ /*num_blocks_inspected=*/1));
+
+ // 1st Advance().
+ ICING_ASSERT_OK(iter->Advance());
+ EXPECT_THAT(iter->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/2,
+ /*num_leaf_advance_calls_no_index=*/0,
+ /*num_blocks_inspected=*/1));
+
+ // 2nd Advance().
+ ICING_ASSERT_OK(iter->Advance());
+ EXPECT_THAT(iter->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/3,
+ /*num_leaf_advance_calls_no_index=*/0,
+ /*num_blocks_inspected=*/1));
+
+ // 3rd Advance().
+ ICING_ASSERT_OK(iter->Advance());
+ EXPECT_THAT(iter->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/4,
+ /*num_leaf_advance_calls_no_index=*/0,
+ /*num_blocks_inspected=*/1));
+
+ // 4th Advance().
+ ICING_ASSERT_OK(iter->Advance());
+ EXPECT_THAT(iter->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/4,
+ /*num_leaf_advance_calls_no_index=*/0,
+ /*num_blocks_inspected=*/1));
+
+ // 5th Advance().
+ ASSERT_THAT(iter->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(iter->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/4,
+ /*num_leaf_advance_calls_no_index=*/0,
+ /*num_blocks_inspected=*/1));
+}
+
+TEST_P(IntegerIndexTest, IteratorCallStatsNonExistingProperty) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<IntegerIndex> integer_index,
+ IntegerIndex::Create(filesystem_, working_path_,
+ GetParam().num_data_threshold_for_bucket_split,
+ GetParam().pre_mapping_fbv));
+
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/0,
+ kDefaultSectionId, /*keys=*/{1});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/1,
+ kDefaultSectionId, /*keys=*/{3});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/2,
+ kDefaultSectionId, /*keys=*/{2});
+ Index(integer_index.get(), kDefaultTestPropertyPath, /*document_id=*/3,
+ kDefaultSectionId, /*keys=*/{0});
+
+ // GetIterator for property "otherProperty1".
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> iter,
+ integer_index->GetIterator(
+ "otherProperty1", /*key_lower=*/std::numeric_limits<int64_t>::min(),
+ /*key_upper=*/std::numeric_limits<int64_t>::max(), *doc_store_,
+ *schema_store_, clock_.GetSystemTimeMilliseconds()));
+
+ EXPECT_THAT(iter->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0,
+ /*num_blocks_inspected=*/0));
+
+ // 1st Advance().
+ ASSERT_THAT(iter->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ EXPECT_THAT(iter->GetCallStats(),
+ EqualsDocHitInfoIteratorCallStats(
+ /*num_leaf_advance_calls_lite_index=*/0,
+ /*num_leaf_advance_calls_main_index=*/0,
+ /*num_leaf_advance_calls_integer_index=*/0,
+ /*num_leaf_advance_calls_no_index=*/0,
+ /*num_blocks_inspected=*/0));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ IntegerIndexTest, IntegerIndexTest,
+ testing::Values(
+ IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/341,
+ /*pre_mapping_fbv_in=*/false),
+ IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/341,
+ /*pre_mapping_fbv_in=*/true),
+
+ IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/16384,
+ /*pre_mapping_fbv_in=*/false),
+ IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/32768,
+ /*pre_mapping_fbv_in=*/false),
+ IntegerIndexTestParam(/*num_data_threshold_for_bucket_split_in=*/65536,
+ /*pre_mapping_fbv_in=*/false)));
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/numeric-index.h b/icing/index/numeric/numeric-index.h
new file mode 100644
index 0000000..d094d3d
--- /dev/null
+++ b/icing/index/numeric/numeric-index.h
@@ -0,0 +1,204 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_NUMERIC_NUMERIC_INDEX_H_
+#define ICING_INDEX_NUMERIC_NUMERIC_INDEX_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+template <typename T>
+class NumericIndex : public PersistentStorage {
+ public:
+ using value_type = T;
+
+  // Editor class for batch adding new records into the numeric index for a
+  // given property, DocumentId and SectionId. The caller should use BufferKey
+  // to buffer a key (calling it several times for multiple keys) and finally
+  // call IndexAllBufferedKeys once to batch add all buffered keys (with
+  // DocumentId + SectionId info, i.e. BasicHit) into the numeric index.
+ //
+  // For example, suppose there are values = [5, 1, 10, -100] in DocumentId =
+  // 5, SectionId = 1 (property "timestamp"). Then the client should call
+  // BufferKey(5), BufferKey(1), BufferKey(10), BufferKey(-100) first, and
+  // finally call IndexAllBufferedKeys once to batch add these records into
+  // the numeric index.
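+  //
+  // A minimal sketch of that flow (illustrative only; `editor` would come
+  // from a concrete implementation's Edit()):
+  //   ICING_RETURN_IF_ERROR(editor->BufferKey(5));
+  //   ICING_RETURN_IF_ERROR(editor->BufferKey(1));
+  //   ICING_RETURN_IF_ERROR(editor->BufferKey(10));
+  //   ICING_RETURN_IF_ERROR(editor->BufferKey(-100));
+  //   ICING_RETURN_IF_ERROR(std::move(*editor).IndexAllBufferedKeys());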
+ class Editor {
+ public:
+ explicit Editor(std::string_view property_path, DocumentId document_id,
+ SectionId section_id)
+ : property_path_(property_path),
+ document_id_(document_id),
+ section_id_(section_id) {}
+
+ virtual ~Editor() = default;
+
+ // Buffers a new key.
+ //
+ // Returns:
+ // - OK on success
+ // - Any other errors, depending on the actual implementation
+ virtual libtextclassifier3::Status BufferKey(T key) = 0;
+
+ // Adds all buffered keys into numeric index.
+ //
+ // Returns:
+ // - OK on success
+ // - Any other errors, depending on the actual implementation
+ virtual libtextclassifier3::Status IndexAllBufferedKeys() && = 0;
+
+ protected:
+ std::string property_path_;
+ DocumentId document_id_;
+ SectionId section_id_;
+ };
+
+ // Iterator class for numeric index range query [key_lower, key_upper]
+  // (inclusive on both sides) on a given property (see GetIterator). There are
+ // some basic requirements for implementation:
+ // - Iterates through all relevant doc hits.
+  // - Merges multiple SectionIds of doc hits with the same DocumentId into a
+  //   single SectionIdMask and constructs DocHitInfo.
+ // - Returns DocHitInfo in descending DocumentId order.
+ //
+ // For example, relevant doc hits (DocumentId, SectionId) are [(2, 0), (4, 3),
+ // (2, 1), (6, 2), (4, 2)]. Advance() and GetDocHitInfo() should return
+ // DocHitInfo(6, SectionIdMask(2)), DocHitInfo(4, SectionIdMask(2, 3)) and
+ // DocHitInfo(2, SectionIdMask(0, 1)).
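+  //
+  // A typical consumption loop (sketch; a non-OK status from Advance(), e.g.
+  // RESOURCE_EXHAUSTED, ends the iteration):
+  //   while (iterator->Advance().ok()) {
+  //     DocHitInfo doc_hit_info = iterator->GetDocHitInfo();
+  //     // ... use doc_hit_info ...
+  //   }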
+ class Iterator {
+ public:
+ explicit Iterator(T key_lower, T key_upper)
+ : key_lower_(key_lower), key_upper_(key_upper) {}
+
+ virtual ~Iterator() = default;
+
+ virtual libtextclassifier3::Status Advance() = 0;
+
+ virtual DocHitInfo GetDocHitInfo() const = 0;
+
+ virtual int32_t GetNumAdvanceCalls() const = 0;
+
+ virtual int32_t GetNumBlocksInspected() const = 0;
+
+ protected:
+ T key_lower_;
+ T key_upper_;
+ };
+
+ virtual ~NumericIndex() = default;
+
+ // Returns an Editor instance for adding new records into numeric index for a
+ // given property, DocumentId and SectionId. See Editor for more details.
+ virtual std::unique_ptr<Editor> Edit(std::string_view property_path,
+ DocumentId document_id,
+ SectionId section_id) = 0;
+
+  // Returns a DocHitInfoIteratorNumeric (as the DocHitInfoIterator interface
+  // type) for iterating through all docs which have the specified (numeric)
+  // property contents in range [key_lower, key_upper].
+ //
+ // In general, different numeric index implementations require different data
+ // iterator implementations, so class Iterator is an abstraction of the data
+ // iterator and DocHitInfoIteratorNumeric can work with any implementation of
+ // it. See Iterator and DocHitInfoIteratorNumeric for more details.
+ //
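+  // A query sketch (illustrative only; `index` is a concrete implementation):
+  //   ICING_ASSIGN_OR_RETURN(
+  //       std::unique_ptr<DocHitInfoIterator> iter,
+  //       index->GetIterator("timestamp", /*key_lower=*/0, /*key_upper=*/100,
+  //                          doc_store, schema_store, current_time_ms));
+  //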
+ // Returns:
+ // - std::unique_ptr<DocHitInfoIterator> on success
+ // - NOT_FOUND_ERROR if there is no numeric index for property_path
+ // - INVALID_ARGUMENT_ERROR if key_lower > key_upper
+ // - Any other errors, depending on the actual implementation
+ virtual libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
+ GetIterator(std::string_view property_path, T key_lower, T key_upper,
+ const DocumentStore& document_store,
+ const SchemaStore& schema_store,
+ int64_t current_time_ms) const = 0;
+
+  // Reduces internal file sizes by reclaiming the space and ids of deleted
+  // documents. The numeric index will convert all data (hits) to the new
+  // document ids and regenerate all index files. If all data in a property
+  // path are completely deleted, then the underlying storage must be
+  // discarded as well.
+ //
+ // - document_id_old_to_new: a map for converting old document id to new
+ // document id.
+ // - new_last_added_document_id: will be used to update the last added
+ // document id in the numeric index.
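+  //
+  //   For example (illustrative), document_id_old_to_new = {0,
+  //   kInvalidDocumentId, 1} would map old DocumentId 0 to new DocumentId 0,
+  //   drop the deleted DocumentId 1, and map old DocumentId 2 to new
+  //   DocumentId 1.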
+ //
+ // Returns:
+ // - OK on success
+ // - Any other errors, depending on the actual implementation
+ virtual libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ DocumentId new_last_added_document_id) = 0;
+
+  // Clears all data in the numeric index and sets last_added_document_id to
+  // kInvalidDocumentId.
+ //
+ // Returns:
+ // - OK on success
+ // - Any other errors, depending on the actual implementation
+ virtual libtextclassifier3::Status Clear() = 0;
+
+ // Returns the largest document_id added to the index. Note that DocumentIds
+ // are always inserted in increasing order.
+ virtual DocumentId last_added_document_id() const = 0;
+
+ // Sets last_added_document_id to document_id so long as document_id >
+ // last_added_document_id() or last_added_document_id() is invalid.
+ virtual void set_last_added_document_id(DocumentId document_id) = 0;
+
+ // The number of individual indices that the NumericIndex has created to
+ // search over all indexed properties thus far.
+ virtual int num_property_indices() const = 0;
+
+ protected:
+ explicit NumericIndex(const Filesystem& filesystem,
+ std::string&& working_path,
+ PersistentStorage::WorkingPathType working_path_type)
+ : PersistentStorage(filesystem, std::move(working_path),
+ working_path_type) {}
+
+ virtual libtextclassifier3::Status PersistStoragesToDisk(
+ bool force) override = 0;
+
+ virtual libtextclassifier3::Status PersistMetadataToDisk(
+ bool force) override = 0;
+
+ virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(
+ bool force) override = 0;
+
+ virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
+ bool force) override = 0;
+
+ virtual Crcs& crcs() override = 0;
+ virtual const Crcs& crcs() const override = 0;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_NUMERIC_NUMERIC_INDEX_H_
diff --git a/icing/index/numeric/posting-list-integer-index-accessor.cc b/icing/index/numeric/posting-list-integer-index-accessor.cc
new file mode 100644
index 0000000..af2aea4
--- /dev/null
+++ b/icing/index/numeric/posting-list-integer-index-accessor.cc
@@ -0,0 +1,164 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/posting-list-integer-index-accessor.h"
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/index-block.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/numeric/integer-index-data.h"
+#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<PostingListIntegerIndexAccessor>>
+PostingListIntegerIndexAccessor::Create(
+ FlashIndexStorage* storage, PostingListIntegerIndexSerializer* serializer) {
+ uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+ storage->block_size(), serializer->GetDataTypeBytes());
+ ICING_ASSIGN_OR_RETURN(PostingListUsed in_memory_posting_list,
+ PostingListUsed::CreateFromUnitializedRegion(
+ serializer, max_posting_list_bytes));
+ return std::unique_ptr<PostingListIntegerIndexAccessor>(
+ new PostingListIntegerIndexAccessor(
+ storage, std::move(in_memory_posting_list), serializer));
+}
+
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<PostingListIntegerIndexAccessor>>
+PostingListIntegerIndexAccessor::CreateFromExisting(
+ FlashIndexStorage* storage, PostingListIntegerIndexSerializer* serializer,
+ PostingListIdentifier existing_posting_list_id) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+ Create(storage, serializer));
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage->GetPostingList(existing_posting_list_id));
+ pl_accessor->preexisting_posting_list_ =
+ std::make_unique<PostingListHolder>(std::move(holder));
+ return pl_accessor;
+}
+
+// Returns the next batch of integer index data for the provided posting list.
+libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
+PostingListIntegerIndexAccessor::GetNextDataBatch() {
+ return GetNextDataBatchImpl(/*free_posting_list=*/false);
+}
+
+libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
+PostingListIntegerIndexAccessor::GetAllDataAndFree() {
+ if (preexisting_posting_list_ == nullptr) {
+ return absl_ports::FailedPreconditionError(
+ "Cannot retrieve data from a PostingListIntegerIndexAccessor that "
+ "was not created from a preexisting posting list.");
+ }
+
+ std::vector<IntegerIndexData> all_data;
+ while (true) {
+ ICING_ASSIGN_OR_RETURN(std::vector<IntegerIndexData> batch,
+ GetNextDataBatchImpl(/*free_posting_list=*/true));
+ if (batch.empty()) {
+ break;
+ }
+ std::move(batch.begin(), batch.end(), std::back_inserter(all_data));
+ }
+
+ return all_data;
+}
+
+libtextclassifier3::Status PostingListIntegerIndexAccessor::PrependData(
+ const IntegerIndexData& data) {
+ PostingListUsed& active_pl = (preexisting_posting_list_ != nullptr)
+ ? preexisting_posting_list_->posting_list
+ : in_memory_posting_list_;
+ libtextclassifier3::Status status =
+ serializer_->PrependData(&active_pl, data);
+ if (!absl_ports::IsResourceExhausted(status)) {
+ return status;
+ }
+ // There is no more room to add data to this current posting list! Therefore,
+ // we need to either move those data to a larger posting list or flush this
+ // posting list and create another max-sized posting list in the chain.
+ if (preexisting_posting_list_ != nullptr) {
+ ICING_RETURN_IF_ERROR(FlushPreexistingPostingList());
+ } else {
+ ICING_RETURN_IF_ERROR(FlushInMemoryPostingList());
+ }
+
+ // Re-add data. Should always fit since we just cleared
+ // in_memory_posting_list_. It's fine to explicitly reference
+ // in_memory_posting_list_ here because there's no way of reaching this line
+ // while preexisting_posting_list_ is still in use.
+ return serializer_->PrependData(&in_memory_posting_list_, data);
+}
+
+libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
+PostingListIntegerIndexAccessor::GetNextDataBatchImpl(bool free_posting_list) {
+ if (preexisting_posting_list_ == nullptr) {
+ if (has_reached_posting_list_chain_end_) {
+ return std::vector<IntegerIndexData>();
+ }
+ return absl_ports::FailedPreconditionError(
+ "Cannot retrieve data from a PostingListIntegerIndexAccessor that "
+ "was not created from a preexisting posting list.");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<IntegerIndexData> batch,
+ serializer_->GetData(&preexisting_posting_list_->posting_list));
+ uint32_t next_block_index = kInvalidBlockIndex;
+ // Posting lists will only be chained when they are max-sized, in which case
+ // next_block_index will point to the next block for the next posting list.
+ // Otherwise, next_block_index can be kInvalidBlockIndex or be used to point
+ // to the next free list block, which is not relevant here.
+ if (preexisting_posting_list_->posting_list.size_in_bytes() ==
+ storage_->max_posting_list_bytes()) {
+ next_block_index = preexisting_posting_list_->next_block_index;
+ }
+
+ if (free_posting_list) {
+ ICING_RETURN_IF_ERROR(
+ storage_->FreePostingList(std::move(*preexisting_posting_list_)));
+ }
+
+ if (next_block_index != kInvalidBlockIndex) {
+    // Since we only have to deal with the next block of a max-sized posting
+    // list, max_num_posting_lists is 1 and posting_list_index_bits is
+    // BitsToStore(1).
+ PostingListIdentifier next_posting_list_id(
+ next_block_index, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/BitsToStore(1));
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage_->GetPostingList(next_posting_list_id));
+ preexisting_posting_list_ =
+ std::make_unique<PostingListHolder>(std::move(holder));
+ } else {
+ has_reached_posting_list_chain_end_ = true;
+ preexisting_posting_list_.reset();
+ }
+ return batch;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/posting-list-integer-index-accessor.h b/icing/index/numeric/posting-list-integer-index-accessor.h
new file mode 100644
index 0000000..4f667a0
--- /dev/null
+++ b/icing/index/numeric/posting-list-integer-index-accessor.h
@@ -0,0 +1,130 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_ACCESSOR_H_
+#define ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_ACCESSOR_H_
+
+#include <cstdint>
+#include <memory>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-accessor.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/numeric/integer-index-data.h"
+#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+
+namespace icing {
+namespace lib {
+
+// TODO(b/259743562): Refactor PostingListAccessor derived classes
+
+// This class is used to provide a simple abstraction for adding integer index
+// data to posting lists. PostingListIntegerIndexAccessor handles:
+// 1) selection of properly-sized posting lists for the accumulated integer
+// index data during Finalize()
+// 2) chaining of max-sized posting lists.
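+//
+// A typical write path (sketch, mirroring the accompanying tests; `storage`,
+// `serializer`, and `data` are assumed to exist):
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+//       PostingListIntegerIndexAccessor::Create(storage, serializer));
+//   ICING_RETURN_IF_ERROR(pl_accessor->PrependData(data));
+//   PostingListAccessor::FinalizeResult result =
+//       std::move(*pl_accessor).Finalize();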
+class PostingListIntegerIndexAccessor : public PostingListAccessor {
+ public:
+ // Creates an empty PostingListIntegerIndexAccessor.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListIntegerIndexAccessor
+ // - INVALID_ARGUMENT error if storage has an invalid block_size.
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<PostingListIntegerIndexAccessor>>
+ Create(FlashIndexStorage* storage,
+ PostingListIntegerIndexSerializer* serializer);
+
+ // Creates a PostingListIntegerIndexAccessor with an existing posting list
+ // identified by existing_posting_list_id.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListIntegerIndexAccessor
+ // - INVALID_ARGUMENT if storage has an invalid block_size.
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<PostingListIntegerIndexAccessor>>
+ CreateFromExisting(FlashIndexStorage* storage,
+ PostingListIntegerIndexSerializer* serializer,
+ PostingListIdentifier existing_posting_list_id);
+
+ PostingListSerializer* GetSerializer() override { return serializer_; }
+
+ // Retrieves the next batch of data in the posting list chain.
+ //
+ // RETURNS:
+ // - On success, a vector of integer index data in the posting list chain
+ // - FAILED_PRECONDITION_ERROR if called on an instance that was created via
+ // Create.
+ // - INTERNAL_ERROR if unable to read the next posting list in the chain or
+ // if the posting list has been corrupted somehow.
+ libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
+ GetNextDataBatch();
+
+ // Retrieves all data from the posting list chain and frees all posting
+ // list(s).
+ //
+ // RETURNS:
+ // - On success, a vector of integer index data in the posting list chain
+ // - FAILED_PRECONDITION_ERROR if called on an instance that was created via
+ // Create.
+ // - INTERNAL_ERROR if unable to read the next posting list in the chain or
+ // if the posting list has been corrupted somehow.
+ libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
+ GetAllDataAndFree();
+
+  // Prepends one data. This may result in flushing the posting list to disk
+  // (if the PostingListIntegerIndexAccessor holds a max-sized posting list
+  // that is full) or freeing a pre-existing posting list if it is too small
+  // to fit all necessary data.
+ //
+ // RETURNS:
+ // - OK, on success
+ // - INVALID_ARGUMENT if !data.is_valid() or if data is greater than the
+ // previously added data.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
+ // posting list.
+ libtextclassifier3::Status PrependData(const IntegerIndexData& data);
+
+ private:
+ explicit PostingListIntegerIndexAccessor(
+ FlashIndexStorage* storage, PostingListUsed in_memory_posting_list,
+ PostingListIntegerIndexSerializer* serializer)
+ : PostingListAccessor(storage, std::move(in_memory_posting_list)),
+ serializer_(serializer) {}
+
+ // Retrieves the next batch of data in the posting list chain.
+ //
+  // - free_posting_list: a boolean flag indicating whether to free all posting
+  //   lists after retrieving the batch data.
+ //
+ // RETURNS:
+ // - On success, a vector of integer index data in the posting list chain
+ // - FAILED_PRECONDITION_ERROR if called on an instance that was created via
+ // Create.
+ // - INTERNAL_ERROR if unable to read the next posting list in the chain or
+ // if the posting list has been corrupted somehow.
+ libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
+ GetNextDataBatchImpl(bool free_posting_list);
+
+ PostingListIntegerIndexSerializer* serializer_; // Does not own.
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_ACCESSOR_H_
diff --git a/icing/index/numeric/posting-list-integer-index-accessor_test.cc b/icing/index/numeric/posting-list-integer-index-accessor_test.cc
new file mode 100644
index 0000000..f655fea
--- /dev/null
+++ b/icing/index/numeric/posting-list-integer-index-accessor_test.cc
@@ -0,0 +1,535 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/posting-list-integer-index-accessor.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/index/numeric/integer-index-data.h"
+#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::Lt;
+using ::testing::Ne;
+using ::testing::SizeIs;
+
+class PostingListIntegerIndexAccessorTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/test_dir";
+ file_name_ = test_dir_ + "/test_file.idx.index";
+
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
+
+ serializer_ = std::make_unique<PostingListIntegerIndexSerializer>();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ flash_index_storage_ =
+ std::make_unique<FlashIndexStorage>(std::move(flash_index_storage));
+ }
+
+ void TearDown() override {
+ flash_index_storage_.reset();
+ serializer_.reset();
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ Filesystem filesystem_;
+ std::string test_dir_;
+ std::string file_name_;
+ std::unique_ptr<PostingListIntegerIndexSerializer> serializer_;
+ std::unique_ptr<FlashIndexStorage> flash_index_storage_;
+};
+
+std::vector<IntegerIndexData> CreateData(int num_data,
+ DocumentId start_document_id,
+ int64_t start_key) {
+ SectionId section_id = kMaxSectionId;
+
+ std::vector<IntegerIndexData> data;
+ data.reserve(num_data);
+ for (int i = 0; i < num_data; ++i) {
+ data.push_back(IntegerIndexData(section_id, start_document_id, start_key));
+
+ if (section_id == kMinSectionId) {
+ section_id = kMaxSectionId;
+ } else {
+ --section_id;
+ }
+ ++start_document_id;
+ ++start_key;
+ }
+ return data;
+}
+
+TEST_F(PostingListIntegerIndexAccessorTest, DataAddAndRetrieveProperly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Add some integer index data
+ std::vector<IntegerIndexData> data_vec =
+ CreateData(/*num_data=*/5, /*start_document_id=*/0, /*start_key=*/819);
+ for (const IntegerIndexData& data : data_vec) {
+ EXPECT_THAT(pl_accessor->PrependData(data), IsOk());
+ }
+ PostingListAccessor::FinalizeResult result =
+ std::move(*pl_accessor).Finalize();
+ EXPECT_THAT(result.status, IsOk());
+ EXPECT_THAT(result.id.block_index(), Eq(1));
+ EXPECT_THAT(result.id.posting_list_index(), Eq(0));
+
+ // Retrieve some data.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result.id));
+ EXPECT_THAT(
+ serializer_->GetData(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAreArray(data_vec.rbegin(), data_vec.rend())));
+ EXPECT_THAT(pl_holder.next_block_index, Eq(kInvalidBlockIndex));
+}
+
+TEST_F(PostingListIntegerIndexAccessorTest, PreexistingPLKeepOnSameBlock) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Add a single data. This will fit in a min-sized posting list.
+ IntegerIndexData data1(/*section_id=*/1, /*document_id=*/0, /*key=*/12345);
+ ICING_ASSERT_OK(pl_accessor->PrependData(data1));
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result1.status);
+ // Should be allocated to the first block.
+ ASSERT_THAT(result1.id.block_index(), Eq(1));
+ ASSERT_THAT(result1.id.posting_list_index(), Eq(0));
+
+ // Add one more data. The minimum size for a posting list must be able to fit
+ // two data, so this should NOT cause the previous pl to be reallocated.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+ IntegerIndexData data2(/*section_id=*/1, /*document_id=*/1, /*key=*/23456);
+ ICING_ASSERT_OK(pl_accessor->PrependData(data2));
+ PostingListAccessor::FinalizeResult result2 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result2.status);
+ // Should be in the same posting list.
+ EXPECT_THAT(result2.id, Eq(result1.id));
+
+ // The posting list at result2.id should hold all of the data that have been
+ // added.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result2.id));
+ EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAre(data2, data1)));
+}
+
+TEST_F(PostingListIntegerIndexAccessorTest, PreexistingPLReallocateToLargerPL) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+  // Adding 3 data should cause Finalize to allocate a 48-byte posting list,
+  // which can store at most 4 data.
+ std::vector<IntegerIndexData> data_vec1 =
+ CreateData(/*num_data=*/3, /*start_document_id=*/0, /*start_key=*/819);
+ for (const IntegerIndexData& data : data_vec1) {
+ ICING_ASSERT_OK(pl_accessor->PrependData(data));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result1.status);
+ // Should be allocated to the first block.
+ ASSERT_THAT(result1.id.block_index(), Eq(1));
+ ASSERT_THAT(result1.id.posting_list_index(), Eq(0));
+
+ // Now add more data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+  // The current posting list can fit 1 more data. Adding 12 more data should
+  // result in these data being moved to a larger posting list. Also the total
+  // size of these data won't exceed the max posting list size, so there will
+  // be only a single posting list and no chain.
+ std::vector<IntegerIndexData> data_vec2 = CreateData(
+ /*num_data=*/12,
+ /*start_document_id=*/data_vec1.back().basic_hit().document_id() + 1,
+ /*start_key=*/819);
+
+ for (const IntegerIndexData& data : data_vec2) {
+ ICING_ASSERT_OK(pl_accessor->PrependData(data));
+ }
+ PostingListAccessor::FinalizeResult result2 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result2.status);
+ // Should be allocated to the second (new) block because the posting list
+ // should grow beyond the size that the first block maintains.
+ EXPECT_THAT(result2.id.block_index(), Eq(2));
+ EXPECT_THAT(result2.id.posting_list_index(), Eq(0));
+
+ // The posting list at result2.id should hold all of the data that have been
+ // added.
+ std::vector<IntegerIndexData> all_data_vec;
+ all_data_vec.reserve(data_vec1.size() + data_vec2.size());
+ all_data_vec.insert(all_data_vec.end(), data_vec1.begin(), data_vec1.end());
+ all_data_vec.insert(all_data_vec.end(), data_vec2.begin(), data_vec2.end());
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result2.id));
+ EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAreArray(all_data_vec.rbegin(),
+ all_data_vec.rend())));
+}
+
+TEST_F(PostingListIntegerIndexAccessorTest, MultiBlockChainsBlocksProperly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Block size is 4096, sizeof(BlockHeader) is 12 and sizeof(IntegerIndexData)
+  // is 12, so a max-sized posting list can store (4096 - 12) / 12 = 340 data.
+  // Adding 341 data should cause:
+  // - 2 max-sized posting lists to be allocated, in block 1 and block 2.
+  // - Chaining: block 2 -> block 1.
+ std::vector<IntegerIndexData> data_vec =
+ CreateData(/*num_data=*/341, /*start_document_id=*/0, /*start_key=*/819);
+ for (const IntegerIndexData& data : data_vec) {
+ ICING_ASSERT_OK(pl_accessor->PrependData(data));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result1.status);
+ PostingListIdentifier second_block_id = result1.id;
+ // Should be allocated to the second block.
+ EXPECT_THAT(second_block_id, Eq(PostingListIdentifier(
+ /*block_index=*/2, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0)));
+
+ // We should be able to retrieve all data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(second_block_id));
+ // This pl_holder will only hold a posting list with the data that didn't fit
+ // on the first block.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<IntegerIndexData> second_block_data,
+ serializer_->GetData(&pl_holder.posting_list));
+ ASSERT_THAT(second_block_data, SizeIs(Lt(data_vec.size())));
+ auto first_block_data_start = data_vec.rbegin() + second_block_data.size();
+ EXPECT_THAT(second_block_data,
+ ElementsAreArray(data_vec.rbegin(), first_block_data_start));
+
+ // Now retrieve all of the data that were on the first block.
+ uint32_t first_block_id = pl_holder.next_block_index;
+ EXPECT_THAT(first_block_id, Eq(1));
+
+ PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0);
+ ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
+ flash_index_storage_->GetPostingList(pl_id));
+ EXPECT_THAT(
+ serializer_->GetData(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAreArray(first_block_data_start, data_vec.rend())));
+}
+
+TEST_F(PostingListIntegerIndexAccessorTest,
+ PreexistingMultiBlockReusesBlocksProperly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Block size is 4096, sizeof(BlockHeader) is 12 and sizeof(IntegerIndexData)
+  // is 12, so a max-sized posting list can store (4096 - 12) / 12 = 340 data.
+  // Adding 341 data will cause:
+  // - 2 max-sized posting lists to be allocated, in block 1 and block 2.
+  // - Chaining: block 2 -> block 1.
+ std::vector<IntegerIndexData> data_vec1 =
+ CreateData(/*num_data=*/341, /*start_document_id=*/0, /*start_key=*/819);
+ for (const IntegerIndexData& data : data_vec1) {
+ ICING_ASSERT_OK(pl_accessor->PrependData(data));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result1.status);
+ PostingListIdentifier first_add_id = result1.id;
+ EXPECT_THAT(first_add_id, Eq(PostingListIdentifier(
+ /*block_index=*/2, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0)));
+
+ // Now add more data. These should fit on the existing second block and not
+ // fill it up.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), first_add_id));
+ std::vector<IntegerIndexData> data_vec2 = CreateData(
+ /*num_data=*/10,
+ /*start_document_id=*/data_vec1.back().basic_hit().document_id() + 1,
+ /*start_key=*/819);
+ for (const IntegerIndexData& data : data_vec2) {
+ ICING_ASSERT_OK(pl_accessor->PrependData(data));
+ }
+ PostingListAccessor::FinalizeResult result2 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result2.status);
+ PostingListIdentifier second_add_id = result2.id;
+ EXPECT_THAT(second_add_id, Eq(first_add_id));
+
+ // We should be able to retrieve all data.
+ std::vector<IntegerIndexData> all_data_vec;
+ all_data_vec.reserve(data_vec1.size() + data_vec2.size());
+ all_data_vec.insert(all_data_vec.end(), data_vec1.begin(), data_vec1.end());
+ all_data_vec.insert(all_data_vec.end(), data_vec2.begin(), data_vec2.end());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(second_add_id));
+ // This pl_holder will only hold a posting list with the data that didn't fit
+ // on the first block.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<IntegerIndexData> second_block_data,
+ serializer_->GetData(&pl_holder.posting_list));
+ ASSERT_THAT(second_block_data, SizeIs(Lt(all_data_vec.size())));
+ auto first_block_data_start =
+ all_data_vec.rbegin() + second_block_data.size();
+ EXPECT_THAT(second_block_data,
+ ElementsAreArray(all_data_vec.rbegin(), first_block_data_start));
+
+ // Now retrieve all of the data that were on the first block.
+ uint32_t first_block_id = pl_holder.next_block_index;
+ EXPECT_THAT(first_block_id, Eq(1));
+
+ PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0);
+ ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
+ flash_index_storage_->GetPostingList(pl_id));
+ EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAreArray(first_block_data_start,
+ all_data_vec.rend())));
+}
+
+TEST_F(PostingListIntegerIndexAccessorTest,
+ InvalidDataShouldReturnInvalidArgument) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ IntegerIndexData invalid_data;
+ EXPECT_THAT(pl_accessor->PrependData(invalid_data),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PostingListIntegerIndexAccessorTest,
+ BasicHitIncreasingShouldReturnInvalidArgument) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ IntegerIndexData data1(/*section_id=*/3, /*document_id=*/1, /*key=*/12345);
+ ICING_ASSERT_OK(pl_accessor->PrependData(data1));
+
+ IntegerIndexData data2(/*section_id=*/6, /*document_id=*/1, /*key=*/12345);
+ EXPECT_THAT(pl_accessor->PrependData(data2),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ IntegerIndexData data3(/*section_id=*/2, /*document_id=*/0, /*key=*/12345);
+ EXPECT_THAT(pl_accessor->PrependData(data3),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PostingListIntegerIndexAccessorTest,
+ NewPostingListNoDataAddedShouldReturnInvalidArgument) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ PostingListAccessor::FinalizeResult result =
+ std::move(*pl_accessor).Finalize();
+ EXPECT_THAT(result.status,
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PostingListIntegerIndexAccessorTest,
+ PreexistingPostingListNoDataAddedShouldSucceed) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor1,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ IntegerIndexData data1(/*section_id=*/3, /*document_id=*/1, /*key=*/12345);
+ ICING_ASSERT_OK(pl_accessor1->PrependData(data1));
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor1).Finalize();
+ ICING_ASSERT_OK(result1.status);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor2,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+ PostingListAccessor::FinalizeResult result2 =
+ std::move(*pl_accessor2).Finalize();
+ EXPECT_THAT(result2.status, IsOk());
+}
+
+TEST_F(PostingListIntegerIndexAccessorTest, GetAllDataAndFree) {
+ IntegerIndexData data1(/*section_id=*/3, /*document_id=*/1, /*key=*/123);
+ IntegerIndexData data2(/*section_id=*/3, /*document_id=*/2, /*key=*/456);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor1,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Add 2 data.
+ ICING_ASSERT_OK(pl_accessor1->PrependData(data1));
+ ICING_ASSERT_OK(pl_accessor1->PrependData(data2));
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor1).Finalize();
+ ICING_ASSERT_OK(result1.status);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor2,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+ EXPECT_THAT(pl_accessor2->GetAllDataAndFree(),
+ IsOkAndHolds(ElementsAre(data2, data1)));
+
+ // Allocate a new posting list with same size again.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor3,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Add 2 data.
+ ICING_ASSERT_OK(pl_accessor3->PrependData(data1));
+ ICING_ASSERT_OK(pl_accessor3->PrependData(data2));
+ PostingListAccessor::FinalizeResult result3 =
+ std::move(*pl_accessor3).Finalize();
+ ICING_ASSERT_OK(result3.status);
+ // We should get the same id if the previous one has been freed correctly by
+ // GetAllDataAndFree.
+ EXPECT_THAT(result3.id, Eq(result1.id));
+}
+
+TEST_F(PostingListIntegerIndexAccessorTest, GetAllDataAndFreePostingListChain) {
+ uint32_t block_size = FlashIndexStorage::SelectBlockSize();
+ uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+ block_size, serializer_->GetDataTypeBytes());
+ uint32_t max_num_data_single_posting_list =
+ max_posting_list_bytes / serializer_->GetDataTypeBytes();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor1,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+
+ // Prepend max_num_data_single_posting_list + 1 data.
+ std::vector<IntegerIndexData> data_vec;
+ for (uint32_t i = 0; i < max_num_data_single_posting_list + 1; ++i) {
+ IntegerIndexData data(/*section_id=*/3, static_cast<DocumentId>(i),
+ /*key=*/i);
+ ICING_ASSERT_OK(pl_accessor1->PrependData(data));
+ data_vec.push_back(data);
+ }
+
+ // This will cause:
+ // - Allocate the first max-sized posting list at block index = 1, storing
+ // max_num_data_single_posting_list data.
+ // - Allocate the second max-sized posting list at block index = 2, storing 1
+ // data. Also its next_block_index is 1.
+  // - In other words, we will get the chain 2 -> 1, and result1.id points to
+  //   block 2.
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor1).Finalize();
+ ICING_ASSERT_OK(result1.status);
+
+ uint32_t first_pl_block_index = kInvalidBlockIndex;
+ {
+ // result1.id points at the second (max-sized) PL, and next_block_index of
+ // the second PL points to the first PL's block. Fetch the first PL's block
+ // index manually.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result1.id));
+ first_pl_block_index = pl_holder.next_block_index;
+ }
+ ASSERT_THAT(first_pl_block_index, Ne(kInvalidBlockIndex));
+
+ // Call GetAllDataAndFree. This will free block 2 and block 1.
+ // Free block list: 1 -> 2 (since free block list is LIFO).
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor2,
+ PostingListIntegerIndexAccessor::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+ EXPECT_THAT(
+ pl_accessor2->GetAllDataAndFree(),
+ IsOkAndHolds(ElementsAreArray(data_vec.rbegin(), data_vec.rend())));
+ pl_accessor2.reset();
+
+ // Allocate a new posting list with same size again.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListIntegerIndexAccessor> pl_accessor3,
+ PostingListIntegerIndexAccessor::Create(flash_index_storage_.get(),
+ serializer_.get()));
+ // Add same set of data.
+ for (uint32_t i = 0; i < max_num_data_single_posting_list + 1; ++i) {
+ ICING_ASSERT_OK(pl_accessor3->PrependData(data_vec[i]));
+ }
+
+ // This will cause:
+ // - Allocate the first max-sized posting list from the free block list, which
+ // is block index = 1, storing max_num_data_single_posting_list data.
+  // - Allocate the second max-sized posting list from the next block in the
+  //   free block list, which is block index = 2, storing 1 data. Also its
+  //   next_block_index should be 1.
+ PostingListAccessor::FinalizeResult result3 =
+ std::move(*pl_accessor3).Finalize();
+ ICING_ASSERT_OK(result3.status);
+ // We should get the same id if the previous one has been freed correctly by
+ // GetAllDataAndFree.
+ EXPECT_THAT(result3.id, Eq(result1.id));
+ // Also the first PL should be the same if it has been freed correctly by
+ // GetAllDataAndFree. Since it is a max-sized posting list, we just need to
+ // verify the block index.
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result3.id));
+ EXPECT_THAT(pl_holder.next_block_index, Eq(first_pl_block_index));
+ }
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/posting-list-integer-index-serializer.cc b/icing/index/numeric/posting-list-integer-index-serializer.cc
new file mode 100644
index 0000000..99f14f9
--- /dev/null
+++ b/icing/index/numeric/posting-list-integer-index-serializer.cc
@@ -0,0 +1,512 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+
+#include <cstdint>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/numeric/integer-index-data.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+uint32_t PostingListIntegerIndexSerializer::GetBytesUsed(
+ const PostingListUsed* posting_list_used) const {
+ // The special data will be included if they represent actual data. If they
+ // represent the data start offset or the invalid data sentinel, they are not
+ // included.
+ return posting_list_used->size_in_bytes() -
+ GetStartByteOffset(posting_list_used);
+}
+
+uint32_t PostingListIntegerIndexSerializer::GetMinPostingListSizeToFit(
+ const PostingListUsed* posting_list_used) const {
+ if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) {
+ // If in either the FULL state or ALMOST_FULL state, this posting list *is*
+ // the minimum size posting list that can fit these data. So just return the
+ // size of the posting list.
+ return posting_list_used->size_in_bytes();
+ }
+
+ // In NOT_FULL state, BytesUsed contains no special data. The minimum sized
+ // posting list that would be guaranteed to fit these data would be
+ // ALMOST_FULL, with kInvalidData in special data 0, the uncompressed data in
+ // special data 1 and the n compressed data in the compressed region.
+ // BytesUsed contains one uncompressed data and n compressed data. Therefore,
+ // fitting these data into a posting list would require BytesUsed plus one
+ // extra data.
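+  //
+  // E.g. (hypothetical numbers, consistent with the tests where
+  // sizeof(IntegerIndexData) == 12): a NOT_FULL posting list with
+  // BytesUsed = 36 (3 data) would need a 36 + 12 = 48-byte posting list,
+  // which can hold at most 4 data.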
+ return GetBytesUsed(posting_list_used) + GetDataTypeBytes();
+}
+
+void PostingListIntegerIndexSerializer::Clear(
+ PostingListUsed* posting_list_used) const {
+ // Safe to ignore return value because posting_list_used->size_in_bytes() is
+ // a valid argument.
+ SetStartByteOffset(posting_list_used,
+ /*offset=*/posting_list_used->size_in_bytes());
+}
+
+libtextclassifier3::Status PostingListIntegerIndexSerializer::MoveFrom(
+ PostingListUsed* dst, PostingListUsed* src) const {
+ ICING_RETURN_ERROR_IF_NULL(dst);
+ ICING_RETURN_ERROR_IF_NULL(src);
+ if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) {
+    return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "src MinPostingListSizeToFit %d must not be larger than dst size %d.",
+        GetMinPostingListSizeToFit(src), dst->size_in_bytes()));
+ }
+
+ if (!IsPostingListValid(dst)) {
+ return absl_ports::FailedPreconditionError(
+ "Dst posting list is in an invalid state and can't be used!");
+ }
+ if (!IsPostingListValid(src)) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot MoveFrom an invalid src posting list!");
+ }
+
+ // Pop just enough data that all of src's compressed data fit in
+ // dst posting_list's compressed area. Then we can memcpy that area.
+ std::vector<IntegerIndexData> data_arr;
+ while (IsFull(src) || IsAlmostFull(src) ||
+ (dst->size_in_bytes() - kSpecialDataSize < GetBytesUsed(src))) {
+ if (!GetDataInternal(src, /*limit=*/1, /*pop=*/true, &data_arr).ok()) {
+ return absl_ports::AbortedError(
+ "Unable to retrieve data from src posting list.");
+ }
+ }
+
+ // memcpy the area and set up start byte offset.
+ Clear(dst);
+ memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src),
+ src->posting_list_buffer() + GetStartByteOffset(src),
+ GetBytesUsed(src));
+  // Because we popped all data from src outside of the compressed area and
+  // guaranteed that GetBytesUsed(src) is less than dst->size_in_bytes() -
+  // kSpecialDataSize, this is guaranteed to be a valid byte offset for the
+  // NOT_FULL state, so ignoring the return value is safe.
+ SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src));
+
+ // Put back remaining data.
+ for (auto riter = data_arr.rbegin(); riter != data_arr.rend(); ++riter) {
+ // PrependData may return:
+ // - INVALID_ARGUMENT: if data is invalid or not less than the previous data
+ // - RESOURCE_EXHAUSTED
+ // RESOURCE_EXHAUSTED should be impossible because we've already assured
+ // that there is enough room above.
+ ICING_RETURN_IF_ERROR(PrependData(dst, *riter));
+ }
+
+ Clear(src);
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status
+PostingListIntegerIndexSerializer::PrependDataToAlmostFull(
+ PostingListUsed* posting_list_used, const IntegerIndexData& data) const {
+ SpecialDataType special_data = GetSpecialData(posting_list_used, /*index=*/1);
+ if (special_data.data().basic_hit() < data.basic_hit()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "BasicHit %d being prepended must not be greater than the most recent"
+ "BasicHit %d",
+ data.basic_hit().value(), special_data.data().basic_hit().value()));
+ }
+
+ // TODO(b/259743562): [Optimization 2] compression
+  // Without compression, prepending a new data into an ALMOST_FULL posting
+  // list will change the posting list to FULL state. Therefore, set special
+  // data 0 directly.
+ SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(data));
+ return libtextclassifier3::Status::OK;
+}
+
+void PostingListIntegerIndexSerializer::PrependDataToEmpty(
+ PostingListUsed* posting_list_used, const IntegerIndexData& data) const {
+ // First data to be added. Just add verbatim, no compression.
+ if (posting_list_used->size_in_bytes() == kSpecialDataSize) {
+ // First data will be stored at special data 1.
+ // Safe to ignore the return value because 1 < kNumSpecialData
+ SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data));
+ // Safe to ignore the return value because sizeof(IntegerIndexData) is a
+ // valid argument.
+ SetStartByteOffset(posting_list_used,
+ /*offset=*/sizeof(IntegerIndexData));
+ } else {
+    // Since this is the first data, size != kSpecialDataSize, and
+    // size % sizeof(IntegerIndexData) == 0, we know that there is room to fit
+    // 'data' into the compressed region, so ValueOrDie is safe.
+ uint32_t offset =
+ PrependDataUncompressed(posting_list_used, data,
+ /*offset=*/posting_list_used->size_in_bytes())
+ .ValueOrDie();
+ // Safe to ignore the return value because PrependDataUncompressed is
+ // guaranteed to return a valid offset.
+ SetStartByteOffset(posting_list_used, offset);
+ }
+}
+
+libtextclassifier3::Status
+PostingListIntegerIndexSerializer::PrependDataToNotFull(
+ PostingListUsed* posting_list_used, const IntegerIndexData& data,
+ uint32_t offset) const {
+ IntegerIndexData cur;
+ memcpy(&cur, posting_list_used->posting_list_buffer() + offset,
+ sizeof(IntegerIndexData));
+ if (cur.basic_hit() < data.basic_hit()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "BasicHit %d being prepended must not be greater than the most recent"
+ "BasicHit %d",
+ data.basic_hit().value(), cur.basic_hit().value()));
+ }
+
+ // TODO(b/259743562): [Optimization 2] compression
+ if (offset >= kSpecialDataSize + sizeof(IntegerIndexData)) {
+ offset =
+ PrependDataUncompressed(posting_list_used, data, offset).ValueOrDie();
+ SetStartByteOffset(posting_list_used, offset);
+ } else {
+ // The new data must be put in special data 1.
+ SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data));
+ // State ALMOST_FULL. Safe to ignore the return value because
+ // sizeof(IntegerIndexData) is a valid argument.
+ SetStartByteOffset(posting_list_used, /*offset=*/sizeof(IntegerIndexData));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status PostingListIntegerIndexSerializer::PrependData(
+ PostingListUsed* posting_list_used, const IntegerIndexData& data) const {
+ static_assert(
+ sizeof(BasicHit::Value) <= sizeof(uint64_t),
+ "BasicHit::Value cannot be larger than 8 bytes because the delta "
+ "must be able to fit in 8 bytes.");
+
+ if (!data.is_valid()) {
+ return absl_ports::InvalidArgumentError("Cannot prepend an invalid data!");
+ }
+ if (!IsPostingListValid(posting_list_used)) {
+ return absl_ports::FailedPreconditionError(
+ "This PostingListUsed is in an invalid state and can't add any data!");
+ }
+
+ if (IsFull(posting_list_used)) {
+ // State FULL: no space left.
+ return absl_ports::ResourceExhaustedError("No more room for data");
+ } else if (IsAlmostFull(posting_list_used)) {
+ return PrependDataToAlmostFull(posting_list_used, data);
+ } else if (IsEmpty(posting_list_used)) {
+ PrependDataToEmpty(posting_list_used, data);
+ return libtextclassifier3::Status::OK;
+ } else {
+ uint32_t offset = GetStartByteOffset(posting_list_used);
+ return PrependDataToNotFull(posting_list_used, data, offset);
+ }
+}
+
+libtextclassifier3::StatusOr<uint32_t>
+PostingListIntegerIndexSerializer::PrependDataArray(
+ PostingListUsed* posting_list_used, const IntegerIndexData* array,
+ uint32_t num_data, bool keep_prepended) const {
+ if (!IsPostingListValid(posting_list_used)) {
+ return 0;
+ }
+
+ uint32_t i;
+ for (i = 0; i < num_data; ++i) {
+ if (!PrependData(posting_list_used, array[i]).ok()) {
+ break;
+ }
+ }
+ if (i != num_data && !keep_prepended) {
+ // Didn't fit. Undo everything and check that we have the same offset as
+ // before. PopFrontData guarantees that it will remove all 'i' data so long
+ // as there are at least 'i' data in the posting list, which we know there
+ // are.
+ ICING_RETURN_IF_ERROR(PopFrontData(posting_list_used, /*num_data=*/i));
+ return 0;
+ }
+ return i;
+}
+
+libtextclassifier3::StatusOr<std::vector<IntegerIndexData>>
+PostingListIntegerIndexSerializer::GetData(
+ const PostingListUsed* posting_list_used) const {
+ std::vector<IntegerIndexData> data_arr_out;
+ ICING_RETURN_IF_ERROR(GetData(posting_list_used, &data_arr_out));
+ return data_arr_out;
+}
+
+libtextclassifier3::Status PostingListIntegerIndexSerializer::GetData(
+ const PostingListUsed* posting_list_used,
+ std::vector<IntegerIndexData>* data_arr_out) const {
+ return GetDataInternal(posting_list_used,
+ /*limit=*/std::numeric_limits<uint32_t>::max(),
+ /*pop=*/false, data_arr_out);
+}
+
+libtextclassifier3::Status PostingListIntegerIndexSerializer::PopFrontData(
+ PostingListUsed* posting_list_used, uint32_t num_data) const {
+ if (num_data == 1 && IsFull(posting_list_used)) {
+    // The PL is in FULL state, which means that we save 2 uncompressed data
+    // in the 2 special positions. But FULL state may be reached from 2
+    // different states.
+ // (1) In ALMOST_FULL state
+ // +------------------+-----------------+-----+---------------------------+
+ // |Data::Invalid |1st data |(pad)|(compressed) data |
+ // | | | | |
+ // +------------------+-----------------+-----+---------------------------+
+ // When we prepend another data, we can only put it at special data 0, and
+ // thus get a FULL PL
+ // +------------------+-----------------+-----+---------------------------+
+ // |new 1st data |original 1st data|(pad)|(compressed) data |
+ // | | | | |
+ // +------------------+-----------------+-----+---------------------------+
+ //
+ // (2) In NOT_FULL state
+ // +------------------+-----------------+-------+---------+---------------+
+ // |data-start-offset |Data::Invalid |(pad) |1st data |(compressed) |
+ // | | | | |data |
+ // +------------------+-----------------+-------+---------+---------------+
+ // When we prepend another data, we can reach any of the 3 following
+ // scenarios:
+ // (2.1) NOT_FULL
+ // if the space of pad and original 1st data can accommodate the new 1st
+ // data and the encoded delta value.
+ // +------------------+-----------------+-----+--------+------------------+
+ // |data-start-offset |Data::Invalid |(pad)|new |(compressed) data |
+ // | | | |1st data| |
+ // +------------------+-----------------+-----+--------+------------------+
+ // (2.2) ALMOST_FULL
+ // If the space of pad and original 1st data cannot accommodate the new 1st
+ // data and the encoded delta value but can accommodate the encoded delta
+ // value only. We can put the new 1st data at special position 1.
+ // +------------------+-----------------+---------+-----------------------+
+ // |Data::Invalid |new 1st data |(pad) |(compressed) data |
+ // | | | | |
+ // +------------------+-----------------+---------+-----------------------+
+ // (2.3) FULL
+ // In very rare case, it cannot even accommodate only the encoded delta
+ // value. we can move the original 1st data into special position 1 and the
+ // new 1st data into special position 0. This may happen because we use
+ // VarInt encoding method which may make the encoded value longer (about
+ // 4/3 times of original)
+ // +------------------+-----------------+--------------+------------------+
+ // |new 1st data |original 1st data|(pad) |(compressed) data |
+ // | | | | |
+ // +------------------+-----------------+--------------+------------------+
+ //
+    // Suppose now the PL is in FULL state, but we don't know whether it
+    // arrived at this state from NOT_FULL (like (2.3)) or from ALMOST_FULL
+    // (like (1)). We'll return to ALMOST_FULL state like (1) if we simply pop
+    // the new 1st data, but we want to make the prepending operation
+    // "reversible". So there should be some way to return to NOT_FULL if
+    // possible. A simple way to do this is:
+ // - Pop 2 data out of the PL to state ALMOST_FULL or NOT_FULL.
+ // - Add the second data ("original 1st data") back.
+ //
+ // Then we can return to the correct original states of (2.1) or (1). This
+ // makes our prepending operation reversible.
+ std::vector<IntegerIndexData> out;
+
+ // Popping 2 data should never fail because we've just ensured that the
+ // posting list is in the FULL state.
+ ICING_RETURN_IF_ERROR(
+ GetDataInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out));
+
+    // PrependData should never fail because:
+    // - out[1] is a valid data less than all data remaining in the posting
+    //   list.
+    // - There's no way that the posting list could run out of room, because
+    //   it previously stored these 2 data.
+ ICING_RETURN_IF_ERROR(PrependData(posting_list_used, out[1]));
+ } else if (num_data > 0) {
+ return GetDataInternal(posting_list_used, /*limit=*/num_data, /*pop=*/true,
+ /*out=*/nullptr);
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status PostingListIntegerIndexSerializer::GetDataInternal(
+ const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
+ std::vector<IntegerIndexData>* out) const {
+ // TODO(b/259743562): [Optimization 2] handle compressed data
+
+ uint32_t offset = GetStartByteOffset(posting_list_used);
+ uint32_t count = 0;
+
+ // First traverse the first two special positions.
+ while (count < limit && offset < kSpecialDataSize) {
+ // offset / sizeof(IntegerIndexData) < kNumSpecialData because of the check
+ // above.
+ SpecialDataType special_data =
+ GetSpecialData(posting_list_used,
+ /*index=*/offset / sizeof(IntegerIndexData));
+ if (out != nullptr) {
+ out->push_back(special_data.data());
+ }
+ offset += sizeof(IntegerIndexData);
+ ++count;
+ }
+
+  // - We don't compress the data for now.
+  // - The posting list size is a multiple of the data type's size in bytes.
+  // So the offset of the first non-special data is guaranteed to be exactly
+  // kSpecialDataSize in the ALMOST_FULL or FULL state. In fact, we must not
+  // apply the padding-skipping logic here while we still store uncompressed
+  // data, because in that case 0 bytes are meaningful (e.g. inverted doc id
+  // byte = 0).
+  // TODO(b/259743562): [Optimization 2] handle the padding-skipping logic when
+  // applying data compression.
+
+ while (count < limit && offset < posting_list_used->size_in_bytes()) {
+ IntegerIndexData data;
+ memcpy(&data, posting_list_used->posting_list_buffer() + offset,
+ sizeof(IntegerIndexData));
+ offset += sizeof(IntegerIndexData);
+ if (out != nullptr) {
+ out->push_back(data);
+ }
+ ++count;
+ }
+
+ if (pop) {
+ PostingListUsed* mutable_posting_list_used =
+ const_cast<PostingListUsed*>(posting_list_used);
+ // Modify the posting list so that we pop all data actually traversed.
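+    // Zeroing the traversed region (from the end of the special data up to
+    // the new start offset) clears the popped bytes before the start offset
+    // is advanced past them.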
+ if (offset >= kSpecialDataSize &&
+ offset < posting_list_used->size_in_bytes()) {
+ memset(
+ mutable_posting_list_used->posting_list_buffer() + kSpecialDataSize,
+ 0, offset - kSpecialDataSize);
+ }
+ SetStartByteOffset(mutable_posting_list_used, offset);
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+PostingListIntegerIndexSerializer::SpecialDataType
+PostingListIntegerIndexSerializer::GetSpecialData(
+ const PostingListUsed* posting_list_used, uint32_t index) const {
+  // It is ok to temporarily construct a SpecialData with offset = 0, since
+  // we're going to overwrite it with memcpy.
+ SpecialDataType special_data(0);
+ memcpy(&special_data,
+ posting_list_used->posting_list_buffer() +
+ index * sizeof(SpecialDataType),
+ sizeof(SpecialDataType));
+ return special_data;
+}
+
+void PostingListIntegerIndexSerializer::SetSpecialData(
+ PostingListUsed* posting_list_used, uint32_t index,
+ const SpecialDataType& special_data) const {
+ memcpy(posting_list_used->posting_list_buffer() +
+ index * sizeof(SpecialDataType),
+ &special_data, sizeof(SpecialDataType));
+}
+
+bool PostingListIntegerIndexSerializer::IsPostingListValid(
+ const PostingListUsed* posting_list_used) const {
+ if (IsAlmostFull(posting_list_used)) {
+ // Special data 1 should hold a valid data.
+ if (!GetSpecialData(posting_list_used, /*index=*/1).data().is_valid()) {
+ ICING_LOG(ERROR)
+ << "Both special data cannot be invalid at the same time.";
+ return false;
+ }
+ } else if (!IsFull(posting_list_used)) {
+ // NOT_FULL. Special data 0 should hold a valid offset.
+ SpecialDataType special_data =
+ GetSpecialData(posting_list_used, /*index=*/0);
+ if (special_data.data_start_offset() > posting_list_used->size_in_bytes() ||
+ special_data.data_start_offset() < kSpecialDataSize) {
+ ICING_LOG(ERROR) << "Offset: " << special_data.data_start_offset()
+ << " size: " << posting_list_used->size_in_bytes()
+ << " sp size: " << kSpecialDataSize;
+ return false;
+ }
+ }
+ return true;
+}
+
+uint32_t PostingListIntegerIndexSerializer::GetStartByteOffset(
+ const PostingListUsed* posting_list_used) const {
+ if (IsFull(posting_list_used)) {
+ return 0;
+ } else if (IsAlmostFull(posting_list_used)) {
+ return sizeof(IntegerIndexData);
+ } else {
+ return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset();
+ }
+}
+
+bool PostingListIntegerIndexSerializer::SetStartByteOffset(
+ PostingListUsed* posting_list_used, uint32_t offset) const {
+ if (offset > posting_list_used->size_in_bytes()) {
+ ICING_LOG(ERROR) << "offset cannot be a value greater than size "
+ << posting_list_used->size_in_bytes() << ". offset is "
+ << offset << ".";
+ return false;
+ }
+ if (offset < kSpecialDataSize && offset > sizeof(IntegerIndexData)) {
+ ICING_LOG(ERROR) << "offset cannot be a value between ("
+ << sizeof(IntegerIndexData) << ", " << kSpecialDataSize
+ << "). offset is " << offset << ".";
+ return false;
+ }
+ if (offset < sizeof(IntegerIndexData) && offset != 0) {
+ ICING_LOG(ERROR) << "offset cannot be a value between (0, "
+ << sizeof(IntegerIndexData) << "). offset is " << offset
+ << ".";
+ return false;
+ }
+
+ if (offset >= kSpecialDataSize) {
+ // NOT_FULL state.
+ SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(offset));
+ SetSpecialData(posting_list_used, /*index=*/1,
+ SpecialDataType(IntegerIndexData()));
+ } else if (offset == sizeof(IntegerIndexData)) {
+ // ALMOST_FULL state.
+ SetSpecialData(posting_list_used, /*index=*/0,
+ SpecialDataType(IntegerIndexData()));
+ }
+  // Nothing to do for the FULL state - the offset isn't actually stored
+  // anywhere and both special data hold valid data.
+ return true;
+}
+
+libtextclassifier3::StatusOr<uint32_t>
+PostingListIntegerIndexSerializer::PrependDataUncompressed(
+ PostingListUsed* posting_list_used, const IntegerIndexData& data,
+ uint32_t offset) const {
+ if (offset < kSpecialDataSize + sizeof(IntegerIndexData)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Not enough room to prepend IntegerIndexData at offset %d.", offset));
+ }
+ offset -= sizeof(IntegerIndexData);
+ memcpy(posting_list_used->posting_list_buffer() + offset, &data,
+ sizeof(IntegerIndexData));
+ return offset;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/numeric/posting-list-integer-index-serializer.h b/icing/index/numeric/posting-list-integer-index-serializer.h
new file mode 100644
index 0000000..cbaed33
--- /dev/null
+++ b/icing/index/numeric/posting-list-integer-index-serializer.h
@@ -0,0 +1,338 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_SERIALIZER_H_
+#define ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_SERIALIZER_H_
+
+#include <cstdint>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/numeric/integer-index-data.h"
+
+namespace icing {
+namespace lib {
+
+// A serializer class to serialize IntegerIndexData to PostingListUsed.
+class PostingListIntegerIndexSerializer : public PostingListSerializer {
+ public:
+ using SpecialDataType = SpecialData<IntegerIndexData>;
+ static_assert(sizeof(SpecialDataType) == sizeof(IntegerIndexData), "");
+
+ static constexpr uint32_t kSpecialDataSize =
+ kNumSpecialData * sizeof(SpecialDataType);
+
+ uint32_t GetDataTypeBytes() const override {
+ return sizeof(IntegerIndexData);
+ }
+
+ uint32_t GetMinPostingListSize() const override {
+ static constexpr uint32_t kMinPostingListSize = kSpecialDataSize;
+ static_assert(sizeof(PostingListIndex) <= kMinPostingListSize,
+ "PostingListIndex must be small enough to fit in a "
+ "minimum-sized Posting List.");
+
+ return kMinPostingListSize;
+ }
+
+ uint32_t GetMinPostingListSizeToFit(
+ const PostingListUsed* posting_list_used) const override;
+
+ uint32_t GetBytesUsed(
+ const PostingListUsed* posting_list_used) const override;
+
+ void Clear(PostingListUsed* posting_list_used) const override;
+
+ libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
+ PostingListUsed* src) const override;
+
+ // Prepend an IntegerIndexData to the posting list.
+ //
+ // RETURNS:
+ // - INVALID_ARGUMENT if !data.is_valid() or if data is not less than the
+ // previously added data.
+ // - RESOURCE_EXHAUSTED if there is no more room to add data to the posting
+ // list.
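+  //
+  // Usage sketch (hypothetical values; assumes a pl_used created with
+  // PostingListUsed::CreateFromUnitializedRegion, as in the unit tests):
+  //   PostingListIntegerIndexSerializer serializer;
+  //   IntegerIndexData data(/*section_id=*/0, /*document_id=*/0, /*key=*/2);
+  //   ICING_RETURN_IF_ERROR(serializer.PrependData(&pl_used, data));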
+ libtextclassifier3::Status PrependData(PostingListUsed* posting_list_used,
+ const IntegerIndexData& data) const;
+
+  // Prepend multiple IntegerIndexData to the posting list. Data should be
+  // sorted in ascending order (as defined by the less-than operator for
+  // IntegerIndexData).
+ // If keep_prepended is true, whatever could be prepended is kept, otherwise
+ // the posting list is reverted and left in its original state.
+ //
+ // RETURNS:
+  //   The number of data that have been prepended to the posting list. If
+  //   keep_prepended is false and the posting list was reverted, then it
+  //   returns 0.
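+  //
+  // Usage sketch (hypothetical values; serializer and pl_used as above):
+  //   std::vector<IntegerIndexData> batch = {
+  //       IntegerIndexData(/*section_id=*/0, /*document_id=*/0, /*key=*/1),
+  //       IntegerIndexData(/*section_id=*/0, /*document_id=*/1, /*key=*/3)};
+  //   ICING_ASSIGN_OR_RETURN(
+  //       uint32_t num_prepended,
+  //       serializer.PrependDataArray(&pl_used, batch.data(), batch.size(),
+  //                                   /*keep_prepended=*/false));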
+ libtextclassifier3::StatusOr<uint32_t> PrependDataArray(
+ PostingListUsed* posting_list_used, const IntegerIndexData* array,
+ uint32_t num_data, bool keep_prepended) const;
+
+ // Retrieves all data stored in the posting list.
+ //
+ // RETURNS:
+ // - On success, a vector of IntegerIndexData sorted by the reverse order of
+ // prepending.
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
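+  //
+  // Usage sketch (hypothetical; serializer and pl_used as above):
+  //   ICING_ASSIGN_OR_RETURN(std::vector<IntegerIndexData> all_data,
+  //                          serializer.GetData(&pl_used));
+  //   // all_data.front() is the most recently prepended data.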
+ libtextclassifier3::StatusOr<std::vector<IntegerIndexData>> GetData(
+ const PostingListUsed* posting_list_used) const;
+
+ // Same as GetData but appends data to data_arr_out.
+ //
+ // RETURNS:
+  //   - OK on success, and the IntegerIndexData, sorted in the reverse order
+  //       of prepending, will be appended to data_arr_out.
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::Status GetData(
+ const PostingListUsed* posting_list_used,
+ std::vector<IntegerIndexData>* data_arr_out) const;
+
+  // Undo the last num_data prepended data. If num_data is greater than the
+  // number of data stored, then we clear all data.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
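+  //
+  // Usage sketch (hypothetical; undoes only the most recent prepend):
+  //   ICING_RETURN_IF_ERROR(serializer.PopFrontData(&pl_used, /*num_data=*/1));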
+ libtextclassifier3::Status PopFrontData(PostingListUsed* posting_list_used,
+ uint32_t num_data) const;
+
+  // Helper function to determine if the posting list is full.
+ bool IsFull(const PostingListUsed* posting_list_used) const {
+ return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
+ GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
+ }
+
+ private:
+ // Posting list layout formats:
+ //
+ // NOT_FULL
+ // +-special-data-0--+-special-data-1--+------------+-----------------------+
+ // | | | | |
+ // |data-start-offset| Data::Invalid | 0x00000000 | (compressed) data |
+ // | | | | |
+ // +-----------------+-----------------+------------+-----------------------+
+ //
+ // ALMOST_FULL
+ // +-special-data-0--+-special-data-1--+-----+------------------------------+
+ // | | | | |
+ // | Data::Invalid | 1st data |(pad)| (compressed) data |
+ // | | | | |
+ // +-----------------+-----------------+-----+------------------------------+
+ //
+ // FULL
+ // +-special-data-0--+-special-data-1--+-----+------------------------------+
+ // | | | | |
+ // | 1st data | 2nd data |(pad)| (compressed) data |
+ // | | | | |
+ // +-----------------+-----------------+-----+------------------------------+
+ //
+ // The first two uncompressed (special) data also implicitly encode
+ // information about the size of the compressed data region.
+ //
+ // 1. If the posting list is NOT_FULL, then special_data_0 contains the byte
+ // offset of the start of the compressed data. Thus, the size of the
+ // compressed data is
+ // posting_list_used->size_in_bytes() - special_data_0.data_start_offset().
+ //
+ // 2. If posting list is ALMOST_FULL or FULL, then the compressed data region
+ // starts somewhere between
+ // [kSpecialDataSize, kSpecialDataSize + sizeof(IntegerIndexData) - 1] and
+ // ends at posting_list_used->size_in_bytes() - 1.
+ //
+ // EXAMPLE
+ // Posting list storage. Posting list size: 36 bytes
+ //
+ // EMPTY!
+ // +--- byte 0-11 ---+----- 12-23 -----+-------------- 24-35 ---------------+
+ // | | | |
+ // | 36 | Data::Invalid | 0x00000000 |
+ // | | | |
+ // +-----------------+-----------------+------------------------------------+
+ //
+ // Add IntegerIndexData(0x0FFFFCC3, 5)
+ // (DocumentId = 12, SectionId = 3; Key = 5)
+ // (VarInt64(5) is encoded as 10 (b'1010), requires 1 byte)
+ // NOT FULL!
+ // +--- byte 0-11 ---+----- 12-23 -----+------- 24-30 -------+--- 31-35 ----+
+ // | | | | 0x0FFFFCC3 |
+ // | 31 | Data::Invalid | 0x00000000 | VI64(5) |
+ // | | | | |
+ // +-----------------+-----------------+---------------------+--------------+
+ //
+ // Add IntegerIndexData(0x0FFFFB40, -2)
+ // (DocumentId = 18, SectionId = 0; Key = -2)
+ // (VarInt64(-2) is encoded as 3 (b'11), requires 1 byte)
+ // Previous IntegerIndexData BasicHit delta varint encoding:
+ // 0x0FFFFCC3 - 0x0FFFFB40 = 387, VarUnsignedInt(387) requires 2 bytes
+ // +--- byte 0-11 ---+----- 12-23 -----+-- 24-27 ---+--- 28-32 ----+ 33-35 -+
+ // | | | | 0x0FFFFB40 |VUI(387)|
+ // | 28 | Data::Invalid | 0x00 | VI64(-2) |VI64(5) |
+ // | | | | | |
+ // +-----------------+-----------------+------------+--------------+--------+
+ //
+ // Add IntegerIndexData(0x0FFFFA4A, 3)
+ // (DocumentId = 22, SectionId = 10; Key = 3)
+ // (VarInt64(3) is encoded as 6 (b'110), requires 1 byte)
+ // Previous IntegerIndexData BasicHit delta varint encoding:
+ // 0x0FFFFB40 - 0x0FFFFA4A = 246, VarUnsignedInt(246) requires 2 bytes
+ // +--- byte 0-11 ---+----- 12-23 -----+---+--- 25-29 ----+ 30-32 -+ 33-35 -+
+ // | | | | 0x0FFFFA4A |VUI(246)|VUI(387)|
+ // | 25 | Data::Invalid | | VI64(3) |VI64(-2)|VI64(5) |
+ // | | | | | | |
+ // +-----------------+-----------------+---+--------------+--------+--------+
+ //
+ // Add IntegerIndexData(0x0FFFFA01, -4)
+ // (DocumentId = 23, SectionId = 1; Key = -4)
+ // (No VarInt64 for key, since it is stored in special data section)
+ // Previous IntegerIndexData BasicHit delta varint encoding:
+  // 0x0FFFFA4A - 0x0FFFFA01 = 73, VarUnsignedInt(73) requires 1 byte
+ // ALMOST_FULL!
+ // +--- byte 0-11 ---+----- 12-23 -----+-- 24-27 ---+28-29+ 30-32 -+ 33-35 -+
+ // | | 0x0FFFFA01 | |(73) |VUI(246)|VUI(387)|
+ // | Data::Invalid | 0xFFFFFFFF | (pad) |(3) |VI64(-2)|VI64(5) |
+ // | | 0xFFFFFFFC | | | | |
+ // +-----------------+-----------------+------------+-----+--------+--------+
+ //
+ // Add IntegerIndexData(0x0FFFF904, 0)
+ // (DocumentId = 27, SectionId = 4; Key = 0)
+ // (No VarInt64 for key, since it is stored in special data section)
+ // Previous IntegerIndexData:
+ // Since 0x0FFFFA01 - 0x0FFFF904 = 253 and VarInt64(-4) is encoded as 7
+ // (b'111), it requires only 3 bytes after compression. It's able to fit
+ // into the padding section.
+ // Still ALMOST_FULL!
+ // +--- byte 0-11 ---+----- 12-23 -----+---+ 25-27 -+28-29+ 30-32 -+ 33-35 -+
+ // | | 0x0FFFF904 | |VUI(253)|(73) |VUI(246)|VUI(387)|
+ // | Data::Invalid | 0x00000000 | |VI64(-4)|(3) |VI64(-2)|VI64(5) |
+ // | | 0x00000000 | | | | | |
+ // +-----------------+-----------------+---+--------+-----+--------+--------+
+ //
+ // Add IntegerIndexData(0x0FFFF8C3, -1)
+ // (DocumentId = 28, SectionId = 3; Key = -1)
+ // (No VarInt64 for key, since it is stored in special data section)
+ // (No VarUnsignedInt for previous IntegerIndexData BasicHit)
+ // FULL!
+ // +--- byte 0-11 ---+----- 12-23 -----+---+ 25-27 -+28-29+ 30-32 -+ 33-35 -+
+ // | 0x0FFFF8C3 | 0x0FFFF904 | |VUI(253)|(73) |VUI(246)|VUI(387)|
+ // | 0xFFFFFFFF | 0x00000000 | |VI64(-4)|(3) |VI64(-2)|VI64(5) |
+ // | 0xFFFFFFFF | 0x00000000 | | | | | |
+ // +-----------------+-----------------+---+--------+-----+--------+--------+
+
+ // Helpers to determine what state the posting list is in.
+ bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
+ return !GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
+ GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
+ }
+
+ bool IsEmpty(const PostingListUsed* posting_list_used) const {
+ return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset() ==
+ posting_list_used->size_in_bytes() &&
+ !GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
+ }
+
+  // Returns false if both special data are invalid, or if the data start
+  // offset stored in the special data is less than kSpecialDataSize or
+  // greater than posting_list_used->size_in_bytes(). Returns true otherwise.
+ bool IsPostingListValid(const PostingListUsed* posting_list_used) const;
+
+ // Prepend data to a posting list that is in the ALMOST_FULL state.
+ //
+ // RETURNS:
+ // - OK, if successful
+ // - INVALID_ARGUMENT if data is not less than the previously added data.
+ libtextclassifier3::Status PrependDataToAlmostFull(
+ PostingListUsed* posting_list_used, const IntegerIndexData& data) const;
+
+ // Prepend data to a posting list that is in the EMPTY state. This will always
+ // succeed because there are no pre-existing data and no validly constructed
+ // posting list could fail to fit one data.
+ void PrependDataToEmpty(PostingListUsed* posting_list_used,
+ const IntegerIndexData& data) const;
+
+ // Prepend data to a posting list that is in the NOT_FULL state.
+ //
+ // RETURNS:
+ // - OK, if successful
+ // - INVALID_ARGUMENT if data is not less than the previously added data.
+ libtextclassifier3::Status PrependDataToNotFull(
+ PostingListUsed* posting_list_used, const IntegerIndexData& data,
+ uint32_t offset) const;
+
+ // Returns either 0 (FULL state), sizeof(IntegerIndexData) (ALMOST_FULL state)
+ // or a byte offset between kSpecialDataSize and
+ // posting_list_used->size_in_bytes() (inclusive) (NOT_FULL state).
+ uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const;
+
+  // Sets special data 0 to properly reflect the start byte offset (see the
+  // layout comment for further details).
+  //
+  // Returns false if offset > posting_list_used->size_in_bytes() or if offset
+  // is in the range (sizeof(IntegerIndexData), kSpecialDataSize) or
+  // (0, sizeof(IntegerIndexData)). Returns true otherwise.
+ bool SetStartByteOffset(PostingListUsed* posting_list_used,
+ uint32_t offset) const;
+
+  // Helper for MoveFrom/GetData/PopFrontData. Adds limit data to out, or all
+  // data in the posting list if it contains fewer than limit data. out may be
+  // nullptr.
+ //
+ // NOTE: If called with limit=1, pop=true on a posting list that transitioned
+ // from NOT_FULL directly to FULL, GetDataInternal will not return the posting
+ // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
+ // ALMOST_FULL.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::Status GetDataInternal(
+ const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
+ std::vector<IntegerIndexData>* out) const;
+
+ // Retrieves the value stored in the index-th special data.
+ //
+ // REQUIRES:
+ // 0 <= index < kNumSpecialData.
+ //
+ // RETURNS:
+ // - A valid SpecialData<IntegerIndexData>.
+ SpecialDataType GetSpecialData(const PostingListUsed* posting_list_used,
+ uint32_t index) const;
+
+ // Sets the value stored in the index-th special data to special_data.
+ //
+ // REQUIRES:
+ // 0 <= index < kNumSpecialData.
+ void SetSpecialData(PostingListUsed* posting_list_used, uint32_t index,
+ const SpecialDataType& special_data) const;
+
+ // Prepends data to the memory region [offset - sizeof(IntegerIndexData),
+ // offset - 1] and returns the new beginning of the region.
+ //
+ // RETURNS:
+ // - The new beginning of the padded region, if successful.
+  //   - INVALID_ARGUMENT if data will not fit (uncompressed) within
+  //     [kSpecialDataSize, offset - 1].
+ libtextclassifier3::StatusOr<uint32_t> PrependDataUncompressed(
+ PostingListUsed* posting_list_used, const IntegerIndexData& data,
+ uint32_t offset) const;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_NUMERIC_POSTING_LIST_INTEGER_INDEX_SERIALIZER_H_
diff --git a/icing/index/numeric/posting-list-integer-index-serializer_test.cc b/icing/index/numeric/posting-list-integer-index-serializer_test.cc
new file mode 100644
index 0000000..716d1aa
--- /dev/null
+++ b/icing/index/numeric/posting-list-integer-index-serializer_test.cc
@@ -0,0 +1,491 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/numeric/posting-list-integer-index-serializer.h"
+
+#include <memory>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/index/numeric/integer-index-data.h"
+#include "icing/testing/common-matchers.h"
+
+using testing::ElementsAre;
+using testing::ElementsAreArray;
+using testing::Eq;
+using testing::IsEmpty;
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// TODO(b/259743562): [Optimization 2] update unit tests after applying
+// compression. Remember to create varint/delta encoding
+// overflow (which causes state NOT_FULL -> FULL directly
+// without ALMOST_FULL) test cases, including for
+// PopFrontData.
+
+TEST(PostingListIntegerIndexSerializerTest, GetMinPostingListSizeToFitNotNull) {
+ PostingListIntegerIndexSerializer serializer;
+
+ int size = 2551 * sizeof(IntegerIndexData);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ ASSERT_THAT(serializer.PrependData(
+ &pl_used, IntegerIndexData(/*section_id=*/0,
+ /*document_id=*/0, /*key=*/2)),
+ IsOk());
+ EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used),
+ Eq(2 * sizeof(IntegerIndexData)));
+
+ ASSERT_THAT(serializer.PrependData(
+ &pl_used, IntegerIndexData(/*section_id=*/0,
+ /*document_id=*/1, /*key=*/5)),
+ IsOk());
+ EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used),
+ Eq(3 * sizeof(IntegerIndexData)));
+}
+
+TEST(PostingListIntegerIndexSerializerTest,
+ GetMinPostingListSizeToFitAlmostFull) {
+ PostingListIntegerIndexSerializer serializer;
+
+ int size = 3 * sizeof(IntegerIndexData);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ ASSERT_THAT(serializer.PrependData(
+ &pl_used, IntegerIndexData(/*section_id=*/0,
+ /*document_id=*/0, /*key=*/2)),
+ IsOk());
+ ASSERT_THAT(serializer.PrependData(
+ &pl_used, IntegerIndexData(/*section_id=*/0,
+ /*document_id=*/1, /*key=*/5)),
+ IsOk());
+ EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used), Eq(size));
+}
+
+TEST(PostingListIntegerIndexSerializerTest, GetMinPostingListSizeToFitFull) {
+ PostingListIntegerIndexSerializer serializer;
+
+ int size = 3 * sizeof(IntegerIndexData);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ ASSERT_THAT(serializer.PrependData(
+ &pl_used, IntegerIndexData(/*section_id=*/0,
+ /*document_id=*/0, /*key=*/2)),
+ IsOk());
+ ASSERT_THAT(serializer.PrependData(
+ &pl_used, IntegerIndexData(/*section_id=*/0,
+ /*document_id=*/1, /*key=*/5)),
+ IsOk());
+ ASSERT_THAT(serializer.PrependData(
+ &pl_used, IntegerIndexData(/*section_id=*/0,
+ /*document_id=*/2, /*key=*/0)),
+ IsOk());
+ EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used), Eq(size));
+}
+
+TEST(PostingListIntegerIndexSerializerTest, PrependDataNotFull) {
+ PostingListIntegerIndexSerializer serializer;
+
+ int size = 2551 * sizeof(IntegerIndexData);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ // Make used.
+ IntegerIndexData data0(/*section_id=*/0, /*document_id=*/0, /*key=*/2);
+ EXPECT_THAT(serializer.PrependData(&pl_used, data0), IsOk());
+ // Size = sizeof(uncompressed data0)
+ int expected_size = sizeof(IntegerIndexData);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used), IsOkAndHolds(ElementsAre(data0)));
+
+ IntegerIndexData data1(/*section_id=*/0, /*document_id=*/1, /*key=*/5);
+ EXPECT_THAT(serializer.PrependData(&pl_used, data1), IsOk());
+ // Size = sizeof(uncompressed data1)
+ // + sizeof(uncompressed data0)
+ expected_size += sizeof(IntegerIndexData);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAre(data1, data0)));
+
+ IntegerIndexData data2(/*section_id=*/0, /*document_id=*/2, /*key=*/0);
+ EXPECT_THAT(serializer.PrependData(&pl_used, data2), IsOk());
+ // Size = sizeof(uncompressed data2)
+ // + sizeof(uncompressed data1)
+ // + sizeof(uncompressed data0)
+ expected_size += sizeof(IntegerIndexData);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAre(data2, data1, data0)));
+}
+
+TEST(PostingListIntegerIndexSerializerTest, PrependDataAlmostFull) {
+ PostingListIntegerIndexSerializer serializer;
+
+ int size = 4 * sizeof(IntegerIndexData);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ // Fill up the compressed region.
+ // Transitions:
+ // Adding data0: EMPTY -> NOT_FULL
+ // Adding data1: NOT_FULL -> NOT_FULL
+ IntegerIndexData data0(/*section_id=*/0, /*document_id=*/0, /*key=*/2);
+ IntegerIndexData data1(/*section_id=*/0, /*document_id=*/1, /*key=*/5);
+ EXPECT_THAT(serializer.PrependData(&pl_used, data0), IsOk());
+ EXPECT_THAT(serializer.PrependData(&pl_used, data1), IsOk());
+ int expected_size = 2 * sizeof(IntegerIndexData);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAre(data1, data0)));
+
+ // Add one more data to transition NOT_FULL -> ALMOST_FULL
+ IntegerIndexData data2(/*section_id=*/0, /*document_id=*/2, /*key=*/0);
+ EXPECT_THAT(serializer.PrependData(&pl_used, data2), IsOk());
+ expected_size = 3 * sizeof(IntegerIndexData);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAre(data2, data1, data0)));
+
+ // Add one more data to transition ALMOST_FULL -> FULL
+ IntegerIndexData data3(/*section_id=*/0, /*document_id=*/3, /*key=*/-3);
+ EXPECT_THAT(serializer.PrependData(&pl_used, data3), IsOk());
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(size));
+ EXPECT_THAT(serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAre(data3, data2, data1, data0)));
+
+ // The posting list is FULL. Adding another data should fail.
+ IntegerIndexData data4(/*section_id=*/0, /*document_id=*/4, /*key=*/100);
+ EXPECT_THAT(serializer.PrependData(&pl_used, data4),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST(PostingListIntegerIndexSerializerTest, PrependDataPostingListUsedMinSize) {
+ PostingListIntegerIndexSerializer serializer;
+
+ int size = serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ // PL State: EMPTY
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0));
+ EXPECT_THAT(serializer.GetData(&pl_used), IsOkAndHolds(IsEmpty()));
+
+ // Add a data. PL should shift to ALMOST_FULL state
+ IntegerIndexData data0(/*section_id=*/0, /*document_id=*/0, /*key=*/2);
+ EXPECT_THAT(serializer.PrependData(&pl_used, data0), IsOk());
+ // Size = sizeof(uncompressed data0)
+ int expected_size = sizeof(IntegerIndexData);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used), IsOkAndHolds(ElementsAre(data0)));
+
+ // Add another data. PL should shift to FULL state.
+ IntegerIndexData data1(/*section_id=*/0, /*document_id=*/1, /*key=*/5);
+ EXPECT_THAT(serializer.PrependData(&pl_used, data1), IsOk());
+ // Size = sizeof(uncompressed data1) + sizeof(uncompressed data0)
+ expected_size += sizeof(IntegerIndexData);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAre(data1, data0)));
+
+ // The posting list is FULL. Adding another data should fail.
+ IntegerIndexData data2(/*section_id=*/0, /*document_id=*/2, /*key=*/0);
+ EXPECT_THAT(serializer.PrependData(&pl_used, data2),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST(PostingListIntegerIndexSerializerTest,
+ PrependDataArrayDoNotKeepPrepended) {
+ PostingListIntegerIndexSerializer serializer;
+
+ int size = 6 * sizeof(IntegerIndexData);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ std::vector<IntegerIndexData> data_in;
+ std::vector<IntegerIndexData> data_pushed;
+
+ // Add 3 data. The PL is in the empty state and should be able to fit all 3
+ // data without issue, transitioning the PL from EMPTY -> NOT_FULL.
+ data_in.push_back(
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/0, /*key=*/2));
+ data_in.push_back(
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/1, /*key=*/5));
+ data_in.push_back(
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/2, /*key=*/0));
+ EXPECT_THAT(
+ serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_in.size()));
+ std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used),
+ Eq(data_pushed.size() * sizeof(IntegerIndexData)));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend())));
+
+ // Add 2 data. The PL should transition from NOT_FULL to ALMOST_FULL.
+ data_in.clear();
+ data_in.push_back(
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/3, /*key=*/-3));
+ data_in.push_back(
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/4, /*key=*/100));
+ EXPECT_THAT(
+ serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_in.size()));
+ std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used),
+ Eq(data_pushed.size() * sizeof(IntegerIndexData)));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend())));
+
+ // Add 2 data. The PL should remain ALMOST_FULL since the remaining space can
+ // only fit 1 data.
+ data_in.clear();
+ data_in.push_back(
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/5, /*key=*/-200));
+ data_in.push_back(IntegerIndexData(/*section_id=*/0, /*document_id=*/6,
+ /*key=*/2147483647));
+ EXPECT_THAT(
+ serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(0));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used),
+ Eq(data_pushed.size() * sizeof(IntegerIndexData)));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend())));
+
+ // Add 1 data. The PL should transition from ALMOST_FULL to FULL.
+ data_in.resize(1);
+ EXPECT_THAT(
+ serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_in.size()));
+ std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used),
+ Eq(data_pushed.size() * sizeof(IntegerIndexData)));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend())));
+}
+
+TEST(PostingListIntegerIndexSerializerTest, PrependDataArrayKeepPrepended) {
+ PostingListIntegerIndexSerializer serializer;
+
+ int size = 6 * sizeof(IntegerIndexData);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ std::vector<IntegerIndexData> data_in;
+ std::vector<IntegerIndexData> data_pushed;
+
+ // Add 3 data. The PL is in the empty state and should be able to fit all 3
+ // data without issue, transitioning the PL from EMPTY -> NOT_FULL.
+ data_in.push_back(
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/0, /*key=*/2));
+ data_in.push_back(
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/1, /*key=*/5));
+ data_in.push_back(
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/2, /*key=*/0));
+ EXPECT_THAT(
+ serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(),
+ /*keep_prepended=*/true),
+ IsOkAndHolds(data_in.size()));
+ std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used),
+ Eq(data_pushed.size() * sizeof(IntegerIndexData)));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend())));
+
+ // Add 4 data. The PL should prepend 3 data and transition from NOT_FULL to
+ // FULL.
+ data_in.clear();
+ data_in.push_back(
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/3, /*key=*/-3));
+ data_in.push_back(
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/4, /*key=*/100));
+ data_in.push_back(
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/5, /*key=*/-200));
+ data_in.push_back(IntegerIndexData(/*section_id=*/0, /*document_id=*/6,
+ /*key=*/2147483647));
+ EXPECT_THAT(
+ serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(),
+ /*keep_prepended=*/true),
+ IsOkAndHolds(3));
+ data_in.resize(3);
+ std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used),
+ Eq(data_pushed.size() * sizeof(IntegerIndexData)));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend())));
+}
+
+TEST(PostingListIntegerIndexSerializerTest, MoveFrom) {
+ PostingListIntegerIndexSerializer serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ std::vector<IntegerIndexData> data_arr1 = {
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/0, /*key=*/2),
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/1, /*key=*/5)};
+ ASSERT_THAT(
+ serializer.PrependDataArray(&pl_used1, data_arr1.data(), data_arr1.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_arr1.size()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ std::vector<IntegerIndexData> data_arr2 = {
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/2, /*key=*/0),
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/3, /*key=*/-3),
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/4, /*key=*/100),
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/5, /*key=*/-200)};
+ ASSERT_THAT(
+ serializer.PrependDataArray(&pl_used2, data_arr2.data(), data_arr2.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_arr2.size()));
+
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1),
+ IsOk());
+ EXPECT_THAT(
+ serializer.GetData(&pl_used2),
+ IsOkAndHolds(ElementsAreArray(data_arr1.rbegin(), data_arr1.rend())));
+ EXPECT_THAT(serializer.GetData(&pl_used1), IsOkAndHolds(IsEmpty()));
+}
+
+TEST(PostingListIntegerIndexSerializerTest,
+ MoveToNullReturnsFailedPrecondition) {
+ PostingListIntegerIndexSerializer serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ std::vector<IntegerIndexData> data_arr = {
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/0, /*key=*/2),
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/1, /*key=*/5)};
+ ASSERT_THAT(
+ serializer.PrependDataArray(&pl_used, data_arr.data(), data_arr.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_arr.size()));
+
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used, /*src=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend())));
+
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/nullptr, /*src=*/&pl_used),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend())));
+}
+
+TEST(PostingListIntegerIndexSerializerTest, MoveToPostingListTooSmall) {
+ PostingListIntegerIndexSerializer serializer;
+
+ int size1 = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size1));
+ std::vector<IntegerIndexData> data_arr1 = {
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/0, /*key=*/2),
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/1, /*key=*/5),
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/2, /*key=*/0),
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/3, /*key=*/-3),
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/4, /*key=*/100)};
+ ASSERT_THAT(
+ serializer.PrependDataArray(&pl_used1, data_arr1.data(), data_arr1.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_arr1.size()));
+
+ int size2 = serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size2));
+ std::vector<IntegerIndexData> data_arr2 = {
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/5, /*key=*/-200)};
+ ASSERT_THAT(
+ serializer.PrependDataArray(&pl_used2, data_arr2.data(), data_arr2.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_arr2.size()));
+
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used1),
+ IsOkAndHolds(ElementsAreArray(data_arr1.rbegin(), data_arr1.rend())));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used2),
+ IsOkAndHolds(ElementsAreArray(data_arr2.rbegin(), data_arr2.rend())));
+}
+
+TEST(PostingListIntegerIndexSerializerTest, PopFrontData) {
+ PostingListIntegerIndexSerializer serializer;
+
+ int size = 2 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ std::vector<IntegerIndexData> data_arr = {
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/0, /*key=*/2),
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/1, /*key=*/5),
+ IntegerIndexData(/*section_id=*/0, /*document_id=*/2, /*key=*/0)};
+ ASSERT_THAT(
+ serializer.PrependDataArray(&pl_used, data_arr.data(), data_arr.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_arr.size()));
+ ASSERT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend())));
+
+  // Now, pop the most recently prepended data. The posting list should
+  // contain the first two data.
+ EXPECT_THAT(serializer.PopFrontData(&pl_used, /*num_data=*/1), IsOk());
+ data_arr.pop_back();
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend())));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/posting-list-used.cc b/icing/index/posting-list-used.cc
deleted file mode 100644
index 708b13b..0000000
--- a/icing/index/posting-list-used.cc
+++ /dev/null
@@ -1,613 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/index/posting-list-used.h"
-
-#include <algorithm>
-#include <cinttypes>
-#include <cstdint>
-#include <limits>
-
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/index/posting-list-utils.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/legacy/index/icing-bit-util.h"
-#include "icing/util/status-macros.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-
-uint32_t GetScoreByteSize(const Hit &hit) {
- return hit.has_score() ? sizeof(Hit::Score) : 0;
-}
-
-} // namespace
-
-libtextclassifier3::StatusOr<PostingListUsed>
-PostingListUsed::CreateFromPreexistingPostingListUsedRegion(
- void *posting_list_buffer, uint32_t size_in_bytes) {
- ICING_RETURN_ERROR_IF_NULL(posting_list_buffer);
- if (!posting_list_utils::IsValidPostingListSize(size_in_bytes)) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Requested posting list size %d is invalid!", size_in_bytes));
- }
- return PostingListUsed(posting_list_buffer, size_in_bytes);
-}
-
-libtextclassifier3::StatusOr<PostingListUsed>
-PostingListUsed::CreateFromUnitializedRegion(void *posting_list_buffer,
- uint32_t size_in_bytes) {
- ICING_ASSIGN_OR_RETURN(PostingListUsed posting_list_used,
- CreateFromPreexistingPostingListUsedRegion(
- posting_list_buffer, size_in_bytes));
- posting_list_used.Clear();
- return posting_list_used;
-}
-
-void PostingListUsed::Clear() { set_start_byte_offset(size_in_bytes_); }
-
-libtextclassifier3::Status PostingListUsed::MoveFrom(PostingListUsed *other) {
- ICING_RETURN_ERROR_IF_NULL(other);
- if (other->MinPostingListSizeToFit() > size_in_bytes_) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "other->MinPostingListSizeToFit %d must be larger than size %d.",
- other->MinPostingListSizeToFit(), size_in_bytes_));
- }
-
- if (!IsPostingListValid()) {
- return absl_ports::FailedPreconditionError(
- "This posting list is in an invalid state and can't be used!");
- }
- if (other->IsPostingListValid()) {
- return absl_ports::InvalidArgumentError(
- "Cannot MoveFrom an invalid posting list!");
- }
-
- // Pop just enough hits that all of other's compressed hits fit in
- // this posting_list's compressed area. Then we can memcpy that area.
- std::vector<Hit> hits;
- while (other->full() || other->almost_full() ||
- (size_in_bytes_ - posting_list_utils::kSpecialHitsSize <
- other->BytesUsed())) {
- if (other->GetHitsInternal(/*limit=*/1, /*pop=*/true, &hits) != 1) {
- return absl_ports::AbortedError(
- "Unable to retrieve hits from other posting list.");
- }
- }
-
- // memcpy the area and set up start byte offset.
- Clear();
- memcpy(posting_list_buffer_ + size_in_bytes_ - other->BytesUsed(),
- other->posting_list_buffer_ + other->get_start_byte_offset(),
- other->BytesUsed());
- // Because we popped all hits from other outside of the compressed area and we
- // guaranteed that other->BytesUsed is less than size_in_bytes_ -
- // kSpecialHitSize. This is guaranteed to be a valid byte offset for the
- // NOT_FULL state.
- set_start_byte_offset(size_in_bytes_ - other->BytesUsed());
-
- // Put back remaining hits.
- for (size_t i = 0; i < hits.size(); i++) {
- const Hit &hit = hits[hits.size() - i - 1];
- // PrependHit can return either INVALID_ARGUMENT - if hit is invalid or not
- // less than the previous hit - or RESOURCE_EXHAUSTED. RESOURCE_EXHAUSTED
- // should be impossible because we've already assured that there is enough
- // room above.
- ICING_RETURN_IF_ERROR(PrependHit(hit));
- }
-
- other->Clear();
- return libtextclassifier3::Status::OK;
-}
-
-uint32_t PostingListUsed::GetPadEnd(uint32_t offset) const {
- Hit::Value pad;
- uint32_t pad_end = offset;
- while (pad_end < size_in_bytes_) {
- size_t pad_len = VarInt::Decode(posting_list_buffer_ + pad_end, &pad);
- if (pad != 0) {
- // No longer a pad.
- break;
- }
- pad_end += pad_len;
- }
- return pad_end;
-}
-
-void PostingListUsed::PadToEnd(uint32_t start, uint32_t end) {
- if (end > size_in_bytes_) {
- ICING_LOG(ERROR) << "Cannot pad a region that ends after size!";
- return;
- }
- // In VarInt a value of 0 encodes to 0.
- memset(posting_list_buffer_ + start, 0, end - start);
-}
-
-libtextclassifier3::Status PostingListUsed::PrependHitToAlmostFull(
- const Hit &hit) {
- // Get delta between first hit and the new hit. Try to fit delta
- // in the padded area and put new hit at the special position 1.
- Hit cur = get_special_hit(1);
- if (cur.value() <= hit.value()) {
- return absl_ports::InvalidArgumentError(
- "Hit being prepended must be strictly less than the most recent Hit");
- }
- uint64_t delta = cur.value() - hit.value();
- uint8_t delta_buf[VarInt::kMaxEncodedLen64];
- size_t delta_len = VarInt::Encode(delta, delta_buf);
- uint32_t cur_score_bytes = GetScoreByteSize(cur);
-
- uint32_t pad_end = GetPadEnd(posting_list_utils::kSpecialHitsSize);
-
- if (pad_end >=
- posting_list_utils::kSpecialHitsSize + delta_len + cur_score_bytes) {
- // Pad area has enough space for delta and score of existing hit
- // (cur). Write delta at pad_end - delta_len - cur_score_bytes.
- uint8_t *delta_offset =
- posting_list_buffer_ + pad_end - delta_len - cur_score_bytes;
- memcpy(delta_offset, delta_buf, delta_len);
- // Now copy score.
- Hit::Score score = cur.score();
- uint8_t *score_offset = delta_offset + delta_len;
- memcpy(score_offset, &score, cur_score_bytes);
-
- // Now first hit is the new hit, at special position 1.
- set_special_hit(1, hit);
- set_start_byte_offset(sizeof(Hit));
- } else {
- // No space for delta. We put the new hit at special position 0
- // and go to the full state.
- set_special_hit(0, hit);
- }
- return libtextclassifier3::Status::OK;
-}
-
-void PostingListUsed::PrependHitToEmpty(const Hit &hit) {
- // First hit to be added. Just add verbatim, no compression.
- if (size_in_bytes_ == posting_list_utils::kSpecialHitsSize) {
- set_special_hit(1, hit);
- set_start_byte_offset(sizeof(Hit));
- } else {
- // Since this is the first hit, size != kSpecialHitsSize and
- // size % sizeof(Hit) == 0, we know that there is room to fit 'hit' into
- // the compressed region.
- uint32_t offset = PrependHitUncompressed(hit, size_in_bytes_);
- set_start_byte_offset(offset);
- }
-}
-
-libtextclassifier3::Status PostingListUsed::PrependHitToNotFull(
- const Hit &hit, uint32_t offset) {
- // First hit in compressed area. It is uncompressed. See if delta
- // between the first hit and new hit will still fit in the
- // compressed area.
- if (offset + sizeof(Hit::Value) > size_in_bytes_) {
- // The first hit in the compressed region *should* be uncompressed, but
- // somehow there isn't enough room between offset and the end of the
- // compressed area to fit an uncompressed hit. This should NEVER happen.
- return absl_ports::FailedPreconditionError(
- "Posting list is in an invalid state.");
- }
- Hit::Value cur_value;
- memcpy(&cur_value, posting_list_buffer_ + offset, sizeof(Hit::Value));
- if (cur_value <= hit.value()) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Hit %d being prepended must be strictly less than the most recent "
- "Hit %d",
- hit.value(), cur_value));
- }
- uint64_t delta = cur_value - hit.value();
- uint8_t delta_buf[VarInt::kMaxEncodedLen64];
- size_t delta_len = VarInt::Encode(delta, delta_buf);
- uint32_t hit_score_bytes = GetScoreByteSize(hit);
-
- // offset now points to one past the end of the first hit.
- offset += sizeof(Hit::Value);
- if (posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value) + delta_len +
- hit_score_bytes <=
- offset) {
- // Enough space for delta in compressed area.
-
- // Prepend delta.
- offset -= delta_len;
- memcpy(posting_list_buffer_ + offset, delta_buf, delta_len);
-
- // Prepend new hit with (possibly) its score. We know that there is room
- // for 'hit' because of the if statement above.
- offset = PrependHitUncompressed(hit, offset);
- // offset is guaranteed to be valid here. The if above will guarantee that
- // offset >= kSpecialHitSize and < size_in_bytes_ because the if ensures
- // that there is enough room between offset and kSpecialHitSize to fit the
- // delta of the previous hit, any score and the uncompressed hit.
- set_start_byte_offset(offset);
- } else if (posting_list_utils::kSpecialHitsSize + delta_len <= offset) {
- // Only have space for delta. The new hit must be put in special
- // position 1.
-
- // Prepend delta.
- offset -= delta_len;
- memcpy(posting_list_buffer_ + offset, delta_buf, delta_len);
-
- // Prepend pad.
- PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
-
- // Put new hit in special position 1.
- set_special_hit(1, hit);
-
- // State almost_full.
- set_start_byte_offset(sizeof(Hit));
- } else {
- // Very rare case where delta is larger than sizeof(Hit::Value)
- // (i.e. varint delta encoding expanded required storage). We
- // move first hit to special position 1 and put new hit in
- // special position 0.
- Hit cur(cur_value);
- if (cur.has_score()) {
- cur = Hit(cur_value, ReadScore(offset));
- offset += sizeof(Hit::Score);
- }
- PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
- set_special_hit(1, cur);
- set_special_hit(0, hit);
- }
- return libtextclassifier3::Status::OK;
-}
-
-libtextclassifier3::Status PostingListUsed::PrependHit(const Hit &hit) {
- static_assert(sizeof(Hit::Value) <= sizeof(uint64_t),
- "Hit::Value cannot be larger than 8 bytes because the delta "
- "must be able to fit in 8 bytes.");
- if (!hit.is_valid()) {
- return absl_ports::InvalidArgumentError("Cannot prepend an invalid hit!");
- }
- if (!IsPostingListValid()) {
- return absl_ports::FailedPreconditionError(
- "This PostingListUsed is in an invalid state and can't add any hits!");
- }
-
- if (full()) {
- // State full: no space left.
- return absl_ports::ResourceExhaustedError("No more room for hits");
- } else if (almost_full()) {
- return PrependHitToAlmostFull(hit);
- } else if (empty()) {
- PrependHitToEmpty(hit);
- return libtextclassifier3::Status::OK;
- } else {
- uint32_t offset = get_start_byte_offset();
- return PrependHitToNotFull(hit, offset);
- }
-}
-
-std::vector<Hit> PostingListUsed::GetHits() const {
- std::vector<Hit> hits_out;
- GetHits(&hits_out);
- return hits_out;
-}
-
-void PostingListUsed::GetHits(std::vector<Hit> *hits_out) const {
- GetHitsInternal(/*limit=*/std::numeric_limits<uint32_t>::max(), /*pop=*/false,
- hits_out);
-}
-
-void PostingListUsed::PopFrontHits(uint32_t num_hits) {
- if (num_hits == 1 && full()) {
- // The PL is in full status which means that we save 2 uncompressed hits in
- // the 2 special postions. But full status may be reached by 2 different
- // statuses.
- // (1) In "almost full" status
- // +-----------------+----------------+-------+-----------------+
- // |Hit::kInvalidVal |1st hit |(pad) |(compressed) hits|
- // +-----------------+----------------+-------+-----------------+
- // When we prepend another hit, we can only put it at the special
- // position 0. And we get a full PL
- // +-----------------+----------------+-------+-----------------+
- // |new 1st hit |original 1st hit|(pad) |(compressed) hits|
- // +-----------------+----------------+-------+-----------------+
- // (2) In "not full" status
- // +-----------------+----------------+------+-------+------------------+
- // |hits-start-offset|Hit::kInvalidVal|(pad) |1st hit|(compressed) hits |
- // +-----------------+----------------+------+-------+------------------+
- // When we prepend another hit, we can reach any of the 3 following
- // scenarios:
- // (2.1) not full
- // if the space of pad and original 1st hit can accommodate the new 1st hit
- // and the encoded delta value.
- // +-----------------+----------------+------+-----------+-----------------+
- // |hits-start-offset|Hit::kInvalidVal|(pad) |new 1st hit|(compressed) hits|
- // +-----------------+----------------+------+-----------+-----------------+
- // (2.2) almost full
- // If the space of pad and original 1st hit cannot accommodate the new 1st
- // hit and the encoded delta value but can accommodate the encoded delta
- // value only. We can put the new 1st hit at special position 1.
- // +-----------------+----------------+-------+-----------------+
- // |Hit::kInvalidVal |new 1st hit |(pad) |(compressed) hits|
- // +-----------------+----------------+-------+-----------------+
- // (2.3) full
- // In very rare case, it cannot even accommodate only the encoded delta
- // value. we can move the original 1st hit into special position 1 and the
- // new 1st hit into special position 0. This may happen because we use
- // VarInt encoding method which may make the encoded value longer (about
- // 4/3 times of original)
- // +-----------------+----------------+-------+-----------------+
- // |new 1st hit |original 1st hit|(pad) |(compressed) hits|
- // +-----------------+----------------+-------+-----------------+
- // Suppose now the PL is full. But we don't know whether it arrived to
- // this status from "not full" like (2.3) or from "almost full" like (1).
- // We'll return to "almost full" status like (1) if we simply pop the new
- // 1st hit but we want to make the prepending operation "reversible". So
- // there should be some way to return to "not full" if possible. A simple
- // way to do it is to pop 2 hits out of the PL to status "almost full" or
- // "not full". And add the original 1st hit back. We can return to the
- // correct original statuses of (2.1) or (1). This makes our prepending
- // operation reversible.
- std::vector<Hit> out;
-
- // Popping 2 hits should never fail because we've just ensured that the
- // posting list is in the FULL state.
- GetHitsInternal(/*limit=*/2, /*pop=*/true, &out);
-
- // PrependHit should never fail because out[1] is a valid hit less than
- // previous hits in the posting list and because there's no way that the
- // posting list could run out of room because it previously stored this hit
- // AND another hit.
- PrependHit(out[1]);
- } else if (num_hits > 0) {
- GetHitsInternal(/*limit=*/num_hits, /*pop=*/true, nullptr);
- }
-}
-
-uint32_t PostingListUsed::GetHitsInternal(uint32_t limit, bool pop,
- std::vector<Hit> *out) const {
- // Put current uncompressed val here.
- Hit::Value val = Hit::kInvalidValue;
- uint32_t offset = get_start_byte_offset();
- uint32_t count = 0;
-
- // First traverse the first two special positions.
- while (count < limit && offset < posting_list_utils::kSpecialHitsSize) {
- Hit hit = get_special_hit(offset / sizeof(Hit));
- val = hit.value();
- if (out != nullptr) {
- out->push_back(hit);
- }
- offset += sizeof(Hit);
- count++;
- }
-
- // If special position 1 was set then we need to skip padding.
- if (val != Hit::kInvalidValue &&
- offset == posting_list_utils::kSpecialHitsSize) {
- offset = GetPadEnd(offset);
- }
-
- while (count < limit && offset < size_in_bytes_) {
- if (val == Hit::kInvalidValue) {
- // First hit is in compressed area. Put that in val.
- memcpy(&val, posting_list_buffer_ + offset, sizeof(Hit::Value));
- offset += sizeof(Hit::Value);
- } else {
- // Now we have delta encoded subsequent hits. Decode and push.
- uint64_t delta;
- offset += VarInt::Decode(posting_list_buffer_ + offset, &delta);
- val += delta;
- }
- Hit hit(val);
- if (hit.has_score()) {
- hit = Hit(val, ReadScore(offset));
- offset += sizeof(Hit::Score);
- }
- if (out != nullptr) {
- out->push_back(hit);
- }
- count++;
- }
-
- if (pop) {
- PostingListUsed *mutable_this = const_cast<PostingListUsed *>(this);
- // Modify the posting list so that we pop all hits actually
- // traversed.
- if (offset >= posting_list_utils::kSpecialHitsSize &&
- offset < size_in_bytes_) {
- // In the compressed area. Pop and reconstruct. offset/val is
- // the last traversed hit, which we must discard. So move one
- // more forward.
- uint64_t delta;
- offset += VarInt::Decode(posting_list_buffer_ + offset, &delta);
- val += delta;
-
- // Now val is the first hit of the new posting list.
- if (posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value) <= offset) {
- // val fits in compressed area. Simply copy.
- offset -= sizeof(Hit::Value);
- memcpy(posting_list_buffer_ + offset, &val, sizeof(Hit::Value));
- } else {
- // val won't fit in compressed area. Also see if there is a
- // score.
- Hit hit(val);
- if (hit.has_score()) {
- hit = Hit(val, ReadScore(offset));
- }
- mutable_this->set_special_hit(1, hit);
- mutable_this->PadToEnd(posting_list_utils::kSpecialHitsSize, offset);
- offset = sizeof(Hit);
- }
- }
- // offset is guaranteed to be valid. It falls into one of four scenarios:
- // Scenario 1: the above if was false because offset is not < size_in_bytes_
- // In this case, offset must be == size_in_bytes_ because we reached
- // offset by unwinding hits on the posting list.
-  // Scenario 2: offset is < kSpecialHitsSize
- // In this case, offset is guaranteed to be either 0 or sizeof(Hit)
- // because offset is incremented by sizeof(Hit) within the first while
- // loop.
- // Scenario 3: offset is within the compressed region and the new first hit
- // in the posting list (the value that 'val' holds) will fit as an
- // uncompressed hit in the compressed region. The resulting offset from
-  //   decompressing val must be >= kSpecialHitsSize because otherwise we'd
-  //   be in Scenario 4.
-  // Scenario 4: offset is within the compressed region, but the new first hit
-  //   in the posting list is too large to fit as an uncompressed hit in the
-  //   compressed region. Therefore, it must be stored in a special hit and
-  //   offset will be sizeof(Hit).
- mutable_this->set_start_byte_offset(offset);
- }
-
- return count;
-}
-
-Hit PostingListUsed::get_special_hit(uint32_t index) const {
- static_assert(sizeof(Hit::Value) >= sizeof(uint32_t), "HitTooSmall");
- if (index >= posting_list_utils::kSpecialHitsSize / sizeof(Hit)) {
- ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1";
- return Hit();
- }
- Hit val;
- memcpy(&val, posting_list_buffer_ + index * sizeof(val), sizeof(val));
- return val;
-}
-
-void PostingListUsed::set_special_hit(uint32_t index, const Hit &val) {
- if (index >= posting_list_utils::kSpecialHitsSize / sizeof(Hit)) {
- ICING_LOG(ERROR) << "Special hits only exist at indices 0 and 1";
- return;
- }
- memcpy(posting_list_buffer_ + index * sizeof(val), &val, sizeof(val));
-}
-
-uint32_t PostingListUsed::BytesUsed() const {
- // The special hits will be included if they represent actual hits. If they
- // represent the hit offset or the invalid hit sentinel, they are not
- // included.
- return size_in_bytes_ - get_start_byte_offset();
-}
-
-uint32_t PostingListUsed::MinPostingListSizeToFit() const {
- if (full() || almost_full()) {
- // If in either the FULL state or ALMOST_FULL state, this posting list *is*
- // the minimum size posting list that can fit these hits. So just return the
- // size of the posting list.
- return size_in_bytes_;
- }
-
-  // In the NOT_FULL state, BytesUsed() includes no special hits. The
-  // minimum-sized posting list that would be guaranteed to fit these hits is
-  // ALMOST_FULL, with kInvalidHit in special_hit(0), the uncompressed Hit in
- // special_hit(1) and the n compressed hits in the compressed region.
- // BytesUsed contains one uncompressed Hit and n compressed hits. Therefore,
- // fitting these hits into a posting list would require BytesUsed plus one
- // extra hit.
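-  //
-  // Worked example (assuming sizeof(Hit) == 5, as in the layout example): a
-  // NOT_FULL list holding one uncompressed Hit::Value (4 bytes) and two
-  // 2-byte deltas has BytesUsed() == 8, so the minimum posting list that
-  // fits these hits in the ALMOST_FULL layout is 8 + sizeof(Hit) = 13 bytes.
-  // Callers may still need to round that up to a multiple of sizeof(Hit) to
-  // create a valid posting list.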
- return BytesUsed() + sizeof(Hit);
-}
-
-bool PostingListUsed::IsPostingListValid() const {
- if (almost_full()) {
- // Special Hit 1 should hold a Hit.
- if (!get_special_hit(1).is_valid()) {
- ICING_LOG(ERROR)
- << "Both special hits cannot be invalid at the same time.";
- return false;
- }
- } else if (!full()) {
- // NOT_FULL. Special Hit 0 should hold a valid offset.
- if (get_special_hit(0).value() > size_in_bytes_ ||
- get_special_hit(0).value() < posting_list_utils::kSpecialHitsSize) {
- ICING_LOG(ERROR) << "Hit: " << get_special_hit(0).value()
- << " size: " << size_in_bytes_
- << " sp size: " << posting_list_utils::kSpecialHitsSize;
- return false;
- }
- }
- return true;
-}
-
-uint32_t PostingListUsed::get_start_byte_offset() const {
- if (full()) {
- return 0;
- } else if (almost_full()) {
- return sizeof(Hit);
- } else {
- // NOT_FULL
- return get_special_hit(0).value();
- }
-}
-
-void PostingListUsed::set_start_byte_offset(uint32_t offset) {
- if (offset > size_in_bytes_) {
- ICING_LOG(ERROR) << "offset cannot be a value greater than size "
- << size_in_bytes_ << ". offset is " << offset << ".";
- return;
- }
- if (offset < posting_list_utils::kSpecialHitsSize && offset > sizeof(Hit)) {
- ICING_LOG(ERROR) << "offset cannot be a value between (" << sizeof(Hit)
- << ", " << posting_list_utils::kSpecialHitsSize
- << "). offset is " << offset << ".";
- return;
- }
- if (offset < sizeof(Hit) && offset != 0) {
- ICING_LOG(ERROR) << "offset cannot be a value between (0, " << sizeof(Hit)
- << "). offset is " << offset << ".";
- return;
- }
- if (offset >= posting_list_utils::kSpecialHitsSize) {
- // not_full state.
- set_special_hit(0, Hit(offset));
- set_special_hit(1, Hit());
- } else if (offset == sizeof(Hit)) {
- // almost_full state.
- set_special_hit(0, Hit());
- }
- // Nothing to do for the FULL state - the offset isn't actually stored
- // anywhere and both special hits hold valid hits.
-}
-
-uint32_t PostingListUsed::PrependHitUncompressed(const Hit &hit,
- uint32_t offset) {
- if (hit.has_score()) {
- if (offset < posting_list_utils::kSpecialHitsSize + sizeof(Hit)) {
- ICING_LOG(ERROR) << "Not enough room to prepend Hit at offset " << offset
- << ".";
- return offset;
- }
- offset -= sizeof(Hit);
- memcpy(posting_list_buffer_ + offset, &hit, sizeof(Hit));
- } else {
- if (offset < posting_list_utils::kSpecialHitsSize + sizeof(Hit::Value)) {
- ICING_LOG(ERROR) << "Not enough room to prepend Hit::Value at offset "
- << offset << ".";
- return offset;
- }
- offset -= sizeof(Hit::Value);
- Hit::Value val = hit.value();
- memcpy(posting_list_buffer_ + offset, &val, sizeof(Hit::Value));
- }
- return offset;
-}
-
-Hit::Score PostingListUsed::ReadScore(uint32_t offset) const {
- if (offset + sizeof(Hit::Score) > size_in_bytes_) {
- ICING_LOG(FATAL)
- << "offset " << offset
- << " must not point past the end of the posting list of size "
- << size_in_bytes_ << ".";
- }
- Hit::Score score;
- memcpy(&score, posting_list_buffer_ + offset, sizeof(Hit::Score));
- return score;
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/index/posting-list-used.h b/icing/index/posting-list-used.h
deleted file mode 100644
index 492435b..0000000
--- a/icing/index/posting-list-used.h
+++ /dev/null
@@ -1,321 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_INDEX_POSTING_LIST_USED_H_
-#define ICING_INDEX_POSTING_LIST_USED_H_
-
-#include <string.h>
-#include <sys/mman.h>
-
-#include <algorithm>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/index/hit/hit.h"
-#include "icing/index/posting-list-utils.h"
-#include "icing/util/logging.h"
-
-namespace icing {
-namespace lib {
-
-// A posting list with hits in it. Layout described in comments in
-// posting-list-used.cc.
-class PostingListUsed {
- public:
- // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes.
- // 'Preexisting' means that posting_list_buffer was previously modified by
- // another instance of PostingListUsed.
- //
- // Caller owns the hits buffer and must not free it while using a
- // PostingListUsed.
- //
- // RETURNS:
- // - A valid PostingListUsed if successful
- // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
- // || size_in_bytes % sizeof(Hit) != 0.
- // - FAILED_PRECONDITION if posting_list_buffer is null
- static libtextclassifier3::StatusOr<PostingListUsed>
- CreateFromPreexistingPostingListUsedRegion(void *posting_list_buffer,
- uint32_t size_in_bytes);
-
- // Creates a PostingListUsed that points to a buffer of size_in_bytes bytes
- // and initializes the content of the buffer so that the returned
- // PostingListUsed is empty.
- //
- // Caller owns the posting_list_buffer buffer and must not free it while using
- // a PostingListUsed.
- //
- // RETURNS:
- // - A valid PostingListUsed if successful
- // - INVALID_ARGUMENT if size_in_bytes < min_posting_list_size()
- // || size_in_bytes % sizeof(Hit) != 0.
- // - FAILED_PRECONDITION if posting_list_buffer is null
- static libtextclassifier3::StatusOr<PostingListUsed>
- CreateFromUnitializedRegion(void *posting_list_buffer,
- uint32_t size_in_bytes);
-
- // Move contents from another posting list. Clears other.
- //
- // RETURNS:
- // - OK, if successful
- // - INVALID_ARGUMENT if 'other' is not valid or 'other' is too large to fit
- // in 'this'.
- // - FAILED_PRECONDITION if 'this' posting list is in a corrupted state.
- libtextclassifier3::Status MoveFrom(PostingListUsed *other);
-
- // Min size of posting list that can fit these used bytes. (See
- // MoveFrom.)
- uint32_t MinPostingListSizeToFit() const;
-
- // Prepend a hit to the posting list.
- // RETURNS:
- // - INVALID_ARGUMENT if !hit.is_valid() or if hit is not less than the
- // previously added hit.
- // - RESOURCE_EXHAUSTED if there is no more room to add hit to the posting
- // list.
- libtextclassifier3::Status PrependHit(const Hit &hit);
-
- // Prepend hits to the posting list. Hits should be sorted in
- // descending order (as defined by the less than operator for Hit)
- //
- // Returns the number of hits that could be prepended to the posting list. If
- // keep_prepended is true, whatever could be prepended is kept, otherwise the
- // posting list is left in its original state.
- template <class T, Hit (*GetHit)(const T &)>
- uint32_t PrependHitArray(const T *array, uint32_t num_hits,
- bool keep_prepended);
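-
-  // Illustrative usage of PrependHitArray (sketch only; HitElt here mirrors
-  // the helper struct of the same name in posting-list-used_test.cc):
-  //
-  //   struct HitElt {
-  //     static Hit get_hit(const HitElt &elt) { return elt.hit; }
-  //     Hit hit;
-  //   };
-  //   uint32_t num_prepended = pl.PrependHitArray<HitElt, HitElt::get_hit>(
-  //       elts, num_elts, /*keep_prepended=*/false);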
-
- // Return hits sorted by the reverse order of prepending.
- std::vector<Hit> GetHits() const;
-
- // Same as GetHits but appends hits to hits_out.
- void GetHits(std::vector<Hit> *hits_out) const;
-
-  // Undo the last num_hits hits prepended. If num_hits is greater than the
-  // number of hits in the posting list, all hits are cleared.
- void PopFrontHits(uint32_t num_hits);
-
- // Returns bytes used by actual hits.
- uint32_t BytesUsed() const;
-
- private:
- // Posting list layout formats:
- //
- // not_full
- //
- // +-----------------+----------------+-------+-----------------+
- // |hits-start-offset|Hit::kInvalidVal|xxxxxxx|(compressed) hits|
- // +-----------------+----------------+-------+-----------------+
- //
- // almost_full
- //
- // +-----------------+----------------+-------+-----------------+
- // |Hit::kInvalidVal |1st hit |(pad) |(compressed) hits|
- // +-----------------+----------------+-------+-----------------+
- //
- // full()
- //
- // +-----------------+----------------+-------+-----------------+
- // |1st hit |2nd hit |(pad) |(compressed) hits|
- // +-----------------+----------------+-------+-----------------+
- //
- // The first two uncompressed hits also implicitly encode information about
- // the size of the compressed hits region.
- //
- // 1. If the posting list is NOT_FULL, then
- // posting_list_buffer_[0] contains the byte offset of the start of the
- // compressed hits - and, thus, the size of the compressed hits region is
- // size_in_bytes - posting_list_buffer_[0].
- //
- // 2. If posting list is ALMOST_FULL or FULL, then the compressed hits region
- // starts somewhere between [kSpecialHitsSize, kSpecialHitsSize + sizeof(Hit)
- // - 1] and ends at size_in_bytes - 1.
- //
- // Hit scores are stored after the hit value, compressed or
- // uncompressed. For the first two special hits, we always have a
-  // space for the score. For hits in the compressed area, we only have
-  // the score following the hit value if hit.has_score() is true. This
- // allows good compression in the common case where hits don't have a
- // specific score.
- //
- // EXAMPLE
- // Posting list storage. Posting list size: 20 bytes
- // EMPTY!
- // +--bytes 0-4--+----- 5-9 ------+---------------- 10-19 -----------------+
- // | 20 |Hit::kInvalidVal| 0x000 |
-  // +-------------+----------------+-----------------------------------------+
- //
- // Add Hit 0x07FFF998 (DocumentId = 12, SectionId = 3, Flags = 0)
- // NOT FULL!
- // +--bytes 0-4--+----- 5-9 ------+----- 10-15 -----+-------- 16-19 -------+
- // | 16 |Hit::kInvalidVal| 0x000 | 0x07FFF998 |
- // +-------------+----------------+-----------------+----------------------+
- //
- // Add Hit 0x07FFF684 (DocumentId = 18, SectionId = 0, Flags = 4, Score=125)
- // (Hit 0x07FFF998 - Hit 0x07FFF684 = 788)
- // +--bytes 0-4--+----- 5-9 ------+-- 10-12 --+-- 13-16 --+- 17 -+-- 18-19 --+
- // | 13 |Hit::kInvalidVal| 0x000 | 0x07FFF684| 125 | 788 |
- // +-------------+----------------+-----------+-----------+------+-----------+
- //
- // Add Hit 0x07FFF4D2 (DocumentId = 22, SectionId = 10, Flags = 2)
- // (Hit 0x07FFF684 - Hit 0x07FFF4D2 = 434)
- // +--bytes 0-4--+--- 5-9 ----+-- 10 --+-- 11-14 -+- 15-16 -+- 17 -+- 18-19 -+
- // | 9 |Hit::kInvVal| 0x00 |0x07FFF4D2| 434 | 125 | 788 |
- // +-------------+------------+--------+----------+---------+------+---------+
- //
-  // Add Hit 0x07FFF40E (DocumentId = 23, SectionId = 1, Flags = 6, Score = 87)
-  // (Hit 0x07FFF4D2 - Hit 0x07FFF40E = 196)
- // ALMOST FULL!
- // +--bytes 0-4-+---- 5-9 ----+- 10-12 -+- 13-14 -+- 15-16 -+- 17 -+- 18-19 -+
- // |Hit::kInvVal|0x07FFF40E,87| 0x000 | 196 | 434 | 125 | 788 |
- // +-------------+------------+---------+---------+---------+------+---------+
- //
- // Add Hit 0x07FFF320 (DocumentId = 27, SectionId = 4, Flags = 0)
- // FULL!
-  // +--bytes 0-4--+---- 5-9 ----+- 10-13 -+-- 14-15 -+- 16-17 -+- 18 -+- 19-20 -+
-  // | 0x07FFF320 |0x07FFF40E,87| 0x000 | 196 | 434 | 125 | 788 |
-  // +-------------+-------------+---------+----------+---------+------+---------+
- PostingListUsed(void *posting_list_buffer, uint32_t size_in_bytes)
- : posting_list_buffer_(static_cast<uint8_t *>(posting_list_buffer)),
- size_in_bytes_(size_in_bytes) {}
-
- // Helpers to determine what state the posting list is in.
- bool full() const {
- return get_special_hit(0).is_valid() && get_special_hit(1).is_valid();
- }
- bool almost_full() const { return !get_special_hit(0).is_valid(); }
- bool empty() const {
- return get_special_hit(0).value() == size_in_bytes_ &&
- !get_special_hit(1).is_valid();
- }
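-
-  // State summary, as encoded by the two special hits (see the helpers
-  // above):
-  //   special_hit(0) valid and special_hit(1) valid              -> FULL
-  //   special_hit(0) invalid                                     -> ALMOST_FULL
-  //   special_hit(0) == size_in_bytes_, special_hit(1) invalid   -> EMPTY
-  //   otherwise, special_hit(0) holds the hits-start-offset      -> NOT_FULL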
-
- // Returns false if both special hits are invalid or if the offset value
- // stored in the special hit is less than kSpecialHitsSize or greater than
- // size_in_bytes_. Returns true, otherwise.
- bool IsPostingListValid() const;
-
- // Prepend hit to a posting list that is in the ALMOST_FULL state.
- // RETURNS:
- // - OK, if successful
- // - INVALID_ARGUMENT if hit is not less than the previously added hit.
- libtextclassifier3::Status PrependHitToAlmostFull(const Hit &hit);
-
- // Prepend hit to a posting list that is in the EMPTY state. This will always
- // succeed because there are no pre-existing hits and no validly constructed
- // posting list could fail to fit one hit.
- void PrependHitToEmpty(const Hit &hit);
-
- // Prepend hit to a posting list that is in the NOT_FULL state.
- // RETURNS:
- // - OK, if successful
- // - INVALID_ARGUMENT if hit is not less than the previously added hit.
- libtextclassifier3::Status PrependHitToNotFull(const Hit &hit,
- uint32_t offset);
-
- // Reset contents to an empty posting list. This *must* be called if the
- // posting_list_buffer_ region is uninitialized.
- void Clear();
-
- // Returns either 0 (full state), sizeof(Hit) (almost_full state) or
- // a byte offset between kSpecialHitsSize and size_in_bytes_ (inclusive)
- // (not_full state).
- uint32_t get_start_byte_offset() const;
-
- // Sets the special hits to properly reflect what offset is (see layout
- // comment for further details).
-  // If offset > size_in_bytes_, or offset is in (sizeof(Hit),
-  // kSpecialHitsSize), or offset is in (0, sizeof(Hit)), then offset is
-  // considered invalid and this function has no effect.
- void set_start_byte_offset(uint32_t offset);
-
- // Manipulate padded areas. We never store the same hit value twice
- // so a delta of 0 is a pad byte.
-
- // Returns offset of first non-pad byte.
- uint32_t GetPadEnd(uint32_t offset) const;
-
- // Fill padding between offset start and offset end with 0s. If end >
- // size_in_bytes_, this function has no effect.
- void PadToEnd(uint32_t start, uint32_t end);
-
-  // Helper for GetHits/PopFrontHits. Returns the number actually traversed (also
- // the size of out if non-NULL), which will always be equal to 'limit' unless
- // there are fewer than 'limit' hits in the posting list. out can be NULL.
- //
- // NOTE: If called with limit=1, pop=true on a posting list that transitioned
- // from NOT_FULL directly to FULL, GetHitsInternal will not return the posting
- // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
- // ALMOST_FULL.
- uint32_t GetHitsInternal(uint32_t limit, bool pop,
- std::vector<Hit> *out) const;
-
- // Retrieves the value stored in the index-th special hit. If index is not
-  // less than kSpecialHitsSize / sizeof(Hit), returns an invalid hit.
- Hit get_special_hit(uint32_t index) const;
-
- // Sets the value stored in the index-th special hit to val. If index is not
-  // less than kSpecialHitsSize / sizeof(Hit), this has no effect.
- void set_special_hit(uint32_t index, const Hit &val);
-
- // Prepends hit to the memory region [offset - sizeof(Hit), offset] and
- // returns the new beginning of the padded region.
- //
-  // If offset - kSpecialHitsSize < sizeof(Hit/Hit::Value), then this function
- // has no effect.
- uint32_t PrependHitUncompressed(const Hit &hit, uint32_t offset);
-
- // Reads the score located at offset and returns it. Callers are responsible
- // for ensuring that the bytes starting at offset actually represent a score.
- //
-  // REQUIRES: offset + sizeof(Hit::Score) <= size_in_bytes_
-  // REQUIRES enforced by a fatal check.
- Hit::Score ReadScore(uint32_t offset) const;
-
- // A byte array of size size_in_bytes_ containing encoded hits for this
- // posting list.
- uint8_t *posting_list_buffer_; // does not own!
- uint32_t size_in_bytes_;
-};
-
-// Inlined functions. Implementation details below. Avert eyes!
-template <class T, Hit (*GetHit)(const T &)>
-uint32_t PostingListUsed::PrependHitArray(const T *array, uint32_t num_hits,
- bool keep_prepended) {
- if (!IsPostingListValid()) {
- return 0;
- }
-
- // Prepend hits working backwards from array[num_hits - 1].
- uint32_t i;
- for (i = 0; i < num_hits; ++i) {
- if (!PrependHit(GetHit(array[num_hits - i - 1])).ok()) {
- break;
- }
- }
- if (i != num_hits && !keep_prepended) {
- // Didn't fit. Undo everything and check that we have the same offset as
- // before. PopFrontHits guarantees that it will remove all 'i' hits so long
- // as there are at least 'i' hits in the posting list, which we know there
- // are.
- PopFrontHits(i);
- }
- return i;
-}
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_INDEX_POSTING_LIST_USED_H_
diff --git a/icing/index/posting-list-used_test.cc b/icing/index/posting-list-used_test.cc
deleted file mode 100644
index a0e9514..0000000
--- a/icing/index/posting-list-used_test.cc
+++ /dev/null
@@ -1,537 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/index/posting-list-used.h"
-
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#include <algorithm>
-#include <cstdint>
-#include <deque>
-#include <iterator>
-#include <memory>
-#include <random>
-#include <string>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/index/posting-list-utils.h"
-#include "icing/legacy/index/icing-bit-util.h"
-#include "icing/schema/section.h"
-#include "icing/store/document-id.h"
-#include "icing/testing/common-matchers.h"
-
-using std::min;
-using std::reverse;
-using std::vector;
-using testing::ElementsAre;
-using testing::ElementsAreArray;
-using testing::IsEmpty;
-
-namespace icing {
-namespace lib {
-
-struct HitElt {
- HitElt() = default;
- explicit HitElt(const Hit &hit_in) : hit(hit_in) {}
-
- static Hit get_hit(const HitElt &hit_elt) {
- return hit_elt.hit;
- }
-
- Hit hit;
-};
-
-// Produces a vector with num_hits HitElts. When delta-encoded, each hit should
-// be 1 byte with a 1-byte Hit::Score.
-std::vector<HitElt> CreateHits(DocumentId start_docid, int num_hits) {
- std::vector<HitElt> hits;
- hits.reserve(num_hits);
- while (num_hits--) {
- Hit::Score score = (start_docid % 7) + 1;
- SectionId section_id = (start_docid + 2) % (kMaxSectionId + 1);
- hits.emplace_back(Hit(section_id, start_docid, score));
- ++start_docid;
- }
- std::reverse(hits.begin(), hits.end());
- return hits;
-}
-
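-// Returns a hit whose delta from last_hit requires at least
-// desired_byte_length bytes when VarInt-encoded. It repeatedly decrements the
-// section id (rolling over to the next document id at kMinSectionId) until
-// the encoded delta is long enough.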
-Hit CreateHit(Hit last_hit, int desired_byte_length) {
- Hit hit =
- (last_hit.section_id() == kMinSectionId)
- ? Hit(kMaxSectionId, last_hit.document_id() + 1, last_hit.score())
- : Hit(last_hit.section_id() - 1, last_hit.document_id(),
- last_hit.score());
- uint8_t buf[5];
- while (VarInt::Encode(last_hit.value() - hit.value(), buf) <
- desired_byte_length) {
- hit = (hit.section_id() == kMinSectionId)
- ? Hit(kMaxSectionId, hit.document_id() + 1, hit.score())
- : Hit(hit.section_id() - 1, hit.document_id(), hit.score());
- }
- return hit;
-}
-
-DocumentId InvertDocumentId(DocumentId document_id) {
- return kMaxDocumentId - document_id;
-}
-
-TEST(PostingListTest, PostingListUsedPrependHitNotFull) {
- static const int kNumHits = 2551;
- static const size_t kHitsSize = kNumHits * sizeof(Hit);
-
- std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
- ICING_ASSERT_OK_AND_ASSIGN(
- PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
-
- // Make used.
- Hit hit0(/*section_id=*/0, 0, /*score=*/56);
- pl_used.PrependHit(hit0);
- // Size = sizeof(uncompressed hit0)
- int expected_size = sizeof(Hit);
- EXPECT_LE(pl_used.BytesUsed(), expected_size);
- EXPECT_THAT(pl_used.GetHits(), ElementsAre(hit0));
-
- Hit hit1(/*section_id=*/0, 1, Hit::kMaxHitScore);
- pl_used.PrependHit(hit1);
- // Size = sizeof(uncompressed hit1)
- // + sizeof(hit0-hit1) + sizeof(hit0::score)
- expected_size += 2 + sizeof(Hit::Score);
- EXPECT_LE(pl_used.BytesUsed(), expected_size);
- EXPECT_THAT(pl_used.GetHits(), ElementsAre(hit1, hit0));
-
- Hit hit2(/*section_id=*/0, 2, /*score=*/56);
- pl_used.PrependHit(hit2);
- // Size = sizeof(uncompressed hit2)
- // + sizeof(hit1-hit2)
- // + sizeof(hit0-hit1) + sizeof(hit0::score)
- expected_size += 2;
- EXPECT_LE(pl_used.BytesUsed(), expected_size);
- EXPECT_THAT(pl_used.GetHits(), ElementsAre(hit2, hit1, hit0));
-
- Hit hit3(/*section_id=*/0, 3, Hit::kMaxHitScore);
- pl_used.PrependHit(hit3);
- // Size = sizeof(uncompressed hit3)
- // + sizeof(hit2-hit3) + sizeof(hit2::score)
- // + sizeof(hit1-hit2)
- // + sizeof(hit0-hit1) + sizeof(hit0::score)
- expected_size += 2 + sizeof(Hit::Score);
- EXPECT_LE(pl_used.BytesUsed(), expected_size);
- EXPECT_THAT(pl_used.GetHits(), ElementsAre(hit3, hit2, hit1, hit0));
-}
-
-TEST(PostingListTest, PostingListUsedPrependHitAlmostFull) {
- constexpr int kHitsSize = 2 * posting_list_utils::min_posting_list_size();
- std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
- ICING_ASSERT_OK_AND_ASSIGN(
- PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
-
- // Fill up the compressed region.
- // Transitions:
- // Adding hit0: EMPTY -> NOT_FULL
- // Adding hit1: NOT_FULL -> NOT_FULL
- // Adding hit2: NOT_FULL -> NOT_FULL
- Hit hit0(/*section_id=*/0, 0, Hit::kMaxHitScore);
- Hit hit1 = CreateHit(hit0, /*desired_byte_length=*/2);
- Hit hit2 = CreateHit(hit1, /*desired_byte_length=*/2);
- ICING_EXPECT_OK(pl_used.PrependHit(hit0));
- ICING_EXPECT_OK(pl_used.PrependHit(hit1));
- ICING_EXPECT_OK(pl_used.PrependHit(hit2));
- // Size used will be 2+2+4=8 bytes
- int expected_size = sizeof(Hit::Value) + 2 + 2;
- EXPECT_LE(pl_used.BytesUsed(), expected_size);
- EXPECT_THAT(pl_used.GetHits(), ElementsAre(hit2, hit1, hit0));
-
- // Add one more hit to transition NOT_FULL -> ALMOST_FULL
- Hit hit3 = CreateHit(hit2, /*desired_byte_length=*/3);
- ICING_EXPECT_OK(pl_used.PrependHit(hit3));
- // Compressed region would be 2+2+3+4=11 bytes, but the compressed region is
- // only 10 bytes. So instead, the posting list will transition to ALMOST_FULL.
- // The in-use compressed region will actually shrink from 8 bytes to 7 bytes
- // because the uncompressed version of hit2 will be overwritten with the
- // compressed delta of hit2. hit3 will be written to one of the special hits.
- // Because we're in ALMOST_FULL, the expected size is the size of the pl minus
- // the one hit used to mark the posting list as ALMOST_FULL.
- expected_size = kHitsSize - sizeof(Hit);
- EXPECT_LE(pl_used.BytesUsed(), expected_size);
- EXPECT_THAT(pl_used.GetHits(), ElementsAre(hit3, hit2, hit1, hit0));
-
- // Add one more hit to transition ALMOST_FULL -> ALMOST_FULL
- Hit hit4 = CreateHit(hit3, /*desired_byte_length=*/2);
- ICING_EXPECT_OK(pl_used.PrependHit(hit4));
- // There are currently 7 bytes in use in the compressed region. hit3 will have
- // a 2-byte delta. That delta will fit in the compressed region (which will
- // now have 9 bytes in use), hit4 will be placed in one of the special hits
- // and the posting list will remain in ALMOST_FULL.
- EXPECT_LE(pl_used.BytesUsed(), expected_size);
- EXPECT_THAT(pl_used.GetHits(), ElementsAre(hit4, hit3, hit2, hit1, hit0));
-
- // Add one more hit to transition ALMOST_FULL -> FULL
- Hit hit5 = CreateHit(hit4, /*desired_byte_length=*/2);
- ICING_EXPECT_OK(pl_used.PrependHit(hit5));
- // There are currently 9 bytes in use in the compressed region. hit4 will have
- // a 2-byte delta which will not fit in the compressed region. So hit4 will
- // remain in one of the special hits and hit5 will occupy the other, making
- // the posting list FULL.
- EXPECT_LE(pl_used.BytesUsed(), kHitsSize);
- EXPECT_THAT(pl_used.GetHits(),
- ElementsAre(hit5, hit4, hit3, hit2, hit1, hit0));
-
- // The posting list is FULL. Adding another hit should fail.
- Hit hit6 = CreateHit(hit5, /*desired_byte_length=*/1);
- EXPECT_THAT(pl_used.PrependHit(hit6),
- StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
-}
-
-TEST(PostingListTest, PostingListUsedMinSize) {
- std::unique_ptr<char[]> hits_buf =
- std::make_unique<char[]>(posting_list_utils::min_posting_list_size());
-
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()),
- posting_list_utils::min_posting_list_size()));
- // PL State: EMPTY
- EXPECT_LE(pl_used.BytesUsed(), 0);
- EXPECT_THAT(pl_used.GetHits(), IsEmpty());
-
- // Add a hit, PL should shift to ALMOST_FULL state
- Hit hit0(/*section_id=*/0, 0, /*score=*/0, /*is_in_prefix_section=*/false,
- /*is_prefix_hit=*/true);
- ICING_EXPECT_OK(pl_used.PrependHit(hit0));
- // Size = sizeof(uncompressed hit0)
- int expected_size = sizeof(Hit);
- EXPECT_LE(pl_used.BytesUsed(), expected_size);
- EXPECT_THAT(pl_used.GetHits(), ElementsAre(hit0));
-
- // Add the smallest hit possible - no score and a delta of 1. PL should shift
- // to FULL state.
- Hit hit1(/*section_id=*/0, 0, /*score=*/0, /*is_in_prefix_section=*/true,
- /*is_prefix_hit=*/false);
- ICING_EXPECT_OK(pl_used.PrependHit(hit1));
- // Size = sizeof(uncompressed hit1) + sizeof(uncompressed hit0)
- expected_size += sizeof(Hit);
- EXPECT_LE(pl_used.BytesUsed(), expected_size);
- EXPECT_THAT(pl_used.GetHits(), ElementsAre(hit1, hit0));
-
- // Try to add the smallest hit possible. Should fail
- Hit hit2(/*section_id=*/0, 0, /*score=*/0, /*is_in_prefix_section=*/false,
- /*is_prefix_hit=*/false);
- EXPECT_THAT(pl_used.PrependHit(hit2),
- StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
- EXPECT_LE(pl_used.BytesUsed(), expected_size);
- EXPECT_THAT(pl_used.GetHits(), ElementsAre(hit1, hit0));
-}
-
-TEST(PostingListTest, PostingListPrependHitArrayMinSizePostingList) {
- constexpr int kFinalSize = 1025;
- std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kFinalSize);
-
- // Min Size = 10
- int size = posting_list_utils::min_posting_list_size();
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), size));
-
- std::vector<HitElt> hits_in;
- hits_in.emplace_back(Hit(1, 0, Hit::kMaxHitScore));
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
- std::reverse(hits_in.begin(), hits_in.end());
-
-  // Add five hits. The PL is in the empty state and an empty min size PL can
-  // only fit two hits, so PrependHitArray should report that only two fit.
- uint32_t num_can_prepend = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hits_in[0], hits_in.size(), false);
- EXPECT_EQ(num_can_prepend, 2);
-
- int can_fit_hits = num_can_prepend;
- // The PL has room for 2 hits. We should be able to add them without any
- // problem, transitioning the PL from EMPTY -> ALMOST_FULL -> FULL
- const HitElt *hits_in_ptr = hits_in.data() + (hits_in.size() - 2);
- num_can_prepend = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- hits_in_ptr, can_fit_hits, false);
- EXPECT_EQ(num_can_prepend, can_fit_hits);
- EXPECT_EQ(size, pl_used.BytesUsed());
- std::deque<Hit> hits_pushed;
- std::transform(hits_in.rbegin(),
- hits_in.rend() - hits_in.size() + can_fit_hits,
- std::front_inserter(hits_pushed), HitElt::get_hit);
- EXPECT_THAT(pl_used.GetHits(), ElementsAreArray(hits_pushed));
-}
-
-TEST(PostingListTest, PostingListPrependHitArrayPostingList) {
- // Size = 30
- int size = 3 * posting_list_utils::min_posting_list_size();
- std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(size);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), size));
-
- std::vector<HitElt> hits_in;
- hits_in.emplace_back(Hit(1, 0, Hit::kMaxHitScore));
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
- std::reverse(hits_in.begin(), hits_in.end());
- // The last hit is uncompressed and the four before it should only take one
- // byte. Total use = 8 bytes.
- // ----------------------
- // 29 delta(Hit #1)
- // 28 delta(Hit #2)
- // 27 delta(Hit #3)
- // 26 delta(Hit #4)
- // 25-22 Hit #5
- // 21-10 <unused>
- // 9-5 kSpecialHit
- // 4-0 Offset=22
- // ----------------------
- int byte_size = sizeof(Hit::Value) + hits_in.size() - 1;
-
- // Add five hits. The PL is in the empty state and should be able to fit all
- // five hits without issue, transitioning the PL from EMPTY -> NOT_FULL.
- uint32_t num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hits_in[0], hits_in.size(), false);
- EXPECT_EQ(num_could_fit, hits_in.size());
- EXPECT_EQ(byte_size, pl_used.BytesUsed());
- std::deque<Hit> hits_pushed;
- std::transform(hits_in.rbegin(), hits_in.rend(),
- std::front_inserter(hits_pushed), HitElt::get_hit);
- EXPECT_THAT(pl_used.GetHits(), ElementsAreArray(hits_pushed));
-
- Hit first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/1);
- hits_in.clear();
- hits_in.emplace_back(first_hit);
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/1));
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/3));
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
- std::reverse(hits_in.begin(), hits_in.end());
- // Size increased by the deltas of these hits (1+2+1+2+3+2) = 11 bytes
- // ----------------------
- // 29 delta(Hit #1)
- // 28 delta(Hit #2)
- // 27 delta(Hit #3)
- // 26 delta(Hit #4)
- // 25 delta(Hit #5)
- // 24-23 delta(Hit #6)
- // 22 delta(Hit #7)
- // 21-20 delta(Hit #8)
- // 19-17 delta(Hit #9)
- // 16-15 delta(Hit #10)
- // 14-11 Hit #11
- // 10 <unused>
- // 9-5 kSpecialHit
- // 4-0 Offset=22
- // ----------------------
- byte_size += 11;
-
- // Add these 6 hits. The PL is currently in the NOT_FULL state and should
- // remain in the NOT_FULL state.
- num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hits_in[0], hits_in.size(), false);
- EXPECT_EQ(num_could_fit, hits_in.size());
- EXPECT_EQ(byte_size, pl_used.BytesUsed());
- // All hits from hits_in were added.
- std::transform(hits_in.rbegin(), hits_in.rend(),
- std::front_inserter(hits_pushed), HitElt::get_hit);
- EXPECT_THAT(pl_used.GetHits(), ElementsAreArray(hits_pushed));
-
- first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/3);
- hits_in.clear();
- hits_in.emplace_back(first_hit);
- // ----------------------
- // 29 delta(Hit #1)
- // 28 delta(Hit #2)
- // 27 delta(Hit #3)
- // 26 delta(Hit #4)
- // 25 delta(Hit #5)
- // 24-23 delta(Hit #6)
- // 22 delta(Hit #7)
- // 21-20 delta(Hit #8)
- // 19-17 delta(Hit #9)
- // 16-15 delta(Hit #10)
- // 14-12 delta(Hit #11)
- // 11-10 <unused>
- // 9-5 Hit #12
- // 4-0 kSpecialHit
- // ----------------------
- byte_size = 25;
-
- // Add this 1 hit. The PL is currently in the NOT_FULL state and should
- // transition to the ALMOST_FULL state - even though there is still some
- // unused space.
- num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hits_in[0], hits_in.size(), false);
- EXPECT_EQ(num_could_fit, hits_in.size());
- EXPECT_EQ(byte_size, pl_used.BytesUsed());
- // All hits from hits_in were added.
- std::transform(hits_in.rbegin(), hits_in.rend(),
- std::front_inserter(hits_pushed), HitElt::get_hit);
- EXPECT_THAT(pl_used.GetHits(), ElementsAreArray(hits_pushed));
-
- first_hit = CreateHit(hits_in.begin()->hit, /*desired_byte_length=*/1);
- hits_in.clear();
- hits_in.emplace_back(first_hit);
- hits_in.emplace_back(
- CreateHit(hits_in.rbegin()->hit, /*desired_byte_length=*/2));
- std::reverse(hits_in.begin(), hits_in.end());
- // ----------------------
- // 29 delta(Hit #1)
- // 28 delta(Hit #2)
- // 27 delta(Hit #3)
- // 26 delta(Hit #4)
- // 25 delta(Hit #5)
- // 24-23 delta(Hit #6)
- // 22 delta(Hit #7)
- // 21-20 delta(Hit #8)
- // 19-17 delta(Hit #9)
- // 16-15 delta(Hit #10)
- // 14-12 delta(Hit #11)
- // 11 delta(Hit #12)
- // 10 <unused>
- // 9-5 Hit #13
- // 4-0 Hit #14
- // ----------------------
-
- // Add these 2 hits. The PL is currently in the ALMOST_FULL state. Adding the
- // first hit should keep the PL in ALMOST_FULL because the delta between Hit
- // #12 and Hit #13 (1 byte) can fit in the unused area (2 bytes). Adding the
-  // second hit should transition to the FULL state because the delta between
- // Hit #13 and Hit #14 (2 bytes) is larger than the remaining unused area
- // (1 byte).
- num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hits_in[0], hits_in.size(), false);
- EXPECT_EQ(num_could_fit, hits_in.size());
- EXPECT_EQ(size, pl_used.BytesUsed());
- // All hits from hits_in were added.
- std::transform(hits_in.rbegin(), hits_in.rend(),
- std::front_inserter(hits_pushed), HitElt::get_hit);
- EXPECT_THAT(pl_used.GetHits(), ElementsAreArray(hits_pushed));
-}
-
-TEST(PostingListTest, PostingListPrependHitArrayTooManyHits) {
- static constexpr int kNumHits = 128;
- static constexpr int kDeltaSize = 1;
- static constexpr int kScoreSize = 1;
- static constexpr size_t kHitsSize =
- ((kNumHits * (kDeltaSize + kScoreSize)) / 5) * 5;
-
- std::unique_ptr<char[]> hits_buf = std::make_unique<char[]>(kHitsSize);
-
- // Create an array with one too many hits
- vector<HitElt> hits_in_too_many = CreateHits(0, kNumHits + 1);
- ICING_ASSERT_OK_AND_ASSIGN(PostingListUsed pl_used,
- PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()),
- posting_list_utils::min_posting_list_size()));
-
- // PrependHitArray should fail because hits_in_too_many is far too large for
- // the minimum size pl.
- uint32_t num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hits_in_too_many[0], hits_in_too_many.size(), false);
- ASSERT_LT(num_could_fit, hits_in_too_many.size());
- ASSERT_EQ(pl_used.BytesUsed(), 0);
- ASSERT_THAT(pl_used.GetHits(), testing::IsEmpty());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- pl_used, PostingListUsed::CreateFromUnitializedRegion(
- static_cast<void *>(hits_buf.get()), kHitsSize));
- // PrependHitArray should fail because hits_in_too_many is one hit too large
- // for this pl.
- num_could_fit = pl_used.PrependHitArray<HitElt, HitElt::get_hit>(
- &hits_in_too_many[0], hits_in_too_many.size(), false);
- ASSERT_LT(num_could_fit, hits_in_too_many.size());
- ASSERT_EQ(pl_used.BytesUsed(), 0);
- ASSERT_THAT(pl_used.GetHits(), testing::IsEmpty());
-}
-
-TEST(PostingListTest, PostingListStatusJumpFromNotFullToFullAndBack) {
- const uint32_t pl_size = 3 * sizeof(Hit);
- char hits_buf[pl_size];
- ICING_ASSERT_OK_AND_ASSIGN(
- PostingListUsed pl,
- PostingListUsed::CreateFromUnitializedRegion(hits_buf, pl_size));
- ICING_ASSERT_OK(pl.PrependHit(Hit(Hit::kInvalidValue - 1, 0)));
- uint32_t bytes_used = pl.BytesUsed();
- // Status not full.
- CHECK_LE(bytes_used, pl_size - posting_list_utils::kSpecialHitsSize);
- ICING_ASSERT_OK(pl.PrependHit(Hit(Hit::kInvalidValue >> 2, 0)));
- // Status should jump to full directly.
- CHECK_EQ(pl.BytesUsed(), pl_size);
- pl.PopFrontHits(1);
- // Status should return to not full as before.
- CHECK_EQ(pl.BytesUsed(), bytes_used);
-}
-
-TEST(PostingListTest, DeltaOverflow) {
- char hits_buf[1000];
- ICING_ASSERT_OK_AND_ASSIGN(
- PostingListUsed pl,
- PostingListUsed::CreateFromUnitializedRegion(hits_buf, 4 * sizeof(Hit)));
-
- static const Hit::Value kOverflow[4] = {
- Hit::kInvalidValue >> 2,
- (Hit::kInvalidValue >> 2) * 2,
- (Hit::kInvalidValue >> 2) * 3,
- Hit::kInvalidValue - 1,
- };
-
- // Fit at least 4 ordinary values.
- for (Hit::Value v = 0; v < 4; v++) {
- ICING_EXPECT_OK(pl.PrependHit(Hit(4 - v)));
- }
-
- // Cannot fit 4 overflow values.
- ICING_ASSERT_OK_AND_ASSIGN(pl, PostingListUsed::CreateFromUnitializedRegion(
- hits_buf, 4 * sizeof(Hit)));
- ICING_EXPECT_OK(pl.PrependHit(Hit(kOverflow[3])));
- ICING_EXPECT_OK(pl.PrependHit(Hit(kOverflow[2])));
-
- // Can fit only one more.
- ICING_EXPECT_OK(pl.PrependHit(Hit(kOverflow[1])));
- EXPECT_THAT(pl.PrependHit(Hit(kOverflow[0])),
- StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/index/property-existence-indexing-handler.cc b/icing/index/property-existence-indexing-handler.cc
new file mode 100644
index 0000000..504f380
--- /dev/null
+++ b/icing/index/property-existence-indexing-handler.cc
@@ -0,0 +1,127 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/property-existence-indexing-handler.h"
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/index.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/store/document-id.h"
+#include "icing/util/clock.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
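+// Recursively walks `document` and inserts one meta token into `meta_tokens`
+// for every property path that has at least one value set. For example
+// (illustrative), a document with a non-empty string property "name" and a
+// document property "value" whose nested document sets "body" yields:
+//   kPropertyExistenceTokenPrefix + "name"
+//   kPropertyExistenceTokenPrefix + "value"
+//   kPropertyExistenceTokenPrefix + "value.body"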
+void ConstructPropertyExistenceMetaToken(
+ const std::string& current_path, const DocumentProto& document,
+ std::unordered_set<std::string>& meta_tokens) {
+ for (const PropertyProto& property : document.properties()) {
+ std::string new_path = current_path;
+ if (!new_path.empty()) {
+ new_path.append(".");
+ }
+ new_path.append(property.name());
+ for (const DocumentProto& nested_document : property.document_values()) {
+ ConstructPropertyExistenceMetaToken(new_path, nested_document,
+ meta_tokens);
+ }
+ // A string property exists if and only if there is at least one non-empty
+ // string in the property.
+ bool has_string_value = false;
+ for (const std::string& string_value : property.string_values()) {
+ if (!string_value.empty()) {
+ has_string_value = true;
+ break;
+ }
+ }
+ if (has_string_value || property.int64_values_size() > 0 ||
+ property.double_values_size() > 0 ||
+ property.boolean_values_size() > 0 ||
+ property.bytes_values_size() > 0 ||
+ property.document_values_size() > 0) {
+ meta_tokens.insert(
+ absl_ports::StrCat(kPropertyExistenceTokenPrefix, new_path));
+ }
+ }
+}
+
+} // namespace
+
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<PropertyExistenceIndexingHandler>>
+PropertyExistenceIndexingHandler::Create(const Clock* clock, Index* index) {
+ ICING_RETURN_ERROR_IF_NULL(clock);
+ ICING_RETURN_ERROR_IF_NULL(index);
+
+ return std::unique_ptr<PropertyExistenceIndexingHandler>(
+ new PropertyExistenceIndexingHandler(*clock, index));
+}
+
+libtextclassifier3::Status PropertyExistenceIndexingHandler::Handle(
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
+ PutDocumentStatsProto* put_document_stats) {
+ std::unique_ptr<Timer> index_timer = clock_.GetNewTimer();
+
+ libtextclassifier3::Status status;
+  // Section id is irrelevant to the metadata tokens that are used to support
+  // property existence checks.
+ Index::Editor editor =
+ index_.Edit(document_id, /*section_id=*/0, TermMatchType::EXACT_ONLY,
+ /*namespace_id=*/0);
+ std::unordered_set<std::string> meta_tokens;
+ ConstructPropertyExistenceMetaToken(
+ /*current_path=*/"", tokenized_document.document(), meta_tokens);
+ for (const std::string& meta_token : meta_tokens) {
+ status = editor.BufferTerm(meta_token.c_str());
+ if (!status.ok()) {
+ // We've encountered a failure. Bail out. We'll mark this doc as deleted
+ // and signal a failure to the client.
+ ICING_LOG(WARNING) << "Failed to buffer term in lite lexicon due to: "
+ << status.error_message();
+ break;
+ }
+ }
+
+ if (status.ok()) {
+ // Add all the metadata tokens to support property existence check.
+ status = editor.IndexAllBufferedTerms();
+ if (!status.ok()) {
+ ICING_LOG(WARNING) << "Failed to add hits in lite index due to: "
+ << status.error_message();
+ }
+ }
+
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_metadata_term_index_latency_ms(
+ index_timer->GetElapsedMilliseconds());
+ put_document_stats->mutable_tokenization_stats()
+ ->set_num_metadata_tokens_indexed(meta_tokens.size());
+ }
+
+ return status;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/property-existence-indexing-handler.h b/icing/index/property-existence-indexing-handler.h
new file mode 100644
index 0000000..55c0bb4
--- /dev/null
+++ b/icing/index/property-existence-indexing-handler.h
@@ -0,0 +1,86 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_PROPERTY_EXISTENCE_INDEXING_HANDLER_H_
+#define ICING_INDEX_PROPERTY_EXISTENCE_INDEXING_HANDLER_H_
+
+#include <memory>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/index.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/store/document-id.h"
+#include "icing/util/clock.h"
+#include "icing/util/tokenized-document.h"
+
+namespace icing {
+namespace lib {
+
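+// Prefix prepended to a property path to form the metadata token that records
+// the property's existence (e.g. kPropertyExistenceTokenPrefix + "propA").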
+inline constexpr std::string_view kPropertyExistenceTokenPrefix =
+ "\xFF_HAS_\xFF";
+
+// This class is meant to be owned by TermIndexingHandler. Instead of using this
+// handler directly, callers should use TermIndexingHandler to index documents.
+//
+// This handler will not check or set last_added_document_id of the index, and
+// it will not merge or sort the lite index either.
+class PropertyExistenceIndexingHandler {
+ public:
+ // Creates a PropertyExistenceIndexingHandler instance which does not take
+ // ownership of any input components. All pointers must refer to valid objects
+ // that outlive the created PropertyExistenceIndexingHandler instance.
+ //
+ // Returns:
+ // - A PropertyExistenceIndexingHandler instance on success
+  //   - FAILED_PRECONDITION_ERROR if any of the input pointers is null
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<PropertyExistenceIndexingHandler>>
+ Create(const Clock* clock, Index* index);
+
+ ~PropertyExistenceIndexingHandler() = default;
+
+ // Handles the property existence indexing process: add hits for metadata
+ // tokens used to index property existence.
+ //
+  // For example, if the passed-in document has string properties "propA",
+  // "propB" and "propC.propD", and document property "propC", this handler
+  // will add the following metadata tokens to the index:
+ // - kPropertyExistenceTokenPrefix + "propA"
+ // - kPropertyExistenceTokenPrefix + "propB"
+ // - kPropertyExistenceTokenPrefix + "propC"
+ // - kPropertyExistenceTokenPrefix + "propC.propD"
+ //
+  // Returns:
+ // - OK on success
+  //  - RESOURCE_EXHAUSTED_ERROR if the index is full and can't add any more
+  //    content.
+ // - INTERNAL_ERROR if any other errors occur.
+ libtextclassifier3::Status Handle(const TokenizedDocument& tokenized_document,
+ DocumentId document_id,
+ PutDocumentStatsProto* put_document_stats);
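+  //
+  // A minimal usage sketch (assuming a valid Clock, Index, TokenizedDocument
+  // and DocumentId are already available):
+  //
+  //   ICING_ASSIGN_OR_RETURN(
+  //       std::unique_ptr<PropertyExistenceIndexingHandler> handler,
+  //       PropertyExistenceIndexingHandler::Create(&clock, index));
+  //   ICING_RETURN_IF_ERROR(handler->Handle(tokenized_document, document_id,
+  //                                         /*put_document_stats=*/nullptr));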
+
+ private:
+ explicit PropertyExistenceIndexingHandler(const Clock& clock, Index* index)
+ : clock_(clock), index_(*index) {}
+
+ const Clock& clock_; // Does not own.
+ Index& index_; // Does not own.
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_PROPERTY_EXISTENCE_INDEXING_HANDLER_H_
diff --git a/icing/index/property-existence-indexing-handler_test.cc b/icing/index/property-existence-indexing-handler_test.cc
new file mode 100644
index 0000000..e42fbc3
--- /dev/null
+++ b/icing/index/property-existence-indexing-handler_test.cc
@@ -0,0 +1,524 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/property-existence-indexing-handler.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/index.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/tokenized-document.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::IsTrue;
+using ::testing::Test;
+
+static constexpr std::string_view kTreeType = "TreeNode";
+static constexpr std::string_view kPropertyName = "name";
+static constexpr std::string_view kPropertyValue = "value";
+static constexpr std::string_view kPropertySubtrees = "subtrees";
+
+static constexpr std::string_view kValueType = "Value";
+static constexpr std::string_view kPropertyBody = "body";
+static constexpr std::string_view kPropertyTimestamp = "timestamp";
+static constexpr std::string_view kPropertyScore = "score";
+
+class PropertyExistenceIndexingHandlerTest : public Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ base_dir_ = GetTestTempDir() + "/icing_test";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ index_dir_ = base_dir_ + "/index";
+ schema_store_dir_ = base_dir_ + "/schema_store";
+ document_store_dir_ = base_dir_ + "/document_store";
+
+ language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ normalizer_,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int32_t>::max()));
+
+ ASSERT_THAT(
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()),
+ IsTrue());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kTreeType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyName)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyValue)
+ .SetDataTypeDocument(
+ kValueType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertySubtrees)
+ .SetDataTypeDocument(
+ kTreeType, /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kValueType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyBody)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyTimestamp)
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyScore)
+ .SetDataType(TYPE_DOUBLE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+
+ ASSERT_TRUE(
+ filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult doc_store_create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false,
+ /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(doc_store_create_result.document_store);
+ }
+
+ void TearDown() override {
+ document_store_.reset();
+ schema_store_.reset();
+ normalizer_.reset();
+ lang_segmenter_.reset();
+
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ FakeClock fake_clock_;
+ std::string base_dir_;
+ std::string index_dir_;
+ std::string schema_store_dir_;
+ std::string document_store_dir_;
+
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> document_store_;
+};
+
+libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
+QueryExistence(Index* index, std::string_view property_path) {
+ return index->GetIterator(
+ absl_ports::StrCat(kPropertyExistenceTokenPrefix, property_path),
+ /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY,
+ /*need_hit_term_frequency=*/false);
+}
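+
+// For example, QueryExistence(index, "subtrees.value") queries the index for
+// the exact term StrCat(kPropertyExistenceTokenPrefix, "subtrees.value").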
+
+std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
+ std::vector<DocHitInfo> infos;
+ while (iterator->Advance().ok()) {
+ infos.push_back(iterator->doc_hit_info());
+ }
+ return infos;
+}
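+
+// Note: the iterator returns hits in descending DocumentId order, so the
+// expectations below list the most recently added documents first.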
+
+TEST_F(PropertyExistenceIndexingHandlerTest, HandlePropertyExistence) {
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(options, &filesystem_, &icing_filesystem_));
+
+ // Create a document with every property.
+ DocumentProto document0 =
+ DocumentBuilder()
+ .SetKey("icing", "uri0")
+ .SetSchema(std::string(kValueType))
+ .AddStringProperty(std::string(kPropertyBody), "foo")
+ .AddInt64Property(std::string(kPropertyTimestamp), 123)
+ .AddDoubleProperty(std::string(kPropertyScore), 456.789)
+ .Build();
+ // Create a document with missing body.
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema(std::string(kValueType))
+ .AddInt64Property(std::string(kPropertyTimestamp), 123)
+ .AddDoubleProperty(std::string(kPropertyScore), 456.789)
+ .Build();
+ // Create a document with missing timestamp.
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema(std::string(kValueType))
+ .AddStringProperty(std::string(kPropertyBody), "foo")
+ .AddDoubleProperty(std::string(kPropertyScore), 456.789)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document0,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document0)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document1,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document2,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id0,
+ document_store_->Put(tokenized_document0.document()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(tokenized_document1.document()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(tokenized_document2.document()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PropertyExistenceIndexingHandler> handler,
+ PropertyExistenceIndexingHandler::Create(&fake_clock_, index.get()));
+
+ // Handle all docs
+ EXPECT_THAT(handler->Handle(tokenized_document0, document_id0,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(handler->Handle(tokenized_document1, document_id1,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(handler->Handle(tokenized_document2, document_id2,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+
+ // Get all documents that have "body".
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ QueryExistence(index.get(), kPropertyBody));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id2, std::vector<SectionId>{0}),
+ EqualsDocHitInfo(document_id0, std::vector<SectionId>{0})));
+
+ // Get all documents that have "timestamp".
+ ICING_ASSERT_OK_AND_ASSIGN(itr,
+ QueryExistence(index.get(), kPropertyTimestamp));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id1, std::vector<SectionId>{0}),
+ EqualsDocHitInfo(document_id0, std::vector<SectionId>{0})));
+
+ // Get all documents that have "score".
+ ICING_ASSERT_OK_AND_ASSIGN(itr, QueryExistence(index.get(), kPropertyScore));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id2, std::vector<SectionId>{0}),
+ EqualsDocHitInfo(document_id1, std::vector<SectionId>{0}),
+ EqualsDocHitInfo(document_id0, std::vector<SectionId>{0})));
+}
+
+TEST_F(PropertyExistenceIndexingHandlerTest, HandleNestedPropertyExistence) {
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(options, &filesystem_, &icing_filesystem_));
+
+ // Create a complex nested root_document with the following property paths.
+ // - name
+ // - subtrees
+ // - subtrees.name
+ // - subtrees.value
+ // - subtrees.value.timestamp
+ // - subtrees.subtrees
+ // - subtrees.subtrees.name
+ // - subtrees.subtrees.value
+ // - subtrees.subtrees.value.body
+ // - subtrees.subtrees.value.score
+ DocumentProto leaf_document =
+ DocumentBuilder()
+ .SetKey("icing", "uri")
+ .SetSchema(std::string(kTreeType))
+ .AddStringProperty(std::string(kPropertyName), "leaf")
+ .AddDocumentProperty(
+ std::string(kPropertyValue),
+ DocumentBuilder()
+ .SetKey("icing", "uri")
+ .SetSchema(std::string(kValueType))
+ .AddStringProperty(std::string(kPropertyBody), "foo")
+ .AddDoubleProperty(std::string(kPropertyScore), 456.789)
+ .Build())
+ .Build();
+ DocumentProto intermediate_document1 =
+ DocumentBuilder()
+ .SetKey("icing", "uri")
+ .SetSchema(std::string(kTreeType))
+ .AddStringProperty(std::string(kPropertyName), "intermediate1")
+ .AddDocumentProperty(
+ std::string(kPropertyValue),
+ DocumentBuilder()
+ .SetKey("icing", "uri")
+ .SetSchema(std::string(kValueType))
+ .AddInt64Property(std::string(kPropertyTimestamp), 123)
+ .Build())
+ .AddDocumentProperty(std::string(kPropertySubtrees), leaf_document)
+ .Build();
+ DocumentProto intermediate_document2 =
+ DocumentBuilder()
+ .SetKey("icing", "uri")
+ .SetSchema(std::string(kTreeType))
+ .AddStringProperty(std::string(kPropertyName), "intermediate2")
+ .Build();
+ DocumentProto root_document =
+ DocumentBuilder()
+ .SetKey("icing", "uri")
+ .SetSchema(std::string(kTreeType))
+ .AddStringProperty(std::string(kPropertyName), "root")
+ .AddDocumentProperty(std::string(kPropertySubtrees),
+ intermediate_document1, intermediate_document2)
+ .Build();
+
+ // Handle root_document
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_root_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(root_document)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(tokenized_root_document.document()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PropertyExistenceIndexingHandler> handler,
+ PropertyExistenceIndexingHandler::Create(&fake_clock_, index.get()));
+ EXPECT_THAT(handler->Handle(tokenized_root_document, document_id,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+
+ // Check that the above property paths can be found by query.
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ QueryExistence(index.get(), "name"));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(itr, QueryExistence(index.get(), "subtrees"));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(itr, QueryExistence(index.get(), "subtrees.name"));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(itr,
+ QueryExistence(index.get(), "subtrees.value"));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, QueryExistence(index.get(), "subtrees.value.timestamp"));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(itr,
+ QueryExistence(index.get(), "subtrees.subtrees"));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, QueryExistence(index.get(), "subtrees.subtrees.name"));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, QueryExistence(index.get(), "subtrees.subtrees.value"));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, QueryExistence(index.get(), "subtrees.subtrees.value.body"));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, QueryExistence(index.get(), "subtrees.subtrees.value.score"));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));
+}
+
+TEST_F(PropertyExistenceIndexingHandlerTest, SingleEmptyStringIsNonExisting) {
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(options, &filesystem_, &icing_filesystem_));
+
+ // Create a document with one empty body.
+ DocumentProto document0 =
+ DocumentBuilder()
+ .SetKey("icing", "uri0")
+ .SetSchema(std::string(kValueType))
+ .AddStringProperty(std::string(kPropertyBody), "")
+ .Build();
+  // Create a document with two empty bodies.
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema(std::string(kValueType))
+ .AddStringProperty(std::string(kPropertyBody), "", "")
+ .Build();
+ // Create a document with one non-empty body.
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema(std::string(kValueType))
+ .AddStringProperty(std::string(kPropertyBody), "foo")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document0,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document0)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document1,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document2,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id0,
+ document_store_->Put(tokenized_document0.document()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(tokenized_document1.document()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(tokenized_document2.document()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PropertyExistenceIndexingHandler> handler,
+ PropertyExistenceIndexingHandler::Create(&fake_clock_, index.get()));
+
+ // Handle all docs
+ EXPECT_THAT(handler->Handle(tokenized_document0, document_id0,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(handler->Handle(tokenized_document1, document_id1,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(handler->Handle(tokenized_document2, document_id2,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+
+  // Check that the documents that have one or two empty bodies are not
+  // considered to have a body property.
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<DocHitInfoIterator> itr,
+ QueryExistence(index.get(), kPropertyBody));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id2, std::vector<SectionId>{0})));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/string-section-indexing-handler.cc b/icing/index/string-section-indexing-handler.cc
new file mode 100644
index 0000000..8b20d04
--- /dev/null
+++ b/icing/index/string-section-indexing-handler.cc
@@ -0,0 +1,114 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/string-section-indexing-handler.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/index.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
+
+namespace icing {
+namespace lib {
+
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<StringSectionIndexingHandler>>
+StringSectionIndexingHandler::Create(const Normalizer* normalizer,
+ Index* index) {
+ ICING_RETURN_ERROR_IF_NULL(normalizer);
+ ICING_RETURN_ERROR_IF_NULL(index);
+
+ return std::unique_ptr<StringSectionIndexingHandler>(
+ new StringSectionIndexingHandler(normalizer, index));
+}
+
+libtextclassifier3::Status StringSectionIndexingHandler::Handle(
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
+ PutDocumentStatsProto* put_document_stats) {
+ uint32_t num_tokens = 0;
+ libtextclassifier3::Status status;
+ for (const TokenizedSection& section :
+ tokenized_document.tokenized_string_sections()) {
+ if (section.metadata.tokenizer ==
+ StringIndexingConfig::TokenizerType::NONE) {
+ ICING_LOG(WARNING)
+ << "Unexpected TokenizerType::NONE found when indexing document.";
+ }
+ // TODO(b/152934343): pass real namespace ids in
+ Index::Editor editor =
+ index_.Edit(document_id, section.metadata.id,
+ section.metadata.term_match_type, /*namespace_id=*/0);
+ for (std::string_view token : section.token_sequence) {
+ ++num_tokens;
+
+ switch (section.metadata.tokenizer) {
+ case StringIndexingConfig::TokenizerType::VERBATIM:
+ // data() is safe to use here because a token created from the
+ // VERBATIM tokenizer is the entire string value. The character at
+ // data() + token.length() is guaranteed to be a null char.
+ status = editor.BufferTerm(token.data());
+ break;
+ case StringIndexingConfig::TokenizerType::NONE:
+ [[fallthrough]];
+ case StringIndexingConfig::TokenizerType::RFC822:
+ [[fallthrough]];
+ case StringIndexingConfig::TokenizerType::URL:
+ [[fallthrough]];
+ case StringIndexingConfig::TokenizerType::PLAIN:
+ std::string normalized_term = normalizer_.NormalizeTerm(token);
+ status = editor.BufferTerm(normalized_term.c_str());
+ }
+
+ if (!status.ok()) {
+ // We've encountered a failure. Bail out. We'll mark this doc as deleted
+ // and signal a failure to the client.
+ ICING_LOG(WARNING) << "Failed to buffer term in lite lexicon due to: "
+ << status.error_message();
+ break;
+ }
+ }
+ if (!status.ok()) {
+ break;
+ }
+ // Add all the seen terms to the index with their term frequency.
+ status = editor.IndexAllBufferedTerms();
+ if (!status.ok()) {
+ ICING_LOG(WARNING) << "Failed to add hits in lite index due to: "
+ << status.error_message();
+ break;
+ }
+ }
+
+ if (put_document_stats != nullptr) {
+ put_document_stats->mutable_tokenization_stats()->set_num_tokens_indexed(
+ num_tokens);
+ }
+
+ return status;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/string-section-indexing-handler.h b/icing/index/string-section-indexing-handler.h
new file mode 100644
index 0000000..8452e9f
--- /dev/null
+++ b/icing/index/string-section-indexing-handler.h
@@ -0,0 +1,77 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_STRING_SECTION_INDEXING_HANDLER_H_
+#define ICING_INDEX_STRING_SECTION_INDEXING_HANDLER_H_
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/index.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/store/document-id.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/tokenized-document.h"
+
+namespace icing {
+namespace lib {
+
+// This class is meant to be owned by TermIndexingHandler. Instead of using this
+// handler directly, callers should use TermIndexingHandler to index documents.
+//
+// This handler will not check or set last_added_document_id of the index, and
+// it will not merge or sort the lite index either.
+class StringSectionIndexingHandler {
+ public:
+ // Creates a StringSectionIndexingHandler instance which does not take
+ // ownership of any input components. All pointers must refer to valid objects
+ // that outlive the created StringSectionIndexingHandler instance.
+ //
+ // Returns:
+ // - A StringSectionIndexingHandler instance on success
+  //   - FAILED_PRECONDITION_ERROR if any of the input pointers is null
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<StringSectionIndexingHandler>>
+ Create(const Normalizer* normalizer, Index* index);
+
+ ~StringSectionIndexingHandler() = default;
+
+  // Handles the string term indexing process: adds hits into the lite index
+  // for all contents in tokenized_document.tokenized_string_sections. Sorting
+  // and merging of the lite index are left to the owning TermIndexingHandler.
+  //
+  // Returns:
+  //   - OK on success
+  //   - RESOURCE_EXHAUSTED_ERROR if the index is full and can't add any more
+  //     content.
+ // - INTERNAL_ERROR if any other errors occur.
+ // - Any main/lite index errors.
+ libtextclassifier3::Status Handle(const TokenizedDocument& tokenized_document,
+ DocumentId document_id,
+ PutDocumentStatsProto* put_document_stats);
+
+ private:
+ explicit StringSectionIndexingHandler(const Normalizer* normalizer,
+ Index* index)
+ : normalizer_(*normalizer), index_(*index) {}
+
+ const Normalizer& normalizer_; // Does not own.
+ Index& index_; // Does not own.
+};
+
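+// Usage sketch (illustrative only; `normalizer` and `index` stand for any
+// valid instances that outlive the handler):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<StringSectionIndexingHandler> handler,
+//       StringSectionIndexingHandler::Create(normalizer, index));
+//   ICING_RETURN_IF_ERROR(handler->Handle(tokenized_document, document_id,
+//                                         /*put_document_stats=*/nullptr));
+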
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_STRING_SECTION_INDEXING_HANDLER_H_
diff --git a/icing/index/term-indexing-handler.cc b/icing/index/term-indexing-handler.cc
new file mode 100644
index 0000000..7eb9dda
--- /dev/null
+++ b/icing/index/term-indexing-handler.cc
@@ -0,0 +1,146 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/term-indexing-handler.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/index.h"
+#include "icing/index/property-existence-indexing-handler.h"
+#include "icing/index/string-section-indexing-handler.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/store/document-id.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/clock.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
+
+namespace icing {
+namespace lib {
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<TermIndexingHandler>>
+TermIndexingHandler::Create(const Clock* clock, const Normalizer* normalizer,
+ Index* index,
+ bool build_property_existence_metadata_hits) {
+ ICING_RETURN_ERROR_IF_NULL(clock);
+ ICING_RETURN_ERROR_IF_NULL(normalizer);
+ ICING_RETURN_ERROR_IF_NULL(index);
+
+ // Property existence index handler
+ std::unique_ptr<PropertyExistenceIndexingHandler>
+ property_existence_indexing_handler = nullptr;
+ if (build_property_existence_metadata_hits) {
+ ICING_ASSIGN_OR_RETURN(
+ property_existence_indexing_handler,
+ PropertyExistenceIndexingHandler::Create(clock, index));
+ }
+ // String section index handler
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<StringSectionIndexingHandler>
+ string_section_indexing_handler,
+ StringSectionIndexingHandler::Create(normalizer, index));
+
+ return std::unique_ptr<TermIndexingHandler>(new TermIndexingHandler(
+ clock, index, std::move(property_existence_indexing_handler),
+ std::move(string_section_indexing_handler)));
+}
+
+libtextclassifier3::Status TermIndexingHandler::Handle(
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
+ bool recovery_mode, PutDocumentStatsProto* put_document_stats) {
+ std::unique_ptr<Timer> index_timer = clock_.GetNewTimer();
+
+ if (index_.last_added_document_id() != kInvalidDocumentId &&
+ document_id <= index_.last_added_document_id()) {
+ if (recovery_mode) {
+ // Skip the document if document_id <= last_added_document_id in recovery
+ // mode without returning an error.
+ return libtextclassifier3::Status::OK;
+ }
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "DocumentId %d must be greater than last added document_id %d",
+ document_id, index_.last_added_document_id()));
+ }
+ index_.set_last_added_document_id(document_id);
+
+ libtextclassifier3::Status status = libtextclassifier3::Status::OK;
+ if (property_existence_indexing_handler_ != nullptr) {
+ status = property_existence_indexing_handler_->Handle(
+ tokenized_document, document_id, put_document_stats);
+ }
+ if (status.ok()) {
+ status = string_section_indexing_handler_->Handle(
+ tokenized_document, document_id, put_document_stats);
+ }
+
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_term_index_latency_ms(
+ index_timer->GetElapsedMilliseconds());
+ }
+
+  // Merge if the index wants a merge and indexing either succeeded or failed
+  // with RESOURCE_EXHAUSTED.
+ bool should_merge =
+ (status.ok() || absl_ports::IsResourceExhausted(status)) &&
+ index_.WantsMerge();
+
+ // Check and sort the LiteIndex HitBuffer if we don't need to merge.
+ if (!should_merge && index_.LiteIndexNeedSort()) {
+ std::unique_ptr<Timer> sort_timer = clock_.GetNewTimer();
+ index_.SortLiteIndex();
+
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_lite_index_sort_latency_ms(
+ sort_timer->GetElapsedMilliseconds());
+ }
+ }
+
+ // Attempt index merge if needed.
+ if (should_merge) {
+ ICING_LOG(INFO) << "Merging the index at docid " << document_id << ".";
+
+ std::unique_ptr<Timer> merge_timer = clock_.GetNewTimer();
+ libtextclassifier3::Status merge_status = index_.Merge();
+
+ if (!merge_status.ok()) {
+ ICING_LOG(ERROR) << "Index merging failed. Clearing index.";
+ if (!index_.Reset().ok()) {
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Unable to reset to clear index after merge failure. Merge "
+ "failure=%d:%s",
+ merge_status.error_code(), merge_status.error_message().c_str()));
+ } else {
+ return absl_ports::DataLossError(IcingStringUtil::StringPrintf(
+ "Forced to reset index after merge failure. Merge failure=%d:%s",
+ merge_status.error_code(), merge_status.error_message().c_str()));
+ }
+ }
+
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_index_merge_latency_ms(
+ merge_timer->GetElapsedMilliseconds());
+ }
+ }
+ return status;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/term-indexing-handler.h b/icing/index/term-indexing-handler.h
new file mode 100644
index 0000000..c055bbf
--- /dev/null
+++ b/icing/index/term-indexing-handler.h
@@ -0,0 +1,97 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_INDEX_TERM_INDEXING_HANDLER_H_
+#define ICING_INDEX_TERM_INDEXING_HANDLER_H_
+
+#include <memory>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/data-indexing-handler.h"
+#include "icing/index/index.h"
+#include "icing/index/property-existence-indexing-handler.h"
+#include "icing/index/string-section-indexing-handler.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/store/document-id.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/clock.h"
+#include "icing/util/tokenized-document.h"
+
+namespace icing {
+namespace lib {
+
+class TermIndexingHandler : public DataIndexingHandler {
+ public:
+ // Creates a TermIndexingHandler instance which does not take
+ // ownership of any input components. All pointers must refer to valid objects
+ // that outlive the created TermIndexingHandler instance.
+ //
+ // Returns:
+ // - A TermIndexingHandler instance on success
+  //   - FAILED_PRECONDITION_ERROR if any of the input pointers is null
+ static libtextclassifier3::StatusOr<std::unique_ptr<TermIndexingHandler>>
+ Create(const Clock* clock, const Normalizer* normalizer, Index* index,
+ bool build_property_existence_metadata_hits);
+
+ ~TermIndexingHandler() override = default;
+
+ // Handles term indexing process:
+ // - Checks if document_id > last_added_document_id.
+ // - Updates last_added_document_id to document_id.
+  //   - Runs PropertyExistenceIndexingHandler (if enabled).
+  //   - Runs StringSectionIndexingHandler.
+ // - Sorts the lite index if necessary.
+ // - Merges the lite index into the main index if necessary.
+ //
+  // Returns:
+  //   - OK on success
+  //   - INVALID_ARGUMENT_ERROR if document_id is less than or equal to the
+  //     document_id of a previously indexed document in non-recovery mode.
+  //   - RESOURCE_EXHAUSTED_ERROR if the index is full and can't add any more
+  //     content.
+ // - DATA_LOSS_ERROR if an attempt to merge the index fails and both indices
+ // are cleared as a result.
+ // - INTERNAL_ERROR if any other errors occur.
+ // - Any main/lite index errors.
+ libtextclassifier3::Status Handle(
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
+ bool recovery_mode, PutDocumentStatsProto* put_document_stats) override;
+
+ private:
+ explicit TermIndexingHandler(const Clock* clock, Index* index,
+ std::unique_ptr<PropertyExistenceIndexingHandler>
+ property_existence_indexing_handler,
+ std::unique_ptr<StringSectionIndexingHandler>
+ string_section_indexing_handler)
+ : DataIndexingHandler(clock),
+ index_(*index),
+ property_existence_indexing_handler_(
+ std::move(property_existence_indexing_handler)),
+ string_section_indexing_handler_(
+ std::move(string_section_indexing_handler)) {}
+
+ Index& index_; // Does not own.
+
+ std::unique_ptr<PropertyExistenceIndexingHandler>
+ property_existence_indexing_handler_; // Nullable
+ std::unique_ptr<StringSectionIndexingHandler>
+ string_section_indexing_handler_;
+};
+
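+// Usage sketch (illustrative; mirrors the unit tests):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<TermIndexingHandler> handler,
+//       TermIndexingHandler::Create(
+//           clock, normalizer, index,
+//           /*build_property_existence_metadata_hits=*/true));
+//   ICING_RETURN_IF_ERROR(handler->Handle(tokenized_document, document_id,
+//                                         /*recovery_mode=*/false,
+//                                         /*put_document_stats=*/nullptr));
+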
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_INDEX_TERM_INDEXING_HANDLER_H_
diff --git a/icing/index/term-indexing-handler_test.cc b/icing/index/term-indexing-handler_test.cc
new file mode 100644
index 0000000..1b03865
--- /dev/null
+++ b/icing/index/term-indexing-handler_test.cc
@@ -0,0 +1,664 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/index/term-indexing-handler.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/index.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/property-existence-indexing-handler.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/tokenized-document.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+using ::testing::Test;
+
+// Schema type with indexable properties and section Id.
+// Section Id is determined by the lexicographical order of indexable property
+// paths.
+// Section id = 0: body
+// Section id = 1: title
+constexpr std::string_view kFakeType = "FakeType";
+constexpr std::string_view kPropertyBody = "body";
+constexpr std::string_view kPropertyTitle = "title";
+
+constexpr SectionId kSectionIdBody = 0;
+constexpr SectionId kSectionIdTitle = 1;
+
+// Schema type with nested indexable properties and section Id.
+// Section id = 0: "name"
+// Section id = 1: "nested.body"
+// Section id = 3: "nested.title"
+// Section id = 4: "subject"
+constexpr std::string_view kNestedType = "NestedType";
+constexpr std::string_view kPropertyName = "name";
+constexpr std::string_view kPropertyNestedDoc = "nested";
+constexpr std::string_view kPropertySubject = "subject";
+
+constexpr SectionId kSectionIdNestedBody = 1;
+
+class TermIndexingHandlerTest : public Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ base_dir_ = GetTestTempDir() + "/icing_test";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ index_dir_ = base_dir_ + "/index";
+ schema_store_dir_ = base_dir_ + "/schema_store";
+ document_store_dir_ = base_dir_ + "/document_store";
+
+ language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ normalizer_,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int32_t>::max()));
+
+ ASSERT_THAT(
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()),
+ IsTrue());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kFakeType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyTitle)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyBody)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kNestedType)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyNestedDoc)
+ .SetDataTypeDocument(
+ kFakeType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertySubject)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyName)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ASSERT_TRUE(
+ filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult doc_store_create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false,
+ /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(doc_store_create_result.document_store);
+ }
+
+ void TearDown() override {
+ document_store_.reset();
+ schema_store_.reset();
+ normalizer_.reset();
+ lang_segmenter_.reset();
+
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ FakeClock fake_clock_;
+ std::string base_dir_;
+ std::string index_dir_;
+ std::string schema_store_dir_;
+ std::string document_store_dir_;
+
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> document_store_;
+};
+
+libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
+QueryExistence(Index* index, std::string_view property_path) {
+ return index->GetIterator(
+ absl_ports::StrCat(kPropertyExistenceTokenPrefix, property_path),
+ /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY,
+ /*need_hit_term_frequency=*/false);
+}
+
+std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
+ std::vector<DocHitInfo> infos;
+ while (iterator->Advance().ok()) {
+ infos.push_back(iterator->doc_hit_info());
+ }
+ return infos;
+}
+
+std::vector<DocHitInfoTermFrequencyPair> GetHitsWithTermFrequency(
+ std::unique_ptr<DocHitInfoIterator> iterator) {
+ std::vector<DocHitInfoTermFrequencyPair> infos;
+ while (iterator->Advance().ok()) {
+ std::vector<TermMatchInfo> matched_terms_stats;
+ iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ for (const TermMatchInfo& term_match_info : matched_terms_stats) {
+ infos.push_back(DocHitInfoTermFrequencyPair(
+ iterator->doc_hit_info(), term_match_info.term_frequencies));
+ }
+ }
+ return infos;
+}
+
+TEST_F(TermIndexingHandlerTest, HandleBothStringSectionAndPropertyExistence) {
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(options, &filesystem_, &icing_filesystem_));
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle), "foo")
+ .AddStringProperty(std::string(kPropertyBody), "")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(tokenized_document.document()));
+
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIndexingHandler> handler,
+ TermIndexingHandler::Create(
+ &fake_clock_, normalizer_.get(), index.get(),
+ /*build_property_existence_metadata_hits=*/true));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+
+ EXPECT_THAT(index->last_added_document_id(), Eq(document_id));
+
+ // Query 'foo'
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
+ {kSectionIdTitle, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ document_id, expected_map)));
+
+ // Query for "title" property existence.
+ ICING_ASSERT_OK_AND_ASSIGN(itr, QueryExistence(index.get(), kPropertyTitle));
+ EXPECT_THAT(
+ GetHits(std::move(itr)),
+ ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));
+
+ // Query for "body" property existence.
+ ICING_ASSERT_OK_AND_ASSIGN(itr, QueryExistence(index.get(), kPropertyBody));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
+
+TEST_F(TermIndexingHandlerTest,
+ HandleIntoLiteIndex_sortInIndexingNotTriggered) {
+ Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(options, &filesystem_, &icing_filesystem_));
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle), "foo")
+ .AddStringProperty(std::string(kPropertyBody), "foo bar baz")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(tokenized_document.document()));
+
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIndexingHandler> handler,
+ TermIndexingHandler::Create(
+ &fake_clock_, normalizer_.get(), index.get(),
+ /*build_property_existence_metadata_hits=*/true));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+
+ EXPECT_THAT(index->last_added_document_id(), Eq(document_id));
+
+ // Query 'foo'
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+ std::vector<DocHitInfoTermFrequencyPair> hits =
+ GetHitsWithTermFrequency(std::move(itr));
+ std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
+ {kSectionIdTitle, 1}, {kSectionIdBody, 1}};
+ EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
+ document_id, expected_map)));
+
+ // Query 'foo' with sectionId mask that masks all results
+ ICING_ASSERT_OK_AND_ASSIGN(
+ itr, index->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, 1U << 2,
+ TermMatchType::EXACT_ONLY));
+ EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
+}
+
+TEST_F(TermIndexingHandlerTest, HandleIntoLiteIndex_sortInIndexingTriggered) {
+  // Create the LiteIndex with a smaller sort threshold. At 64 bytes we sort
+  // the HitBuffer after inserting 8 hits.
+ Index::Options options(index_dir_,
+ /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(options, &filesystem_, &icing_filesystem_));
+
+ DocumentProto document0 =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle), "foo foo foo")
+ .AddStringProperty(std::string(kPropertyBody), "foo bar baz")
+ .Build();
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle), "bar baz baz")
+ .AddStringProperty(std::string(kPropertyBody), "foo foo baz")
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("icing", "nested_type/0")
+ .SetSchema(std::string(kNestedType))
+ .AddDocumentProperty(std::string(kPropertyNestedDoc), document1)
+ .AddStringProperty(std::string(kPropertyName), "qux")
+ .AddStringProperty(std::string(kPropertySubject), "bar bar")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document0,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document0)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id0,
+ document_store_->Put(tokenized_document0.document()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document1,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(tokenized_document1.document()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document2,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(tokenized_document2.document()));
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIndexingHandler> handler,
+ TermIndexingHandler::Create(
+ &fake_clock_, normalizer_.get(), index.get(),
+ /*build_property_existence_metadata_hits=*/true));
+
+  // Handle doc0 and doc1. The LiteIndex should sort its HitBuffer after
+  // adding these docs.
+ EXPECT_THAT(handler->Handle(tokenized_document0, document_id0,
+ /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(handler->Handle(tokenized_document1, document_id1,
+ /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(index->last_added_document_id(), Eq(document_id1));
+ EXPECT_THAT(index->LiteIndexNeedSort(), IsFalse());
+
+  // Handle doc2. The LiteIndex should have an unsorted portion after adding
+  // this doc.
+ EXPECT_THAT(handler->Handle(tokenized_document2, document_id2,
+ /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(index->last_added_document_id(), Eq(document_id2));
+
+ // Hits in the hit buffer:
+ // <term>: {(docId, sectionId, term_freq)...}
+ // foo: {(0, kSectionIdTitle, 3); (0, kSectionIdBody, 1);
+ // (1, kSectionIdBody, 2);
+ // (2, kSectionIdNestedBody, 2)}
+ // bar: {(0, kSectionIdBody, 1);
+ // (1, kSectionIdTitle, 1);
+ // (2, kSectionIdNestedTitle, 1); (2, kSectionIdSubject, 2)}
+ // baz: {(0, kSectionIdBody, 1);
+ // (1, kSectionIdTitle, 2); (1, kSectionIdBody, 1),
+ // (2, kSectionIdNestedTitle, 2); (2, kSectionIdNestedBody, 1)}
+ // qux: {(2, kSectionIdName, 1)}
+
+ // Query 'foo'
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+
+ // Advance the iterator and verify that we're returning hits in the correct
+ // order (i.e. in descending order of DocId)
+ ASSERT_THAT(itr->Advance(), IsOk());
+ EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(2));
+ EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(),
+ Eq(1U << kSectionIdNestedBody));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map2 = {{kSectionIdNestedBody, 2}};
+ itr->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "foo", expected_section_ids_tf_map2)));
+
+ ASSERT_THAT(itr->Advance(), IsOk());
+ EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(1));
+ EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(),
+ Eq(1U << kSectionIdBody));
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1 = {{kSectionIdBody, 2}};
+ matched_terms_stats.clear();
+ itr->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "foo", expected_section_ids_tf_map1)));
+
+ ASSERT_THAT(itr->Advance(), IsOk());
+ EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(0));
+ EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(),
+ Eq(1U << kSectionIdTitle | 1U << kSectionIdBody));
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map0 = {{kSectionIdTitle, 3},
+ {kSectionIdBody, 1}};
+ matched_terms_stats.clear();
+ itr->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "foo", expected_section_ids_tf_map0)));
+}
+
+TEST_F(TermIndexingHandlerTest, HandleIntoLiteIndex_enableSortInIndexing) {
+  // Create the LiteIndex with a 64-byte sort threshold (8 hits), but with
+  // sort-at-indexing initially disabled.
+ Index::Options options(index_dir_,
+ /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/false,
+ /*lite_index_sort_size=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Index> index,
+ Index::Create(options, &filesystem_, &icing_filesystem_));
+
+ DocumentProto document0 =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/0")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle), "foo foo foo")
+ .AddStringProperty(std::string(kPropertyBody), "foo bar baz")
+ .Build();
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyTitle), "bar baz baz")
+ .AddStringProperty(std::string(kPropertyBody), "foo foo baz")
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("icing", "nested_type/0")
+ .SetSchema(std::string(kNestedType))
+ .AddDocumentProperty(std::string(kPropertyNestedDoc), document1)
+ .AddStringProperty(std::string(kPropertyName), "qux")
+ .AddStringProperty(std::string(kPropertySubject), "bar bar")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document0,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document0)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id0,
+ document_store_->Put(tokenized_document0.document()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document1,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(tokenized_document1.document()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document2,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(tokenized_document2.document()));
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<TermIndexingHandler> handler,
+ TermIndexingHandler::Create(
+ &fake_clock_, normalizer_.get(), index.get(),
+ /*build_property_existence_metadata_hits=*/true));
+
+ // Handle all docs
+ EXPECT_THAT(handler->Handle(tokenized_document0, document_id0,
+ /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(handler->Handle(tokenized_document1, document_id1,
+ /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(handler->Handle(tokenized_document2, document_id2,
+ /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(index->last_added_document_id(), Eq(document_id2));
+
+ // We've disabled sorting during indexing so the HitBuffer's unsorted section
+ // should exceed the sort threshold. PersistToDisk and reinitialize the
+ // LiteIndex with sort_at_indexing=true.
+ ASSERT_THAT(index->PersistToDisk(), IsOk());
+ options = Index::Options(index_dir_,
+ /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/64);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index, Index::Create(options, &filesystem_, &icing_filesystem_));
+
+ // Verify that the HitBuffer has been sorted after initializing with
+ // sort_at_indexing enabled.
+ EXPECT_THAT(index->LiteIndexNeedSort(), IsFalse());
+
+ // Hits in the hit buffer:
+ // <term>: {(docId, sectionId, term_freq)...}
+ // foo: {(0, kSectionIdTitle, 3); (0, kSectionIdBody, 1);
+ // (1, kSectionIdBody, 2);
+ // (2, kSectionIdNestedBody, 2)}
+ // bar: {(0, kSectionIdBody, 1);
+ // (1, kSectionIdTitle, 1);
+ // (2, kSectionIdNestedTitle, 1); (2, kSectionIdSubject, 2)}
+ // baz: {(0, kSectionIdBody, 1);
+ // (1, kSectionIdTitle, 2); (1, kSectionIdBody, 1),
+ // (2, kSectionIdNestedTitle, 2); (2, kSectionIdNestedBody, 1)}
+ // qux: {(2, kSectionIdName, 1)}
+
+ // Query 'foo'
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DocHitInfoIterator> itr,
+ index->GetIterator("foo", /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY));
+
+ // Advance the iterator and verify that we're returning hits in the correct
+ // order (i.e. in descending order of DocId)
+ ASSERT_THAT(itr->Advance(), IsOk());
+ EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(2));
+ EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(),
+ Eq(1U << kSectionIdNestedBody));
+ std::vector<TermMatchInfo> matched_terms_stats;
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map2 = {{kSectionIdNestedBody, 2}};
+ itr->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "foo", expected_section_ids_tf_map2)));
+
+ ASSERT_THAT(itr->Advance(), IsOk());
+ EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(1));
+ EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(),
+ Eq(1U << kSectionIdBody));
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map1 = {{kSectionIdBody, 2}};
+ matched_terms_stats.clear();
+ itr->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "foo", expected_section_ids_tf_map1)));
+
+ ASSERT_THAT(itr->Advance(), IsOk());
+ EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(0));
+ EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(),
+ Eq(1U << kSectionIdTitle | 1U << kSectionIdBody));
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map0 = {{kSectionIdTitle, 3},
+ {kSectionIdBody, 1}};
+ matched_terms_stats.clear();
+ itr->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "foo", expected_section_ids_tf_map0)));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/index/term-metadata.h b/icing/index/term-metadata.h
index c1c1564..09c59ae 100644
--- a/icing/index/term-metadata.h
+++ b/icing/index/term-metadata.h
@@ -22,14 +22,17 @@ namespace lib {
// A POD struct storing metadata of a term.
struct TermMetadata {
- TermMetadata(std::string content_in, int hit_count_in)
- : content(std::move(content_in)), hit_count(hit_count_in) {}
+ TermMetadata(std::string content_in, int score_in)
+ : content(std::move(content_in)), score(score_in) {}
// Content of the term.
std::string content;
- // Number of document hits associated with the term.
- int hit_count;
+ // The score of the term.
+ // It will either be:
+  // - HIT_COUNT: the number of document+section hits associated with the term
+  // - TERM_FREQUENCY: the number of times that the term appears in documents
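+  // For example (illustrative): TermMetadata("foo", /*score_in=*/3) means
+  // "foo" has 3 document+section hits under HIT_COUNT, or appears 3 times in
+  // documents under TERM_FREQUENCY.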
+ int score;
};
} // namespace lib
diff --git a/icing/jni.lds b/icing/jni.lds
new file mode 100644
index 0000000..64fae36
--- /dev/null
+++ b/icing/jni.lds
@@ -0,0 +1,9 @@
+VERS_1.0 {
+ # Export JNI symbols.
+ global:
+ JNI_OnLoad;
+
+ # Hide everything else
+ local:
+ *;
+};
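+
+# Illustrative note (build-setup assumption): a version script like this is
+# typically applied with a linker flag such as
+#   -Wl,--version-script=icing/jni.lds
+# so that the resulting shared library exports only JNI_OnLoad.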
diff --git a/icing/jni/icing-search-engine-jni.cc b/icing/jni/icing-search-engine-jni.cc
index b1b5420..a0883fa 100644
--- a/icing/jni/icing-search-engine-jni.cc
+++ b/icing/jni/icing-search-engine-jni.cc
@@ -15,11 +15,12 @@
#include <jni.h>
#include <string>
+#include <utility>
-#include "icing/jni/jni-cache.h"
-#include <google/protobuf/message_lite.h>
-#include "icing/absl_ports/status_imports.h"
#include "icing/icing-search-engine.h"
+#include "icing/jni/jni-cache.h"
+#include "icing/jni/scoped-primitive-array-critical.h"
+#include "icing/jni/scoped-utf-chars.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/initialize.pb.h"
#include "icing/proto/optimize.pb.h"
@@ -27,38 +28,43 @@
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
+#include <google/protobuf/message_lite.h>
namespace {
+
bool ParseProtoFromJniByteArray(JNIEnv* env, jbyteArray bytes,
google::protobuf::MessageLite* protobuf) {
- int bytes_size = env->GetArrayLength(bytes);
- uint8_t* bytes_ptr = static_cast<uint8_t*>(
- env->GetPrimitiveArrayCritical(bytes, /*isCopy=*/nullptr));
- bool parsed = protobuf->ParseFromArray(bytes_ptr, bytes_size);
- env->ReleasePrimitiveArrayCritical(bytes, bytes_ptr, /*mode=*/0);
-
- return parsed;
+ icing::lib::ScopedPrimitiveArrayCritical<uint8_t> scoped_array(env, bytes);
+ return protobuf->ParseFromArray(scoped_array.data(), scoped_array.size());
}
-jbyteArray SerializeProtoToJniByteArray(
- JNIEnv* env, const google::protobuf::MessageLite& protobuf) {
+jbyteArray SerializeProtoToJniByteArray(JNIEnv* env,
+ const google::protobuf::MessageLite& protobuf) {
int size = protobuf.ByteSizeLong();
jbyteArray ret = env->NewByteArray(size);
if (ret == nullptr) {
- ICING_LOG(ERROR) << "Failed to allocated bytes for jni protobuf";
+ ICING_LOG(icing::lib::ERROR)
+ << "Failed to allocated bytes for jni protobuf";
return nullptr;
}
- uint8_t* ret_buf = static_cast<uint8_t*>(
- env->GetPrimitiveArrayCritical(ret, /*isCopy=*/nullptr));
- protobuf.SerializeWithCachedSizesToArray(ret_buf);
- env->ReleasePrimitiveArrayCritical(ret, ret_buf, 0);
+ icing::lib::ScopedPrimitiveArrayCritical<uint8_t> scoped_array(env, ret);
+ protobuf.SerializeWithCachedSizesToArray(scoped_array.data());
return ret;
}
-icing::lib::IcingSearchEngine* GetIcingSearchEnginePointer(
- jlong native_pointer) {
+struct {
+ jfieldID native_pointer;
+} JavaIcingSearchEngineImpl;
+
+icing::lib::IcingSearchEngine* GetIcingSearchEnginePointer(JNIEnv* env,
+ jobject object) {
+ jlong native_pointer =
+ env->GetLongField(object, JavaIcingSearchEngineImpl.native_pointer);
return reinterpret_cast<icing::lib::IcingSearchEngine*>(native_pointer);
}
@@ -66,23 +72,12 @@ icing::lib::IcingSearchEngine* GetIcingSearchEnginePointer(
extern "C" {
-jint JNI_OnLoad(JavaVM* vm, void* reserved) {
- JNIEnv* env;
- if (vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6) != JNI_OK) {
- ICING_LOG(ERROR) << "ERROR: GetEnv failed";
- return JNI_ERR;
- }
-
- return JNI_VERSION_1_6;
-}
-
-JNIEXPORT jlong JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeCreate(
- JNIEnv* env, jclass clazz, jbyteArray icing_search_engine_options_bytes) {
+jlong nativeCreate(JNIEnv* env, jclass clazz,
+ jbyteArray icing_search_engine_options_bytes) {
icing::lib::IcingSearchEngineOptions options;
if (!ParseProtoFromJniByteArray(env, icing_search_engine_options_bytes,
&options)) {
- ICING_LOG(ERROR)
+ ICING_LOG(icing::lib::ERROR)
<< "Failed to parse IcingSearchEngineOptions in nativeCreate";
return 0;
}
@@ -96,11 +91,15 @@ Java_com_google_android_icing_IcingSearchEngine_nativeCreate(
return reinterpret_cast<jlong>(icing);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeInitialize(
- JNIEnv* env, jclass clazz, jlong native_pointer) {
+void nativeDestroy(JNIEnv* env, jclass clazz, jobject object) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
+ delete icing;
+}
+
+jbyteArray nativeInitialize(JNIEnv* env, jclass clazz, jobject object) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(env, object);
icing::lib::InitializeResultProto initialize_result_proto =
icing->Initialize();
@@ -108,16 +107,16 @@ Java_com_google_android_icing_IcingSearchEngine_nativeInitialize(
return SerializeProtoToJniByteArray(env, initialize_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeSetSchema(
- JNIEnv* env, jclass clazz, jlong native_pointer, jbyteArray schema_bytes,
- jboolean ignore_errors_and_delete_documents) {
+jbyteArray nativeSetSchema(JNIEnv* env, jclass clazz, jobject object,
+ jbyteArray schema_bytes,
+ jboolean ignore_errors_and_delete_documents) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
icing::lib::SchemaProto schema_proto;
if (!ParseProtoFromJniByteArray(env, schema_bytes, &schema_proto)) {
- ICING_LOG(ERROR) << "Failed to parse SchemaProto in nativeSetSchema";
+ ICING_LOG(icing::lib::ERROR)
+ << "Failed to parse SchemaProto in nativeSetSchema";
return nullptr;
}
@@ -127,41 +126,36 @@ Java_com_google_android_icing_IcingSearchEngine_nativeSetSchema(
return SerializeProtoToJniByteArray(env, set_schema_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeGetSchema(
- JNIEnv* env, jclass clazz, jlong native_pointer) {
+jbyteArray nativeGetSchema(JNIEnv* env, jclass clazz, jobject object) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
icing::lib::GetSchemaResultProto get_schema_result_proto = icing->GetSchema();
return SerializeProtoToJniByteArray(env, get_schema_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeGetSchemaType(
- JNIEnv* env, jclass clazz, jlong native_pointer, jstring schema_type) {
+jbyteArray nativeGetSchemaType(JNIEnv* env, jclass clazz, jobject object,
+ jstring schema_type) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
- const char* native_schema_type =
- env->GetStringUTFChars(schema_type, /*isCopy=*/nullptr);
+ icing::lib::ScopedUtfChars scoped_schema_type_chars(env, schema_type);
icing::lib::GetSchemaTypeResultProto get_schema_type_result_proto =
- icing->GetSchemaType(native_schema_type);
+ icing->GetSchemaType(scoped_schema_type_chars.c_str());
return SerializeProtoToJniByteArray(env, get_schema_type_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativePut(
- JNIEnv* env, jclass clazz, jlong native_pointer,
- jbyteArray document_bytes) {
+jbyteArray nativePut(JNIEnv* env, jclass clazz, jobject object,
+ jbyteArray document_bytes) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
icing::lib::DocumentProto document_proto;
if (!ParseProtoFromJniByteArray(env, document_bytes, &document_proto)) {
- ICING_LOG(ERROR) << "Failed to parse DocumentProto in nativePut";
+ ICING_LOG(icing::lib::ERROR)
+ << "Failed to parse DocumentProto in nativePut";
return nullptr;
}
@@ -171,127 +165,220 @@ Java_com_google_android_icing_IcingSearchEngine_nativePut(
return SerializeProtoToJniByteArray(env, put_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeGet(
- JNIEnv* env, jclass clazz, jlong native_pointer, jstring name_space,
- jstring uri) {
+jbyteArray nativeGet(JNIEnv* env, jclass clazz, jobject object,
+ jstring name_space, jstring uri,
+ jbyteArray result_spec_bytes) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
- const char* native_name_space =
- env->GetStringUTFChars(name_space, /*isCopy=*/nullptr);
- const char* native_uri = env->GetStringUTFChars(uri, /*isCopy=*/nullptr);
+ icing::lib::GetResultSpecProto get_result_spec;
+ if (!ParseProtoFromJniByteArray(env, result_spec_bytes, &get_result_spec)) {
+ ICING_LOG(icing::lib::ERROR)
+ << "Failed to parse GetResultSpecProto in nativeGet";
+ return nullptr;
+ }
+ icing::lib::ScopedUtfChars scoped_name_space_chars(env, name_space);
+ icing::lib::ScopedUtfChars scoped_uri_chars(env, uri);
icing::lib::GetResultProto get_result_proto =
- icing->Get(native_name_space, native_uri);
+ icing->Get(scoped_name_space_chars.c_str(), scoped_uri_chars.c_str(),
+ get_result_spec);
return SerializeProtoToJniByteArray(env, get_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeSearch(
- JNIEnv* env, jclass clazz, jlong native_pointer,
- jbyteArray search_spec_bytes, jbyteArray scoring_spec_bytes,
- jbyteArray result_spec_bytes) {
+jbyteArray nativeReportUsage(JNIEnv* env, jclass clazz, jobject object,
+ jbyteArray usage_report_bytes) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
+
+ icing::lib::UsageReport usage_report;
+ if (!ParseProtoFromJniByteArray(env, usage_report_bytes, &usage_report)) {
+ ICING_LOG(icing::lib::ERROR)
+ << "Failed to parse UsageReport in nativeReportUsage";
+ return nullptr;
+ }
+
+ icing::lib::ReportUsageResultProto report_usage_result_proto =
+ icing->ReportUsage(usage_report);
+
+ return SerializeProtoToJniByteArray(env, report_usage_result_proto);
+}
+
+jbyteArray nativeGetAllNamespaces(JNIEnv* env, jclass clazz, jobject object) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(env, object);
+
+ icing::lib::GetAllNamespacesResultProto get_all_namespaces_result_proto =
+ icing->GetAllNamespaces();
+
+ return SerializeProtoToJniByteArray(env, get_all_namespaces_result_proto);
+}
+
+jbyteArray nativeGetNextPage(JNIEnv* env, jclass clazz, jobject object,
+ jlong next_page_token,
+ jlong java_to_native_start_timestamp_ms) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(env, object);
+
+ const std::unique_ptr<const icing::lib::Clock> clock =
+ std::make_unique<icing::lib::Clock>();
+ int32_t java_to_native_jni_latency_ms =
+ clock->GetSystemTimeMilliseconds() - java_to_native_start_timestamp_ms;
+
+ icing::lib::SearchResultProto next_page_result_proto =
+ icing->GetNextPage(next_page_token);
+
+ icing::lib::QueryStatsProto* query_stats =
+ next_page_result_proto.mutable_query_stats();
+ query_stats->set_java_to_native_jni_latency_ms(java_to_native_jni_latency_ms);
+ query_stats->set_native_to_java_start_timestamp_ms(
+ clock->GetSystemTimeMilliseconds());
+
+ return SerializeProtoToJniByteArray(env, next_page_result_proto);
+}
+
+void nativeInvalidateNextPageToken(JNIEnv* env, jclass clazz, jobject object,
+ jlong next_page_token) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(env, object);
+
+ icing->InvalidateNextPageToken(next_page_token);
+}
+
+jbyteArray nativeSearch(JNIEnv* env, jclass clazz, jobject object,
+ jbyteArray search_spec_bytes,
+ jbyteArray scoring_spec_bytes,
+ jbyteArray result_spec_bytes,
+ jlong java_to_native_start_timestamp_ms) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(env, object);
icing::lib::SearchSpecProto search_spec_proto;
if (!ParseProtoFromJniByteArray(env, search_spec_bytes, &search_spec_proto)) {
- ICING_LOG(ERROR) << "Failed to parse SearchSpecProto in nativeSearch";
+ ICING_LOG(icing::lib::ERROR)
+ << "Failed to parse SearchSpecProto in nativeSearch";
return nullptr;
}
icing::lib::ScoringSpecProto scoring_spec_proto;
if (!ParseProtoFromJniByteArray(env, scoring_spec_bytes,
&scoring_spec_proto)) {
- ICING_LOG(ERROR) << "Failed to parse ScoringSpecProto in nativeSearch";
+ ICING_LOG(icing::lib::ERROR)
+ << "Failed to parse ScoringSpecProto in nativeSearch";
return nullptr;
}
icing::lib::ResultSpecProto result_spec_proto;
if (!ParseProtoFromJniByteArray(env, result_spec_bytes, &result_spec_proto)) {
- ICING_LOG(ERROR) << "Failed to parse ResultSpecProto in nativeSearch";
+ ICING_LOG(icing::lib::ERROR)
+ << "Failed to parse ResultSpecProto in nativeSearch";
return nullptr;
}
+ const std::unique_ptr<const icing::lib::Clock> clock =
+ std::make_unique<icing::lib::Clock>();
+ int32_t java_to_native_jni_latency_ms =
+ clock->GetSystemTimeMilliseconds() - java_to_native_start_timestamp_ms;
+
icing::lib::SearchResultProto search_result_proto =
icing->Search(search_spec_proto, scoring_spec_proto, result_spec_proto);
+ icing::lib::QueryStatsProto* query_stats =
+ search_result_proto.mutable_query_stats();
+ query_stats->set_java_to_native_jni_latency_ms(java_to_native_jni_latency_ms);
+ query_stats->set_native_to_java_start_timestamp_ms(
+ clock->GetSystemTimeMilliseconds());
+
return SerializeProtoToJniByteArray(env, search_result_proto);
}
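Worth spelling out the timing handshake used here and in nativeGetNextPage: the Java layer records a wall-clock start timestamp and passes it down, the native side subtracts it from its own clock to produce java_to_native_jni_latency_ms, and then stamps native_to_java_start_timestamp_ms so the Java layer can measure the return hop the same way. The scheme only yields sensible latencies when both layers read the same wall clock, which is presumably why system time in milliseconds is used on both sides.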
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeDelete(
- JNIEnv* env, jclass clazz, jlong native_pointer, jstring name_space,
- jstring uri) {
+jbyteArray nativeDelete(JNIEnv* env, jclass clazz, jobject object,
+ jstring name_space, jstring uri) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
- const char* native_name_space =
- env->GetStringUTFChars(name_space, /*isCopy=*/nullptr);
- const char* native_uri = env->GetStringUTFChars(uri, /*isCopy=*/nullptr);
+ icing::lib::ScopedUtfChars scoped_name_space_chars(env, name_space);
+ icing::lib::ScopedUtfChars scoped_uri_chars(env, uri);
icing::lib::DeleteResultProto delete_result_proto =
- icing->Delete(native_name_space, native_uri);
+ icing->Delete(scoped_name_space_chars.c_str(), scoped_uri_chars.c_str());
return SerializeProtoToJniByteArray(env, delete_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeDeleteByNamespace(
- JNIEnv* env, jclass clazz, jlong native_pointer, jstring name_space) {
+jbyteArray nativeDeleteByNamespace(JNIEnv* env, jclass clazz, jobject object,
+ jstring name_space) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
- const char* native_name_space =
- env->GetStringUTFChars(name_space, /*isCopy=*/nullptr);
+ icing::lib::ScopedUtfChars scoped_name_space_chars(env, name_space);
icing::lib::DeleteByNamespaceResultProto delete_by_namespace_result_proto =
- icing->DeleteByNamespace(native_name_space);
+ icing->DeleteByNamespace(scoped_name_space_chars.c_str());
return SerializeProtoToJniByteArray(env, delete_by_namespace_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeDeleteBySchemaType(
- JNIEnv* env, jclass clazz, jlong native_pointer, jstring schema_type) {
+jbyteArray nativeDeleteBySchemaType(JNIEnv* env, jclass clazz, jobject object,
+ jstring schema_type) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
- const char* native_schema_type =
- env->GetStringUTFChars(schema_type, /*isCopy=*/nullptr);
+ icing::lib::ScopedUtfChars scoped_schema_type_chars(env, schema_type);
icing::lib::DeleteBySchemaTypeResultProto delete_by_schema_type_result_proto =
- icing->DeleteBySchemaType(native_schema_type);
+ icing->DeleteBySchemaType(scoped_schema_type_chars.c_str());
return SerializeProtoToJniByteArray(env, delete_by_schema_type_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativePersistToDisk(
- JNIEnv* env, jclass clazz, jlong native_pointer) {
+jbyteArray nativeDeleteByQuery(JNIEnv* env, jclass clazz, jobject object,
+ jbyteArray search_spec_bytes,
+ jboolean return_deleted_document_info) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
+ icing::lib::SearchSpecProto search_spec_proto;
+ if (!ParseProtoFromJniByteArray(env, search_spec_bytes, &search_spec_proto)) {
+ ICING_LOG(icing::lib::ERROR)
+ << "Failed to parse SearchSpecProto in nativeSearch";
+ return nullptr;
+ }
+ icing::lib::DeleteByQueryResultProto delete_result_proto =
+ icing->DeleteByQuery(search_spec_proto, return_deleted_document_info);
+
+ return SerializeProtoToJniByteArray(env, delete_result_proto);
+}
+
+jbyteArray nativePersistToDisk(JNIEnv* env, jclass clazz, jobject object,
+ jint persist_type_code) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(env, object);
+
+ if (!icing::lib::PersistType::Code_IsValid(persist_type_code)) {
+ ICING_LOG(icing::lib::ERROR)
+ << persist_type_code << " is an invalid value for PersistType::Code";
+ return nullptr;
+ }
+ icing::lib::PersistType::Code persist_type_code_enum =
+ static_cast<icing::lib::PersistType::Code>(persist_type_code);
icing::lib::PersistToDiskResultProto persist_to_disk_result_proto =
- icing->PersistToDisk();
+ icing->PersistToDisk(persist_type_code_enum);
return SerializeProtoToJniByteArray(env, persist_to_disk_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeOptimize(
- JNIEnv* env, jclass clazz, jlong native_pointer) {
+jbyteArray nativeOptimize(JNIEnv* env, jclass clazz, jobject object) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
icing::lib::OptimizeResultProto optimize_result_proto = icing->Optimize();
return SerializeProtoToJniByteArray(env, optimize_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeGetOptimizeInfo(
- JNIEnv* env, jclass clazz, jlong native_pointer) {
+jbyteArray nativeGetOptimizeInfo(JNIEnv* env, jclass clazz, jobject object) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
icing::lib::GetOptimizeInfoResultProto get_optimize_info_result_proto =
icing->GetOptimizeInfo();
@@ -299,15 +386,188 @@ Java_com_google_android_icing_IcingSearchEngine_nativeGetOptimizeInfo(
return SerializeProtoToJniByteArray(env, get_optimize_info_result_proto);
}
-JNIEXPORT jbyteArray JNICALL
-Java_com_google_android_icing_IcingSearchEngine_nativeReset(
- JNIEnv* env, jclass clazz, jlong native_pointer) {
+jbyteArray nativeGetStorageInfo(JNIEnv* env, jclass clazz, jobject object) {
icing::lib::IcingSearchEngine* icing =
- GetIcingSearchEnginePointer(native_pointer);
+ GetIcingSearchEnginePointer(env, object);
+
+ icing::lib::StorageInfoResultProto storage_info_result_proto =
+ icing->GetStorageInfo();
+
+ return SerializeProtoToJniByteArray(env, storage_info_result_proto);
+}
+
+jbyteArray nativeReset(JNIEnv* env, jclass clazz, jobject object) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(env, object);
icing::lib::ResetResultProto reset_result_proto = icing->Reset();
return SerializeProtoToJniByteArray(env, reset_result_proto);
}
+jbyteArray nativeSearchSuggestions(JNIEnv* env, jclass clazz, jobject object,
+ jbyteArray suggestion_spec_bytes) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(env, object);
+
+ icing::lib::SuggestionSpecProto suggestion_spec_proto;
+ if (!ParseProtoFromJniByteArray(env, suggestion_spec_bytes,
+ &suggestion_spec_proto)) {
+ ICING_LOG(icing::lib::ERROR)
+ << "Failed to parse SuggestionSpecProto in nativeSearch";
+ return nullptr;
+ }
+  icing::lib::SuggestionResponse suggestion_response =
+      icing->SearchSuggestions(suggestion_spec_proto);
+
+  return SerializeProtoToJniByteArray(env, suggestion_response);
+}
+
+jbyteArray nativeGetDebugInfo(JNIEnv* env, jclass clazz, jobject object,
+ jint verbosity) {
+ icing::lib::IcingSearchEngine* icing =
+ GetIcingSearchEnginePointer(env, object);
+
+ if (!icing::lib::DebugInfoVerbosity::Code_IsValid(verbosity)) {
+ ICING_LOG(icing::lib::ERROR)
+ << "Invalid value for Debug Info verbosity: " << verbosity;
+ return nullptr;
+ }
+
+ icing::lib::DebugInfoResultProto debug_info_result_proto =
+ icing->GetDebugInfo(
+ static_cast<icing::lib::DebugInfoVerbosity::Code>(verbosity));
+
+ return SerializeProtoToJniByteArray(env, debug_info_result_proto);
+}
+
+jboolean nativeShouldLog(JNIEnv* env, jclass clazz, jshort severity,
+ jshort verbosity) {
+ if (!icing::lib::LogSeverity::Code_IsValid(severity)) {
+ ICING_LOG(icing::lib::ERROR)
+ << "Invalid value for logging severity: " << severity;
+ return false;
+ }
+ return icing::lib::ShouldLog(
+ static_cast<icing::lib::LogSeverity::Code>(severity), verbosity);
+}
+
+jboolean nativeSetLoggingLevel(JNIEnv* env, jclass clazz, jshort severity,
+ jshort verbosity) {
+ if (!icing::lib::LogSeverity::Code_IsValid(severity)) {
+ ICING_LOG(icing::lib::ERROR)
+ << "Invalid value for logging severity: " << severity;
+ return false;
+ }
+ return icing::lib::SetLoggingLevel(
+ static_cast<icing::lib::LogSeverity::Code>(severity), verbosity);
+}
+
+jstring nativeGetLoggingTag(JNIEnv* env, jclass clazz) {
+ return env->NewStringUTF(icing::lib::kIcingLoggingTag);
+}
+
+#pragma clang diagnostic ignored "-Wwrite-strings"
+jint JNI_OnLoad(JavaVM* vm, void* reserved) {
+ JNIEnv* env;
+ if (vm->GetEnv(reinterpret_cast<void**>(&env), JNI_VERSION_1_6) != JNI_OK) {
+ ICING_LOG(icing::lib::ERROR) << "ERROR: GetEnv failed";
+ return JNI_ERR;
+ }
+
+ // Find your class. JNI_OnLoad is called from the correct class loader context
+ // for this to work.
+ jclass java_class =
+ env->FindClass("com/google/android/icing/IcingSearchEngineImpl");
+ if (java_class == nullptr) {
+ return JNI_ERR;
+ }
+ JavaIcingSearchEngineImpl.native_pointer =
+ env->GetFieldID(java_class, "nativePointer", "J");
+
+ // Register your class' native methods.
+ static const JNINativeMethod methods[] = {
+ {"nativeCreate", "([B)J", reinterpret_cast<void*>(nativeCreate)},
+ {"nativeDestroy", "(Lcom/google/android/icing/IcingSearchEngineImpl;)V",
+ reinterpret_cast<void*>(nativeDestroy)},
+ {"nativeInitialize",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B",
+ reinterpret_cast<void*>(nativeInitialize)},
+ {"nativeSetSchema",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;[BZ)[B",
+ reinterpret_cast<void*>(nativeSetSchema)},
+ {"nativeGetSchema",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B",
+ reinterpret_cast<void*>(nativeGetSchema)},
+ {"nativeGetSchemaType",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/String;)[B",
+ reinterpret_cast<void*>(nativeGetSchemaType)},
+ {"nativePut", "(Lcom/google/android/icing/IcingSearchEngineImpl;[B)[B",
+ reinterpret_cast<void*>(nativePut)},
+ {"nativeGet",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/"
+ "String;Ljava/lang/String;[B)[B",
+ reinterpret_cast<void*>(nativeGet)},
+ {"nativeReportUsage",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;[B)[B",
+ reinterpret_cast<void*>(nativeReportUsage)},
+ {"nativeGetAllNamespaces",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B",
+ reinterpret_cast<void*>(nativeGetAllNamespaces)},
+ {"nativeGetNextPage",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;JJ)[B",
+ reinterpret_cast<void*>(nativeGetNextPage)},
+ {"nativeInvalidateNextPageToken",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;J)V",
+ reinterpret_cast<void*>(nativeInvalidateNextPageToken)},
+ {"nativeSearch",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;[B[B[BJ)[B",
+ reinterpret_cast<void*>(nativeSearch)},
+ {"nativeDelete",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/"
+ "String;Ljava/lang/String;)[B",
+ reinterpret_cast<void*>(nativeDelete)},
+ {"nativeDeleteByNamespace",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/String;)[B",
+ reinterpret_cast<void*>(nativeDeleteByNamespace)},
+ {"nativeDeleteBySchemaType",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;Ljava/lang/String;)[B",
+ reinterpret_cast<void*>(nativeDeleteBySchemaType)},
+ {"nativeDeleteByQuery",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;[BZ)[B",
+ reinterpret_cast<void*>(nativeDeleteByQuery)},
+ {"nativePersistToDisk",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;I)[B",
+ reinterpret_cast<void*>(nativePersistToDisk)},
+ {"nativeOptimize", "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B",
+ reinterpret_cast<void*>(nativeOptimize)},
+ {"nativeGetOptimizeInfo",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B",
+ reinterpret_cast<void*>(nativeGetOptimizeInfo)},
+ {"nativeGetStorageInfo",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B",
+ reinterpret_cast<void*>(nativeGetStorageInfo)},
+ {"nativeReset", "(Lcom/google/android/icing/IcingSearchEngineImpl;)[B",
+ reinterpret_cast<void*>(nativeReset)},
+ {"nativeSearchSuggestions",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;[B)[B",
+ reinterpret_cast<void*>(nativeSearchSuggestions)},
+ {"nativeGetDebugInfo",
+ "(Lcom/google/android/icing/IcingSearchEngineImpl;I)[B",
+ reinterpret_cast<void*>(nativeGetDebugInfo)},
+ {"nativeShouldLog", "(SS)Z", reinterpret_cast<void*>(nativeShouldLog)},
+ {"nativeSetLoggingLevel", "(SS)Z",
+ reinterpret_cast<void*>(nativeSetLoggingLevel)},
+ {"nativeGetLoggingTag", "()Ljava/lang/String;",
+ reinterpret_cast<void*>(nativeGetLoggingTag)},
+ };
+ int register_natives_success = env->RegisterNatives(
+ java_class, methods, sizeof(methods) / sizeof(JNINativeMethod));
+ if (register_natives_success != JNI_OK) {
+ return register_natives_success;
+ }
+
+ return JNI_VERSION_1_6;
+}
+
} // extern "C"
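The signature strings in the registration table above use standard JNI descriptor syntax. For example:

    ([B)J                  byte[] -> long
    (SS)Z                  (short, short) -> boolean
    ()Ljava/lang/String;   () -> String
    (Lcom/google/android/icing/IcingSearchEngineImpl;[B)[B
                           (impl object, byte[]) -> byte[]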
diff --git a/icing/jni/jni-cache.cc b/icing/jni/jni-cache.cc
index 58eb8bf..1804b9a 100644
--- a/icing/jni/jni-cache.cc
+++ b/icing/jni/jni-cache.cc
@@ -14,6 +14,8 @@
#include "icing/jni/jni-cache.h"
+#ifdef ICING_REVERSE_JNI_SEGMENTATION
+
#include "icing/text_classifier/lib3/utils/java/jni-base.h"
#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
#include "icing/absl_ports/canonical_errors.h"
@@ -157,8 +159,7 @@ libtextclassifier3::StatusOr<std::unique_ptr<JniCache>> JniCache::Create(
// BreakIteratorBatcher
ICING_GET_CLASS_OR_RETURN_NULL(
- breakiterator,
- "com/google/android/icing/BreakIteratorBatcher");
+ breakiterator, "com/google/android/icing/BreakIteratorBatcher");
ICING_GET_METHOD(breakiterator, constructor, "<init>",
"(Ljava/util/Locale;)V");
ICING_GET_METHOD(breakiterator, settext, "setText", "(Ljava/lang/String;)V");
@@ -214,3 +215,5 @@ JniCache::ConvertToJavaString(const char* utf8_text,
} // namespace lib
} // namespace icing
+
+#endif // ICING_REVERSE_JNI_SEGMENTATION
diff --git a/icing/jni/jni-cache.h b/icing/jni/jni-cache.h
index a5f16c7..3faaed6 100644
--- a/icing/jni/jni-cache.h
+++ b/icing/jni/jni-cache.h
@@ -15,6 +15,16 @@
#ifndef ICING_JNI_JNI_CACHE_H_
#define ICING_JNI_JNI_CACHE_H_
+#ifndef ICING_REVERSE_JNI_SEGMENTATION
+namespace icing {
+namespace lib {
+
+class JniCache {};  // Provide an empty class definition for non-Android builds.
+
+} // namespace lib
+} // namespace icing
+#else // ICING_REVERSE_JNI_SEGMENTATION
+
#include <jni.h>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
@@ -75,4 +85,6 @@ struct JniCache {
} // namespace lib
} // namespace icing
+#endif // !ICING_REVERSE_JNI_SEGMENTATION
+
#endif // ICING_JNI_JNI_CACHE_H_
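ICING_REVERSE_JNI_SEGMENTATION is an ordinary preprocessor symbol, so the real JniCache is only compiled when a build opts in to reverse-JNI segmentation. A sketch of how a build might enable it (assumed for illustration; the build wiring is not shown in this patch):

    -DICING_REVERSE_JNI_SEGMENTATION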
diff --git a/icing/jni/scoped-primitive-array-critical.h b/icing/jni/scoped-primitive-array-critical.h
new file mode 100644
index 0000000..062c145
--- /dev/null
+++ b/icing/jni/scoped-primitive-array-critical.h
@@ -0,0 +1,86 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JNI_SCOPED_PRIMITIVE_ARRAY_CRITICAL_H_
+#define ICING_JNI_SCOPED_PRIMITIVE_ARRAY_CRITICAL_H_
+
+#include <jni.h>
+
+#include <utility>
+
+namespace icing {
+namespace lib {
+
+template <typename T>
+class ScopedPrimitiveArrayCritical {
+ public:
+ ScopedPrimitiveArrayCritical(JNIEnv* env, jarray array)
+ : env_(env), array_(array) {
+ if (array_ == nullptr) {
+ array_critical_ = nullptr;
+ array_critical_size_ = 0;
+ } else {
+ array_critical_size_ = env->GetArrayLength(array);
+ array_critical_ = static_cast<T*>(
+ env->GetPrimitiveArrayCritical(array, /*isCopy=*/nullptr));
+ }
+ }
+
+ ScopedPrimitiveArrayCritical(ScopedPrimitiveArrayCritical&& rhs)
+ : env_(nullptr),
+ array_(nullptr),
+ array_critical_(nullptr),
+ array_critical_size_(0) {
+ Swap(rhs);
+ }
+
+ ScopedPrimitiveArrayCritical(const ScopedPrimitiveArrayCritical&) = delete;
+
+ ScopedPrimitiveArrayCritical& operator=(ScopedPrimitiveArrayCritical&& rhs) {
+ Swap(rhs);
+ return *this;
+ }
+
+ ScopedPrimitiveArrayCritical& operator=(const ScopedPrimitiveArrayCritical&) =
+ delete;
+
+ ~ScopedPrimitiveArrayCritical() {
+ if (array_critical_ != nullptr && array_ != nullptr) {
+ env_->ReleasePrimitiveArrayCritical(array_, array_critical_, /*mode=*/0);
+ }
+ }
+
+ T* data() { return array_critical_; }
+ const T* data() const { return array_critical_; }
+
+ size_t size() const { return array_critical_size_; }
+
+ private:
+ void Swap(ScopedPrimitiveArrayCritical& other) {
+ std::swap(env_, other.env_);
+ std::swap(array_, other.array_);
+ std::swap(array_critical_, other.array_critical_);
+ std::swap(array_critical_size_, other.array_critical_size_);
+ }
+
+ JNIEnv* env_;
+ jarray array_;
+ T* array_critical_;
+ size_t array_critical_size_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JNI_SCOPED_PRIMITIVE_ARRAY_CRITICAL_H_
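One JNI constraint worth noting for this wrapper: between GetPrimitiveArrayCritical and ReleasePrimitiveArrayCritical the thread is in a critical section, so the enclosing code must not make other JNI calls or block. The RAII scope makes that window explicit; a minimal sketch, where proto is a hypothetical google::protobuf::MessageLite*:

    {
      icing::lib::ScopedPrimitiveArrayCritical<uint8_t> scoped(env, bytes);
      // No other JNI calls while scoped is alive.
      parsed = proto->ParseFromArray(scoped.data(), scoped.size());
    }  // ReleasePrimitiveArrayCritical runs here with mode 0.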
diff --git a/icing/jni/scoped-primitive-array-critical_test.cc b/icing/jni/scoped-primitive-array-critical_test.cc
new file mode 100644
index 0000000..3655378
--- /dev/null
+++ b/icing/jni/scoped-primitive-array-critical_test.cc
@@ -0,0 +1,140 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/jni/scoped-primitive-array-critical.h"
+
+#include <jni.h>
+
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "util/java/mock_jni_env.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsNull;
+using ::testing::Return;
+using util::java::test::MockJNIEnv;
+
+TEST(ScopedJniClassesTest, ScopedPrimitiveArrayNull) {
+ auto env_mock = std::make_unique<MockJNIEnv>();
+  // Construct a scoped primitive array with a null jarray.
+ ScopedPrimitiveArrayCritical<uint8_t> scoped_primitive_array(
+ env_mock.get(), /*array=*/nullptr);
+ EXPECT_THAT(scoped_primitive_array.data(), IsNull());
+ EXPECT_THAT(scoped_primitive_array.size(), Eq(0));
+
+  // Move construct a scoped primitive array.
+ ScopedPrimitiveArrayCritical<uint8_t> moved_scoped_primitive_array(
+ std::move(scoped_primitive_array));
+ EXPECT_THAT(moved_scoped_primitive_array.data(), IsNull());
+ EXPECT_THAT(moved_scoped_primitive_array.size(), Eq(0));
+
+  // Move assign a scoped primitive array.
+ ScopedPrimitiveArrayCritical<uint8_t> move_assigned_scoped_primitive_array =
+ std::move(moved_scoped_primitive_array);
+ EXPECT_THAT(move_assigned_scoped_primitive_array.data(), IsNull());
+ EXPECT_THAT(move_assigned_scoped_primitive_array.size(), Eq(0));
+}
+
+TEST(ScopedJniClassesTest, ScopedPrimitiveArrayConstruction) {
+ auto env_mock = std::make_unique<MockJNIEnv>();
+  // Construct a scoped primitive array normally.
+ jarray fake_jarray = reinterpret_cast<jarray>(-303);
+ uint8_t fake_array[] = {1, 8, 63, 90};
+ ON_CALL(*env_mock, GetPrimitiveArrayCritical(Eq(fake_jarray), IsNull()))
+ .WillByDefault(Return(fake_array));
+ ON_CALL(*env_mock, GetArrayLength(Eq(fake_jarray))).WillByDefault(Return(4));
+
+ ScopedPrimitiveArrayCritical<uint8_t> scoped_primitive_array(
+ env_mock.get(),
+ /*array=*/fake_jarray);
+ EXPECT_THAT(scoped_primitive_array.data(), Eq(fake_array));
+ EXPECT_THAT(scoped_primitive_array.size(), Eq(4));
+
+ EXPECT_CALL(*env_mock, ReleasePrimitiveArrayCritical(Eq(fake_jarray),
+ Eq(fake_array), Eq(0)))
+ .Times(1);
+}
+
+TEST(ScopedJniClassesTest, ScopedPrimitiveArrayMoveConstruction) {
+ auto env_mock = std::make_unique<MockJNIEnv>();
+  // Construct a scoped primitive array normally.
+ jarray fake_jarray = reinterpret_cast<jarray>(-303);
+ uint8_t fake_array[] = {1, 8, 63, 90};
+ ON_CALL(*env_mock, GetPrimitiveArrayCritical(Eq(fake_jarray), IsNull()))
+ .WillByDefault(Return(fake_array));
+ ON_CALL(*env_mock, GetArrayLength(Eq(fake_jarray))).WillByDefault(Return(4));
+
+ ScopedPrimitiveArrayCritical<uint8_t> scoped_primitive_array(
+ env_mock.get(),
+ /*array=*/fake_jarray);
+
+  // Move construct a scoped primitive array.
+ ScopedPrimitiveArrayCritical<uint8_t> moved_scoped_primitive_array(
+ std::move(scoped_primitive_array));
+ EXPECT_THAT(moved_scoped_primitive_array.data(), Eq(fake_array));
+ EXPECT_THAT(moved_scoped_primitive_array.size(), Eq(4));
+
+ EXPECT_CALL(*env_mock, ReleasePrimitiveArrayCritical(Eq(fake_jarray),
+ Eq(fake_array), Eq(0)))
+ .Times(1);
+}
+
+TEST(ScopedJniClassesTest, ScopedPrimitiveArrayMoveAssignment) {
+ // Setup the mock to return:
+  //   {1, 8, 63, 90} for jarray (-303)
+  //   {5, 9, 82} for jarray (-505)
+ auto env_mock = std::make_unique<MockJNIEnv>();
+ jarray fake_jarray1 = reinterpret_cast<jarray>(-303);
+ uint8_t fake_array1[] = {1, 8, 63, 90};
+ ON_CALL(*env_mock, GetPrimitiveArrayCritical(Eq(fake_jarray1), IsNull()))
+ .WillByDefault(Return(fake_array1));
+ ON_CALL(*env_mock, GetArrayLength(Eq(fake_jarray1))).WillByDefault(Return(4));
+
+ jarray fake_jarray2 = reinterpret_cast<jarray>(-505);
+ uint8_t fake_array2[] = {5, 9, 82};
+ ON_CALL(*env_mock, GetPrimitiveArrayCritical(Eq(fake_jarray2), IsNull()))
+ .WillByDefault(Return(fake_array2));
+ ON_CALL(*env_mock, GetArrayLength(Eq(fake_jarray2))).WillByDefault(Return(3));
+
+ ScopedPrimitiveArrayCritical<uint8_t> scoped_primitive_array1(
+ env_mock.get(),
+ /*array=*/fake_jarray1);
+ ScopedPrimitiveArrayCritical<uint8_t> scoped_primitive_array2(
+ env_mock.get(),
+ /*array=*/fake_jarray2);
+
+  // Move assign a scoped primitive array.
+ scoped_primitive_array2 = std::move(scoped_primitive_array1);
+ EXPECT_THAT(scoped_primitive_array2.data(), Eq(fake_array1));
+ EXPECT_THAT(scoped_primitive_array2.size(), Eq(4));
+
+ EXPECT_CALL(*env_mock, ReleasePrimitiveArrayCritical(Eq(fake_jarray1),
+ Eq(fake_array1), Eq(0)))
+ .Times(1);
+ EXPECT_CALL(*env_mock, ReleasePrimitiveArrayCritical(Eq(fake_jarray2),
+ Eq(fake_array2), Eq(0)))
+ .Times(1);
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/jni/scoped-utf-chars.h b/icing/jni/scoped-utf-chars.h
new file mode 100644
index 0000000..5a3ac6a
--- /dev/null
+++ b/icing/jni/scoped-utf-chars.h
@@ -0,0 +1,81 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JNI_SCOPED_UTF_CHARS_H_
+#define ICING_JNI_SCOPED_UTF_CHARS_H_
+
+#include <jni.h>
+
+#include <cstddef>
+#include <cstring>
+#include <utility>
+
+namespace icing {
+namespace lib {
+
+// An RAII class to manage access and allocation of a Java string's UTF chars.
+class ScopedUtfChars {
+ public:
+ ScopedUtfChars(JNIEnv* env, jstring s) : env_(env), string_(s) {
+ if (s == nullptr) {
+ utf_chars_ = nullptr;
+ size_ = 0;
+ } else {
+      utf_chars_ = env->GetStringUTFChars(s, /*isCopy=*/nullptr);
+      // GetStringUTFChars returns nullptr on failure; guard the strlen call
+      // so construction stays safe in that case.
+      size_ = (utf_chars_ != nullptr) ? strlen(utf_chars_) : 0;
+ }
+ }
+
+  // size_ is value-initialized so that Swap never hands rhs an indeterminate
+  // size.
+  ScopedUtfChars(ScopedUtfChars&& rhs)
+      : env_(nullptr), string_(nullptr), utf_chars_(nullptr), size_(0) {
+ Swap(rhs);
+ }
+
+ ScopedUtfChars(const ScopedUtfChars&) = delete;
+
+ ScopedUtfChars& operator=(ScopedUtfChars&& rhs) {
+ Swap(rhs);
+ return *this;
+ }
+
+ ScopedUtfChars& operator=(const ScopedUtfChars&) = delete;
+
+ ~ScopedUtfChars() {
+ if (utf_chars_ != nullptr) {
+ env_->ReleaseStringUTFChars(string_, utf_chars_);
+ }
+ }
+
+ const char* c_str() const { return utf_chars_; }
+
+ size_t size() const { return size_; }
+
+ private:
+ void Swap(ScopedUtfChars& other) {
+ std::swap(env_, other.env_);
+ std::swap(string_, other.string_);
+ std::swap(utf_chars_, other.utf_chars_);
+ std::swap(size_, other.size_);
+ }
+
+ JNIEnv* env_;
+ jstring string_;
+ const char* utf_chars_;
+ size_t size_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JNI_SCOPED_UTF_CHARS_H_
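A minimal usage sketch, mirroring how the JNI entry points above consume jstring arguments (names illustrative):

    icing::lib::ScopedUtfChars scoped_name(env, name_space);
    if (scoped_name.c_str() == nullptr) {
      return nullptr;  // Null jstring or GetStringUTFChars failure.
    }
    // The UTF chars are released automatically when scoped_name goes out of
    // scope, replacing the manual Get/ReleaseStringUTFChars pairs above.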
diff --git a/icing/jni/scoped-utf-chars_test.cc b/icing/jni/scoped-utf-chars_test.cc
new file mode 100644
index 0000000..d249f69
--- /dev/null
+++ b/icing/jni/scoped-utf-chars_test.cc
@@ -0,0 +1,126 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/jni/scoped-utf-chars.h"
+
+#include <jni.h>
+
+#include <string>
+#include <utility>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "util/java/mock_jni_env.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsNull;
+using ::testing::Return;
+using util::java::test::MockJNIEnv;
+
+TEST(ScopedJniClassesTest, ScopedUtfCharsNull) {
+ auto env_mock = std::make_unique<MockJNIEnv>();
+  // Construct a scoped utf chars with a null jstring.
+ ScopedUtfChars scoped_utf_chars(env_mock.get(), /*s=*/nullptr);
+ EXPECT_THAT(scoped_utf_chars.c_str(), IsNull());
+ EXPECT_THAT(scoped_utf_chars.size(), Eq(0));
+
+ // Move construct a scoped utf chars
+ ScopedUtfChars moved_scoped_utf_chars(std::move(scoped_utf_chars));
+ EXPECT_THAT(moved_scoped_utf_chars.c_str(), IsNull());
+ EXPECT_THAT(moved_scoped_utf_chars.size(), Eq(0));
+
+ // Move assign a scoped utf chars
+ ScopedUtfChars move_assigned_scoped_utf_chars =
+ std::move(moved_scoped_utf_chars);
+ EXPECT_THAT(move_assigned_scoped_utf_chars.c_str(), IsNull());
+ EXPECT_THAT(move_assigned_scoped_utf_chars.size(), Eq(0));
+}
+
+TEST(ScopedJniClassesTest, ScopedUtfCharsConstruction) {
+ auto env_mock = std::make_unique<MockJNIEnv>();
+ // Construct a scoped utf chars normally.
+ jstring fake_jstring = reinterpret_cast<jstring>(-303);
+ std::string fake_string = "foo";
+ ON_CALL(*env_mock, GetStringUTFChars(Eq(fake_jstring), IsNull()))
+ .WillByDefault(Return(fake_string.c_str()));
+
+ ScopedUtfChars scoped_utf_chars(env_mock.get(), /*s=*/fake_jstring);
+ EXPECT_THAT(scoped_utf_chars.c_str(), Eq(fake_string.c_str()));
+ EXPECT_THAT(scoped_utf_chars.size(), Eq(3));
+
+ EXPECT_CALL(*env_mock,
+ ReleaseStringUTFChars(Eq(fake_jstring), Eq(fake_string.c_str())))
+ .Times(1);
+}
+
+TEST(ScopedJniClassesTest, ScopedUtfCharsMoveConstruction) {
+ auto env_mock = std::make_unique<MockJNIEnv>();
+ // Construct a scoped utf chars normally.
+ jstring fake_jstring = reinterpret_cast<jstring>(-303);
+ std::string fake_string = "foo";
+ ON_CALL(*env_mock, GetStringUTFChars(Eq(fake_jstring), IsNull()))
+ .WillByDefault(Return(fake_string.c_str()));
+
+ ScopedUtfChars scoped_utf_chars(env_mock.get(), /*s=*/fake_jstring);
+
+ // Move construct a scoped utf chars
+ ScopedUtfChars moved_scoped_utf_chars(std::move(scoped_utf_chars));
+ EXPECT_THAT(moved_scoped_utf_chars.c_str(), Eq(fake_string.c_str()));
+ EXPECT_THAT(moved_scoped_utf_chars.size(), Eq(3));
+
+ EXPECT_CALL(*env_mock,
+ ReleaseStringUTFChars(Eq(fake_jstring), Eq(fake_string.c_str())))
+ .Times(1);
+}
+
+TEST(ScopedJniClassesTest, ScopedUtfCharsMoveAssignment) {
+ // Setup the mock to return:
+ // "foo" for jstring (-303)
+ // "bar baz" for jstring (-505)
+ auto env_mock = std::make_unique<MockJNIEnv>();
+ jstring fake_jstring1 = reinterpret_cast<jstring>(-303);
+ std::string fake_string1 = "foo";
+ ON_CALL(*env_mock, GetStringUTFChars(Eq(fake_jstring1), IsNull()))
+ .WillByDefault(Return(fake_string1.c_str()));
+
+ jstring fake_jstring2 = reinterpret_cast<jstring>(-505);
+ std::string fake_string2 = "bar baz";
+ ON_CALL(*env_mock, GetStringUTFChars(Eq(fake_jstring2), IsNull()))
+ .WillByDefault(Return(fake_string2.c_str()));
+
+ ScopedUtfChars scoped_utf_chars1(env_mock.get(), /*s=*/fake_jstring1);
+ ScopedUtfChars scoped_utf_chars2(env_mock.get(), /*s=*/fake_jstring2);
+
+ // Move assign a scoped utf chars
+ scoped_utf_chars2 = std::move(scoped_utf_chars1);
+ EXPECT_THAT(scoped_utf_chars2.c_str(), Eq(fake_string1.c_str()));
+ EXPECT_THAT(scoped_utf_chars2.size(), Eq(3));
+
+ EXPECT_CALL(*env_mock, ReleaseStringUTFChars(Eq(fake_jstring1),
+ Eq(fake_string1.c_str())))
+ .Times(1);
+ EXPECT_CALL(*env_mock, ReleaseStringUTFChars(Eq(fake_jstring2),
+ Eq(fake_string2.c_str())))
+ .Times(1);
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/aggregation-scorer.cc b/icing/join/aggregation-scorer.cc
new file mode 100644
index 0000000..3dee3dd
--- /dev/null
+++ b/icing/join/aggregation-scorer.cc
@@ -0,0 +1,139 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/aggregation-scorer.h"
+
+#include <algorithm>
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "icing/proto/search.pb.h"
+#include "icing/scoring/scored-document-hit.h"
+
+namespace icing {
+namespace lib {
+
+class CountAggregationScorer : public AggregationScorer {
+ public:
+ double GetScore(const ScoredDocumentHit& parent,
+ const std::vector<ScoredDocumentHit>& children) override {
+ return children.size();
+ }
+};
+
+class MinAggregationScorer : public AggregationScorer {
+ public:
+ double GetScore(const ScoredDocumentHit& parent,
+ const std::vector<ScoredDocumentHit>& children) override {
+ if (children.empty()) {
+      // Return 0 if there is no child document. Negative child scores are
+      // considered "worse than" 0, so returning 0 here ranks an empty child
+      // set above child sets whose scores are all negative.
+ return 0.0;
+ }
+ return std::min_element(children.begin(), children.end(),
+ [](const ScoredDocumentHit& lhs,
+ const ScoredDocumentHit& rhs) -> bool {
+ return lhs.score() < rhs.score();
+ })
+ ->score();
+ }
+};
+
+class AverageAggregationScorer : public AggregationScorer {
+ public:
+ double GetScore(const ScoredDocumentHit& parent,
+ const std::vector<ScoredDocumentHit>& children) override {
+ if (children.empty()) {
+      // Return 0 if there is no child document. Negative child scores are
+      // considered "worse than" 0, so returning 0 here ranks an empty child
+      // set above child sets whose scores are all negative.
+ return 0.0;
+ }
+    // std::accumulate rather than std::reduce: this binary op is
+    // heterogeneous and order-dependent, which std::reduce does not permit.
+    return std::accumulate(
+               children.begin(), children.end(), 0.0,
+               [](double prev, const ScoredDocumentHit& item) -> double {
+                 return prev + item.score();
+               }) /
+           children.size();
+ }
+};
+
+class MaxAggregationScorer : public AggregationScorer {
+ public:
+ double GetScore(const ScoredDocumentHit& parent,
+ const std::vector<ScoredDocumentHit>& children) override {
+ if (children.empty()) {
+      // Return 0 if there is no child document. Negative child scores are
+      // considered "worse than" 0, so returning 0 here ranks an empty child
+      // set above child sets whose scores are all negative.
+ return 0.0;
+ }
+ return std::max_element(children.begin(), children.end(),
+ [](const ScoredDocumentHit& lhs,
+ const ScoredDocumentHit& rhs) -> bool {
+ return lhs.score() < rhs.score();
+ })
+ ->score();
+ }
+};
+
+class SumAggregationScorer : public AggregationScorer {
+ public:
+ double GetScore(const ScoredDocumentHit& parent,
+ const std::vector<ScoredDocumentHit>& children) override {
+    // As in AverageAggregationScorer, std::accumulate is the right algorithm
+    // for this heterogeneous, order-dependent fold.
+    return std::accumulate(
+        children.begin(), children.end(), 0.0,
+        [](double prev, const ScoredDocumentHit& item) -> double {
+          return prev + item.score();
+        });
+ }
+};
+
+class DefaultAggregationScorer : public AggregationScorer {
+ public:
+ double GetScore(const ScoredDocumentHit& parent,
+ const std::vector<ScoredDocumentHit>& children) override {
+ return parent.score();
+ }
+};
+
+std::unique_ptr<AggregationScorer> AggregationScorer::Create(
+ const JoinSpecProto& join_spec) {
+ switch (join_spec.aggregation_scoring_strategy()) {
+ case JoinSpecProto::AggregationScoringStrategy::COUNT:
+ return std::make_unique<CountAggregationScorer>();
+ case JoinSpecProto::AggregationScoringStrategy::MIN:
+ return std::make_unique<MinAggregationScorer>();
+ case JoinSpecProto::AggregationScoringStrategy::AVG:
+ return std::make_unique<AverageAggregationScorer>();
+ case JoinSpecProto::AggregationScoringStrategy::MAX:
+ return std::make_unique<MaxAggregationScorer>();
+ case JoinSpecProto::AggregationScoringStrategy::SUM:
+ return std::make_unique<SumAggregationScorer>();
+ case JoinSpecProto::AggregationScoringStrategy::NONE:
+ // No aggregation strategy means using parent document score, so fall
+ // through to return DefaultAggregationScorer.
+ [[fallthrough]];
+ default:
+ return std::make_unique<DefaultAggregationScorer>();
+ }
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/aggregation-scorer.h b/icing/join/aggregation-scorer.h
new file mode 100644
index 0000000..3d38cf0
--- /dev/null
+++ b/icing/join/aggregation-scorer.h
@@ -0,0 +1,41 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_AGGREGATION_SCORER_H_
+#define ICING_JOIN_AGGREGATION_SCORER_H_
+
+#include <memory>
+#include <vector>
+
+#include "icing/proto/search.pb.h"
+#include "icing/scoring/scored-document-hit.h"
+
+namespace icing {
+namespace lib {
+
+class AggregationScorer {
+ public:
+ static std::unique_ptr<AggregationScorer> Create(
+ const JoinSpecProto& join_spec);
+
+ virtual ~AggregationScorer() = default;
+
+ virtual double GetScore(const ScoredDocumentHit& parent,
+ const std::vector<ScoredDocumentHit>& children) = 0;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JOIN_AGGREGATION_SCORER_H_
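A minimal sketch of the factory in use, with values borrowed from the tests below; parent_hit and child_hits are assumed to be pre-built ScoredDocumentHits. Under AVG, child scores {8, 3, 1, 4, 7} yield (8 + 3 + 1 + 4 + 7) / 5 = 4.6 regardless of the parent score:

    JoinSpecProto join_spec;
    join_spec.set_aggregation_scoring_strategy(
        JoinSpecProto::AggregationScoringStrategy::AVG);
    std::unique_ptr<AggregationScorer> scorer =
        AggregationScorer::Create(join_spec);
    double score = scorer->GetScore(parent_hit, child_hits);  // 4.6 here.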
diff --git a/icing/join/aggregation-scorer_test.cc b/icing/join/aggregation-scorer_test.cc
new file mode 100644
index 0000000..19a7239
--- /dev/null
+++ b/icing/join/aggregation-scorer_test.cc
@@ -0,0 +1,215 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/aggregation-scorer.h"
+
+#include <algorithm>
+#include <iterator>
+#include <memory>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/proto/search.pb.h"
+#include "icing/schema/section.h"
+#include "icing/scoring/scored-document-hit.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::DoubleEq;
+
+struct AggregationScorerTestParam {
+ double ans;
+ JoinSpecProto::AggregationScoringStrategy::Code scoring_strategy;
+ double parent_score;
+ std::vector<double> child_scores;
+
+ explicit AggregationScorerTestParam(
+ double ans_in,
+ JoinSpecProto::AggregationScoringStrategy::Code scoring_strategy_in,
+ double parent_score_in, std::vector<double> child_scores_in)
+ : ans(ans_in),
+ scoring_strategy(scoring_strategy_in),
+        parent_score(parent_score_in),
+ child_scores(std::move(child_scores_in)) {}
+};
+
+class AggregationScorerTest
+ : public ::testing::TestWithParam<AggregationScorerTestParam> {};
+
+TEST_P(AggregationScorerTest, GetScore) {
+ static constexpr DocumentId kDefaultDocumentId = 0;
+
+ const AggregationScorerTestParam& param = GetParam();
+ // Test AggregationScorer by creating some ScoredDocumentHits for parent and
+ // child documents. DocumentId and SectionIdMask won't affect the aggregation
+ // score calculation, so just simply set default values.
+ // Parent document
+ ScoredDocumentHit parent_scored_document_hit(
+ kDefaultDocumentId, kSectionIdMaskNone, param.parent_score);
+ // Child documents
+ std::vector<ScoredDocumentHit> child_scored_document_hits;
+ child_scored_document_hits.reserve(param.child_scores.size());
+ std::transform(param.child_scores.cbegin(), param.child_scores.cend(),
+ std::back_inserter(child_scored_document_hits),
+ [](double score) -> ScoredDocumentHit {
+ return ScoredDocumentHit(kDefaultDocumentId,
+ kSectionIdMaskNone, score);
+ });
+
+ JoinSpecProto join_spec;
+ join_spec.set_aggregation_scoring_strategy(param.scoring_strategy);
+ std::unique_ptr<AggregationScorer> scorer =
+ AggregationScorer::Create(join_spec);
+ EXPECT_THAT(
+ scorer->GetScore(parent_scored_document_hit, child_scored_document_hits),
+ DoubleEq(param.ans));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ CountAggregationScorerTest, AggregationScorerTest,
+ testing::Values(
+ // General case.
+ AggregationScorerTestParam(
+ /*ans_in=*/5, JoinSpecProto::AggregationScoringStrategy::COUNT,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{8, 3, 1, 4, 7}),
+ // Only one child.
+ AggregationScorerTestParam(
+ /*ans_in=*/1, JoinSpecProto::AggregationScoringStrategy::COUNT,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{123}),
+ // No child.
+ AggregationScorerTestParam(
+ /*ans_in=*/0, JoinSpecProto::AggregationScoringStrategy::COUNT,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{})));
+
+INSTANTIATE_TEST_SUITE_P(
+ MinAggregationScorerTest, AggregationScorerTest,
+ testing::Values(
+ // General case.
+ AggregationScorerTestParam(
+ /*ans_in=*/1, JoinSpecProto::AggregationScoringStrategy::MIN,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{8, 3, 1, 4, 7}),
+ // Only one child, greater than parent.
+ AggregationScorerTestParam(
+ /*ans_in=*/123, JoinSpecProto::AggregationScoringStrategy::MIN,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{123}),
+ // Only one child, smaller than parent.
+ AggregationScorerTestParam(
+ /*ans_in=*/50, JoinSpecProto::AggregationScoringStrategy::MIN,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{50}),
+ // No child.
+ AggregationScorerTestParam(
+ /*ans_in=*/0, JoinSpecProto::AggregationScoringStrategy::MIN,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{})));
+
+INSTANTIATE_TEST_SUITE_P(
+ AverageAggregationScorerTest, AggregationScorerTest,
+ testing::Values(
+ // General case.
+ AggregationScorerTestParam(
+ /*ans_in=*/4.6, JoinSpecProto::AggregationScoringStrategy::AVG,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{8, 3, 1, 4, 7}),
+ // Only one child.
+ AggregationScorerTestParam(
+ /*ans_in=*/123, JoinSpecProto::AggregationScoringStrategy::AVG,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{123}),
+ // No child.
+ AggregationScorerTestParam(
+ /*ans_in=*/0, JoinSpecProto::AggregationScoringStrategy::AVG,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{})));
+
+INSTANTIATE_TEST_SUITE_P(
+ MaxAggregationScorerTest, AggregationScorerTest,
+ testing::Values(
+ // General case.
+ AggregationScorerTestParam(
+ /*ans_in=*/8, JoinSpecProto::AggregationScoringStrategy::MAX,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{8, 3, 1, 4, 7}),
+ // Only one child, greater than parent.
+ AggregationScorerTestParam(
+ /*ans_in=*/123, JoinSpecProto::AggregationScoringStrategy::MAX,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{123}),
+ // Only one child, smaller than parent.
+ AggregationScorerTestParam(
+ /*ans_in=*/50, JoinSpecProto::AggregationScoringStrategy::MAX,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{50}),
+ // No child.
+ AggregationScorerTestParam(
+ /*ans_in=*/0, JoinSpecProto::AggregationScoringStrategy::MAX,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{})));
+
+INSTANTIATE_TEST_SUITE_P(
+ SumAggregationScorerTest, AggregationScorerTest,
+ testing::Values(
+ // General case.
+ AggregationScorerTestParam(
+ /*ans_in=*/23, JoinSpecProto::AggregationScoringStrategy::SUM,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{8, 3, 1, 4, 7}),
+ // Only one child.
+ AggregationScorerTestParam(
+ /*ans_in=*/123, JoinSpecProto::AggregationScoringStrategy::SUM,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{123}),
+ // No child.
+ AggregationScorerTestParam(
+ /*ans_in=*/0, JoinSpecProto::AggregationScoringStrategy::SUM,
+ /*parent_score_in=*/0,
+ /*child_scores_in=*/{})));
+
+INSTANTIATE_TEST_SUITE_P(
+ DefaultAggregationScorerTest, AggregationScorerTest,
+ testing::Values(
+ // General case.
+ AggregationScorerTestParam(
+ /*ans_in=*/98, JoinSpecProto::AggregationScoringStrategy::NONE,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{8, 3, 1, 4, 7}),
+ // Only one child, greater than parent.
+ AggregationScorerTestParam(
+ /*ans_in=*/98, JoinSpecProto::AggregationScoringStrategy::NONE,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{123}),
+ // Only one child, smaller than parent.
+ AggregationScorerTestParam(
+ /*ans_in=*/98, JoinSpecProto::AggregationScoringStrategy::NONE,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{50}),
+ // No child.
+ AggregationScorerTestParam(
+ /*ans_in=*/98, JoinSpecProto::AggregationScoringStrategy::NONE,
+ /*parent_score_in=*/98,
+ /*child_scores_in=*/{})));
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/doc-join-info.cc b/icing/join/doc-join-info.cc
new file mode 100644
index 0000000..3b06f01
--- /dev/null
+++ b/icing/join/doc-join-info.cc
@@ -0,0 +1,49 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/doc-join-info.h"
+
+#include <cstdint>
+
+#include "icing/schema/joinable-property.h"
+#include "icing/store/document-id.h"
+#include "icing/util/bit-util.h"
+
+namespace icing {
+namespace lib {
+
+DocJoinInfo::DocJoinInfo(DocumentId document_id,
+ JoinablePropertyId joinable_property_id) {
+ Value temp_value = 0;
+ bit_util::BitfieldSet(/*new_value=*/document_id,
+ /*lsb_offset=*/kJoinablePropertyIdBits,
+ /*len=*/kDocumentIdBits, &temp_value);
+ bit_util::BitfieldSet(/*new_value=*/joinable_property_id,
+ /*lsb_offset=*/0,
+ /*len=*/kJoinablePropertyIdBits, &temp_value);
+ value_ = temp_value;
+}
+
+DocumentId DocJoinInfo::document_id() const {
+ return bit_util::BitfieldGet(value_, /*lsb_offset=*/kJoinablePropertyIdBits,
+ /*len=*/kDocumentIdBits);
+}
+
+JoinablePropertyId DocJoinInfo::joinable_property_id() const {
+ return bit_util::BitfieldGet(value_, /*lsb_offset=*/0,
+ /*len=*/kJoinablePropertyIdBits);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/doc-join-info.h b/icing/join/doc-join-info.h
new file mode 100644
index 0000000..7696b92
--- /dev/null
+++ b/icing/join/doc-join-info.h
@@ -0,0 +1,66 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_DOC_JOIN_INFO
+#define ICING_JOIN_DOC_JOIN_INFO
+
+#include <cstdint>
+#include <limits>
+
+#include "icing/schema/joinable-property.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// DocJoinInfo is composed of document_id and joinable_property_id.
+class DocJoinInfo {
+ public:
+ // The datatype used to encode DocJoinInfo information: the document_id and
+ // joinable_property_id.
+ using Value = uint32_t;
+
+ static_assert(kDocumentIdBits + kJoinablePropertyIdBits <= sizeof(Value) * 8,
+ "Cannot encode document id and joinable property id in "
+ "DocJoinInfo::Value");
+
+ // All bits of kInvalidValue are 1, and it contains:
+  // - 0b1111 for the 4 unused bits.
+ // - kInvalidDocumentId (2^22-1).
+ // - JoinablePropertyId 2^6-1 (valid), which is ok because kInvalidDocumentId
+ // has already invalidated the value. In fact, we currently use all 2^6
+ // joinable property ids and there is no "invalid joinable property id", so
+ // it doesn't matter what JoinablePropertyId we set for kInvalidValue.
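+  //
+  // For example (illustrative): with 22 document id bits and 6 joinable
+  // property id bits, kInvalidValue = 0xFFFFFFFF decodes to document_id =
+  // 0x3FFFFF (kInvalidDocumentId) and joinable_property_id = 0x3F.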
+ static constexpr Value kInvalidValue = std::numeric_limits<Value>::max();
+
+ explicit DocJoinInfo(DocumentId document_id,
+ JoinablePropertyId joinable_property_id);
+
+ explicit DocJoinInfo(Value value = kInvalidValue) : value_(value) {}
+
+ bool is_valid() const { return value_ != kInvalidValue; }
+ Value value() const { return value_; }
+ DocumentId document_id() const;
+ JoinablePropertyId joinable_property_id() const;
+
+ private:
+ // Value bits layout: 4 unused + 22 document_id + 6 joinable_property_id.
+ Value value_;
+} __attribute__((packed));
+static_assert(sizeof(DocJoinInfo) == 4, "");
+
+} // namespace lib
+} // namespace icing
+
+#endif  // ICING_JOIN_DOC_JOIN_INFO_H_
diff --git a/icing/join/doc-join-info_test.cc b/icing/join/doc-join-info_test.cc
new file mode 100644
index 0000000..7025473
--- /dev/null
+++ b/icing/join/doc-join-info_test.cc
@@ -0,0 +1,96 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/doc-join-info.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+
+static constexpr DocumentId kSomeDocumentId = 24;
+static constexpr JoinablePropertyId kSomeJoinablePropertyId = 5;
+
+TEST(DocJoinInfoTest, Accessors) {
+ DocJoinInfo doc_join_info(kSomeDocumentId, kSomeJoinablePropertyId);
+ EXPECT_THAT(doc_join_info.document_id(), Eq(kSomeDocumentId));
+ EXPECT_THAT(doc_join_info.joinable_property_id(),
+ Eq(kSomeJoinablePropertyId));
+}
+
+TEST(DocJoinInfoTest, Invalid) {
+ DocJoinInfo default_invalid;
+ EXPECT_THAT(default_invalid.is_valid(), IsFalse());
+
+ // Also make sure the invalid DocJoinInfo contains an invalid document id.
+ EXPECT_THAT(default_invalid.document_id(), Eq(kInvalidDocumentId));
+ EXPECT_THAT(default_invalid.joinable_property_id(),
+ Eq(kMaxJoinablePropertyId));
+}
+
+TEST(DocJoinInfoTest, Valid) {
+ DocJoinInfo maximum_document_id_info(kMaxDocumentId, kSomeJoinablePropertyId);
+ EXPECT_THAT(maximum_document_id_info.is_valid(), IsTrue());
+ EXPECT_THAT(maximum_document_id_info.document_id(), Eq(kMaxDocumentId));
+ EXPECT_THAT(maximum_document_id_info.joinable_property_id(),
+ Eq(kSomeJoinablePropertyId));
+
+ DocJoinInfo maximum_joinable_property_id_info(kSomeDocumentId,
+ kMaxJoinablePropertyId);
+ EXPECT_THAT(maximum_joinable_property_id_info.is_valid(), IsTrue());
+ EXPECT_THAT(maximum_joinable_property_id_info.document_id(),
+ Eq(kSomeDocumentId));
+ EXPECT_THAT(maximum_joinable_property_id_info.joinable_property_id(),
+ Eq(kMaxJoinablePropertyId));
+
+ DocJoinInfo minimum_document_id_info(kMinDocumentId, kSomeJoinablePropertyId);
+ EXPECT_THAT(minimum_document_id_info.is_valid(), IsTrue());
+ EXPECT_THAT(minimum_document_id_info.document_id(), Eq(kMinDocumentId));
+ EXPECT_THAT(minimum_document_id_info.joinable_property_id(),
+ Eq(kSomeJoinablePropertyId));
+
+ DocJoinInfo minimum_joinable_property_id_info(kSomeDocumentId,
+ kMinJoinablePropertyId);
+ EXPECT_THAT(minimum_joinable_property_id_info.is_valid(), IsTrue());
+ EXPECT_THAT(minimum_joinable_property_id_info.document_id(),
+ Eq(kSomeDocumentId));
+ EXPECT_THAT(minimum_joinable_property_id_info.joinable_property_id(),
+ Eq(kMinJoinablePropertyId));
+
+ DocJoinInfo all_maximum_info(kMaxDocumentId, kMaxJoinablePropertyId);
+ EXPECT_THAT(all_maximum_info.is_valid(), IsTrue());
+ EXPECT_THAT(all_maximum_info.document_id(), Eq(kMaxDocumentId));
+ EXPECT_THAT(all_maximum_info.joinable_property_id(),
+ Eq(kMaxJoinablePropertyId));
+
+ DocJoinInfo all_minimum_info(kMinDocumentId, kMinJoinablePropertyId);
+ EXPECT_THAT(all_minimum_info.is_valid(), IsTrue());
+ EXPECT_THAT(all_minimum_info.document_id(), Eq(kMinDocumentId));
+ EXPECT_THAT(all_minimum_info.joinable_property_id(),
+ Eq(kMinJoinablePropertyId));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/document-id-to-join-info.h b/icing/join/document-id-to-join-info.h
new file mode 100644
index 0000000..dee4885
--- /dev/null
+++ b/icing/join/document-id-to-join-info.h
@@ -0,0 +1,67 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_DOCUMENT_ID_TO_JOIN_INFO_H_
+#define ICING_JOIN_DOCUMENT_ID_TO_JOIN_INFO_H_
+
+#include <utility>
+
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// DocumentIdToJoinInfo is composed of document_id and its join info.
+// - QualifiedId join: join info is the referenced document's namespace_id +
+// fingerprint(uri).
+// - String join: join info is the term id.
+// - Integer join: join info is the integer.
+//
+// DocumentIdToJoinInfo instances are stored in posting lists.
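+//
+// A minimal usage sketch (illustrative values; integer join shown):
+//   DocumentIdToJoinInfo<int64_t> info(/*document_id=*/3, /*join_info=*/42);
+//   info.document_id();  // returns 3
+//   info.join_info();    // returns 42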
+template <typename JoinInfoType>
+class DocumentIdToJoinInfo {
+ public:
+ static DocumentIdToJoinInfo<JoinInfoType> GetInvalid() {
+ return DocumentIdToJoinInfo<JoinInfoType>(kInvalidDocumentId,
+ JoinInfoType());
+ }
+
+ explicit DocumentIdToJoinInfo(DocumentId document_id, JoinInfoType join_info)
+ : document_id_(document_id), join_info_(std::move(join_info)) {}
+
+ DocumentId document_id() const { return document_id_; }
+ const JoinInfoType& join_info() const { return join_info_; }
+
+ bool is_valid() const { return IsDocumentIdValid(document_id_); }
+
+ bool operator<(const DocumentIdToJoinInfo<JoinInfoType>& other) const {
+ if (document_id_ != other.document_id_) {
+ return document_id_ < other.document_id_;
+ }
+ return join_info_ < other.join_info_;
+ }
+
+ bool operator==(const DocumentIdToJoinInfo<JoinInfoType>& other) const {
+ return document_id_ == other.document_id_ && join_info_ == other.join_info_;
+ }
+
+ private:
+ DocumentId document_id_;
+ JoinInfoType join_info_;
+} __attribute__((packed));
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JOIN_DOCUMENT_ID_TO_JOIN_INFO_H_
diff --git a/icing/join/join-children-fetcher.cc b/icing/join/join-children-fetcher.cc
new file mode 100644
index 0000000..c6d1b97
--- /dev/null
+++ b/icing/join/join-children-fetcher.cc
@@ -0,0 +1,39 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/join-children-fetcher.h"
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<std::vector<ScoredDocumentHit>>
+JoinChildrenFetcher::GetChildren(DocumentId parent_doc_id) const {
+ if (join_spec_.parent_property_expression() == kQualifiedIdExpr) {
+ if (auto iter = map_joinable_qualified_id_.find(parent_doc_id);
+ iter != map_joinable_qualified_id_.end()) {
+ return iter->second;
+ }
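+    // Note: parent documents without any children still join successfully;
+    // they simply get an empty child list (left-join semantics).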
+ return std::vector<ScoredDocumentHit>();
+ }
+  // TODO(b/256022027): So far we only support kQualifiedIdExpr for
+  // parent_property_expression; we could support more in the future.
+ return absl_ports::UnimplementedError(absl_ports::StrCat(
+ "Parent property expression must be ", kQualifiedIdExpr));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/join-children-fetcher.h b/icing/join/join-children-fetcher.h
new file mode 100644
index 0000000..1b875bc
--- /dev/null
+++ b/icing/join/join-children-fetcher.h
@@ -0,0 +1,73 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_JOIN_CHILDREN_FETCHER_H_
+#define ICING_JOIN_JOIN_CHILDREN_FETCHER_H_
+
+#include <unordered_map>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/proto/search.pb.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// A class that provides the GetChildren method for joins to fetch all child
+// documents given a parent document id.
+//
+// Internally, the class maintains a map for each joinable value type that
+// groups children according to the joinable values. Currently we only support
+// QUALIFIED_ID joining, in which the joinable value type is document id.
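+//
+// A minimal usage sketch (child_hit1, child_hit2 are hypothetical
+// ScoredDocumentHits):
+//   std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>> children;
+//   children[/*parent_doc_id=*/0] = {child_hit1, child_hit2};
+//   JoinChildrenFetcher fetcher(join_spec, std::move(children));
+//   auto children_or = fetcher.GetChildren(/*parent_doc_id=*/0);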
+class JoinChildrenFetcher {
+ public:
+ explicit JoinChildrenFetcher(
+ const JoinSpecProto& join_spec,
+ std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>>&&
+ map_joinable_qualified_id)
+ : join_spec_(join_spec),
+ map_joinable_qualified_id_(std::move(map_joinable_qualified_id)) {}
+
+  // Gets a vector of child ScoredDocumentHits for the given parent document
+  // id.
+  //
+  // TODO(b/256022027): Implement property value joins for string and int
+  // types. In these cases, GetChildren should look up the join index to fetch
+  // the joinable property value of the given parent_doc_id according to
+  // join_spec_.parent_property_expression, and then fetch children from the
+  // corresponding map in this class using that joinable property value.
+ //
+ // Returns:
+ // The vector of results on success.
+ // UNIMPLEMENTED_ERROR if the join type specified by join_spec is not
+ // supported.
+ libtextclassifier3::StatusOr<std::vector<ScoredDocumentHit>> GetChildren(
+ DocumentId parent_doc_id) const;
+
+ private:
+ static constexpr std::string_view kQualifiedIdExpr = "this.qualifiedId()";
+
+ const JoinSpecProto& join_spec_; // Does not own!
+
+  // The map that groups children by the parent document id referenced in
+  // their qualified id property; used to support QualifiedId joining. The
+  // join key type is document id.
+ std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>>
+ map_joinable_qualified_id_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JOIN_JOIN_CHILDREN_FETCHER_H_
diff --git a/icing/join/join-children-fetcher_test.cc b/icing/join/join-children-fetcher_test.cc
new file mode 100644
index 0000000..92a7a81
--- /dev/null
+++ b/icing/join/join-children-fetcher_test.cc
@@ -0,0 +1,83 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/join-children-fetcher.h"
+
+#include <unordered_map>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/join/join-processor.h"
+#include "icing/proto/search.pb.h"
+#include "icing/schema/section.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+
+TEST(JoinChildrenFetcherTest, FetchQualifiedIdJoinChildren) {
+ JoinSpecProto join_spec;
+ join_spec.set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec.set_child_property_expression("sender");
+
+ std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>>
+ map_joinable_qualified_id;
+ DocumentId parent_doc_id = 0;
+ ScoredDocumentHit child1(/*document_id=*/1, kSectionIdMaskNone,
+ /*score=*/1.0);
+ ScoredDocumentHit child2(/*document_id=*/2, kSectionIdMaskNone,
+ /*score=*/2.0);
+ map_joinable_qualified_id[parent_doc_id].push_back(child1);
+ map_joinable_qualified_id[parent_doc_id].push_back(child2);
+
+ JoinChildrenFetcher fetcher(join_spec, std::move(map_joinable_qualified_id));
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<ScoredDocumentHit> children,
+ fetcher.GetChildren(parent_doc_id));
+ EXPECT_THAT(children, ElementsAre(EqualsScoredDocumentHit(child1),
+ EqualsScoredDocumentHit(child2)));
+}
+
+TEST(JoinChildrenFetcherTest, FetchJoinEmptyChildren) {
+ JoinSpecProto join_spec;
+ join_spec.set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec.set_child_property_expression("sender");
+
+ DocumentId parent_doc_id = 0;
+
+ JoinChildrenFetcher fetcher(join_spec, /*map_joinable_qualified_id=*/{});
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<ScoredDocumentHit> children,
+ fetcher.GetChildren(parent_doc_id));
+ EXPECT_THAT(children, IsEmpty());
+}
+
+TEST(JoinChildrenFetcherTest, UnsupportedJoin) {
+ JoinSpecProto join_spec;
+ join_spec.set_parent_property_expression("name");
+ join_spec.set_child_property_expression("sender");
+ JoinChildrenFetcher fetcher(join_spec, /*map_joinable_qualified_id=*/{});
+ EXPECT_THAT(fetcher.GetChildren(0),
+ StatusIs(libtextclassifier3::StatusCode::UNIMPLEMENTED));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/join-processor.cc b/icing/join/join-processor.cc
new file mode 100644
index 0000000..1b7ca0d
--- /dev/null
+++ b/icing/join/join-processor.cc
@@ -0,0 +1,270 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/join-processor.h"
+
+#include <algorithm>
+#include <memory>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/join/aggregation-scorer.h"
+#include "icing/join/doc-join-info.h"
+#include "icing/join/join-children-fetcher.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/join/qualified-id.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<JoinChildrenFetcher>
+JoinProcessor::GetChildrenFetcher(
+ const JoinSpecProto& join_spec,
+ std::vector<ScoredDocumentHit>&& child_scored_document_hits) {
+ if (join_spec.parent_property_expression() != kQualifiedIdExpr) {
+    // TODO(b/256022027): So far we only support kQualifiedIdExpr for
+    // parent_property_expression; we could support more in the future.
+ return absl_ports::UnimplementedError(absl_ports::StrCat(
+ "Parent property expression must be ", kQualifiedIdExpr));
+ }
+
+ ScoredDocumentHitComparator score_comparator(
+ /*is_descending=*/join_spec.nested_spec().scoring_spec().order_by() ==
+ ScoringSpecProto::Order::DESC);
+
+ if (qualified_id_join_index_->is_v2()) {
+ // v2
+ // Step 1a: sort child ScoredDocumentHits in document id descending order.
+ std::sort(child_scored_document_hits.begin(),
+ child_scored_document_hits.end(),
+ [](const ScoredDocumentHit& lhs, const ScoredDocumentHit& rhs) {
+ return lhs.document_id() > rhs.document_id();
+ });
+
+ // Step 1b: group all child ScoredDocumentHits by the document's
+ // schema_type_id.
+ std::unordered_map<SchemaTypeId, std::vector<ScoredDocumentHit>>
+ schema_to_child_scored_doc_hits_map;
+ for (const ScoredDocumentHit& child_scored_document_hit :
+ child_scored_document_hits) {
+ std::optional<DocumentFilterData> child_doc_filter_data =
+ doc_store_->GetAliveDocumentFilterData(
+ child_scored_document_hit.document_id(), current_time_ms_);
+ if (!child_doc_filter_data) {
+ continue;
+ }
+
+ schema_to_child_scored_doc_hits_map[child_doc_filter_data
+ ->schema_type_id()]
+ .push_back(child_scored_document_hit);
+ }
+
+    // Step 1c: for each schema_type_id, look up QualifiedIdJoinIndexImplV2 to
+ // fetch all child join data from posting list(s). Convert all
+ // child join data to referenced parent document ids and bucketize
+ // child ScoredDocumentHits by it.
+ std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>>
+ parent_to_child_docs_map;
+ for (auto& [schema_type_id, grouped_child_scored_doc_hits] :
+ schema_to_child_scored_doc_hits_map) {
+ // Get joinable_property_id of this schema.
+ ICING_ASSIGN_OR_RETURN(
+ const JoinablePropertyMetadata* metadata,
+ schema_store_->GetJoinablePropertyMetadata(
+ schema_type_id, join_spec.child_property_expression()));
+ if (metadata == nullptr ||
+ metadata->value_type != JoinableConfig::ValueType::QUALIFIED_ID) {
+ // Currently we only support qualified id, so skip other types.
+ continue;
+ }
+
+      // Look up QualifiedIdJoinIndexImplV2.
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase>
+ join_index_iter,
+ qualified_id_join_index_->GetIterator(
+ schema_type_id, /*joinable_property_id=*/metadata->id));
+
+      // - The join index contains all join data of schema_type_id, and
+      //   join_index_iter will return all of them in (child) document id
+      //   descending order.
+      // - But we only need join data for child document ids that appear in
+      //   grouped_child_scored_doc_hits. Also, grouped_child_scored_doc_hits
+      //   contains ScoredDocumentHits in (child) document id descending order.
+      // - Therefore, we advance the 2 iterators in lockstep to intersect them
+      //   and get the desired join data.
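+      //
+      // For example (illustrative): if the join index yields child ids
+      // [9, 7, 4] and grouped_child_scored_doc_hits holds ids [9, 5, 4], the
+      // intersection matches child ids 9 and 4 in a single pass.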
+ auto child_scored_doc_hits_iter = grouped_child_scored_doc_hits.cbegin();
+ while (join_index_iter->Advance().ok() &&
+ child_scored_doc_hits_iter !=
+ grouped_child_scored_doc_hits.cend()) {
+ // Advance child_scored_doc_hits_iter until it points to a
+ // ScoredDocumentHit with document id <= the one pointed by
+ // join_index_iter.
+ while (child_scored_doc_hits_iter !=
+ grouped_child_scored_doc_hits.cend() &&
+ child_scored_doc_hits_iter->document_id() >
+ join_index_iter->GetCurrent().document_id()) {
+ ++child_scored_doc_hits_iter;
+ }
+
+ if (child_scored_doc_hits_iter !=
+ grouped_child_scored_doc_hits.cend() &&
+ child_scored_doc_hits_iter->document_id() ==
+ join_index_iter->GetCurrent().document_id()) {
+ // We get a join data whose child document id exists in both join
+ // index and grouped_child_scored_doc_hits. Convert its join info to
+ // referenced parent document ids and bucketize ScoredDocumentHits by
+ // it (putting into parent_to_child_docs_map).
+ const NamespaceFingerprintIdentifier& ref_ns_id =
+ join_index_iter->GetCurrent().join_info();
+ libtextclassifier3::StatusOr<DocumentId> ref_parent_doc_id_or =
+ doc_store_->GetDocumentId(ref_ns_id);
+ if (ref_parent_doc_id_or.ok()) {
+ parent_to_child_docs_map[std::move(ref_parent_doc_id_or)
+ .ValueOrDie()]
+ .push_back(*child_scored_doc_hits_iter);
+ }
+ }
+ }
+ }
+
+ // Step 1d: finally, sort each parent's joined child ScoredDocumentHits by
+ // score.
+ for (auto& [parent_doc_id, bucketized_child_scored_hits] :
+ parent_to_child_docs_map) {
+ std::sort(bucketized_child_scored_hits.begin(),
+ bucketized_child_scored_hits.end(), score_comparator);
+ }
+
+ return JoinChildrenFetcher(join_spec, std::move(parent_to_child_docs_map));
+ }
+
+ // v1
+ // TODO(b/275121148): deprecate this part after rollout v2.
+ std::sort(child_scored_document_hits.begin(),
+ child_scored_document_hits.end(), score_comparator);
+
+ // Step 1: group child documents by parent documentId. Currently we only
+ // support QualifiedId joining, so fetch the qualified id content of
+ // child_property_expression, break it down into namespace + uri, and
+  //         look up the DocumentId.
+ // The keys of this map are the DocumentIds of the parent docs the child
+ // ScoredDocumentHits refer to. The values in this map are vectors of child
+ // ScoredDocumentHits that refer to a parent DocumentId.
+ std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>>
+ map_joinable_qualified_id;
+ for (const ScoredDocumentHit& child : child_scored_document_hits) {
+ ICING_ASSIGN_OR_RETURN(
+ DocumentId ref_doc_id,
+ FetchReferencedQualifiedId(child.document_id(),
+ join_spec.child_property_expression()));
+ if (ref_doc_id == kInvalidDocumentId) {
+ continue;
+ }
+
+ map_joinable_qualified_id[ref_doc_id].push_back(child);
+ }
+ return JoinChildrenFetcher(join_spec, std::move(map_joinable_qualified_id));
+}
+
+libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>>
+JoinProcessor::Join(
+ const JoinSpecProto& join_spec,
+ std::vector<ScoredDocumentHit>&& parent_scored_document_hits,
+ const JoinChildrenFetcher& join_children_fetcher) {
+ std::unique_ptr<AggregationScorer> aggregation_scorer =
+ AggregationScorer::Create(join_spec);
+
+ std::vector<JoinedScoredDocumentHit> joined_scored_document_hits;
+ joined_scored_document_hits.reserve(parent_scored_document_hits.size());
+
+ // Step 2: iterate through all parent documentIds and construct
+ // JoinedScoredDocumentHit for each by looking up
+ // join_children_fetcher.
+ for (ScoredDocumentHit& parent : parent_scored_document_hits) {
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<ScoredDocumentHit> children,
+ join_children_fetcher.GetChildren(parent.document_id()));
+
+ double final_score = aggregation_scorer->GetScore(parent, children);
+ joined_scored_document_hits.emplace_back(final_score, std::move(parent),
+ std::move(children));
+ }
+
+ return joined_scored_document_hits;
+}
+
+libtextclassifier3::StatusOr<DocumentId>
+JoinProcessor::FetchReferencedQualifiedId(
+ const DocumentId& document_id, const std::string& property_path) const {
+ std::optional<DocumentFilterData> filter_data =
+ doc_store_->GetAliveDocumentFilterData(document_id, current_time_ms_);
+ if (!filter_data) {
+ return kInvalidDocumentId;
+ }
+
+ ICING_ASSIGN_OR_RETURN(const JoinablePropertyMetadata* metadata,
+ schema_store_->GetJoinablePropertyMetadata(
+ filter_data->schema_type_id(), property_path));
+ if (metadata == nullptr ||
+ metadata->value_type != JoinableConfig::ValueType::QUALIFIED_ID) {
+ // Currently we only support qualified id.
+ return kInvalidDocumentId;
+ }
+
+ DocJoinInfo info(document_id, metadata->id);
+ libtextclassifier3::StatusOr<std::string_view> ref_qualified_id_str_or =
+ qualified_id_join_index_->Get(info);
+ if (!ref_qualified_id_str_or.ok()) {
+ if (absl_ports::IsNotFound(ref_qualified_id_str_or.status())) {
+ return kInvalidDocumentId;
+ }
+ return std::move(ref_qualified_id_str_or).status();
+ }
+
+ libtextclassifier3::StatusOr<QualifiedId> ref_qualified_id_or =
+ QualifiedId::Parse(std::move(ref_qualified_id_str_or).ValueOrDie());
+ if (!ref_qualified_id_or.ok()) {
+    // This shouldn't happen because we validated the string during indexing
+    // and only put valid qualified id strings into the qualified id join
+    // index.
+ return kInvalidDocumentId;
+ }
+ QualifiedId qualified_id = std::move(ref_qualified_id_or).ValueOrDie();
+
+ libtextclassifier3::StatusOr<DocumentId> ref_document_id_or =
+ doc_store_->GetDocumentId(qualified_id.name_space(), qualified_id.uri());
+ if (!ref_document_id_or.ok()) {
+ return kInvalidDocumentId;
+ }
+ return std::move(ref_document_id_or).ValueOrDie();
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/join-processor.h b/icing/join/join-processor.h
new file mode 100644
index 0000000..517e9db
--- /dev/null
+++ b/icing/join/join-processor.h
@@ -0,0 +1,88 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_JOIN_PROCESSOR_H_
+#define ICING_JOIN_JOIN_PROCESSOR_H_
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/join/join-children-fetcher.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/proto/search.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
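+// Provides the join functionality: groups child documents under their parent
+// documents (GetChildrenFetcher) and constructs JoinedScoredDocumentHits
+// (Join) according to a JoinSpecProto.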
+class JoinProcessor {
+ public:
+ static constexpr std::string_view kQualifiedIdExpr = "this.qualifiedId()";
+
+ explicit JoinProcessor(const DocumentStore* doc_store,
+ const SchemaStore* schema_store,
+ const QualifiedIdJoinIndex* qualified_id_join_index,
+ int64_t current_time_ms)
+ : doc_store_(doc_store),
+ schema_store_(schema_store),
+ qualified_id_join_index_(qualified_id_join_index),
+ current_time_ms_(current_time_ms) {}
+
+  // Gets a JoinChildrenFetcher used to fetch all child documents of a given
+  // parent document id.
+ //
+ // Returns:
+ // A JoinChildrenFetcher instance on success.
+ // UNIMPLEMENTED_ERROR if the join type specified by join_spec is not
+ // supported.
+ libtextclassifier3::StatusOr<JoinChildrenFetcher> GetChildrenFetcher(
+ const JoinSpecProto& join_spec,
+ std::vector<ScoredDocumentHit>&& child_scored_document_hits);
+
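+  // Joins each parent ScoredDocumentHit with its children (looked up via
+  // join_children_fetcher) into a JoinedScoredDocumentHit, computing the
+  // final score with the aggregation scoring strategy in join_spec. Parents
+  // without children are kept (left join).
+  //
+  // Returns:
+  //   The vector of JoinedScoredDocumentHits on success.
+  //   Any JoinChildrenFetcher::GetChildren errors.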
+ libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>> Join(
+ const JoinSpecProto& join_spec,
+ std::vector<ScoredDocumentHit>&& parent_scored_document_hits,
+ const JoinChildrenFetcher& join_children_fetcher);
+
+ private:
+  // Fetches the referenced document id of the given document under the given
+  // property path.
+  //
+  // TODO(b/256022027): validate that the joinable property (and its
+  //   upper-level properties) do not have REPEATED cardinality.
+  //
+  // Returns:
+  //   - A valid referenced document id on success
+  //   - kInvalidDocumentId if the given document is not found, doesn't have a
+  //     qualified id joinable type for the given property_path, or doesn't
+  //     have a joinable value (an optional property)
+  //   - Any other QualifiedIdJoinIndex errors
+ libtextclassifier3::StatusOr<DocumentId> FetchReferencedQualifiedId(
+ const DocumentId& document_id, const std::string& property_path) const;
+
+ const DocumentStore* doc_store_; // Does not own.
+ const SchemaStore* schema_store_; // Does not own.
+ const QualifiedIdJoinIndex* qualified_id_join_index_; // Does not own.
+ int64_t current_time_ms_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JOIN_JOIN_PROCESSOR_H_
diff --git a/icing/join/join-processor_test.cc b/icing/join/join-processor_test.cc
new file mode 100644
index 0000000..a40d934
--- /dev/null
+++ b/icing/join/join-processor_test.cc
@@ -0,0 +1,930 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/join-processor.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/join/join-children-fetcher.h"
+#include "icing/join/qualified-id-join-index-impl-v1.h"
+#include "icing/join/qualified-id-join-index-impl-v2.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/join/qualified-id-join-indexing-handler.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::IsTrue;
+
+// TODO(b/275121148): remove template after deprecating
+// QualifiedIdJoinIndexImplV1.
+template <typename T>
+class JoinProcessorTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/icing_join_processor_test";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()),
+ IsTrue());
+
+ schema_store_dir_ = test_dir_ + "/schema_store";
+ doc_store_dir_ = test_dir_ + "/doc_store";
+ qualified_id_join_index_dir_ = test_dir_ + "/qualified_id_join_index";
+
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+
+ ASSERT_THAT(
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()),
+ IsTrue());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("content")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("receiver")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str()),
+ IsTrue());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/true, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ doc_store_ = std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_,
+ CreateQualifiedIdJoinIndex<T>());
+ }
+
+ void TearDown() override {
+ qualified_id_join_index_.reset();
+ doc_store_.reset();
+ schema_store_.reset();
+ lang_segmenter_.reset();
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ template <typename UnknownJoinIndexType>
+ libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>>
+ CreateQualifiedIdJoinIndex() {
+ return absl_ports::InvalidArgumentError("Unknown type");
+ }
+
+ template <>
+ libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>>
+ CreateQualifiedIdJoinIndex<QualifiedIdJoinIndexImplV1>() {
+ return QualifiedIdJoinIndexImplV1::Create(
+ filesystem_, qualified_id_join_index_dir_, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false);
+ }
+
+ template <>
+ libtextclassifier3::StatusOr<std::unique_ptr<QualifiedIdJoinIndex>>
+ CreateQualifiedIdJoinIndex<QualifiedIdJoinIndexImplV2>() {
+ return QualifiedIdJoinIndexImplV2::Create(filesystem_,
+ qualified_id_join_index_dir_,
+ /*pre_mapping_fbv=*/false);
+ }
+
+ libtextclassifier3::StatusOr<DocumentId> PutAndIndexDocument(
+ const DocumentProto& document) {
+ ICING_ASSIGN_OR_RETURN(DocumentId document_id, doc_store_->Put(document));
+ ICING_ASSIGN_OR_RETURN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+ ICING_RETURN_IF_ERROR(handler->Handle(tokenized_document, document_id,
+ /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr));
+ return document_id;
+ }
+
+ libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>> Join(
+ const JoinSpecProto& join_spec,
+ std::vector<ScoredDocumentHit> parent_scored_document_hits,
+ std::vector<ScoredDocumentHit> child_scored_document_hits) {
+ JoinProcessor join_processor(
+ doc_store_.get(), schema_store_.get(), qualified_id_join_index_.get(),
+ /*current_time_ms=*/fake_clock_.GetSystemTimeMilliseconds());
+ ICING_ASSIGN_OR_RETURN(
+ JoinChildrenFetcher join_children_fetcher,
+ join_processor.GetChildrenFetcher(
+ join_spec, std::move(child_scored_document_hits)));
+ return join_processor.Join(join_spec,
+ std::move(parent_scored_document_hits),
+ join_children_fetcher);
+ }
+
+ Filesystem filesystem_;
+ std::string test_dir_;
+ std::string schema_store_dir_;
+ std::string doc_store_dir_;
+ std::string qualified_id_join_index_dir_;
+
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> doc_store_;
+ std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_;
+
+ FakeClock fake_clock_;
+};
+
+using TestTypes =
+ ::testing::Types<QualifiedIdJoinIndexImplV1, QualifiedIdJoinIndexImplV2>;
+TYPED_TEST_SUITE(JoinProcessorTest, TestTypes);
+
+TYPED_TEST(JoinProcessorTest, JoinByQualifiedId_allDocuments) {
+ DocumentProto person1 = DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("Name", "Alice")
+ .Build();
+ DocumentProto person2 = DocumentBuilder()
+ .SetKey(R"(pkg$db/name#space\\)", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("Name", "Bob")
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("sender", "pkg$db/namespace#person1")
+ .Build();
+ DocumentProto email2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 2")
+ .AddStringProperty("sender",
+ R"(pkg$db/name\#space\\\\#person2)") // escaped
+ .Build();
+ DocumentProto email3 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email3")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 3")
+ .AddStringProperty("sender", "pkg$db/namespace#person1")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ this->PutAndIndexDocument(person1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ this->PutAndIndexDocument(person2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ this->PutAndIndexDocument(email1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ this->PutAndIndexDocument(email2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ this->PutAndIndexDocument(email3));
+
+ ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit2(document_id2, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit3(document_id3, kSectionIdMaskNone,
+ /*score=*/3.0);
+ ScoredDocumentHit scored_doc_hit4(document_id4, kSectionIdMaskNone,
+ /*score=*/4.0);
+ ScoredDocumentHit scored_doc_hit5(document_id5, kSectionIdMaskNone,
+ /*score=*/5.0);
+
+ // Parent ScoredDocumentHits: all Person documents
+ std::vector<ScoredDocumentHit> parent_scored_document_hits = {
+ scored_doc_hit2, scored_doc_hit1};
+
+ // Child ScoredDocumentHits: all Email documents
+ std::vector<ScoredDocumentHit> child_scored_document_hits = {
+ scored_doc_hit5, scored_doc_hit4, scored_doc_hit3};
+
+ JoinSpecProto join_spec;
+ join_spec.set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec.set_child_property_expression("sender");
+ join_spec.set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ join_spec.mutable_nested_spec()->mutable_scoring_spec()->set_order_by(
+ ScoringSpecProto::Order::DESC);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<JoinedScoredDocumentHit> joined_result_document_hits,
+ this->Join(join_spec, std::move(parent_scored_document_hits),
+ std::move(child_scored_document_hits)));
+ EXPECT_THAT(
+ joined_result_document_hits,
+ ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/1.0,
+ /*parent_scored_document_hit=*/scored_doc_hit2,
+ /*child_scored_document_hits=*/{scored_doc_hit4})),
+ EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/2.0,
+ /*parent_scored_document_hit=*/scored_doc_hit1,
+ /*child_scored_document_hits=*/
+ {scored_doc_hit5, scored_doc_hit3}))));
+}
+
+TYPED_TEST(JoinProcessorTest, JoinByQualifiedId_partialDocuments) {
+ DocumentProto person1 = DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("Name", "Alice")
+ .Build();
+ DocumentProto person2 = DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("Name", "Bob")
+ .Build();
+ DocumentProto person3 = DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person3")
+ .SetSchema("Person")
+ .AddStringProperty("Name", "Eve")
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("sender", "pkg$db/namespace#person1")
+ .Build();
+ DocumentProto email2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 2")
+ .AddStringProperty("sender", "pkg$db/namespace#person2")
+ .Build();
+ DocumentProto email3 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email3")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 3")
+ .AddStringProperty("sender", "pkg$db/namespace#person3")
+ .Build();
+ DocumentProto email4 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email4")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 4")
+ .AddStringProperty("sender", "pkg$db/namespace#person1")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ this->PutAndIndexDocument(person1));
+ ICING_ASSERT_OK(/*document_id2 unused*/
+ this->PutAndIndexDocument(person2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ this->PutAndIndexDocument(person3));
+ ICING_ASSERT_OK(/*document_id4 unused*/
+ this->PutAndIndexDocument(email1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ this->PutAndIndexDocument(email2));
+ ICING_ASSERT_OK(/*document_id6 unused*/
+ this->PutAndIndexDocument(email3));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id7,
+ this->PutAndIndexDocument(email4));
+
+ ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit3(document_id3, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit5(document_id5, kSectionIdMaskNone,
+ /*score=*/4.0);
+ ScoredDocumentHit scored_doc_hit7(document_id7, kSectionIdMaskNone,
+ /*score=*/5.0);
+
+ // Only join person1, person3, email2 and email4.
+ // Parent ScoredDocumentHits: person1, person3
+ std::vector<ScoredDocumentHit> parent_scored_document_hits = {
+ scored_doc_hit3, scored_doc_hit1};
+
+ // Child ScoredDocumentHits: email2, email4
+ std::vector<ScoredDocumentHit> child_scored_document_hits = {scored_doc_hit7,
+ scored_doc_hit5};
+
+ JoinSpecProto join_spec;
+ join_spec.set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec.set_child_property_expression("sender");
+ join_spec.set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ join_spec.mutable_nested_spec()->mutable_scoring_spec()->set_order_by(
+ ScoringSpecProto::Order::DESC);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<JoinedScoredDocumentHit> joined_result_document_hits,
+ this->Join(join_spec, std::move(parent_scored_document_hits),
+ std::move(child_scored_document_hits)));
+ EXPECT_THAT(
+ joined_result_document_hits,
+ ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/0.0,
+ /*parent_scored_document_hit=*/scored_doc_hit3,
+ /*child_scored_document_hits=*/{})),
+ EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/1.0,
+ /*parent_scored_document_hit=*/scored_doc_hit1,
+ /*child_scored_document_hits=*/{scored_doc_hit7}))));
+}
+
+TYPED_TEST(JoinProcessorTest,
+ ShouldIgnoreChildDocumentsWithoutJoiningProperty) {
+ DocumentProto person1 = DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("Name", "Alice")
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("sender", "pkg$db/namespace#person1")
+ .Build();
+ DocumentProto email2 = DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 2")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ this->PutAndIndexDocument(person1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ this->PutAndIndexDocument(email1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ this->PutAndIndexDocument(email2));
+
+ ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit2(document_id2, kSectionIdMaskNone,
+ /*score=*/5.0);
+ ScoredDocumentHit scored_doc_hit3(document_id3, kSectionIdMaskNone,
+ /*score=*/6.0);
+
+ // Parent ScoredDocumentHits: all Person documents
+ std::vector<ScoredDocumentHit> parent_scored_document_hits = {
+ scored_doc_hit1};
+
+ // Child ScoredDocumentHits: all Email documents
+ std::vector<ScoredDocumentHit> child_scored_document_hits = {scored_doc_hit2,
+ scored_doc_hit3};
+
+ JoinSpecProto join_spec;
+ join_spec.set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec.set_child_property_expression("sender");
+ join_spec.set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ join_spec.mutable_nested_spec()->mutable_scoring_spec()->set_order_by(
+ ScoringSpecProto::Order::DESC);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<JoinedScoredDocumentHit> joined_result_document_hits,
+ this->Join(join_spec, std::move(parent_scored_document_hits),
+ std::move(child_scored_document_hits)));
+  // Since email2 doesn't have a "sender" property, it should be ignored.
+ EXPECT_THAT(
+ joined_result_document_hits,
+ ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/1.0, /*parent_scored_document_hit=*/scored_doc_hit1,
+ /*child_scored_document_hits=*/{scored_doc_hit2}))));
+}
+
+TYPED_TEST(JoinProcessorTest,
+ ShouldIgnoreChildDocumentsWithInvalidQualifiedId) {
+ DocumentProto person1 = DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("Name", "Alice")
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("sender", "pkg$db/namespace#person1")
+ .Build();
+ DocumentProto email2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 2")
+ .AddStringProperty(
+ "sender",
+ "pkg$db/namespace#person2") // qualified id is invalid since
+ // person2 doesn't exist.
+ .Build();
+ DocumentProto email3 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email3")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 3")
+ .AddStringProperty("sender",
+ R"(pkg$db/namespace\#person1)") // invalid format
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ this->PutAndIndexDocument(person1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ this->PutAndIndexDocument(email1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ this->PutAndIndexDocument(email2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ this->PutAndIndexDocument(email3));
+
+ ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit2(document_id2, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit3(document_id3, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit4(document_id4, kSectionIdMaskNone,
+ /*score=*/0.0);
+
+ // Parent ScoredDocumentHits: all Person documents
+ std::vector<ScoredDocumentHit> parent_scored_document_hits = {
+ scored_doc_hit1};
+
+ // Child ScoredDocumentHits: all Email documents
+ std::vector<ScoredDocumentHit> child_scored_document_hits = {
+ scored_doc_hit2, scored_doc_hit3, scored_doc_hit4};
+
+ JoinSpecProto join_spec;
+ join_spec.set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec.set_child_property_expression("sender");
+ join_spec.set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ join_spec.mutable_nested_spec()->mutable_scoring_spec()->set_order_by(
+ ScoringSpecProto::Order::DESC);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<JoinedScoredDocumentHit> joined_result_document_hits,
+ this->Join(join_spec, std::move(parent_scored_document_hits),
+ std::move(child_scored_document_hits)));
+  // Email 2 (document id 3) references a nonexistent document and email 3
+  // (document id 4) has a malformed qualified id, so the join processor
+  // should ignore both.
+ EXPECT_THAT(joined_result_document_hits,
+ ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/1.0,
+ /*parent_scored_document_hit=*/scored_doc_hit1,
+ /*child_scored_document_hits=*/{scored_doc_hit2}))));
+}
+
+TYPED_TEST(JoinProcessorTest, LeftJoinShouldReturnParentWithoutChildren) {
+ DocumentProto person1 = DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("Name", "Alice")
+ .Build();
+ DocumentProto person2 = DocumentBuilder()
+ .SetKey(R"(pkg$db/name#space\\)", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("Name", "Bob")
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("sender",
+ R"(pkg$db/name\#space\\\\#person2)") // escaped
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ this->PutAndIndexDocument(person1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ this->PutAndIndexDocument(person2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ this->PutAndIndexDocument(email1));
+
+ ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit2(document_id2, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit3(document_id3, kSectionIdMaskNone,
+ /*score=*/3.0);
+
+ // Parent ScoredDocumentHits: all Person documents
+ std::vector<ScoredDocumentHit> parent_scored_document_hits = {
+ scored_doc_hit2, scored_doc_hit1};
+
+ // Child ScoredDocumentHits: all Email documents
+ std::vector<ScoredDocumentHit> child_scored_document_hits = {scored_doc_hit3};
+
+ JoinSpecProto join_spec;
+ join_spec.set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec.set_child_property_expression("sender");
+ join_spec.set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ join_spec.mutable_nested_spec()->mutable_scoring_spec()->set_order_by(
+ ScoringSpecProto::Order::DESC);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<JoinedScoredDocumentHit> joined_result_document_hits,
+ this->Join(join_spec, std::move(parent_scored_document_hits),
+ std::move(child_scored_document_hits)));
+  // Person1 has no child documents, but the left join should still include
+  // it.
+ EXPECT_THAT(
+ joined_result_document_hits,
+ ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/1.0,
+ /*parent_scored_document_hit=*/scored_doc_hit2,
+ /*child_scored_document_hits=*/{scored_doc_hit3})),
+ EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/0.0,
+ /*parent_scored_document_hit=*/scored_doc_hit1,
+ /*child_scored_document_hits=*/{}))));
+}
+
+TYPED_TEST(JoinProcessorTest, ShouldSortChildDocumentsByRankingStrategy) {
+ DocumentProto person1 = DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("Name", "Alice")
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("sender", "pkg$db/namespace#person1")
+ .Build();
+ DocumentProto email2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 2")
+ .AddStringProperty("sender", "pkg$db/namespace#person1")
+ .Build();
+ DocumentProto email3 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email3")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 3")
+ .AddStringProperty("sender", "pkg$db/namespace#person1")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ this->PutAndIndexDocument(person1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ this->PutAndIndexDocument(email1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ this->PutAndIndexDocument(email2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ this->PutAndIndexDocument(email3));
+
+ ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit2(document_id2, kSectionIdMaskNone,
+ /*score=*/2.0);
+ ScoredDocumentHit scored_doc_hit3(document_id3, kSectionIdMaskNone,
+ /*score=*/5.0);
+ ScoredDocumentHit scored_doc_hit4(document_id4, kSectionIdMaskNone,
+ /*score=*/3.0);
+
+ // Parent ScoredDocumentHits: all Person documents
+ std::vector<ScoredDocumentHit> parent_scored_document_hits = {
+ scored_doc_hit1};
+
+ // Child ScoredDocumentHits: all Email documents
+ std::vector<ScoredDocumentHit> child_scored_document_hits = {
+ scored_doc_hit2, scored_doc_hit3, scored_doc_hit4};
+
+ JoinSpecProto join_spec;
+ join_spec.set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec.set_child_property_expression("sender");
+ join_spec.set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ join_spec.mutable_nested_spec()->mutable_scoring_spec()->set_order_by(
+ ScoringSpecProto::Order::DESC);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<JoinedScoredDocumentHit> joined_result_document_hits,
+ this->Join(join_spec, std::move(parent_scored_document_hits),
+ std::move(child_scored_document_hits)));
+ // Child documents should be sorted according to the (nested) ranking
+ // strategy.
+ EXPECT_THAT(
+ joined_result_document_hits,
+ ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/3.0, /*parent_scored_document_hit=*/scored_doc_hit1,
+ /*child_scored_document_hits=*/
+ {scored_doc_hit3, scored_doc_hit4, scored_doc_hit2}))));
+}
+
+TYPED_TEST(JoinProcessorTest, ShouldAllowSelfJoining) {
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("sender", "pkg$db/namespace#email1")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ this->PutAndIndexDocument(email1));
+
+ ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone,
+ /*score=*/0.0);
+
+  // Parent ScoredDocumentHits: email1, which references itself via "sender".
+ std::vector<ScoredDocumentHit> parent_scored_document_hits = {
+ scored_doc_hit1};
+
+ // Child ScoredDocumentHits: all Email documents
+ std::vector<ScoredDocumentHit> child_scored_document_hits = {scored_doc_hit1};
+
+ JoinSpecProto join_spec;
+ join_spec.set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec.set_child_property_expression("sender");
+ join_spec.set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ join_spec.mutable_nested_spec()->mutable_scoring_spec()->set_order_by(
+ ScoringSpecProto::Order::DESC);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<JoinedScoredDocumentHit> joined_result_document_hits,
+ this->Join(join_spec, std::move(parent_scored_document_hits),
+ std::move(child_scored_document_hits)));
+ EXPECT_THAT(joined_result_document_hits,
+ ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/1.0,
+ /*parent_scored_document_hit=*/scored_doc_hit1,
+ /*child_scored_document_hits=*/{scored_doc_hit1}))));
+}
+
+TYPED_TEST(JoinProcessorTest, MultipleChildSchemasJoining) {
+ DocumentProto person1 = DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person1")
+ .SetSchema("Person")
+ .AddStringProperty("Name", "Alice")
+ .Build();
+ DocumentProto person2 = DocumentBuilder()
+ .SetKey("pkg$db/namespace", "person2")
+ .SetSchema("Person")
+ .AddStringProperty("Name", "Bob")
+ .Build();
+
+ DocumentProto email1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email1")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 1")
+ .AddStringProperty("sender", "pkg$db/namespace#person2")
+ .Build();
+ DocumentProto email2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email2")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 2")
+ .AddStringProperty("sender", "pkg$db/namespace#person1")
+ .Build();
+ DocumentProto email3 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "email3")
+ .SetSchema("Email")
+ .AddStringProperty("subject", "test subject 3")
+ .AddStringProperty("sender", "pkg$db/namespace#person1")
+ .Build();
+ DocumentProto message1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "message1")
+ .SetSchema("Message")
+ .AddStringProperty("content", "test content 1")
+ .AddStringProperty("sender", "pkg$db/namespace#person1")
+ .AddStringProperty("receiver", "pkg$db/namespace#person2")
+ .Build();
+ DocumentProto message2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/namespace", "message2")
+ .SetSchema("Message")
+ .AddStringProperty("content", "test content 2")
+ .AddStringProperty("sender", "pkg$db/namespace#person2")
+ .AddStringProperty("receiver", "pkg$db/namespace#person1")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ this->PutAndIndexDocument(person1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ this->PutAndIndexDocument(person2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ this->PutAndIndexDocument(email1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ this->PutAndIndexDocument(email2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ this->PutAndIndexDocument(email3));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id6,
+ this->PutAndIndexDocument(message1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id7,
+ this->PutAndIndexDocument(message2));
+
+ ScoredDocumentHit scored_doc_hit1(document_id1, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit2(document_id2, kSectionIdMaskNone,
+ /*score=*/0.0);
+ ScoredDocumentHit scored_doc_hit3(document_id3, kSectionIdMaskNone,
+ /*score=*/5.0);
+ ScoredDocumentHit scored_doc_hit4(document_id4, kSectionIdMaskNone,
+ /*score=*/3.0);
+ ScoredDocumentHit scored_doc_hit5(document_id5, kSectionIdMaskNone,
+ /*score=*/2.0);
+ ScoredDocumentHit scored_doc_hit6(document_id6, kSectionIdMaskNone,
+ /*score=*/4.0);
+ ScoredDocumentHit scored_doc_hit7(document_id7, kSectionIdMaskNone,
+ /*score=*/1.0);
+
+ // Parent ScoredDocumentHits: all Person documents
+ std::vector<ScoredDocumentHit> parent_scored_document_hits = {
+ scored_doc_hit1, scored_doc_hit2};
+
+ // Child ScoredDocumentHits: all Email and Message documents
+ std::vector<ScoredDocumentHit> child_scored_document_hits = {
+ scored_doc_hit3, scored_doc_hit4, scored_doc_hit5, scored_doc_hit6,
+ scored_doc_hit7};
+
+ // Join by "sender".
+ // - Person1: [
+ // email2 (scored_doc_hit4),
+ // email3 (scored_doc_hit5),
+ // message1 (scored_doc_hit6),
+ // ]
+ // - Person2: [
+ // email1 (scored_doc_hit3),
+ // message2 (scored_doc_hit7),
+ // ]
+ JoinSpecProto join_spec;
+ join_spec.set_parent_property_expression(
+ std::string(JoinProcessor::kQualifiedIdExpr));
+ join_spec.set_child_property_expression("sender");
+ join_spec.set_aggregation_scoring_strategy(
+ JoinSpecProto::AggregationScoringStrategy::COUNT);
+ join_spec.mutable_nested_spec()->mutable_scoring_spec()->set_order_by(
+ ScoringSpecProto::Order::DESC);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<JoinedScoredDocumentHit> joined_result_document_hits1,
+ this->Join(join_spec, parent_scored_document_hits,
+ child_scored_document_hits));
+ EXPECT_THAT(
+ joined_result_document_hits1,
+ ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/3.0,
+ /*parent_scored_document_hit=*/scored_doc_hit1,
+ /*child_scored_document_hits=*/
+ {scored_doc_hit6, scored_doc_hit4, scored_doc_hit5})),
+ EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/2.0,
+ /*parent_scored_document_hit=*/scored_doc_hit2,
+ /*child_scored_document_hits=*/
+ {scored_doc_hit3, scored_doc_hit7}))));
+
+ // Join by "receiver".
+ // - Person1: [
+ // message2 (scored_doc_hit7),
+ // ]
+ // - Person2: [
+ // message1 (scored_doc_hit6),
+ // ]
+ join_spec.set_child_property_expression("receiver");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<JoinedScoredDocumentHit> joined_result_document_hits2,
+ this->Join(join_spec, parent_scored_document_hits,
+ child_scored_document_hits));
+ EXPECT_THAT(
+ joined_result_document_hits2,
+ ElementsAre(EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/1.0,
+ /*parent_scored_document_hit=*/scored_doc_hit1,
+ /*child_scored_document_hits=*/{scored_doc_hit7})),
+ EqualsJoinedScoredDocumentHit(JoinedScoredDocumentHit(
+ /*final_score=*/1.0,
+ /*parent_scored_document_hit=*/scored_doc_hit2,
+ /*child_scored_document_hits=*/{scored_doc_hit6}))));
+}
+
+// TODO(b/256022027): add unit tests for non-joinable property. If joinable
+//                       value type is unset, then qualified id join should not
+// include the child document even if it contains a valid
+// qualified id string.
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/posting-list-join-data-accessor.h b/icing/join/posting-list-join-data-accessor.h
new file mode 100644
index 0000000..6669f9f
--- /dev/null
+++ b/icing/join/posting-list-join-data-accessor.h
@@ -0,0 +1,211 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_POSTING_LIST_JOIN_DATA_ACCESSOR_H_
+#define ICING_JOIN_POSTING_LIST_JOIN_DATA_ACCESSOR_H_
+
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/index-block.h"
+#include "icing/file/posting_list/posting-list-accessor.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/join/posting-list-join-data-serializer.h"
+#include "icing/legacy/index/icing-bit-util.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+// This class provides a simple abstraction for adding join data to posting
+// lists. PostingListJoinDataAccessor handles:
+// 1) selection of properly-sized posting lists for the accumulated join index
+// data during Finalize()
+// 2) chaining of max-sized posting lists.
+template <typename JoinDataType>
+class PostingListJoinDataAccessor : public PostingListAccessor {
+ public:
+ // Creates an empty PostingListJoinDataAccessor.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListJoinDataAccessor
+ // - INVALID_ARGUMENT error if storage has an invalid block_size.
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>>
+ Create(FlashIndexStorage* storage,
+ PostingListJoinDataSerializer<JoinDataType>* serializer);
+
+ // Creates a PostingListJoinDataAccessor with an existing posting list
+ // identified by existing_posting_list_id.
+ //
+ // RETURNS:
+ // - On success, a valid instance of PostingListJoinDataAccessor
+ // - INVALID_ARGUMENT if storage has an invalid block_size.
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>>
+ CreateFromExisting(FlashIndexStorage* storage,
+ PostingListJoinDataSerializer<JoinDataType>* serializer,
+ PostingListIdentifier existing_posting_list_id);
+
+ PostingListSerializer* GetSerializer() override { return serializer_; }
+
+ // Retrieves the next batch of data in the posting list chain.
+ //
+ // RETURNS:
+ // - On success, a vector of join data in the posting list chain
+ // - FAILED_PRECONDITION_ERROR if called on an instance that was created via
+ // Create.
+ // - INTERNAL_ERROR if unable to read the next posting list in the chain or
+ // if the posting list has been corrupted somehow.
+ libtextclassifier3::StatusOr<std::vector<JoinDataType>> GetNextDataBatch();
+
+ // Prepends one data. This may result in flushing the posting list to disk (if
+ // the PostingListJoinDataAccessor holds a max-sized posting list that is
+  // full) or freeing a pre-existing posting list if it is too small to fit
+  // all of the necessary data.
+ //
+ // RETURNS:
+ // - OK, on success
+  //   - INVALID_ARGUMENT if !data.is_valid() or if data is smaller than the
+  //     most recently added data.
+ // - RESOURCE_EXHAUSTED error if unable to grow the index to allocate a new
+ // posting list.
+ libtextclassifier3::Status PrependData(const JoinDataType& data);
+
+ private:
+ explicit PostingListJoinDataAccessor(
+ FlashIndexStorage* storage, PostingListUsed in_memory_posting_list,
+ PostingListJoinDataSerializer<JoinDataType>* serializer)
+ : PostingListAccessor(storage, std::move(in_memory_posting_list)),
+ serializer_(serializer) {}
+
+ PostingListJoinDataSerializer<JoinDataType>* serializer_; // Does not own.
+};
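+
+// A minimal usage sketch (illustrative only, not part of this change): the
+// typical write path pairs Create() with PrependData() and Finalize(). The
+// `storage`, `serializer`, and `sorted_data` names below are assumptions for
+// the example.
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> accessor,
+//       PostingListJoinDataAccessor<JoinDataType>::Create(&storage,
+//                                                         &serializer));
+//   for (const JoinDataType& data : sorted_data) {  // ascending order
+//     ICING_RETURN_IF_ERROR(accessor->PrependData(data));
+//   }
+//   PostingListAccessor::FinalizeResult result =
+//       std::move(*accessor).Finalize();
+//   ICING_RETURN_IF_ERROR(result.status);
+//   // result.id identifies the (possibly chained) posting list on disk.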
+
+template <typename JoinDataType>
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>>
+PostingListJoinDataAccessor<JoinDataType>::Create(
+ FlashIndexStorage* storage,
+ PostingListJoinDataSerializer<JoinDataType>* serializer) {
+ uint32_t max_posting_list_bytes = IndexBlock::CalculateMaxPostingListBytes(
+ storage->block_size(), serializer->GetDataTypeBytes());
+ ICING_ASSIGN_OR_RETURN(PostingListUsed in_memory_posting_list,
+ PostingListUsed::CreateFromUnitializedRegion(
+ serializer, max_posting_list_bytes));
+ return std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>(
+ new PostingListJoinDataAccessor<JoinDataType>(
+ storage, std::move(in_memory_posting_list), serializer));
+}
+
+template <typename JoinDataType>
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>>
+PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
+ FlashIndexStorage* storage,
+ PostingListJoinDataSerializer<JoinDataType>* serializer,
+ PostingListIdentifier existing_posting_list_id) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
+ Create(storage, serializer));
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage->GetPostingList(existing_posting_list_id));
+ pl_accessor->preexisting_posting_list_ =
+ std::make_unique<PostingListHolder>(std::move(holder));
+ return pl_accessor;
+}
+
+// Returns the next batch of join data for the provided posting list.
+template <typename JoinDataType>
+libtextclassifier3::StatusOr<std::vector<JoinDataType>>
+PostingListJoinDataAccessor<JoinDataType>::GetNextDataBatch() {
+ if (preexisting_posting_list_ == nullptr) {
+ if (has_reached_posting_list_chain_end_) {
+ return std::vector<JoinDataType>();
+ }
+ return absl_ports::FailedPreconditionError(
+ "Cannot retrieve data from a PostingListJoinDataAccessor that was not "
+ "created from a preexisting posting list.");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<JoinDataType> batch,
+ serializer_->GetData(&preexisting_posting_list_->posting_list));
+ uint32_t next_block_index = kInvalidBlockIndex;
+ // Posting lists will only be chained when they are max-sized, in which case
+ // next_block_index will point to the next block for the next posting list.
+ // Otherwise, next_block_index can be kInvalidBlockIndex or be used to point
+ // to the next free list block, which is not relevant here.
+ if (preexisting_posting_list_->posting_list.size_in_bytes() ==
+ storage_->max_posting_list_bytes()) {
+ next_block_index = preexisting_posting_list_->next_block_index;
+ }
+
+ if (next_block_index != kInvalidBlockIndex) {
+    // Since we only have to deal with the next block of a max-sized posting
+    // list, max_num_posting_lists is 1 and posting_list_index_bits is
+ // BitsToStore(1).
+ PostingListIdentifier next_posting_list_id(
+ next_block_index, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/BitsToStore(1));
+ ICING_ASSIGN_OR_RETURN(PostingListHolder holder,
+ storage_->GetPostingList(next_posting_list_id));
+ preexisting_posting_list_ =
+ std::make_unique<PostingListHolder>(std::move(holder));
+ } else {
+ has_reached_posting_list_chain_end_ = true;
+ preexisting_posting_list_.reset();
+ }
+ return batch;
+}
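+
+// A companion read sketch (illustrative only): draining a posting list chain
+// created earlier, given its PostingListIdentifier `id`. All names other than
+// the accessor API are assumptions for the example. GetNextDataBatch returns
+// an empty vector once the chain end has been reached.
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> accessor,
+//       PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
+//           &storage, &serializer, id));
+//   std::vector<JoinDataType> all_data;
+//   ICING_ASSIGN_OR_RETURN(std::vector<JoinDataType> batch,
+//                          accessor->GetNextDataBatch());
+//   while (!batch.empty()) {
+//     all_data.insert(all_data.end(), batch.begin(), batch.end());
+//     ICING_ASSIGN_OR_RETURN(batch, accessor->GetNextDataBatch());
+//   }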
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataAccessor<JoinDataType>::PrependData(
+ const JoinDataType& data) {
+ PostingListUsed& active_pl = (preexisting_posting_list_ != nullptr)
+ ? preexisting_posting_list_->posting_list
+ : in_memory_posting_list_;
+ libtextclassifier3::Status status =
+ serializer_->PrependData(&active_pl, data);
+ if (!absl_ports::IsResourceExhausted(status)) {
+ return status;
+ }
+  // There is no more room to add data to the current posting list. Therefore,
+ // we need to either move those data to a larger posting list or flush this
+ // posting list and create another max-sized posting list in the chain.
+ if (preexisting_posting_list_ != nullptr) {
+ ICING_RETURN_IF_ERROR(FlushPreexistingPostingList());
+ } else {
+ ICING_RETURN_IF_ERROR(FlushInMemoryPostingList());
+ }
+
+ // Re-add data. Should always fit since we just cleared
+ // in_memory_posting_list_. It's fine to explicitly reference
+ // in_memory_posting_list_ here because there's no way of reaching this line
+ // while preexisting_posting_list_ is still in use.
+ return serializer_->PrependData(&in_memory_posting_list_, data);
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JOIN_POSTING_LIST_JOIN_DATA_ACCESSOR_H_
diff --git a/icing/join/posting-list-join-data-accessor_test.cc b/icing/join/posting-list-join-data-accessor_test.cc
new file mode 100644
index 0000000..ddc2d32
--- /dev/null
+++ b/icing/join/posting-list-join-data-accessor_test.cc
@@ -0,0 +1,435 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/posting-list-join-data-accessor.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-accessor.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/join/document-id-to-join-info.h"
+#include "icing/join/posting-list-join-data-serializer.h"
+#include "icing/store/document-id.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
+#include "icing/store/namespace-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+using ::testing::Eq;
+using ::testing::Lt;
+using ::testing::Ne;
+using ::testing::SizeIs;
+
+using JoinDataType = DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>;
+
+static constexpr NamespaceId kDefaultNamespaceId = 1;
+
+class PostingListJoinDataAccessorTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/test_dir";
+ file_name_ = test_dir_ + "/test_file.idx.index";
+
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(test_dir_.c_str()));
+
+ serializer_ =
+ std::make_unique<PostingListJoinDataSerializer<JoinDataType>>();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(file_name_, &filesystem_, serializer_.get()));
+ flash_index_storage_ =
+ std::make_unique<FlashIndexStorage>(std::move(flash_index_storage));
+ }
+
+ void TearDown() override {
+ flash_index_storage_.reset();
+ serializer_.reset();
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ Filesystem filesystem_;
+ std::string test_dir_;
+ std::string file_name_;
+ std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>> serializer_;
+ std::unique_ptr<FlashIndexStorage> flash_index_storage_;
+};
+
+std::vector<JoinDataType> CreateData(int num_data, DocumentId start_document_id,
+ NamespaceId ref_namespace_id,
+ uint64_t start_ref_hash_uri) {
+ std::vector<JoinDataType> data;
+ data.reserve(num_data);
+ for (int i = 0; i < num_data; ++i) {
+ data.push_back(JoinDataType(
+ start_document_id,
+ NamespaceFingerprintIdentifier(ref_namespace_id,
+ /*fingerprint=*/start_ref_hash_uri)));
+
+ ++start_document_id;
+ ++start_ref_hash_uri;
+ }
+ return data;
+}
+
+TEST_F(PostingListJoinDataAccessorTest, DataAddAndRetrieveProperly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::Create(
+ flash_index_storage_.get(), serializer_.get()));
+ // Add some join data
+ std::vector<JoinDataType> data_vec =
+ CreateData(/*num_data=*/5, /*start_document_id=*/0,
+ /*ref_namespace_id=*/kDefaultNamespaceId,
+ /*start_ref_hash_uri=*/819);
+ for (const JoinDataType& data : data_vec) {
+ EXPECT_THAT(pl_accessor->PrependData(data), IsOk());
+ }
+ PostingListAccessor::FinalizeResult result =
+ std::move(*pl_accessor).Finalize();
+ EXPECT_THAT(result.status, IsOk());
+ EXPECT_THAT(result.id.block_index(), Eq(1));
+ EXPECT_THAT(result.id.posting_list_index(), Eq(0));
+
+ // Retrieve some data.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result.id));
+ EXPECT_THAT(
+ serializer_->GetData(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAreArray(data_vec.rbegin(), data_vec.rend())));
+ EXPECT_THAT(pl_holder.next_block_index, Eq(kInvalidBlockIndex));
+}
+
+TEST_F(PostingListJoinDataAccessorTest, PreexistingPLKeepOnSameBlock) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::Create(
+ flash_index_storage_.get(), serializer_.get()));
+ // Add a single data. This will fit in a min-sized posting list.
+ JoinDataType data1(
+ /*document_id=*/1,
+ NamespaceFingerprintIdentifier(kDefaultNamespaceId, /*fingerprint=*/123));
+ ICING_ASSERT_OK(pl_accessor->PrependData(data1));
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result1.status);
+ // Should be allocated to the first block.
+ ASSERT_THAT(result1.id.block_index(), Eq(1));
+ ASSERT_THAT(result1.id.posting_list_index(), Eq(0));
+
+  // Add one more data. A minimum-sized posting list must be able to fit two
+  // data, so this should NOT cause the previous posting list to be
+  // reallocated.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+ JoinDataType data2(
+ /*document_id=*/2,
+ NamespaceFingerprintIdentifier(kDefaultNamespaceId, /*fingerprint=*/456));
+ ICING_ASSERT_OK(pl_accessor->PrependData(data2));
+ PostingListAccessor::FinalizeResult result2 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result2.status);
+ // Should be in the same posting list.
+ EXPECT_THAT(result2.id, Eq(result1.id));
+
+ // The posting list at result2.id should hold all of the data that have been
+ // added.
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result2.id));
+ EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAre(data2, data1)));
+}
+
+TEST_F(PostingListJoinDataAccessorTest, PreexistingPLReallocateToLargerPL) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::Create(
+ flash_index_storage_.get(), serializer_.get()));
+  // Adding 3 data should cause Finalize to allocate a 56-byte posting list,
+ // which can store at most 4 data.
+ std::vector<JoinDataType> data_vec1 =
+ CreateData(/*num_data=*/3, /*start_document_id=*/0,
+ /*ref_namespace_id=*/kDefaultNamespaceId,
+ /*start_ref_hash_uri=*/819);
+ for (const JoinDataType& data : data_vec1) {
+ ICING_ASSERT_OK(pl_accessor->PrependData(data));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result1.status);
+ // Should be allocated to the first block.
+ ASSERT_THAT(result1.id.block_index(), Eq(1));
+ ASSERT_THAT(result1.id.posting_list_index(), Eq(0));
+
+ // Now add more data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+ // The current posting list can fit 1 more data. Adding 12 more data should
+  // result in these data being moved to a larger posting list. The total size
+  // of these data won't exceed the max posting list size, so there will be
+  // only a single posting list and no chain.
+ std::vector<JoinDataType> data_vec2 = CreateData(
+ /*num_data=*/12, /*start_document_id=*/data_vec1.back().document_id() + 1,
+ /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819);
+
+ for (const JoinDataType& data : data_vec2) {
+ ICING_ASSERT_OK(pl_accessor->PrependData(data));
+ }
+ PostingListAccessor::FinalizeResult result2 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result2.status);
+ // Should be allocated to the second (new) block because the posting list
+ // should grow beyond the size that the first block maintains.
+ EXPECT_THAT(result2.id.block_index(), Eq(2));
+ EXPECT_THAT(result2.id.posting_list_index(), Eq(0));
+
+ // The posting list at result2.id should hold all of the data that have been
+ // added.
+ std::vector<JoinDataType> all_data_vec;
+ all_data_vec.reserve(data_vec1.size() + data_vec2.size());
+ all_data_vec.insert(all_data_vec.end(), data_vec1.begin(), data_vec1.end());
+ all_data_vec.insert(all_data_vec.end(), data_vec2.begin(), data_vec2.end());
+ ICING_ASSERT_OK_AND_ASSIGN(PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(result2.id));
+ EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAreArray(all_data_vec.rbegin(),
+ all_data_vec.rend())));
+}
+
+TEST_F(PostingListJoinDataAccessorTest, MultiBlockChainsBlocksProperly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::Create(
+ flash_index_storage_.get(), serializer_.get()));
+ // Block size is 4096, sizeof(BlockHeader) is 12 and sizeof(JoinDataType)
+ // is 14, so the max size posting list can store (4096 - 12) / 14 = 291 data.
+ // Adding 292 data should cause:
+ // - 2 max size posting lists being allocated to block 1 and block 2.
+ // - Chaining: block 2 -> block 1
+ std::vector<JoinDataType> data_vec = CreateData(
+ /*num_data=*/292, /*start_document_id=*/0,
+ /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819);
+ for (const JoinDataType& data : data_vec) {
+ ICING_ASSERT_OK(pl_accessor->PrependData(data));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result1.status);
+ PostingListIdentifier second_block_id = result1.id;
+ // Should be allocated to the second block.
+ EXPECT_THAT(second_block_id, Eq(PostingListIdentifier(
+ /*block_index=*/2, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0)));
+
+ // We should be able to retrieve all data.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(second_block_id));
+ // This pl_holder will only hold a posting list with the data that didn't fit
+ // on the first block.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<JoinDataType> second_block_data,
+ serializer_->GetData(&pl_holder.posting_list));
+ ASSERT_THAT(second_block_data, SizeIs(Lt(data_vec.size())));
+ auto first_block_data_start = data_vec.rbegin() + second_block_data.size();
+ EXPECT_THAT(second_block_data,
+ ElementsAreArray(data_vec.rbegin(), first_block_data_start));
+
+ // Now retrieve all of the data that were on the first block.
+ uint32_t first_block_id = pl_holder.next_block_index;
+ EXPECT_THAT(first_block_id, Eq(1));
+
+ PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0);
+ ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
+ flash_index_storage_->GetPostingList(pl_id));
+ EXPECT_THAT(
+ serializer_->GetData(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAreArray(first_block_data_start, data_vec.rend())));
+}
+
+TEST_F(PostingListJoinDataAccessorTest,
+ PreexistingMultiBlockReusesBlocksProperly) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::Create(
+ flash_index_storage_.get(), serializer_.get()));
+ // Block size is 4096, sizeof(BlockHeader) is 12 and sizeof(JoinDataType)
+ // is 14, so the max size posting list can store (4096 - 12) / 14 = 291 data.
+ // Adding 292 data will cause:
+ // - 2 max size posting lists being allocated to block 1 and block 2.
+ // - Chaining: block 2 -> block 1
+ std::vector<JoinDataType> data_vec1 = CreateData(
+ /*num_data=*/292, /*start_document_id=*/0,
+ /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819);
+ for (const JoinDataType& data : data_vec1) {
+ ICING_ASSERT_OK(pl_accessor->PrependData(data));
+ }
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result1.status);
+ PostingListIdentifier first_add_id = result1.id;
+ EXPECT_THAT(first_add_id, Eq(PostingListIdentifier(
+ /*block_index=*/2, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0)));
+
+ // Now add more data. These should fit on the existing second block and not
+ // fill it up.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), first_add_id));
+ std::vector<JoinDataType> data_vec2 = CreateData(
+ /*num_data=*/10, /*start_document_id=*/data_vec1.back().document_id() + 1,
+ /*ref_namespace_id=*/kDefaultNamespaceId, /*start_ref_hash_uri=*/819);
+ for (const JoinDataType& data : data_vec2) {
+ ICING_ASSERT_OK(pl_accessor->PrependData(data));
+ }
+ PostingListAccessor::FinalizeResult result2 =
+ std::move(*pl_accessor).Finalize();
+ ICING_ASSERT_OK(result2.status);
+ PostingListIdentifier second_add_id = result2.id;
+ EXPECT_THAT(second_add_id, Eq(first_add_id));
+
+ // We should be able to retrieve all data.
+ std::vector<JoinDataType> all_data_vec;
+ all_data_vec.reserve(data_vec1.size() + data_vec2.size());
+ all_data_vec.insert(all_data_vec.end(), data_vec1.begin(), data_vec1.end());
+ all_data_vec.insert(all_data_vec.end(), data_vec2.begin(), data_vec2.end());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListHolder pl_holder,
+ flash_index_storage_->GetPostingList(second_add_id));
+ // This pl_holder will only hold a posting list with the data that didn't fit
+ // on the first block.
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<JoinDataType> second_block_data,
+ serializer_->GetData(&pl_holder.posting_list));
+ ASSERT_THAT(second_block_data, SizeIs(Lt(all_data_vec.size())));
+ auto first_block_data_start =
+ all_data_vec.rbegin() + second_block_data.size();
+ EXPECT_THAT(second_block_data,
+ ElementsAreArray(all_data_vec.rbegin(), first_block_data_start));
+
+ // Now retrieve all of the data that were on the first block.
+ uint32_t first_block_id = pl_holder.next_block_index;
+ EXPECT_THAT(first_block_id, Eq(1));
+
+ PostingListIdentifier pl_id(first_block_id, /*posting_list_index=*/0,
+ /*posting_list_index_bits=*/0);
+ ICING_ASSERT_OK_AND_ASSIGN(pl_holder,
+ flash_index_storage_->GetPostingList(pl_id));
+ EXPECT_THAT(serializer_->GetData(&pl_holder.posting_list),
+ IsOkAndHolds(ElementsAreArray(first_block_data_start,
+ all_data_vec.rend())));
+}
+
+TEST_F(PostingListJoinDataAccessorTest,
+ InvalidDataShouldReturnInvalidArgument) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::Create(
+ flash_index_storage_.get(), serializer_.get()));
+ JoinDataType invalid_data = JoinDataType::GetInvalid();
+ EXPECT_THAT(pl_accessor->PrependData(invalid_data),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PostingListJoinDataAccessorTest,
+ JoinDataNonIncreasingShouldReturnInvalidArgument) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::Create(
+ flash_index_storage_.get(), serializer_.get()));
+ JoinDataType data1(
+ /*document_id=*/1,
+ NamespaceFingerprintIdentifier(kDefaultNamespaceId, /*fingerprint=*/819));
+ ICING_ASSERT_OK(pl_accessor->PrependData(data1));
+
+ JoinDataType data2(
+ /*document_id=*/1,
+ NamespaceFingerprintIdentifier(kDefaultNamespaceId, /*fingerprint=*/818));
+ EXPECT_THAT(pl_accessor->PrependData(data2),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ JoinDataType data3(/*document_id=*/1,
+ NamespaceFingerprintIdentifier(kDefaultNamespaceId - 1,
+ /*fingerprint=*/820));
+ EXPECT_THAT(pl_accessor->PrependData(data3),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ JoinDataType data4(/*document_id=*/0,
+ NamespaceFingerprintIdentifier(kDefaultNamespaceId + 1,
+ /*fingerprint=*/820));
+ EXPECT_THAT(pl_accessor->PrependData(data4),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PostingListJoinDataAccessorTest,
+ NewPostingListNoDataAddedShouldReturnInvalidArgument) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::Create(
+ flash_index_storage_.get(), serializer_.get()));
+ PostingListAccessor::FinalizeResult result =
+ std::move(*pl_accessor).Finalize();
+ EXPECT_THAT(result.status,
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(PostingListJoinDataAccessorTest,
+ PreexistingPostingListNoDataAddedShouldSucceed) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor1,
+ PostingListJoinDataAccessor<JoinDataType>::Create(
+ flash_index_storage_.get(), serializer_.get()));
+ JoinDataType data1(
+ /*document_id=*/1,
+ NamespaceFingerprintIdentifier(kDefaultNamespaceId, /*fingerprint=*/819));
+ ICING_ASSERT_OK(pl_accessor1->PrependData(data1));
+ PostingListAccessor::FinalizeResult result1 =
+ std::move(*pl_accessor1).Finalize();
+ ICING_ASSERT_OK(result1.status);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor2,
+ PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
+ flash_index_storage_.get(), serializer_.get(), result1.id));
+ PostingListAccessor::FinalizeResult result2 =
+ std::move(*pl_accessor2).Finalize();
+ EXPECT_THAT(result2.status, IsOk());
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/posting-list-join-data-serializer.h b/icing/join/posting-list-join-data-serializer.h
new file mode 100644
index 0000000..9f39dca
--- /dev/null
+++ b/icing/join/posting-list-join-data-serializer.h
@@ -0,0 +1,803 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_
+#define ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_
+
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/posting_list/posting-list-common.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+// A serializer class to serialize JoinDataType to PostingListUsed. Usually
+// JoinDataType is DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>,
+// DocumentIdToJoinInfo<TermId>, or DocumentIdToJoinInfo<int64_t>.
+//
+// REQUIRES:
+// - JoinDataType is comparable by operator <.
+// - JoinDataType implements is_valid() method.
+// - JoinDataType has static method GetInvalid() that returns a JoinDataType
+// instance containing invalid data.
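+//
+// A minimal sketch of a type meeting these requirements (hypothetical, for
+// illustration only; real instantiations use DocumentIdToJoinInfo):
+//
+//   struct FakeJoinData {
+//     static FakeJoinData GetInvalid() { return FakeJoinData{-1}; }
+//     bool is_valid() const { return value >= 0; }
+//     bool operator<(const FakeJoinData& other) const {
+//       return value < other.value;
+//     }
+//     int32_t value;
+//   };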
+template <typename JoinDataType>
+class PostingListJoinDataSerializer : public PostingListSerializer {
+ public:
+ using SpecialDataType = SpecialData<JoinDataType>;
+ static_assert(sizeof(SpecialDataType) == sizeof(JoinDataType), "");
+
+ static constexpr uint32_t kSpecialDataSize =
+ kNumSpecialData * sizeof(SpecialDataType);
+
+ uint32_t GetDataTypeBytes() const override { return sizeof(JoinDataType); }
+
+ uint32_t GetMinPostingListSize() const override {
+ static constexpr uint32_t kMinPostingListSize = kSpecialDataSize;
+ static_assert(sizeof(PostingListIndex) <= kMinPostingListSize,
+ "PostingListIndex must be small enough to fit in a "
+ "minimum-sized Posting List.");
+
+ return kMinPostingListSize;
+ }
+
+ uint32_t GetMinPostingListSizeToFit(
+ const PostingListUsed* posting_list_used) const override;
+
+ uint32_t GetBytesUsed(
+ const PostingListUsed* posting_list_used) const override;
+
+ void Clear(PostingListUsed* posting_list_used) const override;
+
+ libtextclassifier3::Status MoveFrom(PostingListUsed* dst,
+ PostingListUsed* src) const override;
+
+ // Prepend a JoinData to the posting list.
+ //
+ // RETURNS:
+  //   - INVALID_ARGUMENT if !data.is_valid() or if data is smaller than the
+  //     previously added data.
+ // - RESOURCE_EXHAUSTED if there is no more room to add data to the posting
+ // list.
+ libtextclassifier3::Status PrependData(PostingListUsed* posting_list_used,
+ const JoinDataType& data) const;
+
+ // Prepend multiple JoinData to the posting list.
+ // Data should be sorted in ascending order (as defined by the less than
+  // operator for JoinData).
+ // If keep_prepended is true, whatever could be prepended is kept, otherwise
+ // the posting list is reverted and left in its original state.
+ //
+ // RETURNS:
+ // The number of data that have been prepended to the posting list. If
+ // keep_prepended is false and reverted, then it returns 0.
+ libtextclassifier3::StatusOr<uint32_t> PrependDataArray(
+ PostingListUsed* posting_list_used, const JoinDataType* array,
+ uint32_t num_data, bool keep_prepended) const;
+
+ // Retrieves all data stored in the posting list.
+ //
+ // RETURNS:
+ // - On success, a vector of JoinDataType sorted by the reverse order of
+ // prepending.
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::StatusOr<std::vector<JoinDataType>> GetData(
+ const PostingListUsed* posting_list_used) const;
+
+ // Same as GetData but appends data to data_arr_out.
+ //
+ // RETURNS:
+  //   - OK on success; the retrieved JoinDataType are appended to
+  //     data_arr_out in the reverse order of prepending.
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::Status GetData(
+ const PostingListUsed* posting_list_used,
+ std::vector<JoinDataType>* data_arr_out) const;
+
+ // Undo the last num_data data prepended. If num_data > number of data, then
+ // we clear all data.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::Status PopFrontData(PostingListUsed* posting_list_used,
+ uint32_t num_data) const;
+
+ // Helper function to determine if posting list is full.
+ bool IsFull(const PostingListUsed* posting_list_used) const {
+ return GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
+ GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
+ }
+
+ private:
+ // In PostingListJoinDataSerializer, there is no compression, but we still use
+ // the traditional posting list implementation.
+ //
+ // Posting list layout formats:
+ //
+ // NOT_FULL
+ // +-special-data-0--+-special-data-1--+------------+-----------------------+
+ // | | | | |
+ // |data-start-offset| Data::Invalid | 0x00000000 | (compressed) data |
+ // | | | | |
+ // +-----------------+-----------------+------------+-----------------------+
+ //
+ // ALMOST_FULL
+ // +-special-data-0--+-special-data-1--+-----+------------------------------+
+ // | | | | |
+ // | Data::Invalid | 1st data |(pad)| (compressed) data |
+ // | | | | |
+ // +-----------------+-----------------+-----+------------------------------+
+ //
+ // FULL
+ // +-special-data-0--+-special-data-1--+-----+------------------------------+
+ // | | | | |
+ // | 1st data | 2nd data |(pad)| (compressed) data |
+ // | | | | |
+ // +-----------------+-----------------+-----+------------------------------+
+ //
+ // The first two uncompressed (special) data also implicitly encode
+ // information about the size of the compressed data region.
+ //
+ // 1. If the posting list is NOT_FULL, then special_data_0 contains the byte
+ // offset of the start of the compressed data. Thus, the size of the
+ // compressed data is
+ // posting_list_used->size_in_bytes() - special_data_0.data_start_offset().
+ //
+ // 2. If posting list is ALMOST_FULL or FULL, then the compressed data region
+ // starts somewhere between
+ // [kSpecialDataSize, kSpecialDataSize + sizeof(JoinDataType) - 1] and ends
+ // at posting_list_used->size_in_bytes() - 1.
+ //
+ // EXAMPLE
+ // JoinDataType = DocumentIdToJoinInfo<int64_t>. Posting list size: 48 bytes
+ //
+ // EMPTY!
+ // +-- byte 0-11 --+---- 12-23 ----+------------ 24-47 -------------+
+ // | | | |
+ // | 48 | Data::Invalid | 0x00000000 |
+ // | | | |
+ // +---------------+---------------+--------------------------------+
+ //
+ // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 12, JoinInteger = 5)
+ // NOT FULL!
+ // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
+ // | | | | 12 |
+ // | 36 | Data::Invalid | 0x00000000 | 5 |
+ // | | | | |
+ // +---------------+---------------+---------------+---------------+
+ //
+ // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 18, JoinInteger = -2)
+ // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
+ // | | | 18 | 12 |
+ // | 24 | Data::Invalid | -2 | 5 |
+ // | | | | |
+ // +---------------+---------------+---------------+---------------+
+ //
+ // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 22, JoinInteger = 3)
+ // ALMOST_FULL!
+ // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
+ // | | 22 | 18 | 12 |
+ // | Data::Invalid | 3 | -2 | 5 |
+ // | | | | |
+ // +---------------+---------------+---------------+---------------+
+ //
+ // Add DocumentIdToJoinInfo<int64_t>(DocumentId = 27, JoinInteger = 0)
+ // FULL!
+ // +-- byte 0-11 --+---- 12-23 ----+---- 24-35 ----+---- 36-47 ----+
+ // | 27 | 22 | 18 | 12 |
+ // | 0 | 3 | -2 | 5 |
+ // | | | | |
+ // +---------------+---------------+---------------+---------------+
+
+ // Helpers to determine what state the posting list is in.
+ bool IsAlmostFull(const PostingListUsed* posting_list_used) const {
+ return !GetSpecialData(posting_list_used, /*index=*/0).data().is_valid() &&
+ GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
+ }
+
+ bool IsEmpty(const PostingListUsed* posting_list_used) const {
+ return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset() ==
+ posting_list_used->size_in_bytes() &&
+ !GetSpecialData(posting_list_used, /*index=*/1).data().is_valid();
+ }
+
+ // Returns false if both special data are invalid or if data start offset
+ // stored in the special data is less than kSpecialDataSize or greater than
+  // posting_list_used->size_in_bytes(). Returns true otherwise.
+ bool IsPostingListValid(const PostingListUsed* posting_list_used) const;
+
+ // Prepend data to a posting list that is in the ALMOST_FULL state.
+ //
+ // RETURNS:
+ // - OK, if successful
+  //   - INVALID_ARGUMENT if data is smaller than the most recently added
+  //     data.
+ libtextclassifier3::Status PrependDataToAlmostFull(
+ PostingListUsed* posting_list_used, const JoinDataType& data) const;
+
+ // Prepend data to a posting list that is in the EMPTY state. This will always
+ // succeed because there are no pre-existing data and no validly constructed
+ // posting list could fail to fit one data.
+ void PrependDataToEmpty(PostingListUsed* posting_list_used,
+ const JoinDataType& data) const;
+
+ // Prepend data to a posting list that is in the NOT_FULL state.
+ //
+ // RETURNS:
+ // - OK, if successful
+  //   - INVALID_ARGUMENT if data is smaller than the most recently added
+  //     data.
+ libtextclassifier3::Status PrependDataToNotFull(
+ PostingListUsed* posting_list_used, const JoinDataType& data,
+ uint32_t offset) const;
+
+ // Returns either 0 (FULL state), sizeof(JoinDataType) (ALMOST_FULL state) or
+ // a byte offset between kSpecialDataSize and
+ // posting_list_used->size_in_bytes() (inclusive) (NOT_FULL state).
+ uint32_t GetStartByteOffset(const PostingListUsed* posting_list_used) const;
+
+ // Sets special data 0 to properly reflect what start byte offset is (see
+ // layout comment for further details).
+ //
+  // Returns false if offset > posting_list_used->size_in_bytes(), or if
+  // offset falls in the invalid open range (sizeof(JoinDataType),
+  // kSpecialDataSize) or (0, sizeof(JoinDataType)). Returns true otherwise.
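+  // Concretely (an illustrative example): with sizeof(JoinDataType) == 14
+  // and thus kSpecialDataSize == 28, the only valid offsets are 0 (FULL), 14
+  // (ALMOST_FULL), and any value in [28, size_in_bytes()] (NOT_FULL).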
+ bool SetStartByteOffset(PostingListUsed* posting_list_used,
+ uint32_t offset) const;
+
+ // Helper for MoveFrom/GetData/PopFrontData. Adds limit number of data to out
+ // or all data in the posting list if the posting list contains less than
+ // limit number of data. out can be NULL.
+ //
+ // NOTE: If called with limit=1, pop=true on a posting list that transitioned
+ // from NOT_FULL directly to FULL, GetDataInternal will not return the posting
+ // list to NOT_FULL. Instead it will leave it in a valid state, but it will be
+ // ALMOST_FULL.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INTERNAL_ERROR if the posting list has been corrupted somehow.
+ libtextclassifier3::Status GetDataInternal(
+ const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
+ std::vector<JoinDataType>* out) const;
+
+ // Retrieves the value stored in the index-th special data.
+ //
+ // REQUIRES:
+ // 0 <= index < kNumSpecialData.
+ //
+ // RETURNS:
+ // - A valid SpecialData<JoinDataType>.
+ SpecialDataType GetSpecialData(const PostingListUsed* posting_list_used,
+ uint32_t index) const;
+
+ // Sets the value stored in the index-th special data to special_data.
+ //
+ // REQUIRES:
+ // 0 <= index < kNumSpecialData.
+ void SetSpecialData(PostingListUsed* posting_list_used, uint32_t index,
+ const SpecialDataType& special_data) const;
+
+ // Prepends data to the memory region
+ // [offset - sizeof(JoinDataType), offset - 1] and
+ // returns the new beginning of the region.
+ //
+ // RETURNS:
+ // - The new beginning of the padded region, if successful.
+ // - INVALID_ARGUMENT if data will not fit (uncompressed) between
+ // [kSpecialDataSize, offset - 1]
+ libtextclassifier3::StatusOr<uint32_t> PrependDataUncompressed(
+ PostingListUsed* posting_list_used, const JoinDataType& data,
+ uint32_t offset) const;
+};
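+
+// A brief usage sketch (illustrative only): operating on a PostingListUsed
+// directly with this serializer. `pl_used` is assumed to be a PostingListUsed
+// created over a properly sized region, and data1 < data2.
+//
+//   PostingListJoinDataSerializer<JoinDataType> serializer;
+//   ICING_RETURN_IF_ERROR(serializer.PrependData(&pl_used, data1));
+//   ICING_RETURN_IF_ERROR(serializer.PrependData(&pl_used, data2));
+//   // GetData returns data in the reverse order of prepending:
+//   // {data2, data1}.
+//   ICING_ASSIGN_OR_RETURN(std::vector<JoinDataType> all,
+//                          serializer.GetData(&pl_used));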
+
+template <typename JoinDataType>
+uint32_t PostingListJoinDataSerializer<JoinDataType>::GetBytesUsed(
+ const PostingListUsed* posting_list_used) const {
+ // The special data will be included if they represent actual data. If they
+ // represent the data start offset or the invalid data sentinel, they are not
+ // included.
+ return posting_list_used->size_in_bytes() -
+ GetStartByteOffset(posting_list_used);
+}
+
+template <typename JoinDataType>
+uint32_t
+PostingListJoinDataSerializer<JoinDataType>::GetMinPostingListSizeToFit(
+ const PostingListUsed* posting_list_used) const {
+ if (IsFull(posting_list_used) || IsAlmostFull(posting_list_used)) {
+ // If in either the FULL state or ALMOST_FULL state, this posting list *is*
+ // the minimum size posting list that can fit these data. So just return the
+ // size of the posting list.
+ return posting_list_used->size_in_bytes();
+ }
+
+  // In the NOT_FULL state, all n data live in the data region, so BytesUsed
+  // includes no special data. The minimum sized posting list guaranteed to
+  // fit these data would be ALMOST_FULL, with kInvalidData in special data 0,
+  // one data in special data 1, and the remaining n - 1 data in the data
+  // region. That layout needs room for n + 1 data slots in total, i.e.
+  // BytesUsed plus one extra data.
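+  // For example (using the sizes from the accessor tests): with
+  // sizeof(JoinDataType) == 14 and 3 data in the NOT_FULL state,
+  // GetBytesUsed() == 42 and the minimum posting list that fits them is
+  // 42 + 14 = 56 bytes.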
+ return GetBytesUsed(posting_list_used) + GetDataTypeBytes();
+}
+
+template <typename JoinDataType>
+void PostingListJoinDataSerializer<JoinDataType>::Clear(
+ PostingListUsed* posting_list_used) const {
+ // Safe to ignore return value because posting_list_used->size_in_bytes() is
+ // a valid argument.
+ SetStartByteOffset(posting_list_used,
+ /*offset=*/posting_list_used->size_in_bytes());
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataSerializer<JoinDataType>::MoveFrom(
+ PostingListUsed* dst, PostingListUsed* src) const {
+ ICING_RETURN_ERROR_IF_NULL(dst);
+ ICING_RETURN_ERROR_IF_NULL(src);
+ if (GetMinPostingListSizeToFit(src) > dst->size_in_bytes()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "src MinPostingListSizeToFit %d must be larger than size %d.",
+ GetMinPostingListSizeToFit(src), dst->size_in_bytes()));
+ }
+
+ if (!IsPostingListValid(dst)) {
+ return absl_ports::FailedPreconditionError(
+ "Dst posting list is in an invalid state and can't be used!");
+ }
+ if (!IsPostingListValid(src)) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot MoveFrom an invalid src posting list!");
+ }
+
+ // Pop just enough data that all of src's compressed data fit in
+ // dst posting_list's compressed area. Then we can memcpy that area.
+ std::vector<JoinDataType> data_arr;
+ while (IsFull(src) || IsAlmostFull(src) ||
+ (dst->size_in_bytes() - kSpecialDataSize < GetBytesUsed(src))) {
+ if (!GetDataInternal(src, /*limit=*/1, /*pop=*/true, &data_arr).ok()) {
+ return absl_ports::AbortedError(
+ "Unable to retrieve data from src posting list.");
+ }
+ }
+
+ // memcpy the area and set up start byte offset.
+ Clear(dst);
+ memcpy(dst->posting_list_buffer() + dst->size_in_bytes() - GetBytesUsed(src),
+ src->posting_list_buffer() + GetStartByteOffset(src),
+ GetBytesUsed(src));
+  // We popped all of src's data outside of the compressed area and
+  // guaranteed that GetBytesUsed(src) fits within dst->size_in_bytes() -
+  // kSpecialDataSize, so this is a valid byte offset for the NOT_FULL state
+  // and ignoring the return value is safe.
+ SetStartByteOffset(dst, dst->size_in_bytes() - GetBytesUsed(src));
+
+ // Put back remaining data.
+ for (auto riter = data_arr.rbegin(); riter != data_arr.rend(); ++riter) {
+ // PrependData may return:
+    // - INVALID_ARGUMENT: if data is invalid or smaller than the previous data
+ // - RESOURCE_EXHAUSTED
+ // RESOURCE_EXHAUSTED should be impossible because we've already assured
+ // that there is enough room above.
+ ICING_RETURN_IF_ERROR(PrependData(dst, *riter));
+ }
+
+ Clear(src);
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataSerializer<JoinDataType>::PrependDataToAlmostFull(
+ PostingListUsed* posting_list_used, const JoinDataType& data) const {
+ SpecialDataType special_data = GetSpecialData(posting_list_used, /*index=*/1);
+ if (data < special_data.data()) {
+ return absl_ports::InvalidArgumentError(
+ "JoinData being prepended must not be smaller than the most recent "
+ "JoinData");
+ }
+
+  // Without compression, prepending a new data into an ALMOST_FULL posting
+  // list will transition it to the FULL state. Therefore, set special data 0
+  // directly.
+ SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(data));
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename JoinDataType>
+void PostingListJoinDataSerializer<JoinDataType>::PrependDataToEmpty(
+ PostingListUsed* posting_list_used, const JoinDataType& data) const {
+ // First data to be added. Just add verbatim, no compression.
+ if (posting_list_used->size_in_bytes() == kSpecialDataSize) {
+ // First data will be stored at special data 1.
+ // Safe to ignore the return value because 1 < kNumSpecialData
+ SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data));
+ // Safe to ignore the return value because sizeof(JoinDataType) is a valid
+ // argument.
+ SetStartByteOffset(posting_list_used, /*offset=*/sizeof(JoinDataType));
+ } else {
+ // Since this is the first data, size != kSpecialDataSize and
+ // size % sizeof(JoinDataType) == 0, we know that there is room to fit
+ // 'data' into the compressed region, so ValueOrDie is safe.
+ uint32_t offset =
+ PrependDataUncompressed(posting_list_used, data,
+ /*offset=*/posting_list_used->size_in_bytes())
+ .ValueOrDie();
+ // Safe to ignore the return value because PrependDataUncompressed is
+ // guaranteed to return a valid offset.
+ SetStartByteOffset(posting_list_used, offset);
+ }
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataSerializer<JoinDataType>::PrependDataToNotFull(
+ PostingListUsed* posting_list_used, const JoinDataType& data,
+ uint32_t offset) const {
+ JoinDataType curr = JoinDataType::GetInvalid();
+ memcpy(&curr, posting_list_used->posting_list_buffer() + offset,
+ sizeof(JoinDataType));
+ if (data < curr) {
+ return absl_ports::InvalidArgumentError(
+ "JoinData being prepended must not be smaller than the most recent "
+ "JoinData");
+ }
+
+ if (offset >= kSpecialDataSize + sizeof(JoinDataType)) {
+ offset =
+ PrependDataUncompressed(posting_list_used, data, offset).ValueOrDie();
+ SetStartByteOffset(posting_list_used, offset);
+ } else {
+ // The new data must be put in special data 1.
+ SetSpecialData(posting_list_used, /*index=*/1, SpecialDataType(data));
+ // State ALMOST_FULL. Safe to ignore the return value because
+ // sizeof(JoinDataType) is a valid argument.
+ SetStartByteOffset(posting_list_used, /*offset=*/sizeof(JoinDataType));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataSerializer<JoinDataType>::PrependData(
+ PostingListUsed* posting_list_used, const JoinDataType& data) const {
+ if (!data.is_valid()) {
+ return absl_ports::InvalidArgumentError("Cannot prepend an invalid data!");
+ }
+ if (!IsPostingListValid(posting_list_used)) {
+ return absl_ports::FailedPreconditionError(
+ "This PostingListUsed is in an invalid state and can't add any data!");
+ }
+
+ if (IsFull(posting_list_used)) {
+ // State FULL: no space left.
+ return absl_ports::ResourceExhaustedError("No more room for data");
+ } else if (IsAlmostFull(posting_list_used)) {
+ return PrependDataToAlmostFull(posting_list_used, data);
+ } else if (IsEmpty(posting_list_used)) {
+ PrependDataToEmpty(posting_list_used, data);
+ return libtextclassifier3::Status::OK;
+ } else {
+ uint32_t offset = GetStartByteOffset(posting_list_used);
+ return PrependDataToNotFull(posting_list_used, data, offset);
+ }
+}
+
+template <typename JoinDataType>
+libtextclassifier3::StatusOr<uint32_t>
+PostingListJoinDataSerializer<JoinDataType>::PrependDataArray(
+ PostingListUsed* posting_list_used, const JoinDataType* array,
+ uint32_t num_data, bool keep_prepended) const {
+ if (!IsPostingListValid(posting_list_used)) {
+ return 0;
+ }
+
+ uint32_t i;
+ for (i = 0; i < num_data; ++i) {
+ if (!PrependData(posting_list_used, array[i]).ok()) {
+ break;
+ }
+ }
+ if (i != num_data && !keep_prepended) {
+ // Didn't fit. Undo everything and check that we have the same offset as
+ // before. PopFrontData guarantees that it will remove all 'i' data so long
+ // as there are at least 'i' data in the posting list, which we know there
+ // are.
+ ICING_RETURN_IF_ERROR(PopFrontData(posting_list_used, /*num_data=*/i));
+ return 0;
+ }
+ return i;
+}
+
+template <typename JoinDataType>
+libtextclassifier3::StatusOr<std::vector<JoinDataType>>
+PostingListJoinDataSerializer<JoinDataType>::GetData(
+ const PostingListUsed* posting_list_used) const {
+ std::vector<JoinDataType> data_arr_out;
+ ICING_RETURN_IF_ERROR(GetData(posting_list_used, &data_arr_out));
+ return data_arr_out;
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status PostingListJoinDataSerializer<JoinDataType>::GetData(
+ const PostingListUsed* posting_list_used,
+ std::vector<JoinDataType>* data_arr_out) const {
+ return GetDataInternal(posting_list_used,
+ /*limit=*/std::numeric_limits<uint32_t>::max(),
+ /*pop=*/false, data_arr_out);
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataSerializer<JoinDataType>::PopFrontData(
+ PostingListUsed* posting_list_used, uint32_t num_data) const {
+ if (num_data == 1 && IsFull(posting_list_used)) {
+    // The PL is in the FULL state, which means that we store 2 uncompressed
+    // data in the 2 special positions. But the FULL state may be reached from
+    // 2 different states.
+ // (1) In ALMOST_FULL state
+ // +------------------+-----------------+-----+---------------------------+
+ // |Data::Invalid |1st data |(pad)|(compressed) data |
+ // | | | | |
+ // +------------------+-----------------+-----+---------------------------+
+ // When we prepend another data, we can only put it at special data 0, and
+ // thus get a FULL PL
+ // +------------------+-----------------+-----+---------------------------+
+ // |new 1st data |original 1st data|(pad)|(compressed) data |
+ // | | | | |
+ // +------------------+-----------------+-----+---------------------------+
+ //
+ // (2) In NOT_FULL state
+ // +------------------+-----------------+-------+---------+---------------+
+ // |data-start-offset |Data::Invalid |(pad) |1st data |(compressed) |
+ // | | | | |data |
+ // +------------------+-----------------+-------+---------+---------------+
+ // When we prepend another data, we can reach any of the 3 following
+ // scenarios:
+ // (2.1) NOT_FULL
+    // This happens if the space of the pad and the original 1st data can
+    // accommodate the new 1st data and the encoded delta value.
+ // +------------------+-----------------+-----+--------+------------------+
+ // |data-start-offset |Data::Invalid |(pad)|new |(compressed) data |
+ // | | | |1st data| |
+ // +------------------+-----------------+-----+--------+------------------+
+ // (2.2) ALMOST_FULL
+    // If the space of the pad and the original 1st data cannot accommodate
+    // the new 1st data and the encoded delta value, but can accommodate the
+    // encoded delta value alone, we can put the new 1st data at special
+    // position 1.
+ // +------------------+-----------------+---------+-----------------------+
+ // |Data::Invalid |new 1st data |(pad) |(compressed) data |
+ // | | | | |
+ // +------------------+-----------------+---------+-----------------------+
+ // (2.3) FULL
+    // In a very rare case, it cannot accommodate even the encoded delta value
+    // alone. We can move the original 1st data into special position 1 and
+    // the new 1st data into special position 0. This may happen because we
+    // use the VarInt encoding method, which may make the encoded value longer
+    // (about 4/3 times the original).
+ // +------------------+-----------------+--------------+------------------+
+ // |new 1st data |original 1st data|(pad) |(compressed) data |
+ // | | | | |
+ // +------------------+-----------------+--------------+------------------+
+ //
+    // Suppose the PL is now in the FULL state, but we don't know whether it
+    // arrived at this state from NOT_FULL (like (2.3)) or from ALMOST_FULL
+    // (like (1)). If we simply pop the new 1st data, we'll always return to
+    // the ALMOST_FULL state like (1), but we want to make the prepending
+    // operation "reversible", so there should be some way to return to
+    // NOT_FULL when that was the original state. A simple way to do this is:
+ // - Pop 2 data out of the PL to state ALMOST_FULL or NOT_FULL.
+ // - Add the second data ("original 1st data") back.
+ //
+ // Then we can return to the correct original states of (2.1) or (1). This
+ // makes our prepending operation reversible.
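+    //
+    // As a concrete (illustrative) example, suppose data A < B < C were
+    // prepended in that order, and prepending C moved the PL into the FULL
+    // state. Popping C alone cannot tell us whether the pre-C state was (1)
+    // or (2.3). Instead, we pop both C and B to reach an unambiguous
+    // NOT_FULL/ALMOST_FULL state, and then prepend B again; since prepending
+    // is deterministic, this restores the exact state the PL was in before C
+    // was prepended.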
+ std::vector<JoinDataType> out;
+
+ // Popping 2 data should never fail because we've just ensured that the
+ // posting list is in the FULL state.
+ ICING_RETURN_IF_ERROR(
+ GetDataInternal(posting_list_used, /*limit=*/2, /*pop=*/true, &out));
+
+ // PrependData should never fail because:
+ // - out[1] is a valid data less than all previous data in the posting list.
+ // - There's no way that the posting list could run out of room because it
+ // previously stored these 2 data.
+ ICING_RETURN_IF_ERROR(PrependData(posting_list_used, out[1]));
+ } else if (num_data > 0) {
+ return GetDataInternal(posting_list_used, /*limit=*/num_data, /*pop=*/true,
+ /*out=*/nullptr);
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename JoinDataType>
+libtextclassifier3::Status
+PostingListJoinDataSerializer<JoinDataType>::GetDataInternal(
+ const PostingListUsed* posting_list_used, uint32_t limit, bool pop,
+ std::vector<JoinDataType>* out) const {
+ uint32_t offset = GetStartByteOffset(posting_list_used);
+ uint32_t count = 0;
+
+  // First, traverse the two special positions.
+ while (count < limit && offset < kSpecialDataSize) {
+ // offset / sizeof(JoinDataType) < kNumSpecialData
+ // because of the check above.
+ SpecialDataType special_data = GetSpecialData(
+ posting_list_used, /*index=*/offset / sizeof(JoinDataType));
+ if (out != nullptr) {
+ out->push_back(special_data.data());
+ }
+ offset += sizeof(JoinDataType);
+ ++count;
+ }
+
+  // - We don't compress the data.
+  // - The posting list size is a multiple of the data type's size in bytes.
+  // So the offset of the first non-special data is guaranteed to be at
+  // kSpecialDataSize when in the ALMOST_FULL or FULL state. In fact, we must
+  // not apply any padding-skipping logic here, because the data are stored
+  // uncompressed and 0 bytes are meaningful (e.g. a document id byte may
+  // legitimately be 0).
+ while (count < limit && offset < posting_list_used->size_in_bytes()) {
+ JoinDataType data = JoinDataType::GetInvalid();
+ memcpy(&data, posting_list_used->posting_list_buffer() + offset,
+ sizeof(JoinDataType));
+ offset += sizeof(JoinDataType);
+ if (out != nullptr) {
+ out->push_back(data);
+ }
+ ++count;
+ }
+
+ if (pop) {
+ PostingListUsed* mutable_posting_list_used =
+ const_cast<PostingListUsed*>(posting_list_used);
+ // Modify the posting list so that we pop all data actually traversed.
+ if (offset >= kSpecialDataSize &&
+ offset < posting_list_used->size_in_bytes()) {
+ memset(
+ mutable_posting_list_used->posting_list_buffer() + kSpecialDataSize,
+ 0, offset - kSpecialDataSize);
+ }
+ SetStartByteOffset(mutable_posting_list_used, offset);
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename JoinDataType>
+typename PostingListJoinDataSerializer<JoinDataType>::SpecialDataType
+PostingListJoinDataSerializer<JoinDataType>::GetSpecialData(
+ const PostingListUsed* posting_list_used, uint32_t index) const {
+ // It is ok to temporarily construct a SpecialData with offset = 0 since we're
+ // going to overwrite it by memcpy.
+ SpecialDataType special_data(0);
+ memcpy(&special_data,
+ posting_list_used->posting_list_buffer() +
+ index * sizeof(SpecialDataType),
+ sizeof(SpecialDataType));
+ return special_data;
+}
+
+template <typename JoinDataType>
+void PostingListJoinDataSerializer<JoinDataType>::SetSpecialData(
+ PostingListUsed* posting_list_used, uint32_t index,
+ const SpecialDataType& special_data) const {
+ memcpy(posting_list_used->posting_list_buffer() +
+ index * sizeof(SpecialDataType),
+ &special_data, sizeof(SpecialDataType));
+}
+
+template <typename JoinDataType>
+bool PostingListJoinDataSerializer<JoinDataType>::IsPostingListValid(
+ const PostingListUsed* posting_list_used) const {
+ if (IsAlmostFull(posting_list_used)) {
+ // Special data 1 should hold a valid data.
+ if (!GetSpecialData(posting_list_used, /*index=*/1).data().is_valid()) {
+ ICING_LOG(ERROR)
+ << "Both special data cannot be invalid at the same time.";
+ return false;
+ }
+ } else if (!IsFull(posting_list_used)) {
+ // NOT_FULL. Special data 0 should hold a valid offset.
+ SpecialDataType special_data =
+ GetSpecialData(posting_list_used, /*index=*/0);
+ if (special_data.data_start_offset() > posting_list_used->size_in_bytes() ||
+ special_data.data_start_offset() < kSpecialDataSize) {
+ ICING_LOG(ERROR) << "Offset: " << special_data.data_start_offset()
+ << " size: " << posting_list_used->size_in_bytes()
+ << " sp size: " << kSpecialDataSize;
+ return false;
+ }
+ }
+ return true;
+}
+
+template <typename JoinDataType>
+uint32_t PostingListJoinDataSerializer<JoinDataType>::GetStartByteOffset(
+ const PostingListUsed* posting_list_used) const {
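+  // Note: the start byte offset doubles as the state encoding. In the FULL
+  // state, both special positions hold data, so valid data starts at offset
+  // 0. In the ALMOST_FULL state, only special position 1 holds data, so valid
+  // data starts at sizeof(JoinDataType). Otherwise (NOT_FULL), special
+  // position 0 explicitly stores the data start offset.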
+ if (IsFull(posting_list_used)) {
+ return 0;
+ } else if (IsAlmostFull(posting_list_used)) {
+ return sizeof(JoinDataType);
+ } else {
+ return GetSpecialData(posting_list_used, /*index=*/0).data_start_offset();
+ }
+}
+
+template <typename JoinDataType>
+bool PostingListJoinDataSerializer<JoinDataType>::SetStartByteOffset(
+ PostingListUsed* posting_list_used, uint32_t offset) const {
+ if (offset > posting_list_used->size_in_bytes()) {
+ ICING_LOG(ERROR) << "offset cannot be a value greater than size "
+ << posting_list_used->size_in_bytes() << ". offset is "
+ << offset << ".";
+ return false;
+ }
+ if (offset < kSpecialDataSize && offset > sizeof(JoinDataType)) {
+ ICING_LOG(ERROR) << "offset cannot be a value between ("
+ << sizeof(JoinDataType) << ", " << kSpecialDataSize
+ << "). offset is " << offset << ".";
+ return false;
+ }
+ if (offset < sizeof(JoinDataType) && offset != 0) {
+ ICING_LOG(ERROR) << "offset cannot be a value between (0, "
+ << sizeof(JoinDataType) << "). offset is " << offset
+ << ".";
+ return false;
+ }
+
+ if (offset >= kSpecialDataSize) {
+ // NOT_FULL state.
+ SetSpecialData(posting_list_used, /*index=*/0, SpecialDataType(offset));
+ SetSpecialData(posting_list_used, /*index=*/1,
+ SpecialDataType(JoinDataType::GetInvalid()));
+ } else if (offset == sizeof(JoinDataType)) {
+ // ALMOST_FULL state.
+ SetSpecialData(posting_list_used, /*index=*/0,
+ SpecialDataType(JoinDataType::GetInvalid()));
+ }
+  // Nothing to do for the FULL state - the offset isn't actually stored
+  // anywhere and both special data hold valid data.
+ return true;
+}
+
+template <typename JoinDataType>
+libtextclassifier3::StatusOr<uint32_t>
+PostingListJoinDataSerializer<JoinDataType>::PrependDataUncompressed(
+ PostingListUsed* posting_list_used, const JoinDataType& data,
+ uint32_t offset) const {
+ if (offset < kSpecialDataSize + sizeof(JoinDataType)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+        "Not enough room to prepend JoinData at offset %u.", offset));
+ }
+ offset -= sizeof(JoinDataType);
+ memcpy(posting_list_used->posting_list_buffer() + offset, &data,
+ sizeof(JoinDataType));
+ return offset;
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JOIN_POSTING_LIST_JOIN_DATA_SERIALIZER_H_
diff --git a/icing/join/posting-list-join-data-serializer_test.cc b/icing/join/posting-list-join-data-serializer_test.cc
new file mode 100644
index 0000000..20137b6
--- /dev/null
+++ b/icing/join/posting-list-join-data-serializer_test.cc
@@ -0,0 +1,653 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/posting-list-join-data-serializer.h"
+
+#include <algorithm>
+#include <iterator>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/posting_list/posting-list-used.h"
+#include "icing/join/document-id-to-join-info.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
+#include "icing/testing/common-matchers.h"
+
+using testing::ElementsAre;
+using testing::ElementsAreArray;
+using testing::Eq;
+using testing::IsEmpty;
+using testing::SizeIs;
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+TEST(PostingListJoinDataSerializerTest, GetMinPostingListSizeToFitNotNull) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size =
+ 2551 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ ASSERT_THAT(
+ serializer.PrependData(
+ &pl_used,
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/2))),
+ IsOk());
+ EXPECT_THAT(
+ serializer.GetMinPostingListSizeToFit(&pl_used),
+ Eq(2 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>)));
+
+ ASSERT_THAT(
+ serializer.PrependData(
+ &pl_used,
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/5))),
+ IsOk());
+ EXPECT_THAT(
+ serializer.GetMinPostingListSizeToFit(&pl_used),
+ Eq(3 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>)));
+}
+
+TEST(PostingListJoinDataSerializerTest, GetMinPostingListSizeToFitAlmostFull) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size = 3 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ ASSERT_THAT(
+ serializer.PrependData(
+ &pl_used,
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/2))),
+ IsOk());
+ ASSERT_THAT(
+ serializer.PrependData(
+ &pl_used,
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/5))),
+ IsOk());
+ EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used), Eq(size));
+}
+
+TEST(PostingListJoinDataSerializerTest, GetMinPostingListSizeToFitFull) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size = 3 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ ASSERT_THAT(
+ serializer.PrependData(
+ &pl_used,
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/2))),
+ IsOk());
+ ASSERT_THAT(
+ serializer.PrependData(
+ &pl_used,
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/5))),
+ IsOk());
+ ASSERT_THAT(
+ serializer.PrependData(
+ &pl_used,
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/2, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/10))),
+ IsOk());
+ EXPECT_THAT(serializer.GetMinPostingListSizeToFit(&pl_used), Eq(size));
+}
+
+TEST(PostingListJoinDataSerializerTest, PrependDataNotFull) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size =
+ 2551 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+  // Prepend some data so that the posting list is used.
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data0(
+ /*document_id=*/0,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2));
+ EXPECT_THAT(serializer.PrependData(&pl_used, data0), IsOk());
+ // Size = sizeof(uncompressed data0)
+ int expected_size =
+ sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used), IsOkAndHolds(ElementsAre(data0)));
+
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data1(
+ /*document_id=*/1,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/5));
+ EXPECT_THAT(serializer.PrependData(&pl_used, data1), IsOk());
+ // Size = sizeof(uncompressed data1)
+ // + sizeof(uncompressed data0)
+ expected_size += sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAre(data1, data0)));
+
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data2(
+ /*document_id=*/2, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/10));
+ EXPECT_THAT(serializer.PrependData(&pl_used, data2), IsOk());
+ // Size = sizeof(uncompressed data2)
+ // + sizeof(uncompressed data1)
+ // + sizeof(uncompressed data0)
+ expected_size += sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAre(data2, data1, data0)));
+}
+
+TEST(PostingListJoinDataSerializerTest, PrependDataAlmostFull) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size = 4 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ // Fill up the compressed region.
+ // Transitions:
+ // Adding data0: EMPTY -> NOT_FULL
+ // Adding data1: NOT_FULL -> NOT_FULL
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data0(
+ /*document_id=*/0,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2));
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data1(
+ /*document_id=*/1,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/5));
+ EXPECT_THAT(serializer.PrependData(&pl_used, data0), IsOk());
+ EXPECT_THAT(serializer.PrependData(&pl_used, data1), IsOk());
+ int expected_size =
+ 2 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAre(data1, data0)));
+
+ // Add one more data to transition NOT_FULL -> ALMOST_FULL
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data2(
+ /*document_id=*/2, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/10));
+ EXPECT_THAT(serializer.PrependData(&pl_used, data2), IsOk());
+ expected_size =
+ 3 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAre(data2, data1, data0)));
+
+ // Add one more data to transition ALMOST_FULL -> FULL
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data3(
+ /*document_id=*/3, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/0));
+ EXPECT_THAT(serializer.PrependData(&pl_used, data3), IsOk());
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(size));
+ EXPECT_THAT(serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAre(data3, data2, data1, data0)));
+
+ // The posting list is FULL. Adding another data should fail.
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data4(
+ /*document_id=*/4, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/0, /*fingerprint=*/1234));
+ EXPECT_THAT(serializer.PrependData(&pl_used, data4),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST(PostingListJoinDataSerializerTest, PrependSmallerDataShouldFail) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size = 4 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data(
+ /*document_id=*/100,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2));
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> smaller_data(
+ /*document_id=*/99,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2));
+
+ // NOT_FULL -> NOT_FULL
+ ASSERT_THAT(serializer.PrependData(&pl_used, data), IsOk());
+ EXPECT_THAT(serializer.PrependData(&pl_used, smaller_data),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // NOT_FULL -> ALMOST_FULL
+ ASSERT_THAT(serializer.PrependData(&pl_used, data), IsOk());
+ EXPECT_THAT(serializer.PrependData(&pl_used, smaller_data),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // ALMOST_FULL -> FULL
+ ASSERT_THAT(serializer.PrependData(&pl_used, data), IsOk());
+ EXPECT_THAT(serializer.PrependData(&pl_used, smaller_data),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(PostingListJoinDataSerializerTest, PrependDataPostingListUsedMinSize) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size = serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ // PL State: EMPTY
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(0));
+ EXPECT_THAT(serializer.GetData(&pl_used), IsOkAndHolds(IsEmpty()));
+
+ // Add a data. PL should shift to ALMOST_FULL state
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data0(
+ /*document_id=*/0,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2));
+ EXPECT_THAT(serializer.PrependData(&pl_used, data0), IsOk());
+ // Size = sizeof(uncompressed data0)
+ int expected_size =
+ sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used), IsOkAndHolds(ElementsAre(data0)));
+
+ // Add another data. PL should shift to FULL state.
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data1(
+ /*document_id=*/1,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/5));
+ EXPECT_THAT(serializer.PrependData(&pl_used, data1), IsOk());
+ // Size = sizeof(uncompressed data1) + sizeof(uncompressed data0)
+ expected_size += sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used), Eq(expected_size));
+ EXPECT_THAT(serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAre(data1, data0)));
+
+ // The posting list is FULL. Adding another data should fail.
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier> data2(
+ /*document_id=*/2, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/10));
+ EXPECT_THAT(serializer.PrependData(&pl_used, data2),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST(PostingListJoinDataSerializerTest, PrependDataArrayDoNotKeepPrepended) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size = 6 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_in;
+ std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_pushed;
+
+ // Add 3 data. The PL is in the empty state and should be able to fit all 3
+ // data without issue, transitioning the PL from EMPTY -> NOT_FULL.
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2)));
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/5)));
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/2,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/10)));
+ EXPECT_THAT(
+ serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_in.size()));
+ std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used),
+ Eq(data_pushed.size() *
+ sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>)));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend())));
+
+ // Add 2 data. The PL should transition from NOT_FULL to ALMOST_FULL.
+ data_in.clear();
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/3,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/0)));
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/4, NamespaceFingerprintIdentifier(/*namespace_id=*/0,
+ /*fingerprint=*/1234)));
+ EXPECT_THAT(
+ serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_in.size()));
+ std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used),
+ Eq(data_pushed.size() *
+ sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>)));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend())));
+
+ // Add 2 data. The PL should remain ALMOST_FULL since the remaining space can
+ // only fit 1 data.
+ data_in.clear();
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, NamespaceFingerprintIdentifier(/*namespace_id=*/2,
+ /*fingerprint=*/99)));
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/6, NamespaceFingerprintIdentifier(/*namespace_id=*/1,
+ /*fingerprint=*/63)));
+ EXPECT_THAT(
+ serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(0));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used),
+ Eq(data_pushed.size() *
+ sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>)));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend())));
+
+ // Add 1 data. The PL should transition from ALMOST_FULL to FULL.
+ data_in.pop_back();
+ ASSERT_THAT(data_in, SizeIs(1));
+ EXPECT_THAT(
+ serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_in.size()));
+ std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used),
+ Eq(data_pushed.size() *
+ sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>)));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend())));
+}
+
+TEST(PostingListJoinDataSerializerTest, PrependDataArrayKeepPrepended) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size = 6 * sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_in;
+ std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_pushed;
+
+ // Add 3 data. The PL is in the empty state and should be able to fit all 3
+ // data without issue, transitioning the PL from EMPTY -> NOT_FULL.
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/2)));
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/5)));
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/2,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/10)));
+ EXPECT_THAT(
+ serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(),
+ /*keep_prepended=*/true),
+ IsOkAndHolds(data_in.size()));
+ std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used),
+ Eq(data_pushed.size() *
+ sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>)));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend())));
+
+ // Add 4 data. The PL should prepend 3 data and transition from NOT_FULL to
+ // FULL.
+ data_in.clear();
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/3,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/1, /*fingerprint=*/0)));
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/4, NamespaceFingerprintIdentifier(/*namespace_id=*/0,
+ /*fingerprint=*/1234)));
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, NamespaceFingerprintIdentifier(/*namespace_id=*/2,
+ /*fingerprint=*/99)));
+ data_in.push_back(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/6, NamespaceFingerprintIdentifier(/*namespace_id=*/1,
+ /*fingerprint=*/63)));
+ EXPECT_THAT(
+ serializer.PrependDataArray(&pl_used, data_in.data(), data_in.size(),
+ /*keep_prepended=*/true),
+ IsOkAndHolds(3));
+ data_in.pop_back();
+ ASSERT_THAT(data_in, SizeIs(3));
+ std::move(data_in.begin(), data_in.end(), std::back_inserter(data_pushed));
+ EXPECT_THAT(serializer.GetBytesUsed(&pl_used),
+ Eq(data_pushed.size() *
+ sizeof(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>)));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_pushed.rbegin(), data_pushed.rend())));
+}
+
+TEST(PostingListJoinDataSerializerTest, MoveFrom) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_arr1 =
+ {DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/2)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/5))};
+ ASSERT_THAT(
+ serializer.PrependDataArray(&pl_used1, data_arr1.data(), data_arr1.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_arr1.size()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_arr2 =
+ {DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/2, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/10)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/3, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/0)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/4,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/0,
+ /*fingerprint=*/1234)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/2,
+ /*fingerprint=*/99))};
+ ASSERT_THAT(
+ serializer.PrependDataArray(&pl_used2, data_arr2.data(), data_arr2.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_arr2.size()));
+
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1),
+ IsOk());
+ EXPECT_THAT(
+ serializer.GetData(&pl_used2),
+ IsOkAndHolds(ElementsAreArray(data_arr1.rbegin(), data_arr1.rend())));
+ EXPECT_THAT(serializer.GetData(&pl_used1), IsOkAndHolds(IsEmpty()));
+}
+
+TEST(PostingListJoinDataSerializerTest, MoveToNullReturnsFailedPrecondition) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+ std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_arr = {
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/2)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/5))};
+ ASSERT_THAT(
+ serializer.PrependDataArray(&pl_used, data_arr.data(), data_arr.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_arr.size()));
+
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used, /*src=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend())));
+
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/nullptr, /*src=*/&pl_used),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend())));
+}
+
+TEST(PostingListJoinDataSerializerTest, MoveToPostingListTooSmall) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size1 = 3 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used1,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size1));
+ std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_arr1 =
+ {DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/2)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/5)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/2, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/10)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/3, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/0)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/4,
+ NamespaceFingerprintIdentifier(/*namespace_id=*/0,
+ /*fingerprint=*/1234))};
+ ASSERT_THAT(
+ serializer.PrependDataArray(&pl_used1, data_arr1.data(), data_arr1.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_arr1.size()));
+
+ int size2 = serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used2,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size2));
+ std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_arr2 =
+ {DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/2, /*fingerprint=*/99))};
+ ASSERT_THAT(
+ serializer.PrependDataArray(&pl_used2, data_arr2.data(), data_arr2.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_arr2.size()));
+
+ EXPECT_THAT(serializer.MoveFrom(/*dst=*/&pl_used2, /*src=*/&pl_used1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used1),
+ IsOkAndHolds(ElementsAreArray(data_arr1.rbegin(), data_arr1.rend())));
+ EXPECT_THAT(
+ serializer.GetData(&pl_used2),
+ IsOkAndHolds(ElementsAreArray(data_arr2.rbegin(), data_arr2.rend())));
+}
+
+TEST(PostingListJoinDataSerializerTest, PopFrontData) {
+ PostingListJoinDataSerializer<
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>>
+ serializer;
+
+ int size = 2 * serializer.GetMinPostingListSize();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PostingListUsed pl_used,
+ PostingListUsed::CreateFromUnitializedRegion(&serializer, size));
+
+ std::vector<DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>> data_arr = {
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/2)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/5)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/2, NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/10))};
+ ASSERT_THAT(
+ serializer.PrependDataArray(&pl_used, data_arr.data(), data_arr.size(),
+ /*keep_prepended=*/false),
+ IsOkAndHolds(data_arr.size()));
+ ASSERT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend())));
+
+  // Now, pop the most recently prepended data. The posting list should
+  // contain the first two data.
+ EXPECT_THAT(serializer.PopFrontData(&pl_used, /*num_data=*/1), IsOk());
+ data_arr.pop_back();
+ EXPECT_THAT(
+ serializer.GetData(&pl_used),
+ IsOkAndHolds(ElementsAreArray(data_arr.rbegin(), data_arr.rend())));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/qualified-id-join-index-impl-v1.cc b/icing/join/qualified-id-join-index-impl-v1.cc
new file mode 100644
index 0000000..cdcb5a9
--- /dev/null
+++ b/icing/join/qualified-id-join-index-impl-v1.cc
@@ -0,0 +1,476 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/qualified-id-join-index-impl-v1.h"
+
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/destructible-directory.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/memory-mapped-file.h"
+#include "icing/join/doc-join-info.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/store/document-id.h"
+#include "icing/store/dynamic-trie-key-mapper.h"
+#include "icing/store/key-mapper.h"
+#include "icing/store/namespace-id.h"
+#include "icing/store/persistent-hash-map-key-mapper.h"
+#include "icing/util/crc32.h"
+#include "icing/util/encode-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Set the max # of qualified id entries to 1M and the average key-value pair
+// size to 10 bytes (i.e. ~10 MiB of raw key-value data). With these values,
+// the persistent hash map will take at most 23 MiB of disk space and mmap
+// region.
+static constexpr int32_t kDocJoinInfoMapperMaxNumEntries = 1 << 20;
+static constexpr int32_t kDocJoinInfoMapperAverageKVByteSize = 10;
+
+static constexpr int32_t kDocJoinInfoMapperDynamicTrieMaxSize =
+ 128 * 1024 * 1024; // 128 MiB
+
+DocumentId GetNewDocumentId(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ DocumentId old_document_id) {
+ if (old_document_id >= document_id_old_to_new.size()) {
+ return kInvalidDocumentId;
+ }
+ return document_id_old_to_new[old_document_id];
+}
+
+std::string GetMetadataFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/metadata");
+}
+
+std::string GetDocJoinInfoMapperPath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/doc_join_info_mapper");
+}
+
+std::string GetQualifiedIdStoragePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/qualified_id_storage");
+}
+
+} // namespace
+
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexImplV1>>
+QualifiedIdJoinIndexImplV1::Create(const Filesystem& filesystem,
+ std::string working_path,
+ bool pre_mapping_fbv,
+ bool use_persistent_hash_map) {
+ if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) ||
+ !filesystem.DirectoryExists(
+ GetDocJoinInfoMapperPath(working_path).c_str()) ||
+ !filesystem.FileExists(GetQualifiedIdStoragePath(working_path).c_str())) {
+ // Discard working_path if any file/directory is missing, and reinitialize.
+ if (filesystem.DirectoryExists(working_path.c_str())) {
+ ICING_RETURN_IF_ERROR(
+ QualifiedIdJoinIndex::Discard(filesystem, working_path));
+ }
+ return InitializeNewFiles(filesystem, std::move(working_path),
+ pre_mapping_fbv, use_persistent_hash_map);
+ }
+ return InitializeExistingFiles(filesystem, std::move(working_path),
+ pre_mapping_fbv, use_persistent_hash_map);
+}
+
+QualifiedIdJoinIndexImplV1::~QualifiedIdJoinIndexImplV1() {
+ if (!PersistToDisk().ok()) {
+ ICING_LOG(WARNING) << "Failed to persist qualified id type joinable index "
+ "to disk while destructing "
+ << working_path_;
+ }
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexImplV1::Put(
+ const DocJoinInfo& doc_join_info, std::string_view ref_qualified_id_str) {
+ SetDirty();
+
+ if (!doc_join_info.is_valid()) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot put data for an invalid DocJoinInfo");
+ }
+
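+  // qualified_id_storage_ is a flat char vector holding all referenced
+  // qualified id strings back to back, each terminated by '\0' (which is why
+  // ref_qualified_id_str must not contain '\0'). Append the new string at the
+  // end and record its start index in doc_join_info_mapper_.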
+ int32_t qualified_id_index = qualified_id_storage_->num_elements();
+ ICING_ASSIGN_OR_RETURN(
+ FileBackedVector<char>::MutableArrayView mutable_arr,
+ qualified_id_storage_->Allocate(ref_qualified_id_str.size() + 1));
+ mutable_arr.SetArray(/*idx=*/0, ref_qualified_id_str.data(),
+ ref_qualified_id_str.size());
+ mutable_arr.SetArray(/*idx=*/ref_qualified_id_str.size(), /*arr=*/"\0",
+ /*arr_len=*/1);
+
+ ICING_RETURN_IF_ERROR(doc_join_info_mapper_->Put(
+ encode_util::EncodeIntToCString(doc_join_info.value()),
+ qualified_id_index));
+
+ // TODO(b/268521214): add data into delete propagation storage
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<std::string_view> QualifiedIdJoinIndexImplV1::Get(
+ const DocJoinInfo& doc_join_info) const {
+ if (!doc_join_info.is_valid()) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot get data for an invalid DocJoinInfo");
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ int32_t qualified_id_index,
+ doc_join_info_mapper_->Get(
+ encode_util::EncodeIntToCString(doc_join_info.value())));
+
+ const char* data = qualified_id_storage_->array() + qualified_id_index;
+ return std::string_view(data, strlen(data));
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexImplV1::Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const std::vector<NamespaceId>& namespace_id_old_to_new,
+ DocumentId new_last_added_document_id) {
+ std::string temp_working_path = working_path_ + "_temp";
+ ICING_RETURN_IF_ERROR(
+ QualifiedIdJoinIndex::Discard(filesystem_, temp_working_path));
+
+ DestructibleDirectory temp_working_path_ddir(&filesystem_,
+ std::move(temp_working_path));
+ if (!temp_working_path_ddir.is_valid()) {
+ return absl_ports::InternalError(
+ "Unable to create temp directory to build new qualified id type "
+ "joinable index");
+ }
+
+ {
+    // Transfer all data from the current index to the new qualified id type
+    // joinable index. Also PersistToDisk and destruct the new instance after
+    // finishing, so that we can safely swap directories later.
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> new_index,
+ Create(filesystem_, temp_working_path_ddir.dir(), pre_mapping_fbv_,
+ use_persistent_hash_map_));
+ ICING_RETURN_IF_ERROR(
+ TransferIndex(document_id_old_to_new, new_index.get()));
+ new_index->set_last_added_document_id(new_last_added_document_id);
+ ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
+ }
+
+ // Destruct current index's storage instances to safely swap directories.
+ // TODO(b/268521214): handle delete propagation storage
+ doc_join_info_mapper_.reset();
+ qualified_id_storage_.reset();
+
+ if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(),
+ working_path_.c_str())) {
+ return absl_ports::InternalError(
+ "Unable to apply new qualified id type joinable index due to failed "
+ "swap");
+ }
+
+ // Reinitialize qualified id type joinable index.
+ if (!filesystem_.PRead(GetMetadataFilePath(working_path_).c_str(),
+ metadata_buffer_.get(), kMetadataFileSize,
+ /*offset=*/0)) {
+    return absl_ports::InternalError("Failed to read metadata file");
+ }
+ if (use_persistent_hash_map_) {
+ ICING_ASSIGN_OR_RETURN(
+ doc_join_info_mapper_,
+ PersistentHashMapKeyMapper<int32_t>::Create(
+ filesystem_, GetDocJoinInfoMapperPath(working_path_),
+ pre_mapping_fbv_,
+ /*max_num_entries=*/kDocJoinInfoMapperMaxNumEntries,
+ /*average_kv_byte_size=*/kDocJoinInfoMapperAverageKVByteSize));
+ } else {
+ ICING_ASSIGN_OR_RETURN(
+ doc_join_info_mapper_,
+ DynamicTrieKeyMapper<int32_t>::Create(
+ filesystem_, GetDocJoinInfoMapperPath(working_path_),
+ kDocJoinInfoMapperDynamicTrieMaxSize));
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ qualified_id_storage_,
+ FileBackedVector<char>::Create(
+ filesystem_, GetQualifiedIdStoragePath(working_path_),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<char>::kMaxFileSize,
+ /*pre_mapping_mmap_size=*/pre_mapping_fbv_ ? 1024 * 1024 : 0));
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexImplV1::Clear() {
+ SetDirty();
+
+ doc_join_info_mapper_.reset();
+ // Discard and reinitialize doc join info mapper.
+ std::string doc_join_info_mapper_path =
+ GetDocJoinInfoMapperPath(working_path_);
+ if (use_persistent_hash_map_) {
+ ICING_RETURN_IF_ERROR(PersistentHashMapKeyMapper<int32_t>::Delete(
+ filesystem_, doc_join_info_mapper_path));
+ ICING_ASSIGN_OR_RETURN(
+ doc_join_info_mapper_,
+ PersistentHashMapKeyMapper<int32_t>::Create(
+ filesystem_, std::move(doc_join_info_mapper_path), pre_mapping_fbv_,
+ /*max_num_entries=*/kDocJoinInfoMapperMaxNumEntries,
+ /*average_kv_byte_size=*/kDocJoinInfoMapperAverageKVByteSize));
+ } else {
+ ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<int32_t>::Delete(
+ filesystem_, doc_join_info_mapper_path));
+ ICING_ASSIGN_OR_RETURN(doc_join_info_mapper_,
+ DynamicTrieKeyMapper<int32_t>::Create(
+ filesystem_, doc_join_info_mapper_path,
+ kDocJoinInfoMapperDynamicTrieMaxSize));
+ }
+
+ // Clear qualified_id_storage_.
+ if (qualified_id_storage_->num_elements() > 0) {
+ ICING_RETURN_IF_ERROR(qualified_id_storage_->TruncateTo(0));
+ }
+
+ // TODO(b/268521214): clear delete propagation storage
+
+ info().last_added_document_id = kInvalidDocumentId;
+ return libtextclassifier3::Status::OK;
+}
+
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexImplV1>>
+QualifiedIdJoinIndexImplV1::InitializeNewFiles(const Filesystem& filesystem,
+ std::string&& working_path,
+ bool pre_mapping_fbv,
+ bool use_persistent_hash_map) {
+ // Create working directory.
+ if (!filesystem.CreateDirectoryRecursively(working_path.c_str())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to create directory: ", working_path));
+ }
+
+ // Initialize doc_join_info_mapper
+ std::unique_ptr<KeyMapper<int32_t>> doc_join_info_mapper;
+ if (use_persistent_hash_map) {
+ // TODO(b/263890397): decide PersistentHashMapKeyMapper size
+ ICING_ASSIGN_OR_RETURN(
+ doc_join_info_mapper,
+ PersistentHashMapKeyMapper<int32_t>::Create(
+ filesystem, GetDocJoinInfoMapperPath(working_path), pre_mapping_fbv,
+ /*max_num_entries=*/kDocJoinInfoMapperMaxNumEntries,
+ /*average_kv_byte_size=*/kDocJoinInfoMapperAverageKVByteSize));
+ } else {
+ ICING_ASSIGN_OR_RETURN(
+ doc_join_info_mapper,
+ DynamicTrieKeyMapper<int32_t>::Create(
+ filesystem, GetDocJoinInfoMapperPath(working_path),
+ kDocJoinInfoMapperDynamicTrieMaxSize));
+ }
+
+ // Initialize qualified_id_storage
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<char>> qualified_id_storage,
+ FileBackedVector<char>::Create(
+ filesystem, GetQualifiedIdStoragePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<char>::kMaxFileSize,
+ /*pre_mapping_mmap_size=*/pre_mapping_fbv ? 1024 * 1024 : 0));
+
+ // Create instance.
+ auto new_index = std::unique_ptr<QualifiedIdJoinIndexImplV1>(
+ new QualifiedIdJoinIndexImplV1(
+ filesystem, std::move(working_path),
+ /*metadata_buffer=*/std::make_unique<uint8_t[]>(kMetadataFileSize),
+ std::move(doc_join_info_mapper), std::move(qualified_id_storage),
+ pre_mapping_fbv, use_persistent_hash_map));
+ // Initialize info content.
+ new_index->info().magic = Info::kMagic;
+ new_index->info().last_added_document_id = kInvalidDocumentId;
+ // Initialize new PersistentStorage. The initial checksums will be computed
+ // and set via InitializeNewStorage.
+ ICING_RETURN_IF_ERROR(new_index->InitializeNewStorage());
+
+ return new_index;
+}
+
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexImplV1>>
+QualifiedIdJoinIndexImplV1::InitializeExistingFiles(
+ const Filesystem& filesystem, std::string&& working_path,
+ bool pre_mapping_fbv, bool use_persistent_hash_map) {
+ // PRead metadata file.
+ auto metadata_buffer = std::make_unique<uint8_t[]>(kMetadataFileSize);
+ if (!filesystem.PRead(GetMetadataFilePath(working_path).c_str(),
+ metadata_buffer.get(), kMetadataFileSize,
+ /*offset=*/0)) {
+    return absl_ports::InternalError("Failed to read metadata file");
+ }
+
+ // Initialize doc_join_info_mapper
+ bool dynamic_trie_key_mapper_dir_exists = filesystem.DirectoryExists(
+ absl_ports::StrCat(GetDocJoinInfoMapperPath(working_path),
+ "/key_mapper_dir")
+ .c_str());
+ if ((use_persistent_hash_map && dynamic_trie_key_mapper_dir_exists) ||
+ (!use_persistent_hash_map && !dynamic_trie_key_mapper_dir_exists)) {
+ // Return a failure here so that the caller can properly delete and rebuild
+ // this component.
+ return absl_ports::FailedPreconditionError("Key mapper type mismatch");
+ }
+
+ std::unique_ptr<KeyMapper<int32_t>> doc_join_info_mapper;
+ if (use_persistent_hash_map) {
+ ICING_ASSIGN_OR_RETURN(
+ doc_join_info_mapper,
+ PersistentHashMapKeyMapper<int32_t>::Create(
+ filesystem, GetDocJoinInfoMapperPath(working_path), pre_mapping_fbv,
+ /*max_num_entries=*/kDocJoinInfoMapperMaxNumEntries,
+ /*average_kv_byte_size=*/kDocJoinInfoMapperAverageKVByteSize));
+ } else {
+ ICING_ASSIGN_OR_RETURN(
+ doc_join_info_mapper,
+ DynamicTrieKeyMapper<int32_t>::Create(
+ filesystem, GetDocJoinInfoMapperPath(working_path),
+ kDocJoinInfoMapperDynamicTrieMaxSize));
+ }
+
+ // Initialize qualified_id_storage
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<FileBackedVector<char>> qualified_id_storage,
+ FileBackedVector<char>::Create(
+ filesystem, GetQualifiedIdStoragePath(working_path),
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC,
+ FileBackedVector<char>::kMaxFileSize,
+ /*pre_mapping_mmap_size=*/pre_mapping_fbv ? 1024 * 1024 : 0));
+
+ // Create instance.
+ auto type_joinable_index = std::unique_ptr<QualifiedIdJoinIndexImplV1>(
+ new QualifiedIdJoinIndexImplV1(
+ filesystem, std::move(working_path), std::move(metadata_buffer),
+ std::move(doc_join_info_mapper), std::move(qualified_id_storage),
+ pre_mapping_fbv, use_persistent_hash_map));
+ // Initialize existing PersistentStorage. Checksums will be validated.
+ ICING_RETURN_IF_ERROR(type_joinable_index->InitializeExistingStorage());
+
+ // Validate magic.
+ if (type_joinable_index->info().magic != Info::kMagic) {
+ return absl_ports::FailedPreconditionError("Incorrect magic value");
+ }
+
+ return type_joinable_index;
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexImplV1::TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ QualifiedIdJoinIndexImplV1* new_index) const {
+ std::unique_ptr<KeyMapper<int32_t>::Iterator> iter =
+ doc_join_info_mapper_->GetIterator();
+ while (iter->Advance()) {
+ DocJoinInfo old_doc_join_info(
+ encode_util::DecodeIntFromCString(iter->GetKey()));
+ int32_t qualified_id_index = iter->GetValue();
+
+ const char* data = qualified_id_storage_->array() + qualified_id_index;
+ std::string_view ref_qualified_id_str(data, strlen(data));
+
+ // Translate to new doc id.
+ DocumentId new_document_id = GetNewDocumentId(
+ document_id_old_to_new, old_doc_join_info.document_id());
+
+ if (new_document_id != kInvalidDocumentId) {
+ ICING_RETURN_IF_ERROR(
+ new_index->Put(DocJoinInfo(new_document_id,
+ old_doc_join_info.joinable_property_id()),
+ ref_qualified_id_str));
+ }
+ }
+
+ // TODO(b/268521214): transfer delete propagation storage
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexImplV1::PersistMetadataToDisk(
+ bool force) {
+ if (!force && !is_info_dirty() && !is_storage_dirty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ std::string metadata_file_path = GetMetadataFilePath(working_path_);
+
+ ScopedFd sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ if (!sfd.is_valid()) {
+    return absl_ports::InternalError("Failed to open metadata file for write");
+ }
+
+ if (!filesystem_.PWrite(sfd.get(), /*offset=*/0, metadata_buffer_.get(),
+ kMetadataFileSize)) {
+    return absl_ports::InternalError("Failed to write metadata file");
+ }
+
+ if (!filesystem_.DataSync(sfd.get())) {
+    return absl_ports::InternalError("Failed to sync metadata to disk");
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexImplV1::PersistStoragesToDisk(
+ bool force) {
+ if (!force && !is_storage_dirty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ ICING_RETURN_IF_ERROR(doc_join_info_mapper_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(qualified_id_storage_->PersistToDisk());
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<Crc32>
+QualifiedIdJoinIndexImplV1::ComputeInfoChecksum(bool force) {
+ if (!force && !is_info_dirty()) {
+ return Crc32(crcs().component_crcs.info_crc);
+ }
+
+ return info().ComputeChecksum();
+}
+
+libtextclassifier3::StatusOr<Crc32>
+QualifiedIdJoinIndexImplV1::ComputeStoragesChecksum(bool force) {
+ if (!force && !is_storage_dirty()) {
+ return Crc32(crcs().component_crcs.storages_crc);
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 doc_join_info_mapper_crc,
+ doc_join_info_mapper_->ComputeChecksum());
+ ICING_ASSIGN_OR_RETURN(Crc32 qualified_id_storage_crc,
+ qualified_id_storage_->ComputeChecksum());
+
+ return Crc32(doc_join_info_mapper_crc.Get() ^ qualified_id_storage_crc.Get());
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/qualified-id-join-index-impl-v1.h b/icing/join/qualified-id-join-index-impl-v1.h
new file mode 100644
index 0000000..9314602
--- /dev/null
+++ b/icing/join/qualified-id-join-index-impl-v1.h
@@ -0,0 +1,327 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V1_H_
+#define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V1_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/join/doc-join-info.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/key-mapper.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
+#include "icing/store/namespace-id.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// QualifiedIdJoinIndexImplV1: a class to maintain data mapping DocJoinInfo to
+// joinable qualified ids and delete propagation info.
+class QualifiedIdJoinIndexImplV1 : public QualifiedIdJoinIndex {
+ public:
+ struct Info {
+ static constexpr int32_t kMagic = 0x48cabdc6;
+
+ int32_t magic;
+ DocumentId last_added_document_id;
+
+ Crc32 ComputeChecksum() const {
+ return Crc32(
+ std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
+ }
+ } __attribute__((packed));
+ static_assert(sizeof(Info) == 8, "");
+
+ // Metadata file layout: <Crcs><Info>
+ static constexpr int32_t kCrcsMetadataBufferOffset = 0;
+ static constexpr int32_t kInfoMetadataBufferOffset =
+ static_cast<int32_t>(sizeof(Crcs));
+ static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
+ static_assert(kMetadataFileSize == 20, "");
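+  // (Together with the static_assert on sizeof(Info) above, this implies
+  // sizeof(Crcs) == 12.)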
+
+ // Creates a QualifiedIdJoinIndexImplV1 instance to store qualified ids for
+  // future joining search. If any of the underlying files is missing, then
+  // deletes the whole working_path and (re)initializes with new ones.
+  // Otherwise initializes and creates the instance from the existing files.
+ //
+ // filesystem: Object to make system level calls
+ // working_path: Specifies the working path for PersistentStorage.
+  //                QualifiedIdJoinIndexImplV1 uses the working path as its
+  //                working directory, and all related files will be stored
+  //                under this directory. It takes full ownership of
+  //                working_path_, including creation/deletion. It is the
+  //                caller's responsibility to specify a correct working path
+  //                and avoid mixing different persistent storages together
+  //                under the same path. Also the caller owns the parent
+  //                directory of working_path_ and is responsible for its
+  //                creation/deletion. See PersistentStorage for more details
+  //                about the concept of working_path.
+  // pre_mapping_fbv: flag indicating whether to memory-map the maximum
+  //                  possible file size for the underlying FileBackedVector
+  //                  before growing the actual file size.
+  // use_persistent_hash_map: flag indicating whether to use a persistent hash
+  //                          map as the key mapper (if false, then fall back
+  //                          to a dynamic trie key mapper).
+ //
+ // Returns:
+ // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
+ // checksum
+ // - INTERNAL_ERROR on I/O errors
+ // - Any KeyMapper errors
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexImplV1>>
+ Create(const Filesystem& filesystem, std::string working_path,
+ bool pre_mapping_fbv, bool use_persistent_hash_map);
+
+ // Delete copy and move constructor/assignment operator.
+ QualifiedIdJoinIndexImplV1(const QualifiedIdJoinIndexImplV1&) = delete;
+ QualifiedIdJoinIndexImplV1& operator=(const QualifiedIdJoinIndexImplV1&) =
+ delete;
+
+ QualifiedIdJoinIndexImplV1(QualifiedIdJoinIndexImplV1&&) = delete;
+ QualifiedIdJoinIndexImplV1& operator=(QualifiedIdJoinIndexImplV1&&) = delete;
+
+ ~QualifiedIdJoinIndexImplV1() override;
+
+ // v2 only API. Returns UNIMPLEMENTED_ERROR.
+ libtextclassifier3::Status Put(SchemaTypeId schema_type_id,
+ JoinablePropertyId joinable_property_id,
+ DocumentId document_id,
+ std::vector<NamespaceFingerprintIdentifier>&&
+ ref_namespace_fingerprint_ids) override {
+    return absl_ports::UnimplementedError("This API is not supported in V1");
+ }
+
+ // v2 only API. Returns UNIMPLEMENTED_ERROR.
+ libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>>
+ GetIterator(SchemaTypeId schema_type_id,
+ JoinablePropertyId joinable_property_id) const override {
+    return absl_ports::UnimplementedError("This API is not supported in V1");
+ }
+
+  // Puts new data into the index: DocJoinInfo (DocumentId, JoinablePropertyId)
+  // references ref_qualified_id_str (the identifier of another document).
+ //
+ // REQUIRES: ref_qualified_id_str contains no '\0'.
+ //
+ // Returns:
+ // - OK on success
+ // - INVALID_ARGUMENT_ERROR if doc_join_info is invalid
+ // - Any KeyMapper errors
+ libtextclassifier3::Status Put(
+ const DocJoinInfo& doc_join_info,
+ std::string_view ref_qualified_id_str) override;
+
+ // Gets the referenced document's qualified id string by DocJoinInfo.
+ //
+ // Returns:
+ // - A qualified id string referenced by the given DocJoinInfo (DocumentId,
+ // JoinablePropertyId) on success
+ // - INVALID_ARGUMENT_ERROR if doc_join_info is invalid
+ // - NOT_FOUND_ERROR if doc_join_info doesn't exist
+ // - Any KeyMapper errors
+ libtextclassifier3::StatusOr<std::string_view> Get(
+ const DocJoinInfo& doc_join_info) const override;
+
+  // Reduces internal file sizes by reclaiming the space and ids of deleted
+  // documents. The qualified id join index will convert all entries to the
+  // new document ids.
+ //
+ // - document_id_old_to_new: a map for converting old document id to new
+ // document id.
+ // - namespace_id_old_to_new: a map for converting old namespace id to new
+ // namespace id. It is unused in this implementation since we store raw
+ // qualified id string (which contains raw namespace string).
+ // - new_last_added_document_id: will be used to update the last added
+ // document id in the qualified id type joinable
+ // index.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error. This could potentially leave the index in
+ // an invalid state and the caller should handle it properly (e.g. discard
+ // and rebuild)
+ libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const std::vector<NamespaceId>& namespace_id_old_to_new,
+ DocumentId new_last_added_document_id) override;
+
+  // Clears all data and sets last_added_document_id to kInvalidDocumentId.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status Clear() override;
+
+ bool is_v2() const override { return false; }
+
+ int32_t size() const override { return doc_join_info_mapper_->num_keys(); }
+
+ bool empty() const override { return size() == 0; }
+
+ DocumentId last_added_document_id() const override {
+ return info().last_added_document_id;
+ }
+
+ void set_last_added_document_id(DocumentId document_id) override {
+ SetInfoDirty();
+
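+    // Only advance last_added_document_id: ignore any document id that is not
+    // greater than the current value (unless the current value is invalid).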
+ Info& info_ref = info();
+ if (info_ref.last_added_document_id == kInvalidDocumentId ||
+ document_id > info_ref.last_added_document_id) {
+ info_ref.last_added_document_id = document_id;
+ }
+ }
+
+ private:
+ explicit QualifiedIdJoinIndexImplV1(
+ const Filesystem& filesystem, std::string&& working_path,
+ std::unique_ptr<uint8_t[]> metadata_buffer,
+ std::unique_ptr<KeyMapper<int32_t>> doc_join_info_mapper,
+ std::unique_ptr<FileBackedVector<char>> qualified_id_storage,
+ bool pre_mapping_fbv, bool use_persistent_hash_map)
+ : QualifiedIdJoinIndex(filesystem, std::move(working_path)),
+ metadata_buffer_(std::move(metadata_buffer)),
+ doc_join_info_mapper_(std::move(doc_join_info_mapper)),
+ qualified_id_storage_(std::move(qualified_id_storage)),
+ pre_mapping_fbv_(pre_mapping_fbv),
+ use_persistent_hash_map_(use_persistent_hash_map),
+ is_info_dirty_(false),
+ is_storage_dirty_(false) {}
+
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexImplV1>>
+ InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path,
+ bool pre_mapping_fbv, bool use_persistent_hash_map);
+
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexImplV1>>
+ InitializeExistingFiles(const Filesystem& filesystem,
+ std::string&& working_path, bool pre_mapping_fbv,
+ bool use_persistent_hash_map);
+
+  // Transfers qualified id join index data from the current index to new_index
+  // and converts entries to the new document ids according to
+  // document_id_old_to_new. It is a helper function for Optimize.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ QualifiedIdJoinIndexImplV1* new_index) const;
+
+ // Flushes contents of metadata file.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status PersistMetadataToDisk(bool force) override;
+
+ // Flushes contents of all storages to underlying files.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status PersistStoragesToDisk(bool force) override;
+
+ // Computes and returns Info checksum.
+ //
+ // Returns:
+ // - Crc of the Info on success
+ libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override;
+
+ // Computes and returns all storages checksum.
+ //
+ // Returns:
+ // - Crc of all storages on success
+ // - INTERNAL_ERROR if any data inconsistency
+ libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
+ bool force) override;
+
+ Crcs& crcs() override {
+ return *reinterpret_cast<Crcs*>(metadata_buffer_.get() +
+ kCrcsMetadataBufferOffset);
+ }
+
+ const Crcs& crcs() const override {
+ return *reinterpret_cast<const Crcs*>(metadata_buffer_.get() +
+ kCrcsMetadataBufferOffset);
+ }
+
+ Info& info() {
+ return *reinterpret_cast<Info*>(metadata_buffer_.get() +
+ kInfoMetadataBufferOffset);
+ }
+
+ const Info& info() const {
+ return *reinterpret_cast<const Info*>(metadata_buffer_.get() +
+ kInfoMetadataBufferOffset);
+ }
+
+ void SetInfoDirty() { is_info_dirty_ = true; }
+ // When storage is dirty, we have to set info dirty as well. So just expose
+ // SetDirty to set both.
+ void SetDirty() {
+ is_info_dirty_ = true;
+ is_storage_dirty_ = true;
+ }
+
+ bool is_info_dirty() const { return is_info_dirty_; }
+ bool is_storage_dirty() const { return is_storage_dirty_; }
+
+ // Metadata buffer
+ std::unique_ptr<uint8_t[]> metadata_buffer_;
+
+ // Persistent KeyMapper for mapping (encoded) DocJoinInfo (DocumentId,
+ // JoinablePropertyId) to another referenced document's qualified id string
+ // index in qualified_id_storage_.
+ std::unique_ptr<KeyMapper<int32_t>> doc_join_info_mapper_;
+
+ // Storage for qualified id strings.
+ std::unique_ptr<FileBackedVector<char>> qualified_id_storage_;
+
+ // TODO(b/268521214): add delete propagation storage
+
+  // Flag indicating whether to memory-map the maximum possible file size for
+  // the underlying FileBackedVector before growing the actual file size.
+ bool pre_mapping_fbv_;
+
+  // Flag indicating whether to use a persistent hash map as the key mapper
+  // (if false, then fall back to a dynamic trie key mapper).
+ bool use_persistent_hash_map_;
+
+ bool is_info_dirty_;
+ bool is_storage_dirty_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V1_H_
diff --git a/icing/join/qualified-id-join-index-impl-v1_test.cc b/icing/join/qualified-id-join-index-impl-v1_test.cc
new file mode 100644
index 0000000..a6e19bb
--- /dev/null
+++ b/icing/join/qualified-id-join-index-impl-v1_test.cc
@@ -0,0 +1,931 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/qualified-id-join-index-impl-v1.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/join/doc-join-info.h"
+#include "icing/store/document-id.h"
+#include "icing/store/dynamic-trie-key-mapper.h"
+#include "icing/store/key-mapper.h"
+#include "icing/store/persistent-hash-map-key-mapper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::IsTrue;
+using ::testing::Lt;
+using ::testing::Ne;
+using ::testing::Not;
+using ::testing::Pointee;
+using ::testing::SizeIs;
+
+using Crcs = PersistentStorage::Crcs;
+using Info = QualifiedIdJoinIndexImplV1::Info;
+
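+// Non-zero offset added to stored values/checksums to simulate corruption in
+// tests.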
+static constexpr int32_t kCorruptedValueOffset = 3;
+
+struct QualifiedIdJoinIndexImplV1TestParam {
+ bool pre_mapping_fbv;
+ bool use_persistent_hash_map;
+
+ explicit QualifiedIdJoinIndexImplV1TestParam(bool pre_mapping_fbv_in,
+ bool use_persistent_hash_map_in)
+ : pre_mapping_fbv(pre_mapping_fbv_in),
+ use_persistent_hash_map(use_persistent_hash_map_in) {}
+};
+
+class QualifiedIdJoinIndexImplV1Test
+ : public ::testing::TestWithParam<QualifiedIdJoinIndexImplV1TestParam> {
+ protected:
+ void SetUp() override {
+ base_dir_ = GetTestTempDir() + "/icing";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ working_path_ = base_dir_ + "/qualified_id_join_index_test";
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ std::string base_dir_;
+ std::string working_path_;
+};
+
+TEST_P(QualifiedIdJoinIndexImplV1Test, InvalidWorkingPath) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(
+ filesystem_, "/dev/null/qualified_id_join_index_test",
+ param.pre_mapping_fbv, param.use_persistent_hash_map),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test, InitializeNewFiles) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ {
+ // Create new qualified id join index
+ ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+ EXPECT_THAT(index, Pointee(IsEmpty()));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ // Metadata file should be initialized correctly for both info and crcs
+ // sections.
+ const std::string metadata_file_path =
+ absl_ports::StrCat(working_path_, "/metadata");
+ auto metadata_buffer = std::make_unique<uint8_t[]>(
+ QualifiedIdJoinIndexImplV1::kMetadataFileSize);
+ ASSERT_THAT(
+ filesystem_.PRead(metadata_file_path.c_str(), metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV1::kMetadataFileSize,
+ /*offset=*/0),
+ IsTrue());
+
+ // Check info section
+ const Info* info = reinterpret_cast<const Info*>(
+ metadata_buffer.get() +
+ QualifiedIdJoinIndexImplV1::kInfoMetadataBufferOffset);
+ EXPECT_THAT(info->magic, Eq(Info::kMagic));
+ EXPECT_THAT(info->last_added_document_id, Eq(kInvalidDocumentId));
+
+ // Check crcs section
+ const Crcs* crcs = reinterpret_cast<const Crcs*>(
+ metadata_buffer.get() +
+ QualifiedIdJoinIndexImplV1::kCrcsMetadataBufferOffset);
+  // There is some initial data in the KeyMapper, so storages_crc should be
+  // non-zero.
+ EXPECT_THAT(crcs->component_crcs.storages_crc, Ne(0));
+ EXPECT_THAT(crcs->component_crcs.info_crc,
+ Eq(Crc32(std::string_view(reinterpret_cast<const char*>(info),
+ sizeof(Info)))
+ .Get()));
+ EXPECT_THAT(crcs->all_crc,
+ Eq(Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs->component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get()));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test,
+ InitializationShouldFailWithoutPersistToDiskOrDestruction) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+
+ // Insert some data.
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+ ICING_ASSERT_OK(index->PersistToDisk());
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriB"));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriC"));
+
+ // Without calling PersistToDisk, checksums will not be recomputed or synced
+ // to disk, so initializing another instance on the same files should fail.
+ EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map),
+ StatusIs(param.use_persistent_hash_map
+ ? libtextclassifier3::StatusCode::FAILED_PRECONDITION
+ : libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test,
+ InitializationShouldSucceedWithPersistToDisk) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index1,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+
+ // Insert some data.
+ ICING_ASSERT_OK(
+ index1->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+ ICING_ASSERT_OK(
+ index1->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriB"));
+ ICING_ASSERT_OK(
+ index1->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriC"));
+ ASSERT_THAT(index1, Pointee(SizeIs(3)));
+
+ // After calling PersistToDisk, all checksums should be recomputed and synced
+ // correctly to disk, so initializing another instance on the same files
+ // should succeed, and we should be able to get the same contents.
+ ICING_EXPECT_OK(index1->PersistToDisk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index2,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+ EXPECT_THAT(index2, Pointee(SizeIs(3)));
+ EXPECT_THAT(
+ index2->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20)),
+ IsOkAndHolds(/*ref_qualified_id_str=*/"namespace#uriA"));
+ EXPECT_THAT(
+ index2->Get(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20)),
+ IsOkAndHolds(/*ref_qualified_id_str=*/"namespace#uriB"));
+ EXPECT_THAT(
+ index2->Get(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20)),
+ IsOkAndHolds(/*ref_qualified_id_str=*/"namespace#uriC"));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test,
+ InitializationShouldSucceedAfterDestruction) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+
+ // Insert some data.
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriB"));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriC"));
+ ASSERT_THAT(index, Pointee(SizeIs(3)));
+ }
+
+ {
+ // The previous instance went out of scope and was destructed. Although we
+ // didn't call PersistToDisk explicitly, the destructor should invoke it and
+ // thus initializing another instance on the same files should succeed, and
+ // we should be able to get the same contents.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+ EXPECT_THAT(index, Pointee(SizeIs(3)));
+ EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/1,
+ /*joinable_property_id=*/20)),
+ IsOkAndHolds("namespace#uriA"));
+ EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/3,
+ /*joinable_property_id=*/20)),
+ IsOkAndHolds("namespace#uriB"));
+ EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/5,
+ /*joinable_property_id=*/20)),
+ IsOkAndHolds("namespace#uriC"));
+ }
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test,
+ InitializeExistingFilesWithDifferentMagicShouldFail) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ {
+ // Manually change magic and update checksum
+ const std::string metadata_file_path =
+ absl_ports::StrCat(working_path_, "/metadata");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_THAT(metadata_sfd.is_valid(), IsTrue());
+
+ auto metadata_buffer = std::make_unique<uint8_t[]>(
+ QualifiedIdJoinIndexImplV1::kMetadataFileSize);
+ ASSERT_THAT(filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV1::kMetadataFileSize,
+ /*offset=*/0),
+ IsTrue());
+
+    // Change the magic and recompute the checksums, so that validation fails
+    // on the magic value rather than on the crcs.
+ Crcs* crcs = reinterpret_cast<Crcs*>(
+ metadata_buffer.get() +
+ QualifiedIdJoinIndexImplV1::kCrcsMetadataBufferOffset);
+ Info* info = reinterpret_cast<Info*>(
+ metadata_buffer.get() +
+ QualifiedIdJoinIndexImplV1::kInfoMetadataBufferOffset);
+ info->magic += kCorruptedValueOffset;
+ crcs->component_crcs.info_crc = info->ComputeChecksum().Get();
+ crcs->all_crc = crcs->component_crcs.ComputeChecksum().Get();
+ ASSERT_THAT(filesystem_.PWrite(
+ metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV1::kMetadataFileSize),
+ IsTrue());
+ }
+
+ // Attempt to create the qualified id join index with different magic. This
+ // should fail.
+ EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Incorrect magic value")));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test,
+ InitializeExistingFilesWithWrongAllCrcShouldFail) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ {
+ const std::string metadata_file_path =
+ absl_ports::StrCat(working_path_, "/metadata");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_THAT(metadata_sfd.is_valid(), IsTrue());
+
+ auto metadata_buffer = std::make_unique<uint8_t[]>(
+ QualifiedIdJoinIndexImplV1::kMetadataFileSize);
+ ASSERT_THAT(filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV1::kMetadataFileSize,
+ /*offset=*/0),
+ IsTrue());
+
+ // Manually corrupt all_crc
+ Crcs* crcs = reinterpret_cast<Crcs*>(
+ metadata_buffer.get() +
+ QualifiedIdJoinIndexImplV1::kCrcsMetadataBufferOffset);
+ crcs->all_crc += kCorruptedValueOffset;
+
+ ASSERT_THAT(filesystem_.PWrite(
+ metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV1::kMetadataFileSize),
+ IsTrue());
+ }
+
+ // Attempt to create the qualified id join index with metadata containing
+ // corrupted all_crc. This should fail.
+ EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Invalid all crc")));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test,
+ InitializeExistingFilesWithCorruptedInfoShouldFail) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ {
+ const std::string metadata_file_path =
+ absl_ports::StrCat(working_path_, "/metadata");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_THAT(metadata_sfd.is_valid(), IsTrue());
+
+ auto metadata_buffer = std::make_unique<uint8_t[]>(
+ QualifiedIdJoinIndexImplV1::kMetadataFileSize);
+ ASSERT_THAT(filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV1::kMetadataFileSize,
+ /*offset=*/0),
+ IsTrue());
+
+    // Modify info, but don't update the checksum. This simulates corruption
+    // of the info section.
+ Info* info = reinterpret_cast<Info*>(
+ metadata_buffer.get() +
+ QualifiedIdJoinIndexImplV1::kInfoMetadataBufferOffset);
+ info->last_added_document_id += kCorruptedValueOffset;
+
+ ASSERT_THAT(filesystem_.PWrite(
+ metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV1::kMetadataFileSize),
+ IsTrue());
+ }
+
+ // Attempt to create the qualified id join index with info that doesn't match
+ // its checksum. This should fail.
+ EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Invalid info crc")));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test,
+ InitializeExistingFilesWithCorruptedDocJoinInfoMapperShouldFail) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ // Corrupt doc_join_info_mapper manually.
+ {
+ std::string mapper_working_path =
+ absl_ports::StrCat(working_path_, "/doc_join_info_mapper");
+ std::unique_ptr<KeyMapper<int32_t>> mapper;
+ if (param.use_persistent_hash_map) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ mapper, PersistentHashMapKeyMapper<int32_t>::Create(
+ filesystem_, std::move(mapper_working_path),
+ param.pre_mapping_fbv));
+ } else {
+ ICING_ASSERT_OK_AND_ASSIGN(mapper,
+ DynamicTrieKeyMapper<int32_t>::Create(
+ filesystem_, mapper_working_path,
+ /*maximum_size_bytes=*/128 * 1024 * 1024));
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc, mapper->ComputeChecksum());
+ ICING_ASSERT_OK(mapper->Put("foo", 12345));
+ ICING_ASSERT_OK(mapper->PersistToDisk());
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc, mapper->ComputeChecksum());
+ ASSERT_THAT(old_crc, Not(Eq(new_crc)));
+ }
+
+ // Attempt to create the qualified id join index with corrupted
+ // doc_join_info_mapper. This should fail.
+ EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Invalid storages crc")));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test,
+ InitializeExistingFilesWithCorruptedQualifiedIdStorageShouldFail) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ {
+ // Corrupt qualified_id_storage manually.
+ std::string qualified_id_storage_path =
+ absl_ports::StrCat(working_path_, "/qualified_id_storage");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<FileBackedVector<char>> qualified_id_storage,
+ FileBackedVector<char>::Create(
+ filesystem_, qualified_id_storage_path,
+ MemoryMappedFile::Strategy::READ_WRITE_AUTO_SYNC));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc,
+ qualified_id_storage->ComputeChecksum());
+ ICING_ASSERT_OK(qualified_id_storage->Append('a'));
+ ICING_ASSERT_OK(qualified_id_storage->Append('b'));
+ ICING_ASSERT_OK(qualified_id_storage->PersistToDisk());
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc,
+ qualified_id_storage->ComputeChecksum());
+ ASSERT_THAT(old_crc, Not(Eq(new_crc)));
+ }
+
+ // Attempt to create the qualified id join index with corrupted
+ // qualified_id_storage. This should fail.
+ EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Invalid storages crc")));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test, InvalidPut) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+
+ DocJoinInfo default_invalid;
+ EXPECT_THAT(
+ index->Put(default_invalid, /*ref_qualified_id_str=*/"namespace#uriA"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test, InvalidGet) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+
+ DocJoinInfo default_invalid;
+ EXPECT_THAT(index->Get(default_invalid),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test, PutAndGet) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ DocJoinInfo target_info1(/*document_id=*/1, /*joinable_property_id=*/20);
+ std::string_view ref_qualified_id_str_a = "namespace#uriA";
+
+ DocJoinInfo target_info2(/*document_id=*/3, /*joinable_property_id=*/13);
+ std::string_view ref_qualified_id_str_b = "namespace#uriB";
+
+ DocJoinInfo target_info3(/*document_id=*/4, /*joinable_property_id=*/4);
+ std::string_view ref_qualified_id_str_c = "namespace#uriC";
+
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+
+ EXPECT_THAT(index->Put(target_info1, ref_qualified_id_str_a), IsOk());
+ EXPECT_THAT(index->Put(target_info2, ref_qualified_id_str_b), IsOk());
+ EXPECT_THAT(index->Put(target_info3, ref_qualified_id_str_c), IsOk());
+ EXPECT_THAT(index, Pointee(SizeIs(3)));
+
+ EXPECT_THAT(index->Get(target_info1), IsOkAndHolds(ref_qualified_id_str_a));
+ EXPECT_THAT(index->Get(target_info2), IsOkAndHolds(ref_qualified_id_str_b));
+ EXPECT_THAT(index->Get(target_info3), IsOkAndHolds(ref_qualified_id_str_c));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ // Verify we can get all of them after destructing and re-initializing.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+ EXPECT_THAT(index, Pointee(SizeIs(3)));
+ EXPECT_THAT(index->Get(target_info1), IsOkAndHolds(ref_qualified_id_str_a));
+ EXPECT_THAT(index->Get(target_info2), IsOkAndHolds(ref_qualified_id_str_b));
+ EXPECT_THAT(index->Get(target_info3), IsOkAndHolds(ref_qualified_id_str_c));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test, GetShouldReturnNotFoundErrorIfNotExist) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ DocJoinInfo target_info(/*document_id=*/1, /*joinable_property_id=*/20);
+ std::string_view ref_qualified_id_str = "namespace#uriA";
+
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+
+ // Verify entry is not found in the beginning.
+ EXPECT_THAT(index->Get(target_info),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_ASSERT_OK(index->Put(target_info, ref_qualified_id_str));
+ ASSERT_THAT(index->Get(target_info), IsOkAndHolds(ref_qualified_id_str));
+
+ // Get another non-existing entry. This should get NOT_FOUND_ERROR.
+ DocJoinInfo another_target_info(/*document_id=*/2,
+ /*joinable_property_id=*/20);
+ EXPECT_THAT(index->Get(another_target_info),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test, SetLastAddedDocumentId) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ constexpr DocumentId kDocumentId = 100;
+ index->set_last_added_document_id(kDocumentId);
+ EXPECT_THAT(index->last_added_document_id(), Eq(kDocumentId));
+
+ constexpr DocumentId kNextDocumentId = 123;
+ index->set_last_added_document_id(kNextDocumentId);
+ EXPECT_THAT(index->last_added_document_id(), Eq(kNextDocumentId));
+}
+
+TEST_P(
+ QualifiedIdJoinIndexImplV1Test,
+ SetLastAddedDocumentIdShouldIgnoreNewDocumentIdNotGreaterThanTheCurrent) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+
+ constexpr DocumentId kDocumentId = 123;
+ index->set_last_added_document_id(kDocumentId);
+ ASSERT_THAT(index->last_added_document_id(), Eq(kDocumentId));
+
+ constexpr DocumentId kNextDocumentId = 100;
+ ASSERT_THAT(kNextDocumentId, Lt(kDocumentId));
+ index->set_last_added_document_id(kNextDocumentId);
+ // last_added_document_id() should remain unchanged.
+ EXPECT_THAT(index->last_added_document_id(), Eq(kDocumentId));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test, Optimize) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/10),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/3),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/8, /*joinable_property_id=*/9),
+ /*ref_qualified_id_str=*/"namespace#uriB"));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/13, /*joinable_property_id=*/4),
+ /*ref_qualified_id_str=*/"namespace#uriC"));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/21, /*joinable_property_id=*/12),
+ /*ref_qualified_id_str=*/"namespace#uriC"));
+ index->set_last_added_document_id(21);
+
+ ASSERT_THAT(index, Pointee(SizeIs(5)));
+
+ // Delete doc id = 5, 8, compress and keep the rest.
+ std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId);
+ document_id_old_to_new[3] = 0;
+ document_id_old_to_new[13] = 1;
+ document_id_old_to_new[21] = 2;
+
+ DocumentId new_last_added_document_id = 2;
+ EXPECT_THAT(
+ index->Optimize(document_id_old_to_new, /*namespace_id_old_to_new=*/{},
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(index, Pointee(SizeIs(3)));
+ EXPECT_THAT(index->last_added_document_id(), Eq(new_last_added_document_id));
+
+ // Verify Put and Get API still work normally after Optimize().
+ // (old_doc_id=3, joinable_property_id=10), which is now (doc_id=0,
+ // joinable_property_id=10), has referenced qualified id str =
+ // "namespace#uriA".
+ EXPECT_THAT(
+ index->Get(DocJoinInfo(/*document_id=*/0, /*joinable_property_id=*/10)),
+ IsOkAndHolds("namespace#uriA"));
+
+ // (old_doc_id=5, joinable_property_id=3) and (old_doc_id=8,
+ // joinable_property_id=9) are now not found since we've deleted old_doc_id =
+ // 5, 8. It is not testable via Get() because there is no valid doc_id mapping
+  // for old_doc_id = 5, 8 and we cannot generate a valid DocJoinInfo for them.
+
+ // (old_doc_id=13, joinable_property_id=4), which is now (doc_id=1,
+ // joinable_property_id=4), has referenced qualified id str =
+ // "namespace#uriC".
+ EXPECT_THAT(
+ index->Get(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/4)),
+ IsOkAndHolds("namespace#uriC"));
+
+ // (old_doc_id=21, joinable_property_id=12), which is now (doc_id=2,
+ // joinable_property_id=12), has referenced qualified id str =
+ // "namespace#uriC".
+ EXPECT_THAT(
+ index->Get(DocJoinInfo(/*document_id=*/2, /*joinable_property_id=*/12)),
+ IsOkAndHolds("namespace#uriC"));
+
+ // Joinable index should be able to work normally after Optimize().
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/99, /*joinable_property_id=*/2),
+ /*ref_qualified_id_str=*/"namespace#uriD"));
+ index->set_last_added_document_id(99);
+
+ EXPECT_THAT(index, Pointee(SizeIs(4)));
+ EXPECT_THAT(index->last_added_document_id(), Eq(99));
+ EXPECT_THAT(index->Get(DocJoinInfo(/*document_id=*/99,
+ /*joinable_property_id=*/2)),
+ IsOkAndHolds("namespace#uriD"));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test, OptimizeOutOfRangeDocumentId) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/99, /*joinable_property_id=*/10),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+ index->set_last_added_document_id(99);
+
+ // Create document_id_old_to_new with size = 1. Optimize should handle out of
+ // range DocumentId properly.
+ std::vector<DocumentId> document_id_old_to_new = {kInvalidDocumentId};
+
+ // There shouldn't be any error due to vector index.
+ EXPECT_THAT(
+ index->Optimize(document_id_old_to_new, /*namespace_id_old_to_new=*/{},
+ /*new_last_added_document_id=*/kInvalidDocumentId),
+ IsOk());
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ // Verify all data are discarded after Optimize().
+ EXPECT_THAT(index, Pointee(IsEmpty()));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test, OptimizeDeleteAll) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/3, /*joinable_property_id=*/10),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/5, /*joinable_property_id=*/3),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/8, /*joinable_property_id=*/9),
+ /*ref_qualified_id_str=*/"namespace#uriB"));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/13, /*joinable_property_id=*/4),
+ /*ref_qualified_id_str=*/"namespace#uriC"));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/21, /*joinable_property_id=*/12),
+ /*ref_qualified_id_str=*/"namespace#uriC"));
+ index->set_last_added_document_id(21);
+
+ // Delete all documents.
+ std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId);
+
+ EXPECT_THAT(
+ index->Optimize(document_id_old_to_new, /*namespace_id_old_to_new=*/{},
+ /*new_last_added_document_id=*/kInvalidDocumentId),
+ IsOk());
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ // Verify all data are discarded after Optimize().
+ EXPECT_THAT(index, Pointee(IsEmpty()));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test, Clear) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ DocJoinInfo target_info1(/*document_id=*/1, /*joinable_property_id=*/20);
+ DocJoinInfo target_info2(/*document_id=*/3, /*joinable_property_id=*/5);
+ DocJoinInfo target_info3(/*document_id=*/6, /*joinable_property_id=*/13);
+
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+ ICING_ASSERT_OK(
+ index->Put(target_info1, /*ref_qualified_id_str=*/"namespace#uriA"));
+ ICING_ASSERT_OK(
+ index->Put(target_info2, /*ref_qualified_id_str=*/"namespace#uriB"));
+ ICING_ASSERT_OK(
+ index->Put(target_info3, /*ref_qualified_id_str=*/"namespace#uriC"));
+ ASSERT_THAT(index, Pointee(SizeIs(3)));
+ index->set_last_added_document_id(6);
+ ASSERT_THAT(index->last_added_document_id(), Eq(6));
+
+ // After resetting, last_added_document_id should be set to
+  // kInvalidDocumentId, and the previously added data should be deleted.
+ EXPECT_THAT(index->Clear(), IsOk());
+ EXPECT_THAT(index, Pointee(IsEmpty()));
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+ EXPECT_THAT(index->Get(target_info1),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(index->Get(target_info2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(index->Get(target_info3),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // Join index should be able to work normally after Clear().
+ DocJoinInfo target_info4(/*document_id=*/2, /*joinable_property_id=*/19);
+ ICING_ASSERT_OK(
+ index->Put(target_info4, /*ref_qualified_id_str=*/"namespace#uriD"));
+ index->set_last_added_document_id(2);
+
+ EXPECT_THAT(index->last_added_document_id(), Eq(2));
+ EXPECT_THAT(index->Get(target_info4), IsOkAndHolds("namespace#uriD"));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ index.reset();
+
+ // Verify index after reconstructing.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index, QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+ EXPECT_THAT(index->last_added_document_id(), Eq(2));
+ EXPECT_THAT(index->Get(target_info1),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(index->Get(target_info2),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(index->Get(target_info3),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(index->Get(target_info4), IsOkAndHolds("namespace#uriD"));
+}
+
+TEST_P(QualifiedIdJoinIndexImplV1Test, SwitchKeyMapperTypeShouldReturnError) {
+ const QualifiedIdJoinIndexImplV1TestParam& param = GetParam();
+
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV1> index,
+ QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ param.use_persistent_hash_map));
+ ICING_ASSERT_OK(
+ index->Put(DocJoinInfo(/*document_id=*/1, /*joinable_property_id=*/20),
+ /*ref_qualified_id_str=*/"namespace#uriA"));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ bool switch_key_mapper_flag = !param.use_persistent_hash_map;
+ EXPECT_THAT(QualifiedIdJoinIndexImplV1::Create(filesystem_, working_path_,
+ param.pre_mapping_fbv,
+ switch_key_mapper_flag),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ QualifiedIdJoinIndexImplV1Test, QualifiedIdJoinIndexImplV1Test,
+ testing::Values(QualifiedIdJoinIndexImplV1TestParam(
+ /*pre_mapping_fbv_in=*/true,
+ /*use_persistent_hash_map_in=*/true),
+ QualifiedIdJoinIndexImplV1TestParam(
+ /*pre_mapping_fbv_in=*/true,
+ /*use_persistent_hash_map_in=*/false),
+ QualifiedIdJoinIndexImplV1TestParam(
+ /*pre_mapping_fbv_in=*/false,
+ /*use_persistent_hash_map_in=*/true),
+ QualifiedIdJoinIndexImplV1TestParam(
+ /*pre_mapping_fbv_in=*/false,
+ /*use_persistent_hash_map_in=*/false)));
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/qualified-id-join-index-impl-v2.cc b/icing/join/qualified-id-join-index-impl-v2.cc
new file mode 100644
index 0000000..70fd13c
--- /dev/null
+++ b/icing/join/qualified-id-join-index-impl-v2.cc
@@ -0,0 +1,681 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/qualified-id-join-index-impl-v2.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/destructible-directory.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-accessor.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/join/document-id-to-join-info.h"
+#include "icing/join/posting-list-join-data-accessor.h"
+#include "icing/join/posting-list-join-data-serializer.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/key-mapper.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
+#include "icing/store/namespace-id.h"
+#include "icing/store/persistent-hash-map-key-mapper.h"
+#include "icing/util/crc32.h"
+#include "icing/util/encode-util.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Use 1M for the max number of qualified id entries and 10 bytes for the
+// average key-value pair size. This will take at most 23 MiB of disk space
+// and mmap region for the persistent hash map.
+static constexpr int32_t kSchemaJoinableIdToPostingListMapperMaxNumEntries =
+ 1 << 20;
+static constexpr int32_t kSchemaJoinableIdToPostingListMapperAverageKVByteSize =
+ 10;
+
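+// Returns the new document id mapped from old_document_id, or
+// kInvalidDocumentId if old_document_id is out of range of the mapping
+// (treated as deleted).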
+inline DocumentId GetNewDocumentId(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ DocumentId old_document_id) {
+ if (old_document_id >= document_id_old_to_new.size()) {
+ return kInvalidDocumentId;
+ }
+ return document_id_old_to_new[old_document_id];
+}
+
+inline NamespaceId GetNewNamespaceId(
+ const std::vector<NamespaceId>& namespace_id_old_to_new,
+ NamespaceId namespace_id) {
+ if (namespace_id >= namespace_id_old_to_new.size()) {
+ return kInvalidNamespaceId;
+ }
+ return namespace_id_old_to_new[namespace_id];
+}
+
+libtextclassifier3::StatusOr<PostingListIdentifier> GetPostingListIdentifier(
+ const KeyMapper<PostingListIdentifier>&
+ schema_joinable_id_to_posting_list_mapper,
+ const std::string& encoded_schema_type_joinable_property_id_str) {
+ auto posting_list_identifier_or =
+ schema_joinable_id_to_posting_list_mapper.Get(
+ encoded_schema_type_joinable_property_id_str);
+ if (!posting_list_identifier_or.ok()) {
+ if (absl_ports::IsNotFound(posting_list_identifier_or.status())) {
+ // Not found. Return invalid posting list id.
+ return PostingListIdentifier::kInvalid;
+ }
+ // Real error.
+ return posting_list_identifier_or;
+ }
+ return std::move(posting_list_identifier_or).ValueOrDie();
+}
+
+libtextclassifier3::StatusOr<std::string> EncodeSchemaTypeJoinablePropertyId(
+ SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id) {
+ if (schema_type_id < 0) {
+ return absl_ports::InvalidArgumentError("Invalid schema type id");
+ }
+
+ if (!IsJoinablePropertyIdValid(joinable_property_id)) {
+ return absl_ports::InvalidArgumentError("Invalid joinable property id");
+ }
+
+ static constexpr int kEncodedSchemaTypeIdLength = 3;
+
+ // encoded_schema_type_id_str should be 1 to 3 bytes based on the value of
+ // schema_type_id.
+ std::string encoded_schema_type_id_str =
+ encode_util::EncodeIntToCString(schema_type_id);
+  // Pad encoded_schema_type_id_str to fixed kEncodedSchemaTypeIdLength bytes.
+ while (encoded_schema_type_id_str.size() < kEncodedSchemaTypeIdLength) {
+    // A C string cannot contain 0 bytes, so we pad using 1, just like what we
+    // do in encode_util::EncodeIntToCString.
+    //
+    // This works because DecodeIntToString decodes a byte value of 0x01 as
+    // 0x00. When EncodeIntToCString returns an encoded schema type id that is
+    // shorter than 3 bytes, the id contains unencoded leading 0x00 bytes, so
+    // here we explicitly encode those bytes as 0x01.
+ encoded_schema_type_id_str.push_back(1);
+ }
+
+ return absl_ports::StrCat(
+ encoded_schema_type_id_str,
+ encode_util::EncodeIntToCString(joinable_property_id));
+}
+
+std::string GetMetadataFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/metadata");
+}
+
+std::string GetSchemaJoinableIdToPostingListMapperPath(
+ std::string_view working_path) {
+ return absl_ports::StrCat(working_path,
+ "/schema_joinable_id_to_posting_list_mapper");
+}
+
+std::string GetFlashIndexStorageFilePath(std::string_view working_path) {
+ return absl_ports::StrCat(working_path, "/flash_index_storage");
+}
+
+} // namespace
+
+libtextclassifier3::Status
+QualifiedIdJoinIndexImplV2::JoinDataIterator::Advance() {
+ if (pl_accessor_ == nullptr) {
+ return absl_ports::ResourceExhaustedError("End of iterator");
+ }
+
+ if (!should_retrieve_next_batch_) {
+ // In this case, cached_batch_join_data_ is not empty (contains some data
+ // fetched in the previous round), so move curr_ to the next position and
+ // check if we have to fetch the next batch.
+ //
+ // Note: in the 1st round, should_retrieve_next_batch_ is true, so this part
+ // will never be executed.
+ ++curr_;
+ should_retrieve_next_batch_ = curr_ >= cached_batch_join_data_.cend();
+ }
+
+ if (should_retrieve_next_batch_) {
+ // Fetch next batch if needed.
+ ICING_RETURN_IF_ERROR(GetNextDataBatch());
+ should_retrieve_next_batch_ = false;
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status
+QualifiedIdJoinIndexImplV2::JoinDataIterator::GetNextDataBatch() {
+ auto cached_batch_join_data_or = pl_accessor_->GetNextDataBatch();
+ if (!cached_batch_join_data_or.ok()) {
+ ICING_LOG(WARNING)
+ << "Fail to get next batch data from posting list due to: "
+ << cached_batch_join_data_or.status().error_message();
+ return std::move(cached_batch_join_data_or).status();
+ }
+
+ cached_batch_join_data_ = std::move(cached_batch_join_data_or).ValueOrDie();
+ curr_ = cached_batch_join_data_.cbegin();
+
+ if (cached_batch_join_data_.empty()) {
+ return absl_ports::ResourceExhaustedError("End of iterator");
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexImplV2>>
+QualifiedIdJoinIndexImplV2::Create(const Filesystem& filesystem,
+ std::string working_path,
+ bool pre_mapping_fbv) {
+ if (!filesystem.FileExists(GetMetadataFilePath(working_path).c_str()) ||
+ !filesystem.DirectoryExists(
+ GetSchemaJoinableIdToPostingListMapperPath(working_path).c_str()) ||
+ !filesystem.FileExists(
+ GetFlashIndexStorageFilePath(working_path).c_str())) {
+ // Discard working_path if any file/directory is missing, and reinitialize.
+ if (filesystem.DirectoryExists(working_path.c_str())) {
+ ICING_RETURN_IF_ERROR(
+ QualifiedIdJoinIndex::Discard(filesystem, working_path));
+ }
+ return InitializeNewFiles(filesystem, std::move(working_path),
+ pre_mapping_fbv);
+ }
+ return InitializeExistingFiles(filesystem, std::move(working_path),
+ pre_mapping_fbv);
+}
+
+QualifiedIdJoinIndexImplV2::~QualifiedIdJoinIndexImplV2() {
+ if (!PersistToDisk().ok()) {
+ ICING_LOG(WARNING) << "Failed to persist qualified id join index (v2) to "
+ "disk while destructing "
+ << working_path_;
+ }
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexImplV2::Put(
+ SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id,
+ DocumentId document_id,
+ std::vector<NamespaceFingerprintIdentifier>&&
+ ref_namespace_fingerprint_ids) {
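+  // Sort the references first so that std::unique below can remove all
+  // duplicates, and the join data is prepended to the posting list in a
+  // deterministic order.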
+ std::sort(ref_namespace_fingerprint_ids.begin(),
+ ref_namespace_fingerprint_ids.end());
+
+ // Dedupe.
+ auto last = std::unique(ref_namespace_fingerprint_ids.begin(),
+ ref_namespace_fingerprint_ids.end());
+ ref_namespace_fingerprint_ids.erase(last,
+ ref_namespace_fingerprint_ids.end());
+ if (ref_namespace_fingerprint_ids.empty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ SetDirty();
+ ICING_ASSIGN_OR_RETURN(
+ std::string encoded_schema_type_joinable_property_id_str,
+ EncodeSchemaTypeJoinablePropertyId(schema_type_id, joinable_property_id));
+
+ ICING_ASSIGN_OR_RETURN(
+ PostingListIdentifier posting_list_identifier,
+ GetPostingListIdentifier(*schema_joinable_id_to_posting_list_mapper_,
+ encoded_schema_type_joinable_property_id_str));
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor;
+ if (posting_list_identifier.is_valid()) {
+ ICING_ASSIGN_OR_RETURN(
+ pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_serializer_.get(),
+ posting_list_identifier));
+ } else {
+ ICING_ASSIGN_OR_RETURN(
+ pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::Create(
+ flash_index_storage_.get(), posting_list_serializer_.get()));
+ }
+
+ // Prepend join data into posting list.
+ for (const NamespaceFingerprintIdentifier& ref_namespace_fingerprint_id :
+ ref_namespace_fingerprint_ids) {
+ ICING_RETURN_IF_ERROR(pl_accessor->PrependData(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ document_id, ref_namespace_fingerprint_id)));
+ }
+
+ // Finalize the posting list and update mapper.
+ PostingListAccessor::FinalizeResult result =
+ std::move(*pl_accessor).Finalize();
+ if (!result.status.ok()) {
+ return result.status;
+ }
+ if (!result.id.is_valid()) {
+ return absl_ports::InternalError("Fail to flush data into posting list(s)");
+ }
+ ICING_RETURN_IF_ERROR(schema_joinable_id_to_posting_list_mapper_->Put(
+ encoded_schema_type_joinable_property_id_str, result.id));
+
+ // Update info.
+ info().num_data += ref_namespace_fingerprint_ids.size();
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase>>
+QualifiedIdJoinIndexImplV2::GetIterator(
+ SchemaTypeId schema_type_id,
+ JoinablePropertyId joinable_property_id) const {
+ ICING_ASSIGN_OR_RETURN(
+ std::string encoded_schema_type_joinable_property_id_str,
+ EncodeSchemaTypeJoinablePropertyId(schema_type_id, joinable_property_id));
+
+ ICING_ASSIGN_OR_RETURN(
+ PostingListIdentifier posting_list_identifier,
+ GetPostingListIdentifier(*schema_joinable_id_to_posting_list_mapper_,
+ encoded_schema_type_joinable_property_id_str));
+
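+  // No posting list exists for this (schema_type_id, joinable_property_id)
+  // pair: return an iterator that is immediately exhausted.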
+ if (!posting_list_identifier.is_valid()) {
+ return std::make_unique<JoinDataIterator>(nullptr);
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_serializer_.get(),
+ posting_list_identifier));
+
+ return std::make_unique<JoinDataIterator>(std::move(pl_accessor));
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexImplV2::Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const std::vector<NamespaceId>& namespace_id_old_to_new,
+ DocumentId new_last_added_document_id) {
+ std::string temp_working_path = working_path_ + "_temp";
+ ICING_RETURN_IF_ERROR(
+ QualifiedIdJoinIndex::Discard(filesystem_, temp_working_path));
+
+ DestructibleDirectory temp_working_path_ddir(&filesystem_,
+ std::move(temp_working_path));
+ if (!temp_working_path_ddir.is_valid()) {
+ return absl_ports::InternalError(
+ "Unable to create temp directory to build new qualified id join index "
+ "(v2)");
+ }
+
+ {
+    // Transfer all data from the current index to the new qualified id join
+    // index. Also call PersistToDisk and destroy the new instance after
+    // finishing, so we can safely swap directories later.
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> new_index,
+ Create(filesystem_, temp_working_path_ddir.dir(), pre_mapping_fbv_));
+ ICING_RETURN_IF_ERROR(TransferIndex(
+ document_id_old_to_new, namespace_id_old_to_new, new_index.get()));
+ new_index->set_last_added_document_id(new_last_added_document_id);
+ ICING_RETURN_IF_ERROR(new_index->PersistToDisk());
+ }
+
+  // Destroy the current index's storage instances so that directories can be
+  // swapped safely.
+ // TODO(b/268521214): handle delete propagation storage
+ schema_joinable_id_to_posting_list_mapper_.reset();
+ flash_index_storage_.reset();
+
+ if (!filesystem_.SwapFiles(temp_working_path_ddir.dir().c_str(),
+ working_path_.c_str())) {
+ return absl_ports::InternalError(
+ "Unable to apply new qualified id join index (v2) due to failed swap");
+ }
+
+ // Reinitialize qualified id join index.
+ if (!filesystem_.PRead(GetMetadataFilePath(working_path_).c_str(),
+ metadata_buffer_.get(), kMetadataFileSize,
+ /*offset=*/0)) {
+ return absl_ports::InternalError("Fail to read metadata file");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ schema_joinable_id_to_posting_list_mapper_,
+ PersistentHashMapKeyMapper<PostingListIdentifier>::Create(
+ filesystem_,
+ GetSchemaJoinableIdToPostingListMapperPath(working_path_),
+ pre_mapping_fbv_,
+ /*max_num_entries=*/
+ kSchemaJoinableIdToPostingListMapperMaxNumEntries,
+ /*average_kv_byte_size=*/
+ kSchemaJoinableIdToPostingListMapperAverageKVByteSize));
+ ICING_ASSIGN_OR_RETURN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path_),
+ &filesystem_, posting_list_serializer_.get()));
+ flash_index_storage_ =
+ std::make_unique<FlashIndexStorage>(std::move(flash_index_storage));
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexImplV2::Clear() {
+ SetDirty();
+
+ schema_joinable_id_to_posting_list_mapper_.reset();
+ // Discard and reinitialize schema_joinable_id_to_posting_list_mapper.
+ std::string schema_joinable_id_to_posting_list_mapper_path =
+ GetSchemaJoinableIdToPostingListMapperPath(working_path_);
+ ICING_RETURN_IF_ERROR(
+ PersistentHashMapKeyMapper<PostingListIdentifier>::Delete(
+ filesystem_, schema_joinable_id_to_posting_list_mapper_path));
+ ICING_ASSIGN_OR_RETURN(
+ schema_joinable_id_to_posting_list_mapper_,
+ PersistentHashMapKeyMapper<PostingListIdentifier>::Create(
+ filesystem_,
+ std::move(schema_joinable_id_to_posting_list_mapper_path),
+ pre_mapping_fbv_,
+ /*max_num_entries=*/
+ kSchemaJoinableIdToPostingListMapperMaxNumEntries,
+ /*average_kv_byte_size=*/
+ kSchemaJoinableIdToPostingListMapperAverageKVByteSize));
+
+ // Discard and reinitialize flash_index_storage.
+ flash_index_storage_.reset();
+ if (!filesystem_.DeleteFile(
+ GetFlashIndexStorageFilePath(working_path_).c_str())) {
+ return absl_ports::InternalError("Fail to delete flash index storage file");
+ }
+ ICING_ASSIGN_OR_RETURN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path_),
+ &filesystem_, posting_list_serializer_.get()));
+ flash_index_storage_ =
+ std::make_unique<FlashIndexStorage>(std::move(flash_index_storage));
+
+ // TODO(b/268521214): clear delete propagation storage
+
+ info().num_data = 0;
+ info().last_added_document_id = kInvalidDocumentId;
+ return libtextclassifier3::Status::OK;
+}
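+
+// Note: Clear() discards and recreates the underlying storages on disk rather
+// than truncating them in place; the metadata buffer is kept and its info
+// section is reset above.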
+
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexImplV2>>
+QualifiedIdJoinIndexImplV2::InitializeNewFiles(const Filesystem& filesystem,
+ std::string&& working_path,
+ bool pre_mapping_fbv) {
+ // Create working directory.
+ if (!filesystem.CreateDirectoryRecursively(working_path.c_str())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to create directory: ", working_path));
+ }
+
+ // Initialize schema_joinable_id_to_posting_list_mapper
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<KeyMapper<PostingListIdentifier>>
+ schema_joinable_id_to_posting_list_mapper,
+ PersistentHashMapKeyMapper<PostingListIdentifier>::Create(
+ filesystem, GetSchemaJoinableIdToPostingListMapperPath(working_path),
+ pre_mapping_fbv,
+ /*max_num_entries=*/
+ kSchemaJoinableIdToPostingListMapperMaxNumEntries,
+ /*average_kv_byte_size=*/
+ kSchemaJoinableIdToPostingListMapperAverageKVByteSize));
+
+ // Initialize flash_index_storage
+ auto posting_list_serializer =
+ std::make_unique<PostingListJoinDataSerializer<JoinDataType>>();
+ ICING_ASSIGN_OR_RETURN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path),
+ &filesystem, posting_list_serializer.get()));
+
+ // Create instance.
+ auto new_join_index = std::unique_ptr<QualifiedIdJoinIndexImplV2>(
+ new QualifiedIdJoinIndexImplV2(
+ filesystem, std::move(working_path),
+ /*metadata_buffer=*/std::make_unique<uint8_t[]>(kMetadataFileSize),
+ std::move(schema_joinable_id_to_posting_list_mapper),
+ std::move(posting_list_serializer),
+ std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)),
+ pre_mapping_fbv));
+ // Initialize info content.
+ new_join_index->info().magic = Info::kMagic;
+ new_join_index->info().num_data = 0;
+ new_join_index->info().last_added_document_id = kInvalidDocumentId;
+ // Initialize new PersistentStorage. The initial checksums will be computed
+ // and set via InitializeNewStorage.
+ ICING_RETURN_IF_ERROR(new_join_index->InitializeNewStorage());
+
+ return new_join_index;
+}
+
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexImplV2>>
+QualifiedIdJoinIndexImplV2::InitializeExistingFiles(
+ const Filesystem& filesystem, std::string&& working_path,
+ bool pre_mapping_fbv) {
+ // PRead metadata file.
+ auto metadata_buffer = std::make_unique<uint8_t[]>(kMetadataFileSize);
+ if (!filesystem.PRead(GetMetadataFilePath(working_path).c_str(),
+ metadata_buffer.get(), kMetadataFileSize,
+ /*offset=*/0)) {
+ return absl_ports::InternalError("Fail to read metadata file");
+ }
+
+ // Initialize schema_joinable_id_to_posting_list_mapper
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<KeyMapper<PostingListIdentifier>>
+ schema_joinable_id_to_posting_list_mapper,
+ PersistentHashMapKeyMapper<PostingListIdentifier>::Create(
+ filesystem, GetSchemaJoinableIdToPostingListMapperPath(working_path),
+ pre_mapping_fbv,
+ /*max_num_entries=*/
+ kSchemaJoinableIdToPostingListMapperMaxNumEntries,
+ /*average_kv_byte_size=*/
+ kSchemaJoinableIdToPostingListMapperAverageKVByteSize));
+
+ // Initialize flash_index_storage
+ auto posting_list_serializer =
+ std::make_unique<PostingListJoinDataSerializer<JoinDataType>>();
+ ICING_ASSIGN_OR_RETURN(
+ FlashIndexStorage flash_index_storage,
+ FlashIndexStorage::Create(GetFlashIndexStorageFilePath(working_path),
+ &filesystem, posting_list_serializer.get()));
+
+ // Create instance.
+ auto join_index = std::unique_ptr<QualifiedIdJoinIndexImplV2>(
+ new QualifiedIdJoinIndexImplV2(
+ filesystem, std::move(working_path), std::move(metadata_buffer),
+ std::move(schema_joinable_id_to_posting_list_mapper),
+ std::move(posting_list_serializer),
+ std::make_unique<FlashIndexStorage>(std::move(flash_index_storage)),
+ pre_mapping_fbv));
+ // Initialize existing PersistentStorage. Checksums will be validated.
+ ICING_RETURN_IF_ERROR(join_index->InitializeExistingStorage());
+
+ // Validate magic.
+ if (join_index->info().magic != Info::kMagic) {
+ return absl_ports::FailedPreconditionError("Incorrect magic value");
+ }
+
+ return join_index;
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexImplV2::TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const std::vector<NamespaceId>& namespace_id_old_to_new,
+ QualifiedIdJoinIndexImplV2* new_index) const {
+ std::unique_ptr<KeyMapper<PostingListIdentifier>::Iterator> iter =
+ schema_joinable_id_to_posting_list_mapper_->GetIterator();
+
+  // Iterate through all (schema_type_id, joinable_property_id) keys.
+ while (iter->Advance()) {
+ PostingListIdentifier old_pl_id = iter->GetValue();
+ if (!old_pl_id.is_valid()) {
+ // Skip invalid posting list id.
+ continue;
+ }
+
+    // Read all join data from the old posting lists and convert them to new
+    // join data with new document ids and namespace ids.
+ std::vector<JoinDataType> new_join_data_vec;
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>
+ old_pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::CreateFromExisting(
+ flash_index_storage_.get(), posting_list_serializer_.get(),
+ old_pl_id));
+ ICING_ASSIGN_OR_RETURN(std::vector<JoinDataType> batch_old_join_data,
+ old_pl_accessor->GetNextDataBatch());
+ while (!batch_old_join_data.empty()) {
+ for (const JoinDataType& old_join_data : batch_old_join_data) {
+ DocumentId new_document_id = GetNewDocumentId(
+ document_id_old_to_new, old_join_data.document_id());
+ NamespaceId new_ref_namespace_id = GetNewNamespaceId(
+ namespace_id_old_to_new, old_join_data.join_info().namespace_id());
+
+ // Transfer if the document and namespace are not deleted or outdated.
+ if (new_document_id != kInvalidDocumentId &&
+ new_ref_namespace_id != kInvalidNamespaceId) {
+          // We can reuse the fingerprint from old_join_data, since the
+          // document URI (and therefore its fingerprint) never changes.
+ new_join_data_vec.push_back(JoinDataType(
+ new_document_id, NamespaceFingerprintIdentifier(
+ new_ref_namespace_id,
+ old_join_data.join_info().fingerprint())));
+ }
+ }
+ ICING_ASSIGN_OR_RETURN(batch_old_join_data,
+ old_pl_accessor->GetNextDataBatch());
+ }
+
+ if (new_join_data_vec.empty()) {
+ continue;
+ }
+
+ // NamespaceId order may change, so we have to sort the vector.
+ std::sort(new_join_data_vec.begin(), new_join_data_vec.end());
+
+ // Create new posting list in new_index and prepend all new join data into
+ // it.
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>>
+ new_pl_accessor,
+ PostingListJoinDataAccessor<JoinDataType>::Create(
+ new_index->flash_index_storage_.get(),
+ new_index->posting_list_serializer_.get()));
+ for (const JoinDataType& new_join_data : new_join_data_vec) {
+ ICING_RETURN_IF_ERROR(new_pl_accessor->PrependData(new_join_data));
+ }
+
+ // Finalize the posting list and update mapper of new_index.
+ PostingListAccessor::FinalizeResult result =
+ std::move(*new_pl_accessor).Finalize();
+ if (!result.status.ok()) {
+ return result.status;
+ }
+ if (!result.id.is_valid()) {
+ return absl_ports::InternalError(
+ "Fail to flush data into posting list(s)");
+ }
+ ICING_RETURN_IF_ERROR(
+ new_index->schema_joinable_id_to_posting_list_mapper_->Put(
+ iter->GetKey(), result.id));
+
+ // Update info.
+ new_index->info().num_data += new_join_data_vec.size();
+ }
+
+ // TODO(b/268521214): transfer delete propagation storage
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexImplV2::PersistMetadataToDisk(
+ bool force) {
+ if (!force && !is_info_dirty() && !is_storage_dirty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ std::string metadata_file_path = GetMetadataFilePath(working_path_);
+
+ ScopedFd sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ if (!sfd.is_valid()) {
+ return absl_ports::InternalError("Fail to open metadata file for write");
+ }
+
+ if (!filesystem_.PWrite(sfd.get(), /*offset=*/0, metadata_buffer_.get(),
+ kMetadataFileSize)) {
+ return absl_ports::InternalError("Fail to write metadata file");
+ }
+
+ if (!filesystem_.DataSync(sfd.get())) {
+ return absl_ports::InternalError("Fail to sync metadata to disk");
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexImplV2::PersistStoragesToDisk(
+ bool force) {
+ if (!force && !is_storage_dirty()) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ ICING_RETURN_IF_ERROR(
+ schema_joinable_id_to_posting_list_mapper_->PersistToDisk());
+ if (!flash_index_storage_->PersistToDisk()) {
+ return absl_ports::InternalError(
+ "Fail to persist FlashIndexStorage to disk");
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<Crc32>
+QualifiedIdJoinIndexImplV2::ComputeInfoChecksum(bool force) {
+ if (!force && !is_info_dirty()) {
+ return Crc32(crcs().component_crcs.info_crc);
+ }
+
+ return info().ComputeChecksum();
+}
+
+libtextclassifier3::StatusOr<Crc32>
+QualifiedIdJoinIndexImplV2::ComputeStoragesChecksum(bool force) {
+ if (!force && !is_storage_dirty()) {
+ return Crc32(crcs().component_crcs.storages_crc);
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ Crc32 schema_joinable_id_to_posting_list_mapper_crc,
+ schema_joinable_id_to_posting_list_mapper_->ComputeChecksum());
+
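+  // Note: only the key mapper contributes to the storages checksum;
+  // FlashIndexStorage contents are not included in this crc.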
+ return Crc32(schema_joinable_id_to_posting_list_mapper_crc.Get());
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/qualified-id-join-index-impl-v2.h b/icing/join/qualified-id-join-index-impl-v2.h
new file mode 100644
index 0000000..2b0bf3f
--- /dev/null
+++ b/icing/join/qualified-id-join-index-impl-v2.h
@@ -0,0 +1,369 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_
+#define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/file/posting_list/flash-index-storage.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/join/doc-join-info.h"
+#include "icing/join/document-id-to-join-info.h"
+#include "icing/join/posting-list-join-data-accessor.h"
+#include "icing/join/posting-list-join-data-serializer.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/key-mapper.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
+#include "icing/store/namespace-id.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// QualifiedIdJoinIndexImplV2: a class to maintain join data (DocumentId to
+// referenced NamespaceFingerprintIdentifier). It stores join data in posting
+// lists and bucketizes them by (schema_type_id, joinable_property_id).
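+//
+// Example usage (illustrative sketch; error handling elided):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+//       QualifiedIdJoinIndexImplV2::Create(
+//           filesystem, std::move(working_path), /*pre_mapping_fbv=*/false));
+//   ICING_RETURN_IF_ERROR(index->Put(schema_type_id, joinable_property_id,
+//                                    document_id, std::move(ref_ids)));
+//   ICING_RETURN_IF_ERROR(index->PersistToDisk());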
+class QualifiedIdJoinIndexImplV2 : public QualifiedIdJoinIndex {
+ public:
+ using JoinDataType = DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>;
+
+ class JoinDataIterator : public JoinDataIteratorBase {
+ public:
+ explicit JoinDataIterator(
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor)
+ : pl_accessor_(std::move(pl_accessor)),
+ should_retrieve_next_batch_(true) {}
+
+ ~JoinDataIterator() override = default;
+
+ // Advances to the next data.
+ //
+ // Returns:
+ // - OK on success
+ // - RESOURCE_EXHAUSTED_ERROR if reaching the end (i.e. no more relevant
+ // data)
+ // - Any other PostingListJoinDataAccessor errors
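+    //
+    // Typical loop (illustrative):
+    //   while (iter->Advance().ok()) { Process(iter->GetCurrent()); }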
+ libtextclassifier3::Status Advance() override;
+
+ const JoinDataType& GetCurrent() const override { return *curr_; }
+
+ private:
+    // Gets the next batch of data from the posting list chain, caches it in
+    // cached_batch_join_data_, and sets curr_ to the beginning of the cache.
+ libtextclassifier3::Status GetNextDataBatch();
+
+ std::unique_ptr<PostingListJoinDataAccessor<JoinDataType>> pl_accessor_;
+ std::vector<JoinDataType> cached_batch_join_data_;
+ std::vector<JoinDataType>::const_iterator curr_;
+ bool should_retrieve_next_batch_;
+ };
+
+ struct Info {
+ static constexpr int32_t kMagic = 0x12d1c074;
+
+ int32_t magic;
+ int32_t num_data;
+ DocumentId last_added_document_id;
+
+ Crc32 ComputeChecksum() const {
+ return Crc32(
+ std::string_view(reinterpret_cast<const char*>(this), sizeof(Info)));
+ }
+ } __attribute__((packed));
+ static_assert(sizeof(Info) == 12, "");
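+  // (Packed layout: 4-byte magic + 4-byte num_data + 4-byte
+  // last_added_document_id = 12 bytes.)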
+
+ // Metadata file layout: <Crcs><Info>
+ static constexpr int32_t kCrcsMetadataBufferOffset = 0;
+ static constexpr int32_t kInfoMetadataBufferOffset =
+ static_cast<int32_t>(sizeof(Crcs));
+ static constexpr int32_t kMetadataFileSize = sizeof(Crcs) + sizeof(Info);
+ static_assert(kMetadataFileSize == 24, "");
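+  // (sizeof(Crcs) is therefore 12 bytes, giving the 24-byte metadata file.)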
+
+ static constexpr WorkingPathType kWorkingPathType =
+ WorkingPathType::kDirectory;
+
+ // Creates a QualifiedIdJoinIndexImplV2 instance to store join data
+  // (DocumentId to referenced NamespaceFingerprintIdentifier) for future join
+  // searches. If any of the underlying files is missing, the whole
+  // working_path is deleted and (re)initialized with new files. Otherwise the
+  // instance is initialized from the existing files.
+ //
+ // filesystem: Object to make system level calls
+ // working_path: Specifies the working path for PersistentStorage.
+  //               QualifiedIdJoinIndexImplV2 uses the working path as its
+  //               working directory, and all related files are stored under
+  //               it. The instance takes full ownership of working_path_,
+  //               including creation/deletion. It is the caller's
+  //               responsibility to specify a correct working path and to
+  //               avoid mixing different persistent storages under the same
+  //               path. The caller also owns the parent directory of
+  //               working_path_ and is responsible for its creation and
+  //               deletion. See PersistentStorage for more details about the
+  //               concept of working_path.
+  // pre_mapping_fbv: flag indicating whether to memory-map the maximum
+  //                  possible file size for the underlying FileBackedVector
+  //                  before growing the actual file size.
+ //
+ // Returns:
+ // - FAILED_PRECONDITION_ERROR if the file checksum doesn't match the stored
+ // checksum
+ // - INTERNAL_ERROR on I/O errors
+ // - Any KeyMapper errors
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexImplV2>>
+ Create(const Filesystem& filesystem, std::string working_path,
+ bool pre_mapping_fbv);
+
+ // Delete copy and move constructor/assignment operator.
+ QualifiedIdJoinIndexImplV2(const QualifiedIdJoinIndexImplV2&) = delete;
+ QualifiedIdJoinIndexImplV2& operator=(const QualifiedIdJoinIndexImplV2&) =
+ delete;
+
+ QualifiedIdJoinIndexImplV2(QualifiedIdJoinIndexImplV2&&) = delete;
+ QualifiedIdJoinIndexImplV2& operator=(QualifiedIdJoinIndexImplV2&&) = delete;
+
+ ~QualifiedIdJoinIndexImplV2() override;
+
+ // v1 only API. Returns UNIMPLEMENTED_ERROR.
+ libtextclassifier3::Status Put(
+ const DocJoinInfo& doc_join_info,
+ std::string_view ref_qualified_id_str) override {
+ return absl_ports::UnimplementedError("This API is not supported in V2");
+ }
+
+ // v1 only API. Returns UNIMPLEMENTED_ERROR.
+ libtextclassifier3::StatusOr<std::string_view> Get(
+ const DocJoinInfo& doc_join_info) const override {
+ return absl_ports::UnimplementedError("This API is not supported in V2");
+ }
+
+ // Puts a list of referenced (parent) NamespaceFingerprintIdentifiers into
+ // the join index, given the (child) DocumentId, SchemaTypeId and
+ // JoinablePropertyId.
+ //
+ // Returns:
+ // - OK on success
+ // - INVALID_ARGUMENT_ERROR if schema_type_id, joinable_property_id, or
+ // document_id is invalid
+ // - Any KeyMapper/FlashIndexStorage errors
+ libtextclassifier3::Status Put(SchemaTypeId schema_type_id,
+ JoinablePropertyId joinable_property_id,
+ DocumentId document_id,
+ std::vector<NamespaceFingerprintIdentifier>&&
+ ref_namespace_fingerprint_ids) override;
+
+ // Returns a JoinDataIterator for iterating through all join data of the
+ // specified (schema_type_id, joinable_property_id).
+ //
+ // Returns:
+ // - On success: a JoinDataIterator
+ // - INVALID_ARGUMENT_ERROR if schema_type_id or joinable_property_id is
+ // invalid
+ // - Any KeyMapper/FlashIndexStorage errors
+ libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>>
+ GetIterator(SchemaTypeId schema_type_id,
+ JoinablePropertyId joinable_property_id) const override;
+
+  // Reduces internal file sizes by reclaiming the space and ids of deleted
+  // documents. The qualified id join index converts all entries to the new
+  // document ids and namespace ids.
+ //
+ // - document_id_old_to_new: a map for converting old document id to new
+ // document id.
+ // - namespace_id_old_to_new: a map for converting old namespace id to new
+ // namespace id.
+ // - new_last_added_document_id: will be used to update the last added
+ // document id in the qualified id join index.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error. This could potentially leave the index in
+ // an invalid state and the caller should handle it properly (e.g. discard
+ // and rebuild)
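+  //
+  // Example (illustrative): if documents {0, 1, 2, 3} existed and document 1
+  // was deleted, document_id_old_to_new would be {0, kInvalidDocumentId, 1,
+  // 2}; entries mapping to kInvalidDocumentId are dropped from the index.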
+ libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const std::vector<NamespaceId>& namespace_id_old_to_new,
+ DocumentId new_last_added_document_id) override;
+
+  // Clears all data and sets last_added_document_id to kInvalidDocumentId.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status Clear() override;
+
+ bool is_v2() const override { return true; }
+
+ int32_t size() const override { return info().num_data; }
+
+ bool empty() const override { return size() == 0; }
+
+ DocumentId last_added_document_id() const override {
+ return info().last_added_document_id;
+ }
+
+ void set_last_added_document_id(DocumentId document_id) override {
+ SetInfoDirty();
+
+ Info& info_ref = info();
+ if (info_ref.last_added_document_id == kInvalidDocumentId ||
+ document_id > info_ref.last_added_document_id) {
+ info_ref.last_added_document_id = document_id;
+ }
+ }
+
+ private:
+ explicit QualifiedIdJoinIndexImplV2(
+ const Filesystem& filesystem, std::string&& working_path,
+ std::unique_ptr<uint8_t[]> metadata_buffer,
+ std::unique_ptr<KeyMapper<PostingListIdentifier>>
+ schema_joinable_id_to_posting_list_mapper,
+ std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>>
+ posting_list_serializer,
+ std::unique_ptr<FlashIndexStorage> flash_index_storage,
+ bool pre_mapping_fbv)
+ : QualifiedIdJoinIndex(filesystem, std::move(working_path)),
+ metadata_buffer_(std::move(metadata_buffer)),
+ schema_joinable_id_to_posting_list_mapper_(
+ std::move(schema_joinable_id_to_posting_list_mapper)),
+ posting_list_serializer_(std::move(posting_list_serializer)),
+ flash_index_storage_(std::move(flash_index_storage)),
+ pre_mapping_fbv_(pre_mapping_fbv),
+ is_info_dirty_(false),
+ is_storage_dirty_(false) {}
+
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexImplV2>>
+ InitializeNewFiles(const Filesystem& filesystem, std::string&& working_path,
+ bool pre_mapping_fbv);
+
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexImplV2>>
+ InitializeExistingFiles(const Filesystem& filesystem,
+ std::string&& working_path, bool pre_mapping_fbv);
+
+  // Transfers qualified id join index data from the current index to
+  // new_index, converting entries to new document ids and namespace ids
+  // according to document_id_old_to_new and namespace_id_old_to_new. It is a
+  // helper function for Optimize.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status TransferIndex(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const std::vector<NamespaceId>& namespace_id_old_to_new,
+ QualifiedIdJoinIndexImplV2* new_index) const;
+
+ // Flushes contents of metadata file.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status PersistMetadataToDisk(bool force) override;
+
+ // Flushes contents of all storages to underlying files.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status PersistStoragesToDisk(bool force) override;
+
+ // Computes and returns Info checksum.
+ //
+ // Returns:
+ // - Crc of the Info on success
+ libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(bool force) override;
+
+ // Computes and returns all storages checksum.
+ //
+ // Returns:
+ // - Crc of all storages on success
+ // - INTERNAL_ERROR if any data inconsistency
+ libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
+ bool force) override;
+
+ Crcs& crcs() override {
+ return *reinterpret_cast<Crcs*>(metadata_buffer_.get() +
+ kCrcsMetadataBufferOffset);
+ }
+
+ const Crcs& crcs() const override {
+ return *reinterpret_cast<const Crcs*>(metadata_buffer_.get() +
+ kCrcsMetadataBufferOffset);
+ }
+
+ Info& info() {
+ return *reinterpret_cast<Info*>(metadata_buffer_.get() +
+ kInfoMetadataBufferOffset);
+ }
+
+ const Info& info() const {
+ return *reinterpret_cast<const Info*>(metadata_buffer_.get() +
+ kInfoMetadataBufferOffset);
+ }
+
+ void SetInfoDirty() { is_info_dirty_ = true; }
+  // When the storage is dirty, the info must be marked dirty as well, so
+  // SetDirty sets both flags.
+ void SetDirty() {
+ is_info_dirty_ = true;
+ is_storage_dirty_ = true;
+ }
+
+ bool is_info_dirty() const { return is_info_dirty_; }
+ bool is_storage_dirty() const { return is_storage_dirty_; }
+
+ // Metadata buffer
+ std::unique_ptr<uint8_t[]> metadata_buffer_;
+
+ // Persistent KeyMapper for mapping (schema_type_id, joinable_property_id) to
+ // PostingListIdentifier.
+ std::unique_ptr<KeyMapper<PostingListIdentifier>>
+ schema_joinable_id_to_posting_list_mapper_;
+
+ // Posting list related members. Use posting list to store join data
+ // (document id to referenced NamespaceFingerprintIdentifier).
+ std::unique_ptr<PostingListJoinDataSerializer<JoinDataType>>
+ posting_list_serializer_;
+ std::unique_ptr<FlashIndexStorage> flash_index_storage_;
+
+ // TODO(b/268521214): add delete propagation storage
+
+  // Flag indicating whether to memory-map the maximum possible file size for
+  // the underlying FileBackedVector before growing the actual file size.
+ bool pre_mapping_fbv_;
+
+ bool is_info_dirty_;
+ bool is_storage_dirty_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_IMPL_V2_H_
diff --git a/icing/join/qualified-id-join-index-impl-v2_test.cc b/icing/join/qualified-id-join-index-impl-v2_test.cc
new file mode 100644
index 0000000..d73d6c2
--- /dev/null
+++ b/icing/join/qualified-id-join-index-impl-v2_test.cc
@@ -0,0 +1,1414 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/qualified-id-join-index-impl-v2.h"
+
+#include <cstdint>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/file/posting_list/posting-list-identifier.h"
+#include "icing/join/document-id-to-join-info.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/key-mapper.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
+#include "icing/store/namespace-id.h"
+#include "icing/store/persistent-hash-map-key-mapper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::IsTrue;
+using ::testing::Lt;
+using ::testing::Ne;
+using ::testing::Not;
+using ::testing::Pointee;
+using ::testing::SizeIs;
+
+using Crcs = PersistentStorage::Crcs;
+using Info = QualifiedIdJoinIndexImplV2::Info;
+
+static constexpr int32_t kCorruptedValueOffset = 3;
+
+class QualifiedIdJoinIndexImplV2Test : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ base_dir_ = GetTestTempDir() + "/icing";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ working_path_ = base_dir_ + "/qualified_id_join_index_impl_v2_test";
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ std::string base_dir_;
+ std::string working_path_;
+};
+
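+// Drains the iterator returned by index.GetIterator() and returns all join
+// data in iteration order.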
+libtextclassifier3::StatusOr<
+ std::vector<QualifiedIdJoinIndexImplV2::JoinDataType>>
+GetJoinData(const QualifiedIdJoinIndexImplV2& index,
+ SchemaTypeId schema_type_id,
+ JoinablePropertyId joinable_property_id) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase> iter,
+ index.GetIterator(schema_type_id, joinable_property_id));
+
+ std::vector<QualifiedIdJoinIndexImplV2::JoinDataType> result;
+ while (iter->Advance().ok()) {
+ result.push_back(iter->GetCurrent());
+ }
+
+ return result;
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, InvalidWorkingPath) {
+ EXPECT_THAT(QualifiedIdJoinIndexImplV2::Create(
+ filesystem_, "/dev/null/qualified_id_join_index_impl_v2_test",
+ /*pre_mapping_fbv=*/false),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, InitializeNewFiles) {
+ {
+ // Create new qualified id join index
+ ASSERT_FALSE(filesystem_.DirectoryExists(working_path_.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+ EXPECT_THAT(index, Pointee(IsEmpty()));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ // Metadata file should be initialized correctly for both info and crcs
+ // sections.
+ const std::string metadata_file_path =
+ absl_ports::StrCat(working_path_, "/metadata");
+ auto metadata_buffer = std::make_unique<uint8_t[]>(
+ QualifiedIdJoinIndexImplV2::kMetadataFileSize);
+ ASSERT_THAT(
+ filesystem_.PRead(metadata_file_path.c_str(), metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV2::kMetadataFileSize,
+ /*offset=*/0),
+ IsTrue());
+
+ // Check info section
+ const Info* info = reinterpret_cast<const Info*>(
+ metadata_buffer.get() +
+ QualifiedIdJoinIndexImplV2::kInfoMetadataBufferOffset);
+ EXPECT_THAT(info->magic, Eq(Info::kMagic));
+ EXPECT_THAT(info->num_data, Eq(0));
+ EXPECT_THAT(info->last_added_document_id, Eq(kInvalidDocumentId));
+
+ // Check crcs section
+ const Crcs* crcs = reinterpret_cast<const Crcs*>(
+ metadata_buffer.get() +
+ QualifiedIdJoinIndexImplV2::kCrcsMetadataBufferOffset);
+  // The KeyMapper contains some initial data, so storages_crc should be
+  // non-zero.
+ EXPECT_THAT(crcs->component_crcs.storages_crc, Ne(0));
+ EXPECT_THAT(crcs->component_crcs.info_crc,
+ Eq(Crc32(std::string_view(reinterpret_cast<const char*>(info),
+ sizeof(Info)))
+ .Get()));
+ EXPECT_THAT(crcs->all_crc,
+ Eq(Crc32(std::string_view(
+ reinterpret_cast<const char*>(&crcs->component_crcs),
+ sizeof(Crcs::ComponentCrcs)))
+ .Get()));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test,
+ InitializationShouldFailWithoutPersistToDiskOrDestruction) {
+ NamespaceFingerprintIdentifier id1(/*namespace_id=*/1, /*fingerprint=*/12);
+ NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/34);
+ NamespaceFingerprintIdentifier id3(/*namespace_id=*/1, /*fingerprint=*/56);
+ NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/78);
+
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ // Insert some data.
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/{id2, id1}));
+ ICING_ASSERT_OK(index->PersistToDisk());
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/3, /*joinable_property_id=*/10, /*document_id=*/6,
+ /*ref_namespace_fingerprint_ids=*/{id3}));
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/12,
+ /*ref_namespace_fingerprint_ids=*/{id4}));
+
+ // Without calling PersistToDisk, checksums will not be recomputed or synced
+ // to disk, so initializing another instance on the same files should fail.
+ EXPECT_THAT(QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test,
+ InitializationShouldSucceedWithPersistToDisk) {
+ NamespaceFingerprintIdentifier id1(/*namespace_id=*/1, /*fingerprint=*/12);
+ NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/34);
+ NamespaceFingerprintIdentifier id3(/*namespace_id=*/1, /*fingerprint=*/56);
+ NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/78);
+
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index1,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ // Insert some data.
+ ICING_ASSERT_OK(index1->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/{id2, id1}));
+ ICING_ASSERT_OK(index1->Put(
+ /*schema_type_id=*/3, /*joinable_property_id=*/10, /*document_id=*/6,
+ /*ref_namespace_fingerprint_ids=*/{id3}));
+ ICING_ASSERT_OK(index1->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/12,
+ /*ref_namespace_fingerprint_ids=*/{id4}));
+ ASSERT_THAT(index1, Pointee(SizeIs(4)));
+
+ // After calling PersistToDisk, all checksums should be recomputed and synced
+ // correctly to disk, so initializing another instance on the same files
+ // should succeed, and we should be able to get the same contents.
+ ICING_EXPECT_OK(index1->PersistToDisk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index2,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+ EXPECT_THAT(index2, Pointee(SizeIs(4)));
+ EXPECT_THAT(
+ GetJoinData(*index2, /*schema_type_id=*/2, /*joinable_property_id=*/1),
+ IsOkAndHolds(
+ ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/12, /*join_info=*/id4),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, /*join_info=*/id2),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, /*join_info=*/id1))));
+ EXPECT_THAT(
+ GetJoinData(*index2, /*schema_type_id=*/3, /*joinable_property_id=*/10),
+ IsOkAndHolds(
+ ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/6, /*join_info=*/id3))));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test,
+ InitializationShouldSucceedAfterDestruction) {
+ NamespaceFingerprintIdentifier id1(/*namespace_id=*/1, /*fingerprint=*/12);
+ NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/34);
+ NamespaceFingerprintIdentifier id3(/*namespace_id=*/1, /*fingerprint=*/56);
+ NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/78);
+
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ // Insert some data.
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/{id2, id1}));
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/3, /*joinable_property_id=*/10, /*document_id=*/6,
+ /*ref_namespace_fingerprint_ids=*/{id3}));
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/12,
+ /*ref_namespace_fingerprint_ids=*/{id4}));
+ ASSERT_THAT(index, Pointee(SizeIs(4)));
+ }
+
+ {
+    // The previous instance went out of scope and was destroyed. Although we
+    // didn't call PersistToDisk explicitly, the destructor should invoke it,
+    // so initializing another instance on the same files should succeed and
+    // return the same contents.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+ EXPECT_THAT(index, Pointee(SizeIs(4)));
+ EXPECT_THAT(
+ GetJoinData(*index, /*schema_type_id=*/2, /*joinable_property_id=*/1),
+ IsOkAndHolds(
+ ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/12, /*join_info=*/id4),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, /*join_info=*/id2),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, /*join_info=*/id1))));
+ EXPECT_THAT(
+ GetJoinData(*index, /*schema_type_id=*/3, /*joinable_property_id=*/10),
+ IsOkAndHolds(
+ ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/6, /*join_info=*/id3))));
+ }
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test,
+ InitializeExistingFilesWithDifferentMagicShouldFail) {
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/
+ {NamespaceFingerprintIdentifier(/*namespace_id=*/1,
+ /*fingerprint=*/12)}));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ {
+ const std::string metadata_file_path =
+ absl_ports::StrCat(working_path_, "/metadata");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_THAT(metadata_sfd.is_valid(), IsTrue());
+
+ auto metadata_buffer = std::make_unique<uint8_t[]>(
+ QualifiedIdJoinIndexImplV2::kMetadataFileSize);
+ ASSERT_THAT(filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV2::kMetadataFileSize,
+ /*offset=*/0),
+ IsTrue());
+
+ // Manually change magic and update checksum
+ Crcs* crcs = reinterpret_cast<Crcs*>(
+ metadata_buffer.get() +
+ QualifiedIdJoinIndexImplV2::kCrcsMetadataBufferOffset);
+ Info* info = reinterpret_cast<Info*>(
+ metadata_buffer.get() +
+ QualifiedIdJoinIndexImplV2::kInfoMetadataBufferOffset);
+ info->magic += kCorruptedValueOffset;
+ crcs->component_crcs.info_crc = info->ComputeChecksum().Get();
+ crcs->all_crc = crcs->component_crcs.ComputeChecksum().Get();
+ ASSERT_THAT(filesystem_.PWrite(
+ metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV2::kMetadataFileSize),
+ IsTrue());
+ }
+
+ // Attempt to create the qualified id join index with different magic. This
+ // should fail.
+ EXPECT_THAT(QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Incorrect magic value")));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test,
+ InitializeExistingFilesWithWrongAllCrcShouldFail) {
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/
+ {NamespaceFingerprintIdentifier(/*namespace_id=*/1,
+ /*fingerprint=*/12)}));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ {
+ const std::string metadata_file_path =
+ absl_ports::StrCat(working_path_, "/metadata");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_THAT(metadata_sfd.is_valid(), IsTrue());
+
+ auto metadata_buffer = std::make_unique<uint8_t[]>(
+ QualifiedIdJoinIndexImplV2::kMetadataFileSize);
+ ASSERT_THAT(filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV2::kMetadataFileSize,
+ /*offset=*/0),
+ IsTrue());
+
+ // Manually corrupt all_crc
+ Crcs* crcs = reinterpret_cast<Crcs*>(
+ metadata_buffer.get() +
+ QualifiedIdJoinIndexImplV2::kCrcsMetadataBufferOffset);
+ crcs->all_crc += kCorruptedValueOffset;
+
+ ASSERT_THAT(filesystem_.PWrite(
+ metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV2::kMetadataFileSize),
+ IsTrue());
+ }
+
+ // Attempt to create the qualified id join index with metadata containing
+ // corrupted all_crc. This should fail.
+ EXPECT_THAT(QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Invalid all crc")));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test,
+ InitializeExistingFilesWithCorruptedInfoShouldFail) {
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/
+ {NamespaceFingerprintIdentifier(/*namespace_id=*/1,
+ /*fingerprint=*/12)}));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ {
+ const std::string metadata_file_path =
+ absl_ports::StrCat(working_path_, "/metadata");
+ ScopedFd metadata_sfd(filesystem_.OpenForWrite(metadata_file_path.c_str()));
+ ASSERT_THAT(metadata_sfd.is_valid(), IsTrue());
+
+ auto metadata_buffer = std::make_unique<uint8_t[]>(
+ QualifiedIdJoinIndexImplV2::kMetadataFileSize);
+ ASSERT_THAT(filesystem_.PRead(metadata_sfd.get(), metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV2::kMetadataFileSize,
+ /*offset=*/0),
+ IsTrue());
+
+    // Modify info without updating its checksum. This simulates corruption of
+    // the info section.
+ Info* info = reinterpret_cast<Info*>(
+ metadata_buffer.get() +
+ QualifiedIdJoinIndexImplV2::kInfoMetadataBufferOffset);
+ info->last_added_document_id += kCorruptedValueOffset;
+
+ ASSERT_THAT(filesystem_.PWrite(
+ metadata_sfd.get(), /*offset=*/0, metadata_buffer.get(),
+ QualifiedIdJoinIndexImplV2::kMetadataFileSize),
+ IsTrue());
+ }
+
+ // Attempt to create the qualified id join index with info that doesn't match
+ // its checksum. This should fail.
+ EXPECT_THAT(QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Invalid info crc")));
+}
+
+TEST_F(
+ QualifiedIdJoinIndexImplV2Test,
+ InitializeExistingFilesWithCorruptedSchemaJoinableIdToPostingListMapperShouldFail) {
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/
+ {NamespaceFingerprintIdentifier(/*namespace_id=*/1,
+ /*fingerprint=*/12)}));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ // Corrupt schema_joinable_id_to_posting_list_mapper manually.
+ {
+ std::string mapper_working_path = absl_ports::StrCat(
+ working_path_, "/schema_joinable_id_to_posting_list_mapper");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<PostingListIdentifier>> mapper,
+ PersistentHashMapKeyMapper<PostingListIdentifier>::Create(
+ filesystem_, std::move(mapper_working_path),
+ /*pre_mapping_fbv=*/false));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 old_crc, mapper->ComputeChecksum());
+ ICING_ASSERT_OK(mapper->Put("foo", PostingListIdentifier::kInvalid));
+ ICING_ASSERT_OK(mapper->PersistToDisk());
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 new_crc, mapper->ComputeChecksum());
+ ASSERT_THAT(old_crc, Not(Eq(new_crc)));
+ }
+
+  // Attempt to create the qualified id join index with a corrupted
+  // schema_joinable_id_to_posting_list_mapper. This should fail.
+ EXPECT_THAT(QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION,
+ HasSubstr("Invalid storages crc")));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, InvalidPut) {
+ NamespaceFingerprintIdentifier id(/*namespace_id=*/1, /*fingerprint=*/12);
+
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ EXPECT_THAT(
+ index->Put(/*schema_type_id=*/-1, /*joinable_property_id=*/1,
+ /*document_id=*/5, /*ref_namespace_fingerprint_ids=*/{id}),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ index->Put(/*schema_type_id=*/2, /*joinable_property_id=*/-1,
+ /*document_id=*/5, /*ref_namespace_fingerprint_ids=*/{id}),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(index->Put(/*schema_type_id=*/2, /*joinable_property_id=*/1,
+ /*document_id=*/kInvalidDocumentId,
+ /*ref_namespace_fingerprint_ids=*/{id}),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, InvalidGetIterator) {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ EXPECT_THAT(
+ index->GetIterator(/*schema_type_id=*/-1, /*joinable_property_id=*/1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ index->GetIterator(/*schema_type_id=*/2, /*joinable_property_id=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test,
+ PutEmptyRefNamespaceFingerprintIdsShouldReturnOk) {
+ SchemaTypeId schema_type_id = 2;
+ JoinablePropertyId joinable_property_id = 1;
+
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/{}),
+ IsOk());
+ EXPECT_THAT(index, Pointee(IsEmpty()));
+
+ EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id + 1, joinable_property_id),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id + 1),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test,
+ PutAndGetSingleSchemaTypeAndJoinableProperty) {
+ SchemaTypeId schema_type_id = 2;
+ JoinablePropertyId joinable_property_id = 1;
+
+ NamespaceFingerprintIdentifier id1(/*namespace_id=*/3, /*fingerprint=*/12);
+ NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/34);
+ NamespaceFingerprintIdentifier id3(/*namespace_id=*/2, /*fingerprint=*/56);
+ NamespaceFingerprintIdentifier id4(/*namespace_id=*/0, /*fingerprint=*/78);
+
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/{id2, id1}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/6,
+ /*ref_namespace_fingerprint_ids=*/{id3}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/12,
+ /*ref_namespace_fingerprint_ids=*/{id4}),
+ IsOk());
+ EXPECT_THAT(index, Pointee(SizeIs(4)));
+
+ EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/12, /*join_info=*/id4),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/6, /*join_info=*/id3),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, /*join_info=*/id1),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, /*join_info=*/id2))));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id + 1, joinable_property_id),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id + 1),
+ IsOkAndHolds(IsEmpty()));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ // Verify we can get all of them after destructing and re-initializing.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+ EXPECT_THAT(index, Pointee(SizeIs(4)));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/12, /*join_info=*/id4),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/6, /*join_info=*/id3),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, /*join_info=*/id1),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, /*join_info=*/id2))));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id + 1, joinable_property_id),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id + 1),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test,
+ PutAndGetMultipleSchemaTypesAndJoinableProperties) {
+ SchemaTypeId schema_type_id1 = 2;
+ SchemaTypeId schema_type_id2 = 4;
+
+ JoinablePropertyId joinable_property_id1 = 1;
+ JoinablePropertyId joinable_property_id2 = 10;
+
+ NamespaceFingerprintIdentifier id1(/*namespace_id=*/3, /*fingerprint=*/12);
+ NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/34);
+ NamespaceFingerprintIdentifier id3(/*namespace_id=*/2, /*fingerprint=*/56);
+ NamespaceFingerprintIdentifier id4(/*namespace_id=*/0, /*fingerprint=*/78);
+
+ {
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ EXPECT_THAT(
+ index->Put(schema_type_id1, joinable_property_id1, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/{id1}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id1, joinable_property_id2, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/{id2}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id2, joinable_property_id1, /*document_id=*/12,
+ /*ref_namespace_fingerprint_ids=*/{id3}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id2, joinable_property_id2, /*document_id=*/12,
+ /*ref_namespace_fingerprint_ids=*/{id4}),
+ IsOk());
+ EXPECT_THAT(index, Pointee(SizeIs(4)));
+
+ EXPECT_THAT(GetJoinData(*index, schema_type_id1, joinable_property_id1),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, /*join_info=*/id1))));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id1, joinable_property_id2),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, /*join_info=*/id2))));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id2, joinable_property_id1),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/12, /*join_info=*/id3))));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id2, joinable_property_id2),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/12, /*join_info=*/id4))));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ }
+
+ // Verify we can get all of them after destructing and re-initializing.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+ EXPECT_THAT(index, Pointee(SizeIs(4)));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id1, joinable_property_id1),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, /*join_info=*/id1))));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id1, joinable_property_id2),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/5, /*join_info=*/id2))));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id2, joinable_property_id1),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/12, /*join_info=*/id3))));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id2, joinable_property_id2),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/12, /*join_info=*/id4))));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, SetLastAddedDocumentId) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ constexpr DocumentId kDocumentId = 100;
+ index->set_last_added_document_id(kDocumentId);
+ EXPECT_THAT(index->last_added_document_id(), Eq(kDocumentId));
+
+ constexpr DocumentId kNextDocumentId = 123;
+ index->set_last_added_document_id(kNextDocumentId);
+ EXPECT_THAT(index->last_added_document_id(), Eq(kNextDocumentId));
+}
+
+TEST_F(
+ QualifiedIdJoinIndexImplV2Test,
+ SetLastAddedDocumentIdShouldIgnoreNewDocumentIdNotGreaterThanTheCurrent) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ constexpr DocumentId kDocumentId = 123;
+ index->set_last_added_document_id(kDocumentId);
+ ASSERT_THAT(index->last_added_document_id(), Eq(kDocumentId));
+
+ constexpr DocumentId kNextDocumentId = 100;
+ ASSERT_THAT(kNextDocumentId, Lt(kDocumentId));
+ index->set_last_added_document_id(kNextDocumentId);
+ // last_added_document_id() should remain unchanged.
+ EXPECT_THAT(index->last_added_document_id(), Eq(kDocumentId));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, Optimize) {
+ // General test for Optimize().
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ SchemaTypeId schema_type_id1 = 2;
+ SchemaTypeId schema_type_id2 = 5;
+
+ JoinablePropertyId joinable_property_id1 = 11;
+ JoinablePropertyId joinable_property_id2 = 15;
+
+ NamespaceFingerprintIdentifier id1(/*namespace_id=*/2, /*fingerprint=*/101);
+ NamespaceFingerprintIdentifier id2(/*namespace_id=*/3, /*fingerprint=*/102);
+ NamespaceFingerprintIdentifier id3(/*namespace_id=*/4, /*fingerprint=*/103);
+ NamespaceFingerprintIdentifier id4(/*namespace_id=*/0, /*fingerprint=*/104);
+ NamespaceFingerprintIdentifier id5(/*namespace_id=*/0, /*fingerprint=*/105);
+ NamespaceFingerprintIdentifier id6(/*namespace_id=*/1, /*fingerprint=*/106);
+ NamespaceFingerprintIdentifier id7(/*namespace_id=*/3, /*fingerprint=*/107);
+ NamespaceFingerprintIdentifier id8(/*namespace_id=*/2, /*fingerprint=*/108);
+
+ EXPECT_THAT(
+ index->Put(schema_type_id1, joinable_property_id1, /*document_id=*/3,
+ /*ref_namespace_fingerprint_ids=*/{id1, id2, id3}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id2, joinable_property_id2, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/{id4}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id2, joinable_property_id2, /*document_id=*/8,
+ /*ref_namespace_fingerprint_ids=*/{id5, id6}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id1, joinable_property_id1, /*document_id=*/13,
+ /*ref_namespace_fingerprint_ids=*/{id7}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id1, joinable_property_id1, /*document_id=*/21,
+ /*ref_namespace_fingerprint_ids=*/{id8}),
+ IsOk());
+ index->set_last_added_document_id(21);
+
+ ASSERT_THAT(index, Pointee(SizeIs(8)));
+
+ // Delete doc id = 5, 13, compress and keep the rest.
+ std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId);
+ document_id_old_to_new[3] = 0;
+ document_id_old_to_new[8] = 1;
+ document_id_old_to_new[21] = 2;
+
+ // Delete namespace id 1, 2 (and invalidate id1, id6, id8). Reorder namespace
+ // ids [0, 3, 4] to [1, 2, 0].
+ std::vector<NamespaceId> namespace_id_old_to_new(5, kInvalidNamespaceId);
+ namespace_id_old_to_new[0] = 1;
+ namespace_id_old_to_new[3] = 2;
+ namespace_id_old_to_new[4] = 0;
+
+ DocumentId new_last_added_document_id = 2;
+ EXPECT_THAT(index->Optimize(document_id_old_to_new, namespace_id_old_to_new,
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(index, Pointee(SizeIs(3)));
+ EXPECT_THAT(index->last_added_document_id(), Eq(new_last_added_document_id));
+
+ // Verify that the GetIterator API works normally after Optimize().
+ // 1) schema_type_id1, joinable_property_id1:
+ // - old_doc_id=21, old_ref_namespace_id=2: NOT FOUND
+ // - old_doc_id=13, old_ref_namespace_id=3: NOT FOUND
+ // - old_doc_id=3, old_ref_namespace_id=4:
+ // become new_doc_id=0, new_ref_namespace_id=0
+ // - old_doc_id=3, old_ref_namespace_id=3:
+ // become new_doc_id=0, new_ref_namespace_id=2
+ // - old_doc_id=3, old_ref_namespace_id=2: NOT FOUND
+ //
+ // For new_doc_id=0, entries should be reordered due to the posting list restriction.
+ EXPECT_THAT(
+ GetJoinData(*index, schema_type_id1, joinable_property_id1),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/2, /*fingerprint=*/102)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/0, /*fingerprint=*/103)))));
+
+ // 2) schema_type_id2, joinable_property_id2:
+ // - old_doc_id=8, old_ref_namespace_id=1: NOT FOUND
+ // - old_doc_id=8, old_ref_namespace_id=0:
+ // become new_doc_id=1, new_ref_namespace_id=1
+ // - old_doc_id=5, old_ref_namespace_id=0: NOT FOUND
+ EXPECT_THAT(
+ GetJoinData(*index, schema_type_id2, joinable_property_id2),
+ IsOkAndHolds(
+ ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/105)))));
+
+ // Verify that the Put API works normally after Optimize().
+ NamespaceFingerprintIdentifier id9(/*namespace_id=*/1, /*fingerprint=*/109);
+ EXPECT_THAT(
+ index->Put(schema_type_id1, joinable_property_id1, /*document_id=*/99,
+ /*ref_namespace_fingerprint_ids=*/{id9}),
+ IsOk());
+ index->set_last_added_document_id(99);
+
+ EXPECT_THAT(index, Pointee(SizeIs(4)));
+ EXPECT_THAT(index->last_added_document_id(), Eq(99));
+ EXPECT_THAT(
+ GetJoinData(*index, schema_type_id1, joinable_property_id1),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/99, /*join_info=*/id9),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/2, /*fingerprint=*/102)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/0, /*fingerprint=*/103)))));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeDocumentIdChange) {
+ // Specific test for Optimize(): document id compaction.
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ SchemaTypeId schema_type_id = 2;
+ JoinablePropertyId joinable_property_id = 1;
+
+ NamespaceFingerprintIdentifier id1(/*namespace_id=*/1, /*fingerprint=*/101);
+ NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/102);
+ NamespaceFingerprintIdentifier id3(/*namespace_id=*/1, /*fingerprint=*/103);
+ NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/104);
+ NamespaceFingerprintIdentifier id5(/*namespace_id=*/1, /*fingerprint=*/105);
+ NamespaceFingerprintIdentifier id6(/*namespace_id=*/1, /*fingerprint=*/106);
+
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/3,
+ /*ref_namespace_fingerprint_ids=*/{id1, id2}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/{id3}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/8,
+ /*ref_namespace_fingerprint_ids=*/{id4}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/13,
+ /*ref_namespace_fingerprint_ids=*/{id5}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/21,
+ /*ref_namespace_fingerprint_ids=*/{id6}),
+ IsOk());
+ index->set_last_added_document_id(21);
+
+ ASSERT_THAT(index, Pointee(SizeIs(6)));
+
+ // Delete doc id = 5, 8, compress and keep the rest.
+ std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId);
+ document_id_old_to_new[3] = 0;
+ document_id_old_to_new[13] = 1;
+ document_id_old_to_new[21] = 2;
+
+ // No change for namespace id.
+ std::vector<NamespaceId> namespace_id_old_to_new = {0, 1};
+
+ DocumentId new_last_added_document_id = 2;
+ EXPECT_THAT(index->Optimize(document_id_old_to_new, namespace_id_old_to_new,
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(index, Pointee(SizeIs(4)));
+ EXPECT_THAT(index->last_added_document_id(), Eq(new_last_added_document_id));
+
+ // Verify that the GetIterator API works normally after Optimize().
+ // - old_doc_id=21, join_info=id6: become doc_id=2, join_info=id6
+ // - old_doc_id=13, join_info=id5: become doc_id=1, join_info=id5
+ // - old_doc_id=8, join_info=id4: NOT FOUND
+ // - old_doc_id=5, join_info=id3: NOT FOUND
+ // - old_doc_id=3, join_info=id2: become doc_id=0, join_info=id2
+ // - old_doc_id=3, join_info=id1: become doc_id=0, join_info=id1
+ EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/2, /*join_info=*/id6),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1, /*join_info=*/id5),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, /*join_info=*/id2),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, /*join_info=*/id1))));
+
+ // Verify that the Put API works normally after Optimize().
+ NamespaceFingerprintIdentifier id7(/*namespace_id=*/1, /*fingerprint=*/107);
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/99,
+ /*ref_namespace_fingerprint_ids=*/{id7}),
+ IsOk());
+ index->set_last_added_document_id(99);
+
+ EXPECT_THAT(index, Pointee(SizeIs(5)));
+ EXPECT_THAT(index->last_added_document_id(), Eq(99));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/99, /*join_info=*/id7),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/2, /*join_info=*/id6),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1, /*join_info=*/id5),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, /*join_info=*/id2),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, /*join_info=*/id1))));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeOutOfRangeDocumentId) {
+ // Specific test for Optimize() for out of range document id.
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ SchemaTypeId schema_type_id = 2;
+ JoinablePropertyId joinable_property_id = 1;
+ NamespaceFingerprintIdentifier id(/*namespace_id=*/1, /*fingerprint=*/101);
+
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/99,
+ /*ref_namespace_fingerprint_ids=*/{id}),
+ IsOk());
+ index->set_last_added_document_id(99);
+
+ // Create document_id_old_to_new with size = 1. Optimize should handle out of
+ // range DocumentId properly.
+ std::vector<DocumentId> document_id_old_to_new = {kInvalidDocumentId};
+ std::vector<NamespaceId> namespace_id_old_to_new = {0, 1};
+
+ // There shouldn't be any error caused by out-of-range vector access.
+ EXPECT_THAT(
+ index->Optimize(document_id_old_to_new, namespace_id_old_to_new,
+ /*new_last_added_document_id=*/kInvalidDocumentId),
+ IsOk());
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ // Verify all data are discarded after Optimize().
+ EXPECT_THAT(index, Pointee(IsEmpty()));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeDeleteAllDocuments) {
+ // Specific test for Optimize(): delete all document ids.
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ SchemaTypeId schema_type_id = 2;
+ JoinablePropertyId joinable_property_id = 1;
+
+ NamespaceFingerprintIdentifier id1(/*namespace_id=*/1, /*fingerprint=*/101);
+ NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/102);
+ NamespaceFingerprintIdentifier id3(/*namespace_id=*/1, /*fingerprint=*/103);
+ NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/104);
+ NamespaceFingerprintIdentifier id5(/*namespace_id=*/1, /*fingerprint=*/105);
+ NamespaceFingerprintIdentifier id6(/*namespace_id=*/1, /*fingerprint=*/106);
+
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/3,
+ /*ref_namespace_fingerprint_ids=*/{id1, id2}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/{id3}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/8,
+ /*ref_namespace_fingerprint_ids=*/{id4}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/13,
+ /*ref_namespace_fingerprint_ids=*/{id5}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/21,
+ /*ref_namespace_fingerprint_ids=*/{id6}),
+ IsOk());
+ index->set_last_added_document_id(21);
+
+ ASSERT_THAT(index, Pointee(SizeIs(6)));
+
+ // Delete all documents.
+ std::vector<DocumentId> document_id_old_to_new(22, kInvalidDocumentId);
+
+ // No change for namespace id.
+ std::vector<NamespaceId> namespace_id_old_to_new = {0, 1};
+
+ EXPECT_THAT(
+ index->Optimize(document_id_old_to_new, namespace_id_old_to_new,
+ /*new_last_added_document_id=*/kInvalidDocumentId),
+ IsOk());
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ // Verify all data are discarded after Optimize().
+ EXPECT_THAT(index, Pointee(IsEmpty()));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeNamespaceIdChange) {
+ // Specific test for Optimize(): referenced namespace id compaction.
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ SchemaTypeId schema_type_id = 2;
+ JoinablePropertyId joinable_property_id = 1;
+
+ NamespaceFingerprintIdentifier id1(/*namespace_id=*/3, /*fingerprint=*/101);
+ NamespaceFingerprintIdentifier id2(/*namespace_id=*/5, /*fingerprint=*/102);
+ NamespaceFingerprintIdentifier id3(/*namespace_id=*/4, /*fingerprint=*/103);
+ NamespaceFingerprintIdentifier id4(/*namespace_id=*/0, /*fingerprint=*/104);
+ NamespaceFingerprintIdentifier id5(/*namespace_id=*/2, /*fingerprint=*/105);
+ NamespaceFingerprintIdentifier id6(/*namespace_id=*/1, /*fingerprint=*/106);
+
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/2,
+ /*ref_namespace_fingerprint_ids=*/{id1}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/3,
+ /*ref_namespace_fingerprint_ids=*/{id2}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/{id3}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/8,
+ /*ref_namespace_fingerprint_ids=*/{id4}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/13,
+ /*ref_namespace_fingerprint_ids=*/{id5}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/21,
+ /*ref_namespace_fingerprint_ids=*/{id6}),
+ IsOk());
+ index->set_last_added_document_id(21);
+
+ ASSERT_THAT(index, Pointee(SizeIs(6)));
+
+ // No change for document id.
+ std::vector<DocumentId> document_id_old_to_new(22);
+ std::iota(document_id_old_to_new.begin(), document_id_old_to_new.end(), 0);
+
+ // Delete namespace id 2, 4. Reorder namespace id [0, 1, 3, 5] to [2, 3, 1,
+ // 0].
+ std::vector<NamespaceId> namespace_id_old_to_new(6, kInvalidNamespaceId);
+ namespace_id_old_to_new[0] = 2;
+ namespace_id_old_to_new[1] = 3;
+ namespace_id_old_to_new[3] = 1;
+ namespace_id_old_to_new[5] = 0;
+
+ DocumentId new_last_added_document_id = 21;
+ EXPECT_THAT(index->Optimize(document_id_old_to_new, namespace_id_old_to_new,
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(index, Pointee(SizeIs(4)));
+ EXPECT_THAT(index->last_added_document_id(), Eq(new_last_added_document_id));
+
+ // Verify that the GetIterator API works normally after Optimize().
+ // - id6 (old_namespace_id=1): new_namespace_id=3 (document_id = 21)
+ // - id5 (old_namespace_id=2): NOT FOUND
+ // - id4 (old_namespace_id=0): new_namespace_id=2 (document_id = 8)
+ // - id3 (old_namespace_id=4): NOT FOUND
+ // - id2 (old_namespace_id=5): new_namespace_id=0 (document_id = 3)
+ // - id1 (old_namespace_id=3): new_namespace_id=1 (document_id = 2)
+ EXPECT_THAT(
+ GetJoinData(*index, schema_type_id, joinable_property_id),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/21, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/3, /*fingerprint=*/106)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/8, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/2, /*fingerprint=*/104)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/3, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/0, /*fingerprint=*/102)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/2, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/101)))));
+
+ // Verify that the Put API works normally after Optimize().
+ NamespaceFingerprintIdentifier id7(/*namespace_id=*/1, /*fingerprint=*/107);
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/99,
+ /*ref_namespace_fingerprint_ids=*/{id7}),
+ IsOk());
+ index->set_last_added_document_id(99);
+
+ EXPECT_THAT(index, Pointee(SizeIs(5)));
+ EXPECT_THAT(index->last_added_document_id(), Eq(99));
+ EXPECT_THAT(
+ GetJoinData(*index, schema_type_id, joinable_property_id),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/99, /*join_info=*/id7),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/21, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/3, /*fingerprint=*/106)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/8, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/2, /*fingerprint=*/104)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/3, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/0, /*fingerprint=*/102)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/2, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/101)))));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeNamespaceIdChangeShouldReorder) {
+ // Specific test for Optimize(): referenced namespace id reorder.
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ SchemaTypeId schema_type_id = 2;
+ JoinablePropertyId joinable_property_id = 1;
+
+ NamespaceFingerprintIdentifier id1(/*namespace_id=*/0, /*fingerprint=*/101);
+ NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/102);
+ NamespaceFingerprintIdentifier id3(/*namespace_id=*/2, /*fingerprint=*/103);
+ NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/104);
+
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/0,
+ /*ref_namespace_fingerprint_ids=*/{id1, id2, id3}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/1,
+ /*ref_namespace_fingerprint_ids=*/{id4}),
+ IsOk());
+ index->set_last_added_document_id(1);
+
+ ASSERT_THAT(index, Pointee(SizeIs(4)));
+
+ // No change for document id.
+ std::vector<DocumentId> document_id_old_to_new = {0, 1};
+
+ // Reorder namespace id [0, 1, 2] to [2, 0, 1].
+ std::vector<NamespaceId> namespace_id_old_to_new = {2, 0, 1};
+
+ DocumentId new_last_added_document_id = 1;
+ EXPECT_THAT(index->Optimize(document_id_old_to_new, namespace_id_old_to_new,
+ new_last_added_document_id),
+ IsOk());
+ EXPECT_THAT(index, Pointee(SizeIs(4)));
+ EXPECT_THAT(index->last_added_document_id(), Eq(new_last_added_document_id));
+
+ // Verify that the GetIterator API works normally after Optimize().
+ // - id4 (old_namespace_id=1): new_namespace_id=0 (document_id = 1)
+ // - id3 (old_namespace_id=2): new_namespace_id=1 (document_id = 0)
+ // - id2 (old_namespace_id=1): new_namespace_id=0 (document_id = 0)
+ // - id1 (old_namespace_id=0): new_namespace_id=2 (document_id = 0)
+ //
+ // Entries should be reordered to [id4, id1, id3, id2] due to the posting list restriction.
+ EXPECT_THAT(
+ GetJoinData(*index, schema_type_id, joinable_property_id),
+ IsOkAndHolds(ElementsAre(
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/1, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/0, /*fingerprint=*/104)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/2, /*fingerprint=*/101)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/1, /*fingerprint=*/103)),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/0, /*join_info=*/NamespaceFingerprintIdentifier(
+ /*namespace_id=*/0, /*fingerprint=*/102)))));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeOutOfRangeNamespaceId) {
+ // Specific test for Optimize(): out of range referenced namespace id.
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ SchemaTypeId schema_type_id = 2;
+ JoinablePropertyId joinable_property_id = 1;
+ NamespaceFingerprintIdentifier id(/*namespace_id=*/99, /*fingerprint=*/101);
+
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/0,
+ /*ref_namespace_fingerprint_ids=*/{id}),
+ IsOk());
+ index->set_last_added_document_id(0);
+
+ // Create namespace_id_old_to_new with size = 1. Optimize should handle out of
+ // range NamespaceId properly.
+ std::vector<DocumentId> document_id_old_to_new = {0};
+ std::vector<NamespaceId> namespace_id_old_to_new = {kInvalidNamespaceId};
+
+ // There shouldn't be any error caused by out-of-range vector access.
+ EXPECT_THAT(
+ index->Optimize(document_id_old_to_new, namespace_id_old_to_new,
+ /*new_last_added_document_id=*/kInvalidDocumentId),
+ IsOk());
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ // Verify all data are discarded after Optimize().
+ EXPECT_THAT(index, Pointee(IsEmpty()));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, OptimizeDeleteAllNamespaces) {
+ // Specific test for Optimize(): delete all referenced namespace ids.
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+
+ SchemaTypeId schema_type_id = 2;
+ JoinablePropertyId joinable_property_id = 1;
+
+ NamespaceFingerprintIdentifier id1(/*namespace_id=*/0, /*fingerprint=*/101);
+ NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/102);
+ NamespaceFingerprintIdentifier id3(/*namespace_id=*/2, /*fingerprint=*/103);
+
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/0,
+ /*ref_namespace_fingerprint_ids=*/{id1}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/1,
+ /*ref_namespace_fingerprint_ids=*/{id2}),
+ IsOk());
+ EXPECT_THAT(
+ index->Put(schema_type_id, joinable_property_id, /*document_id=*/2,
+ /*ref_namespace_fingerprint_ids=*/{id3}),
+ IsOk());
+ index->set_last_added_document_id(3);
+
+ ASSERT_THAT(index, Pointee(SizeIs(3)));
+
+ // No change for document id.
+ std::vector<DocumentId> document_id_old_to_new = {0, 1, 2};
+
+ // Delete all namespaces.
+ std::vector<NamespaceId> namespace_id_old_to_new(3, kInvalidNamespaceId);
+
+ EXPECT_THAT(
+ index->Optimize(document_id_old_to_new, namespace_id_old_to_new,
+ /*new_last_added_document_id=*/kInvalidDocumentId),
+ IsOk());
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+
+ // Verify all data are discarded after Optimize().
+ EXPECT_THAT(index, Pointee(IsEmpty()));
+ EXPECT_THAT(GetJoinData(*index, schema_type_id, joinable_property_id),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(QualifiedIdJoinIndexImplV2Test, Clear) {
+ NamespaceFingerprintIdentifier id1(/*namespace_id=*/1, /*fingerprint=*/12);
+ NamespaceFingerprintIdentifier id2(/*namespace_id=*/1, /*fingerprint=*/34);
+ NamespaceFingerprintIdentifier id3(/*namespace_id=*/1, /*fingerprint=*/56);
+ NamespaceFingerprintIdentifier id4(/*namespace_id=*/1, /*fingerprint=*/78);
+
+ // Create new qualified id join index
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> index,
+ QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+ // Insert some data.
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/5,
+ /*ref_namespace_fingerprint_ids=*/{id2, id1}));
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/3, /*joinable_property_id=*/10, /*document_id=*/6,
+ /*ref_namespace_fingerprint_ids=*/{id3}));
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/12,
+ /*ref_namespace_fingerprint_ids=*/{id4}));
+ ASSERT_THAT(index, Pointee(SizeIs(4)));
+ index->set_last_added_document_id(12);
+ ASSERT_THAT(index->last_added_document_id(), Eq(12));
+
+ // After Clear(), last_added_document_id should be set to kInvalidDocumentId,
+ // and the previously added data should be deleted.
+ EXPECT_THAT(index->Clear(), IsOk());
+ EXPECT_THAT(index, Pointee(IsEmpty()));
+ EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
+ EXPECT_THAT(
+ GetJoinData(*index, /*schema_type_id=*/2, /*joinable_property_id=*/1),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(
+ GetJoinData(*index, /*schema_type_id=*/3, /*joinable_property_id=*/10),
+ IsOkAndHolds(IsEmpty()));
+
+ // The join index should work normally after Clear().
+ ICING_ASSERT_OK(index->Put(
+ /*schema_type_id=*/2, /*joinable_property_id=*/1, /*document_id=*/20,
+ /*ref_namespace_fingerprint_ids=*/{id4, id2, id1, id3}));
+ index->set_last_added_document_id(20);
+
+ EXPECT_THAT(index, Pointee(SizeIs(4)));
+ EXPECT_THAT(index->last_added_document_id(), Eq(20));
+ EXPECT_THAT(
+ GetJoinData(*index, /*schema_type_id=*/2, /*joinable_property_id=*/1),
+ IsOkAndHolds(
+ ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/20, /*join_info=*/id4),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/20, /*join_info=*/id3),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/20, /*join_info=*/id2),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/20, /*join_info=*/id1))));
+
+ ICING_ASSERT_OK(index->PersistToDisk());
+ index.reset();
+
+ // Verify index after reconstructing.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index, QualifiedIdJoinIndexImplV2::Create(filesystem_, working_path_,
+ /*pre_mapping_fbv=*/false));
+ EXPECT_THAT(index->last_added_document_id(), Eq(20));
+ EXPECT_THAT(
+ GetJoinData(*index, /*schema_type_id=*/2, /*joinable_property_id=*/1),
+ IsOkAndHolds(
+ ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/20, /*join_info=*/id4),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/20, /*join_info=*/id3),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/20, /*join_info=*/id2),
+ DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/20, /*join_info=*/id1))));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/qualified-id-join-index.h b/icing/join/qualified-id-join-index.h
new file mode 100644
index 0000000..4e487f9
--- /dev/null
+++ b/icing/join/qualified-id-join-index.h
@@ -0,0 +1,187 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_H_
+#define ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/persistent-storage.h"
+#include "icing/join/doc-join-info.h"
+#include "icing/join/document-id-to-join-info.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
+#include "icing/store/namespace-id.h"
+#include "icing/util/crc32.h"
+
+namespace icing {
+namespace lib {
+
+// QualifiedIdJoinIndex: an abstract class to maintain data for qualified id
+// joining.
+class QualifiedIdJoinIndex : public PersistentStorage {
+ public:
+ class JoinDataIteratorBase {
+ public:
+ virtual ~JoinDataIteratorBase() = default;
+
+ virtual libtextclassifier3::Status Advance() = 0;
+
+ virtual const DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>&
+ GetCurrent() const = 0;
+ };
+
+ static constexpr WorkingPathType kWorkingPathType =
+ WorkingPathType::kDirectory;
+
+ // Deletes QualifiedIdJoinIndex under working_path.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ static libtextclassifier3::Status Discard(const Filesystem& filesystem,
+ const std::string& working_path) {
+ return PersistentStorage::Discard(filesystem, working_path,
+ kWorkingPathType);
+ }
+
+ virtual ~QualifiedIdJoinIndex() override = default;
+
+ // (v1 only) Puts new data into the index: DocJoinInfo (DocumentId,
+ // JoinablePropertyId) referencing ref_qualified_id_str (the identifier of
+ // another document).
+ //
+ // REQUIRES: ref_qualified_id_str contains no '\0'.
+ //
+ // Returns:
+ // - OK on success
+ // - INVALID_ARGUMENT_ERROR if doc_join_info is invalid
+ // - Any KeyMapper errors
+ virtual libtextclassifier3::Status Put(
+ const DocJoinInfo& doc_join_info,
+ std::string_view ref_qualified_id_str) = 0;
+
+ // (v2 only) Puts a list of referenced NamespaceFingerprintIdentifiers into
+ // the index, given the DocumentId, SchemaTypeId and JoinablePropertyId.
+ //
+ // Returns:
+ // - OK on success
+ // - INVALID_ARGUMENT_ERROR if schema_type_id, joinable_property_id, or
+ // document_id is invalid
+ // - Any KeyMapper/FlashIndexStorage errors
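+ //
+ // A minimal usage sketch (hypothetical caller code; the ids are
+ // illustrative):
+ //
+ //   NamespaceFingerprintIdentifier ref_id(/*namespace_id=*/1,
+ //                                         /*fingerprint=*/101);
+ //   ICING_RETURN_IF_ERROR(index->Put(schema_type_id, joinable_property_id,
+ //                                    document_id,
+ //                                    /*ref_namespace_fingerprint_ids=*/
+ //                                    {ref_id}));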
+ virtual libtextclassifier3::Status Put(
+ SchemaTypeId schema_type_id, JoinablePropertyId joinable_property_id,
+ DocumentId document_id,
+ std::vector<NamespaceFingerprintIdentifier>&&
+ ref_namespace_fingerprint_ids) = 0;
+
+ // (v1 only) Gets the referenced document's qualified id string by
+ // DocJoinInfo.
+ //
+ // Returns:
+ // - A qualified id string referenced by the given DocJoinInfo (DocumentId,
+ // JoinablePropertyId) on success
+ // - INVALID_ARGUMENT_ERROR if doc_join_info is invalid
+ // - NOT_FOUND_ERROR if doc_join_info doesn't exist
+ // - Any KeyMapper errors
+ virtual libtextclassifier3::StatusOr<std::string_view> Get(
+ const DocJoinInfo& doc_join_info) const = 0;
+
+ // (v2 only) Returns a JoinDataIterator for iterating through all join data of
+ // the specified (schema_type_id, joinable_property_id).
+ //
+ // Returns:
+ // - On success: a JoinDataIterator
+ // - INVALID_ARGUMENT_ERROR if schema_type_id or joinable_property_id is
+ // invalid
+ // - Any KeyMapper/FlashIndexStorage errors
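+ //
+ // A minimal iteration sketch (hypothetical caller code; it assumes that
+ // Advance() returns a non-OK status once the join data is exhausted):
+ //
+ //   ICING_ASSIGN_OR_RETURN(
+ //       std::unique_ptr<JoinDataIteratorBase> itr,
+ //       index->GetIterator(schema_type_id, joinable_property_id));
+ //   while (itr->Advance().ok()) {
+ //     // Consume the document id and join info of itr->GetCurrent() here.
+ //   }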
+ virtual libtextclassifier3::StatusOr<std::unique_ptr<JoinDataIteratorBase>>
+ GetIterator(SchemaTypeId schema_type_id,
+ JoinablePropertyId joinable_property_id) const = 0;
+
+ // Reduces internal file sizes by reclaiming the space and ids of deleted
+ // documents. The qualified id join index will remap all entries to the new
+ // document ids.
+ //
+ // - document_id_old_to_new: a map for converting old document ids to new
+ //   document ids.
+ // - namespace_id_old_to_new: a map for converting old namespace ids to new
+ //   namespace ids.
+ // - new_last_added_document_id: will be used to update the last added
+ //   document id in the qualified id join index.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error. This could potentially leave the index in
+ // an invalid state and the caller should handle it properly (e.g. discard
+ // and rebuild)
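+ //
+ // A minimal sketch of building the maps (hypothetical caller code; the
+ // surviving ids are illustrative). Deleting document 1 out of documents
+ // [0, 1, 2] compacts the ids of the survivors:
+ //
+ //   std::vector<DocumentId> document_id_old_to_new = {0, kInvalidDocumentId,
+ //                                                     1};
+ //   // Namespace ids are unchanged in this sketch.
+ //   std::vector<NamespaceId> namespace_id_old_to_new = {0, 1};
+ //   ICING_RETURN_IF_ERROR(
+ //       index->Optimize(document_id_old_to_new, namespace_id_old_to_new,
+ //                       /*new_last_added_document_id=*/1));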
+ virtual libtextclassifier3::Status Optimize(
+ const std::vector<DocumentId>& document_id_old_to_new,
+ const std::vector<NamespaceId>& namespace_id_old_to_new,
+ DocumentId new_last_added_document_id) = 0;
+
+ // Clears all data and sets last_added_document_id to kInvalidDocumentId.
+ //
+ // Returns:
+ // - OK on success
+ // - INTERNAL_ERROR on I/O error
+ virtual libtextclassifier3::Status Clear() = 0;
+
+ virtual bool is_v2() const = 0;
+
+ virtual int32_t size() const = 0;
+
+ virtual bool empty() const = 0;
+
+ virtual DocumentId last_added_document_id() const = 0;
+
+ virtual void set_last_added_document_id(DocumentId document_id) = 0;
+
+ protected:
+ explicit QualifiedIdJoinIndex(const Filesystem& filesystem,
+ std::string&& working_path)
+ : PersistentStorage(filesystem, std::move(working_path),
+ kWorkingPathType) {}
+
+ virtual libtextclassifier3::Status PersistStoragesToDisk(
+ bool force) override = 0;
+
+ virtual libtextclassifier3::Status PersistMetadataToDisk(
+ bool force) override = 0;
+
+ virtual libtextclassifier3::StatusOr<Crc32> ComputeInfoChecksum(
+ bool force) override = 0;
+
+ virtual libtextclassifier3::StatusOr<Crc32> ComputeStoragesChecksum(
+ bool force) override = 0;
+
+ virtual Crcs& crcs() override = 0;
+ virtual const Crcs& crcs() const override = 0;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JOIN_QUALIFIED_ID_JOIN_INDEX_H_
diff --git a/icing/join/qualified-id-join-indexing-handler-v1_test.cc b/icing/join/qualified-id-join-indexing-handler-v1_test.cc
new file mode 100644
index 0000000..9700132
--- /dev/null
+++ b/icing/join/qualified-id-join-indexing-handler-v1_test.cc
@@ -0,0 +1,558 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/join/qualified-id-join-index-impl-v1.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/join/qualified-id-join-indexing-handler.h"
+#include "icing/join/qualified-id.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/tokenized-document.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::IsTrue;
+
+// Schema type for referenced documents: ReferencedType
+static constexpr std::string_view kReferencedType = "ReferencedType";
+static constexpr std::string_view kPropertyName = "name";
+
+// Joinable properties and joinable property ids. Joinable property ids are
+// determined by the lexicographical order of the joinable property paths.
+// Schema type with joinable property: FakeType
+static constexpr std::string_view kFakeType = "FakeType";
+static constexpr std::string_view kPropertyQualifiedId = "qualifiedId";
+
+static constexpr JoinablePropertyId kQualifiedIdJoinablePropertyId = 0;
+
+// Schema type with nested joinable properties: NestedType
+static constexpr std::string_view kNestedType = "NestedType";
+static constexpr std::string_view kPropertyNestedDoc = "nested";
+static constexpr std::string_view kPropertyQualifiedId2 = "qualifiedId2";
+
+static constexpr JoinablePropertyId kNestedQualifiedIdJoinablePropertyId = 0;
+static constexpr JoinablePropertyId kQualifiedId2JoinablePropertyId = 1;
+
+static constexpr DocumentId kDefaultDocumentId = 3;
+
+// TODO(b/275121148): remove this test after deprecating
+// QualifiedIdJoinIndexImplV1.
+class QualifiedIdJoinIndexingHandlerV1Test : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ base_dir_ = GetTestTempDir() + "/icing_test";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ qualified_id_join_index_dir_ = base_dir_ + "/qualified_id_join_index";
+ schema_store_dir_ = base_dir_ + "/schema_store";
+ doc_store_dir_ = base_dir_ + "/doc_store";
+
+ ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_,
+ QualifiedIdJoinIndexImplV1::Create(
+ filesystem_, qualified_id_join_index_dir_,
+ /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false));
+
+ language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
+
+ ASSERT_THAT(
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()),
+ IsTrue());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kReferencedType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyName)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType(kFakeType).AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyQualifiedId)
+ .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kNestedType)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyNestedDoc)
+ .SetDataTypeDocument(
+ kFakeType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyQualifiedId2)
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str()),
+ IsTrue());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_,
+ schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false,
+ /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ doc_store_ = std::move(create_result.document_store);
+ }
+
+ void TearDown() override {
+ doc_store_.reset();
+ schema_store_.reset();
+ lang_segmenter_.reset();
+ qualified_id_join_index_.reset();
+
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ FakeClock fake_clock_;
+ std::string base_dir_;
+ std::string qualified_id_join_index_dir_;
+ std::string schema_store_dir_;
+ std::string doc_store_dir_;
+
+ std::unique_ptr<QualifiedIdJoinIndex> qualified_id_join_index_;
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> doc_store_;
+};
+
+TEST_F(QualifiedIdJoinIndexingHandlerV1Test,
+ CreationWithNullPointerShouldFail) {
+ EXPECT_THAT(
+ QualifiedIdJoinIndexingHandler::Create(
+ /*clock=*/nullptr, doc_store_.get(), qualified_id_join_index_.get()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+
+ EXPECT_THAT(
+ QualifiedIdJoinIndexingHandler::Create(
+ &fake_clock_, /*doc_store=*/nullptr, qualified_id_join_index_.get()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+
+ EXPECT_THAT(
+ QualifiedIdJoinIndexingHandler::Create(
+ &fake_clock_, doc_store_.get(), /*qualified_id_join_index=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerV1Test, HandleJoinableProperty) {
+ DocumentProto referenced_document =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/1")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "one")
+ .Build();
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ "pkg$db/ns#ref_type/1")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kInvalidDocumentId));
+ // Handle document.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kDefaultDocumentId,
+ /*recovery_mode=*/false, /*put_document_stats=*/nullptr),
+ IsOk());
+
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+ EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo(
+ kDefaultDocumentId, kQualifiedIdJoinablePropertyId)),
+ IsOkAndHolds("pkg$db/ns#ref_type/1"));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerV1Test, HandleNestedJoinableProperty) {
+ DocumentProto referenced_document1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/1")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "one")
+ .Build();
+ DocumentProto referenced_document2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/2")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "two")
+ .Build();
+
+ DocumentProto nested_document =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "nested_type/1")
+ .SetSchema(std::string(kNestedType))
+ .AddDocumentProperty(
+ std::string(kPropertyNestedDoc),
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "nested_fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ "pkg$db/ns#ref_type/2")
+ .Build())
+ .AddStringProperty(std::string(kPropertyQualifiedId2),
+ "pkg$db/ns#ref_type/1")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ nested_document));
+
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kInvalidDocumentId));
+ // Handle nested_document.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+ EXPECT_THAT(handler->Handle(tokenized_document, kDefaultDocumentId,
+ /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+ EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo(
+ kDefaultDocumentId, kNestedQualifiedIdJoinablePropertyId)),
+ IsOkAndHolds("pkg$db/ns#ref_type/2"));
+ EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo(
+ kDefaultDocumentId, kQualifiedId2JoinablePropertyId)),
+ IsOkAndHolds("pkg$db/ns#ref_type/1"));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerV1Test,
+ HandleShouldSkipInvalidFormatQualifiedId) {
+ static constexpr std::string_view kInvalidFormatQualifiedId =
+ "invalid_format_qualified_id";
+ ASSERT_THAT(QualifiedId::Parse(kInvalidFormatQualifiedId),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ std::string(kInvalidFormatQualifiedId))
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kInvalidDocumentId));
+ // Handle the document. The invalid format qualified id should be ignored:
+ // index data should remain unchanged since there is no valid qualified id,
+ // but last_added_document_id should be updated.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kDefaultDocumentId,
+ /*recovery_mode=*/false, /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+ EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo(
+ kDefaultDocumentId, kQualifiedIdJoinablePropertyId)),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerV1Test, HandleShouldSkipEmptyQualifiedId) {
+ // Create a document without any qualified id.
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ ASSERT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty());
+
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kInvalidDocumentId));
+ // Handle document. Index data should remain unchanged since there is no
+ // qualified id, but last_added_document_id should be updated.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kDefaultDocumentId,
+ /*recovery_mode=*/false, /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+ EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo(
+ kDefaultDocumentId, kQualifiedIdJoinablePropertyId)),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerV1Test,
+ HandleInvalidDocumentIdShouldReturnInvalidArgumentError) {
+ DocumentProto referenced_document =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/1")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "one")
+ .Build();
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ "pkg$db/ns#ref_type/1")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ qualified_id_join_index_->set_last_added_document_id(kDefaultDocumentId);
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+
+ // Handling document with kInvalidDocumentId should cause a failure, and both
+ // index data and last_added_document_id should remain unchanged.
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kInvalidDocumentId,
+ /*recovery_mode=*/false, /*put_document_stats=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+ EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo(
+ kInvalidDocumentId, kQualifiedIdJoinablePropertyId)),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // Recovery mode should get the same result.
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kInvalidDocumentId,
+ /*recovery_mode=*/true, /*put_document_stats=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+ EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo(
+ kInvalidDocumentId, kQualifiedIdJoinablePropertyId)),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerV1Test,
+ HandleOutOfOrderDocumentIdShouldReturnInvalidArgumentError) {
+ DocumentProto referenced_document =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/1")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "one")
+ .Build();
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ "pkg$db/ns#ref_type/1")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ qualified_id_join_index_->set_last_added_document_id(kDefaultDocumentId);
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+
+ // Handling document with document_id < last_added_document_id should cause a
+ // failure, and both index data and last_added_document_id should remain
+ // unchanged.
+ ASSERT_THAT(IsDocumentIdValid(kDefaultDocumentId - 1), IsTrue());
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kDefaultDocumentId - 1,
+ /*recovery_mode=*/false, /*put_document_stats=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+ EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo(
+ kDefaultDocumentId, kQualifiedIdJoinablePropertyId)),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // Handling document with document_id == last_added_document_id should cause a
+ // failure, and both index data and last_added_document_id should remain
+ // unchanged.
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kDefaultDocumentId,
+ /*recovery_mode=*/false, /*put_document_stats=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+ EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo(
+ kDefaultDocumentId, kQualifiedIdJoinablePropertyId)),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerV1Test,
+ HandleRecoveryModeShouldIgnoreDocsLELastAddedDocId) {
+ DocumentProto referenced_document =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/1")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "one")
+ .Build();
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ "pkg$db/ns#ref_type/1")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ qualified_id_join_index_->set_last_added_document_id(kDefaultDocumentId);
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+
+ // Handle document with document_id < last_added_document_id in recovery mode.
+ // We should not get any error, but the handler should ignore the document, so
+ // both index data and last_added_document_id should remain unchanged.
+ ASSERT_THAT(IsDocumentIdValid(kDefaultDocumentId - 1), IsTrue());
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kDefaultDocumentId - 1,
+ /*recovery_mode=*/true, /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+ EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo(
+ kDefaultDocumentId, kQualifiedIdJoinablePropertyId)),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // Handle document with document_id == last_added_document_id in recovery
+ // mode. We should not get any error, but the handler should ignore the
+ // document, so both index data and last_added_document_id should remain
+ // unchanged.
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kDefaultDocumentId,
+ /*recovery_mode=*/true, /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId));
+ EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo(
+ kDefaultDocumentId, kQualifiedIdJoinablePropertyId)),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // Handle document with document_id > last_added_document_id in recovery mode.
+ // The handler should index this document and update last_added_document_id.
+ ASSERT_THAT(IsDocumentIdValid(kDefaultDocumentId + 1), IsTrue());
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kDefaultDocumentId + 1,
+ /*recovery_mode=*/true, /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kDefaultDocumentId + 1));
+ EXPECT_THAT(qualified_id_join_index_->Get(DocJoinInfo(
+ kDefaultDocumentId + 1, kQualifiedIdJoinablePropertyId)),
+ IsOkAndHolds("pkg$db/ns#ref_type/1"));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/qualified-id-join-indexing-handler.cc b/icing/join/qualified-id-join-indexing-handler.cc
new file mode 100644
index 0000000..df86cba
--- /dev/null
+++ b/icing/join/qualified-id-join-indexing-handler.cc
@@ -0,0 +1,179 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/qualified-id-join-indexing-handler.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/join/doc-join-info.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/join/qualified-id.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
+#include "icing/store/namespace-id.h"
+#include "icing/util/clock.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
+
+namespace icing {
+namespace lib {
+
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexingHandler>>
+QualifiedIdJoinIndexingHandler::Create(
+ const Clock* clock, const DocumentStore* doc_store,
+ QualifiedIdJoinIndex* qualified_id_join_index) {
+ ICING_RETURN_ERROR_IF_NULL(clock);
+ ICING_RETURN_ERROR_IF_NULL(doc_store);
+ ICING_RETURN_ERROR_IF_NULL(qualified_id_join_index);
+
+ return std::unique_ptr<QualifiedIdJoinIndexingHandler>(
+ new QualifiedIdJoinIndexingHandler(clock, doc_store,
+ qualified_id_join_index));
+}
+
+libtextclassifier3::Status QualifiedIdJoinIndexingHandler::Handle(
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
+ bool recovery_mode, PutDocumentStatsProto* put_document_stats) {
+ std::unique_ptr<Timer> index_timer = clock_.GetNewTimer();
+
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(
+ IcingStringUtil::StringPrintf("Invalid DocumentId %d", document_id));
+ }
+
+ if (qualified_id_join_index_.last_added_document_id() != kInvalidDocumentId &&
+ document_id <= qualified_id_join_index_.last_added_document_id()) {
+ if (recovery_mode) {
+ // Skip the document if document_id <= last_added_document_id in recovery
+ // mode without returning an error.
+ return libtextclassifier3::Status::OK;
+ }
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "DocumentId %d must be greater than last added document_id %d",
+ document_id, qualified_id_join_index_.last_added_document_id()));
+ }
+ qualified_id_join_index_.set_last_added_document_id(document_id);
+
+ if (qualified_id_join_index_.is_v2()) {
+ // v2
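+    // Note: current_time_ms is passed as INT64_MIN below, presumably so the
+    // expiry check in GetAliveDocumentFilterData can never filter out the
+    // document being indexed.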
+ std::optional<DocumentFilterData> filter_data =
+ doc_store_.GetAliveDocumentFilterData(
+ document_id,
+ /*current_time_ms=*/std::numeric_limits<int64_t>::min());
+ if (!filter_data) {
+ // This should not happen.
+ return absl_ports::InternalError(
+ "Failed to get alive document filter data when indexing");
+ }
+
+ for (const JoinableProperty<std::string_view>& qualified_id_property :
+ tokenized_document.qualified_id_join_properties()) {
+ // Parse all qualified id strings and convert them to
+ // NamespaceFingerprintIdentifier.
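+      // For example, "pkg$db/ns#ref_type/1" maps to the NamespaceId of
+      // "pkg$db/ns" combined with a fingerprint of the uri "ref_type/1".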
+ std::vector<NamespaceFingerprintIdentifier> ref_doc_ns_fingerprint_ids;
+ for (std::string_view ref_qualified_id_str :
+ qualified_id_property.values) {
+ // Attempt to parse qualified id string to make sure the format is
+ // correct.
+ auto ref_qualified_id_or = QualifiedId::Parse(ref_qualified_id_str);
+ if (!ref_qualified_id_or.ok()) {
+          // Skip qualified id strings with an incorrect format.
+ continue;
+ }
+
+ QualifiedId ref_qualified_id =
+ std::move(ref_qualified_id_or).ValueOrDie();
+ auto ref_namespace_id_or =
+ doc_store_.GetNamespaceId(ref_qualified_id.name_space());
+ if (!ref_namespace_id_or.ok()) {
+ // Skip invalid namespace id.
+ continue;
+ }
+ NamespaceId ref_namespace_id =
+ std::move(ref_namespace_id_or).ValueOrDie();
+
+ ref_doc_ns_fingerprint_ids.push_back(NamespaceFingerprintIdentifier(
+ ref_namespace_id, ref_qualified_id.uri()));
+ }
+
+ // Batch add all join data of this (schema_type_id, joinable_property_id)
+      // into the index.
+ libtextclassifier3::Status status = qualified_id_join_index_.Put(
+ filter_data->schema_type_id(), qualified_id_property.metadata.id,
+ document_id, std::move(ref_doc_ns_fingerprint_ids));
+ if (!status.ok()) {
+ ICING_LOG(WARNING)
+ << "Failed to add data into qualified id join index v2 due to: "
+ << status.error_message();
+ return status;
+ }
+ }
+ } else {
+ // v1
+ // TODO(b/275121148): deprecate this part after rollout v2.
+ for (const JoinableProperty<std::string_view>& qualified_id_property :
+ tokenized_document.qualified_id_join_properties()) {
+ if (qualified_id_property.values.empty()) {
+ continue;
+ }
+
+ DocJoinInfo info(document_id, qualified_id_property.metadata.id);
+ // Currently we only support single (non-repeated) joinable value under a
+ // property.
+ std::string_view ref_qualified_id_str = qualified_id_property.values[0];
+
+ // Attempt to parse qualified id string to make sure the format is
+ // correct.
+ if (!QualifiedId::Parse(ref_qualified_id_str).ok()) {
+        // Skip qualified id strings with an incorrect format to save disk
+        // space.
+ continue;
+ }
+
+ libtextclassifier3::Status status =
+ qualified_id_join_index_.Put(info, ref_qualified_id_str);
+ if (!status.ok()) {
+ ICING_LOG(WARNING)
+ << "Failed to add data into qualified id join index due to: "
+ << status.error_message();
+ return status;
+ }
+ }
+ }
+
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_qualified_id_join_index_latency_ms(
+ index_timer->GetElapsedMilliseconds());
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/qualified-id-join-indexing-handler.h b/icing/join/qualified-id-join-indexing-handler.h
new file mode 100644
index 0000000..8a11bf9
--- /dev/null
+++ b/icing/join/qualified-id-join-indexing-handler.h
@@ -0,0 +1,78 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_QUALIFIED_ID_JOIN_INDEXING_HANDLER_H_
+#define ICING_JOIN_QUALIFIED_ID_JOIN_INDEXING_HANDLER_H_
+
+#include <memory>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/data-indexing-handler.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/util/clock.h"
+#include "icing/util/tokenized-document.h"
+
+namespace icing {
+namespace lib {
+
+class QualifiedIdJoinIndexingHandler : public DataIndexingHandler {
+ public:
+ // Creates a QualifiedIdJoinIndexingHandler instance which does not take
+ // ownership of any input components. All pointers must refer to valid objects
+ // that outlive the created QualifiedIdJoinIndexingHandler instance.
+ //
+ // Returns:
+ // - A QualifiedIdJoinIndexingHandler instance on success
+  //   - FAILED_PRECONDITION_ERROR if any of the input pointers is null
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<QualifiedIdJoinIndexingHandler>>
+ Create(const Clock* clock, const DocumentStore* doc_store,
+ QualifiedIdJoinIndex* qualified_id_join_index);
+
+ ~QualifiedIdJoinIndexingHandler() override = default;
+
+ // Handles the joinable qualified id data indexing process: add data into the
+ // qualified id join index.
+ //
+  // Returns:
+  //   - OK on success.
+  //   - INVALID_ARGUMENT_ERROR if document_id is invalid OR document_id is
+  //     less than or equal to the document_id of a previously indexed document
+  //     in non-recovery mode.
+ // - INTERNAL_ERROR if any other errors occur.
+ // - Any QualifiedIdJoinIndex errors.
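+  //
+  // Example call (mirroring the unit tests):
+  //   handler->Handle(tokenized_document, document_id,
+  //                   /*recovery_mode=*/false, /*put_document_stats=*/nullptr);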
+ libtextclassifier3::Status Handle(
+ const TokenizedDocument& tokenized_document, DocumentId document_id,
+ bool recovery_mode, PutDocumentStatsProto* put_document_stats) override;
+
+ private:
+ explicit QualifiedIdJoinIndexingHandler(
+ const Clock* clock, const DocumentStore* doc_store,
+ QualifiedIdJoinIndex* qualified_id_join_index)
+ : DataIndexingHandler(clock),
+ doc_store_(*doc_store),
+ qualified_id_join_index_(*qualified_id_join_index) {}
+
+ const DocumentStore& doc_store_; // Does not own.
+ QualifiedIdJoinIndex& qualified_id_join_index_; // Does not own.
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JOIN_QUALIFIED_ID_JOIN_INDEXING_HANDLER_H_
diff --git a/icing/join/qualified-id-join-indexing-handler_test.cc b/icing/join/qualified-id-join-indexing-handler_test.cc
new file mode 100644
index 0000000..53d35c7
--- /dev/null
+++ b/icing/join/qualified-id-join-indexing-handler_test.cc
@@ -0,0 +1,829 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/qualified-id-join-indexing-handler.h"
+
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/join/document-id-to-join-info.h"
+#include "icing/join/qualified-id-join-index-impl-v2.h"
+#include "icing/join/qualified-id-join-index.h"
+#include "icing/join/qualified-id.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
+#include "icing/store/namespace-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::IsTrue;
+using ::testing::NotNull;
+
+// Schema type for referenced documents: ReferencedType
+static constexpr std::string_view kReferencedType = "ReferencedType";
+static constexpr std::string_view kPropertyName = "name";
+
+// Joinable properties and joinable property ids. Joinable property ids are
+// assigned in the lexicographical order of the joinable property paths.
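+// For example, in NestedType the path "nested.qualifiedId" sorts before
+// "qualifiedId2", so it is assigned the smaller joinable property id.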
+// Schema type with joinable property: FakeType
+static constexpr std::string_view kFakeType = "FakeType";
+static constexpr std::string_view kPropertyQualifiedId = "qualifiedId";
+
+// Schema type with nested joinable properties: NestedType
+static constexpr std::string_view kNestedType = "NestedType";
+static constexpr std::string_view kPropertyNestedDoc = "nested";
+static constexpr std::string_view kPropertyQualifiedId2 = "qualifiedId2";
+
+class QualifiedIdJoinIndexingHandlerTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ base_dir_ = GetTestTempDir() + "/icing_test";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ qualified_id_join_index_dir_ = base_dir_ + "/qualified_id_join_index";
+ schema_store_dir_ = base_dir_ + "/schema_store";
+ doc_store_dir_ = base_dir_ + "/doc_store";
+
+ ICING_ASSERT_OK_AND_ASSIGN(qualified_id_join_index_,
+ QualifiedIdJoinIndexImplV2::Create(
+ filesystem_, qualified_id_join_index_dir_,
+ /*pre_mapping_fbv=*/false));
+
+ language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
+
+ ASSERT_THAT(
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()),
+ IsTrue());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kReferencedType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyName)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType(kFakeType).AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyQualifiedId)
+ .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kNestedType)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyNestedDoc)
+ .SetDataTypeDocument(
+ kFakeType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyQualifiedId2)
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str()),
+ IsTrue());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_,
+ schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/true,
+ /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ doc_store_ = std::move(create_result.document_store);
+
+ // Get FakeType related ids.
+ ICING_ASSERT_OK_AND_ASSIGN(fake_type_id_,
+ schema_store_->GetSchemaTypeId(kFakeType));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ const JoinablePropertyMetadata* metadata1,
+ schema_store_->GetJoinablePropertyMetadata(
+ fake_type_id_, std::string(kPropertyQualifiedId)));
+ ASSERT_THAT(metadata1, NotNull());
+ fake_type_joinable_property_id_ = metadata1->id;
+
+ // Get NestedType related ids.
+ ICING_ASSERT_OK_AND_ASSIGN(nested_type_id_,
+ schema_store_->GetSchemaTypeId(kNestedType));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ const JoinablePropertyMetadata* metadata2,
+ schema_store_->GetJoinablePropertyMetadata(
+ nested_type_id_,
+ absl_ports::StrCat(kPropertyNestedDoc, ".", kPropertyQualifiedId)));
+ ASSERT_THAT(metadata2, NotNull());
+ nested_type_nested_joinable_property_id_ = metadata2->id;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ const JoinablePropertyMetadata* metadata3,
+ schema_store_->GetJoinablePropertyMetadata(
+ nested_type_id_, std::string(kPropertyQualifiedId2)));
+ ASSERT_THAT(metadata3, NotNull());
+ nested_type_joinable_property_id_ = metadata3->id;
+ }
+
+ void TearDown() override {
+ doc_store_.reset();
+ schema_store_.reset();
+ lang_segmenter_.reset();
+ qualified_id_join_index_.reset();
+
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ FakeClock fake_clock_;
+ std::string base_dir_;
+ std::string qualified_id_join_index_dir_;
+ std::string schema_store_dir_;
+ std::string doc_store_dir_;
+
+ std::unique_ptr<QualifiedIdJoinIndexImplV2> qualified_id_join_index_;
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> doc_store_;
+
+ // FakeType related ids.
+ SchemaTypeId fake_type_id_;
+ JoinablePropertyId fake_type_joinable_property_id_;
+
+ // NestedType related ids.
+ SchemaTypeId nested_type_id_;
+ JoinablePropertyId nested_type_nested_joinable_property_id_;
+ JoinablePropertyId nested_type_joinable_property_id_;
+};
+
+libtextclassifier3::StatusOr<
+ std::vector<QualifiedIdJoinIndexImplV2::JoinDataType>>
+GetJoinData(const QualifiedIdJoinIndexImplV2& index,
+ SchemaTypeId schema_type_id,
+ JoinablePropertyId joinable_property_id) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase> iter,
+ index.GetIterator(schema_type_id, joinable_property_id));
+
+ std::vector<QualifiedIdJoinIndexImplV2::JoinDataType> result;
+ while (iter->Advance().ok()) {
+ result.push_back(iter->GetCurrent());
+ }
+
+ return result;
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerTest, CreationWithNullPointerShouldFail) {
+ EXPECT_THAT(
+ QualifiedIdJoinIndexingHandler::Create(
+ /*clock=*/nullptr, doc_store_.get(), qualified_id_join_index_.get()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+
+ EXPECT_THAT(
+ QualifiedIdJoinIndexingHandler::Create(
+ &fake_clock_, /*doc_store=*/nullptr, qualified_id_join_index_.get()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+
+ EXPECT_THAT(
+ QualifiedIdJoinIndexingHandler::Create(
+ &fake_clock_, doc_store_.get(), /*qualified_id_join_index=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerTest, HandleJoinableProperty) {
+ // Create and put referenced (parent) document. Get its document id and
+ // namespace id.
+ DocumentProto referenced_document =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/1")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "one")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id,
+ doc_store_->Put(referenced_document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ NamespaceId ref_doc_ns_id,
+ doc_store_->GetNamespaceId(referenced_document.namespace_()));
+ NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id(
+ /*namespace_id=*/ref_doc_ns_id, /*target_str=*/referenced_document.uri());
+ ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id),
+ IsOkAndHolds(ref_doc_id));
+
+ // Create and put (child) document. Also tokenize it.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ "pkg$db/ns#ref_type/1")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document)));
+
+ // Handle document.
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kInvalidDocumentId));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+
+ // Verify the state of qualified_id_join_index_ after Handle().
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id));
+ // (kFakeType, kPropertyQualifiedId) should contain
+ // [(doc_id, ref_doc_ns_fingerprint_id)].
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_,
+ /*joinable_property_id=*/fake_type_joinable_property_id_),
+ IsOkAndHolds(
+ ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/doc_id,
+ /*join_info=*/ref_doc_ns_fingerprint_id))));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerTest, HandleNestedJoinableProperty) {
+ // Create and put referenced (parent) document1. Get its document id and
+ // namespace id.
+ DocumentProto referenced_document1 =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/1")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "one")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id1,
+ doc_store_->Put(referenced_document1));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ NamespaceId ref_doc_ns_id1,
+ doc_store_->GetNamespaceId(referenced_document1.namespace_()));
+ NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id1(
+ /*namespace_id=*/ref_doc_ns_id1,
+ /*target_str=*/referenced_document1.uri());
+ ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id1),
+ IsOkAndHolds(ref_doc_id1));
+
+ // Create and put referenced (parent) document2. Get its document id and
+ // namespace id.
+ DocumentProto referenced_document2 =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/2")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "two")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id2,
+ doc_store_->Put(referenced_document2));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ NamespaceId ref_doc_ns_id2,
+ doc_store_->GetNamespaceId(referenced_document2.namespace_()));
+ NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id2(
+ /*namespace_id=*/ref_doc_ns_id2,
+ /*target_str=*/referenced_document2.uri());
+ ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id2),
+ IsOkAndHolds(ref_doc_id2));
+
+ // Create and put (child) document:
+ // - kPropertyNestedDoc.kPropertyQualifiedId refers to referenced_document2.
+ // - kPropertyQualifiedId2 refers to referenced_document1.
+ //
+ // Also tokenize it.
+ DocumentProto nested_document =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "nested_type/1")
+ .SetSchema(std::string(kNestedType))
+ .AddDocumentProperty(
+ std::string(kPropertyNestedDoc),
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "nested_fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ "pkg$db/ns#ref_type/2")
+ .Build())
+ .AddStringProperty(std::string(kPropertyQualifiedId2),
+ "pkg$db/ns#ref_type/1")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id,
+ doc_store_->Put(nested_document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ nested_document));
+
+ // Handle nested_document.
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kInvalidDocumentId));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+
+ // Verify the state of qualified_id_join_index_ after Handle().
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id));
+ // (kFakeType, kPropertyQualifiedId) should contain nothing.
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_,
+ /*joinable_property_id=*/fake_type_joinable_property_id_),
+ IsOkAndHolds(IsEmpty()));
+ // (kNestedType, kPropertyNestedDoc.kPropertyQualifiedId) should contain
+ // [(doc_id, ref_doc_ns_fingerprint_id2)].
+ EXPECT_THAT(
+ GetJoinData(
+ *qualified_id_join_index_, /*schema_type_id=*/nested_type_id_,
+ /*joinable_property_id=*/nested_type_nested_joinable_property_id_),
+ IsOkAndHolds(
+ ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/doc_id,
+ /*join_info=*/ref_doc_ns_fingerprint_id2))));
+ // (kNestedType, kPropertyQualifiedId2) should contain
+ // [(doc_id, ref_doc_ns_fingerprint_id1)].
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/nested_type_id_,
+ /*joinable_property_id=*/nested_type_joinable_property_id_),
+ IsOkAndHolds(
+ ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/doc_id,
+ /*join_info=*/ref_doc_ns_fingerprint_id1))));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerTest,
+ HandleShouldSkipInvalidFormatQualifiedId) {
+ static constexpr std::string_view kInvalidFormatQualifiedId =
+ "invalid_format_qualified_id";
+ ASSERT_THAT(QualifiedId::Parse(kInvalidFormatQualifiedId),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+  // Create and put a (child) document whose referenced qualified id has an
+  // invalid format. Also tokenize it.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ std::string(kInvalidFormatQualifiedId))
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ // Handle document. Should ignore invalid format qualified id.
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kInvalidDocumentId));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+
+ // Verify the state of qualified_id_join_index_ after Handle(). Index data
+ // should remain unchanged since there is no valid qualified id, but
+ // last_added_document_id should be updated.
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id));
+ // (kFakeType, kPropertyQualifiedId) should contain nothing.
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_,
+ /*joinable_property_id=*/fake_type_joinable_property_id_),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerTest,
+ HandleShouldSkipNonExistingNamespace) {
+ static constexpr std::string_view kUnknownNamespace = "UnknownNamespace";
+  // Create and put a (child) document that references a parent qualified id
+  // with an unknown namespace. Also tokenize it.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(
+ std::string(kPropertyQualifiedId),
+ absl_ports::StrCat(kUnknownNamespace, "#", "ref_type/1"))
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document)));
+
+ // Handle document.
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kInvalidDocumentId));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+
+ // Verify the state of qualified_id_join_index_ after Handle().
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id));
+ // (kFakeType, kPropertyQualifiedId) should be empty since
+ // "UnknownNamespace#ref_type/1" should be skipped.
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_,
+ /*joinable_property_id=*/fake_type_joinable_property_id_),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerTest, HandleShouldSkipEmptyQualifiedId) {
+ // Create and put (child) document without any qualified id. Also tokenize it.
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+ ASSERT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty());
+
+ // Handle document.
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(kInvalidDocumentId));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+
+ // Verify the state of qualified_id_join_index_ after Handle(). Index data
+ // should remain unchanged since there is no qualified id, but
+ // last_added_document_id should be updated.
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id));
+ // (kFakeType, kPropertyQualifiedId) should contain nothing.
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_,
+ /*joinable_property_id=*/fake_type_joinable_property_id_),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerTest,
+ HandleInvalidDocumentIdShouldReturnInvalidArgumentError) {
+ // Create and put referenced (parent) document. Get its document id and
+ // namespace id.
+ DocumentProto referenced_document =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/1")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "one")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id,
+ doc_store_->Put(referenced_document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ NamespaceId ref_doc_ns_id,
+ doc_store_->GetNamespaceId(referenced_document.namespace_()));
+ NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id(
+ /*namespace_id=*/ref_doc_ns_id, /*target_str=*/referenced_document.uri());
+ ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id),
+ IsOkAndHolds(ref_doc_id));
+
+ // Create and put (child) document. Also tokenize it.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ "pkg$db/ns#ref_type/1")
+ .Build();
+ ICING_ASSERT_OK(doc_store_->Put(document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document)));
+
+ qualified_id_join_index_->set_last_added_document_id(ref_doc_id);
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(ref_doc_id));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+
+ // Handling document with kInvalidDocumentId should cause a failure.
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kInvalidDocumentId,
+ /*recovery_mode=*/false, /*put_document_stats=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ // Verify the state of qualified_id_join_index_ after Handle(). Both index
+ // data and last_added_document_id should remain unchanged.
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(ref_doc_id));
+ // (kFakeType, kPropertyQualifiedId) should contain nothing.
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_,
+ /*joinable_property_id=*/fake_type_joinable_property_id_),
+ IsOkAndHolds(IsEmpty()));
+
+ // Recovery mode should get the same result.
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, kInvalidDocumentId,
+                      /*recovery_mode=*/true, /*put_document_stats=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(ref_doc_id));
+ // (kFakeType, kPropertyQualifiedId) should contain nothing.
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_,
+ /*joinable_property_id=*/fake_type_joinable_property_id_),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerTest,
+ HandleOutOfOrderDocumentIdShouldReturnInvalidArgumentError) {
+ // Create and put referenced (parent) document. Get its document id and
+ // namespace id.
+ DocumentProto referenced_document =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/1")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "one")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id,
+ doc_store_->Put(referenced_document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ NamespaceId ref_doc_ns_id,
+ doc_store_->GetNamespaceId(referenced_document.namespace_()));
+ NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id(
+ /*namespace_id=*/ref_doc_ns_id, /*target_str=*/referenced_document.uri());
+ ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id),
+ IsOkAndHolds(ref_doc_id));
+
+ // Create and put (child) document. Also tokenize it.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ "pkg$db/ns#ref_type/1")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+
+ // Handling document with document_id == last_added_document_id should cause a
+ // failure.
+ qualified_id_join_index_->set_last_added_document_id(doc_id);
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ // Verify the state of qualified_id_join_index_ after Handle(). Both index
+ // data and last_added_document_id should remain unchanged.
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id));
+ // (kFakeType, kPropertyQualifiedId) should contain nothing.
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_,
+ /*joinable_property_id=*/fake_type_joinable_property_id_),
+ IsOkAndHolds(IsEmpty()));
+
+ // Handling document with document_id < last_added_document_id should cause a
+ // failure.
+ qualified_id_join_index_->set_last_added_document_id(doc_id + 1);
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(doc_id + 1));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/false,
+ /*put_document_stats=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ // Verify the state of qualified_id_join_index_ after Handle(). Both index
+ // data and last_added_document_id should remain unchanged.
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(doc_id + 1));
+ // (kFakeType, kPropertyQualifiedId) should contain nothing.
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_,
+ /*joinable_property_id=*/fake_type_joinable_property_id_),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerTest,
+ HandleRecoveryModeShouldIndexDocsGtLastAddedDocId) {
+ // Create and put referenced (parent) document. Get its document id and
+ // namespace id.
+ DocumentProto referenced_document =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/1")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "one")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id,
+ doc_store_->Put(referenced_document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ NamespaceId ref_doc_ns_id,
+ doc_store_->GetNamespaceId(referenced_document.namespace_()));
+ NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id(
+ /*namespace_id=*/ref_doc_ns_id, /*target_str=*/referenced_document.uri());
+ ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id),
+ IsOkAndHolds(ref_doc_id));
+
+ // Create and put (child) document. Also tokenize it.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ "pkg$db/ns#ref_type/1")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+
+ // Handle document with document_id > last_added_document_id in recovery mode.
+ // The handler should index this document and update last_added_document_id.
+ qualified_id_join_index_->set_last_added_document_id(doc_id - 1);
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(doc_id - 1));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/true,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id));
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_,
+ /*joinable_property_id=*/fake_type_joinable_property_id_),
+ IsOkAndHolds(
+ ElementsAre(DocumentIdToJoinInfo<NamespaceFingerprintIdentifier>(
+ /*document_id=*/doc_id,
+ /*join_info=*/ref_doc_ns_fingerprint_id))));
+}
+
+TEST_F(QualifiedIdJoinIndexingHandlerTest,
+ HandleRecoveryModeShouldIgnoreDocsLeLastAddedDocId) {
+ // Create and put referenced (parent) document. Get its document id and
+ // namespace id.
+ DocumentProto referenced_document =
+ DocumentBuilder()
+ .SetKey("pkg$db/ns", "ref_type/1")
+ .SetSchema(std::string(kReferencedType))
+ .AddStringProperty(std::string(kPropertyName), "one")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId ref_doc_id,
+ doc_store_->Put(referenced_document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ NamespaceId ref_doc_ns_id,
+ doc_store_->GetNamespaceId(referenced_document.namespace_()));
+ NamespaceFingerprintIdentifier ref_doc_ns_fingerprint_id(
+ /*namespace_id=*/ref_doc_ns_id, /*target_str=*/referenced_document.uri());
+ ASSERT_THAT(doc_store_->GetDocumentId(ref_doc_ns_fingerprint_id),
+ IsOkAndHolds(ref_doc_id));
+
+ // Create and put (child) document. Also tokenize it.
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kPropertyQualifiedId),
+ "pkg$db/ns#ref_type/1")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id, doc_store_->Put(document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ std::move(document)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<QualifiedIdJoinIndexingHandler> handler,
+ QualifiedIdJoinIndexingHandler::Create(&fake_clock_, doc_store_.get(),
+ qualified_id_join_index_.get()));
+
+ // Handle document with document_id == last_added_document_id in recovery
+ // mode. We should not get any error, but the handler should ignore the
+ // document, so both index data and last_added_document_id should remain
+ // unchanged.
+ qualified_id_join_index_->set_last_added_document_id(doc_id);
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/true,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(), Eq(doc_id));
+ // (kFakeType, kPropertyQualifiedId) should contain nothing.
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_,
+ /*joinable_property_id=*/fake_type_joinable_property_id_),
+ IsOkAndHolds(IsEmpty()));
+
+ // Handle document with document_id < last_added_document_id in recovery mode.
+ // We should not get any error, but the handler should ignore the document, so
+ // both index data and last_added_document_id should remain unchanged.
+ qualified_id_join_index_->set_last_added_document_id(doc_id + 1);
+ ASSERT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(doc_id + 1));
+ EXPECT_THAT(
+ handler->Handle(tokenized_document, doc_id, /*recovery_mode=*/true,
+ /*put_document_stats=*/nullptr),
+ IsOk());
+ EXPECT_THAT(qualified_id_join_index_->last_added_document_id(),
+ Eq(doc_id + 1));
+ // (kFakeType, kPropertyQualifiedId) should contain nothing.
+ EXPECT_THAT(
+ GetJoinData(*qualified_id_join_index_, /*schema_type_id=*/fake_type_id_,
+ /*joinable_property_id=*/fake_type_joinable_property_id_),
+ IsOkAndHolds(IsEmpty()));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/qualified-id.cc b/icing/join/qualified-id.cc
new file mode 100644
index 0000000..42e080c
--- /dev/null
+++ b/icing/join/qualified-id.cc
@@ -0,0 +1,110 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/qualified-id.h"
+
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Since '#' is the separator and '\' escapes '\' and '#', only these two
+// characters are treated as special when parsing a qualified id.
+bool IsSpecialCharacter(char c) {
+ return c == QualifiedId::kEscapeChar ||
+ c == QualifiedId::kNamespaceUriSeparator;
+}
+
+// Helper function to verify the format (check the escape sequences and make
+// sure there is exactly one separator '#') and find the position of that
+// separator.
+//
+// Returns:
+// A valid index of the separator on success.
+// std::string::npos if the escape format of content is incorrect.
+//   std::string::npos if the content contains zero or more than one separator.
+// std::string::npos if the content contains '\0'.
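+//
+// For example, for R"(a\#b#c)" the escaped '#' at index 2 is skipped and the
+// separator position 4 is returned.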
+size_t VerifyFormatAndGetSeparatorPosition(std::string_view content) {
+ size_t separator_pos = std::string::npos;
+ for (size_t i = 0; i < content.length(); ++i) {
+ if (content[i] == '\0') {
+ return std::string::npos;
+ }
+
+ if (content[i] == QualifiedId::kEscapeChar) {
+ // Advance to the next character.
+ ++i;
+ if (i >= content.length() || !IsSpecialCharacter(content[i])) {
+ // Invalid escape format.
+ return std::string::npos;
+ }
+ } else if (content[i] == QualifiedId::kNamespaceUriSeparator) {
+ if (separator_pos != std::string::npos) {
+ // Found another separator, so return std::string::npos since only one
+ // separator is allowed.
+ return std::string::npos;
+ }
+ separator_pos = i;
+ }
+ }
+ return separator_pos;
+}
+
+// Helper function to unescape the content.
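+// For example, R"(ns\#1)" unescapes to "ns#1", and R"(a\\b)" to "a\b".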
+libtextclassifier3::StatusOr<std::string> Unescape(std::string_view content) {
+ std::string unescaped_content;
+ for (size_t i = 0; i < content.length(); ++i) {
+ if (content[i] == QualifiedId::kEscapeChar) {
+ // Advance to the next character.
+ ++i;
+ if (i >= content.length() || !IsSpecialCharacter(content[i])) {
+ // Invalid escape format.
+ return absl_ports::InvalidArgumentError("Invalid escape format");
+ }
+ }
+ unescaped_content += content[i];
+ }
+ return unescaped_content;
+}
+
+} // namespace
+
+/* static */ libtextclassifier3::StatusOr<QualifiedId> QualifiedId::Parse(
+ std::string_view qualified_id_str) {
+ size_t separator_pos = VerifyFormatAndGetSeparatorPosition(qualified_id_str);
+ if (separator_pos == std::string::npos) {
+ return absl_ports::InvalidArgumentError(
+ "Failed to find the position of separator");
+ }
+
+ if (separator_pos == 0 || separator_pos + 1 >= qualified_id_str.length()) {
+ return absl_ports::InvalidArgumentError(
+ "Namespace or uri cannot be empty after parsing");
+ }
+
+ ICING_ASSIGN_OR_RETURN(std::string name_space,
+ Unescape(qualified_id_str.substr(0, separator_pos)));
+ ICING_ASSIGN_OR_RETURN(std::string uri,
+ Unescape(qualified_id_str.substr(separator_pos + 1)));
+ return QualifiedId(std::move(name_space), std::move(uri));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/join/qualified-id.h b/icing/join/qualified-id.h
new file mode 100644
index 0000000..eb6606a
--- /dev/null
+++ b/icing/join/qualified-id.h
@@ -0,0 +1,65 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_JOIN_QUALIFIED_ID_H_
+#define ICING_JOIN_QUALIFIED_ID_H_
+
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+
+namespace icing {
+namespace lib {
+
+// QualifiedId definition: namespace and uri.
+// This is a wrapper class for parsing qualified id string.
+//
+// Qualified id string format: escape(namespace) + '#' + escape(uri).
+// - Use '#' as the separator to concatenate namespace and uri.
+// - Use '\' to escape '\' and '#' in namespace and uri.
+// - There must be exactly one unescaped separator '#' in a qualified id
+//   string, and the remaining content must be correctly escaped.
+// - Raw namespace and uri cannot be empty.
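+//
+// For example, "pkg$db/ns#ref_type/1" parses to namespace "pkg$db/ns" and uri
+// "ref_type/1", and R"(namespace\\#uri)" parses to namespace R"(namespace\)"
+// and uri "uri".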
+class QualifiedId {
+ public:
+ static constexpr char kEscapeChar = '\\';
+ static constexpr char kNamespaceUriSeparator = '#';
+
+ // Parses a qualified id string "<escaped(namespace)>#<escaped(uri)>" and
+ // creates an instance of QualifiedId.
+ //
+ // qualified_id_str: a qualified id string having the format mentioned above.
+ //
+ // Returns:
+ // - A QualifiedId instance with raw namespace and uri, on success.
+ // - INVALID_ARGUMENT_ERROR if the format of qualified_id_str is incorrect.
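+  //
+  // Usage sketch:
+  //   auto id_or = QualifiedId::Parse(R"(pkg$db/ns#ref_type/1)");
+  //   if (!id_or.ok()) { /* handle INVALID_ARGUMENT_ERROR */ }
+  //   QualifiedId id = std::move(id_or).ValueOrDie();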
+ static libtextclassifier3::StatusOr<QualifiedId> Parse(
+ std::string_view qualified_id_str);
+
+ explicit QualifiedId(std::string name_space, std::string uri)
+ : name_space_(std::move(name_space)), uri_(std::move(uri)) {}
+
+ const std::string& name_space() const { return name_space_; }
+ const std::string& uri() const { return uri_; }
+
+ private:
+ std::string name_space_;
+ std::string uri_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_JOIN_QUALIFIED_ID_H_
diff --git a/icing/join/qualified-id_test.cc b/icing/join/qualified-id_test.cc
new file mode 100644
index 0000000..92bf63e
--- /dev/null
+++ b/icing/join/qualified-id_test.cc
@@ -0,0 +1,159 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/join/qualified-id.h"
+
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+
+TEST(QualifiedIdTest, ValidQualifiedIdWithoutSpecialCharacters) {
+ // "namespace#uri" -> "namespace" + "uri"
+ ICING_ASSERT_OK_AND_ASSIGN(QualifiedId id,
+ QualifiedId::Parse(R"(namespace#uri)"));
+ EXPECT_THAT(id.name_space(), Eq(R"(namespace)"));
+ EXPECT_THAT(id.uri(), R"(uri)");
+}
+
+TEST(QualifiedIdTest, ValidQualifiedIdWithEscapedSpecialCharacters) {
+ // "namespace\\#uri" -> "namespace\" + "uri"
+ ICING_ASSERT_OK_AND_ASSIGN(QualifiedId id1,
+ QualifiedId::Parse(R"(namespace\\#uri)"));
+ EXPECT_THAT(id1.name_space(), Eq(R"(namespace\)"));
+ EXPECT_THAT(id1.uri(), R"(uri)");
+
+ // "namespace\\\##uri" -> "namespace\#" + "uri"
+ ICING_ASSERT_OK_AND_ASSIGN(QualifiedId id2,
+ QualifiedId::Parse(R"(namespace\\\##uri)"));
+ EXPECT_THAT(id2.name_space(), Eq(R"(namespace\#)"));
+ EXPECT_THAT(id2.uri(), R"(uri)");
+
+ // "namespace#\#\\uri" -> "namespace" + "#\uri"
+ ICING_ASSERT_OK_AND_ASSIGN(QualifiedId id3,
+ QualifiedId::Parse(R"(namespace#\#\\uri)"));
+ EXPECT_THAT(id3.name_space(), Eq(R"(namespace)"));
+ EXPECT_THAT(id3.uri(), R"(#\uri)");
+
+ // "namespace\\\##\#\\uri" -> "namespace\#" + "#\uri"
+ ICING_ASSERT_OK_AND_ASSIGN(QualifiedId id4,
+ QualifiedId::Parse(R"(namespace\\\##\#\\uri)"));
+ EXPECT_THAT(id4.name_space(), Eq(R"(namespace\#)"));
+ EXPECT_THAT(id4.uri(), R"(#\uri)");
+}
+
+TEST(QualifiedIdTest, InvalidQualifiedIdWithEmptyNamespaceOrUri) {
+ // "#uri"
+ EXPECT_THAT(QualifiedId::Parse(R"(#uri)"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // "namespace#"
+ EXPECT_THAT(QualifiedId::Parse(R"(namespace#)"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // "#"
+ EXPECT_THAT(QualifiedId::Parse(R"(#)"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(QualifiedIdTest, InvalidQualifiedIdWithInvalidEscape) {
+ // "namespace\"
+  // Append an additional '#' and use a string_view trick to exercise the
+  // bounds check when skipping the trailing '\'.
+ std::string str1 = R"(namespace\)"
+ R"(#)";
+ EXPECT_THAT(
+ QualifiedId::Parse(std::string_view(str1.data(), str1.length() - 1)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // "names\pace#uri"
+ EXPECT_THAT(QualifiedId::Parse(R"(names\pace#uri)"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // "names\\\pace#uri"
+ EXPECT_THAT(QualifiedId::Parse(R"(names\\\pace#uri)"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // "namespace#uri\"
+  // Append an additional '#' and use a string_view trick to exercise the
+  // bounds check when skipping the trailing '\'.
+ std::string str2 = R"(namespace#uri\)"
+ R"(#)";
+ EXPECT_THAT(
+ QualifiedId::Parse(std::string_view(str2.data(), str2.length() - 1)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(QualifiedIdTest, InvalidQualifiedIdWithWrongNumberOfSeparators) {
+ // ""
+ EXPECT_THAT(QualifiedId::Parse(R"()"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // "namespaceuri"
+ EXPECT_THAT(QualifiedId::Parse(R"(namespaceuri)"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // "namespace##uri"
+ EXPECT_THAT(QualifiedId::Parse(R"(namespace##uri)"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // "namespace#uri#others"
+ EXPECT_THAT(QualifiedId::Parse(R"(namespace#uri#others)"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // "namespace\#uri"
+ EXPECT_THAT(QualifiedId::Parse(R"(namespace\#uri)"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // "namespace\\##uri"
+ EXPECT_THAT(QualifiedId::Parse(R"(namespace\\##uri)"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // "namespace#uri\\#others"
+ EXPECT_THAT(QualifiedId::Parse(R"(namespace#uri\\#)"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(QualifiedIdTest, InvalidQualifiedIdWithStringTerminator) {
+ const char invalid_qualified_id1[] = "names\0pace#uri";
+ EXPECT_THAT(QualifiedId::Parse(std::string_view(invalid_qualified_id1, 14)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ const char invalid_qualified_id2[] = "namespace#ur\0i";
+ EXPECT_THAT(QualifiedId::Parse(std::string_view(invalid_qualified_id2, 14)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ const char invalid_qualified_id3[] = "\0namespace#uri";
+ EXPECT_THAT(QualifiedId::Parse(std::string_view(invalid_qualified_id3, 14)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ const char invalid_qualified_id4[] = "namespace#uri\0";
+ EXPECT_THAT(QualifiedId::Parse(std::string_view(invalid_qualified_id4, 14)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/core/icing-core-types.h b/icing/legacy/core/icing-core-types.h
index cc12663..7db8408 100644
--- a/icing/legacy/core/icing-core-types.h
+++ b/icing/legacy/core/icing-core-types.h
@@ -21,9 +21,8 @@
#ifndef ICING_LEGACY_CORE_ICING_CORE_TYPES_H_
#define ICING_LEGACY_CORE_ICING_CORE_TYPES_H_
-#include <stdint.h>
-
#include <cstddef> // size_t not defined implicitly for all platforms.
+#include <cstdint>
#include <vector>
#include "icing/legacy/core/icing-compat.h"
diff --git a/icing/legacy/core/icing-string-util.cc b/icing/legacy/core/icing-string-util.cc
index 1954cd3..ed06e03 100644
--- a/icing/legacy/core/icing-string-util.cc
+++ b/icing/legacy/core/icing-string-util.cc
@@ -11,21 +11,13 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
-
-// Copyright 2011 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
-// sbanacho@google.com (Scott Banachowski)
-//
-// This is a list of IsGoogleLetter letters. It is copied from
-// google3/util/utf8/proptables/letters.txt CL 19164202.
#include "icing/legacy/core/icing-string-util.h"
-#include <stdarg.h>
-#include <stddef.h>
-#include <stdint.h>
-#include <stdio.h>
-
#include <algorithm>
+#include <cstdarg>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
#include <string>
#include "icing/legacy/portable/icing-zlib.h"
@@ -34,7 +26,6 @@
namespace icing {
namespace lib {
-namespace {} // namespace
uint32_t IcingStringUtil::UpdateCrc32(uint32_t crc, const char *str, int len) {
if (len > 0) {
crc = ~crc32(~crc, reinterpret_cast<const Bytef *>(str), len);
diff --git a/icing/legacy/core/icing-string-util.h b/icing/legacy/core/icing-string-util.h
index 4ea93ec..e5e4941 100644
--- a/icing/legacy/core/icing-string-util.h
+++ b/icing/legacy/core/icing-string-util.h
@@ -12,16 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// Copyright 2011 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
-// sbanacho@google.com (Scott Banachowski)
-
#ifndef ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
#define ICING_LEGACY_CORE_ICING_STRING_UTIL_H_
-#include <stdarg.h>
-#include <stdint.h>
-
+#include <cstdarg>
+#include <cstdint>
#include <string>
#include "icing/legacy/core/icing-compat.h"
diff --git a/icing/legacy/core/icing-timer.h b/icing/legacy/core/icing-timer.h
index 49ba9ad..af38912 100644
--- a/icing/legacy/core/icing-timer.h
+++ b/icing/legacy/core/icing-timer.h
@@ -16,7 +16,8 @@
#define ICING_LEGACY_CORE_ICING_TIMER_H_
#include <sys/time.h>
-#include <time.h>
+
+#include <ctime>
namespace icing {
namespace lib {
diff --git a/icing/legacy/index/icing-array-storage.cc b/icing/legacy/index/icing-array-storage.cc
index b462135..de5178a 100644
--- a/icing/legacy/index/icing-array-storage.cc
+++ b/icing/legacy/index/icing-array-storage.cc
@@ -14,10 +14,10 @@
#include "icing/legacy/index/icing-array-storage.h"
-#include <inttypes.h>
#include <sys/mman.h>
#include <algorithm>
+#include <cinttypes>
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/core/icing-timer.h"
@@ -65,17 +65,13 @@ bool IcingArrayStorage::Init(int fd, size_t fd_offset, bool map_shared,
return false;
}
if (file_size < fd_offset) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Array storage file size %" PRIu64 " less than offset %zu", file_size,
- fd_offset);
+ ICING_LOG(ERROR) << "Array storage file size " << file_size << " less than offset " << fd_offset;
return false;
}
uint32_t capacity_num_elts = (file_size - fd_offset) / elt_size;
if (capacity_num_elts < num_elts) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Array storage num elts %u > capacity num elts %u", num_elts,
- capacity_num_elts);
+ ICING_LOG(ERROR) << "Array storage num elts " << num_elts << " > capacity num elts " << capacity_num_elts;
return false;
}
@@ -108,8 +104,7 @@ bool IcingArrayStorage::Init(int fd, size_t fd_offset, bool map_shared,
if (init_crc) {
*crc_ptr_ = crc;
} else if (crc != *crc_ptr_) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Array storage bad crc %u vs %u", crc, *crc_ptr_);
+ ICING_LOG(ERROR) << "Array storage bad crc " << crc << " vs " << *crc_ptr_;
goto failed;
}
}
@@ -276,9 +271,9 @@ void IcingArrayStorage::UpdateCrc() {
cur_offset += change.elt_len * elt_size_;
}
if (!changes_.empty()) {
- ICING_VLOG(2) << IcingStringUtil::StringPrintf(
- "Array update partial crcs %d truncated %d overlapped %d duplicate %d",
- num_partial_crcs, num_truncated, num_overlapped, num_duplicate);
+ ICING_VLOG(2) << "Array update partial crcs " << num_partial_crcs
+ << " truncated " << num_truncated << " overlapped " << num_overlapped
+ << " duplicate " << num_duplicate;
}
// Now update with grown area.
@@ -286,8 +281,7 @@ void IcingArrayStorage::UpdateCrc() {
cur_crc = IcingStringUtil::UpdateCrc32(
cur_crc, array_cast<char>() + changes_end_ * elt_size_,
(cur_num_ - changes_end_) * elt_size_);
- ICING_VLOG(2) << IcingStringUtil::StringPrintf(
- "Array update tail crc offset %u -> %u", changes_end_, cur_num_);
+ ICING_VLOG(2) << "Array update tail crc offset " << changes_end_ << " -> " << cur_num_;
}
// Clear, now that we've applied changes.
@@ -341,8 +335,7 @@ uint32_t IcingArrayStorage::Sync() {
if (pwrite(fd_, array() + dirty_start, dirty_end - dirty_start,
fd_offset_ + dirty_start) !=
static_cast<ssize_t>(dirty_end - dirty_start)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flushing pages failed (%u, %u)", dirty_start, dirty_end);
+ ICING_LOG(ERROR) << "Flushing pages failed (" << dirty_start << ", " << dirty_end << ")";
}
in_dirty = false;
} else if (!in_dirty && is_dirty) {
@@ -361,8 +354,7 @@ uint32_t IcingArrayStorage::Sync() {
if (pwrite(fd_, array() + dirty_start, dirty_end - dirty_start,
fd_offset_ + dirty_start) !=
static_cast<ssize_t>(dirty_end - dirty_start)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flushing pages failed (%u, %u)", dirty_start, dirty_end);
+ ICING_LOG(ERROR) << "Flushing pages failed (" << dirty_start << ", " << dirty_end << ")";
}
}
@@ -377,9 +369,7 @@ uint32_t IcingArrayStorage::Sync() {
}
if (num_flushed > 0) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Flushing %u/%u %u contiguous pages in %.3fms", num_flushed,
- dirty_pages_size, num_contiguous, timer.Elapsed() * 1000.);
+ ICING_VLOG(1) << "Flushing " << num_flushed << "/" << dirty_pages_size << " " << num_contiguous << " contiguous pages in " << timer.Elapsed() * 1000 << "ms.";
}
return num_flushed;
diff --git a/icing/legacy/index/icing-array-storage.h b/icing/legacy/index/icing-array-storage.h
index fad0565..0d93172 100644
--- a/icing/legacy/index/icing-array-storage.h
+++ b/icing/legacy/index/icing-array-storage.h
@@ -20,8 +20,7 @@
#ifndef ICING_LEGACY_INDEX_ICING_ARRAY_STORAGE_H_
#define ICING_LEGACY_INDEX_ICING_ARRAY_STORAGE_H_
-#include <stdint.h>
-
+#include <cstdint>
#include <string>
#include <vector>
diff --git a/icing/legacy/index/icing-bit-util.h b/icing/legacy/index/icing-bit-util.h
index 3273a68..d0c3f50 100644
--- a/icing/legacy/index/icing-bit-util.h
+++ b/icing/legacy/index/icing-bit-util.h
@@ -20,9 +20,8 @@
#ifndef ICING_LEGACY_INDEX_ICING_BIT_UTIL_H_
#define ICING_LEGACY_INDEX_ICING_BIT_UTIL_H_
-#include <stdint.h>
-#include <stdio.h>
-
+#include <cstdint>
+#include <cstdio>
#include <limits>
#include <vector>
diff --git a/icing/legacy/index/icing-common-types.h b/icing/legacy/index/icing-common-types.h
deleted file mode 100644
index 592b549..0000000
--- a/icing/legacy/index/icing-common-types.h
+++ /dev/null
@@ -1,129 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Copyright 2014 Google Inc. All Rights Reserved.
-// Author: sbanacho@google.com (Scott Banachowski)
-// Author: csyoung@google.com (C. Sean Young)
-
-#ifndef ICING_LEGACY_INDEX_ICING_COMMON_TYPES_H_
-#define ICING_LEGACY_INDEX_ICING_COMMON_TYPES_H_
-
-#include "icing/legacy/core/icing-core-types.h"
-
-// Protocol buffers are shared across several components.
-namespace com {
-namespace google {
-namespace android {
-namespace gms {
-namespace icing {
-namespace lib {
-
-class ClientFileGroup;
-class Document;
-class Document_Section;
-class DocumentStoreStatusProto;
-class IMEUpdate;
-class IMEUpdateResponse;
-class IndexCorpusScoringConfig;
-class IndexCorpusScoringConfig_Section;
-class IndexScoringConfig;
-class InitStatus;
-class InitStatus_CorpusInitInfo;
-class PendingDeleteUsageReport;
-class PhraseAffinityRequest;
-class QueryResponse;
-class QueryResponse_Corpus;
-class QueryResponse_Corpus_Section;
-class QueryResponse_Corpus_Tag;
-class QueryRequestSpec;
-class QueryRequestSpec_CorpusSpec;
-class QueryRequestSpec_SectionSpec;
-class ResponseDebugInfo;
-class ResultDebugInfo;
-class SectionConfig;
-class SuggestionResponse;
-class SuggestionResponse_Suggestion;
-class UsageReportsResponse;
-class UsageStats;
-class UsageStats_Corpus;
-
-} // namespace lib
-} // namespace icing
-} // namespace gms
-} // namespace android
-} // namespace google
-} // namespace com
-
-namespace icing {
-namespace lib {
-
-// Typedefs.
-using IcingDocId = uint32_t;
-
-using IcingSectionId = uint32_t;
-
-using IcingCorpusId = uint16_t;
-using IcingSectionIdMask = uint16_t;
-
-using IcingTagsCount = uint16_t;
-
-using IcingSequenceNumber = int64_t;
-
-using IcingScore = uint64_t;
-
-constexpr size_t kIcingMaxTokenLen = 30; // default shared between query
- // processor and indexer
-constexpr int kIcingQueryTermLimit = 50; // Maximum number of terms in a query
-constexpr int kIcingMaxVariantsPerToken = 10; // Maximum number of variants
-
-// LINT.IfChange
-constexpr int kIcingDocIdBits = 20; // 1M docs
-constexpr IcingDocId kIcingInvalidDocId = (1u << kIcingDocIdBits) - 1;
-constexpr IcingDocId kIcingMaxDocId = kIcingInvalidDocId - 1;
-// LINT.ThenChange(//depot/google3/wireless/android/icing/plx/google_sql_common_macros.sql)
-
-constexpr int kIcingDocScoreBits = 32;
-
-constexpr int kIcingSectionIdBits = 4; // 4 bits for 16 values
-constexpr IcingSectionId kIcingMaxSectionId = (1u << kIcingSectionIdBits) - 1;
-constexpr IcingSectionId kIcingInvalidSectionId = kIcingMaxSectionId + 1;
-constexpr IcingSectionIdMask kIcingSectionIdMaskAll = ~IcingSectionIdMask{0};
-constexpr IcingSectionIdMask kIcingSectionIdMaskNone = IcingSectionIdMask{0};
-
-constexpr int kIcingCorpusIdBits = 15; // 32K
-constexpr IcingCorpusId kIcingInvalidCorpusId = (1u << kIcingCorpusIdBits) - 1;
-constexpr IcingCorpusId kIcingMaxCorpusId = kIcingInvalidCorpusId - 1;
-
-constexpr size_t kIcingMaxSearchableDocumentSize = (1u << 16) - 1; // 64K
-// Max num tokens per document. 64KB is our original maximum (searchable)
-// document size. We clip if document exceeds this.
-constexpr uint32_t kIcingMaxNumTokensPerDoc =
- kIcingMaxSearchableDocumentSize / 5;
-constexpr uint32_t kIcingMaxNumHitsPerDocument =
- kIcingMaxNumTokensPerDoc * kIcingMaxVariantsPerToken;
-
-constexpr IcingTagsCount kIcingInvalidTagCount = ~IcingTagsCount{0};
-constexpr IcingTagsCount kIcingMaxTagCount = kIcingInvalidTagCount - 1;
-
-// Location refers to document storage.
-constexpr uint64_t kIcingInvalidLocation = ~uint64_t{0};
-constexpr uint64_t kIcingMaxDocStoreWriteLocation = uint64_t{1}
- << 32; // 4bytes.
-
-// Dump symbols in the proto namespace.
-using namespace ::com::google::android::gms::icing; // NOLINT(build/namespaces)
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_LEGACY_INDEX_ICING_COMMON_TYPES_H_
diff --git a/icing/legacy/index/icing-dynamic-trie.cc b/icing/legacy/index/icing-dynamic-trie.cc
index ee3d3a2..378b666 100644
--- a/icing/legacy/index/icing-dynamic-trie.cc
+++ b/icing/legacy/index/icing-dynamic-trie.cc
@@ -62,18 +62,20 @@
#include "icing/legacy/index/icing-dynamic-trie.h"
-#include <errno.h>
#include <fcntl.h>
-#include <inttypes.h>
-#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <algorithm>
+#include <cerrno>
+#include <cinttypes>
+#include <cstdint>
+#include <cstring>
#include <memory>
#include <utility>
+#include "icing/absl_ports/canonical_errors.h"
#include "icing/legacy/core/icing-packed-pod.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/core/icing-timer.h"
@@ -81,9 +83,11 @@
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/legacy/index/icing-flash-bitmap.h"
#include "icing/legacy/index/icing-mmapper.h"
+#include "icing/legacy/index/proto/icing-dynamic-trie-header.pb.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/logging.h"
#include "icing/util/math-util.h"
+#include "icing/util/status-macros.h"
using std::inplace_merge;
using std::lower_bound;
@@ -96,14 +100,22 @@ using std::vector;
namespace icing {
namespace lib {
+namespace {
+constexpr uint32_t kInvalidNodeIndex = (1U << 24) - 1;
+constexpr uint32_t kInvalidNextIndex = ~0U;
+
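+// Resets a Next entry to its sentinel state: val 0xff and kInvalidNodeIndex.
+// Valid entries compare less than this sentinel, so reset entries sort to the
+// end of a next array, which is what GetValidNextsSize relies on when
+// computing the number of valid entries.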
+void ResetMutableNext(IcingDynamicTrie::Next &mutable_next) {
+ mutable_next.set_val(0xff);
+ mutable_next.set_node_index(kInvalidNodeIndex);
+}
+} // namespace
+
// Based on the bit field widths.
const uint32_t IcingDynamicTrie::Options::kMaxNodes = (1U << 24) - 1;
const uint32_t IcingDynamicTrie::Options::kMaxNexts = (1U << 27) - 1;
const uint32_t IcingDynamicTrie::Options::kMaxSuffixesSize = 1U << 27;
const uint32_t IcingDynamicTrie::Options::kMaxValueSize = 1U << 16;
-const uint32_t IcingDynamicTrie::kInvalidNodeIndex = (1U << 24) - 1;
-const uint32_t IcingDynamicTrie::kInvalidNextIndex = ~0U;
const uint32_t IcingDynamicTrie::kInvalidSuffixIndex = ~0U;
const int IcingDynamicTrie::kMaxNextArraySize;
@@ -298,7 +310,7 @@ class IcingDynamicTrie::IcingDynamicTrieStorage {
// REQUIRES: nodes_left() > 0.
Node *AllocNode();
// REQUIRES: nexts_left() >= kMaxNextArraySize.
- Next *AllocNextArray(int size);
+ libtextclassifier3::StatusOr<Next *> AllocNextArray(int size);
void FreeNextArray(Next *next, int log2_size);
// REQUIRES: suffixes_left() >= strlen(suffix) + 1 + value_size()
uint32_t MakeSuffix(const char *suffix, const void *value,
@@ -383,6 +395,8 @@ class IcingDynamicTrie::IcingDynamicTrieStorage {
// storage.
IcingScopedFd array_fds_[NUM_ARRAY_TYPES];
std::vector<IcingArrayStorage> array_storage_;
+
+  // Legacy file system; this should eventually be switched to the new
+  // Filesystem class.
const IcingFilesystem *filesystem_;
};
@@ -449,8 +463,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Init() {
if (i == 0) {
// Header.
if (file_size != IcingMMapper::system_page_size()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Trie hdr wrong size: %" PRIu64, file_size);
+ ICING_LOG(ERROR) << "Trie hdr wrong size: " << file_size;
goto failed;
}
@@ -511,8 +524,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Init() {
sizeof(char), hdr_.hdr.suffixes_size(),
hdr_.hdr.max_suffixes_size(),
&crcs_->array_crcs[SUFFIX], init_crcs)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Trie mmap suffix failed");
+ ICING_LOG(ERROR) << "Trie mmap suffix failed";
goto failed;
}
@@ -660,8 +672,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Sync() {
}
if (!WriteHeader()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flushing trie header failed: %s", strerror(errno));
+ ICING_LOG(ERROR) << "Flushing trie header failed: " << strerror(errno);
success = false;
}
@@ -675,8 +686,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Sync() {
}
if (total_flushed > 0) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf("Flushing %u pages of trie",
- total_flushed);
+ ICING_VLOG(1) << "Flushing " << total_flushed << " pages of trie";
}
return success;
@@ -719,10 +729,11 @@ IcingDynamicTrie::Node *IcingDynamicTrie::IcingDynamicTrieStorage::AllocNode() {
return GetMutableNode(hdr_.hdr.num_nodes() - 1);
}
-IcingDynamicTrie::Next *
+libtextclassifier3::StatusOr<IcingDynamicTrie::Next *>
IcingDynamicTrie::IcingDynamicTrieStorage::AllocNextArray(int size) {
if (size > kMaxNextArraySize) {
- ICING_LOG(FATAL) << "Array size exceeds the max 'next' array size";
+ return absl_ports::InternalError(
+ "Array size exceeds the max 'next' array size");
}
if (nexts_left() < static_cast<uint32_t>(kMaxNextArraySize)) {
@@ -752,8 +763,7 @@ IcingDynamicTrie::IcingDynamicTrieStorage::AllocNextArray(int size) {
// Fill with char 0xff so we are sorted properly.
for (int i = 0; i < aligned_size; i++) {
- ret[i].set_val(0xff);
- ret[i].set_node_index(kInvalidNodeIndex);
+ ResetMutableNext(ret[i]);
}
return ret;
}
@@ -807,8 +817,7 @@ uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::UpdateCrc() {
uint32_t IcingDynamicTrie::IcingDynamicTrieStorage::UpdateCrcInternal(
bool write_hdr) {
if (write_hdr && !WriteHeader()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flushing trie header failed: %s", strerror(errno));
+ ICING_LOG(ERROR) << "Flushing trie header failed: " << strerror(errno);
}
crcs_->header_crc = GetHeaderCrc();
@@ -891,7 +900,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::Init(
bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::SerializeToArray(
uint8_t *buf, uint32_t buf_size) const {
- uint32_t size = hdr.ByteSize();
+ uint32_t size = hdr.ByteSizeLong();
if (size + sizeof(kMagic) + sizeof(uint32_t) > buf_size) return false;
memcpy(buf, &kMagic, sizeof(kMagic));
memcpy(buf + sizeof(kMagic), &size, sizeof(uint32_t));
@@ -902,8 +911,7 @@ bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::SerializeToArray(
bool IcingDynamicTrie::IcingDynamicTrieStorage::Header::Verify() {
// Check version.
if (hdr.version() != kCurVersion) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Trie version %u mismatch", hdr.version());
+ ICING_LOG(ERROR) << "Trie version " << hdr.version() << " mismatch";
return false;
}
@@ -1145,9 +1153,8 @@ bool IcingDynamicTrie::Sync() {
Warm();
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Syncing dynamic trie %s took %.3fms", filename_base_.c_str(),
- timer.Elapsed() * 1000.);
+ ICING_VLOG(1) << "Syncing dynamic trie " << filename_base_.c_str()
+ << " took " << timer.Elapsed() * 1000 << "ms";
return success;
}
@@ -1197,8 +1204,7 @@ std::unique_ptr<IcingFlashBitmap> IcingDynamicTrie::OpenAndInitBitmap(
const IcingFilesystem *filesystem) {
auto bitmap = std::make_unique<IcingFlashBitmap>(filename, filesystem);
if (!bitmap->Init() || (verify && !bitmap->Verify())) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Init of %s failed",
- filename.c_str());
+ ICING_LOG(ERROR) << "Init of " << filename.c_str() << " failed";
return nullptr;
}
return bitmap;
@@ -1228,16 +1234,14 @@ bool IcingDynamicTrie::InitPropertyBitmaps() {
vector<std::string> files;
if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
&files)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Could not get files at prefix %s", property_bitmaps_prefix_.c_str());
+ ICING_LOG(ERROR) << "Could not get files at prefix " << property_bitmaps_prefix_;
goto failed;
}
for (size_t i = 0; i < files.size(); i++) {
// Decode property id from filename.
size_t property_id_start_idx = files[i].rfind('.');
if (property_id_start_idx == std::string::npos) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Malformed filename %s",
- files[i].c_str());
+ ICING_LOG(ERROR) << "Malformed filename " << files[i];
continue;
}
property_id_start_idx++; // skip dot
@@ -1245,8 +1249,7 @@ bool IcingDynamicTrie::InitPropertyBitmaps() {
uint32_t property_id =
strtol(files[i].c_str() + property_id_start_idx, &end, 10); // NOLINT
if (!end || end != (files[i].c_str() + files[i].size())) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Malformed filename %s",
- files[i].c_str());
+ ICING_LOG(ERROR) << "Malformed filename " << files[i];
continue;
}
std::unique_ptr<IcingFlashBitmap> bitmap = OpenAndInitBitmap(
@@ -1254,8 +1257,7 @@ bool IcingDynamicTrie::InitPropertyBitmaps() {
runtime_options_.storage_policy == RuntimeOptions::kMapSharedWithCrc,
filesystem_);
if (!bitmap) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Open prop bitmap failed: %s", files[i].c_str());
+ ICING_LOG(ERROR) << "Open prop bitmap failed: " << files[i];
goto failed;
}
bitmap->Truncate(truncate_idx);
@@ -1299,50 +1301,6 @@ void IcingDynamicTrie::OnSleep() {
UpdateCrc();
}
-IcingDynamicTrie::NewValueMap::~NewValueMap() {}
-
-bool IcingDynamicTrie::Compact(
- const NewValueMap &old_tvi_to_new_value, IcingDynamicTrie *out,
- std::unordered_map<uint32_t, uint32_t> *old_to_new_tvi) const {
- if (old_to_new_tvi == nullptr) {
- ICING_LOG(ERROR) << "TVI is null";
- }
-
- if (!is_initialized()) {
- ICING_LOG(FATAL) << "DynamicTrie not initialized";
- }
-
- PropertyReadersAll prop_readers(*this);
-
- old_to_new_tvi->clear();
- old_to_new_tvi->rehash(size() * 2);
-
- for (Iterator it_all(*this, ""); it_all.IsValid(); it_all.Advance()) {
- uint32_t value_index = it_all.GetValueIndex();
- const void *new_value = old_tvi_to_new_value.GetNewValue(value_index);
- if (!new_value) continue;
-
- uint32_t new_value_index;
- if (!out->Insert(it_all.GetKey(), new_value, &new_value_index, false)) {
- return false;
- }
-
- old_to_new_tvi->insert({value_index, new_value_index});
-
- // Copy properties.
- for (size_t i = 0; i < prop_readers.size(); i++) {
- if (prop_readers.HasProperty(i, value_index)) {
- if (!out->SetProperty(new_value_index, i)) {
- // Ouch. We need to bail.
- return false;
- }
- }
- }
- }
-
- return true;
-}
-
uint32_t IcingDynamicTrie::size() const {
if (!is_initialized()) {
ICING_LOG(FATAL) << "DynamicTrie not initialized";
@@ -1350,10 +1308,12 @@ uint32_t IcingDynamicTrie::size() const {
return storage_->hdr().num_keys();
}
-void IcingDynamicTrie::CollectStatsRecursive(const Node &node,
- Stats *stats) const {
+void IcingDynamicTrie::CollectStatsRecursive(const Node &node, Stats *stats,
+ uint32_t depth) const {
if (node.is_leaf()) {
stats->num_leaves++;
+ stats->sum_depth += depth;
+ stats->max_depth = max(stats->max_depth, depth);
const char *suffix = storage_->GetSuffix(node.next_index());
stats->suffixes_used += strlen(suffix) + 1 + value_size();
if (!suffix[0]) {
@@ -1365,13 +1325,16 @@ void IcingDynamicTrie::CollectStatsRecursive(const Node &node,
for (; i < (1U << node.log2_num_children()); i++) {
const Next &next = *storage_->GetNext(node.next_index(), i);
if (next.node_index() == kInvalidNodeIndex) break;
- CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats);
+ CollectStatsRecursive(*storage_->GetNode(next.node_index()), stats,
+ depth + 1);
}
// At least one valid node in each next array
if (i == 0) {
ICING_LOG(FATAL) << "No valid node in 'next' array";
}
+ stats->sum_children += i;
+ stats->max_children = max(stats->max_children, i);
stats->child_counts[i - 1]++;
stats->wasted[node.log2_num_children()] +=
@@ -1453,9 +1416,12 @@ std::string IcingDynamicTrie::Stats::DumpStats(int verbosity) const {
"Wasted total: %u\n"
"Num intermediates %u num leaves %u "
"suffixes used %u null %u\n"
+ "avg and max children for intermediates: %.3f, %u\n"
+ "avg and max depth for leaves: %.3f, %u\n"
"Total next frag: %.3f%%\n",
total_wasted, num_intermediates, num_leaves, suffixes_used,
- null_suffixes,
+ null_suffixes, 1. * sum_children / num_intermediates, max_children,
+ 1. * sum_depth / num_leaves, max_depth,
100. * math_util::SafeDivide((total_free + total_wasted), num_nexts));
}
IcingStringUtil::SStringAppendF(
@@ -1502,9 +1468,56 @@ void IcingDynamicTrie::Clear() {
deleted_bitmap_->Truncate(0);
}
-bool IcingDynamicTrie::Insert(const char *key, const void *value,
- uint32_t *value_index, bool replace,
- bool *pnew_key) {
+bool IcingDynamicTrie::ClearSuffixAndValue(uint32_t suffix_value_index) {
+ // The size 1 below is for a '\0' between the suffix and the value.
+ size_t suffix_and_value_length =
+ strlen(this->storage_->GetSuffix(suffix_value_index)) + 1 +
+ this->value_size();
+ char *mutable_suffix_and_value = this->storage_->GetMutableSuffix(
+ suffix_value_index, suffix_and_value_length);
+
+ if (mutable_suffix_and_value == nullptr) {
+ return false;
+ }
+
+ memset(mutable_suffix_and_value, 0, suffix_and_value_length);
+ return true;
+}
+
+bool IcingDynamicTrie::ResetNext(uint32_t next_index) {
+ Next *mutable_next =
+ this->storage_->GetMutableNextArray(next_index, /*len=*/1);
+
+ if (mutable_next == nullptr) {
+ return false;
+ }
+ ResetMutableNext(*mutable_next);
+ return true;
+}
+
+bool IcingDynamicTrie::SortNextArray(const Node *node) {
+ if (node == nullptr) {
+    // Nothing to sort; return success directly.
+ return true;
+ }
+
+ uint32_t next_array_buffer_size = 1u << node->log2_num_children();
+ Next *next_array_start = this->storage_->GetMutableNextArray(
+ node->next_index(), next_array_buffer_size);
+
+ if (next_array_start == nullptr) {
+ return false;
+ }
+
+ std::sort(next_array_start, next_array_start + next_array_buffer_size);
+ return true;
+}
+
+libtextclassifier3::Status IcingDynamicTrie::Insert(const char *key,
+ const void *value,
+ uint32_t *value_index,
+ bool replace,
+ bool *pnew_key) {
if (!is_initialized()) {
ICING_LOG(FATAL) << "DynamicTrie not initialized";
}
@@ -1520,8 +1533,7 @@ bool IcingDynamicTrie::Insert(const char *key, const void *value,
if (!(storage_->nodes_left() >= 2 + key_len + 1 &&
storage_->nexts_left() >= 2 + key_len + 1 + kMaxNextArraySize &&
storage_->suffixes_left() >= key_len + 1 + value_size())) {
- // No more space left.
- return false;
+ return absl_ports::ResourceExhaustedError("No more space left");
}
uint32_t best_node_index;
@@ -1563,7 +1575,7 @@ bool IcingDynamicTrie::Insert(const char *key, const void *value,
storage_->GetSuffixIndex(prev_suffix_cur + 1), value_size());
memcpy(mutable_prev_suffix_cur, value, value_size());
}
- return true;
+ return libtextclassifier3::Status::OK;
}
if (*prev_suffix_cur == *key_cur) {
@@ -1577,7 +1589,7 @@ bool IcingDynamicTrie::Insert(const char *key, const void *value,
int common_len = prev_suffix_cur - prev_suffix;
for (int i = 0; i < common_len; i++) {
// Create a single-branch child node.
- Next *split_next = storage_->AllocNextArray(1);
+ ICING_ASSIGN_OR_RETURN(Next * split_next, storage_->AllocNextArray(1));
split_node->set_next_index(storage_->GetNextArrayIndex(split_next));
split_node->set_is_leaf(false);
split_node->set_log2_num_children(0);
@@ -1589,7 +1601,7 @@ bool IcingDynamicTrie::Insert(const char *key, const void *value,
}
// Fill a split.
- Next *split_next = storage_->AllocNextArray(2);
+ ICING_ASSIGN_OR_RETURN(Next * split_next, storage_->AllocNextArray(2));
split_node->set_next_index(storage_->GetNextArrayIndex(split_next));
split_node->set_is_leaf(false);
split_node->set_log2_num_children(1);
@@ -1641,17 +1653,14 @@ bool IcingDynamicTrie::Insert(const char *key, const void *value,
new_leaf_node->set_log2_num_children(0);
// Figure out the real length of the existing next array.
- Next *cur_next = storage_->GetMutableNextArray(
- best_node->next_index(), 1 << best_node->log2_num_children());
- int next_len = 0;
- for (; next_len < (1 << best_node->log2_num_children()) &&
- cur_next[next_len].node_index() != kInvalidNodeIndex;
- next_len++) {
- }
+ uint32_t next_array_buffer_size = 1u << best_node->log2_num_children();
+ Next *cur_next = storage_->GetMutableNextArray(best_node->next_index(),
+ next_array_buffer_size);
+ int next_len = GetValidNextsSize(cur_next, next_array_buffer_size);
Next *new_next = cur_next;
- if (next_len == (1 << best_node->log2_num_children())) {
+  if (next_len == next_array_buffer_size) {
// Allocate a new, larger, array.
- new_next = storage_->AllocNextArray(next_len + 1);
+ ICING_ASSIGN_OR_RETURN(new_next, storage_->AllocNextArray(next_len + 1));
memcpy(new_next, cur_next, sizeof(Next) * next_len);
}
@@ -1672,7 +1681,8 @@ bool IcingDynamicTrie::Insert(const char *key, const void *value,
// 8 == log2(256)
if (log2_num_children >= 8) {
- ICING_LOG(FATAL) << "Number of children exceeds the max allowed size";
+ return absl_ports::InternalError(
+ "Number of children exceeds the max allowed size");
}
mutable_best_node->set_log2_num_children(log2_num_children + 1);
@@ -1686,7 +1696,7 @@ bool IcingDynamicTrie::Insert(const char *key, const void *value,
storage_->inc_num_keys();
if (pnew_key) *pnew_key = true;
- return true;
+ return libtextclassifier3::Status::OK;
}
const void *IcingDynamicTrie::GetValueAtIndex(uint32_t value_index) const {
@@ -1735,11 +1745,12 @@ bool IcingDynamicTrie::Find(const char *key, void *value,
}
IcingDynamicTrie::Iterator::Iterator(const IcingDynamicTrie &trie,
- const char *prefix)
+ const char *prefix, bool reverse)
: cur_key_(prefix),
cur_suffix_(nullptr),
cur_suffix_len_(0),
single_leaf_match_(false),
+ reverse_(reverse),
trie_(trie) {
if (!trie.is_initialized()) {
ICING_LOG(FATAL) << "DynamicTrie not initialized";
@@ -1748,19 +1759,29 @@ IcingDynamicTrie::Iterator::Iterator(const IcingDynamicTrie &trie,
Reset();
}
-void IcingDynamicTrie::Iterator::LeftBranchToLeaf(uint32_t node_index) {
+void IcingDynamicTrie::Iterator::BranchToLeaf(uint32_t node_index,
+ BranchType branch_type) {
// Go down the trie, following the left-most child until we hit a
// leaf. Push to stack and cur_key nodes and chars as we go.
- for (; !trie_.storage_->GetNode(node_index)->is_leaf();
- node_index =
- trie_.storage_
- ->GetNext(trie_.storage_->GetNode(node_index)->next_index(), 0)
- ->node_index()) {
- branch_stack_.push_back(Branch(node_index));
- cur_key_.push_back(
- trie_.storage_
- ->GetNext(trie_.storage_->GetNode(node_index)->next_index(), 0)
- ->val());
+  // When branch_type is kRightMost, the method follows the right-most child
+  // instead.
+ const Node *node = trie_.storage_->GetNode(node_index);
+ while (!node->is_leaf()) {
+ const Next *next_start = trie_.storage_->GetNext(node->next_index(), 0);
+ int child_idx;
+ if (branch_type == BranchType::kRightMost) {
+ uint32_t next_array_size = 1u << node->log2_num_children();
+ child_idx = trie_.GetValidNextsSize(next_start, next_array_size) - 1;
+ } else {
+ // node isn't a leaf. So it must have >0 children.
+ // 0 is the left-most child.
+ child_idx = 0;
+ }
+ const Next &child_next = next_start[child_idx];
+ branch_stack_.push_back(Branch(node_index, child_idx));
+ cur_key_.push_back(child_next.val());
+
+ node_index = child_next.node_index();
+ node = trie_.storage_->GetNode(node_index);
}
// We're at a leaf.
@@ -1796,7 +1817,7 @@ void IcingDynamicTrie::Iterator::Reset() {
// Two cases/states:
//
// - Found an intermediate node. If we matched all of prefix
- // (cur_key_), LeftBranchToLeaf.
+ // (cur_key_), BranchToLeaf.
//
// - Found a leaf node, which is the ONLY matching key for this
// prefix. Check that suffix matches the prefix. Then we set
@@ -1819,7 +1840,9 @@ void IcingDynamicTrie::Iterator::Reset() {
cur_suffix_len_ = strlen(cur_suffix_);
single_leaf_match_ = true;
} else if (static_cast<size_t>(key_offset) == cur_key_.size()) {
- LeftBranchToLeaf(node_index);
+ BranchType branch_type =
+ (reverse_) ? BranchType::kRightMost : BranchType::kLeftMost;
+ BranchToLeaf(node_index, branch_type);
}
}
@@ -1846,19 +1869,25 @@ bool IcingDynamicTrie::Iterator::Advance() {
while (!branch_stack_.empty()) {
Branch *branch = &branch_stack_.back();
const Node *node = trie_.storage_->GetNode(branch->node_idx);
- branch->child_idx++;
- if (branch->child_idx < (1 << node->log2_num_children()) &&
- trie_.storage_->GetNext(node->next_index(), branch->child_idx)
- ->node_index() != kInvalidNodeIndex) {
- // Successfully incremented to the next child. Update the char
- // value at this depth.
- cur_key_[cur_key_.size() - 1] =
- trie_.storage_->GetNext(node->next_index(), branch->child_idx)->val();
- // We successfully found a sub-trie to explore.
- LeftBranchToLeaf(
- trie_.storage_->GetNext(node->next_index(), branch->child_idx)
- ->node_index());
- return true;
+ if (reverse_) {
+ branch->child_idx--;
+ } else {
+ branch->child_idx++;
+ }
+ if (branch->child_idx >= 0 &&
+ branch->child_idx < (1 << node->log2_num_children())) {
+ const Next *child_next =
+ trie_.storage_->GetNext(node->next_index(), branch->child_idx);
+ if (child_next->node_index() != kInvalidNodeIndex) {
+ // Successfully incremented to the next child. Update the char
+ // value at this depth.
+ cur_key_[cur_key_.size() - 1] = child_next->val();
+ // We successfully found a sub-trie to explore.
+ BranchType branch_type =
+ (reverse_) ? BranchType::kRightMost : BranchType::kLeftMost;
+ BranchToLeaf(child_next->node_index(), branch_type);
+ return true;
+ }
}
branch_stack_.pop_back();
cur_key_.resize(cur_key_.size() - 1);
@@ -2047,22 +2076,34 @@ const IcingDynamicTrie::Next *IcingDynamicTrie::GetNextByChar(
return found;
}
+int IcingDynamicTrie::GetValidNextsSize(
+ const IcingDynamicTrie::Next *next_array_start,
+ int next_array_length) const {
+ // Only searching for key char 0xff is not sufficient, as 0xff can be a valid
+ // character. We must also specify kInvalidNodeIndex as the target node index
+ // when searching the next array.
+ return LowerBound(next_array_start, next_array_start + next_array_length,
+ /*key_char=*/0xff, /*node_index=*/kInvalidNodeIndex) -
+ next_array_start;
+}
+
const IcingDynamicTrie::Next *IcingDynamicTrie::LowerBound(
- const Next *start, const Next *end, uint8_t key_char) const {
+ const Next *start, const Next *end, uint8_t key_char,
+ uint32_t node_index) const {
// Above this value will use binary search instead of linear
// search. 16 was chosen from running some benchmarks with
// different values.
static const uint32_t kBinarySearchCutoff = 16;
+ Next key_next(key_char, node_index);
if (end - start >= kBinarySearchCutoff) {
// Binary search.
- Next key_next(key_char, 0);
return lower_bound(start, end, key_next);
} else {
// Linear search.
const Next *found;
for (found = start; found < end; found++) {
- if (found->val() >= key_char) {
+ if (!(*found < key_next)) {
// Should have gotten match.
break;
}
@@ -2072,7 +2113,8 @@ const IcingDynamicTrie::Next *IcingDynamicTrie::LowerBound(
}
void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index,
- int *key_offset, bool prefix) const {
+ int *key_offset, bool prefix,
+ bool utf8) const {
// Find the best node such that:
//
// - If key is NOT in the trie, key[0..key_offset) is a prefix to
@@ -2093,6 +2135,8 @@ void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index,
const Node *cur_node = storage_->GetRootNode();
const char *cur_key = key;
+ const Node *utf8_node = cur_node;
+ const char *utf8_key = cur_key;
while (!cur_node->is_leaf()) {
const Next *found = GetNextByChar(cur_node, *cur_key);
if (!found) break;
@@ -2108,12 +2152,136 @@ void IcingDynamicTrie::FindBestNode(const char *key, uint32_t *best_node_index,
break;
}
cur_key++;
+
+ if (utf8 && i18n_utils::IsLeadUtf8Byte(*cur_key)) {
+ utf8_node = cur_node;
+ utf8_key = cur_key;
+ }
+ }
+
+ if (utf8) {
+ // Rewind.
+ cur_node = utf8_node;
+ cur_key = utf8_key;
}
*best_node_index = storage_->GetNodeIndex(cur_node);
*key_offset = reinterpret_cast<const char *>(cur_key) - key;
}
+int IcingDynamicTrie::FindNewBranchingPrefixLength(const char *key,
+ bool utf8) const {
+ if (storage_->empty()) {
+ return kNoBranchFound;
+ }
+
+ uint32_t best_node_index;
+ int key_offset;
+ FindBestNode(key, &best_node_index, &key_offset, /*prefix=*/true, utf8);
+ const Node *cur_node = storage_->GetNode(best_node_index);
+ const char *cur_key = key + key_offset;
+ if (cur_node->is_leaf()) {
+ // Prefix in the trie. Split at leaf.
+ const char *prev_suffix = storage_->GetSuffix(cur_node->next_index());
+ while (*prev_suffix != '\0' && *prev_suffix == *cur_key) {
+ prev_suffix++;
+ cur_key++;
+ }
+
+ // Equal strings? No branching.
+ if (*prev_suffix == '\0' && *cur_key == '\0') {
+ return kNoBranchFound;
+ }
+
+ if (utf8) {
+ // Rewind to utf8 boundary.
+ size_t offset = i18n_utils::SafeTruncateUtf8Length(key, cur_key - key);
+ cur_key = key + offset;
+ }
+
+ return cur_key - key;
+ } else if (cur_node->log2_num_children() == 0) {
+ // Intermediate node going from no branching to branching.
+ return cur_key - key;
+ }
+
+ // If we've reached this point, then we're already at a branch point. So there
+ // is no *new* branch point.
+ return kNoBranchFound;
+}
+
+std::vector<int> IcingDynamicTrie::FindBranchingPrefixLengths(const char *key,
+ bool utf8) const {
+ std::vector<int> prefix_lengths;
+
+ if (storage_->empty()) {
+ return prefix_lengths;
+ }
+
+ const Node *cur_node = storage_->GetRootNode();
+ const char *cur_key = key;
+ while (*cur_key && !cur_node->is_leaf()) {
+ // Branching prefix?
+ if (cur_node->log2_num_children() > 0) {
+ int len = cur_key - key;
+ if (utf8) {
+ // Do not cut mid-utf8. Walk up to utf8 boundary.
+ len = i18n_utils::SafeTruncateUtf8Length(key, len);
+ if (prefix_lengths.empty() || len != prefix_lengths.back()) {
+ prefix_lengths.push_back(len);
+ }
+ } else {
+ prefix_lengths.push_back(len);
+ }
+ }
+
+ // Move to next.
+ const Next *found = GetNextByChar(cur_node, *cur_key);
+ if (found == nullptr) {
+ break;
+ }
+ cur_node = storage_->GetNode(found->node_index());
+
+ ++cur_key;
+ }
+ return prefix_lengths;
+}
+
+bool IcingDynamicTrie::IsBranchingTerm(const char *key) const {
+ if (!is_initialized()) {
+ ICING_LOG(FATAL) << "DynamicTrie not initialized";
+ }
+
+ if (storage_->empty()) {
+ return false;
+ }
+
+ uint32_t best_node_index;
+ int key_offset;
+ FindBestNode(key, &best_node_index, &key_offset, /*prefix=*/true);
+ const Node *cur_node = storage_->GetNode(best_node_index);
+
+ if (cur_node->is_leaf()) {
+ return false;
+ }
+
+ // There is no intermediate node for key in the trie.
+ if (key[key_offset] != '\0') {
+ return false;
+ }
+
+ // Found key as an intermediate node, but key is not a valid term stored in
+ // the trie. In this case, we need at least two children for key to be a
+ // branching term.
+ if (GetNextByChar(cur_node, '\0') == nullptr) {
+ return cur_node->log2_num_children() >= 1;
+ }
+
+ // The intermediate node for key must have more than two children for key to
+ // be a branching term, one of which represents the leaf node for key itself.
+ return cur_node->log2_num_children() > 1;
+}
+
void IcingDynamicTrie::GetDebugInfo(int verbosity, std::string *out) const {
Stats stats;
CollectStats(&stats);
@@ -2123,8 +2291,7 @@ void IcingDynamicTrie::GetDebugInfo(int verbosity, std::string *out) const {
vector<std::string> files;
if (!filesystem_->GetMatchingFiles((property_bitmaps_prefix_ + "*").c_str(),
&files)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Could not get files at prefix %s", property_bitmaps_prefix_.c_str());
+ ICING_LOG(ERROR) << "Could not get files at prefix " << property_bitmaps_prefix_;
return;
}
for (size_t i = 0; i < files.size(); i++) {
@@ -2196,8 +2363,7 @@ IcingFlashBitmap *IcingDynamicTrie::OpenOrCreatePropertyBitmap(
}
if (property_id > kMaxPropertyId) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Property id %u out of range", property_id);
+ ICING_LOG(ERROR) << "Property id " << property_id << " out of range";
return nullptr;
}
@@ -2248,6 +2414,121 @@ bool IcingDynamicTrie::ClearDeleted(uint32_t value_index) {
return deleted_bitmap_->SetBit(idx, false);
}
+// Steps:
+// 1. Find the key in the trie.
+// 2. Remove the suffix and the value.
+// 3. Reset the nexts that point to the nodes to be removed.
+// 4. Sort any next array if needed.
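+//
+// Illustrative walk-through (hypothetical contents): deleting "bed" from a
+// trie holding {"bad", "be", "bed"} zeroes out the suffix and value of the
+// "bed" leaf, resets the next in the "be" node that pointed to that leaf, and
+// then re-sorts (and, here, shrinks) the "be" node's next array.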
+bool IcingDynamicTrie::Delete(std::string_view key) {
+ if (!is_initialized()) {
+ ICING_LOG(ERROR) << "DynamicTrie not initialized";
+ return false;
+ }
+
+ if (storage_->empty()) {
+ // Nothing to delete.
+ return true;
+ }
+
+ // Tries to find the key in the trie, starting from the root.
+ const Node *current_node = storage_->GetRootNode();
+
+ // The node after which we start to remove data.
+ const Node *last_multichild_node = nullptr;
+
+ // While visiting the trie nodes, we store the indices of Nexts that point
+ // to all the nodes after last_multichild_node. Those nodes must be
+ // consecutive and all have only one child. Resetting those Nexts means that
+ // we remove the data of the key.
+ std::vector<uint32_t> nexts_to_reset;
+ nexts_to_reset.reserve(key.length());
+
+ // Iterates through chars in the key, finds nodes in the trie until a leaf
+ // node is reached. The max number of loops is key.length() + 1 because we
+ // start from the root.
+ for (size_t i = 0; i <= key.length(); ++i) {
+ if (current_node->is_leaf()) {
+ // Leaf node, now check the suffix.
+ if (key.substr(i) != storage_->GetSuffix(current_node->next_index())) {
+ // Key does not exist in the trie, nothing to delete.
+ return true;
+ }
+ // Otherwise, key is found.
+ break;
+ }
+
+ // Finds the next char.
+ const Next *next;
+ if (i == key.length()) {
+ // When we're at the end of the key, the next char is the termination char
+ // '\0'.
+ next = GetNextByChar(current_node, '\0');
+ } else {
+ next = GetNextByChar(current_node, key[i]);
+ }
+
+ if (next == nullptr) {
+ // Key does not exist in the trie, nothing to delete.
+ return true;
+ }
+
+ // Checks the real size of next array.
+ uint32_t next_array_buffer_size = 1u << current_node->log2_num_children();
+ Next *next_array_start = storage_->GetMutableNextArray(
+ current_node->next_index(), next_array_buffer_size);
+ int valid_next_array_size =
+ GetValidNextsSize(next_array_start, next_array_buffer_size);
+ if (valid_next_array_size == 0) {
+ // Key does not exist in the trie, nothing to delete.
+ // This shouldn't happen, but we put a sanity check here in case something
+ // is wrong.
+ return true;
+ } else if (valid_next_array_size == 1) {
+ // Single-child branch will be deleted.
+ nexts_to_reset.push_back(storage_->GetNextArrayIndex(next));
+ } else {
+ // We see a new node with multiple children, all the previously seen nodes
+ // shouldn't be removed.
+ last_multichild_node = current_node;
+ nexts_to_reset.clear();
+ nexts_to_reset.push_back(storage_->GetNextArrayIndex(next));
+ }
+
+ // Updates current_node.
+ current_node = storage_->GetNode(next->node_index());
+ }
+ // Now we've found the key in the trie.
+
+ ClearSuffixAndValue(current_node->next_index());
+
+ // Resets nexts to remove key information.
+ for (uint32_t next_index : nexts_to_reset) {
+ ResetNext(next_index);
+ }
+
+ if (last_multichild_node != nullptr) {
+ SortNextArray(last_multichild_node);
+ uint32_t next_array_buffer_size =
+ 1u << last_multichild_node->log2_num_children();
+ Next *next_array_start = this->storage_->GetMutableNextArray(
+ last_multichild_node->next_index(), next_array_buffer_size);
+ uint32_t num_children =
+ GetValidNextsSize(next_array_start, next_array_buffer_size);
+    // Shrink the next array if the remaining children exactly fill half of
+    // the buffer.
+ if (num_children == next_array_buffer_size / 2) {
+ Node *mutable_node = storage_->GetMutableNode(
+ storage_->GetNodeIndex(last_multichild_node));
+ mutable_node->set_log2_num_children(mutable_node->log2_num_children() -
+ 1);
+ // Add the unused second half of the next array to the free list.
+ storage_->FreeNextArray(next_array_start + next_array_buffer_size / 2,
+ mutable_node->log2_num_children());
+ }
+ }
+
+ return true;
+}
+
bool IcingDynamicTrie::ClearPropertyForAllValues(uint32_t property_id) {
if (!is_initialized()) {
ICING_LOG(FATAL) << "DynamicTrie not initialized";
@@ -2255,8 +2536,7 @@ bool IcingDynamicTrie::ClearPropertyForAllValues(uint32_t property_id) {
PropertyReadersAll readers(*this);
if (!readers.Exists(property_id)) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Properties for id %u don't exist", property_id);
+ ICING_VLOG(1) << "Properties for id " << property_id << " don't exist";
return true;
}
diff --git a/icing/legacy/index/icing-dynamic-trie.h b/icing/legacy/index/icing-dynamic-trie.h
index 7136ef8..18748d7 100644
--- a/icing/legacy/index/icing-dynamic-trie.h
+++ b/icing/legacy/index/icing-dynamic-trie.h
@@ -35,13 +35,14 @@
#ifndef ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
#define ICING_LEGACY_INDEX_ICING_DYNAMIC_TRIE_H_
-#include <stdint.h>
-
+#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/legacy/core/icing-compat.h"
#include "icing/legacy/core/icing-packed-pod.h"
#include "icing/legacy/index/icing-filesystem.h"
@@ -153,8 +154,13 @@ class IcingDynamicTrie : public IIcingStorage {
uint32_t max_nodes;
// Count of intermediate nodes.
uint32_t num_intermediates;
+ // Total and maximum number of children of intermediate nodes.
+ uint32_t sum_children, max_children;
+
// Count of leaf nodes.
uint32_t num_leaves;
+ // Total and maximum depth of leaf nodes.
+ uint32_t sum_depth, max_depth;
// Next stats
@@ -187,6 +193,7 @@ class IcingDynamicTrie : public IIcingStorage {
uint32_t dirty_pages_nexts;
uint32_t dirty_pages_suffixes;
+ // TODO(b/222349894) Convert the string output to a protocol buffer instead.
std::string DumpStats(int verbosity) const;
};
@@ -288,6 +295,16 @@ class IcingDynamicTrie : public IIcingStorage {
// Empty out the trie without closing or removing.
void Clear();
+ // Clears the suffix and value at the given index. Returns true on success.
+ bool ClearSuffixAndValue(uint32_t suffix_value_index);
+
+ // Resets the next at the given index so that it points to no node.
+ // Returns true on success.
+ bool ResetNext(uint32_t next_index);
+
+ // Sorts the next array of the node. Returns true on success.
+ bool SortNextArray(const Node *node);
+
// Sync to disk.
bool Sync() override;
@@ -297,23 +314,6 @@ class IcingDynamicTrie : public IIcingStorage {
// Potentially about to get nuked.
void OnSleep() override;
- // Compact trie into out for value indices present in old_tvi_to_new_value.
- class NewValueMap {
- public:
- virtual ~NewValueMap();
-
- // Returns the new value we want to assign to the entry at old
- // value index. We don't take ownership of the pointer.
- virtual const void *GetNewValue(uint32_t old_value_index) const = 0;
- };
- // Compacts this trie. This drops all deleted keys, drops all keys for which
- // old_tvi_to_new_value returns nullptr, updates values to be the values
- // returned by old_tvi_to_new_value, rewrites tvis, and saves the results into
-  // the trie given in 'out'. 'old_to_new_tvi' is populated with a mapping of
- // old value_index to new value_index.
- bool Compact(const NewValueMap &old_tvi_to_new_value, IcingDynamicTrie *out,
- std::unordered_map<uint32_t, uint32_t> *old_to_new_tvi) const;
-
// Insert value at key. If key already exists and replace == true,
// replaces old value with value. We take a copy of value.
//
@@ -321,18 +321,22 @@ class IcingDynamicTrie : public IIcingStorage {
// value_index. This can then be used with SetValueAtIndex
// below. value_index is not valid past a Clear/Read/Write.
//
- // Returns false if there is no space left in the trie.
- //
// REQUIRES: value a buffer of size value_size()
- bool Insert(const char *key, const void *value) {
+ //
+ // Returns:
+ // OK on success
+ // RESOURCE_EXHAUSTED if no disk space is available
+ // INTERNAL_ERROR if there are inconsistencies in the dynamic trie.
+ libtextclassifier3::Status Insert(const char *key, const void *value) {
return Insert(key, value, nullptr, true, nullptr);
}
- bool Insert(const char *key, const void *value, uint32_t *value_index,
- bool replace) {
+ libtextclassifier3::Status Insert(const char *key, const void *value,
+ uint32_t *value_index, bool replace) {
return Insert(key, value, value_index, replace, nullptr);
}
- bool Insert(const char *key, const void *value, uint32_t *value_index,
- bool replace, bool *pnew_key);
+ libtextclassifier3::Status Insert(const char *key, const void *value,
+ uint32_t *value_index, bool replace,
+ bool *pnew_key);
// Get a value returned by Insert value_index. This points to the
// value in the trie. The pointer is immutable and always valid
@@ -375,6 +379,23 @@ class IcingDynamicTrie : public IIcingStorage {
bool is_full_match() const { return value_index != kInvalidValueIndex; }
};
+ static constexpr int kNoBranchFound = -1;
+  // Returns the length of the prefix at which a new branch would be created
+  // if key were inserted. If utf8 is true, does not cut key mid-utf8. Returns
+  // kNoBranchFound if no new branch would be created.
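+  //
+  // Illustrative example: if the trie holds only "abcd", then
+  // FindNewBranchingPrefixLength("abce", /*utf8=*/false) returns 3, because
+  // inserting "abce" would create a new branch after the prefix "abc".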
+ int FindNewBranchingPrefixLength(const char *key, bool utf8) const;
+
+  // Finds the lengths of all prefixes of key at which the trie branches,
+  // excluding the key itself. If utf8 is true, does not cut key mid-utf8.
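+  //
+  // Illustrative example: if the trie holds {"abc", "abd", "ac"}, then
+  // FindBranchingPrefixLengths("abcd", /*utf8=*/false) returns {1, 2}, since
+  // the trie branches after the prefixes "a" and "ab".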
+ std::vector<int> FindBranchingPrefixLengths(const char *key, bool utf8) const;
+
+  // Checks whether key is a branching term.
+  //
+  // key is a branching term if and only if there exist terms s1 and s2 in the
+  // trie such that key is the longest common prefix of s1 and s2, but s1 and
+  // s2 are not prefixes of each other.
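+  //
+  // Illustrative example: in a trie holding {"abc", "abd"}, "ab" is a
+  // branching term (the longest common prefix of "abc" and "abd"), but in a
+  // trie holding {"ab", "abc"} it is not, because "ab" is a prefix of "abc".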
+ bool IsBranchingTerm(const char *key) const;
+
void GetDebugInfo(int verbosity, std::string *out) const override;
double min_free_fraction() const;
@@ -402,6 +423,10 @@ class IcingDynamicTrie : public IIcingStorage {
// Clears the deleted property for each value.
bool ClearDeleted(uint32_t value_index);
+  // Deletes the entry associated with the key. Data cannot be recovered after
+  // the deletion. Returns true on success.
+ bool Delete(std::string_view key);
+
// Clear a specific property id from all values. For each value that has this
// property cleared, also check to see if it was the only property set; if
// so, set the deleted property for the value to indicate it no longer has any
@@ -479,9 +504,13 @@ class IcingDynamicTrie : public IIcingStorage {
// Not thread-safe.
//
// Change in underlying trie invalidates iterator.
+ //
+ // TODO(b/241784804): change IcingDynamicTrie::Iterator to follow the common
+ // iterator pattern in our codebase.
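+  //
+  // Illustrative usage sketch: iterate everything under "ab" in reverse
+  // order.
+  //   for (IcingDynamicTrie::Iterator it(trie, "ab", /*reverse=*/true);
+  //        it.IsValid(); it.Advance()) {
+  //     // ... it.GetKey(), it.GetValue() ...
+  //   }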
class Iterator {
public:
- Iterator(const IcingDynamicTrie &trie, const char *prefix);
+ Iterator(const IcingDynamicTrie &trie, const char *prefix,
+ bool reverse = false);
void Reset();
bool Advance();
@@ -498,9 +527,10 @@ class IcingDynamicTrie : public IIcingStorage {
Iterator();
// Copy is ok.
- // Helper function that takes the left-most branch down
- // intermediate nodes to a leaf.
- void LeftBranchToLeaf(uint32_t node_index);
+ enum class BranchType { kLeftMost = 0, kRightMost = 1 };
+ // Helper function that takes the left-most or the right-most branch down
+ // intermediate nodes to a leaf, based on branch_type.
+ void BranchToLeaf(uint32_t node_index, BranchType branch_type);
std::string cur_key_;
const char *cur_suffix_;
@@ -509,10 +539,12 @@ class IcingDynamicTrie : public IIcingStorage {
uint32_t node_idx;
int child_idx;
- explicit Branch(uint32_t ni) : node_idx(ni), child_idx(0) {}
+ explicit Branch(uint32_t node_index, int child_index)
+ : node_idx(node_index), child_idx(child_index) {}
};
std::vector<Branch> branch_stack_;
bool single_leaf_match_;
+ bool reverse_;
const IcingDynamicTrie &trie_;
};
@@ -569,24 +601,27 @@ class IcingDynamicTrie : public IIcingStorage {
class CandidateSet;
// For testing only.
+ friend class IcingDynamicTrieTest_TrieShouldRespectLimits_Test;
friend class IcingDynamicTrieTest_SyncErrorRecovery_Test;
friend class IcingDynamicTrieTest_BitmapsClosedWhenInitFails_Test;
void GetHeader(IcingDynamicTrieHeader *hdr) const;
void SetHeader(const IcingDynamicTrieHeader &new_hdr);
- static const uint32_t kInvalidNodeIndex;
- static const uint32_t kInvalidNextIndex;
static const uint32_t kInvalidSuffixIndex;
// Stats helpers.
- void CollectStatsRecursive(const Node &node, Stats *stats) const;
+ void CollectStatsRecursive(const Node &node, Stats *stats,
+ uint32_t depth = 0) const;
// Helpers for Find and Insert.
const Next *GetNextByChar(const Node *node, uint8_t key_char) const;
- const Next *LowerBound(const Next *start, const Next *end,
- uint8_t key_char) const;
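+  // Returns the first entry in [start, end) that does not compare less than
+  // (key_char, node_index); Next entries order by val and then by node_index.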
+ const Next *LowerBound(const Next *start, const Next *end, uint8_t key_char,
+ uint32_t node_index = 0) const;
+ // Returns the number of valid nexts in the array.
+ int GetValidNextsSize(const IcingDynamicTrie::Next *next_array_start,
+ int next_array_length) const;
void FindBestNode(const char *key, uint32_t *best_node_index, int *key_offset,
- bool prefix) const;
+ bool prefix, bool utf8 = false) const;
// For value properties. This truncates the data by clearing it, but leaving
// the storage intact.
diff --git a/icing/legacy/index/icing-dynamic-trie_test.cc b/icing/legacy/index/icing-dynamic-trie_test.cc
new file mode 100644
index 0000000..ec7e277
--- /dev/null
+++ b/icing/legacy/index/icing-dynamic-trie_test.cc
@@ -0,0 +1,1450 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/legacy/index/icing-dynamic-trie.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/hash/farmhash.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/random-string.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using testing::ContainerEq;
+using testing::ElementsAre;
+using testing::StrEq;
+
+constexpr std::string_view kKeys[] = {
+ "", "ab", "ac", "abd", "bac", "bb", "bacd", "abbb", "abcdefg",
+};
+constexpr uint32_t kNumKeys = ABSL_ARRAYSIZE(kKeys);
+
+class IcingDynamicTrieTest : public ::testing::Test {
+ protected:
+ // Output trie stats to stderr.
+ static void StatsDump(const IcingDynamicTrie& trie) {
+ IcingDynamicTrie::Stats stats;
+ trie.CollectStats(&stats);
+ DLOG(INFO) << "Stats:\n" << stats.DumpStats(true);
+ }
+
+ static void AddToTrie(IcingDynamicTrie* trie, uint32_t num_keys) {
+ std::string key;
+    for (uint32_t i = 0; i < num_keys; i++) {
+ key.clear();
+ IcingStringUtil::SStringAppendF(&key, 0, "%u+%010u", i % 2, i);
+ ASSERT_THAT(trie->Insert(key.c_str(), &i), IsOk());
+ }
+ }
+
+ static void CheckTrie(const IcingDynamicTrie& trie, uint32_t num_keys) {
+ std::string key;
+    for (uint32_t i = 0; i < num_keys; i++) {
+ key.clear();
+ IcingStringUtil::SStringAppendF(&key, 0, "%u+%010u", i % 2, i);
+ uint32_t val;
+ bool found = trie.Find(key.c_str(), &val);
+ EXPECT_TRUE(found);
+ EXPECT_EQ(i, val);
+ }
+ }
+
+ static void PrintTrie(const IcingDynamicTrie& trie) {
+ std::vector<std::string> keys;
+ std::ostringstream os;
+ DLOG(INFO) << "Trie:\n";
+ trie.DumpTrie(&os, &keys);
+ DLOG(INFO) << os.str();
+ }
+
+ void SetUp() override {
+ trie_files_dir_ = GetTestTempDir() + "/trie_files";
+ trie_files_prefix_ = trie_files_dir_ + "/test_file_";
+ }
+
+ void TearDown() override {
+ IcingFilesystem filesystem;
+ filesystem.DeleteDirectoryRecursively(trie_files_dir_.c_str());
+ }
+
+ std::string trie_files_dir_;
+ std::string trie_files_prefix_;
+};
+
+std::vector<std::pair<std::string, int>> RetrieveKeyValuePairs(
+ IcingDynamicTrie::Iterator& term_iter) {
+ std::vector<std::pair<std::string, int>> key_value;
+ for (; term_iter.IsValid(); term_iter.Advance()) {
+ uint32_t val;
+ memcpy(&val, term_iter.GetValue(), sizeof(val));
+ key_value.push_back(std::make_pair(term_iter.GetKey(), val));
+ }
+ return key_value;
+}
+
+constexpr std::string_view kCommonEnglishWords[] = {
+ "that", "was", "for", "on", "are", "with", "they", "be", "at",
+ "one", "have", "this", "from", "word", "but", "what", "some", "you",
+ "had", "the", "and", "can", "out", "other", "were", "which", "their",
+ "time", "will", "how", "said", "each", "tell", "may", "three"};
+constexpr uint32_t kCommonEnglishWordArrayLen =
+    ABSL_ARRAYSIZE(kCommonEnglishWords);
+
+TEST_F(IcingDynamicTrieTest, Simple) {
+ // Test simple key insertions.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ ASSERT_THAT(trie.Insert(kKeys[i].data(), &i), IsOk());
+
+ uint32_t val;
+ bool found = trie.Find(kKeys[i].data(), &val);
+ EXPECT_TRUE(found) << kKeys[i];
+ if (found) EXPECT_EQ(i, val) << kKeys[i] << " " << val;
+ }
+
+ EXPECT_EQ(trie.size(), kNumKeys);
+
+ StatsDump(trie);
+ std::vector<std::string> keys;
+ std::ostringstream os;
+ DLOG(INFO) << "Trie:\n";
+ trie.DumpTrie(&os, &keys);
+ DLOG(INFO) << os.str();
+ EXPECT_EQ(keys.size(), kNumKeys);
+}
+
+TEST_F(IcingDynamicTrieTest, Init) {
+ // Test create/init behavior.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ EXPECT_FALSE(trie.is_initialized());
+ EXPECT_FALSE(trie.Init());
+
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ EXPECT_TRUE(trie.Init());
+ EXPECT_TRUE(trie.is_initialized());
+}
+
+TEST_F(IcingDynamicTrieTest, Iterator) {
+ // Test iterator.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ ASSERT_THAT(trie.Insert(kKeys[i].data(), &i), IsOk());
+ }
+
+ // Should get the entire trie.
+ std::vector<std::pair<std::string, int>> exp_key_values = {
+ {"", 0}, {"ab", 1}, {"abbb", 7}, {"abcdefg", 8}, {"abd", 3},
+ {"ac", 2}, {"bac", 4}, {"bacd", 6}, {"bb", 5}};
+ IcingDynamicTrie::Iterator it_all(trie, "");
+ std::vector<std::pair<std::string, int>> key_values =
+ RetrieveKeyValuePairs(it_all);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Should get same results after calling Reset
+ it_all.Reset();
+ key_values = RetrieveKeyValuePairs(it_all);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Get everything under "a".
+ exp_key_values = {
+ {"ab", 1}, {"abbb", 7}, {"abcdefg", 8}, {"abd", 3}, {"ac", 2}};
+ IcingDynamicTrie::Iterator it1(trie, "a");
+ key_values = RetrieveKeyValuePairs(it1);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Should get same results after calling Reset
+ it1.Reset();
+ key_values = RetrieveKeyValuePairs(it1);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Now "b".
+ exp_key_values = {{"bac", 4}, {"bacd", 6}, {"bb", 5}};
+ IcingDynamicTrie::Iterator it2(trie, "b");
+ key_values = RetrieveKeyValuePairs(it2);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Should get same results after calling Reset
+ it2.Reset();
+ key_values = RetrieveKeyValuePairs(it2);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Get everything under "ab".
+ exp_key_values = {{"ab", 1}, {"abbb", 7}, {"abcdefg", 8}, {"abd", 3}};
+ IcingDynamicTrie::Iterator it3(trie, "ab");
+ key_values = RetrieveKeyValuePairs(it3);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Should get same results after calling Reset
+ it3.Reset();
+ key_values = RetrieveKeyValuePairs(it3);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Should match only one key exactly.
+ constexpr std::string_view kOneMatch[] = {
+ "abd",
+ "abcd",
+ "abcdef",
+ "abcdefg",
+ };
+ // With the following match:
+ constexpr std::string_view kOneMatchMatched[] = {
+ "abd",
+ "abcdefg",
+ "abcdefg",
+ "abcdefg",
+ };
+
+ for (size_t k = 0; k < ABSL_ARRAYSIZE(kOneMatch); k++) {
+ IcingDynamicTrie::Iterator it_single(trie, kOneMatch[k].data());
+ ASSERT_TRUE(it_single.IsValid()) << kOneMatch[k];
+ EXPECT_THAT(it_single.GetKey(), StrEq(kOneMatchMatched[k].data()));
+ EXPECT_FALSE(it_single.Advance()) << kOneMatch[k];
+ EXPECT_FALSE(it_single.IsValid()) << kOneMatch[k];
+
+ // Should get same results after calling Reset
+ it_single.Reset();
+ ASSERT_TRUE(it_single.IsValid()) << kOneMatch[k];
+ EXPECT_THAT(it_single.GetKey(), StrEq(kOneMatchMatched[k].data()));
+ EXPECT_FALSE(it_single.Advance()) << kOneMatch[k];
+ EXPECT_FALSE(it_single.IsValid()) << kOneMatch[k];
+ }
+
+ // Matches nothing.
+ constexpr std::string_view kNoMatch[] = {
+ "abbd",
+ "abcdeg",
+ "abcdefh",
+ };
+ for (size_t k = 0; k < ABSL_ARRAYSIZE(kNoMatch); k++) {
+ IcingDynamicTrie::Iterator it_empty(trie, kNoMatch[k].data());
+ EXPECT_FALSE(it_empty.IsValid());
+ it_empty.Reset();
+ EXPECT_FALSE(it_empty.IsValid());
+ }
+
+ // Clear.
+ trie.Clear();
+ EXPECT_FALSE(IcingDynamicTrie::Iterator(trie, "").IsValid());
+ EXPECT_EQ(0u, trie.size());
+ EXPECT_EQ(1.0, trie.min_free_fraction());
+}
+
+TEST_F(IcingDynamicTrieTest, IteratorReverse) {
+ // Test the reverse iterator.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ ASSERT_THAT(trie.Insert(kKeys[i].data(), &i), IsOk());
+ }
+
+ // Should get the entire trie.
+ std::vector<std::pair<std::string, int>> exp_key_values = {
+ {"bb", 5}, {"bacd", 6}, {"bac", 4}, {"ac", 2}, {"abd", 3},
+ {"abcdefg", 8}, {"abbb", 7}, {"ab", 1}, {"", 0}};
+ IcingDynamicTrie::Iterator it_all(trie, "", /*reverse=*/true);
+ std::vector<std::pair<std::string, int>> key_values =
+ RetrieveKeyValuePairs(it_all);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+ it_all.Reset();
+ key_values = RetrieveKeyValuePairs(it_all);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Get everything under "a".
+ exp_key_values = {
+ {"ac", 2}, {"abd", 3}, {"abcdefg", 8}, {"abbb", 7}, {"ab", 1}};
+ IcingDynamicTrie::Iterator it1(trie, "a", /*reverse=*/true);
+ key_values = RetrieveKeyValuePairs(it1);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Should get same results after calling Reset
+ it1.Reset();
+ key_values = RetrieveKeyValuePairs(it1);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Now "b".
+ exp_key_values = {{"bb", 5}, {"bacd", 6}, {"bac", 4}};
+ IcingDynamicTrie::Iterator it2(trie, "b", /*reverse=*/true);
+ key_values = RetrieveKeyValuePairs(it2);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Should get same results after calling Reset
+ it2.Reset();
+ key_values = RetrieveKeyValuePairs(it2);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Get everything under "ab".
+ exp_key_values = {{"abd", 3}, {"abcdefg", 8}, {"abbb", 7}, {"ab", 1}};
+ IcingDynamicTrie::Iterator it3(trie, "ab", /*reverse=*/true);
+ key_values = RetrieveKeyValuePairs(it3);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Should get same results after calling Reset
+ it3.Reset();
+ key_values = RetrieveKeyValuePairs(it3);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Should match only one key exactly.
+ constexpr std::string_view kOneMatch[] = {
+ "abd",
+ "abcd",
+ "abcdef",
+ "abcdefg",
+ };
+ // With the following match:
+ constexpr std::string_view kOneMatchMatched[] = {
+ "abd",
+ "abcdefg",
+ "abcdefg",
+ "abcdefg",
+ };
+
+ for (size_t k = 0; k < ABSL_ARRAYSIZE(kOneMatch); k++) {
+ IcingDynamicTrie::Iterator it_single(trie, kOneMatch[k].data(),
+ /*reverse=*/true);
+ ASSERT_TRUE(it_single.IsValid()) << kOneMatch[k];
+ EXPECT_THAT(it_single.GetKey(), StrEq(kOneMatchMatched[k].data()));
+ EXPECT_FALSE(it_single.Advance()) << kOneMatch[k];
+ EXPECT_FALSE(it_single.IsValid()) << kOneMatch[k];
+
+ // Should get same results after calling Reset
+ it_single.Reset();
+ ASSERT_TRUE(it_single.IsValid()) << kOneMatch[k];
+ EXPECT_THAT(it_single.GetKey(), StrEq(kOneMatchMatched[k].data()));
+ EXPECT_FALSE(it_single.Advance()) << kOneMatch[k];
+ EXPECT_FALSE(it_single.IsValid()) << kOneMatch[k];
+ }
+
+ // Matches nothing.
+ constexpr std::string_view kNoMatch[] = {
+ "abbd",
+ "abcdeg",
+ "abcdefh",
+ };
+ for (size_t k = 0; k < ABSL_ARRAYSIZE(kNoMatch); k++) {
+ IcingDynamicTrie::Iterator it_empty(trie, kNoMatch[k].data(),
+ /*reverse=*/true);
+ EXPECT_FALSE(it_empty.IsValid());
+ it_empty.Reset();
+ EXPECT_FALSE(it_empty.IsValid());
+ }
+
+ // Clear.
+ trie.Clear();
+ EXPECT_FALSE(
+ IcingDynamicTrie::Iterator(trie, "", /*reverse=*/true).IsValid());
+ EXPECT_EQ(0u, trie.size());
+ EXPECT_EQ(1.0, trie.min_free_fraction());
+}
+
+TEST_F(IcingDynamicTrieTest, IteratorLoadTest) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ std::default_random_engine random;
+ ICING_LOG(ERROR) << "Seed: " << std::default_random_engine::default_seed;
+
+ std::vector<std::pair<std::string, int>> exp_key_values;
+ // Randomly generate 1024 terms.
+ for (int i = 0; i < 1024; ++i) {
+ std::string term = RandomString("abcdefg", 5, &random) + std::to_string(i);
+ ASSERT_THAT(trie.Insert(term.c_str(), &i), IsOk());
+ exp_key_values.push_back(std::make_pair(term, i));
+ }
+ // Lexicographically sort the expected keys.
+ std::sort(exp_key_values.begin(), exp_key_values.end());
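+ // (The iterator is expected to visit keys in lexicographic order, and the
+ // appended index makes every term unique, so sorting the pairs by term is
+ // enough to match it.)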
+
+ // Check that the iterator works.
+ IcingDynamicTrie::Iterator term_iter(trie, /*prefix=*/"");
+ std::vector<std::pair<std::string, int>> key_values =
+ RetrieveKeyValuePairs(term_iter);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Check that Reset works.
+ term_iter.Reset();
+ key_values = RetrieveKeyValuePairs(term_iter);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ std::reverse(exp_key_values.begin(), exp_key_values.end());
+ // Check that the reverse iterator works.
+ IcingDynamicTrie::Iterator term_iter_reverse(trie, /*prefix=*/"",
+ /*reverse=*/true);
+ key_values = RetrieveKeyValuePairs(term_iter_reverse);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+
+ // Check that Reset works.
+ term_iter_reverse.Reset();
+ key_values = RetrieveKeyValuePairs(term_iter_reverse);
+ EXPECT_THAT(key_values, ContainerEq(exp_key_values));
+}
+
+TEST_F(IcingDynamicTrieTest, Persistence) {
+ // Test persistence on the English dictionary.
+ IcingFilesystem filesystem;
+ {
+ // Test with a trie containing the strings in kCommonEnglishWords. The
+ // test will fail if the words are not unique.
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ EXPECT_FALSE(trie.Init());
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ for (uint32_t i = 0; i < kCommonEnglishWordArrayLen; i++) {
+ ASSERT_THAT(trie.Insert(kCommonEnglishWords[i].data(), &i), IsOk());
+ }
+ // Explicitly omit sync.
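+ // Since nothing was synced, the trie reopened in the next block is
+ // expected to come up empty, and the words are inserted again.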
+
+ StatsDump(trie);
+ }
+
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Init());
+ EXPECT_EQ(0U, trie.size());
+
+ for (uint32_t i = 0; i < kCommonEnglishWordArrayLen; i++) {
+ ASSERT_THAT(trie.Insert(kCommonEnglishWords[i].data(), &i), IsOk());
+ }
+ trie.Sync();
+
+ StatsDump(trie);
+ }
+
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Init());
+
+ // Make sure we can find everything with the right value.
+ uint32_t found_count = 0;
+ uint32_t matched_count = 0;
+ for (size_t i = 0; i < kCommonEnglishWordArrayLen; i++) {
+ uint32_t val;
+ bool found = trie.Find(kCommonEnglishWords[i].data(), &val);
+ if (found) {
+ found_count++;
+ if (i == val) {
+ matched_count++;
+ }
+ }
+ }
+ EXPECT_EQ(found_count, kCommonEnglishWordArrayLen);
+ EXPECT_EQ(matched_count, kCommonEnglishWordArrayLen);
+
+ StatsDump(trie);
+ }
+}
+
+TEST_F(IcingDynamicTrieTest, PersistenceShared) {
+ // Test persistence on the English dictionary.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie::RuntimeOptions ropt;
+
+ {
+ // Test with a trie containing the strings in kCommonEnglishWords. The
+ // test will fail if the words are not unique.
+ ropt.storage_policy = IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc;
+ IcingDynamicTrie trie(trie_files_prefix_, ropt, &filesystem);
+ EXPECT_FALSE(trie.Init());
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ uint32_t next_reopen = kCommonEnglishWordArrayLen / 16;
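+ // The reopen points grow geometrically (len/16, then x1.5 each time), so
+ // the trie is CRC-updated, closed, and re-initialized several times while
+ // it is being built.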
+ for (uint32_t i = 0; i < kCommonEnglishWordArrayLen; i++) {
+ ASSERT_THAT(trie.Insert(kCommonEnglishWords[i].data(), &i), IsOk());
+
+ if (i == next_reopen) {
+ ASSERT_NE(0u, trie.UpdateCrc());
+ trie.Close();
+ ASSERT_TRUE(trie.Init());
+
+ next_reopen += next_reopen / 2;
+ }
+ }
+ // Explicitly omit sync; the map-shared storage policy should persist
+ // automatically.
+
+ StatsDump(trie);
+ }
+
+ // Go back and forth between the two policies.
+ for (int i = 0; i < 5; i++) {
+ if (i % 2 == 0) {
+ DLOG(INFO) << "Opening with map shared";
+ ropt.storage_policy = IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc;
+ } else {
+ DLOG(INFO) << "Opening with explicit flush";
+ ropt.storage_policy = IcingDynamicTrie::RuntimeOptions::kExplicitFlush;
+ }
+ IcingDynamicTrie trie(trie_files_prefix_, ropt, &filesystem);
+ ASSERT_TRUE(trie.Init());
+
+ // Make sure we can find everything with the right value.
+ uint32_t found_count = 0;
+ uint32_t matched_count = 0;
+ for (size_t i = 0; i < kCommonEnglishWordArrayLen; i++) {
+ uint32_t val;
+ bool found = trie.Find(kCommonEnglishWords[i].data(), &val);
+ if (found) {
+ found_count++;
+ if (i == val) {
+ matched_count++;
+ }
+ }
+ }
+ EXPECT_EQ(found_count, kCommonEnglishWordArrayLen);
+ EXPECT_EQ(matched_count, kCommonEnglishWordArrayLen);
+
+ StatsDump(trie);
+ }
+
+ // Clear and re-open.
+ ropt.storage_policy = IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc;
+ IcingDynamicTrie trie(trie_files_prefix_, ropt, &filesystem);
+ ASSERT_TRUE(trie.Init());
+ trie.Clear();
+ trie.Close();
+ ASSERT_TRUE(trie.Init());
+}
+
+TEST_F(IcingDynamicTrieTest, Sync) {
+ IcingFilesystem filesystem;
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ ASSERT_THAT(trie.Insert(kKeys[i].data(), &i), IsOk());
+
+ uint32_t val;
+ bool found = trie.Find(kKeys[i].data(), &val);
+ EXPECT_TRUE(found) << kKeys[i];
+ if (found) EXPECT_EQ(i, val) << kKeys[i] << " " << val;
+ }
+
+ StatsDump(trie);
+ PrintTrie(trie);
+
+ trie.Sync();
+
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ uint32_t val;
+ bool found = trie.Find(kKeys[i].data(), &val);
+ EXPECT_TRUE(found) << kKeys[i];
+ if (found) EXPECT_EQ(i, val) << kKeys[i] << " " << val;
+ }
+ }
+
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Init());
+
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ uint32_t val;
+ bool found = trie.Find(kKeys[i].data(), &val);
+ EXPECT_TRUE(found) << kKeys[i];
+ if (found) EXPECT_EQ(i, val) << kKeys[i] << " " << val;
+ }
+
+ StatsDump(trie);
+ PrintTrie(trie);
+ }
+}
+
+TEST_F(IcingDynamicTrieTest, LimitsZero) {
+ // Don't crash if we set limits to 0.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_FALSE(trie.CreateIfNotExist(IcingDynamicTrie::Options(0, 0, 0, 0)));
+}
+
+TEST_F(IcingDynamicTrieTest, LimitsSmall) {
+ // Test limits with a few keys.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(
+ IcingDynamicTrie::Options(10, 300, 30, sizeof(uint32_t))));
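+ // I.e. at most 10 nodes, 300 nexts, 30 bytes of suffixes, and 4-byte
+ // values (same parameter order as the annotated Options calls in
+ // TrieShouldRespectLimits below).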
+ ASSERT_TRUE(trie.Init());
+
+ ASSERT_LT(3U, kNumKeys);
+
+ for (uint32_t i = 0; i < 3; i++) {
+ ASSERT_THAT(trie.Insert(kKeys[i].data(), &i), IsOk()) << i;
+
+ uint32_t val;
+ bool found = trie.Find(kKeys[i].data(), &val);
+ EXPECT_TRUE(found) << kKeys[i];
+ if (found) EXPECT_EQ(i, val) << kKeys[i] << " " << val;
+ }
+
+ uint32_t val = 3;
+ EXPECT_THAT(trie.Insert(kKeys[3].data(), &val),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+
+ StatsDump(trie);
+ PrintTrie(trie);
+}
+
+TEST_F(IcingDynamicTrieTest, DISABLEDFingerprintedKeys) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie::Options options(4 << 20, 4 << 20, 20 << 20,
+ sizeof(uint32_t));
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(options));
+ ASSERT_TRUE(trie.Init());
+ IcingDynamicTrie triefp(trie_files_prefix_ + ".fps",
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(triefp.CreateIfNotExist(options));
+ ASSERT_TRUE(triefp.Init());
+
+ static const uint32_t kNumKeys = 1000000;
+ std::string key;
+ for (uint32_t i = 0; i < kNumKeys; i++) {
+ key.clear();
+ IcingStringUtil::SStringAppendF(
+ &key, 1000, "content://gmail-ls/account/conversation/%u/message/%u", i,
+ 10 * i);
+ ASSERT_THAT(trie.Insert(key.c_str(), &i), IsOk());
+
+ // Now compute a fingerprint.
+ uint64_t fpkey = tc3farmhash::Fingerprint64(key);
+
+ // Convert to base 255, since keys in the trie cannot contain the byte 0.
+ uint8_t fpkey_base255[9];
+ for (int j = 0; j < 8; j++) {
+ fpkey_base255[j] = (fpkey % 255) + 1;
+ fpkey /= 255;
+ }
+ fpkey_base255[8] = '\0';
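+ // Note that 255^8 is slightly less than 2^64, so this encoding drops the
+ // topmost bits of the fingerprint; collisions remain unlikely at this
+ // scale.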
+ ASSERT_THAT(triefp.Insert(reinterpret_cast<const char*>(fpkey_base255), &i),
+ IsOk());
+
+ // Sync periodically to gauge write locality.
+ if ((i + 1) % (kNumKeys / 10) == 0) {
+ DLOG(INFO) << "Trie sync";
+ trie.Sync();
+ DLOG(INFO) << "Trie fp sync";
+ triefp.Sync();
+ }
+ }
+
+ DLOG(INFO) << "Trie stats";
+ StatsDump(trie);
+ DLOG(INFO) << "Trie fp stats";
+ StatsDump(triefp);
+}
+
+TEST_F(IcingDynamicTrieTest, AddDups) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ static const uint32_t kNumKeys = 5000;
+ AddToTrie(&trie, kNumKeys);
+ CheckTrie(trie, kNumKeys);
+
+ DLOG(INFO) << "Trie stats";
+ StatsDump(trie);
+
+ AddToTrie(&trie, kNumKeys);
+ CheckTrie(trie, kNumKeys);
+ DLOG(INFO) << "Trie stats";
+ StatsDump(trie);
+}
+
+TEST_F(IcingDynamicTrieTest, Properties) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ static const uint32_t kOne = 1;
+ uint32_t val_idx;
+ ICING_ASSERT_OK(trie.Insert("abcd", &kOne, &val_idx, false));
+ trie.SetProperty(val_idx, 0);
+ trie.SetProperty(val_idx, 3);
+
+ {
+ IcingDynamicTrie::PropertyReader reader(trie, 3);
+ ASSERT_TRUE(reader.Exists());
+ EXPECT_TRUE(reader.HasProperty(val_idx));
+ EXPECT_FALSE(reader.HasProperty(1000));
+ }
+
+ // Disappear after close.
+ trie.Close();
+ ASSERT_TRUE(trie.Init());
+ {
+ IcingDynamicTrie::PropertyReader reader(trie, 3);
+ EXPECT_FALSE(reader.HasProperty(val_idx));
+ }
+
+ // Persist after sync.
+ ICING_ASSERT_OK(trie.Insert("abcd", &kOne, &val_idx, false));
+ trie.SetProperty(val_idx, 1);
+ ASSERT_TRUE(trie.Sync());
+ trie.Close();
+ ASSERT_TRUE(trie.Init());
+
+ uint32_t val;
+ ASSERT_TRUE(trie.Find("abcd", &val, &val_idx));
+ EXPECT_EQ(1u, val);
+ {
+ IcingDynamicTrie::PropertyReader reader(trie, 1);
+ EXPECT_TRUE(reader.HasProperty(val_idx));
+ }
+
+ // Get all.
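+ // PropertyReadersAll apparently spans ids 0 through the highest id ever
+ // set; ids 0, 1, and 3 were set above, so there are four slots and id 2
+ // never existed.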
+ {
+ IcingDynamicTrie::PropertyReadersAll readers(trie);
+ ASSERT_EQ(4u, readers.size());
+ EXPECT_TRUE(readers.Exists(0));
+ EXPECT_TRUE(readers.Exists(1));
+ EXPECT_FALSE(readers.Exists(2));
+ EXPECT_TRUE(readers.Exists(3));
+ }
+}
+
+TEST_F(IcingDynamicTrieTest, ClearSingleProperty) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ static const uint32_t kOne = 1;
+ uint32_t val_idx[3];
+ ICING_ASSERT_OK(trie.Insert("abcd", &kOne, &val_idx[0], false));
+ trie.SetProperty(val_idx[0], 0);
+ trie.SetProperty(val_idx[0], 3);
+
+ ICING_ASSERT_OK(trie.Insert("efgh", &kOne, &val_idx[1], false));
+ trie.SetProperty(val_idx[1], 0);
+ trie.SetProperty(val_idx[1], 3);
+
+ ICING_ASSERT_OK(trie.Insert("ijkl", &kOne, &val_idx[2], false));
+ trie.SetProperty(val_idx[2], 0);
+ trie.SetProperty(val_idx[2], 3);
+
+ {
+ IcingDynamicTrie::PropertyReadersAll readers(trie);
+ ASSERT_EQ(4u, readers.size());
+ EXPECT_TRUE(readers.Exists(0));
+ EXPECT_FALSE(readers.Exists(1));
+ EXPECT_FALSE(readers.Exists(2));
+ EXPECT_TRUE(readers.Exists(3));
+ for (size_t i = 0; i < readers.size(); i++) {
+ if (readers.Exists(i)) {
+ for (size_t j = 0; j < sizeof(val_idx) / sizeof(uint32_t); ++j) {
+ EXPECT_TRUE(readers.HasProperty(i, val_idx[j]));
+ }
+ }
+ }
+ }
+
+ EXPECT_TRUE(trie.ClearPropertyForAllValues(3));
+
+ {
+ IcingDynamicTrie::PropertyReadersAll readers(trie);
+ ASSERT_EQ(4u, readers.size());
+ EXPECT_TRUE(readers.Exists(0));
+ EXPECT_FALSE(readers.Exists(1));
+ EXPECT_FALSE(readers.Exists(2));
+ // Clearing property 3 for all values deletes its bitmap entirely, so it
+ // no longer exists.
+ EXPECT_FALSE(readers.Exists(3));
+ for (size_t i = 0; i < readers.size(); i++) {
+ for (size_t j = 0; j < sizeof(val_idx) / sizeof(uint32_t); ++j) {
+ if (i == 0) {
+ EXPECT_TRUE(readers.HasProperty(i, val_idx[j]));
+ } else {
+ EXPECT_FALSE(readers.HasProperty(i, val_idx[j]));
+ }
+ }
+ }
+ }
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWhenRootIsLeaf) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts a key, the root is a leaf.
+ uint32_t value = 1;
+ ASSERT_THAT(trie.Insert("foo", &value), IsOk());
+ ASSERT_TRUE(trie.Find("foo", &value));
+
+ // Deletes the key.
+ EXPECT_TRUE(trie.Delete("foo"));
+ EXPECT_FALSE(trie.Find("foo", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWhenLastCharIsLeaf) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts "bar" and "ba", the trie structure looks like:
+ // root
+ // |
+ // b
+ // |
+ // a
+ // / \
+ // null r
+ uint32_t value = 1;
+ ASSERT_THAT(trie.Insert("bar", &value), IsOk());
+ ASSERT_THAT(trie.Insert("ba", &value), IsOk());
+ ASSERT_TRUE(trie.Find("bar", &value));
+ ASSERT_TRUE(trie.Find("ba", &value));
+
+ // Deletes "bar". "r" is a leaf node in the trie.
+ EXPECT_TRUE(trie.Delete("bar"));
+ EXPECT_FALSE(trie.Find("bar", &value));
+ EXPECT_TRUE(trie.Find("ba", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithTerminationNode) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts "bar" and "ba", the trie structure looks like:
+ // root
+ // |
+ // b
+ // |
+ // a
+ // / \
+ // null r
+ uint32_t value = 1;
+ ASSERT_THAT(trie.Insert("bar", &value), IsOk());
+ ASSERT_THAT(trie.Insert("ba", &value), IsOk());
+ ASSERT_TRUE(trie.Find("bar", &value));
+ ASSERT_TRUE(trie.Find("ba", &value));
+
+ // Deletes "ba" which is a key with termination node in the trie.
+ EXPECT_TRUE(trie.Delete("ba"));
+ EXPECT_FALSE(trie.Find("ba", &value));
+ EXPECT_TRUE(trie.Find("bar", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithMultipleNexts) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts "ba", "bb", "bc", and "bd", the trie structure looks like:
+ // root
+ // |
+ // b
+ // / | | \
+ // a b c d
+ uint32_t value = 1;
+ ASSERT_THAT(trie.Insert("ba", &value), IsOk());
+ ASSERT_THAT(trie.Insert("bb", &value), IsOk());
+ ASSERT_THAT(trie.Insert("bc", &value), IsOk());
+ ASSERT_THAT(trie.Insert("bd", &value), IsOk());
+ ASSERT_TRUE(trie.Find("ba", &value));
+ ASSERT_TRUE(trie.Find("bb", &value));
+ ASSERT_TRUE(trie.Find("bc", &value));
+ ASSERT_TRUE(trie.Find("bd", &value));
+
+ // Deletes "bc".
+ EXPECT_TRUE(trie.Delete("bc"));
+ EXPECT_FALSE(trie.Find("bc", &value));
+ EXPECT_TRUE(trie.Find("ba", &value));
+ EXPECT_TRUE(trie.Find("bb", &value));
+ EXPECT_TRUE(trie.Find("bd", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionShouldWorkWithMultipleTrieBranches) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts "batter", "battle", and "bar", the trie structure looks like:
+ // root
+ // |
+ // b
+ // |
+ // a
+ // / \
+ // t r
+ // |
+ // t
+ // / \
+ // e l
+ // | |
+ // r e
+ uint32_t value = 1;
+ ASSERT_THAT(trie.Insert("batter", &value), IsOk());
+ ASSERT_THAT(trie.Insert("battle", &value), IsOk());
+ ASSERT_THAT(trie.Insert("bar", &value), IsOk());
+ ASSERT_TRUE(trie.Find("batter", &value));
+ ASSERT_TRUE(trie.Find("battle", &value));
+ ASSERT_TRUE(trie.Find("bar", &value));
+
+ // Deletes "batter".
+ EXPECT_TRUE(trie.Delete("batter"));
+ EXPECT_FALSE(trie.Find("batter", &value));
+ EXPECT_TRUE(trie.Find("battle", &value));
+ EXPECT_TRUE(trie.Find("bar", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, InsertionShouldWorkAfterDeletion) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts some keys.
+ uint32_t value = 1;
+ ASSERT_THAT(trie.Insert("bar", &value), IsOk());
+ ASSERT_THAT(trie.Insert("bed", &value), IsOk());
+ ASSERT_THAT(trie.Insert("foo", &value), IsOk());
+
+ // Deletes a key
+ ASSERT_TRUE(trie.Delete("bed"));
+ ASSERT_FALSE(trie.Find("bed", &value));
+
+ // Inserts after deletion
+ ASSERT_THAT(trie.Insert("bed", &value), IsOk());
+ ASSERT_THAT(trie.Insert("bedroom", &value), IsOk());
+ EXPECT_TRUE(trie.Find("bed", &value));
+ EXPECT_TRUE(trie.Find("bedroom", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, IteratorShouldWorkAfterDeletion) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts some keys.
+ uint32_t value = 1;
+ ASSERT_THAT(trie.Insert("bar", &value), IsOk());
+ ASSERT_THAT(trie.Insert("bed", &value), IsOk());
+ ASSERT_THAT(trie.Insert("foo", &value), IsOk());
+
+ // Deletes a key
+ ASSERT_TRUE(trie.Delete("bed"));
+
+ // Iterates through all keys
+ IcingDynamicTrie::Iterator iterator_all(trie, "");
+ std::vector<std::string> results;
+ for (; iterator_all.IsValid(); iterator_all.Advance()) {
+ results.emplace_back(iterator_all.GetKey());
+ }
+ EXPECT_THAT(results, ElementsAre("bar", "foo"));
+
+ // Iterates through keys that start with "b"
+ IcingDynamicTrie::Iterator iterator_b(trie, "b");
+ results.clear();
+ for (; iterator_b.IsValid(); iterator_b.Advance()) {
+ results.emplace_back(iterator_b.GetKey());
+ }
+ EXPECT_THAT(results, ElementsAre("bar"));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletingNonExistingKeyShouldReturnTrue) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts some keys.
+ uint32_t value = 1;
+ ASSERT_THAT(trie.Insert("bar", &value), IsOk());
+ ASSERT_THAT(trie.Insert("bed", &value), IsOk());
+
+ // "ba" and bedroom are not keys in the trie.
+ EXPECT_TRUE(trie.Delete("ba"));
+ EXPECT_TRUE(trie.Delete("bedroom"));
+
+ // The original keys are not affected.
+ EXPECT_TRUE(trie.Find("bar", &value));
+ EXPECT_TRUE(trie.Find("bed", &value));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionResortsFullNextArray) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ uint32_t value = 1;
+ // 'f' -> [ 'a', 'j', 'o', 'u' ]
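+ // Four children exactly fill the next array, so deleting one forces the
+ // remaining entries to be re-sorted in place (hence the test name).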
+ ASSERT_THAT(trie.Insert("foul", &value), IsOk());
+ ASSERT_THAT(trie.Insert("far", &value), IsOk());
+ ASSERT_THAT(trie.Insert("fudge", &value), IsOk());
+ ASSERT_THAT(trie.Insert("fjord", &value), IsOk());
+
+ // Delete the third child
+ EXPECT_TRUE(trie.Delete("foul"));
+
+ std::vector<std::string> remaining;
+ for (IcingDynamicTrie::Iterator term_iter(trie, /*prefix=*/"");
+ term_iter.IsValid(); term_iter.Advance()) {
+ remaining.push_back(term_iter.GetKey());
+ }
+ EXPECT_THAT(remaining, ElementsAre("far", "fjord", "fudge"));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionResortsPartiallyFilledNextArray) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ uint32_t value = 1;
+ // 'f' -> [ 'a', 'o', 'u', 0xFF ]
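+ // Only three of the four next-array slots are used; 0xFF marks the unused
+ // slot, so the array is only partially filled when "foul" is deleted.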
+ ASSERT_THAT(trie.Insert("foul", &value), IsOk());
+ ASSERT_THAT(trie.Insert("far", &value), IsOk());
+ ASSERT_THAT(trie.Insert("fudge", &value), IsOk());
+
+ // Delete the second child
+ EXPECT_TRUE(trie.Delete("foul"));
+
+ std::vector<std::string> remaining;
+ for (IcingDynamicTrie::Iterator term_iter(trie, /*prefix=*/"");
+ term_iter.IsValid(); term_iter.Advance()) {
+ remaining.push_back(term_iter.GetKey());
+ }
+ EXPECT_THAT(remaining, ElementsAre("far", "fudge"));
+}
+
+TEST_F(IcingDynamicTrieTest, DeletionLoadTest) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ std::default_random_engine random;
+ ICING_LOG(ERROR) << "Seed: " << std::default_random_engine::default_seed;
+ std::vector<std::string> terms;
+ uint32_t value;
+ // Randomly generate 2048 terms.
+ for (int i = 0; i < 2048; ++i) {
+ terms.push_back(RandomString("abcdefg", 5, &random));
+ ASSERT_THAT(trie.Insert(terms.back().c_str(), &value), IsOk());
+ }
+
+ // Randomly delete 1024 terms.
+ std::unordered_set<std::string> exp_remaining(terms.begin(), terms.end());
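+ // Random 5-char terms may repeat; keeping the expected terms in a set
+ // mirrors the trie's set-like key semantics, so duplicates stay consistent
+ // across inserts and deletes.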
+ std::shuffle(terms.begin(), terms.end(), random);
+ for (int i = 0; i < 1024; ++i) {
+ exp_remaining.erase(terms[i]);
+ ASSERT_TRUE(trie.Delete(terms[i].c_str()));
+ }
+
+ // Check that the iterator still works, and the remaining terms are correct.
+ std::unordered_set<std::string> remaining;
+ for (IcingDynamicTrie::Iterator term_iter(trie, /*prefix=*/"");
+ term_iter.IsValid(); term_iter.Advance()) {
+ remaining.insert(term_iter.GetKey());
+ }
+ EXPECT_THAT(remaining, ContainerEq(exp_remaining));
+
+ // Check that we can still insert terms after delete.
+ for (int i = 0; i < 2048; ++i) {
+ std::string term = RandomString("abcdefg", 5, &random);
+ ASSERT_THAT(trie.Insert(term.c_str(), &value), IsOk());
+ exp_remaining.insert(term);
+ }
+ remaining.clear();
+ for (IcingDynamicTrie::Iterator term_iter(trie, /*prefix=*/"");
+ term_iter.IsValid(); term_iter.Advance()) {
+ remaining.insert(term_iter.GetKey());
+ }
+ EXPECT_THAT(remaining, ContainerEq(exp_remaining));
+}
+
+} // namespace
+
+// The tests below access private methods and fields of IcingDynamicTrie, so
+// they can't be in the anonymous namespace.
+
+TEST_F(IcingDynamicTrieTest, TrieShouldRespectLimits) {
+ // Test limits on numbers of nodes, nexts, and suffixes size.
+ IcingFilesystem filesystem;
+
+ // These three limits are just large enough to insert all of the test
+ // words except the last one.
+ uint32_t num_nodes_enough;
+ uint32_t num_nexts_enough;
+ uint32_t suffixes_size_enough;
+
+ // First, try to fill the 3 numbers above.
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Remove());
+ // Creates a trie with ample limits on nodes, nexts, and suffix file size.
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options(
+ /*max_nodes_in=*/1000, /*max_nexts_in=*/1000,
+ /*max_suffixes_size_in=*/1000, sizeof(uint32_t))));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts all the test words before the last one.
+ uint32_t value = 0;
+ for (size_t i = 0; i < kCommonEnglishWordArrayLen - 1; ++i) {
+ ASSERT_THAT(trie.Insert(kCommonEnglishWords[i].data(), &value), IsOk());
+ }
+
+ IcingDynamicTrieHeader header;
+ trie.GetHeader(&header);
+
+ // Before each insertion, the trie requires (2 + 1 + key_length) free
+ // nodes, so inserting the last word needs 8 nodes. Adding 7 leaves just
+ // enough room to insert every word except the last one.
+ num_nodes_enough = header.num_nodes() + 7;
+
+ // Before each insertion, the trie requires (2 + 1 + key_length +
+ // kMaxNextArraySize) free nexts, so inserting the last word needs
+ // (8 + kMaxNextArraySize) nexts. Adding (7 + kMaxNextArraySize) leaves
+ // just enough room to insert every word except the last one.
+ num_nexts_enough =
+ header.num_nexts() + 7 + IcingDynamicTrie::kMaxNextArraySize;
+
+ // Before each insertion, the trie requires (1 + key_length + value_size)
+ // free suffix bytes, so inserting the last word needs
+ // (6 + sizeof(uint32_t)) bytes. Adding (5 + sizeof(uint32_t)) leaves just
+ // enough room to insert every word except the last one.
+ suffixes_size_enough = header.suffixes_size() + 5 + sizeof(uint32_t);
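+ // For the last word ("three", key_length 5) the formulas above work out
+ // to 8 nodes, (8 + kMaxNextArraySize) nexts, and 10 suffix bytes.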
+ }
+
+ // Test a trie with just enough number of nodes.
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Remove());
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options(
+ num_nodes_enough, /*max_nexts_in=*/1000,
+ /*max_suffixes_size_in=*/1000, sizeof(uint32_t))));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts all the test words before the last one.
+ uint32_t value = 0;
+ for (size_t i = 0; i < kCommonEnglishWordArrayLen - 1; ++i) {
+ ASSERT_THAT(trie.Insert(kCommonEnglishWords[i].data(), &value), IsOk());
+ }
+
+ // Fails to insert the last word because there are not enough nodes left.
+ EXPECT_THAT(
+ trie.Insert(kCommonEnglishWords[kCommonEnglishWordArrayLen - 1].data(),
+ &value),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ }
+
+ // Test a trie with just enough number of nexts.
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Remove());
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options(
+ /*max_nodes_in=*/1000, num_nexts_enough,
+ /*max_suffixes_size_in=*/1000, sizeof(uint32_t))));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts all the test words before the last one.
+ uint32_t value = 0;
+ for (size_t i = 0; i < kCommonEnglishWordArrayLen - 1; ++i) {
+ ASSERT_THAT(trie.Insert(kCommonEnglishWords[i].data(), &value), IsOk());
+ }
+
+ // Fails to insert the last word because there are not enough nexts left.
+ EXPECT_THAT(
+ trie.Insert(kCommonEnglishWords[kCommonEnglishWordArrayLen - 1].data(),
+ &value),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ }
+
+ // Test a trie with just enough suffixes size.
+ {
+ IcingDynamicTrie trie(trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions(), &filesystem);
+ ASSERT_TRUE(trie.Remove());
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options(
+ /*max_nodes_in=*/1000, /*max_nexts_in=*/1000, suffixes_size_enough,
+ sizeof(uint32_t))));
+ ASSERT_TRUE(trie.Init());
+
+ // Inserts all the test words before the last one.
+ uint32_t value = 0;
+ for (size_t i = 0; i < kCommonEnglishWordArrayLen - 1; ++i) {
+ ASSERT_THAT(trie.Insert(kCommonEnglishWords[i].data(), &value), IsOk());
+ }
+
+ // Fails to insert the last word because there is not enough suffix space
+ // left.
+ EXPECT_THAT(
+ trie.Insert(kCommonEnglishWords[kCommonEnglishWordArrayLen - 1].data(),
+ &value),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+ }
+}
+
+TEST_F(IcingDynamicTrieTest, SyncErrorRecovery) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ static const uint32_t kNumKeys = 5000;
+ AddToTrie(&trie, kNumKeys);
+ CheckTrie(trie, kNumKeys);
+
+ trie.Sync();
+ trie.Close();
+
+ // Reach into the header and corrupt its value_size so that Init() fails.
+ ASSERT_TRUE(trie.Init());
+ IcingDynamicTrieHeader hdr;
+ trie.GetHeader(&hdr);
+ hdr.set_value_size(hdr.value_size() + 123);
+ trie.SetHeader(hdr);
+ trie.Close();
+
+ ASSERT_FALSE(trie.Init());
+}
+
+TEST_F(IcingDynamicTrieTest, BitmapsClosedWhenInitFails) {
+ // Create trie with one property.
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(
+ trie_files_prefix_,
+ IcingDynamicTrie::RuntimeOptions().set_storage_policy(
+ IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+ ASSERT_TRUE(trie.deleted_bitmap_);
+ trie.SetProperty(0, 0);
+ ASSERT_EQ(1, trie.property_bitmaps_.size());
+ ASSERT_TRUE(trie.property_bitmaps_[0]);
+ trie.Close();
+
+ // Intentionally corrupt deleted_bitmap file to make Init() fail.
+ FILE* fp = fopen(trie.deleted_bitmap_filename_.c_str(), "r+");
+ ASSERT_TRUE(fp);
+ ASSERT_EQ(16, fwrite("################", 1, 16, fp));
+ fclose(fp);
+ ASSERT_FALSE(trie.Init());
+
+ // Check that both the bitmap and the property files have been closed.
+ ASSERT_FALSE(trie.deleted_bitmap_);
+ ASSERT_EQ(0, trie.property_bitmaps_.size());
+}
+
+TEST_F(IcingDynamicTrieTest, IsBranchingTermShouldWorkForExistingTerms) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ uint32_t value = 1;
+
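+ // Judging from the expectations below, IsBranchingTerm(t) holds iff at
+ // least two keys strictly extend t and diverge immediately after it; a
+ // key equal to t itself does not count as a branch.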
+ ASSERT_THAT(trie.Insert("", &value), IsOk());
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+
+ ASSERT_THAT(trie.Insert("ab", &value), IsOk());
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+
+ ASSERT_THAT(trie.Insert("ac", &value), IsOk());
+ // "" is a prefix of "ab" and "ac", but it is not a branching term.
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ac"));
+
+ ASSERT_THAT(trie.Insert("ba", &value), IsOk());
+ // "" now branches to "ba"
+ EXPECT_TRUE(trie.IsBranchingTerm(""));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ac"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ba"));
+
+ ASSERT_THAT(trie.Insert("a", &value), IsOk());
+ EXPECT_TRUE(trie.IsBranchingTerm(""));
+ // "a" branches to "ab" and "ac"
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ac"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ba"));
+
+ ASSERT_THAT(trie.Insert("abc", &value), IsOk());
+ ASSERT_THAT(trie.Insert("acd", &value), IsOk());
+ EXPECT_TRUE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ // "ab" is a prefix of "abc", but it is not a branching term.
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ // "ac" is a prefix of "acd", but it is not a branching term.
+ EXPECT_FALSE(trie.IsBranchingTerm("ac"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ba"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
+ EXPECT_FALSE(trie.IsBranchingTerm("acd"));
+
+ ASSERT_THAT(trie.Insert("abcd", &value), IsOk());
+ EXPECT_TRUE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ // "ab" is a prefix of "abc" and "abcd", but it is not a branching term.
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ac"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ba"));
+ // "abc" is a prefix of "abcd", but it is not a branching term.
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
+ EXPECT_FALSE(trie.IsBranchingTerm("acd"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abcd"));
+
+ ASSERT_THAT(trie.Insert("abd", &value), IsOk());
+ EXPECT_TRUE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ // "ab" branches to "abc" and "abd"
+ EXPECT_TRUE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ac"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ba"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
+ EXPECT_FALSE(trie.IsBranchingTerm("acd"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abcd"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abd"));
+}
+
+TEST_F(IcingDynamicTrieTest, IsBranchingTermShouldWorkForNonExistingTerms) {
+ IcingFilesystem filesystem;
+ IcingDynamicTrie trie(trie_files_prefix_, IcingDynamicTrie::RuntimeOptions(),
+ &filesystem);
+ ASSERT_TRUE(trie.CreateIfNotExist(IcingDynamicTrie::Options()));
+ ASSERT_TRUE(trie.Init());
+
+ uint32_t value = 1;
+
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_FALSE(trie.IsBranchingTerm("a"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
+
+ ASSERT_THAT(trie.Insert("aa", &value), IsOk());
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_FALSE(trie.IsBranchingTerm("a"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
+
+ ASSERT_THAT(trie.Insert("ac", &value), IsOk());
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ // "a" does not exist in the trie, but now it branches to "aa" and "ac".
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
+
+ ASSERT_THAT(trie.Insert("ad", &value), IsOk());
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
+
+ ASSERT_THAT(trie.Insert("abcd", &value), IsOk());
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ EXPECT_FALSE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
+
+ ASSERT_THAT(trie.Insert("abd", &value), IsOk());
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ // "ab" does not exist in the trie, but now it branches to "abcd" and "abd".
+ EXPECT_TRUE(trie.IsBranchingTerm("ab"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc"));
+
+ ASSERT_THAT(trie.Insert("abce", &value), IsOk());
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ EXPECT_TRUE(trie.IsBranchingTerm("ab"));
+ // "abc" does not exist in the trie, but now it branches to "abcd" and "abce".
+ EXPECT_TRUE(trie.IsBranchingTerm("abc"));
+
+ ASSERT_THAT(trie.Insert("abc_suffix", &value), IsOk());
+ EXPECT_FALSE(trie.IsBranchingTerm(""));
+ EXPECT_TRUE(trie.IsBranchingTerm("a"));
+ EXPECT_TRUE(trie.IsBranchingTerm("ab"));
+ EXPECT_TRUE(trie.IsBranchingTerm("abc"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc_s"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc_su"));
+ EXPECT_FALSE(trie.IsBranchingTerm("abc_suffi"));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/legacy/index/icing-filesystem.cc b/icing/legacy/index/icing-filesystem.cc
index 90e9146..fbf5a27 100644
--- a/icing/legacy/index/icing-filesystem.cc
+++ b/icing/legacy/index/icing-filesystem.cc
@@ -16,7 +16,6 @@
#include <dirent.h>
#include <dlfcn.h>
-#include <errno.h>
#include <fcntl.h>
#include <fnmatch.h>
#include <pthread.h>
@@ -27,6 +26,7 @@
#include <unistd.h>
#include <algorithm>
+#include <cerrno>
#include <unordered_set>
#include "icing/absl_ports/str_cat.h"
@@ -65,18 +65,15 @@ void LogOpenFileDescriptors() {
constexpr int kMaxFileDescriptorsToStat = 4096;
struct rlimit rlim = {0, 0};
if (getrlimit(RLIMIT_NOFILE, &rlim) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "getrlimit() failed (errno=%d)", errno);
+ ICING_LOG(ERROR) << "getrlimit() failed (errno=" << errno << ")";
return;
}
int fd_lim = rlim.rlim_cur;
if (fd_lim > kMaxFileDescriptorsToStat) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Maximum number of file descriptors (%d) too large.", fd_lim);
+ ICING_LOG(ERROR) << "Maximum number of file descriptors (" << fd_lim << ") too large.";
fd_lim = kMaxFileDescriptorsToStat;
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Listing up to %d file descriptors.", fd_lim);
+ ICING_LOG(ERROR) << "Listing up to " << fd_lim << " file descriptors.";
// Verify that /proc/self/fd is a directory. If not, procfs is not mounted or
// inaccessible for some other reason. In that case, there's no point trying
@@ -98,15 +95,12 @@ void LogOpenFileDescriptors() {
if (len >= 0) {
// Zero-terminate the buffer, because readlink() won't.
target[len < target_size ? len : target_size - 1] = '\0';
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> \"%s\"", fd,
- target);
+ ICING_LOG(ERROR) << "fd " << fd << " -> \"" << target << "\"";
} else if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("fd %d -> ? (errno=%d)",
- fd, errno);
+ ICING_LOG(ERROR) << "fd " << fd << " -> ? (errno=" << errno << ")";
}
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "File descriptor list complete.");
+ ICING_LOG(ERROR) << "File descriptor list complete.";
}
// Logs an error formatted as: desc1 + file_name + desc2 + strerror(errnum).
@@ -115,8 +109,7 @@ void LogOpenFileDescriptors() {
// file descriptors (see LogOpenFileDescriptors() above).
void LogOpenError(const char *desc1, const char *file_name, const char *desc2,
int errnum) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "%s%s%s%s", desc1, file_name, desc2, strerror(errnum));
+ ICING_LOG(ERROR) << desc1 << file_name << desc2 << strerror(errnum);
if (errnum == EMFILE) {
LogOpenFileDescriptors();
}
@@ -157,8 +150,7 @@ bool ListDirectoryInternal(const char *dir_name,
}
}
if (closedir(dir) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Error closing %s: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Error closing " << dir_name << ": " << strerror(errno);
}
return true;
}
@@ -181,12 +173,11 @@ void IcingScopedFd::reset(int fd) {
const uint64_t IcingFilesystem::kBadFileSize;
bool IcingFilesystem::DeleteFile(const char *file_name) const {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf("Deleting file %s", file_name);
+ ICING_VLOG(1) << "Deleting file " << file_name;
int ret = unlink(file_name);
bool success = (ret == 0) || (errno == ENOENT);
if (!success) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Deleting file %s failed: %s", file_name, strerror(errno));
+ ICING_LOG(ERROR) << "Deleting file " << file_name << " failed: " << strerror(errno);
}
return success;
}
@@ -195,8 +186,7 @@ bool IcingFilesystem::DeleteDirectory(const char *dir_name) const {
int ret = rmdir(dir_name);
bool success = (ret == 0) || (errno == ENOENT);
if (!success) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Deleting directory %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Deleting directory " << dir_name << " failed: " << strerror(errno);
}
return success;
}
@@ -208,8 +198,7 @@ bool IcingFilesystem::DeleteDirectoryRecursively(const char *dir_name) const {
if (errno == ENOENT) {
return true; // If directory didn't exist, this was successful.
}
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Stat %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Stat " << dir_name << " failed: " << strerror(errno);
return false;
}
vector<std::string> entries;
@@ -222,8 +211,7 @@ bool IcingFilesystem::DeleteDirectoryRecursively(const char *dir_name) const {
++i) {
std::string filename = std::string(dir_name) + '/' + *i;
if (stat(filename.c_str(), &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Stat %s failed: %s", filename.c_str(), strerror(errno));
+ ICING_LOG(ERROR) << "Stat " << filename << " failed: " << strerror(errno);
success = false;
} else if (S_ISDIR(st.st_mode)) {
success = DeleteDirectoryRecursively(filename.c_str()) && success;
@@ -246,8 +234,7 @@ bool IcingFilesystem::FileExists(const char *file_name) const {
exists = S_ISREG(st.st_mode) != 0;
} else {
if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", file_name, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file " << file_name << ": " << strerror(errno);
}
exists = false;
}
@@ -261,8 +248,7 @@ bool IcingFilesystem::DirectoryExists(const char *dir_name) const {
exists = S_ISDIR(st.st_mode) != 0;
} else {
if (errno != ENOENT) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to stat directory %s: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat directory " << dir_name << ": " << strerror(errno);
}
exists = false;
}
@@ -317,8 +303,7 @@ bool IcingFilesystem::GetMatchingFiles(const char *glob,
int basename_idx = GetBasenameIndex(glob);
if (basename_idx == 0) {
// We need a directory.
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Expected directory, no matching files for: %s", glob);
+ ICING_VLOG(1) << "Expected directory, no matching files for: " << glob;
return true;
}
const char *basename_glob = glob + basename_idx;
@@ -374,8 +359,7 @@ uint64_t IcingFilesystem::GetFileSize(int fd) const {
struct stat st;
uint64_t size = kBadFileSize;
if (fstat(fd, &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file: " << strerror(errno);
} else {
size = st.st_size;
}
@@ -386,8 +370,7 @@ uint64_t IcingFilesystem::GetFileSize(const char *filename) const {
struct stat st;
uint64_t size = kBadFileSize;
if (stat(filename, &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to stat file %s: %s", filename, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file " << filename << ": " << strerror(errno);
} else {
size = st.st_size;
}
@@ -399,8 +382,7 @@ bool IcingFilesystem::Truncate(int fd, uint64_t new_size) const {
if (ret == 0) {
lseek(fd, new_size, SEEK_SET);
} else {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to truncate file: %s", strerror(errno));
+ ICING_LOG(ERROR) << "Unable to truncate file: " << strerror(errno);
}
return (ret == 0);
}
@@ -418,8 +400,7 @@ bool IcingFilesystem::Truncate(const char *filename, uint64_t new_size) const {
bool IcingFilesystem::Grow(int fd, uint64_t new_size) const {
int ret = ftruncate(fd, new_size);
if (ret != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to grow file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to grow file: " << strerror(errno);
}
return (ret == 0);
}
@@ -431,8 +412,7 @@ bool IcingFilesystem::Write(int fd, const void *data, size_t data_size) const {
size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
ssize_t wrote = write(fd, data, chunk_size);
if (wrote < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad write: " << strerror(errno);
return false;
}
data = static_cast<const uint8_t *>(data) + wrote;
@@ -449,8 +429,7 @@ bool IcingFilesystem::PWrite(int fd, off_t offset, const void *data,
size_t chunk_size = std::min<size_t>(write_len, 64u * 1024);
ssize_t wrote = pwrite(fd, data, chunk_size, offset);
if (wrote < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Bad write: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Bad write: " << strerror(errno);
return false;
}
data = static_cast<const uint8_t *>(data) + wrote;
@@ -468,8 +447,7 @@ bool IcingFilesystem::DataSync(int fd) const {
#endif
if (result < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to sync data: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to sync data: " << strerror(errno);
return false;
}
return true;
@@ -478,9 +456,7 @@ bool IcingFilesystem::DataSync(int fd) const {
bool IcingFilesystem::RenameFile(const char *old_name,
const char *new_name) const {
if (rename(old_name, new_name) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Unable to rename file %s to %s: %s", old_name, new_name,
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to rename file " << old_name << " to " << new_name << ": " << strerror(errno);
return false;
}
return true;
@@ -518,8 +494,7 @@ bool IcingFilesystem::CreateDirectory(const char *dir_name) const {
if (mkdir(dir_name, S_IRUSR | S_IWUSR | S_IXUSR) == 0) {
success = true;
} else {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Creating directory %s failed: %s", dir_name, strerror(errno));
+ ICING_LOG(ERROR) << "Creating directory " << dir_name << " failed: " << strerror(errno);
}
}
return success;
@@ -561,8 +536,7 @@ end:
if (src_fd > 0) close(src_fd);
if (dst_fd > 0) close(dst_fd);
if (!success) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Couldn't copy file %s to %s", src, dst);
+ ICING_LOG(ERROR) << "Couldn't copy file " << src << " to " << dst;
}
return success;
}
@@ -583,8 +557,7 @@ bool IcingFilesystem::ComputeChecksum(int fd, uint32_t *checksum,
uint64_t IcingFilesystem::GetDiskUsage(int fd) const {
struct stat st;
if (fstat(fd, &st) < 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat file: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat file: " << strerror(errno);
return kBadFileSize;
}
return st.st_blocks * kStatBlockSize;
@@ -593,8 +566,7 @@ uint64_t IcingFilesystem::GetDiskUsage(int fd) const {
uint64_t IcingFilesystem::GetFileDiskUsage(const char *path) const {
struct stat st;
if (stat(path, &st) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
- path, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat " << path << ": " << strerror(errno);
return kBadFileSize;
}
return st.st_blocks * kStatBlockSize;
@@ -603,8 +575,7 @@ uint64_t IcingFilesystem::GetFileDiskUsage(const char *path) const {
uint64_t IcingFilesystem::GetDiskUsage(const char *path) const {
struct stat st;
if (stat(path, &st) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unable to stat %s: %s",
- path, strerror(errno));
+ ICING_LOG(ERROR) << "Unable to stat " << path << ": " << strerror(errno);
return kBadFileSize;
}
uint64_t result = st.st_blocks * kStatBlockSize;
diff --git a/icing/legacy/index/icing-filesystem.h b/icing/legacy/index/icing-filesystem.h
index 2b10c1c..ce75a82 100644
--- a/icing/legacy/index/icing-filesystem.h
+++ b/icing/legacy/index/icing-filesystem.h
@@ -17,13 +17,15 @@
#ifndef ICING_LEGACY_INDEX_ICING_FILESYSTEM_H_
#define ICING_LEGACY_INDEX_ICING_FILESYSTEM_H_
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
+#include <sys/types.h>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
#include <memory>
#include <string>
#include <unordered_set>
+#include <utility>
#include <vector>
namespace icing {
@@ -222,6 +224,11 @@ class IcingFilesystem {
// Increments to_increment by size if size is valid, or sets to_increment
// to kBadFileSize if either size or to_increment is kBadFileSize.
static void IncrementByOrSetInvalid(uint64_t size, uint64_t *to_increment);
+
+ // Returns -1 if file_size is invalid. Otherwise, returns file_size.
+ static int64_t SanitizeFileSize(int64_t file_size) {
+ return (file_size != kBadFileSize) ? file_size : -1;
+ }
};
} // namespace lib
diff --git a/icing/legacy/index/icing-flash-bitmap.cc b/icing/legacy/index/icing-flash-bitmap.cc
index 56dec00..774308f 100644
--- a/icing/legacy/index/icing-flash-bitmap.cc
+++ b/icing/legacy/index/icing-flash-bitmap.cc
@@ -73,8 +73,7 @@ class IcingFlashBitmap::Accessor {
bool IcingFlashBitmap::Verify() const {
if (!is_initialized()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Can't verify unopened flash bitmap %s", filename_.c_str());
+ ICING_LOG(ERROR) << "Can't verify unopened flash bitmap " << filename_;
return false;
}
if (mmapper_ == nullptr) {
@@ -83,26 +82,21 @@ bool IcingFlashBitmap::Verify() const {
}
Accessor accessor(mmapper_.get());
if (accessor.header()->magic != kMagic) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flash bitmap %s has incorrect magic header", filename_.c_str());
+ ICING_LOG(ERROR) << "Flash bitmap " << filename_ << " has incorrect magic header";
return false;
}
if (accessor.header()->version != kCurVersion) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flash bitmap %s has incorrect version", filename_.c_str());
+ ICING_LOG(ERROR) << "Flash bitmap " << filename_ << " has incorrect version";
return false;
}
if (accessor.header()->dirty) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flash bitmap %s is dirty", filename_.c_str());
+ ICING_LOG(ERROR) << "Flash bitmap " << filename_ << " is dirty";
return false;
}
uint32_t crc =
IcingStringUtil::UpdateCrc32(0, accessor.data(), accessor.data_size());
if (accessor.header()->crc != crc) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Flash bitmap %s has incorrect CRC32 %u %u", filename_.c_str(),
- accessor.header()->crc, crc);
+ ICING_LOG(ERROR) << "Flash bitmap " << filename_ << " has incorrect CRC32 " << accessor.header()->crc << " " << crc;
return false;
}
return true;
@@ -265,17 +259,14 @@ uint32_t IcingFlashBitmap::UpdateCrc() const {
bool IcingFlashBitmap::Grow(size_t new_file_size) {
IcingScopedFd fd(filesystem_->OpenForWrite(filename_.c_str()));
if (!filesystem_->Grow(fd.get(), new_file_size)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Grow %s to new size %zu failed", filename_.c_str(), new_file_size);
+ ICING_LOG(ERROR) << "Grow " << filename_ << " to new size " << new_file_size << " failed";
return false;
}
if (!mmapper_->Remap(fd.get(), 0, new_file_size)) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Remap of %s after grow failed", filename_.c_str());
+ ICING_LOG(ERROR) << "Remap of " << filename_ << " after grow failed";
return false;
}
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Grew %s new size %zu", filename_.c_str(), new_file_size);
+ ICING_VLOG(1) << "Grew " << filename_ << " new size " << new_file_size;
Accessor accessor(mmapper_.get());
accessor.header()->dirty = true;
return true;
diff --git a/icing/legacy/index/icing-flash-bitmap.h b/icing/legacy/index/icing-flash-bitmap.h
index 3b3521a..6bb9591 100644
--- a/icing/legacy/index/icing-flash-bitmap.h
+++ b/icing/legacy/index/icing-flash-bitmap.h
@@ -37,8 +37,7 @@
#ifndef ICING_LEGACY_INDEX_ICING_FLASH_BITMAP_H_
#define ICING_LEGACY_INDEX_ICING_FLASH_BITMAP_H_
-#include <stdint.h>
-
+#include <cstdint>
#include <memory>
#include <string>
@@ -139,6 +138,7 @@ class IcingFlashBitmap {
// Upgrade for version 18.
bool UpgradeTo18();
+ // Legacy filesystem. Switch to the new Filesystem class instead.
const IcingFilesystem *const filesystem_;
std::string filename_;
OpenType open_type_;
diff --git a/icing/legacy/index/icing-mmapper.cc b/icing/legacy/index/icing-mmapper.cc
index 737335c..d086da2 100644
--- a/icing/legacy/index/icing-mmapper.cc
+++ b/icing/legacy/index/icing-mmapper.cc
@@ -17,10 +17,11 @@
//
#include "icing/legacy/index/icing-mmapper.h"
-#include <errno.h>
-#include <string.h>
#include <sys/mman.h>
+#include <cerrno>
+#include <cstring>
+
#include "icing/legacy/core/icing-string-util.h"
#include "icing/legacy/index/icing-filesystem.h"
#include "icing/util/logging.h"
@@ -66,8 +67,7 @@ void IcingMMapper::DoMapping(int fd, uint64_t location, size_t size) {
address_ = reinterpret_cast<uint8_t *>(mmap_result_) + alignment_adjustment;
} else {
const char *errstr = strerror(errno);
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf(
- "Could not mmap file for reading: %s", errstr);
+ ICING_LOG(ERROR) << "Could not mmap file for reading: " << errstr;
mmap_result_ = nullptr;
}
}
@@ -94,8 +94,7 @@ IcingMMapper::~IcingMMapper() { Unmap(); }
bool IcingMMapper::Sync() {
if (is_valid() && !read_only_) {
if (msync(mmap_result_, mmap_len_, MS_SYNC) != 0) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("msync failed: %s",
- strerror(errno));
+ ICING_LOG(ERROR) << "msync failed: " << strerror(errno);
return false;
}
}
diff --git a/icing/legacy/index/icing-mmapper.h b/icing/legacy/index/icing-mmapper.h
index bf62aa5..d054c11 100644
--- a/icing/legacy/index/icing-mmapper.h
+++ b/icing/legacy/index/icing-mmapper.h
@@ -22,9 +22,11 @@
#ifndef ICING_LEGACY_INDEX_ICING_MMAPPER_H_
#define ICING_LEGACY_INDEX_ICING_MMAPPER_H_
-#include <stdint.h>
#include <unistd.h>
+#include <cstddef>
+#include <cstdint>
+
namespace icing {
namespace lib {
diff --git a/icing/legacy/index/icing-mock-filesystem.h b/icing/legacy/index/icing-mock-filesystem.h
index 31e012a..122ee7b 100644
--- a/icing/legacy/index/icing-mock-filesystem.h
+++ b/icing/legacy/index/icing-mock-filesystem.h
@@ -15,81 +15,230 @@
#ifndef ICING_LEGACY_INDEX_ICING_MOCK_FILESYSTEM_H_
#define ICING_LEGACY_INDEX_ICING_MOCK_FILESYSTEM_H_
-#include <stdint.h>
-#include <stdio.h>
-#include <string.h>
-
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
#include <memory>
#include <string>
#include <vector>
-#include "icing/legacy/index/icing-filesystem.h"
#include "gmock/gmock.h"
+#include "icing/legacy/index/icing-filesystem.h"
namespace icing {
namespace lib {
+using ::testing::_;
+using ::testing::A;
class IcingMockFilesystem : public IcingFilesystem {
public:
- MOCK_CONST_METHOD1(DeleteFile, bool(const char *file_name));
+ IcingMockFilesystem() {
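+ // By default, delegate every call to a real IcingFilesystem so that tests
+ // only need to override the specific methods they want to fake.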
+ ON_CALL(*this, DeleteFile).WillByDefault([this](const char *file_name) {
+ return real_icing_filesystem_.DeleteFile(file_name);
+ });
+
+ ON_CALL(*this, DeleteDirectory).WillByDefault([this](const char *dir_name) {
+ return real_icing_filesystem_.DeleteDirectory(dir_name);
+ });
+
+ ON_CALL(*this, DeleteDirectoryRecursively)
+ .WillByDefault([this](const char *dir_name) {
+ return real_icing_filesystem_.DeleteDirectoryRecursively(dir_name);
+ });
+
+ ON_CALL(*this, FileExists).WillByDefault([this](const char *file_name) {
+ return real_icing_filesystem_.FileExists(file_name);
+ });
+
+ ON_CALL(*this, DirectoryExists).WillByDefault([this](const char *dir_name) {
+ return real_icing_filesystem_.DirectoryExists(dir_name);
+ });
+
+ ON_CALL(*this, GetBasenameIndex)
+ .WillByDefault([this](const char *file_name) {
+ return real_icing_filesystem_.GetBasenameIndex(file_name);
+ });
+
+ ON_CALL(*this, GetBasename).WillByDefault([this](const char *file_name) {
+ return real_icing_filesystem_.GetBasename(file_name);
+ });
+
+ ON_CALL(*this, GetDirname).WillByDefault([this](const char *file_name) {
+ return real_icing_filesystem_.GetDirname(file_name);
+ });
+
+ ON_CALL(*this, ListDirectory)
+ .WillByDefault(
+ [this](const char *dir_name, std::vector<std::string> *entries) {
+ return real_icing_filesystem_.ListDirectory(dir_name, entries);
+ });
+
+ ON_CALL(*this, GetMatchingFiles)
+ .WillByDefault(
+ [this](const char *glob, std::vector<std::string> *matches) {
+ return real_icing_filesystem_.GetMatchingFiles(glob, matches);
+ });
+
+ ON_CALL(*this, OpenForWrite).WillByDefault([this](const char *file_name) {
+ return real_icing_filesystem_.OpenForWrite(file_name);
+ });
+
+ ON_CALL(*this, OpenForAppend).WillByDefault([this](const char *file_name) {
+ return real_icing_filesystem_.OpenForAppend(file_name);
+ });
+
+ ON_CALL(*this, OpenForRead).WillByDefault([this](const char *file_name) {
+ return real_icing_filesystem_.OpenForRead(file_name);
+ });
+
+ ON_CALL(*this, GetFileSize(A<int>())).WillByDefault([this](int fd) {
+ return real_icing_filesystem_.GetFileSize(fd);
+ });
+
+ ON_CALL(*this, GetFileSize(A<const char *>()))
+ .WillByDefault([this](const char *filename) {
+ return real_icing_filesystem_.GetFileSize(filename);
+ });
+
+ ON_CALL(*this, Truncate(A<int>(), _))
+ .WillByDefault([this](int fd, uint64_t new_size) {
+ return real_icing_filesystem_.Truncate(fd, new_size);
+ });
+
+ ON_CALL(*this, Truncate(A<const char *>(), _))
+ .WillByDefault([this](const char *filename, uint64_t new_size) {
+ return real_icing_filesystem_.Truncate(filename, new_size);
+ });
+
+ ON_CALL(*this, Grow).WillByDefault([this](int fd, uint64_t new_size) {
+ return real_icing_filesystem_.Grow(fd, new_size);
+ });
+
+ ON_CALL(*this, Write)
+ .WillByDefault([this](int fd, const void *data, size_t data_size) {
+ return real_icing_filesystem_.Write(fd, data, data_size);
+ });
+ ON_CALL(*this, PWrite)
+ .WillByDefault(
+ [this](int fd, off_t offset, const void *data, size_t data_size) {
+ return real_icing_filesystem_.PWrite(fd, offset, data, data_size);
+ });
+
+ ON_CALL(*this, DataSync).WillByDefault([this](int fd) {
+ return real_icing_filesystem_.DataSync(fd);
+ });
+
+ ON_CALL(*this, RenameFile)
+ .WillByDefault([this](const char *old_name, const char *new_name) {
+ return real_icing_filesystem_.RenameFile(old_name, new_name);
+ });
+
+ ON_CALL(*this, SwapFiles)
+ .WillByDefault([this](const char *one, const char *two) {
+ return real_icing_filesystem_.SwapFiles(one, two);
+ });
+
+ ON_CALL(*this, CreateDirectory).WillByDefault([this](const char *dir_name) {
+ return real_icing_filesystem_.CreateDirectory(dir_name);
+ });
+
+ ON_CALL(*this, CreateDirectoryRecursively)
+ .WillByDefault([this](const char *dir_name) {
+ return real_icing_filesystem_.CreateDirectoryRecursively(dir_name);
+ });
+
+ ON_CALL(*this, CopyFile)
+ .WillByDefault([this](const char *src, const char *dst) {
+ return real_icing_filesystem_.CopyFile(src, dst);
+ });
+
+ ON_CALL(*this, ComputeChecksum)
+ .WillByDefault([this](int fd, uint32_t *checksum, uint64_t offset,
+ uint64_t length) {
+ return real_icing_filesystem_.ComputeChecksum(fd, checksum, offset,
+ length);
+ });
+
+ ON_CALL(*this, GetDiskUsage).WillByDefault([this](const char *path) {
+ return real_icing_filesystem_.GetDiskUsage(path);
+ });
+ }
+
+ MOCK_METHOD(bool, DeleteFile, (const char *file_name), (const, override));
+
+ MOCK_METHOD(bool, DeleteDirectory, (const char *dir_name), (const, override));
- MOCK_CONST_METHOD1(DeleteDirectory, bool(const char *dir_name));
+ MOCK_METHOD(bool, DeleteDirectoryRecursively, (const char *dir_name),
+ (const, override));
- MOCK_CONST_METHOD1(DeleteDirectoryRecursively, bool(const char *dir_name));
+ MOCK_METHOD(bool, FileExists, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(FileExists, bool(const char *file_name));
+ MOCK_METHOD(bool, DirectoryExists, (const char *dir_name), (const, override));
- MOCK_CONST_METHOD1(DirectoryExists, bool(const char *dir_name));
+ MOCK_METHOD(int, GetBasenameIndex, (const char *file_name),
+ (const, override));
- MOCK_CONST_METHOD1(GetBasenameIndex, int(const char *file_name));
+ MOCK_METHOD(std::string, GetBasename, (const char *file_name),
+ (const, override));
- MOCK_CONST_METHOD1(GetBasename, std::string(const char *file_name));
+ MOCK_METHOD(std::string, GetDirname, (const char *file_name),
+ (const, override));
- MOCK_CONST_METHOD1(GetDirname, std::string(const char *file_name));
+ MOCK_METHOD(bool, ListDirectory,
+ (const char *dir_name, std::vector<std::string> *entries),
+ (const, override));
- MOCK_CONST_METHOD2(ListDirectory, bool(const char *dir_name,
- std::vector<std::string> *entries));
+ MOCK_METHOD(bool, GetMatchingFiles,
+ (const char *glob, std::vector<std::string> *matches),
+ (const, override));
- MOCK_CONST_METHOD2(GetMatchingFiles,
- bool(const char *glob, std::vector<std::string> *matches));
+ MOCK_METHOD(int, OpenForWrite, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(OpenForWrite, int(const char *file_name));
+ MOCK_METHOD(int, OpenForAppend, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(OpenForAppend, int(const char *file_name));
+ MOCK_METHOD(int, OpenForRead, (const char *file_name), (const, override));
- MOCK_CONST_METHOD1(OpenForRead, int(const char *file_name));
+ MOCK_METHOD(uint64_t, GetFileSize, (int fd), (const, override));
- MOCK_CONST_METHOD1(GetFileSize, uint64_t(int fd));
+ MOCK_METHOD(uint64_t, GetFileSize, (const char *filename), (const, override));
- MOCK_CONST_METHOD1(GetFileSize, uint64_t(const char *filename));
+ MOCK_METHOD(bool, Truncate, (int fd, uint64_t new_size), (const, override));
- MOCK_CONST_METHOD2(Truncate, bool(int fd, uint64_t new_size));
+ MOCK_METHOD(bool, Truncate, (const char *filename, uint64_t new_size),
+ (const, override));
- MOCK_CONST_METHOD2(Truncate, bool(const char *filename, uint64_t new_size));
+ MOCK_METHOD(bool, Grow, (int fd, uint64_t new_size), (const, override));
- MOCK_CONST_METHOD2(Grow, bool(int fd, uint64_t new_size));
+ MOCK_METHOD(bool, Write, (int fd, const void *data, size_t data_size),
+ (const, override));
+ MOCK_METHOD(bool, PWrite,
+ (int fd, off_t offset, const void *data, size_t data_size),
+ (const, override));
- MOCK_CONST_METHOD3(Write, bool(int fd, const void *data, size_t data_size));
- MOCK_CONST_METHOD4(PWrite, bool(int fd, off_t offset, const void *data,
- size_t data_size));
+ MOCK_METHOD(bool, DataSync, (int fd), (const, override));
- MOCK_CONST_METHOD1(DataSync, bool(int fd));
+ MOCK_METHOD(bool, RenameFile, (const char *old_name, const char *new_name),
+ (const, override));
- MOCK_CONST_METHOD2(RenameFile,
- bool(const char *old_name, const char *new_name));
+ MOCK_METHOD(bool, SwapFiles, (const char *one, const char *two),
+ (const, override));
- MOCK_CONST_METHOD2(SwapFiles, bool(const char *one, const char *two));
+ MOCK_METHOD(bool, CreateDirectory, (const char *dir_name), (const, override));
- MOCK_CONST_METHOD1(CreateDirectory, bool(const char *dir_name));
+ MOCK_METHOD(bool, CreateDirectoryRecursively, (const char *dir_name),
+ (const, override));
- MOCK_CONST_METHOD1(CreateDirectoryRecursively, bool(const char *dir_name));
+ MOCK_METHOD(bool, CopyFile, (const char *src, const char *dst),
+ (const, override));
- MOCK_CONST_METHOD2(CopyFile, bool(const char *src, const char *dst));
+ MOCK_METHOD(bool, ComputeChecksum,
+ (int fd, uint32_t *checksum, uint64_t offset, uint64_t length),
+ (const, override));
- MOCK_CONST_METHOD4(ComputeChecksum, bool(int fd, uint32_t *checksum,
- uint64_t offset, uint64_t length));
+ MOCK_METHOD(uint64_t, GetDiskUsage, (const char *path), (const, override));
- MOCK_CONST_METHOD1(GetDiskUsage, uint64_t(const char *path));
+ private:
+ IcingFilesystem real_icing_filesystem_;
};
} // namespace lib
diff --git a/icing/legacy/index/icing-storage-file.cc b/icing/legacy/index/icing-storage-file.cc
index b27ec67..bbc6b81 100644
--- a/icing/legacy/index/icing-storage-file.cc
+++ b/icing/legacy/index/icing-storage-file.cc
@@ -14,9 +14,9 @@
#include "icing/legacy/index/icing-storage-file.h"
-#include <inttypes.h>
#include <unistd.h>
+#include <cinttypes>
#include <string>
#include "icing/legacy/core/icing-compat.h"
@@ -69,22 +69,18 @@ bool IcingStorageFile::Sync() {
IcingTimer timer;
if (!PreSync()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Pre-sync %s failed",
- filename_.c_str());
+ ICING_LOG(ERROR) << "Pre-sync " << filename_ << " failed";
return false;
}
if (!filesystem_->DataSync(fd_.get())) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Sync %s failed",
- filename_.c_str());
+ ICING_LOG(ERROR) << "Sync " << filename_ << " failed";
return false;
}
if (!PostSync()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Post-sync %s failed",
- filename_.c_str());
+ ICING_LOG(ERROR) << "Post-sync " << filename_ << " failed";
return false;
}
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Syncing %s took %.3fms", filename_.c_str(), timer.Elapsed() * 1000.);
+ ICING_VLOG(1) << "Syncing " << filename_ << " took " << timer.Elapsed() * 1000 << "ms";
return true;
}
diff --git a/icing/legacy/index/icing-storage.h b/icing/legacy/index/icing-storage.h
index cc06c54..58b6aa1 100644
--- a/icing/legacy/index/icing-storage.h
+++ b/icing/legacy/index/icing-storage.h
@@ -20,6 +20,7 @@
#ifndef ICING_LEGACY_INDEX_ICING_STORAGE_H_
#define ICING_LEGACY_INDEX_ICING_STORAGE_H_
+#include <cstdint>
#include <string>
namespace icing {
diff --git a/icing/monkey_test/icing-monkey-test-runner.cc b/icing/monkey_test/icing-monkey-test-runner.cc
new file mode 100644
index 0000000..76e41ce
--- /dev/null
+++ b/icing/monkey_test/icing-monkey-test-runner.cc
@@ -0,0 +1,525 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/monkey_test/icing-monkey-test-runner.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <random>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/destructible-directory.h"
+#include "icing/icing-search-engine.h"
+#include "icing/monkey_test/in-memory-icing-search-engine.h"
+#include "icing/monkey_test/monkey-test-generators.h"
+#include "icing/monkey_test/monkey-test-util.h"
+#include "icing/monkey_test/monkey-tokenized-document.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/initialize.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/status.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/result/result-state-manager.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::Eq;
+using ::testing::Le;
+using ::testing::Not;
+using ::testing::SizeIs;
+using ::testing::UnorderedElementsAreArray;
+
+SearchSpecProto GenerateRandomSearchSpecProto(
+ MonkeyTestRandomEngine* random,
+ MonkeyDocumentGenerator* document_generator) {
+ // Get a random token from the language set as a single term query.
+ std::string query(document_generator->GetToken());
+ std::uniform_int_distribution<> dist(0, 1);
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+ if (dist(*random) == 1) {
+ term_match_type = TermMatchType::PREFIX;
+ // Randomly drop a suffix of the query to test prefix queries.
+ std::uniform_int_distribution<> size_dist(1, query.size());
+ query.resize(size_dist(*random));
+ }
+ // 50% chance of getting a section restriction.
+ if (dist(*random) == 1) {
+ const SchemaTypeConfigProto& type_config = document_generator->GetType();
+ if (type_config.properties_size() > 0) {
+ std::uniform_int_distribution<> prop_dist(
+ 0, type_config.properties_size() - 1);
+ query = absl_ports::StrCat(
+ type_config.properties(prop_dist(*random)).property_name(), ":",
+ query);
+ }
+ }
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(term_match_type);
+ search_spec.set_query(query);
+ return search_spec;
+}
+
+ScoringSpecProto GenerateRandomScoringSpec(MonkeyTestRandomEngine* random) {
+ ScoringSpecProto scoring_spec;
+
+ constexpr std::array<ScoringSpecProto::RankingStrategy::Code, 3>
+ ranking_strategies = {
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP,
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE};
+
+ std::uniform_int_distribution<> dist(0, ranking_strategies.size() - 1);
+ scoring_spec.set_rank_by(ranking_strategies[dist(*random)]);
+ return scoring_spec;
+}
+
+ResultSpecProto::SnippetSpecProto GenerateRandomSnippetSpecProto(
+ MonkeyTestRandomEngine* random, const ResultSpecProto& result_spec) {
+ ResultSpecProto::SnippetSpecProto snippet_spec;
+
+ std::uniform_int_distribution<> num_to_snippet_dist(
+ 0, result_spec.num_per_page() * 2);
+ snippet_spec.set_num_to_snippet(num_to_snippet_dist(*random));
+
+ std::uniform_int_distribution<> num_matches_per_property_dist(0, 10);
+ snippet_spec.set_num_matches_per_property(
+ num_matches_per_property_dist(*random));
+
+ std::uniform_int_distribution<> dist(0, 4);
+ int random_num = dist(*random);
+ // 1/5 chance of getting one of 0 (disabled), 8, 32, 128, 512
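+ // (e.g. random_num == 3 yields 1 << 7 = 128).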
+ int max_window_utf32_length =
+ random_num == 0 ? 0 : (1 << (2 * random_num + 1));
+ snippet_spec.set_max_window_utf32_length(max_window_utf32_length);
+ return snippet_spec;
+}
+
+TypePropertyMask GenerateTypePropertyMask(
+ MonkeyTestRandomEngine* random, const SchemaTypeConfigProto& type_config) {
+ TypePropertyMask type_property_mask;
+ type_property_mask.set_schema_type(type_config.schema_type());
+ for (const auto& properties : type_config.properties()) {
+ // 25% chance of adding the current property to the mask.
+ std::uniform_int_distribution<> dist(0, 3);
+ if (dist(*random) == 0) {
+ type_property_mask.add_paths(properties.property_name());
+ }
+ }
+ return type_property_mask;
+}
+
+ResultSpecProto GenerateRandomResultSpecProto(MonkeyTestRandomEngine* random,
+ const SchemaProto* schema) {
+ std::uniform_int_distribution<> dist(0, 4);
+ ResultSpecProto result_spec;
+ // 1/5 chance of getting one of 1, 4, 16, 64, 256
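+ // (e.g. dist(*random) == 2 yields 1 << 4 = 16 results per page).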
+ int num_per_page = 1 << (2 * dist(*random));
+ result_spec.set_num_per_page(num_per_page);
+ *result_spec.mutable_snippet_spec() =
+ GenerateRandomSnippetSpecProto(random, result_spec);
+
+ // 1/5 chance of enabling projection.
+ if (dist(*random) == 0) {
+ for (const SchemaTypeConfigProto& type_config : schema->types()) {
+ // 25% chance of adding the current type to the projection.
+ std::uniform_int_distribution<> dist(0, 3);
+ if (dist(*random) == 0) {
+ *result_spec.add_type_property_masks() =
+ GenerateTypePropertyMask(random, type_config);
+ }
+ }
+ }
+ return result_spec;
+}
+
+void SortDocuments(std::vector<DocumentProto>& documents) {
+ std::sort(documents.begin(), documents.end(),
+ [](const DocumentProto& doc1, const DocumentProto& doc2) {
+ if (doc1.namespace_() != doc2.namespace_()) {
+ return doc1.namespace_() < doc2.namespace_();
+ }
+ return doc1.uri() < doc2.uri();
+ });
+}
+
+} // namespace
+
+IcingMonkeyTestRunner::IcingMonkeyTestRunner(
+ IcingMonkeyTestRunnerConfiguration config)
+ : config_(std::move(config)),
+ random_(config_.seed),
+ in_memory_icing_(std::make_unique<InMemoryIcingSearchEngine>(&random_)),
+ schema_generator_(
+ std::make_unique<MonkeySchemaGenerator>(&random_, &config_)) {
+ ICING_LOG(INFO) << "Monkey test runner started with seed: " << config_.seed;
+ std::string dir = GetTestTempDir() + "/icing/monkey";
+ filesystem_.DeleteDirectoryRecursively(dir.c_str());
+ icing_dir_ = std::make_unique<DestructibleDirectory>(&filesystem_, dir);
+}
+
+void IcingMonkeyTestRunner::Run(uint32_t num) {
+ ASSERT_TRUE(icing_ != nullptr)
+ << "Icing search engine has not yet been created. Please call "
+ "Initialize() first";
+
+ uint32_t frequency_sum = 0;
+ for (const auto& schedule : config_.monkey_api_schedules) {
+ frequency_sum += schedule.second;
+ }
+ std::uniform_int_distribution<> dist(0, frequency_sum - 1);
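+ // Weighted pick: draw p uniformly from [0, frequency_sum) and walk the
+ // schedule, so each API is chosen with probability frequency/frequency_sum.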
+ for (; num; --num) {
+ int p = dist(random_);
+ for (const auto& schedule : config_.monkey_api_schedules) {
+ if (p < schedule.second) {
+ ASSERT_NO_FATAL_FAILURE(schedule.first(this));
+ break;
+ }
+ p -= schedule.second;
+ }
+ ICING_LOG(INFO) << "Completed Run #" << num
+ << ". Documents in the in-memory icing: "
+ << in_memory_icing_->GetNumAliveDocuments();
+ }
+}
+
+SetSchemaResultProto IcingMonkeyTestRunner::SetSchema(SchemaProto&& schema) {
+ in_memory_icing_->SetSchema(std::move(schema));
+ document_generator_ = std::make_unique<MonkeyDocumentGenerator>(
+ &random_, in_memory_icing_->GetSchema(), &config_);
+ return icing_->SetSchema(*in_memory_icing_->GetSchema(),
+ /*ignore_errors_and_delete_documents=*/true);
+}
+
+void IcingMonkeyTestRunner::Initialize() {
+ ASSERT_NO_FATAL_FAILURE(CreateIcingSearchEngine());
+
+ SchemaProto schema = schema_generator_->GenerateSchema();
+ ICING_LOG(DBG) << "Schema Generated: " << schema.DebugString();
+
+ ASSERT_THAT(SetSchema(std::move(schema)).status(), ProtoIsOk());
+}
+
+void IcingMonkeyTestRunner::DoUpdateSchema() {
+ ICING_LOG(INFO) << "Monkey updating schema";
+
+ MonkeySchemaGenerator::UpdateSchemaResult result =
+ schema_generator_->UpdateSchema(*in_memory_icing_->GetSchema());
+ if (result.is_invalid_schema) {
+ SetSchemaResultProto set_schema_result =
+ icing_->SetSchema(result.schema,
+ /*ignore_errors_and_delete_documents=*/true);
+ ASSERT_THAT(set_schema_result.status(), Not(ProtoIsOk()));
+ return;
+ }
+ ICING_LOG(DBG) << "Updating schema to: " << result.schema.DebugString();
+ SetSchemaResultProto icing_set_schema_result =
+ SetSchema(std::move(result.schema));
+ ASSERT_THAT(icing_set_schema_result.status(), ProtoIsOk());
+ ASSERT_THAT(icing_set_schema_result.deleted_schema_types(),
+ UnorderedElementsAreArray(result.schema_types_deleted));
+ ASSERT_THAT(icing_set_schema_result.incompatible_schema_types(),
+ UnorderedElementsAreArray(result.schema_types_incompatible));
+ ASSERT_THAT(
+ icing_set_schema_result.index_incompatible_changed_schema_types(),
+ UnorderedElementsAreArray(result.schema_types_index_incompatible));
+
+ // Update in-memory icing
+ for (const std::string& deleted_type : result.schema_types_deleted) {
+ ICING_ASSERT_OK(in_memory_icing_->DeleteBySchemaType(deleted_type));
+ }
+ for (const std::string& incompatible_type :
+ result.schema_types_incompatible) {
+ ICING_ASSERT_OK(in_memory_icing_->DeleteBySchemaType(incompatible_type));
+ }
+}
+
+void IcingMonkeyTestRunner::DoGet() {
+ InMemoryIcingSearchEngine::PickDocumentResult document =
+ in_memory_icing_->RandomPickDocument(/*p_alive=*/0.70, /*p_all=*/0.28,
+ /*p_other=*/0.02);
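+ // Mostly pick alive documents, but occasionally pick deleted or never-put
+ // ones so that both the OK and NOT_FOUND paths below are exercised.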
+ ICING_LOG(INFO) << "Monkey getting namespace: " << document.name_space
+ << ", uri: " << document.uri;
+ GetResultProto get_result =
+ icing_->Get(document.name_space, document.uri,
+ GetResultSpecProto::default_instance());
+ if (document.document.has_value()) {
+ ASSERT_THAT(get_result.status(), ProtoIsOk())
+ << "Cannot find the document that is supposed to exist.";
+ ASSERT_THAT(get_result.document(), EqualsProto(document.document.value()))
+ << "The document found does not match with the value in the in-memory "
+ "icing.";
+ } else {
+ // No document should have been found.
+ if (get_result.status().code() != StatusProto::NOT_FOUND) {
+ if (get_result.status().code() == StatusProto::OK) {
+ FAIL() << "Found a document that is not supposed to be found.";
+ }
+ FAIL() << "Icing search engine failure (code "
+ << get_result.status().code()
+ << "): " << get_result.status().message();
+ }
+ }
+}
+
+void IcingMonkeyTestRunner::DoGetAllNamespaces() {
+ ICING_LOG(INFO) << "Monkey getting all namespaces";
+ GetAllNamespacesResultProto get_result = icing_->GetAllNamespaces();
+ ASSERT_THAT(get_result.status(), ProtoIsOk());
+ ASSERT_THAT(get_result.namespaces(),
+ UnorderedElementsAreArray(in_memory_icing_->GetAllNamespaces()));
+}
+
+void IcingMonkeyTestRunner::DoPut() {
+ MonkeyTokenizedDocument doc = document_generator_->GenerateDocument();
+ ICING_LOG(INFO) << "Monkey document generated, namespace: "
+ << doc.document.namespace_()
+ << ", uri: " << doc.document.uri();
+ ICING_LOG(DBG) << doc.document.DebugString();
+ in_memory_icing_->Put(doc);
+ ASSERT_THAT(icing_->Put(doc.document).status(), ProtoIsOk());
+}
+
+void IcingMonkeyTestRunner::DoDelete() {
+ InMemoryIcingSearchEngine::PickDocumentResult document =
+ in_memory_icing_->RandomPickDocument(/*p_alive=*/0.70, /*p_all=*/0.2,
+ /*p_other=*/0.1);
+ ICING_LOG(INFO) << "Monkey deleting namespace: " << document.name_space
+ << ", uri: " << document.uri;
+ DeleteResultProto delete_result =
+ icing_->Delete(document.name_space, document.uri);
+ if (document.document.has_value()) {
+ ICING_ASSERT_OK(
+ in_memory_icing_->Delete(document.name_space, document.uri));
+ ASSERT_THAT(delete_result.status(), ProtoIsOk())
+ << "Cannot delete an existing document.";
+ } else {
+ // No document should have been deleted.
+ if (delete_result.status().code() != StatusProto::NOT_FOUND) {
+ if (delete_result.status().code() == StatusProto::OK) {
+ FAIL() << "Deleted a non-existing document without an error.";
+ }
+ FAIL() << "Icing search engine failure (code "
+ << delete_result.status().code()
+ << "): " << delete_result.status().message();
+ }
+ }
+}
+
+void IcingMonkeyTestRunner::DoDeleteByNamespace() {
+ std::string name_space = document_generator_->GetNamespace();
+ ICING_LOG(INFO) << "Monkey deleting namespace: " << name_space;
+ DeleteByNamespaceResultProto delete_result =
+ icing_->DeleteByNamespace(name_space);
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t num_docs_deleted,
+ in_memory_icing_->DeleteByNamespace(name_space));
+ if (num_docs_deleted != 0) {
+ ASSERT_THAT(delete_result.status(), ProtoIsOk())
+ << "Cannot delete an existing namespace.";
+ ASSERT_THAT(delete_result.delete_stats().num_documents_deleted(),
+ Eq(num_docs_deleted));
+ } else {
+ // No document should have been deleted.
+ if (delete_result.status().code() != StatusProto::NOT_FOUND) {
+ if (delete_result.status().code() == StatusProto::OK) {
+ FAIL() << "Deleted a non-existing namespace without an error.";
+ }
+ FAIL() << "Icing search engine failure (code "
+ << delete_result.status().code()
+ << "): " << delete_result.status().message();
+ }
+ }
+}
+
+void IcingMonkeyTestRunner::DoDeleteBySchemaType() {
+ std::string schema_type = document_generator_->GetType().schema_type();
+ ICING_LOG(INFO) << "Monkey deleting type: " << schema_type;
+ DeleteBySchemaTypeResultProto delete_result =
+ icing_->DeleteBySchemaType(schema_type);
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t num_docs_deleted,
+ in_memory_icing_->DeleteBySchemaType(schema_type));
+ if (num_docs_deleted != 0) {
+ ASSERT_THAT(delete_result.status(), ProtoIsOk())
+ << "Cannot delete an existing schema type.";
+ ASSERT_THAT(delete_result.delete_stats().num_documents_deleted(),
+ Eq(num_docs_deleted));
+ } else {
+ // No document should have been deleted.
+ if (delete_result.status().code() != StatusProto::NOT_FOUND) {
+ if (delete_result.status().code() == StatusProto::OK) {
+ FAIL() << "Deleted a non-existing schema type without an error.";
+ }
+ FAIL() << "Icing search engine failure (code "
+ << delete_result.status().code()
+ << "): " << delete_result.status().message();
+ }
+ }
+}
+
+void IcingMonkeyTestRunner::DoDeleteByQuery() {
+ SearchSpecProto search_spec =
+ GenerateRandomSearchSpecProto(&random_, document_generator_.get());
+ ICING_LOG(INFO) << "Monkey deleting by query: " << search_spec.query();
+ DeleteByQueryResultProto delete_result = icing_->DeleteByQuery(search_spec);
+ ICING_ASSERT_OK_AND_ASSIGN(uint32_t num_docs_deleted,
+ in_memory_icing_->DeleteByQuery(search_spec));
+ if (num_docs_deleted != 0) {
+ ASSERT_THAT(delete_result.status(), ProtoIsOk())
+ << "Cannot delete documents that matches with the query.";
+ ASSERT_THAT(delete_result.delete_by_query_stats().num_documents_deleted(),
+ Eq(num_docs_deleted));
+ } else {
+ // No document should have been deleted.
+ if (delete_result.status().code() != StatusProto::NOT_FOUND) {
+ if (delete_result.status().code() == StatusProto::OK) {
+ FAIL() << "Deleted documents that should not match with the query "
+ "without an error.";
+ }
+ FAIL() << "Icing search engine failure (code "
+ << delete_result.status().code()
+ << "): " << delete_result.status().message();
+ }
+ }
+ ICING_LOG(INFO)
+ << delete_result.delete_by_query_stats().num_documents_deleted()
+ << " documents deleted by query.";
+}
+
+void IcingMonkeyTestRunner::DoSearch() {
+ std::unique_ptr<SearchSpecProto> search_spec =
+ std::make_unique<SearchSpecProto>(
+ GenerateRandomSearchSpecProto(&random_, document_generator_.get()));
+ std::unique_ptr<ScoringSpecProto> scoring_spec =
+ std::make_unique<ScoringSpecProto>(GenerateRandomScoringSpec(&random_));
+ std::unique_ptr<ResultSpecProto> result_spec =
+ std::make_unique<ResultSpecProto>(GenerateRandomResultSpecProto(
+ &random_, in_memory_icing_->GetSchema()));
+ const ResultSpecProto::SnippetSpecProto snippet_spec =
+ result_spec->snippet_spec();
+ bool is_projection_enabled = !result_spec->type_property_masks().empty();
+
+ ICING_LOG(INFO) << "Monkey searching by query: " << search_spec->query()
+ << ", term_match_type: " << search_spec->term_match_type();
+ ICING_VLOG(1) << "search_spec:\n" << search_spec->DebugString();
+ ICING_VLOG(1) << "scoring_spec:\n" << scoring_spec->DebugString();
+ ICING_VLOG(1) << "result_spec:\n" << result_spec->DebugString();
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<DocumentProto> exp_documents,
+ in_memory_icing_->Search(*search_spec));
+
+ SearchResultProto search_result =
+ icing_->Search(*search_spec, *scoring_spec, *result_spec);
+ ASSERT_THAT(search_result.status(), ProtoIsOk());
+
+ // Delete all of the specs used in the search. GetNextPage should have no
+ // problem because it shouldn't be keeping any references to them.
+ search_spec.reset();
+ scoring_spec.reset();
+ result_spec.reset();
+
+ std::vector<DocumentProto> actual_documents;
+ int num_snippeted = 0;
+ while (true) {
+ for (const SearchResultProto::ResultProto& doc : search_result.results()) {
+ actual_documents.push_back(doc.document());
+ if (!doc.snippet().entries().empty()) {
+ ++num_snippeted;
+ for (const SnippetProto::EntryProto& entry : doc.snippet().entries()) {
+ ASSERT_THAT(entry.snippet_matches(),
+ SizeIs(Le(snippet_spec.num_matches_per_property())));
+ }
+ }
+ }
+ if (search_result.next_page_token() == kInvalidNextPageToken) {
+ break;
+ }
+ search_result = icing_->GetNextPage(search_result.next_page_token());
+ ASSERT_THAT(search_result.status(), ProtoIsOk());
+ }
+ // The maximum number of scored documents allowed in Icing is 30000; beyond
+ // that, we are not able to compare the results with the in-memory Icing.
+ if (exp_documents.size() >= 30000) {
+ return;
+ }
+ if (snippet_spec.num_matches_per_property() > 0 && !is_projection_enabled) {
+ ASSERT_THAT(num_snippeted,
+ Eq(std::min<uint32_t>(exp_documents.size(),
+ snippet_spec.num_to_snippet())));
+ }
+ SortDocuments(exp_documents);
+ SortDocuments(actual_documents);
+ ASSERT_THAT(actual_documents, SizeIs(exp_documents.size()));
+ for (int i = 0; i < exp_documents.size(); ++i) {
+ if (is_projection_enabled) {
+ ASSERT_THAT(actual_documents[i].namespace_(),
+ Eq(exp_documents[i].namespace_()));
+ ASSERT_THAT(actual_documents[i].uri(), Eq(exp_documents[i].uri()));
+ continue;
+ }
+ ASSERT_THAT(actual_documents[i], EqualsProto(exp_documents[i]));
+ }
+ ICING_LOG(INFO) << exp_documents.size() << " documents found by query.";
+}
+
+void IcingMonkeyTestRunner::ReloadFromDisk() {
+ ICING_LOG(INFO) << "Monkey reloading from disk";
+ // Destruct the icing search engine by resetting the unique pointer.
+ icing_.reset();
+ ASSERT_NO_FATAL_FAILURE(CreateIcingSearchEngine());
+}
+
+void IcingMonkeyTestRunner::DoOptimize() {
+ ICING_LOG(INFO) << "Monkey doing optimization";
+ ASSERT_THAT(icing_->Optimize().status(), ProtoIsOk());
+}
+
+void IcingMonkeyTestRunner::CreateIcingSearchEngine() {
+ std::uniform_int_distribution<> dist(0, 1);
+
+ bool always_rebuild_index_optimize = dist(random_);
+ float optimize_rebuild_index_threshold =
+ always_rebuild_index_optimize ? 0.0 : 0.9;
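+ // With probability 1/2, the threshold is 0.0 so that Optimize() always
+ // rebuilds the index; otherwise the high threshold of 0.9 makes rebuilds
+ // rare.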
+
+ IcingSearchEngineOptions icing_options;
+ icing_options.set_index_merge_size(config_.index_merge_size);
+ icing_options.set_base_dir(icing_dir_->dir());
+ icing_options.set_optimize_rebuild_index_threshold(
+ optimize_rebuild_index_threshold);
+ // This method is called every time we ReloadFromDisk(), so randomly flip
+ // this flag to test the document store's compatibility.
+ icing_options.set_document_store_namespace_id_fingerprint(
+ static_cast<bool>(dist(random_)));
+ icing_ = std::make_unique<IcingSearchEngine>(icing_options);
+ ASSERT_THAT(icing_->Initialize().status(), ProtoIsOk());
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/monkey_test/icing-monkey-test-runner.h b/icing/monkey_test/icing-monkey-test-runner.h
new file mode 100644
index 0000000..10be60c
--- /dev/null
+++ b/icing/monkey_test/icing-monkey-test-runner.h
@@ -0,0 +1,79 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_MONKEY_TEST_ICING_MONKEY_TEST_RUNNER_H_
+#define ICING_MONKEY_TEST_ICING_MONKEY_TEST_RUNNER_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "icing/file/destructible-directory.h"
+#include "icing/file/filesystem.h"
+#include "icing/icing-search-engine.h"
+#include "icing/monkey_test/in-memory-icing-search-engine.h"
+#include "icing/monkey_test/monkey-test-generators.h"
+#include "icing/monkey_test/monkey-test-util.h"
+#include "icing/proto/schema.pb.h"
+
+namespace icing {
+namespace lib {
+
+class IcingMonkeyTestRunner {
+ public:
+ explicit IcingMonkeyTestRunner(IcingMonkeyTestRunnerConfiguration config);
+ IcingMonkeyTestRunner(const IcingMonkeyTestRunner&) = delete;
+ IcingMonkeyTestRunner& operator=(const IcingMonkeyTestRunner&) = delete;
+
+ SetSchemaResultProto SetSchema(SchemaProto&& schema);
+
+ // This function must be called before running the monkey test, and should
+ // only be called once.
+ void Initialize();
+
+ // Run the monkey test with num operations.
+ void Run(uint32_t num);
+
+ // APIs supported in icing search engine.
+ void DoUpdateSchema();
+ void DoGet();
+ void DoGetAllNamespaces();
+ void DoPut();
+ void DoDelete();
+ void DoDeleteByNamespace();
+ void DoDeleteBySchemaType();
+ void DoDeleteByQuery();
+ void DoSearch();
+
+ // Operations with no observable side-effects.
+ void ReloadFromDisk();
+ void DoOptimize();
+
+ private:
+ IcingMonkeyTestRunnerConfiguration config_;
+ MonkeyTestRandomEngine random_;
+ Filesystem filesystem_;
+ std::unique_ptr<DestructibleDirectory> icing_dir_;
+ std::unique_ptr<InMemoryIcingSearchEngine> in_memory_icing_;
+ std::unique_ptr<IcingSearchEngine> icing_;
+
+ std::unique_ptr<MonkeySchemaGenerator> schema_generator_;
+ std::unique_ptr<MonkeyDocumentGenerator> document_generator_;
+
+ void CreateIcingSearchEngine();
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_MONKEY_TEST_ICING_MONKEY_TEST_RUNNER_H_
diff --git a/icing/monkey_test/icing-search-engine_monkey_test.cc b/icing/monkey_test/icing-search-engine_monkey_test.cc
new file mode 100644
index 0000000..436e27b
--- /dev/null
+++ b/icing/monkey_test/icing-search-engine_monkey_test.cc
@@ -0,0 +1,99 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <random>
+#include <utility>
+
+#include "gtest/gtest.h"
+#include "icing/monkey_test/icing-monkey-test-runner.h"
+#include "icing/monkey_test/monkey-test-util.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/schema/section.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+
+TEST(IcingSearchEngineMonkeyTest, MonkeyTest) {
+ IcingMonkeyTestRunnerConfiguration config(
+ /*seed=*/std::random_device()(),
+ /*num_types=*/30,
+ /*num_namespaces=*/100,
+ /*num_uris=*/1000,
+ /*index_merge_size=*/1024 * 1024);
+ config.possible_num_properties = {0,
+ 1,
+ 2,
+ 4,
+ 8,
+ 16,
+ kTotalNumSections / 2,
+ kTotalNumSections,
+ kTotalNumSections + 1,
+ kTotalNumSections * 2};
+ config.possible_num_tokens_ = {0, 1, 4, 16, 64, 256};
+ config.monkey_api_schedules = {
+ {&IcingMonkeyTestRunner::DoPut, 500},
+ {&IcingMonkeyTestRunner::DoSearch, 200},
+ {&IcingMonkeyTestRunner::DoGet, 70},
+ {&IcingMonkeyTestRunner::DoGetAllNamespaces, 50},
+ {&IcingMonkeyTestRunner::DoDelete, 50},
+ {&IcingMonkeyTestRunner::DoDeleteByNamespace, 50},
+ {&IcingMonkeyTestRunner::DoDeleteBySchemaType, 45},
+ {&IcingMonkeyTestRunner::DoDeleteByQuery, 20},
+ {&IcingMonkeyTestRunner::DoOptimize, 5},
+ {&IcingMonkeyTestRunner::DoUpdateSchema, 5},
+ {&IcingMonkeyTestRunner::ReloadFromDisk, 5}};
+ uint32_t num_iterations = IsAndroidArm() ? 1000 : 5000;
+ IcingMonkeyTestRunner runner(std::move(config));
+ ASSERT_NO_FATAL_FAILURE(runner.Initialize());
+ ASSERT_NO_FATAL_FAILURE(runner.Run(num_iterations));
+}
+
+TEST(DISABLED_IcingSearchEngineMonkeyTest, MonkeyManyDocTest) {
+ IcingMonkeyTestRunnerConfiguration config(
+ /*seed=*/std::random_device()(),
+ /*num_types=*/30,
+ /*num_namespaces=*/200,
+ /*num_uris=*/100000,
+ /*index_merge_size=*/1024 * 1024);
+
+ // Due to the large number of documents, we need to make each document
+ // smaller to finish the test.
+ config.possible_num_properties = {0, 1, 2};
+ config.possible_num_tokens_ = {0, 1, 4};
+
+ // No deletion is performed to preserve a large number of documents.
+ config.monkey_api_schedules = {
+ {&IcingMonkeyTestRunner::DoPut, 500},
+ {&IcingMonkeyTestRunner::DoSearch, 200},
+ {&IcingMonkeyTestRunner::DoGet, 70},
+ {&IcingMonkeyTestRunner::DoGetAllNamespaces, 50},
+ {&IcingMonkeyTestRunner::DoOptimize, 5},
+ {&IcingMonkeyTestRunner::ReloadFromDisk, 5}};
+ IcingMonkeyTestRunner runner(std::move(config));
+ ASSERT_NO_FATAL_FAILURE(runner.Initialize());
+ // Pre-fill with 4 million documents
+ SetLoggingLevel(LogSeverity::WARNING);
+ for (int i = 0; i < 4000000; i++) {
+ ASSERT_NO_FATAL_FAILURE(runner.DoPut());
+ }
+ SetLoggingLevel(LogSeverity::INFO);
+ ASSERT_NO_FATAL_FAILURE(runner.Run(1000));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/monkey_test/in-memory-icing-search-engine.cc b/icing/monkey_test/in-memory-icing-search-engine.cc
new file mode 100644
index 0000000..7baa06e
--- /dev/null
+++ b/icing/monkey_test/in-memory-icing-search-engine.cc
@@ -0,0 +1,352 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/monkey_test/in-memory-icing-search-engine.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <random>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/absl_ports/str_join.h"
+#include "icing/monkey_test/monkey-tokenized-document.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/store/document-id.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Check if s1 is a prefix of s2.
+bool IsPrefix(std::string_view s1, std::string_view s2) {
+ if (s1.length() > s2.length()) {
+ return false;
+ }
+ return s1 == s2.substr(0, s1.length());
+}
+
+} // namespace
+
+libtextclassifier3::StatusOr<const PropertyConfigProto *>
+InMemoryIcingSearchEngine::GetPropertyConfig(
+ const std::string &schema_type, const std::string &property_name) const {
+ auto schema_iter = property_config_map_.find(schema_type);
+ if (schema_iter == property_config_map_.end()) {
+ return absl_ports::NotFoundError(
+ absl_ports::StrCat("Schema type: ", schema_type, " is not found."));
+ }
+ auto property_iter = schema_iter->second.find(property_name);
+ if (property_iter == schema_iter->second.end()) {
+ return absl_ports::NotFoundError(
+ absl_ports::StrCat("Property: ", property_name, " is not found."));
+ }
+ return &property_iter->second;
+}
+
+libtextclassifier3::StatusOr<TermMatchType::Code>
+InMemoryIcingSearchEngine::GetTermMatchType(
+ const std::string &schema_type,
+ const MonkeyTokenizedSection &section) const {
+ bool in_indexable_properties_list = false;
+ bool all_indexable_from_top = true;
+
+ std::vector<std::string_view> properties_in_path =
+ absl_ports::StrSplit(section.path, ".");
+ if (properties_in_path.empty()) {
+ return absl_ports::InvalidArgumentError("Got empty path.");
+ }
+ std::string curr_schema_type = schema_type;
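+ // Walk the property path level by level. A string property is indexable
+ // only if every ancestor document property sets index_nested_properties, or
+ // if the path remaining below the first ancestor that clears it appears in
+ // that ancestor's indexable_nested_properties_list.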
+ for (int i = 0; i < properties_in_path.size(); ++i) {
+ ICING_ASSIGN_OR_RETURN(
+ const PropertyConfigProto *prop,
+ GetPropertyConfig(curr_schema_type,
+ std::string(properties_in_path[i])));
+ if (prop->data_type() == PropertyConfigProto::DataType::STRING) {
+ return prop->string_indexing_config().term_match_type();
+ }
+
+ if (prop->data_type() != PropertyConfigProto::DataType::DOCUMENT) {
+ return TermMatchType::Code::TermMatchType_Code_UNKNOWN;
+ }
+
+ bool old_all_indexable_from_top = all_indexable_from_top;
+ all_indexable_from_top &=
+ prop->document_indexing_config().index_nested_properties();
+ if (!all_indexable_from_top && !in_indexable_properties_list) {
+ // Only try to update in_indexable_properties_list if this is the first
+ // level with index_nested_properties=false.
+ if (old_all_indexable_from_top) {
+ auto &indexable_properties =
+ prop->document_indexing_config().indexable_nested_properties_list();
+ std::string relative_path =
+ absl_ports::StrCatPieces(std::vector<std::string_view>(
+ properties_in_path.begin() + i + 1, properties_in_path.end()));
+ in_indexable_properties_list =
+ std::find(indexable_properties.begin(), indexable_properties.end(),
+ relative_path) != indexable_properties.end();
+ }
+ // Check in_indexable_properties_list again.
+ if (!in_indexable_properties_list) {
+ return TermMatchType::Code::TermMatchType_Code_UNKNOWN;
+ }
+ }
+ curr_schema_type = prop->document_indexing_config().GetTypeName();
+ }
+ return TermMatchType::Code::TermMatchType_Code_UNKNOWN;
+}
+
+libtextclassifier3::StatusOr<bool>
+InMemoryIcingSearchEngine::DoesDocumentMatchQuery(
+ const MonkeyTokenizedDocument &document, const std::string &query,
+ TermMatchType::Code term_match_type) const {
+ std::vector<std::string_view> strs = absl_ports::StrSplit(query, ":");
+ std::string_view query_term;
+ std::string_view section_restrict;
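+ // A query of the form "property:term" restricts matching to the given
+ // section; otherwise the whole query string is treated as the term.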
+ if (strs.size() > 1) {
+ section_restrict = strs[0];
+ query_term = strs[1];
+ } else {
+ query_term = query;
+ }
+ for (const MonkeyTokenizedSection &section : document.tokenized_sections) {
+ if (!section_restrict.empty() && section.path != section_restrict) {
+ continue;
+ }
+ ICING_ASSIGN_OR_RETURN(
+ TermMatchType::Code section_term_match_type,
+ GetTermMatchType(document.document.schema(), section));
+ if (section_term_match_type == TermMatchType::UNKNOWN) {
+ // Skip non-indexable property.
+ continue;
+ }
+ for (const std::string &token : section.token_sequence) {
+ if (section_term_match_type == TermMatchType::EXACT_ONLY ||
+ term_match_type == TermMatchType::EXACT_ONLY) {
+ if (token == query_term) {
+ return true;
+ }
+ } else if (IsPrefix(query_term, token)) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+void InMemoryIcingSearchEngine::SetSchema(SchemaProto &&schema) {
+ schema_ = std::make_unique<SchemaProto>(std::move(schema));
+ property_config_map_.clear();
+ for (const SchemaTypeConfigProto &type_config : schema_->types()) {
+ auto &curr_property_map = property_config_map_[type_config.schema_type()];
+ for (const PropertyConfigProto &property_config :
+ type_config.properties()) {
+ curr_property_map.insert(
+ {property_config.property_name(), property_config});
+ }
+ }
+}
+
+InMemoryIcingSearchEngine::PickDocumentResult
+InMemoryIcingSearchEngine::RandomPickDocument(float p_alive, float p_all,
+ float p_other) const {
+ // Normalizing p_alive, p_all and p_other, so that they sum to 1.
+ if (p_alive == 0 && p_all == 0 && p_other == 0) {
+ p_alive = p_all = p_other = 1 / 3.;
+ } else {
+ float p_sum = p_alive + p_all + p_other;
+ p_alive = p_alive / p_sum;
+ p_all = p_all / p_sum;
+ p_other = p_other / p_sum;
+ }
+
+ std::uniform_real_distribution<> real_dist(0, 1);
+ float p = real_dist(*random_);
+ if (p <= p_other || documents_.empty()) {
+ // 20 is a fair number of non-existing namespaces and uris, enough for
+ // monkey testing.
+ std::uniform_int_distribution<> dist(0, 19);
+ std::string name_space = absl_ports::StrCat("non_existing_namespace",
+ std::to_string(dist(*random_)));
+ std::string uri =
+ absl_ports::StrCat("non_existing_uri", std::to_string(dist(*random_)));
+ return {name_space, uri};
+ }
+ p -= p_other;
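+ // With the remaining probability mass, pick from every document ever put
+ // (p_all, which may return an overwritten or deleted one) or only from the
+ // documents still alive (p_alive).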
+ DocumentId doc_id;
+ if (p <= p_all || existing_doc_ids_.empty()) {
+ std::uniform_int_distribution<DocumentId> dist(0, documents_.size() - 1);
+ doc_id = dist(*random_);
+ } else {
+ std::uniform_int_distribution<DocumentId> dist(
+ 0, existing_doc_ids_.size() - 1);
+ doc_id = existing_doc_ids_[dist(*random_)];
+ }
+ InMemoryIcingSearchEngine::PickDocumentResult result = {
+ documents_[doc_id].document.namespace_(),
+ documents_[doc_id].document.uri()};
+
+ // Even if the (name_space, uri) of the picked doc_id has not been deleted
+ // explicitly, doc_id may be outdated because of possible overwriting, so we
+ // need to find the latest document id and return the latest DocumentProto.
+ auto latest_doc_id = InternalGet(result.name_space, result.uri);
+ if (latest_doc_id.ok()) {
+ result.document = documents_[latest_doc_id.ValueOrDie()].document;
+ }
+ return result;
+}
+
+void InMemoryIcingSearchEngine::Put(const MonkeyTokenizedDocument &document) {
+ // Delete the old one if existing.
+ Delete(document.document.namespace_(), document.document.uri()).IgnoreError();
+ existing_doc_ids_.push_back(documents_.size());
+ namespace_uri_docid_map_[document.document.namespace_()]
+ [document.document.uri()] = documents_.size();
+ documents_.push_back(document);
+}
+
+std::unordered_set<std::string> InMemoryIcingSearchEngine::GetAllNamespaces()
+ const {
+ std::unordered_set<std::string> namespaces;
+ for (DocumentId doc_id : existing_doc_ids_) {
+ namespaces.insert(documents_[doc_id].document.namespace_());
+ }
+ return namespaces;
+}
+
+libtextclassifier3::Status InMemoryIcingSearchEngine::Delete(
+ const std::string &name_space, const std::string &uri) {
+ libtextclassifier3::StatusOr<DocumentId> doc_id_or =
+ InternalGet(name_space, uri);
+ if (doc_id_or.ok()) {
+ DocumentId doc_id = doc_id_or.ValueOrDie();
+ const DocumentProto &document = documents_[doc_id].document;
+ namespace_uri_docid_map_[document.namespace_()].erase(document.uri());
+ auto end_itr =
+ std::remove(existing_doc_ids_.begin(), existing_doc_ids_.end(), doc_id);
+ existing_doc_ids_.erase(end_itr, existing_doc_ids_.end());
+ }
+ return doc_id_or.status();
+}
+
+libtextclassifier3::StatusOr<uint32_t>
+InMemoryIcingSearchEngine::DeleteByNamespace(const std::string &name_space) {
+ std::vector<DocumentId> doc_ids_to_delete;
+ for (DocumentId doc_id : existing_doc_ids_) {
+ if (documents_[doc_id].document.namespace_() == name_space) {
+ doc_ids_to_delete.push_back(doc_id);
+ }
+ }
+ for (DocumentId doc_id : doc_ids_to_delete) {
+ const DocumentProto &document = documents_[doc_id].document;
+ if (!Delete(document.namespace_(), document.uri()).ok()) {
+ return absl_ports::InternalError(
+ "Should never happen. There are inconsistencies in the in-memory "
+ "Icing.");
+ }
+ }
+ return doc_ids_to_delete.size();
+}
+
+libtextclassifier3::StatusOr<uint32_t>
+InMemoryIcingSearchEngine::DeleteBySchemaType(const std::string &schema_type) {
+ std::vector<DocumentId> doc_ids_to_delete;
+ for (DocumentId doc_id : existing_doc_ids_) {
+ if (documents_[doc_id].document.schema() == schema_type) {
+ doc_ids_to_delete.push_back(doc_id);
+ }
+ }
+ for (DocumentId doc_id : doc_ids_to_delete) {
+ const DocumentProto &document = documents_[doc_id].document;
+ if (!Delete(document.namespace_(), document.uri()).ok()) {
+ return absl_ports::InternalError(
+ "Should never happen. There are inconsistencies in the in-memory "
+ "Icing.");
+ }
+ }
+ return doc_ids_to_delete.size();
+}
+
+libtextclassifier3::StatusOr<uint32_t> InMemoryIcingSearchEngine::DeleteByQuery(
+ const SearchSpecProto &search_spec) {
+ ICING_ASSIGN_OR_RETURN(std::vector<DocumentId> doc_ids_to_delete,
+ InternalSearch(search_spec));
+ for (DocumentId doc_id : doc_ids_to_delete) {
+ const DocumentProto &document = documents_[doc_id].document;
+ if (!Delete(document.namespace_(), document.uri()).ok()) {
+ return absl_ports::InternalError(
+ "Should never happen. There are inconsistencies in the in-memory "
+ "Icing.");
+ }
+ }
+ return doc_ids_to_delete.size();
+}
+
+libtextclassifier3::StatusOr<std::vector<DocumentProto>>
+InMemoryIcingSearchEngine::Search(const SearchSpecProto &search_spec) const {
+ ICING_ASSIGN_OR_RETURN(std::vector<DocumentId> matched_doc_ids,
+ InternalSearch(search_spec));
+ std::vector<DocumentProto> result;
+ result.reserve(matched_doc_ids.size());
+ for (DocumentId doc_id : matched_doc_ids) {
+ result.push_back(documents_[doc_id].document);
+ }
+ return result;
+}
+
+libtextclassifier3::StatusOr<DocumentId> InMemoryIcingSearchEngine::InternalGet(
+ const std::string &name_space, const std::string &uri) const {
+ auto uris = namespace_uri_docid_map_.find(name_space);
+ if (uris != namespace_uri_docid_map_.end()) {
+ auto doc = uris->second.find(uri);
+ if (doc != uris->second.end()) {
+ return doc->second;
+ }
+ }
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ name_space, ", ", uri,
+ " is not found by InMemoryIcingSearchEngine::InternalGet."));
+}
+
+libtextclassifier3::StatusOr<std::vector<DocumentId>>
+InMemoryIcingSearchEngine::InternalSearch(
+ const SearchSpecProto &search_spec) const {
+ std::vector<DocumentId> matched_doc_ids;
+ for (DocumentId doc_id : existing_doc_ids_) {
+ ICING_ASSIGN_OR_RETURN(
+ bool match,
+ DoesDocumentMatchQuery(documents_[doc_id], search_spec.query(),
+ search_spec.term_match_type()));
+ if (match) {
+ matched_doc_ids.push_back(doc_id);
+ }
+ }
+ return matched_doc_ids;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/monkey_test/in-memory-icing-search-engine.h b/icing/monkey_test/in-memory-icing-search-engine.h
new file mode 100644
index 0000000..98e7e4c
--- /dev/null
+++ b/icing/monkey_test/in-memory-icing-search-engine.h
@@ -0,0 +1,167 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_MONKEY_TEST_IN_MEMORY_ICING_SEARCH_ENGINE_H_
+#define ICING_MONKEY_TEST_IN_MEMORY_ICING_SEARCH_ENGINE_H_
+
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/monkey_test/monkey-test-util.h"
+#include "icing/monkey_test/monkey-tokenized-document.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+class InMemoryIcingSearchEngine {
+ public:
+ struct PickDocumentResult {
+ std::string name_space;
+ std::string uri;
+ // document is empty if and only if the (name_space, uri) pair is not alive
+ // in the in-memory icing.
+ std::optional<DocumentProto> document;
+ };
+
+ explicit InMemoryIcingSearchEngine(MonkeyTestRandomEngine *random)
+ : random_(random) {}
+
+ uint32_t GetNumAliveDocuments() const { return existing_doc_ids_.size(); }
+
+ const SchemaProto *GetSchema() const { return schema_.get(); }
+
+ void SetSchema(SchemaProto &&schema);
+
+ // Randomly pick a document from the in-memory Icing for monkey testing.
+ //
+ // p_alive: chance of getting an alive document.
+ // p_all: chance of getting a document that has ever been "Put" before,
+ // including already "Delete"d documents.
+ // p_other: chance of getting a random namespace + uri that has never been
+ // "Put" before.
+ //
+ // p_alive, p_all, and p_other are required to be non-negative and sum to 1.
+ // Otherwise, they will be normalized to ensure this.
+ //
+ // Returns an instance of PickDocumentResult.
+ PickDocumentResult RandomPickDocument(float p_alive, float p_all,
+ float p_other) const;
+
+ // Puts the document into the in-memory Icing. If the (namespace, uri) pair
+ // already exists, the old document will be overwritten.
+ void Put(const MonkeyTokenizedDocument &document);
+
+ std::unordered_set<std::string> GetAllNamespaces() const;
+
+ // Deletes the Document specified by the given (namespace, uri) pair.
+ //
+ // Returns:
+ // OK on success
+ // NOT_FOUND if no document exists with namespace, uri
+ libtextclassifier3::Status Delete(const std::string &name_space,
+ const std::string &uri);
+
+ // Deletes all Documents belonging to the specified namespace.
+ //
+ // Returns:
+ // The number of deleted documents on success
+ // INTERNAL_ERROR if there are inconsistencies in the in-memory Icing
+ libtextclassifier3::StatusOr<uint32_t> DeleteByNamespace(
+ const std::string &name_space);
+
+ // Deletes all Documents belonging to the specified type.
+ //
+ // Returns:
+ // The number of deleted documents on success
+ // INTERNAL_ERROR if there are inconsistencies in the in-memory Icing
+ libtextclassifier3::StatusOr<uint32_t> DeleteBySchemaType(
+ const std::string &schema_type);
+
+ // Deletes all Documents that match the query specified in search_spec.
+ // Currently, only the "query" and "term_match_type" fields are recognized by
+ // the in-memory Icing, and only single term queries with possible section
+ // restrictions are supported.
+ //
+ // Returns:
+ // The number of deleted documents on success
+ // INTERNAL_ERROR if there are inconsistencies in the in-memory Icing
+ libtextclassifier3::StatusOr<uint32_t> DeleteByQuery(
+ const SearchSpecProto &search_spec);
+
+ // Retrieves documents according to search_spec.
+ // Currently, only the "query" and "term_match_type" fields are recognized by
+ // the in-memory Icing, and only single term queries with possible section
+ // restrictions are supported.
+ libtextclassifier3::StatusOr<std::vector<DocumentProto>> Search(
+ const SearchSpecProto &search_spec) const;
+
+ private:
+ // Does not own.
+ MonkeyTestRandomEngine *random_;
+
+ std::vector<MonkeyTokenizedDocument> documents_;
+ std::vector<DocumentId> existing_doc_ids_;
+ // A map from namespaces to uris and then from uris to internal document ids,
+ // which is used for fast lookups.
+ std::unordered_map<std::string, std::unordered_map<std::string, DocumentId>>
+ namespace_uri_docid_map_;
+
+ std::unique_ptr<SchemaProto> schema_;
+ // A map that maps from (schema_type, property_name) to the corresponding
+ // PropertyConfigProto.
+ std::unordered_map<
+ std::string, std::unordered_map<std::string, const PropertyConfigProto &>>
+ property_config_map_;
+
+ // Finds and returns the internal document id for the document identified by
+ // the given key (namespace, uri)
+ //
+ // Returns:
+ // The document id found on success
+ // NOT_FOUND if the key doesn't exist or doc has been deleted
+ libtextclassifier3::StatusOr<DocumentId> InternalGet(
+ const std::string &name_space, const std::string &uri) const;
+
+ // A helper method for DeleteByQuery and Search to get matched internal doc
+ // ids.
+ libtextclassifier3::StatusOr<std::vector<DocumentId>> InternalSearch(
+ const SearchSpecProto &search_spec) const;
+
+ libtextclassifier3::StatusOr<const PropertyConfigProto *> GetPropertyConfig(
+ const std::string &schema_type, const std::string &property_name) const;
+
+ libtextclassifier3::StatusOr<TermMatchType::Code> GetTermMatchType(
+ const std::string &schema_type,
+ const MonkeyTokenizedSection &section) const;
+
+ libtextclassifier3::StatusOr<bool> DoesDocumentMatchQuery(
+ const MonkeyTokenizedDocument &document, const std::string &query,
+ TermMatchType::Code term_match_type) const;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_MONKEY_TEST_IN_MEMORY_ICING_SEARCH_ENGINE_H_
diff --git a/icing/monkey_test/monkey-test-common-words.h b/icing/monkey_test/monkey-test-common-words.h
new file mode 100644
index 0000000..f0ed08a
--- /dev/null
+++ b/icing/monkey_test/monkey-test-common-words.h
@@ -0,0 +1,284 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_MONKEY_TEST_MONKEY_TEST_COMMON_WORDS_H_
+#define ICING_MONKEY_TEST_MONKEY_TEST_COMMON_WORDS_H_
+
+#include <array>
+#include <string_view>
+
+namespace icing {
+namespace lib {
+
+// A bag of words in English for creating random documents. Only words that are
+// at least 3 letters long are included (that's kPrefixLength) so that prefix
+// queries are easily formed from any word in a random document.
+// Data source:
+// https://chromium.googlesource.com/chromium/src/+/HEAD/components/url_formatter/spoof_checks/common_words/data/
+static constexpr std::array<std::string_view, 1000> kCommonWords = {
+ "the", "and", "for", "that",
+ "this", "with", "you", "not",
+ "are", "from", "your", "all",
+ "have", "new", "more", "was",
+ "will", "home", "can", "about",
+ "page", "has", "search", "free",
+ "but", "our", "one", "other",
+ "information", "time", "they", "site",
+ "may", "what", "which", "their",
+ "news", "out", "use", "any",
+ "there", "see", "only", "his",
+ "when", "contact", "here", "business",
+ "who", "web", "also", "now",
+ "help", "get", "view", "online",
+ "first", "been", "would", "how",
+ "were", "services", "some", "these",
+ "click", "its", "like", "service",
+ "than", "find", "price", "date",
+ "back", "top", "people", "had",
+ "list", "name", "just", "over",
+ "state", "year", "day", "into",
+ "email", "two", "health", "world",
+ "next", "used", "work", "last",
+ "most", "products", "music", "buy",
+ "data", "make", "them", "should",
+ "product", "system", "post", "her",
+ "city", "add", "policy", "number",
+ "such", "please", "available", "copyright",
+ "support", "message", "after", "best",
+ "software", "then", "jan", "good",
+ "video", "well", "where", "info",
+ "rights", "public", "books", "high",
+ "school", "through", "each", "links",
+ "she", "review", "years", "order",
+ "very", "privacy", "book", "items",
+ "company", "read", "group", "sex",
+ "need", "many", "user", "said",
+ "does", "set", "under", "general",
+ "research", "university", "january", "mail",
+ "full", "map", "reviews", "program",
+ "life", "know", "games", "way",
+ "days", "management", "part", "could",
+ "great", "united", "hotel", "real",
+ "item", "international", "center", "must",
+ "store", "travel", "comments", "made",
+ "development", "report", "off", "member",
+ "details", "line", "terms", "before",
+ "hotels", "did", "send", "right",
+ "type", "because", "local", "those",
+ "using", "results", "office", "education",
+ "national", "car", "design", "take",
+ "posted", "internet", "address", "community",
+ "within", "states", "area", "want",
+ "phone", "dvd", "shipping", "reserved",
+ "subject", "between", "forum", "family",
+ "long", "based", "code", "show",
+ "even", "black", "check", "special",
+ "prices", "website", "index", "being",
+ "women", "much", "sign", "file",
+ "link", "open", "today", "technology",
+ "south", "case", "project", "same",
+ "pages", "version", "section", "own",
+ "found", "sports", "house", "related",
+ "security", "both", "county", "american",
+ "photo", "game", "members", "power",
+ "while", "care", "network", "down",
+ "computer", "systems", "three", "total",
+ "place", "end", "following", "download",
+ "him", "without", "per", "access",
+ "think", "north", "resources", "current",
+ "posts", "big", "media", "law",
+ "control", "water", "history", "pictures",
+ "size", "art", "personal", "since",
+ "including", "guide", "shop", "directory",
+ "board", "location", "change", "white",
+ "text", "small", "rating", "rate",
+ "government", "children", "during", "usa",
+ "return", "students", "shopping", "account",
+ "times", "sites", "level", "digital",
+ "profile", "previous", "form", "events",
+ "love", "old", "john", "main",
+ "call", "hours", "image", "department",
+ "title", "description", "non", "insurance",
+ "another", "why", "shall", "property",
+ "class", "still", "money", "quality",
+ "every", "listing", "content", "country",
+ "private", "little", "visit", "save",
+ "tools", "low", "reply", "customer",
+ "december", "compare", "movies", "include",
+ "college", "value", "article", "york",
+ "man", "card", "jobs", "provide",
+ "food", "source", "author", "different",
+ "press", "learn", "sale", "around",
+ "print", "course", "job", "canada",
+ "process", "teen", "room", "stock",
+ "training", "too", "credit", "point",
+ "join", "science", "men", "categories",
+ "advanced", "west", "sales", "look",
+ "english", "left", "team", "estate",
+ "box", "conditions", "select", "windows",
+ "photos", "gay", "thread", "week",
+ "category", "note", "live", "large",
+ "gallery", "table", "register", "however",
+ "june", "october", "november", "market",
+ "library", "really", "action", "start",
+ "series", "model", "features", "air",
+ "industry", "plan", "human", "provided",
+ "yes", "required", "second", "hot",
+ "accessories", "cost", "movie", "forums",
+ "march", "september", "better", "say",
+ "questions", "july", "going", "medical",
+ "test", "friend", "come", "dec",
+ "server", "study", "application", "cart",
+ "staff", "articles", "san", "feedback",
+ "again", "play", "looking", "issues",
+ "april", "never", "users", "complete",
+ "street", "topic", "comment", "financial",
+ "things", "working", "against", "standard",
+ "tax", "person", "below", "mobile",
+ "less", "got", "blog", "party",
+ "payment", "equipment", "login", "student",
+ "let", "programs", "offers", "legal",
+ "above", "recent", "park", "stores",
+ "side", "act", "problem", "red",
+ "give", "memory", "performance", "social",
+ "august", "quote", "language", "story",
+ "sell", "options", "experience", "rates",
+ "create", "key", "body", "young",
+ "america", "important", "field", "few",
+ "east", "paper", "single", "age",
+ "activities", "club", "example", "girls",
+ "additional", "password", "latest", "something",
+ "road", "gift", "question", "changes",
+ "night", "hard", "texas", "oct",
+ "pay", "four", "poker", "status",
+ "browse", "issue", "range", "building",
+ "seller", "court", "february", "always",
+ "result", "audio", "light", "write",
+ "war", "nov", "offer", "blue",
+ "groups", "easy", "given", "files",
+ "event", "release", "analysis", "request",
+ "fax", "china", "making", "picture",
+ "needs", "possible", "might", "professional",
+ "yet", "month", "major", "star",
+ "areas", "future", "space", "committee",
+ "hand", "sun", "cards", "problems",
+ "london", "washington", "meeting", "rss",
+ "become", "interest", "child", "keep",
+ "enter", "california", "porn", "share",
+ "similar", "garden", "schools", "million",
+ "added", "reference", "companies", "listed",
+ "baby", "learning", "energy", "run",
+ "delivery", "net", "popular", "term",
+ "film", "stories", "put", "computers",
+ "journal", "reports", "try", "welcome",
+ "central", "images", "president", "notice",
+ "god", "original", "head", "radio",
+ "until", "cell", "color", "self",
+ "council", "away", "includes", "track",
+ "australia", "discussion", "archive", "once",
+ "others", "entertainment", "agreement", "format",
+ "least", "society", "months", "log",
+ "safety", "friends", "sure", "faq",
+ "trade", "edition", "cars", "messages",
+ "marketing", "tell", "further", "updated",
+ "association", "able", "having", "provides",
+ "david", "fun", "already", "green",
+ "studies", "close", "common", "drive",
+ "specific", "several", "gold", "feb",
+ "living", "sep", "collection", "called",
+ "short", "arts", "lot", "ask",
+ "display", "limited", "powered", "solutions",
+ "means", "director", "daily", "beach",
+ "past", "natural", "whether", "due",
+ "electronics", "five", "upon", "period",
+ "planning", "database", "says", "official",
+ "weather", "mar", "land", "average",
+ "done", "technical", "window", "france",
+ "pro", "region", "island", "record",
+ "direct", "conference", "environment", "records",
+ "district", "calendar", "costs", "style",
+ "url", "front", "statement", "update",
+ "parts", "aug", "ever", "downloads",
+ "early", "miles", "sound", "resource",
+ "present", "applications", "either", "ago",
+ "document", "word", "works", "material",
+ "bill", "apr", "written", "talk",
+ "federal", "hosting", "rules", "final",
+ "adult", "tickets", "thing", "centre",
+ "requirements", "via", "cheap", "nude",
+ "kids", "finance", "true", "minutes",
+ "else", "mark", "third", "rock",
+ "gifts", "europe", "reading", "topics",
+ "bad", "individual", "tips", "plus",
+ "auto", "cover", "usually", "edit",
+ "together", "videos", "percent", "fast",
+ "function", "fact", "unit", "getting",
+ "global", "tech", "meet", "far",
+ "economic", "player", "projects", "lyrics",
+ "often", "subscribe", "submit", "germany",
+ "amount", "watch", "included", "feel",
+ "though", "bank", "risk", "thanks",
+ "everything", "deals", "various", "words",
+ "linux", "jul", "production", "commercial",
+ "james", "weight", "town", "heart",
+ "advertising", "received", "choose", "treatment",
+ "newsletter", "archives", "points", "knowledge",
+ "magazine", "error", "camera", "jun",
+ "girl", "currently", "construction", "toys",
+ "registered", "clear", "golf", "receive",
+ "domain", "methods", "chapter", "makes",
+ "protection", "policies", "loan", "wide",
+ "beauty", "manager", "india", "position",
+ "taken", "sort", "listings", "models",
+ "michael", "known", "half", "cases",
+ "step", "engineering", "florida", "simple",
+ "quick", "none", "wireless", "license",
+ "paul", "friday", "lake", "whole",
+ "annual", "published", "later", "basic",
+ "shows", "corporate", "church", "method",
+ "purchase", "customers", "active", "response",
+ "practice", "hardware", "figure", "materials",
+ "fire", "holiday", "chat", "enough",
+ "designed", "along", "among", "death",
+ "writing", "speed", "html", "countries",
+ "loss", "face", "brand", "discount",
+ "higher", "effects", "created", "remember",
+ "standards", "oil", "bit", "yellow",
+ "political", "increase", "advertise", "kingdom",
+ "base", "near", "environmental", "thought",
+ "stuff", "french", "storage", "japan",
+ "doing", "loans", "shoes", "entry",
+ "stay", "nature", "orders", "availability",
+ "africa", "summary", "turn", "mean",
+ "growth", "notes", "agency", "king",
+ "monday", "european", "activity", "copy",
+ "although", "drug", "pics", "western",
+ "income", "force", "cash", "employment",
+ "overall", "bay", "river", "commission",
+ "package", "contents", "seen", "players",
+ "engine", "port", "album", "regional",
+ "stop", "supplies", "started", "administration",
+ "bar", "institute", "views", "plans",
+ "double", "dog", "build", "screen",
+ "exchange", "types", "soon", "sponsored",
+ "lines", "electronic", "continue", "across",
+ "benefits", "needed", "season", "apply",
+ "someone", "held", "anything", "printer",
+ "condition", "effective", "believe", "organization",
+ "effect", "asked", "eur", "mind"};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_MONKEY_TEST_MONKEY_TEST_COMMON_WORDS_H_
diff --git a/icing/monkey_test/monkey-test-generators.cc b/icing/monkey_test/monkey-test-generators.cc
new file mode 100644
index 0000000..0d5ad73
--- /dev/null
+++ b/icing/monkey_test/monkey-test-generators.cc
@@ -0,0 +1,346 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/monkey_test/monkey-test-generators.h"
+
+#include <array>
+#include <cstdint>
+#include <random>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "icing/absl_ports/str_cat.h"
+#include "icing/absl_ports/str_join.h"
+#include "icing/document-builder.h"
+#include "icing/monkey_test/monkey-test-util.h"
+#include "icing/monkey_test/monkey-tokenized-document.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+constexpr std::array<PropertyConfigProto::Cardinality::Code, 3> kCardinalities =
+ {PropertyConfigProto::Cardinality::REPEATED,
+ PropertyConfigProto::Cardinality::OPTIONAL,
+ PropertyConfigProto::Cardinality::REQUIRED};
+
+constexpr std::array<TermMatchType::Code, 3> kTermMatchTypes = {
+ TermMatchType::UNKNOWN, TermMatchType::EXACT_ONLY, TermMatchType::PREFIX};
+
+PropertyConfigProto::Cardinality::Code GetRandomCardinality(
+ MonkeyTestRandomEngine* random) {
+ std::uniform_int_distribution<> dist(0, kCardinalities.size() - 1);
+ return kCardinalities[dist(*random)];
+}
+
+TermMatchType::Code GetRandomTermMatchType(MonkeyTestRandomEngine* random) {
+ std::uniform_int_distribution<> dist(0, kTermMatchTypes.size() - 1);
+ return kTermMatchTypes[dist(*random)];
+}
+
+// TODO: Update this function when supporting document_indexing_config.
+bool IsIndexableProperty(const PropertyConfigProto& property) {
+ return property.string_indexing_config().term_match_type() !=
+ TermMatchType::UNKNOWN;
+}
+
+void SetStringIndexingConfig(PropertyConfigProto& property,
+ TermMatchType::Code term_match_type) {
+ if (term_match_type != TermMatchType::UNKNOWN) {
+ StringIndexingConfig* string_indexing_config =
+ property.mutable_string_indexing_config();
+ string_indexing_config->set_term_match_type(term_match_type);
+ // TODO: Try to add different TokenizerTypes. VERBATIM, RFC822, and URL are
+ // the remaining candidates to consider.
+ string_indexing_config->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
+ } else {
+ property.clear_string_indexing_config();
+ }
+}
+
+} // namespace
+
+SchemaProto MonkeySchemaGenerator::GenerateSchema() {
+ SchemaProto schema;
+ for (int i = 0; i < config_->num_types; ++i) {
+ *schema.add_types() = GenerateType();
+ }
+ return schema;
+}
+
+MonkeySchemaGenerator::UpdateSchemaResult MonkeySchemaGenerator::UpdateSchema(
+ const SchemaProto& schema) {
+  // `schema` is a const reference, so this initialization necessarily copies
+  // it; the copy is then mutated in place below.
+  UpdateSchemaResult result = {schema};
+ SchemaProto& new_schema = result.schema;
+
+ // Delete up to 2 existing types.
+ std::uniform_int_distribution<> num_types_to_delete_dist(0, 2);
+ for (int num_types_to_delete = num_types_to_delete_dist(*random_);
+       num_types_to_delete > 0; --num_types_to_delete) {
+ if (new_schema.types_size() > 0) {
+ std::uniform_int_distribution<> dist(0, new_schema.types_size() - 1);
+ int index_to_delete = dist(*random_);
+ result.schema_types_deleted.insert(
+ new_schema.types(index_to_delete).schema_type());
+ new_schema.mutable_types()->SwapElements(index_to_delete,
+ new_schema.types_size() - 1);
+ new_schema.mutable_types()->RemoveLast();
+ }
+ }
+
+  // Update about 1/3 of the existing types.
+ for (int i = 0; i < new_schema.types_size(); ++i) {
+ std::uniform_int_distribution<> dist(0, 2);
+ if (dist(*random_) == 0) {
+ UpdateType(*new_schema.mutable_types(i), result);
+ }
+ }
+
+ // Add up to 2 new types.
+ std::uniform_int_distribution<> num_types_to_add_dist(0, 2);
+ for (int num_types_to_add = num_types_to_add_dist(*random_);
+       num_types_to_add > 0; --num_types_to_add) {
+ *new_schema.add_types() = GenerateType();
+ }
+
+ return result;
+}
+
+PropertyConfigProto MonkeySchemaGenerator::GenerateProperty(
+ const SchemaTypeConfigProto& type_config,
+ PropertyConfigProto::Cardinality::Code cardinality,
+ TermMatchType::Code term_match_type) {
+ PropertyConfigProto prop;
+ prop.set_property_name(
+ "MonkeyTestProp" +
+ std::to_string(num_properties_generated_[type_config.schema_type()]++));
+ // TODO: Perhaps in future iterations we will want to generate more than just
+ // string properties.
+ prop.set_data_type(PropertyConfigProto::DataType::STRING);
+ prop.set_cardinality(cardinality);
+ SetStringIndexingConfig(prop, term_match_type);
+ return prop;
+}
+
+void MonkeySchemaGenerator::UpdateProperty(
+ const SchemaTypeConfigProto& type_config, PropertyConfigProto& property,
+ UpdateSchemaResult& result) {
+ PropertyConfigProto::Cardinality::Code new_cardinality =
+ GetRandomCardinality(random_);
+ if (new_cardinality != property.cardinality()) {
+    // Only do compatible cardinality updates for now; otherwise it would be
+    // hard to track which documents will be invalid after updating the schema.
+ //
+ // The following type of updates are not allowed:
+ // - optional -> required
+ // - repeated -> optional
+ // - repeated -> required
+ if (property.cardinality() == PropertyConfigProto::Cardinality::OPTIONAL &&
+ new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) {
+ return;
+ }
+ if (property.cardinality() == PropertyConfigProto::Cardinality::REPEATED &&
+ (new_cardinality == PropertyConfigProto::Cardinality::OPTIONAL ||
+ new_cardinality == PropertyConfigProto::Cardinality::REQUIRED)) {
+ return;
+ }
+ property.set_cardinality(new_cardinality);
+ }
+
+ if (property.data_type() == PropertyConfigProto::DataType::STRING) {
+ TermMatchType::Code new_term_match_type = GetRandomTermMatchType(random_);
+ if (new_term_match_type !=
+ property.string_indexing_config().term_match_type()) {
+ SetStringIndexingConfig(property, new_term_match_type);
+ result.schema_types_index_incompatible.insert(type_config.schema_type());
+ }
+ }
+}
+
+SchemaTypeConfigProto MonkeySchemaGenerator::GenerateType() {
+ SchemaTypeConfigProto type_config;
+ type_config.set_schema_type("MonkeyTestType" +
+ std::to_string(num_types_generated_++));
+ std::uniform_int_distribution<> possible_num_properties_dist(
+ 0, config_->possible_num_properties.size() - 1);
+ int total_num_properties =
+ config_->possible_num_properties[possible_num_properties_dist(*random_)];
+
+ int num_indexed_properties = 0;
+ for (int i = 0; i < total_num_properties; ++i) {
+ TermMatchType::Code term_match_type = TermMatchType::UNKNOWN;
+ if (num_indexed_properties < kTotalNumSections) {
+ term_match_type = GetRandomTermMatchType(random_);
+ }
+ if (term_match_type != TermMatchType::UNKNOWN) {
+ num_indexed_properties += 1;
+ }
+ (*type_config.add_properties()) = GenerateProperty(
+ type_config, GetRandomCardinality(random_), term_match_type);
+ }
+ return type_config;
+}
+
+void MonkeySchemaGenerator::UpdateType(SchemaTypeConfigProto& type_config,
+ UpdateSchemaResult& result) {
+  // Delete up to 4 existing properties.
+  std::uniform_int_distribution<> num_properties_to_delete_dist(0, 4);
+  for (int num_properties_to_delete = num_properties_to_delete_dist(*random_);
+       num_properties_to_delete > 0; --num_properties_to_delete) {
+ if (type_config.properties_size() > 0) {
+ std::uniform_int_distribution<> dist(0,
+ type_config.properties_size() - 1);
+ int index_to_delete = dist(*random_);
+      // Only delete required properties for now; otherwise it would be hard
+      // to track which documents will be invalid after updating the schema.
+ if (type_config.properties(index_to_delete).cardinality() !=
+ PropertyConfigProto::Cardinality::REQUIRED) {
+ continue;
+ }
+ if (IsIndexableProperty(type_config.properties(index_to_delete))) {
+ result.schema_types_index_incompatible.insert(
+ type_config.schema_type());
+ }
+      // Removing a property causes the type to be considered incompatible.
+ result.schema_types_incompatible.insert(type_config.schema_type());
+
+ type_config.mutable_properties()->SwapElements(
+ index_to_delete, type_config.properties_size() - 1);
+ type_config.mutable_properties()->RemoveLast();
+ }
+ }
+
+  // Update about 1/3 of the existing properties.
+ for (int i = 0; i < type_config.properties_size(); ++i) {
+ std::uniform_int_distribution<> dist(0, 2);
+ if (dist(*random_) == 0) {
+ UpdateProperty(type_config, *type_config.mutable_properties(i), result);
+ }
+ }
+
+  // Add up to 4 new properties.
+  std::uniform_int_distribution<> num_properties_to_add_dist(0, 4);
+  for (int num_properties_to_add = num_properties_to_add_dist(*random_);
+       num_properties_to_add > 0; --num_properties_to_add) {
+ PropertyConfigProto::Cardinality::Code new_cardinality =
+ GetRandomCardinality(random_);
+    // Adding a required property will make all documents of this type invalid.
+ if (new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) {
+ result.schema_types_incompatible.insert(type_config.schema_type());
+ }
+ PropertyConfigProto new_property = GenerateProperty(
+ type_config, new_cardinality, GetRandomTermMatchType(random_));
+ if (IsIndexableProperty(new_property)) {
+ result.schema_types_index_incompatible.insert(type_config.schema_type());
+ }
+ (*type_config.add_properties()) = std::move(new_property);
+ }
+
+ int num_indexed_properties = 0;
+ for (int i = 0; i < type_config.properties_size(); ++i) {
+ if (IsIndexableProperty(type_config.properties(i))) {
+ ++num_indexed_properties;
+ }
+ }
+
+ if (num_indexed_properties > kTotalNumSections) {
+ result.is_invalid_schema = true;
+ }
+}
+
+std::string MonkeyDocumentGenerator::GetNamespace() const {
+ uint32_t name_space;
+ // When num_namespaces is 0, all documents generated get different namespaces.
+ // Otherwise, namespaces will be randomly picked from a set with
+ // num_namespaces elements.
+ if (config_->num_namespaces == 0) {
+ name_space = num_docs_generated_;
+ } else {
+ std::uniform_int_distribution<> dist(0, config_->num_namespaces - 1);
+ name_space = dist(*random_);
+ }
+ return absl_ports::StrCat("namespace", std::to_string(name_space));
+}
+
+std::string MonkeyDocumentGenerator::GetUri() const {
+ uint32_t uri;
+ // When num_uris is 0, all documents generated get different URIs. Otherwise,
+ // URIs will be randomly picked from a set with num_uris elements.
+ if (config_->num_uris == 0) {
+ uri = num_docs_generated_;
+ } else {
+ std::uniform_int_distribution<> dist(0, config_->num_uris - 1);
+ uri = dist(*random_);
+ }
+ return absl_ports::StrCat("uri", std::to_string(uri));
+}
+
+int MonkeyDocumentGenerator::GetNumTokens() const {
+ std::uniform_int_distribution<> dist(
+ 0, config_->possible_num_tokens_.size() - 1);
+ int n = config_->possible_num_tokens_[dist(*random_)];
+  // Add some noise: scale by a random factor in [0.5, 1).
+ std::uniform_real_distribution<> real_dist(0.5, 1);
+ float p = real_dist(*random_);
+ return n * p;
+}
+
+std::vector<std::string> MonkeyDocumentGenerator::GetPropertyContent() const {
+ std::vector<std::string> content;
+ int num_tokens = GetNumTokens();
+ while (num_tokens) {
+ content.push_back(std::string(GetToken()));
+ --num_tokens;
+ }
+ return content;
+}
+
+MonkeyTokenizedDocument MonkeyDocumentGenerator::GenerateDocument() {
+ MonkeyTokenizedDocument document;
+ const SchemaTypeConfigProto& type_config = GetType();
+ const std::string& name_space = GetNamespace();
+ DocumentBuilder doc_builder =
+ DocumentBuilder()
+ .SetNamespace(name_space)
+ .SetSchema(type_config.schema_type())
+ .SetUri(GetUri())
+ .SetCreationTimestampMs(clock_.GetSystemTimeMilliseconds());
+ for (const PropertyConfigProto& prop : type_config.properties()) {
+ std::vector<std::string> prop_content = GetPropertyContent();
+ doc_builder.AddStringProperty(prop.property_name(),
+ absl_ports::StrJoin(prop_content, " "));
+    // Whether or not the property is currently indexable, we have to create a
+    // section for it, since a non-indexable property can become indexable
+    // after a schema type change. The in-memory Icing automatically skips
+    // sections that are non-indexable at the time of a search request.
+ MonkeyTokenizedSection section = {prop.property_name(),
+ std::move(prop_content)};
+ document.tokenized_sections.push_back(std::move(section));
+ }
+ document.document = doc_builder.Build();
+ ++num_docs_generated_;
+ return document;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/monkey_test/monkey-test-generators.h b/icing/monkey_test/monkey-test-generators.h
new file mode 100644
index 0000000..72a4723
--- /dev/null
+++ b/icing/monkey_test/monkey-test-generators.h
@@ -0,0 +1,127 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_MONKEY_TEST_MONKEY_TEST_GENERATORS_H_
+#define ICING_MONKEY_TEST_MONKEY_TEST_GENERATORS_H_
+
+#include <cstdint>
+#include <random>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "icing/monkey_test/monkey-test-common-words.h"
+#include "icing/monkey_test/monkey-test-util.h"
+#include "icing/monkey_test/monkey-tokenized-document.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/util/clock.h"
+
+namespace icing {
+namespace lib {
+
+// A random schema generator used for monkey testing.
+class MonkeySchemaGenerator {
+ public:
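+  // The result of applying UpdateSchema: the updated schema itself; whether
+  // the update produced an invalid schema (e.g. more indexable properties
+  // than there are sections); and the sets of schema types that were deleted,
+  // made incompatible with pre-existing documents, or made index-incompatible.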
+ struct UpdateSchemaResult {
+ SchemaProto schema;
+ bool is_invalid_schema;
+ std::unordered_set<std::string> schema_types_deleted;
+ std::unordered_set<std::string> schema_types_incompatible;
+ std::unordered_set<std::string> schema_types_index_incompatible;
+ };
+
+ explicit MonkeySchemaGenerator(
+ MonkeyTestRandomEngine* random,
+ const IcingMonkeyTestRunnerConfiguration* config)
+ : random_(random), config_(config) {}
+
+ SchemaProto GenerateSchema();
+
+ UpdateSchemaResult UpdateSchema(const SchemaProto& schema);
+
+ private:
+ PropertyConfigProto GenerateProperty(
+ const SchemaTypeConfigProto& type_config,
+ PropertyConfigProto::Cardinality::Code cardinality,
+ TermMatchType::Code term_match_type);
+
+ void UpdateProperty(const SchemaTypeConfigProto& type_config,
+ PropertyConfigProto& property,
+ UpdateSchemaResult& result);
+
+ SchemaTypeConfigProto GenerateType();
+
+ void UpdateType(SchemaTypeConfigProto& type_config,
+ UpdateSchemaResult& result);
+
+ int num_types_generated_ = 0;
+ // A map from type name to the number of properties generated in the
+ // corresponding types.
+ std::unordered_map<std::string, int> num_properties_generated_;
+
+ MonkeyTestRandomEngine* random_; // Does not own.
+ const IcingMonkeyTestRunnerConfiguration* config_; // Does not own.
+};
+
+// A random document generator used for monkey testing.
+// When num_uris is 0, all documents generated get different URIs. Otherwise,
+// URIs will be randomly picked from a set with num_uris elements.
+// Same for num_namespaces.
+class MonkeyDocumentGenerator {
+ public:
+ explicit MonkeyDocumentGenerator(
+ MonkeyTestRandomEngine* random, const SchemaProto* schema,
+ const IcingMonkeyTestRunnerConfiguration* config)
+ : random_(random), schema_(schema), config_(config) {}
+
+ const SchemaTypeConfigProto& GetType() const {
+ std::uniform_int_distribution<> dist(0, schema_->types_size() - 1);
+ return schema_->types(dist(*random_));
+ }
+
+ std::string_view GetToken() const {
+ // TODO: Instead of randomly picking tokens from the language set
+ // kCommonWords, we can make some words more common than others to simulate
+ // term frequencies in the real world. This can help us get extremely large
+ // posting lists.
+ std::uniform_int_distribution<> dist(0, kCommonWords.size() - 1);
+ return kCommonWords[dist(*random_)];
+ }
+
+ std::string GetNamespace() const;
+
+ std::string GetUri() const;
+
+ int GetNumTokens() const;
+
+ std::vector<std::string> GetPropertyContent() const;
+
+ MonkeyTokenizedDocument GenerateDocument();
+
+ private:
+ MonkeyTestRandomEngine* random_; // Does not own.
+ const SchemaProto* schema_; // Does not own.
+ const IcingMonkeyTestRunnerConfiguration* config_; // Does not own.
+
+ uint32_t num_docs_generated_ = 0;
+ Clock clock_;
+};
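+
+// A minimal sketch (illustrative, not part of the API) of how the two
+// generators above are typically wired together:
+//
+//   IcingMonkeyTestRunnerConfiguration config = ...;  // see monkey-test-util.h
+//   MonkeyTestRandomEngine random(config.seed);
+//   MonkeySchemaGenerator schema_generator(&random, &config);
+//   SchemaProto schema = schema_generator.GenerateSchema();
+//   MonkeyDocumentGenerator doc_generator(&random, &schema, &config);
+//   MonkeyTokenizedDocument doc = doc_generator.GenerateDocument();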
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_MONKEY_TEST_MONKEY_TEST_GENERATORS_H_
diff --git a/icing/monkey_test/monkey-test-util.h b/icing/monkey_test/monkey-test-util.h
new file mode 100644
index 0000000..d6053d8
--- /dev/null
+++ b/icing/monkey_test/monkey-test-util.h
@@ -0,0 +1,68 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_MONKEY_TEST_MONKEY_TEST_UTIL_H_
+#define ICING_MONKEY_TEST_MONKEY_TEST_UTIL_H_
+
+#include <cstdint>
+#include <functional>
+#include <random>
+#include <utility>
+#include <vector>
+
+namespace icing {
+namespace lib {
+
+using MonkeyTestRandomEngine = std::mt19937;
+
+class IcingMonkeyTestRunner;
+
+struct IcingMonkeyTestRunnerConfiguration {
+ explicit IcingMonkeyTestRunnerConfiguration(uint32_t seed, int num_types,
+ int num_namespaces, int num_uris,
+ int index_merge_size)
+ : seed(seed),
+ num_types(num_types),
+ num_namespaces(num_namespaces),
+ num_uris(num_uris),
+ index_merge_size(index_merge_size) {}
+
+ uint32_t seed;
+ int num_types;
+ int num_namespaces;
+ int num_uris;
+ int index_merge_size;
+
+  // To keep tight control over the quality of the generated schema, the
+  // number of properties for each type is randomly picked from this list
+  // instead of from a range. For example, a vector of [1, 2, 3, 4] means each
+  // generated type has a 25% chance of getting 1, 2, 3, or 4 properties.
+ std::vector<int> possible_num_properties;
+
+ // The possible number of tokens that may appear in generated documents, with
+ // a noise factor from 0.5 to 1 applied.
+ std::vector<int> possible_num_tokens_;
+
+ // An array of pairs of monkey test APIs with frequencies.
+ // If f_sum is the sum of all the frequencies, an operation with frequency f
+ // means for every f_sum iterations, the operation is expected to run f times.
+ std::vector<std::pair<std::function<void(IcingMonkeyTestRunner*)>, uint32_t>>
+ monkey_api_schedules;
+};
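+
+// A sketch of filling in a configuration (the values below are illustrative
+// only; real tests choose their own):
+//
+//   IcingMonkeyTestRunnerConfiguration config(
+//       /*seed=*/42, /*num_types=*/30, /*num_namespaces=*/100,
+//       /*num_uris=*/1000, /*index_merge_size=*/1 << 20);
+//   config.possible_num_properties = {1, 2, 3, 4};
+//   config.possible_num_tokens_ = {10, 50, 100};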
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_MONKEY_TEST_MONKEY_TEST_UTIL_H_
diff --git a/icing/monkey_test/monkey-tokenized-document.h b/icing/monkey_test/monkey-tokenized-document.h
new file mode 100644
index 0000000..87b77bb
--- /dev/null
+++ b/icing/monkey_test/monkey-tokenized-document.h
@@ -0,0 +1,38 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_MONKEY_TEST_MONKEY_TOKENIZED_DOCUMENT_H_
+#define ICING_MONKEY_TEST_MONKEY_TOKENIZED_DOCUMENT_H_
+
+#include <string>
+#include <vector>
+
+#include "icing/proto/document.pb.h"
+
+namespace icing {
+namespace lib {
+
+struct MonkeyTokenizedSection {
+ std::string path;
+ std::vector<std::string> token_sequence;
+};
+
+struct MonkeyTokenizedDocument {
+ DocumentProto document;
+ std::vector<MonkeyTokenizedSection> tokenized_sections;
+};
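+
+// For example (illustrative), a document whose "body" property holds
+// "hello world" would carry
+//   MonkeyTokenizedSection{/*path=*/"body",
+//                          /*token_sequence=*/{"hello", "world"}}.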
+
+} // namespace lib
+} // namespace icing
+#endif // ICING_MONKEY_TEST_MONKEY_TOKENIZED_DOCUMENT_H_
diff --git a/icing/performance-configuration.cc b/icing/performance-configuration.cc
index aeaa449..1518381 100644
--- a/icing/performance-configuration.cc
+++ b/icing/performance-configuration.cc
@@ -14,7 +14,7 @@
#include "icing/performance-configuration.h"
-#include "icing/result/result-state.h"
+#include "icing/scoring/scored-document-hit.h"
namespace icing {
namespace lib {
@@ -38,54 +38,33 @@ namespace {
// rendering 2 frames.
//
// With the information above, we then try to choose default values for
-// query_length and num_to_score so that the overall time can comfortably fit
-// in with our goal.
+// query_length so that the overall time can comfortably fit in with our goal
+// (note that num_to_score will be decided by the client, which is specified in
+// ResultSpecProto).
// 1. Set query_length to 23000 so that any query can be executed by
// QueryProcessor within 15 ms on a Pixel 3 XL according to results of
// //icing/query:query-processor_benchmark.
-// 2. Set num_to_score to 30000 so that results can be scored and ranked within
-// 3 ms on a Pixel 3 XL according to results of
-// //icing/scoring:score-and-rank_benchmark.
//
// In the worst-case scenario, we still have [33 ms - 15 ms - 3 ms] = 15 ms left
// for all the other things like proto parsing, document fetching, and even
// Android Binder calls if Icing search engine runs in a separate process.
constexpr int kMaxQueryLength = 23000;
-constexpr int kDefaultNumToScore = 30000;
// New Android devices nowadays all allow more than 16 MB memory per app. Using
-// that as a guideline, we set 16 MB as the safe memory threshold.
+// that as a guideline and being more conservative, we set 4 MB as the safe
+// memory threshold.
// TODO(b/150029642): Android apps / framework have better understanding of how
// much memory is allowed, so it would be better to let clients pass in this
// value.
-constexpr int kSafeMemoryUsage = 16 * 1024 * 1024; // 16MB
+constexpr int kSafeMemoryUsage = 4 * 1024 * 1024; // 4MB
-// This number is not determined by benchmarks. We just assume that returning
-// the best 1000 scored document hits of a query is enough. To find the best
-// 1000 scored document hits from a heap, we need roughly 0.7 ms on a Pixel 3 XL
-// according to //icing/scoring:ranker_benchmark.
-constexpr int kMaxNumHitsPerQuery = 1000;
+// The maximum number of hits that can fit below the kSafeMemoryUsage threshold.
+constexpr int kMaxNumTotalHits = kSafeMemoryUsage / sizeof(ScoredDocumentHit);
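+// As a worked example: if sizeof(ScoredDocumentHit) were 16 bytes (a
+// hypothetical figure), this would allow 4 MB / 16 B = 262,144 cached hits.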
-// A rough estimation of the size of ResultState if it stores the maximum number
-// of scored document hits.
-constexpr int kMaxMemoryPerResult =
- sizeof(ResultState) + kMaxNumHitsPerQuery * sizeof(ScoredDocumentHit);
-
-// To be safer, we assume that all the Results contain the maximum number of
-// hits and only use half of the memory allowed.
-constexpr int kDefaultNumResultsToCache =
- kSafeMemoryUsage / 2 / kMaxMemoryPerResult;
-
-static_assert(
- kDefaultNumResultsToCache > 500,
- "Default number of results to cache has changed, please update and make "
- "sure it still meets our requirements.");
} // namespace
PerformanceConfiguration::PerformanceConfiguration()
- : PerformanceConfiguration(kMaxQueryLength, kDefaultNumToScore,
- kMaxNumHitsPerQuery, kDefaultNumResultsToCache) {
-}
+ : PerformanceConfiguration(kMaxQueryLength, kMaxNumTotalHits) {}
} // namespace lib
} // namespace icing
diff --git a/icing/performance-configuration.h b/icing/performance-configuration.h
index fa4050b..3ec67f3 100644
--- a/icing/performance-configuration.h
+++ b/icing/performance-configuration.h
@@ -23,29 +23,20 @@ struct PerformanceConfiguration {
// Loads default configuration.
PerformanceConfiguration();
- PerformanceConfiguration(int max_query_length_in, int num_to_score_in,
- int max_num_hits_per_query_in,
- int max_num_cache_results_in)
+  PerformanceConfiguration(int max_query_length_in, int max_num_total_hits_in)
: max_query_length(max_query_length_in),
- num_to_score(num_to_score_in),
- max_num_hits_per_query(max_num_hits_per_query_in),
- max_num_cache_results(max_num_cache_results_in) {}
+        max_num_total_hits(max_num_total_hits_in) {}
// Search performance
// Maximum length of query to execute in QueryProcessor.
int max_query_length;
- // Number of results to score in ScoringProcessor for every query.
- int num_to_score;
-
// Memory
- // Maximum number of ScoredDocumentHits to return per query.
- int max_num_hits_per_query;
-
- // Maximum number of ResultStates to store in ResultStateManager.
- int max_num_cache_results;
+ // Maximum number of ScoredDocumentHits to cache in the ResultStateManager at
+ // one time.
+ int max_num_total_hits;
};
// TODO(b/149040810): Consider creating a class to manage performance
diff --git a/icing/portable/endian.h b/icing/portable/endian.h
new file mode 100644
index 0000000..ecebb15
--- /dev/null
+++ b/icing/portable/endian.h
@@ -0,0 +1,208 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+// Utility functions that depend on bytesex. We define versions of htonll and
+// ntohll (HostToNetworkLL and NetworkToHostLL in our naming), as well as
+// "Google" versions of all the standards: ghtonl, ghtons, and so on
+// (GHostToNetworkL, GHostToNetworkS, etc. in our naming). These functions
+// behave exactly like their standard variants, but don't require including
+// the dangerous netinet/in.h.
+
+#ifndef ICING_PORTABLE_ENDIAN_H_
+#define ICING_PORTABLE_ENDIAN_H_
+
+#include <cstdint>
+
+// IS_LITTLE_ENDIAN, IS_BIG_ENDIAN
+#if defined OS_LINUX || defined OS_ANDROID || defined(__ANDROID__)
+// _BIG_ENDIAN
+#include <endian.h>
+
+#elif defined(__APPLE__)
+
+// BIG_ENDIAN
+#include <machine/endian.h> // NOLINT(build/include)
+
+/* Let's try and follow the Linux convention */
+#define __BYTE_ORDER BYTE_ORDER
+#define __LITTLE_ENDIAN LITTLE_ENDIAN
+#define __BIG_ENDIAN BIG_ENDIAN
+
+#endif // operating system
+
+// defines __BYTE_ORDER for MSVC
+#ifdef COMPILER_MSVC
+#define __BYTE_ORDER __LITTLE_ENDIAN
+#define IS_LITTLE_ENDIAN
+#else // COMPILER_MSVC
+
+// define the macros IS_LITTLE_ENDIAN or IS_BIG_ENDIAN
+// using the above endian definitions from endian.h if
+// endian.h was included
+#ifdef __BYTE_ORDER
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define IS_LITTLE_ENDIAN
+#endif // __BYTE_ORDER == __LITTLE_ENDIAN
+
+#if __BYTE_ORDER == __BIG_ENDIAN
+#define IS_BIG_ENDIAN
+#endif // __BYTE_ORDER == __BIG_ENDIAN
+
+#else // __BYTE_ORDER
+
+#if defined(__LITTLE_ENDIAN__)
+#define IS_LITTLE_ENDIAN
+#elif defined(__BIG_ENDIAN__)
+#define IS_BIG_ENDIAN
+#endif // __LITTLE_ENDIAN__ or __BIG_ENDIAN__
+
+#endif // __BYTE_ORDER
+#endif // COMPILER_MSVC
+
+// byte swap functions (bswap_16, bswap_32, bswap_64).
+// byte swap functions reverse the order of bytes, e.g.
+// byteswap of 102030405060708 = 807060504030201
+// byteswap of 1020304 = 4030201
+
+// The following guarantees declaration of the byte swap functions
+#ifdef COMPILER_MSVC
+#include <cstdlib> // NOLINT(build/include)
+
+#define bswap_16(x) _byteswap_ushort(x)
+#define bswap_32(x) _byteswap_ulong(x)
+#define bswap_64(x) _byteswap_uint64(x)
+
+#elif defined(__APPLE__)
+// Mac OS X / Darwin features
+#include <libkern/OSByteOrder.h>
+
+#define bswap_16(x) OSSwapInt16(x)
+#define bswap_32(x) OSSwapInt32(x)
+#define bswap_64(x) OSSwapInt64(x)
+
+#elif defined(__GLIBC__) || defined(__BIONIC__) || defined(__ASYLO__)
+#include <byteswap.h> // IWYU pragma: export
+
+#else // built-in byteswap functions
+
+static inline uint16_t bswap_16(uint16_t x) {
+#ifdef __cplusplus
+  return static_cast<uint16_t>(((x & 0xFF) << 8) | ((x & 0xFF00) >> 8));
+#else  // __cplusplus
+  return (uint16_t)(((x & 0xFF) << 8) | ((x & 0xFF00) >> 8));  // NOLINT
+#endif  // __cplusplus
+}
+#define bswap_16(x) bswap_16(x)
+static inline uint32_t bswap_32(uint32_t x) {
+  return (((x & 0xFF) << 24) | ((x & 0xFF00) << 8) | ((x & 0xFF0000) >> 8) |
+          ((x & 0xFF000000) >> 24));
+}
+#define bswap_32(x) bswap_32(x)
+static inline uint64_t bswap_64(uint64_t x) {
+ return (((x & (uint64_t)0xFF) << 56) | ((x & (uint64_t)0xFF00) << 40) |
+ ((x & (uint64_t)0xFF0000) << 24) | ((x & (uint64_t)0xFF000000) << 8) |
+ ((x & (uint64_t)0xFF00000000) >> 8) |
+ ((x & (uint64_t)0xFF0000000000) >> 24) |
+ ((x & (uint64_t)0xFF000000000000) >> 40) |
+ ((x & (uint64_t)0xFF00000000000000) >> 56));
+}
+#define bswap_64(x) bswap_64(x)
+
+#endif // end byteswap functions
+
+// Use compiler byte-swapping intrinsics if they are available. 32-bit
+// and 64-bit versions are available in Clang and GCC as of GCC 4.3.0.
+// The 16-bit version is available in Clang and GCC only as of GCC 4.8.0.
+// For simplicity, we enable them all only for GCC 4.8.0 or later.
+#if defined(__clang__) || \
+ (defined(__GNUC__) && \
+ ((__GNUC__ == 4 && __GNUC_MINOR__ >= 8) || __GNUC__ >= 5))
+
+inline uint64_t gbswap_64(uint64_t host_int) {
+ return __builtin_bswap64(host_int);
+}
+inline uint32_t gbswap_32(uint32_t host_int) {
+ return __builtin_bswap32(host_int);
+}
+inline uint16_t gbswap_16(uint16_t host_int) {
+ return __builtin_bswap16(host_int);
+}
+
+#else // intrinsics available
+
+inline uint64_t gbswap_64(uint64_t host_int) {
+#if defined(__GNUC__) && defined(__x86_64__) && \
+    !(defined(__APPLE__) && defined(__MACH__))
+  // Adapted from /usr/include/byteswap.h. Not available on Mac.
+  if (__builtin_constant_p(host_int)) {
+    return __bswap_constant_64(host_int);
+  } else {
+    uint64_t result;
+    __asm__("bswap %0" : "=r"(result) : "0"(host_int));
+    return result;
+  }
+#elif defined(bswap_64)
+  return bswap_64(host_int);
+#else  // bswap_64
+  return static_cast<uint64_t>(bswap_32(static_cast<uint32_t>(host_int >> 32))) |
+         (static_cast<uint64_t>(bswap_32(static_cast<uint32_t>(host_int))) << 32);
+#endif  // bswap_64
+}
+inline uint32_t gbswap_32(uint32_t host_int) { return bswap_32(host_int); }
+inline uint16_t gbswap_16(uint16_t host_int) { return bswap_16(host_int); }
+
+#endif // intrinsics available
+
+#ifdef IS_LITTLE_ENDIAN
+
+// Definitions for ntohl etc. that don't require us to include
+// netinet/in.h. We wrap gbswap_32 and gbswap_16 in functions rather
+// than just #defining them because in debug mode, gcc doesn't
+// correctly handle the (rather involved) definitions of bswap_32.
+// gcc guarantees that inline functions are as fast as macros, so
+// this isn't a performance hit.
+inline uint16_t GHostToNetworkS(uint16_t x) { return gbswap_16(x); }
+inline uint32_t GHostToNetworkL(uint32_t x) { return gbswap_32(x); }
+inline uint64_t GHostToNetworkLL(uint64_t x) { return gbswap_64(x); }
+
+#elif defined IS_BIG_ENDIAN
+
+// These definitions are simpler on big-endian machines.
+// These are functions instead of macros to avoid self-assignment warnings
+// on calls such as "i = GHostToNetworkL(i);". This also provides type
+// checking.
+inline uint16_t GHostToNetworkS(uint16_t x) { return x; }
+inline uint32_t GHostToNetworkL(uint32_t x) { return x; }
+inline uint64_t GHostToNetworkLL(uint64_t x) { return x; }
+
+#else // bytesex
+#error \
+ "Unsupported bytesex: Either IS_BIG_ENDIAN or IS_LITTLE_ENDIAN must be defined" // NOLINT
+#endif // bytesex
+
+#ifndef HostToNetworkLL
+// With the rise of 64-bit, some systems are beginning to define this.
+#define HostToNetworkLL(x) GHostToNetworkLL(x)
+#endif // HostToNetworkLL
+
+// ntoh* and hton* are the same thing for any size and bytesex,
+// since the function is an involution, i.e., its own inverse.
+inline uint16_t GNetworkToHostS(uint16_t x) { return GHostToNetworkS(x); }
+inline uint32_t GNetworkToHostL(uint32_t x) { return GHostToNetworkL(x); }
+inline uint64_t GNetworkToHostLL(uint64_t x) { return GHostToNetworkLL(x); }
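+
+// A round-trip sketch (illustrative): since each function is its own inverse,
+// converting to network order and back recovers the original value on either
+// endianness.
+//
+//   uint64_t wire = GHostToNetworkLL(host_value);   // serialize big-endian
+//   uint64_t host_value2 = GNetworkToHostLL(wire);  // == host_value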
+
+#ifndef NetworkToHostLL
+#define NetworkToHostLL(x) GHostToNetworkLL(x)
+#endif // NetworkToHostLL
+
+#endif // ICING_PORTABLE_ENDIAN_H_
diff --git a/icing/portable/equals-proto.h b/icing/portable/equals-proto.h
index 6a600be..8bb835e 100644
--- a/icing/portable/equals-proto.h
+++ b/icing/portable/equals-proto.h
@@ -20,8 +20,8 @@
#ifndef ICING_PORTABLE_EQUALS_PROTO_H_
#define ICING_PORTABLE_EQUALS_PROTO_H_
+#include "gmock/gmock.h" // IWYU pragma: export
#include <google/protobuf/message_lite.h> // IWYU pragma: export
-#include "gmock/gmock.h" // IWYU pragma: export
#if defined(__ANDROID__) || defined(__APPLE__)
namespace icing {
diff --git a/icing/portable/gzip_stream.cc b/icing/portable/gzip_stream.cc
new file mode 100644
index 0000000..f00a993
--- /dev/null
+++ b/icing/portable/gzip_stream.cc
@@ -0,0 +1,313 @@
+// Copyright (C) 2009 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains the implementation of classes GzipInputStream and
+// GzipOutputStream. It is forked from protobuf because these classes are only
+// provided in libprotobuf-full but we would like to link libicing against the
+// smaller libprotobuf-lite instead.
+
+#include "icing/portable/gzip_stream.h"
+#include "icing/util/logging.h"
+
+namespace icing {
+namespace lib {
+namespace protobuf_ports {
+
+static const int kDefaultBufferSize = 65536;
+
+GzipInputStream::GzipInputStream(ZeroCopyInputStream* sub_stream, Format format,
+ int buffer_size)
+ : format_(format), sub_stream_(sub_stream), zerror_(Z_OK), byte_count_(0) {
+ zcontext_.state = Z_NULL;
+ zcontext_.zalloc = Z_NULL;
+ zcontext_.zfree = Z_NULL;
+ zcontext_.opaque = Z_NULL;
+ zcontext_.total_out = 0;
+ zcontext_.next_in = NULL;
+ zcontext_.avail_in = 0;
+ zcontext_.total_in = 0;
+ zcontext_.msg = NULL;
+ if (buffer_size == -1) {
+ output_buffer_length_ = kDefaultBufferSize;
+ } else {
+ output_buffer_length_ = buffer_size;
+ }
+ output_buffer_ = operator new(output_buffer_length_);
+ zcontext_.next_out = static_cast<Bytef*>(output_buffer_);
+ zcontext_.avail_out = output_buffer_length_;
+ output_position_ = output_buffer_;
+}
+GzipInputStream::~GzipInputStream() {
+ operator delete(output_buffer_);
+ zerror_ = inflateEnd(&zcontext_);
+}
+
+static inline int internalInflateInit2(z_stream* zcontext,
+ GzipInputStream::Format format) {
+ int windowBitsFormat = 0;
+ switch (format) {
+ case GzipInputStream::GZIP:
+ windowBitsFormat = 16;
+ break;
+ case GzipInputStream::AUTO:
+ windowBitsFormat = 32;
+ break;
+ case GzipInputStream::ZLIB:
+ windowBitsFormat = 0;
+ break;
+ }
+ return inflateInit2(zcontext, /* windowBits */ 15 | windowBitsFormat);
+}
+
+int GzipInputStream::Inflate(int flush) {
+ if ((zerror_ == Z_OK) && (zcontext_.avail_out == 0)) {
+    // Previous inflate filled the output buffer; don't change input params yet.
+ } else if (zcontext_.avail_in == 0) {
+ const void* in;
+ int in_size;
+ bool first = zcontext_.next_in == NULL;
+ bool ok = sub_stream_->Next(&in, &in_size);
+ if (!ok) {
+ zcontext_.next_out = NULL;
+ zcontext_.avail_out = 0;
+ return Z_STREAM_END;
+ }
+ zcontext_.next_in = static_cast<Bytef*>(const_cast<void*>(in));
+ zcontext_.avail_in = in_size;
+ if (first) {
+ int error = internalInflateInit2(&zcontext_, format_);
+ if (error != Z_OK) {
+ return error;
+ }
+ }
+ }
+ zcontext_.next_out = static_cast<Bytef*>(output_buffer_);
+ zcontext_.avail_out = output_buffer_length_;
+ output_position_ = output_buffer_;
+ int error = inflate(&zcontext_, flush);
+ return error;
+}
+
+void GzipInputStream::DoNextOutput(const void** data, int* size) {
+ *data = output_position_;
+ *size = ((uintptr_t)zcontext_.next_out) - ((uintptr_t)output_position_);
+ output_position_ = zcontext_.next_out;
+}
+
+// implements ZeroCopyInputStream ----------------------------------
+bool GzipInputStream::Next(const void** data, int* size) {
+ bool ok = (zerror_ == Z_OK) || (zerror_ == Z_STREAM_END) ||
+ (zerror_ == Z_BUF_ERROR);
+ if ((!ok) || (zcontext_.next_out == NULL)) {
+ return false;
+ }
+ if (zcontext_.next_out != output_position_) {
+ DoNextOutput(data, size);
+ return true;
+ }
+ if (zerror_ == Z_STREAM_END) {
+ if (zcontext_.next_out != NULL) {
+ // sub_stream_ may have concatenated streams to follow
+ zerror_ = inflateEnd(&zcontext_);
+ byte_count_ += zcontext_.total_out;
+ if (zerror_ != Z_OK) {
+ return false;
+ }
+ zerror_ = internalInflateInit2(&zcontext_, format_);
+ if (zerror_ != Z_OK) {
+ return false;
+ }
+ } else {
+ *data = NULL;
+ *size = 0;
+ return false;
+ }
+ }
+ zerror_ = Inflate(Z_NO_FLUSH);
+ if ((zerror_ == Z_STREAM_END) && (zcontext_.next_out == NULL)) {
+ // The underlying stream's Next returned false inside Inflate.
+ return false;
+ }
+ ok = (zerror_ == Z_OK) || (zerror_ == Z_STREAM_END) ||
+ (zerror_ == Z_BUF_ERROR);
+ if (!ok) {
+ return false;
+ }
+ DoNextOutput(data, size);
+ return true;
+}
+void GzipInputStream::BackUp(int count) {
+ output_position_ = reinterpret_cast<void*>(
+ reinterpret_cast<uintptr_t>(output_position_) - count);
+}
+bool GzipInputStream::Skip(int count) {
+ const void* data;
+ int size = 0;
+ bool ok = Next(&data, &size);
+ while (ok && (size < count)) {
+ count -= size;
+ ok = Next(&data, &size);
+ }
+ if (size > count) {
+ BackUp(size - count);
+ }
+ return ok;
+}
+int64_t GzipInputStream::ByteCount() const {
+ int64_t ret = byte_count_ + zcontext_.total_out;
+ if (zcontext_.next_out != NULL && output_position_ != NULL) {
+ ret += reinterpret_cast<uintptr_t>(zcontext_.next_out) -
+ reinterpret_cast<uintptr_t>(output_position_);
+ }
+ return ret;
+}
+
+// =========================================================================
+
+GzipOutputStream::Options::Options()
+ : format(GZIP),
+ buffer_size(kDefaultBufferSize),
+ compression_level(Z_DEFAULT_COMPRESSION),
+ compression_strategy(Z_DEFAULT_STRATEGY) {}
+
+GzipOutputStream::GzipOutputStream(ZeroCopyOutputStream* sub_stream) {
+ Init(sub_stream, Options());
+}
+
+GzipOutputStream::GzipOutputStream(ZeroCopyOutputStream* sub_stream,
+ const Options& options) {
+ Init(sub_stream, options);
+}
+
+void GzipOutputStream::Init(ZeroCopyOutputStream* sub_stream,
+ const Options& options) {
+ sub_stream_ = sub_stream;
+ sub_data_ = NULL;
+ sub_data_size_ = 0;
+
+ input_buffer_length_ = options.buffer_size;
+ input_buffer_ = operator new(input_buffer_length_);
+
+ zcontext_.zalloc = Z_NULL;
+ zcontext_.zfree = Z_NULL;
+ zcontext_.opaque = Z_NULL;
+ zcontext_.next_out = NULL;
+ zcontext_.avail_out = 0;
+ zcontext_.total_out = 0;
+ zcontext_.next_in = NULL;
+ zcontext_.avail_in = 0;
+ zcontext_.total_in = 0;
+ zcontext_.msg = NULL;
+ // default to GZIP format
+ int windowBitsFormat = 16;
+ if (options.format == ZLIB) {
+ windowBitsFormat = 0;
+ }
+ zerror_ =
+ deflateInit2(&zcontext_, options.compression_level, Z_DEFLATED,
+ /* windowBits */ 15 | windowBitsFormat,
+ /* memLevel (default) */ 8, options.compression_strategy);
+}
+
+GzipOutputStream::~GzipOutputStream() {
+ Close();
+ operator delete(input_buffer_);
+}
+
+// private
+int GzipOutputStream::Deflate(int flush) {
+ int error = Z_OK;
+ do {
+ if ((sub_data_ == NULL) || (zcontext_.avail_out == 0)) {
+ bool ok = sub_stream_->Next(&sub_data_, &sub_data_size_);
+ if (!ok) {
+ sub_data_ = NULL;
+ sub_data_size_ = 0;
+ return Z_BUF_ERROR;
+ }
+ if (sub_data_size_ <= 0) {
+ ICING_LOG(FATAL) << "Failed to advance underlying stream";
+ }
+ zcontext_.next_out = static_cast<Bytef*>(sub_data_);
+ zcontext_.avail_out = sub_data_size_;
+ }
+ error = deflate(&zcontext_, flush);
+ } while (error == Z_OK && zcontext_.avail_out == 0);
+ if ((flush == Z_FULL_FLUSH) || (flush == Z_FINISH)) {
+ // Notify lower layer of data.
+ sub_stream_->BackUp(zcontext_.avail_out);
+ // We don't own the buffer anymore.
+ sub_data_ = NULL;
+ sub_data_size_ = 0;
+ }
+ return error;
+}
+
+// implements ZeroCopyOutputStream ---------------------------------
+bool GzipOutputStream::Next(void** data, int* size) {
+ if ((zerror_ != Z_OK) && (zerror_ != Z_BUF_ERROR)) {
+ return false;
+ }
+ if (zcontext_.avail_in != 0) {
+ zerror_ = Deflate(Z_NO_FLUSH);
+ if (zerror_ != Z_OK) {
+ return false;
+ }
+ }
+ if (zcontext_.avail_in == 0) {
+    // All input was consumed; reset the buffer.
+ zcontext_.next_in = static_cast<Bytef*>(input_buffer_);
+ zcontext_.avail_in = input_buffer_length_;
+ *data = input_buffer_;
+ *size = input_buffer_length_;
+ } else {
+ // The loop in Deflate should consume all avail_in
+ ICING_LOG(ERROR) << "Deflate left bytes unconsumed";
+ }
+ return true;
+}
+void GzipOutputStream::BackUp(int count) {
+ if (zcontext_.avail_in < static_cast<uInt>(count)) {
+ ICING_LOG(FATAL) << "Not enough data to back up " << count << " bytes";
+ }
+ zcontext_.avail_in -= count;
+}
+int64_t GzipOutputStream::ByteCount() const {
+ return zcontext_.total_in + zcontext_.avail_in;
+}
+
+bool GzipOutputStream::Flush() {
+ zerror_ = Deflate(Z_FULL_FLUSH);
+ // Return true if the flush succeeded or if it was a no-op.
+ return (zerror_ == Z_OK) ||
+ (zerror_ == Z_BUF_ERROR && zcontext_.avail_in == 0 &&
+ zcontext_.avail_out != 0);
+}
+
+bool GzipOutputStream::Close() {
+ if ((zerror_ != Z_OK) && (zerror_ != Z_BUF_ERROR)) {
+ return false;
+ }
+ do {
+ zerror_ = Deflate(Z_FINISH);
+ } while (zerror_ == Z_OK);
+ zerror_ = deflateEnd(&zcontext_);
+ bool ok = zerror_ == Z_OK;
+ zerror_ = Z_STREAM_END;
+ return ok;
+}
+
+} // namespace protobuf_ports
+} // namespace lib
+} // namespace icing
diff --git a/icing/portable/gzip_stream.h b/icing/portable/gzip_stream.h
new file mode 100644
index 0000000..8008a55
--- /dev/null
+++ b/icing/portable/gzip_stream.h
@@ -0,0 +1,177 @@
+// Copyright (C) 2009 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This file contains the definition for classes GzipInputStream and
+// GzipOutputStream. It is forked from protobuf because these classes are only
+// provided in libprotobuf-full, but we would like to link libicing against the
+// smaller libprotobuf-lite instead.
+//
+// GzipInputStream decompresses data from an underlying
+// ZeroCopyInputStream and provides the decompressed data as a
+// ZeroCopyInputStream.
+//
+// GzipOutputStream is a ZeroCopyOutputStream that compresses data to
+// an underlying ZeroCopyOutputStream.
+
+#ifndef GOOGLE3_ICING_PORTABLE_GZIP_STREAM_H_
+#define GOOGLE3_ICING_PORTABLE_GZIP_STREAM_H_
+
+#include "icing/portable/zlib.h"
+#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
+
+namespace icing {
+namespace lib {
+namespace protobuf_ports {
+
+// A ZeroCopyInputStream that reads compressed data through zlib
+class GzipInputStream : public google::protobuf::io::ZeroCopyInputStream {
+ public:
+ // Format key for constructor
+ enum Format {
+ // zlib will autodetect gzip header or deflate stream
+ AUTO = 0,
+
+ // GZIP streams have some extra header data for file attributes.
+ GZIP = 1,
+
+ // Simpler zlib stream format.
+ ZLIB = 2,
+ };
+
+ // buffer_size may be -1 for the default of 64kB; format defaults to AUTO.
+ explicit GzipInputStream(google::protobuf::io::ZeroCopyInputStream* sub_stream,
+ Format format = AUTO, int buffer_size = -1);
+ virtual ~GzipInputStream();
+
+ // Return last error message or NULL if no error.
+ inline const char* ZlibErrorMessage() const { return zcontext_.msg; }
+ inline int ZlibErrorCode() const { return zerror_; }
+
+ // implements ZeroCopyInputStream ----------------------------------
+ bool Next(const void** data, int* size) override;
+ void BackUp(int count) override;
+ bool Skip(int count) override;
+ int64_t ByteCount() const override;
+
+ private:
+ Format format_;
+
+ google::protobuf::io::ZeroCopyInputStream* sub_stream_;
+
+ z_stream zcontext_;
+ int zerror_;
+
+ void* output_buffer_;
+ void* output_position_;
+ size_t output_buffer_length_;
+ int64_t byte_count_;
+
+ int Inflate(int flush);
+ void DoNextOutput(const void** data, int* size);
+};
+
+class GzipOutputStream : public google::protobuf::io::ZeroCopyOutputStream {
+ public:
+ // Format key for constructor
+ enum Format {
+ // GZIP streams have some extra header data for file attributes.
+ GZIP = 1,
+
+ // Simpler zlib stream format.
+ ZLIB = 2,
+ };
+
+ struct Options {
+ // Defaults to GZIP.
+ Format format;
+
+ // What size buffer to use internally. Defaults to 64kB.
+ int buffer_size;
+
+ // A number between 0 and 9, where 0 is no compression and 9 is best
+ // compression. Defaults to Z_DEFAULT_COMPRESSION (see zlib.h).
+ int compression_level;
+
+ // Defaults to Z_DEFAULT_STRATEGY. Can also be set to Z_FILTERED,
+ // Z_HUFFMAN_ONLY, or Z_RLE. See the documentation for deflateInit2 in
+ // zlib.h for definitions of these constants.
+ int compression_strategy;
+
+ Options(); // Initializes with default values.
+ };
+
+ // Create a GzipOutputStream with default options.
+ explicit GzipOutputStream(google::protobuf::io::ZeroCopyOutputStream* sub_stream);
+
+ // Create a GzipOutputStream with the given options.
+ GzipOutputStream(google::protobuf::io::ZeroCopyOutputStream* sub_stream,
+ const Options& options);
+
+ virtual ~GzipOutputStream();
+
+ // Return last error message or NULL if no error.
+ inline const char* ZlibErrorMessage() const { return zcontext_.msg; }
+ inline int ZlibErrorCode() const { return zerror_; }
+
+ // Flushes data written so far to zipped data in the underlying stream.
+ // It is the caller's responsibility to flush the underlying stream if
+ // necessary.
+ // Compression may be less efficient when stopping and starting around flushes.
+ // Returns true if no error.
+ //
+ // Please ensure that block size is > 6. Here is an excerpt from the zlib
+ // doc that explains why:
+ //
+ // In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that avail_out
+ // is greater than six to avoid repeated flush markers due to
+ // avail_out == 0 on return.
+ bool Flush();
+
+ // Writes out all data and closes the gzip stream.
+ // It is the caller's responsibility to close the underlying stream if
+ // necessary.
+ // Returns true if no error.
+ bool Close();
+
+ // implements ZeroCopyOutputStream ---------------------------------
+ bool Next(void** data, int* size) override;
+ void BackUp(int count) override;
+ int64_t ByteCount() const override;
+
+ private:
+ google::protobuf::io::ZeroCopyOutputStream* sub_stream_;
+ // Result from calling Next() on sub_stream_
+ void* sub_data_;
+ int sub_data_size_;
+
+ z_stream zcontext_;
+ int zerror_;
+ void* input_buffer_;
+ size_t input_buffer_length_;
+
+ // Shared constructor code.
+ void Init(google::protobuf::io::ZeroCopyOutputStream* sub_stream,
+ const Options& options);
+
+ // Do some compression.
+ // Takes zlib flush mode.
+ // Returns zlib error code.
+ int Deflate(int flush);
+};
+
+} // namespace protobuf_ports
+} // namespace lib
+} // namespace icing
+
+#endif // GOOGLE3_ICING_PORTABLE_GZIP_STREAM_H_
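For the read side, a matching hypothetical sketch: decompressing a buffer produced by GzipOutputStream back into a string, relying on the AUTO format default to detect the framing. ArrayInputStream comes from zero_copy_stream_impl_lite.h; everything else is assumed from this header only:

#include <string>

#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
#include "icing/portable/gzip_stream.h"

// Decompresses a gzip- or zlib-framed buffer back into a string.
std::string GzipDecompress(const std::string& compressed) {
  google::protobuf::io::ArrayInputStream raw_input(
      compressed.data(), static_cast<int>(compressed.size()));
  icing::lib::protobuf_ports::GzipInputStream gzip_input(&raw_input);
  std::string output;
  const void* buffer;
  int size;
  // Next() returns decompressed chunks until the stream is exhausted.
  while (gzip_input.Next(&buffer, &size)) {
    output.append(static_cast<const char*>(buffer), size);
  }
  return output;
}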
diff --git a/icing/portable/platform.h b/icing/portable/platform.h
new file mode 100644
index 0000000..6d8c668
--- /dev/null
+++ b/icing/portable/platform.h
@@ -0,0 +1,106 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_PORTABLE_PLATFORM_H_
+#define ICING_PORTABLE_PLATFORM_H_
+
+#include "unicode/uconfig.h" // IWYU pragma: keep
+// clang-format: do not reorder the above include.
+
+#include "unicode/uvernum.h"
+
+namespace icing {
+namespace lib {
+
+// Returns true if the test was built with the CFStringTokenizer as the
+// implementation of LanguageSegmenter.
+inline bool IsCfStringTokenization() {
+#if defined(__APPLE__) && !defined(ICING_IOS_ICU4C_SEGMENTATION)
+ return true;
+#endif // defined(__APPLE__) && !defined(ICING_IOS_ICU4C_SEGMENTATION)
+ return false;
+}
+
+inline bool IsReverseJniTokenization() {
+#ifdef ICING_REVERSE_JNI_SEGMENTATION
+ return true;
+#endif // ICING_REVERSE_JNI_SEGMENTATION
+ return false;
+}
+
+inline bool IsIcuTokenization() {
+ return !IsReverseJniTokenization() && !IsCfStringTokenization();
+}
+
+inline int GetIcuTokenizationVersion() {
+ return IsIcuTokenization() ? U_ICU_VERSION_MAJOR_NUM : 0;
+}
+
+// Whether we're running on android_x86
+inline bool IsAndroidX86() {
+#if defined(__ANDROID__) && defined(__i386__)
+ return true;
+#endif // defined(__ANDROID__) && defined(__i386__)
+ return false;
+}
+
+// Whether we're running on android_armeabi-v7a
+inline bool IsAndroidArm() {
+#if defined(__ANDROID__) && defined(__arm__)
+ return true;
+#endif // defined(__ANDROID__) && defined(__arm__)
+ return false;
+}
+
+// Whether the running test is an iOS test.
+inline bool IsIosPlatform() {
+#if defined(__APPLE__)
+ return true;
+#endif // defined(__APPLE__)
+ return false;
+}
+
+// TODO(b/259129263): verify the flag works for different platforms.
+#if defined(__arm__) || defined(__i386__)
+#define ICING_ARCH_BIT_32
+#elif defined(__aarch64__) || defined(__x86_64__)
+#define ICING_ARCH_BIT_64
+#else
+#define ICING_ARCH_BIT_UNKNOWN
+#endif
+
+enum Architecture {
+ UNKNOWN,
+ BIT_32,
+ BIT_64,
+};
+
+// Returns which architecture we're running on.
+//
+// Architecture macros pulled from
+// https://developer.android.com/ndk/guides/cpu-features
+inline Architecture GetArchitecture() {
+#if defined(ICING_ARCH_BIT_32)
+ return BIT_32;
+#elif defined(ICING_ARCH_BIT_64)
+ return BIT_64;
+#else
+ return UNKNOWN;
+#endif
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_PORTABLE_PLATFORM_H_
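A small hypothetical sketch of how these helpers might gate platform-specific behavior; the function name and labels are illustrative only:

#include <string>

#include "icing/portable/platform.h"

// Illustrative only: pick a label for the current platform.
std::string PlatformLabel() {
  switch (icing::lib::GetArchitecture()) {
    case icing::lib::BIT_32:
      return icing::lib::IsAndroidArm() ? "android-arm32" : "other-32bit";
    case icing::lib::BIT_64:
      return "64bit";
    default:
      return "unknown";
  }
}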
diff --git a/icing/query/advanced_query_parser/abstract-syntax-tree-test-utils.h b/icing/query/advanced_query_parser/abstract-syntax-tree-test-utils.h
new file mode 100644
index 0000000..42be07d
--- /dev/null
+++ b/icing/query/advanced_query_parser/abstract-syntax-tree-test-utils.h
@@ -0,0 +1,108 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_ABSTRACT_SYNTAX_TREE_TEST_UTILS_H_
+#define ICING_QUERY_ADVANCED_QUERY_PARSER_ABSTRACT_SYNTAX_TREE_TEST_UTILS_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
+
+namespace icing {
+namespace lib {
+
+// The node types recorded by SimpleVisitor (defined below).
+enum class NodeType {
+ kFunctionName,
+ kString,
+ kText,
+ kMember,
+ kFunction,
+ kUnaryOperator,
+ kNaryOperator
+};
+
+struct NodeInfo {
+ std::string value;
+ NodeType type;
+
+ bool operator==(const NodeInfo& rhs) const {
+ return value == rhs.value && type == rhs.type;
+ }
+};
+
+MATCHER_P2(EqualsNodeInfo, value, type, "") {
+ if (arg.value != value || arg.type != type) {
+ *result_listener << "(Expected: value=\"" << value
+ << "\", type=" << static_cast<int>(type)
+ << ". Actual: value=\"" << arg.value
+ << "\", type=" << static_cast<int>(arg.type) << ")";
+ return false;
+ }
+ return true;
+}
+
+// A visitor that simply collects the visited nodes and flattens them in
+// left-side depth-first order.
+class SimpleVisitor : public AbstractSyntaxTreeVisitor {
+ public:
+ void VisitFunctionName(const FunctionNameNode* node) override {
+ nodes_.push_back({node->value(), NodeType::kFunctionName});
+ }
+ void VisitString(const StringNode* node) override {
+ nodes_.push_back({node->value(), NodeType::kString});
+ }
+ void VisitText(const TextNode* node) override {
+ nodes_.push_back({node->value(), NodeType::kText});
+ }
+ void VisitMember(const MemberNode* node) override {
+ for (const std::unique_ptr<TextNode>& child : node->children()) {
+ child->Accept(this);
+ }
+ if (node->function() != nullptr) {
+ node->function()->Accept(this);
+ }
+ nodes_.push_back({"", NodeType::kMember});
+ }
+ void VisitFunction(const FunctionNode* node) override {
+ node->function_name()->Accept(this);
+ for (const std::unique_ptr<Node>& arg : node->args()) {
+ arg->Accept(this);
+ }
+ nodes_.push_back({"", NodeType::kFunction});
+ }
+ void VisitUnaryOperator(const UnaryOperatorNode* node) override {
+ node->child()->Accept(this);
+ nodes_.push_back({node->operator_text(), NodeType::kUnaryOperator});
+ }
+ void VisitNaryOperator(const NaryOperatorNode* node) override {
+ for (const std::unique_ptr<Node>& child : node->children()) {
+ child->Accept(this);
+ }
+ nodes_.push_back({node->operator_text(), NodeType::kNaryOperator});
+ }
+
+ const std::vector<NodeInfo>& nodes() const { return nodes_; }
+
+ private:
+ std::vector<NodeInfo> nodes_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_ADVANCED_QUERY_PARSER_ABSTRACT_SYNTAX_TREE_TEST_UTILS_H_
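As a concrete illustration of the post-order flattening SimpleVisitor produces, here is a hypothetical snippet (mirroring the tests in the next file) for a unary NOT query; it assumes only the classes declared above:

#include <memory>
#include <string_view>

#include "icing/query/advanced_query_parser/abstract-syntax-tree-test-utils.h"
#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"

// For "NOT foo", the child is visited before the operator, so the
// flattened order is [("foo", kText), ("NOT", kUnaryOperator)].
void TraverseNotQuery() {
  std::string_view query = "NOT foo";
  auto root = std::make_unique<icing::lib::UnaryOperatorNode>(
      "NOT",
      std::make_unique<icing::lib::TextNode>("foo", query.substr(4, 3)));
  icing::lib::SimpleVisitor visitor;
  root->Accept(&visitor);
  // visitor.nodes() now holds {"foo", kText} then {"NOT", kUnaryOperator}.
}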
diff --git a/icing/query/advanced_query_parser/abstract-syntax-tree.h b/icing/query/advanced_query_parser/abstract-syntax-tree.h
new file mode 100644
index 0000000..67049ad
--- /dev/null
+++ b/icing/query/advanced_query_parser/abstract-syntax-tree.h
@@ -0,0 +1,184 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_ABSTRACT_SYNTAX_TREE_H_
+#define ICING_QUERY_ADVANCED_QUERY_PARSER_ABSTRACT_SYNTAX_TREE_H_
+
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace icing {
+namespace lib {
+
+class FunctionNameNode;
+class StringNode;
+class TextNode;
+class MemberNode;
+class FunctionNode;
+class UnaryOperatorNode;
+class NaryOperatorNode;
+
+class AbstractSyntaxTreeVisitor {
+ public:
+ virtual ~AbstractSyntaxTreeVisitor() = default;
+
+ virtual void VisitFunctionName(const FunctionNameNode* node) = 0;
+ virtual void VisitString(const StringNode* node) = 0;
+ virtual void VisitText(const TextNode* node) = 0;
+ virtual void VisitMember(const MemberNode* node) = 0;
+ virtual void VisitFunction(const FunctionNode* node) = 0;
+ virtual void VisitUnaryOperator(const UnaryOperatorNode* node) = 0;
+ virtual void VisitNaryOperator(const NaryOperatorNode* node) = 0;
+};
+
+class Node {
+ public:
+ virtual ~Node() = default;
+ virtual void Accept(AbstractSyntaxTreeVisitor* visitor) const = 0;
+};
+
+class TerminalNode : public Node {
+ public:
+ explicit TerminalNode(std::string value, std::string_view raw_value,
+ bool is_prefix)
+ : value_(std::move(value)),
+ raw_value_(raw_value),
+ is_prefix_(is_prefix) {}
+
+ const std::string& value() const& { return value_; }
+ std::string value() && { return std::move(value_); }
+
+ bool is_prefix() const { return is_prefix_; }
+
+ std::string_view raw_value() const { return raw_value_; }
+
+ private:
+ std::string value_;
+ std::string_view raw_value_;
+ bool is_prefix_;
+};
+
+class FunctionNameNode : public TerminalNode {
+ public:
+ explicit FunctionNameNode(std::string value)
+ : TerminalNode(std::move(value), /*raw_value=*/"", /*is_prefix=*/false) {}
+ void Accept(AbstractSyntaxTreeVisitor* visitor) const override {
+ visitor->VisitFunctionName(this);
+ }
+};
+
+class StringNode : public TerminalNode {
+ public:
+ explicit StringNode(std::string value, std::string_view raw_value,
+ bool is_prefix = false)
+ : TerminalNode(std::move(value), raw_value, is_prefix) {}
+ void Accept(AbstractSyntaxTreeVisitor* visitor) const override {
+ visitor->VisitString(this);
+ }
+};
+
+class TextNode : public TerminalNode {
+ public:
+ explicit TextNode(std::string value, std::string_view raw_value,
+ bool is_prefix = false)
+ : TerminalNode(std::move(value), raw_value, is_prefix) {}
+ void Accept(AbstractSyntaxTreeVisitor* visitor) const override {
+ visitor->VisitText(this);
+ }
+};
+
+class MemberNode : public Node {
+ public:
+ explicit MemberNode(std::vector<std::unique_ptr<TextNode>> children,
+ std::unique_ptr<FunctionNode> function)
+ : children_(std::move(children)), function_(std::move(function)) {}
+
+ void Accept(AbstractSyntaxTreeVisitor* visitor) const override {
+ visitor->VisitMember(this);
+ }
+ const std::vector<std::unique_ptr<TextNode>>& children() const {
+ return children_;
+ }
+ const FunctionNode* function() const { return function_.get(); }
+
+ private:
+ std::vector<std::unique_ptr<TextNode>> children_;
+ // This is nullable. When it is not nullptr, this node represents a
+ // function call.
+ std::unique_ptr<FunctionNode> function_;
+};
+
+class FunctionNode : public Node {
+ public:
+ explicit FunctionNode(std::unique_ptr<FunctionNameNode> function_name)
+ : FunctionNode(std::move(function_name), {}) {}
+ explicit FunctionNode(std::unique_ptr<FunctionNameNode> function_name,
+ std::vector<std::unique_ptr<Node>> args)
+ : function_name_(std::move(function_name)), args_(std::move(args)) {}
+
+ void Accept(AbstractSyntaxTreeVisitor* visitor) const override {
+ visitor->VisitFunction(this);
+ }
+ const FunctionNameNode* function_name() const { return function_name_.get(); }
+ const std::vector<std::unique_ptr<Node>>& args() const { return args_; }
+
+ private:
+ std::unique_ptr<FunctionNameNode> function_name_;
+ std::vector<std::unique_ptr<Node>> args_;
+};
+
+class UnaryOperatorNode : public Node {
+ public:
+ explicit UnaryOperatorNode(std::string operator_text,
+ std::unique_ptr<Node> child)
+ : operator_text_(std::move(operator_text)), child_(std::move(child)) {}
+
+ void Accept(AbstractSyntaxTreeVisitor* visitor) const override {
+ visitor->VisitUnaryOperator(this);
+ }
+ const std::string& operator_text() const { return operator_text_; }
+ const Node* child() const { return child_.get(); }
+
+ private:
+ std::string operator_text_;
+ std::unique_ptr<Node> child_;
+};
+
+class NaryOperatorNode : public Node {
+ public:
+ explicit NaryOperatorNode(std::string operator_text,
+ std::vector<std::unique_ptr<Node>> children)
+ : operator_text_(std::move(operator_text)),
+ children_(std::move(children)) {}
+
+ void Accept(AbstractSyntaxTreeVisitor* visitor) const override {
+ visitor->VisitNaryOperator(this);
+ }
+ const std::string& operator_text() const { return operator_text_; }
+ const std::vector<std::unique_ptr<Node>>& children() const {
+ return children_;
+ }
+
+ private:
+ std::string operator_text_;
+ std::vector<std::unique_ptr<Node>> children_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_ADVANCED_QUERY_PARSER_ABSTRACT_SYNTAX_TREE_H_
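Because every node dispatches through AbstractSyntaxTreeVisitor, new traversals can be added without touching the node classes. A hypothetical sketch of a visitor that counts TEXT terminals, assuming only the interface declared above:

#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"

namespace icing {
namespace lib {

// Hypothetical visitor that counts TEXT terminals in a query tree.
class TextCountingVisitor : public AbstractSyntaxTreeVisitor {
 public:
  void VisitFunctionName(const FunctionNameNode*) override {}
  void VisitString(const StringNode*) override {}
  void VisitText(const TextNode*) override { ++count_; }
  void VisitMember(const MemberNode* node) override {
    for (const auto& child : node->children()) child->Accept(this);
    if (node->function() != nullptr) node->function()->Accept(this);
  }
  void VisitFunction(const FunctionNode* node) override {
    for (const auto& arg : node->args()) arg->Accept(this);
  }
  void VisitUnaryOperator(const UnaryOperatorNode* node) override {
    node->child()->Accept(this);
  }
  void VisitNaryOperator(const NaryOperatorNode* node) override {
    for (const auto& child : node->children()) child->Accept(this);
  }
  int count() const { return count_; }

 private:
  int count_ = 0;
};

}  // namespace lib
}  // namespace icing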
diff --git a/icing/query/advanced_query_parser/abstract-syntax-tree_test.cc b/icing/query/advanced_query_parser/abstract-syntax-tree_test.cc
new file mode 100644
index 0000000..5e28278
--- /dev/null
+++ b/icing/query/advanced_query_parser/abstract-syntax-tree_test.cc
@@ -0,0 +1,143 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
+
+#include <memory>
+#include <string_view>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/query/advanced_query_parser/abstract-syntax-tree-test-utils.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::ElementsAre;
+
+TEST(AbstractSyntaxTreeTest, Simple) {
+ std::string_view query = "foo";
+ std::unique_ptr<Node> root = std::make_unique<TextNode>("foo", query);
+ SimpleVisitor visitor;
+ root->Accept(&visitor);
+
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText)));
+}
+
+TEST(AbstractSyntaxTreeTest, Composite) {
+ std::string_view query = "(foo bar) OR baz";
+ std::vector<std::unique_ptr<Node>> and_args;
+ and_args.push_back(std::make_unique<TextNode>("foo", query.substr(1, 3)));
+ and_args.push_back(std::make_unique<TextNode>("bar", query.substr(5, 3)));
+ auto and_node =
+ std::make_unique<NaryOperatorNode>("AND", std::move(and_args));
+
+ std::vector<std::unique_ptr<Node>> or_args;
+ or_args.push_back(std::move(and_node));
+ or_args.push_back(std::make_unique<TextNode>("baz", query.substr(13, 3)));
+ std::unique_ptr<Node> root =
+ std::make_unique<NaryOperatorNode>("OR", std::move(or_args));
+
+ SimpleVisitor visitor;
+ root->Accept(&visitor);
+
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("AND", NodeType::kNaryOperator),
+ EqualsNodeInfo("baz", NodeType::kText),
+ EqualsNodeInfo("OR", NodeType::kNaryOperator)));
+}
+
+TEST(AbstractSyntaxTreeTest, Function) {
+ // foo()
+ std::unique_ptr<Node> root =
+ std::make_unique<FunctionNode>(std::make_unique<FunctionNameNode>("foo"));
+ SimpleVisitor visitor;
+ root->Accept(&visitor);
+
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("", NodeType::kFunction)));
+
+ std::string_view query = "foo(\"bar\")";
+ std::vector<std::unique_ptr<Node>> args;
+ args.push_back(std::make_unique<StringNode>("bar", query.substr(5, 3)));
+ root = std::make_unique<FunctionNode>(
+ std::make_unique<FunctionNameNode>("foo"), std::move(args));
+ visitor = SimpleVisitor();
+ root->Accept(&visitor);
+
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("bar", NodeType::kString),
+ EqualsNodeInfo("", NodeType::kFunction)));
+
+ query = "foo(bar(\"baz\"))";
+ std::vector<std::unique_ptr<Node>> inner_args;
+ inner_args.push_back(std::make_unique<StringNode>("baz", query.substr(9, 3)));
+ args.clear();
+ args.push_back(std::make_unique<FunctionNode>(
+ std::make_unique<FunctionNameNode>("bar"), std::move(inner_args)));
+ root = std::make_unique<FunctionNode>(
+ std::make_unique<FunctionNameNode>("foo"), std::move(args));
+ visitor = SimpleVisitor();
+ root->Accept(&visitor);
+
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("bar", NodeType::kFunctionName),
+ EqualsNodeInfo("baz", NodeType::kString),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("", NodeType::kFunction)));
+}
+
+TEST(AbstractSyntaxTreeTest, Restriction) {
+ std::string_view query = "sender.name:(IMPORTANT OR URGENT)";
+ std::vector<std::unique_ptr<TextNode>> member_args;
+ member_args.push_back(
+ std::make_unique<TextNode>("sender", query.substr(0, 6)));
+ member_args.push_back(std::make_unique<TextNode>("name", query.substr(7, 4)));
+
+ std::vector<std::unique_ptr<Node>> or_args;
+ or_args.push_back(
+ std::make_unique<TextNode>("IMPORTANT", query.substr(13, 9)));
+ or_args.push_back(std::make_unique<TextNode>("URGENT", query.substr(26, 6)));
+
+ std::vector<std::unique_ptr<Node>> has_args;
+ has_args.push_back(std::make_unique<MemberNode>(std::move(member_args),
+ /*function=*/nullptr));
+ has_args.push_back(
+ std::make_unique<NaryOperatorNode>("OR", std::move(or_args)));
+
+ std::unique_ptr<Node> root =
+ std::make_unique<NaryOperatorNode>(":", std::move(has_args));
+
+ SimpleVisitor visitor;
+ root->Accept(&visitor);
+
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("sender", NodeType::kText),
+ EqualsNodeInfo("name", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("IMPORTANT", NodeType::kText),
+ EqualsNodeInfo("URGENT", NodeType::kText),
+ EqualsNodeInfo("OR", NodeType::kNaryOperator),
+ EqualsNodeInfo(":", NodeType::kNaryOperator)));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/advanced_query_parser/function.cc b/icing/query/advanced_query_parser/function.cc
new file mode 100644
index 0000000..e7938db
--- /dev/null
+++ b/icing/query/advanced_query_parser/function.cc
@@ -0,0 +1,77 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "icing/query/advanced_query_parser/function.h"
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+/*static*/ libtextclassifier3::StatusOr<Function> Function::Create(
+ DataType return_type, std::string name, std::vector<Param> params,
+ Function::EvalFunction eval) {
+ bool has_had_optional = false;
+ for (int i = 0; i < params.size(); ++i) {
+ switch (params.at(i).cardinality) {
+ case Cardinality::kVariable:
+ if (i != params.size() - 1) {
+ return absl_ports::InvalidArgumentError(
+ "Can only specify a variable param as the final param.");
+ }
+ break;
+ case Cardinality::kOptional:
+ has_had_optional = true;
+ break;
+ case Cardinality::kRequired:
+ if (has_had_optional) {
+ return absl_ports::InvalidArgumentError(
+ "Can't specify optional params followed by required params.");
+ }
+ break;
+ }
+ }
+ return Function(return_type, std::move(name), std::move(params),
+ std::move(eval));
+}
+
+libtextclassifier3::StatusOr<PendingValue> Function::Eval(
+ std::vector<PendingValue>&& args) const {
+ for (int i = 0; i < params_.size() || i < args.size(); ++i) {
+ if (i < args.size() && i < params_.size()) {
+ ICING_RETURN_IF_ERROR(params_.at(i).Matches(args.at(i)));
+ } else if (i >= params_.size()) {
+ // There are remaining args. This is only valid if the final param is
+ // kVariable.
+ if (params_.empty() ||
+ params_.rbegin()->cardinality != Cardinality::kVariable) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Expected to find only ", std::to_string(params_.size()),
+ " arguments, but found ", std::to_string(args.size())));
+ }
+ ICING_RETURN_IF_ERROR(params_.rbegin()->Matches(args.at(i)));
+ } else if (params_.at(i).cardinality == Cardinality::kRequired) {
+ // There are no more args, but there are still params to check. If these
+ // params are kRequired, then there is an error.
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Expected to find ", std::to_string(i + 1), "th argument, but only ",
+ std::to_string(args.size()), " arguments provided."));
+ }
+ }
+ return eval_(std::move(args));
+}
+
+} // namespace lib
+} // namespace icing
\ No newline at end of file
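To make the Create()/Eval() contract concrete, a hypothetical registration sketch: a function named prop() taking one required string followed by zero or more further strings (a trailing kVariable param), whose evaluator simply returns a placeholder. The name prop and the helper are illustrative only; the types come from function.h, param.h, and pending-value.h:

#include <utility>
#include <vector>

#include "icing/query/advanced_query_parser/function.h"
#include "icing/query/advanced_query_parser/param.h"
#include "icing/query/advanced_query_parser/pending-value.h"

namespace icing {
namespace lib {

// Hypothetical: prop("name", values...) with a trailing kVariable param.
libtextclassifier3::StatusOr<Function> MakePropFunction() {
  return Function::Create(
      /*return_type=*/DataType::kString, "prop",
      /*params=*/
      {Param(DataType::kString, Cardinality::kRequired),
       Param(DataType::kString, Cardinality::kVariable)},
      [](std::vector<PendingValue>&& args)
          -> libtextclassifier3::StatusOr<PendingValue> {
        // A real evaluator would inspect `args`; this one just succeeds.
        return PendingValue();
      });
}

}  // namespace lib
}  // namespace icing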
diff --git a/icing/query/advanced_query_parser/function.h b/icing/query/advanced_query_parser/function.h
new file mode 100644
index 0000000..3514878
--- /dev/null
+++ b/icing/query/advanced_query_parser/function.h
@@ -0,0 +1,66 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_FUNCTION_H_
+#define ICING_QUERY_ADVANCED_QUERY_PARSER_FUNCTION_H_
+
+#include <functional>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/query/advanced_query_parser/param.h"
+#include "icing/query/advanced_query_parser/pending-value.h"
+
+namespace icing {
+namespace lib {
+
+class Function {
+ public:
+ using EvalFunction = std::function<libtextclassifier3::StatusOr<PendingValue>(
+ std::vector<PendingValue>&&)>;
+
+ static libtextclassifier3::StatusOr<Function> Create(
+ DataType return_type, std::string name, std::vector<Param> params,
+ EvalFunction eval);
+
+ Function(const Function& rhs) = default;
+ Function(Function&& rhs) = default;
+
+ Function& operator=(const Function& rhs) = default;
+ Function& operator=(Function&& rhs) = default;
+
+ const std::string& name() const { return name_; }
+
+ libtextclassifier3::StatusOr<PendingValue> Eval(
+ std::vector<PendingValue>&& args) const;
+
+ private:
+ Function(DataType return_type, std::string name, std::vector<Param> params,
+ EvalFunction eval)
+ : name_(std::move(name)),
+ params_(std::move(params)),
+ eval_(std::move(eval)),
+ return_type_(return_type) {}
+
+ std::string name_;
+ std::vector<Param> params_;
+ EvalFunction eval_;
+ DataType return_type_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_ADVANCED_QUERY_PARSER_FUNCTION_H_
diff --git a/icing/query/advanced_query_parser/function_test.cc b/icing/query/advanced_query_parser/function_test.cc
new file mode 100644
index 0000000..afd4e04
--- /dev/null
+++ b/icing/query/advanced_query_parser/function_test.cc
@@ -0,0 +1,332 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "icing/query/advanced_query_parser/function.h"
+
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gtest/gtest.h"
+#include "icing/query/advanced_query_parser/param.h"
+#include "icing/query/advanced_query_parser/pending-value.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::IsTrue;
+
+struct TrivialEval {
+ libtextclassifier3::StatusOr<PendingValue> operator()(
+ const std::vector<PendingValue>&) const {
+ return PendingValue();
+ }
+};
+
+TEST(FunctionTest, NoParamCreateSucceeds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ Function function, Function::Create(/*return_type=*/DataType::kString,
+ "foo", /*params=*/{}, TrivialEval()));
+ // foo()
+ std::vector<PendingValue> empty_args;
+ ICING_ASSERT_OK_AND_ASSIGN(PendingValue val,
+ function.Eval(std::move(empty_args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+}
+
+TEST(FunctionTest, NoParamNonEmptyArgsFails) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ Function function, Function::Create(/*return_type=*/DataType::kString,
+ "foo", /*params=*/{}, TrivialEval()));
+
+ // foo()
+ std::vector<PendingValue> args;
+ args.push_back(PendingValue());
+ EXPECT_THAT(function.Eval(std::move(args)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(FunctionTest, ParamNotWrongTypeFails) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ Function function,
+ Function::Create(/*return_type=*/DataType::kString, "foo",
+ /*params=*/{Param(DataType::kString)}, TrivialEval()));
+ std::string_view query = "foo(bar)";
+ std::vector<PendingValue> args;
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"bar", query.substr(4, 3), /*is_prefix_val=*/false}));
+ EXPECT_THAT(function.Eval(std::move(args)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(FunctionTest, ParamRequiredArgSucceeds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ Function function,
+ Function::Create(/*return_type=*/DataType::kString, "foo",
+ /*params=*/{Param(DataType::kString)}, TrivialEval()));
+
+ std::string_view query = R"(foo("bar"))";
+ std::vector<PendingValue> args;
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false}));
+ ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+}
+
+TEST(FunctionTest, ParamRequiredArgNotPresentFails) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ Function function,
+ Function::Create(/*return_type=*/DataType::kString, "foo",
+ /*params=*/{Param(DataType::kString)}, TrivialEval()));
+
+ // foo()
+ std::vector<PendingValue> empty_args;
+ EXPECT_THAT(function.Eval(std::move(empty_args)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(FunctionTest, ParamOptionalArgNotPresentSucceeds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ Function function,
+ Function::Create(
+ /*return_type=*/DataType::kString, "foo",
+ /*params=*/{Param(DataType::kString, Cardinality::kOptional)},
+ TrivialEval()));
+
+ // foo()
+ std::vector<PendingValue> empty_args;
+ ICING_ASSERT_OK_AND_ASSIGN(PendingValue val,
+ function.Eval(std::move(empty_args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+}
+
+TEST(FunctionTest, ParamVariableArgNotPresentSucceeds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ Function function,
+ Function::Create(
+ /*return_type=*/DataType::kString, "foo",
+ /*params=*/{Param(DataType::kString, Cardinality::kVariable)},
+ TrivialEval()));
+
+ // foo()
+ std::vector<PendingValue> empty_args;
+ ICING_ASSERT_OK_AND_ASSIGN(PendingValue val,
+ function.Eval(std::move(empty_args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+}
+
+TEST(FunctionTest, MultipleArgsTrailingOptionalSucceeds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ Function function, Function::Create(
+ /*return_type=*/DataType::kString, "foo",
+ /*params=*/
+ {Param(DataType::kString, Cardinality::kRequired),
+ Param(DataType::kString, Cardinality::kOptional)},
+ TrivialEval()));
+
+ std::string_view query = R"(foo("bar"))";
+ std::vector<PendingValue> args;
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false}));
+ ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+
+ query = R"(foo("bar", "baz"))";
+ args = std::vector<PendingValue>();
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"baz", query.substr(12, 3), /*is_prefix_val=*/false}));
+ ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+}
+
+TEST(FunctionTest, MultipleArgsTrailingVariableSucceeds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ Function function, Function::Create(
+ /*return_type=*/DataType::kString, "foo",
+ /*params=*/
+ {Param(DataType::kString, Cardinality::kRequired),
+ Param(DataType::kString, Cardinality::kVariable)},
+ TrivialEval()));
+
+ std::string_view query = R"(foo("bar"))";
+ std::vector<PendingValue> args;
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false}));
+ ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+
+ query = R"(foo("bar", "baz"))";
+ args = std::vector<PendingValue>();
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"baz", query.substr(12, 3), /*is_prefix_val=*/false}));
+ ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+
+ query = R"(foo("bar", "baz", "bat"))";
+ args = std::vector<PendingValue>();
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"baz", query.substr(12, 3), /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bat", query.substr(19, 3), /*is_prefix_val=*/false}));
+ ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+}
+
+TEST(FunctionTest, MultipleArgsOptionalBeforeRequiredFails) {
+ EXPECT_THAT(Function::Create(
+ /*return_type=*/DataType::kString, "foo",
+ /*params=*/
+ {Param(DataType::kString, Cardinality::kOptional),
+ Param(DataType::kString, Cardinality::kRequired)},
+ TrivialEval()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(FunctionTest, MultipleArgsOptionalBeforeOptionalSucceeds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ Function function, Function::Create(
+ /*return_type=*/DataType::kString, "foo",
+ /*params=*/
+ {Param(DataType::kString, Cardinality::kOptional),
+ Param(DataType::kText, Cardinality::kOptional)},
+ TrivialEval()));
+
+ // foo()
+ std::vector<PendingValue> args;
+ ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+
+ std::string_view query = R"(foo("bar"))";
+ args = std::vector<PendingValue>();
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false}));
+ ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+
+ query = R"(foo("bar", baz))";
+ args = std::vector<PendingValue>();
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"baz", query.substr(11, 3), /*is_prefix_val=*/false}));
+ ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+
+ query = R"(foo(baz))";
+ args = std::vector<PendingValue>();
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"baz", query.substr(4, 3), /*is_prefix_val=*/false}));
+ EXPECT_THAT(function.Eval(std::move(args)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(FunctionTest, MultipleArgsOptionalBeforeVariableSucceeds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ Function function, Function::Create(
+ /*return_type=*/DataType::kString, "foo",
+ /*params=*/
+ {Param(DataType::kString, Cardinality::kOptional),
+ Param(DataType::kText, Cardinality::kVariable)},
+ TrivialEval()));
+
+ // foo()
+ std::vector<PendingValue> args;
+ ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+
+ std::string_view query = R"(foo("bar"))";
+ args = std::vector<PendingValue>();
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false}));
+ ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+
+ query = R"(foo("bar", baz))";
+ args = std::vector<PendingValue>();
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"baz", query.substr(11, 3), /*is_prefix_val=*/false}));
+ ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+
+ query = R"(foo("bar", baz, bat))";
+ args = std::vector<PendingValue>();
+ args.push_back(PendingValue::CreateStringPendingValue(
+ QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"baz", query.substr(11, 3), /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"bat", query.substr(16, 3), /*is_prefix_val=*/false}));
+ ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args)));
+ EXPECT_THAT(val.is_placeholder(), IsTrue());
+
+ query = R"(foo(baz))";
+ args = std::vector<PendingValue>();
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"baz", query.substr(4, 3), /*is_prefix_val=*/false}));
+ EXPECT_THAT(function.Eval(std::move(args)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ query = R"(foo(baz, bat))";
+ args = std::vector<PendingValue>();
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"baz", query.substr(4, 3), /*is_prefix_val=*/false}));
+ args.push_back(PendingValue::CreateTextPendingValue(
+ QueryTerm{"bat", query.substr(9, 3), /*is_prefix_val=*/false}));
+ EXPECT_THAT(function.Eval(std::move(args)),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(FunctionTest, MultipleArgsVariableBeforeRequiredFails) {
+ EXPECT_THAT(Function::Create(
+ /*return_type=*/DataType::kString, "foo",
+ /*params=*/
+ {Param(DataType::kString, Cardinality::kVariable),
+ Param(DataType::kString, Cardinality::kRequired)},
+ TrivialEval()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(FunctionTest, MultipleArgsVariableBeforeOptionalFails) {
+ EXPECT_THAT(Function::Create(
+ /*return_type=*/DataType::kString, "foo",
+ /*params=*/
+ {Param(DataType::kString, Cardinality::kVariable),
+ Param(DataType::kString, Cardinality::kOptional)},
+ TrivialEval()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(FunctionTest, MultipleArgsVariableBeforeVariableFails) {
+ EXPECT_THAT(Function::Create(
+ /*return_type=*/DataType::kString, "foo",
+ /*params=*/
+ {Param(DataType::kString, Cardinality::kVariable),
+ Param(DataType::kString, Cardinality::kVariable)},
+ TrivialEval()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
\ No newline at end of file
diff --git a/icing/query/advanced_query_parser/lexer.cc b/icing/query/advanced_query_parser/lexer.cc
new file mode 100644
index 0000000..0dd0bb0
--- /dev/null
+++ b/icing/query/advanced_query_parser/lexer.cc
@@ -0,0 +1,270 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/advanced_query_parser/lexer.h"
+
+#include <string>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/util/i18n-utils.h"
+
+namespace icing {
+namespace lib {
+
+bool Lexer::ConsumeWhitespace() {
+ if (current_char_ == '\0') {
+ return false;
+ }
+ if (i18n_utils::IsWhitespaceAt(query_, current_index_)) {
+ UChar32 uchar32 = i18n_utils::GetUChar32At(query_.data(), query_.length(),
+ current_index_);
+ int length = i18n_utils::GetUtf8Length(uchar32);
+ Advance(length);
+ return true;
+ }
+ return false;
+}
+
+bool Lexer::ConsumeQuerySingleChar() {
+ std::string_view original_text = query_.substr(current_index_, 1);
+ switch (current_char_) {
+ case ':':
+ tokens_.push_back({":", original_text, TokenType::COMPARATOR});
+ break;
+ case '*':
+ tokens_.push_back({"", original_text, TokenType::STAR});
+ break;
+ case '-':
+ if (in_text_) {
+ // MINUS ('-') is considered to be a part of a text segment if it is
+ // in the middle of a TEXT segment (ex. `foo-bar`).
+ return false;
+ }
+ tokens_.push_back({"", original_text, TokenType::MINUS});
+ break;
+ default:
+ return false;
+ }
+ Advance();
+ return true;
+}
+
+bool Lexer::ConsumeScoringSingleChar() {
+ std::string_view original_text = query_.substr(current_index_, 1);
+ switch (current_char_) {
+ case '+':
+ tokens_.push_back({"", original_text, TokenType::PLUS});
+ break;
+ case '*':
+ tokens_.push_back({"", original_text, TokenType::TIMES});
+ break;
+ case '/':
+ tokens_.push_back({"", original_text, TokenType::DIV});
+ break;
+ case '-':
+ tokens_.push_back({"", original_text, TokenType::MINUS});
+ break;
+ default:
+ return false;
+ }
+ Advance();
+ return true;
+}
+
+bool Lexer::ConsumeGeneralSingleChar() {
+ std::string_view original_text = query_.substr(current_index_, 1);
+ switch (current_char_) {
+ case ',':
+ tokens_.push_back({"", original_text, TokenType::COMMA});
+ break;
+ case '.':
+ tokens_.push_back({"", original_text, TokenType::DOT});
+ break;
+ case '(':
+ tokens_.push_back({"", original_text, TokenType::LPAREN});
+ break;
+ case ')':
+ tokens_.push_back({"", original_text, TokenType::RPAREN});
+ break;
+ default:
+ return false;
+ }
+ Advance();
+ return true;
+}
+
+bool Lexer::ConsumeSingleChar() {
+ if (language_ == Language::QUERY) {
+ if (ConsumeQuerySingleChar()) {
+ return true;
+ }
+ } else if (language_ == Language::SCORING) {
+ if (ConsumeScoringSingleChar()) {
+ return true;
+ }
+ }
+ return ConsumeGeneralSingleChar();
+}
+
+bool Lexer::ConsumeComparator() {
+ if (current_char_ != '<' && current_char_ != '>' && current_char_ != '!' &&
+ current_char_ != '=') {
+ return false;
+ }
+ // Now, current_char_ must be one of '<', '>', '!', or '='.
+ // Matching for '<=', '>=', '!=', or '=='.
+ char next_char = PeekNext(1);
+ if (next_char == '=') {
+ tokens_.push_back({{current_char_, next_char},
+ query_.substr(current_index_, 2),
+ TokenType::COMPARATOR});
+ Advance(2);
+ return true;
+ }
+ // Now, next_char must not be '='. Let's match for '<' and '>'.
+ if (current_char_ == '<' || current_char_ == '>') {
+ tokens_.push_back({{current_char_},
+ query_.substr(current_index_, 1),
+ TokenType::COMPARATOR});
+ Advance();
+ return true;
+ }
+ return false;
+}
+
+bool Lexer::ConsumeAndOr() {
+ if (current_char_ != '&' && current_char_ != '|') {
+ return false;
+ }
+ char next_char = PeekNext(1);
+ if (current_char_ != next_char) {
+ return false;
+ }
+ std::string_view original_text = query_.substr(current_index_, 2);
+ if (current_char_ == '&') {
+ tokens_.push_back({"", original_text, TokenType::AND});
+ } else {
+ tokens_.push_back({"", original_text, TokenType::OR});
+ }
+ Advance(2);
+ return true;
+}
+
+bool Lexer::ConsumeStringLiteral() {
+ if (current_char_ != '"') {
+ return false;
+ }
+ Advance();
+ int32_t unnormalized_start_pos = current_index_;
+ while (current_char_ != '\0' && current_char_ != '"') {
+ // When getting a backslash, we will always match the next character, even
+ // if the next character is a quotation mark.
+ if (current_char_ == '\\') {
+ Advance();
+ if (current_char_ == '\0') {
+ // In this case, we are missing a terminating quotation mark.
+ break;
+ }
+ }
+ Advance();
+ }
+ if (current_char_ == '\0') {
+ SyntaxError("missing terminating \" character");
+ return false;
+ }
+ int32_t unnormalized_length = current_index_ - unnormalized_start_pos;
+ std::string_view raw_token_text =
+ query_.substr(unnormalized_start_pos, unnormalized_length);
+ std::string token_text(raw_token_text);
+ tokens_.push_back({std::move(token_text), raw_token_text, TokenType::STRING});
+ Advance();
+ return true;
+}
+
+bool Lexer::ConsumeText() {
+ if (current_char_ == '\0') {
+ return false;
+ }
+ tokens_.push_back({"", query_.substr(current_index_, 0), TokenType::TEXT});
+ int token_index = tokens_.size() - 1;
+
+ int32_t unnormalized_start_pos = current_index_;
+ int32_t unnormalized_end_pos = current_index_;
+ while (!ConsumeNonText() && current_char_ != '\0') {
+ in_text_ = true;
+ // When getting a backslash in TEXT, unescape it by accepting the following
+ // character no matter what it is, including whitespace, operator symbols,
+ // parentheses, etc.
+ if (current_char_ == '\\') {
+ Advance();
+ if (current_char_ == '\0') {
+ SyntaxError("missing a escaping character after \\");
+ break;
+ }
+ }
+ tokens_[token_index].text.push_back(current_char_);
+ Advance();
+ unnormalized_end_pos = current_index_;
+ }
+ in_text_ = false;
+
+ tokens_[token_index].original_text = query_.substr(
+ unnormalized_start_pos, unnormalized_end_pos - unnormalized_start_pos);
+ if (unnormalized_end_pos < query_.length() &&
+ query_[unnormalized_end_pos] == '(') {
+ // A TEXT followed by a LPAREN is a FUNCTION_NAME.
+ tokens_[token_index].type = TokenType::FUNCTION_NAME;
+ }
+
+ if (language_ == Lexer::Language::QUERY) {
+ std::string &text = tokens_[token_index].text;
+ TokenType &type = tokens_[token_index].type;
+ if (text == "AND") {
+ text.clear();
+ type = TokenType::AND;
+ } else if (text == "OR") {
+ text.clear();
+ type = TokenType::OR;
+ } else if (text == "NOT") {
+ text.clear();
+ type = TokenType::NOT;
+ }
+ }
+ return true;
+}
+
+libtextclassifier3::StatusOr<std::vector<Lexer::LexerToken>>
+Lexer::ExtractTokens() {
+ while (current_char_ != '\0') {
+ // Clear out any non-text before matching a Text.
+ while (ConsumeNonText()) {
+ }
+ ConsumeText();
+ }
+ if (!error_.empty()) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("Syntax Error: ", error_));
+ }
+ if (tokens_.size() > kMaxNumTokens) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("The maximum number of tokens allowed is ",
+ std::to_string(kMaxNumTokens), ", but got ",
+ std::to_string(tokens_.size()), " tokens."));
+ }
+ return tokens_;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/advanced_query_parser/lexer.h b/icing/query/advanced_query_parser/lexer.h
new file mode 100644
index 0000000..b313fa7
--- /dev/null
+++ b/icing/query/advanced_query_parser/lexer.h
@@ -0,0 +1,169 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_
+#define ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+
+namespace icing {
+namespace lib {
+
+class Lexer {
+ public:
+ enum class Language { QUERY, SCORING };
+
+ // The maximum number of tokens allowed, in order to prevent stack overflow
+ // issues in the parsers or visitors.
+ static constexpr uint32_t kMaxNumTokens = 2048;
+
+ enum class TokenType {
+ COMMA, // ','
+ DOT, // '.'
+ PLUS, // '+' Not allowed in QUERY language.
+ MINUS, // '-'
+ STAR, // '*' Not allowed in SCORING language.
+ TIMES, // '*' Not allowed in QUERY language.
+ DIV, // '/' Not allowed in QUERY language.
+ LPAREN, // '('
+ RPAREN, // ')'
+ COMPARATOR, // '<=' | '<' | '>=' | '>' | '!=' | '==' | ':'
+ // Not allowed in SCORING language.
+ AND, // 'AND' | '&&' Not allowed in SCORING language.
+ OR, // 'OR' | '||' Not allowed in SCORING language.
+ NOT, // 'NOT' Not allowed in SCORING language.
+ STRING, // String literal surrounded by quotation marks. The
+ // original_text of a STRING token will not include quotation
+ // marks.
+ TEXT, // A sequence of chars that are not any above-listed operator
+ FUNCTION_NAME, // A TEXT followed by LPAREN.
+ // Whitespaces not inside a string literal will be skipped.
+ // WS: " " | "\t" | "\n" | "\r" | "\f" -> skip ;
+ };
+
+ struct LexerToken {
+ // For STRING, text will contain the raw original text of the token
+ // in between quotation marks, without unescaping.
+ //
+ // For TEXT, text will contain the text of the token after unescaping all
+ // escaped characters.
+ //
+ // For FUNCTION_NAME, this field will contain the name of the function.
+ //
+ // For COMPARATOR, this field will contain the comparator.
+ //
+ // For other types, this field will be empty.
+ std::string text;
+
+ // Lifecycle is dependent on the lifecycle of the string pointed to by
+ // query_.
+ std::string_view original_text;
+
+ // The type of the token.
+ TokenType type;
+ };
+
+ explicit Lexer(std::string_view query, Language language)
+ : query_(query), language_(language) {
+ Advance();
+ }
+
+ // Get a vector of LexerToken after lexing the query given in the constructor.
+ //
+ // Returns:
+ // A vector of LexerToken on success
+ // INVALID_ARGUMENT on syntax error.
+ libtextclassifier3::StatusOr<std::vector<LexerToken>> ExtractTokens();
+
+ private:
+ // Advance to current_index_ + n.
+ void Advance(uint32_t n = 1) {
+ if (current_index_ + n >= query_.size()) {
+ current_index_ = query_.size();
+ current_char_ = '\0';
+ } else {
+ current_index_ += n;
+ current_char_ = query_[current_index_];
+ }
+ }
+
+ // Get the character at current_index_ + n.
+ char PeekNext(uint32_t n = 1) {
+ if (current_index_ + n >= query_.size()) {
+ return '\0';
+ } else {
+ return query_[current_index_ + n];
+ }
+ }
+
+ void SyntaxError(std::string error) {
+ current_index_ = query_.size();
+ current_char_ = '\0';
+ error_ = std::move(error);
+ }
+
+ // Try to match a whitespace token and skip it.
+ bool ConsumeWhitespace();
+
+ // Try to match a single-char token other than '<' and '>'.
+ bool ConsumeSingleChar();
+ bool ConsumeQuerySingleChar();
+ bool ConsumeScoringSingleChar();
+ bool ConsumeGeneralSingleChar();
+
+ // Try to match a comparator token other than ':'.
+ bool ConsumeComparator();
+
+ // Try to match '&&' and '||'.
+ // 'AND' and 'OR' will be handled in ConsumeText() instead, so that
+ // 'ANDfoo' and 'fooOR' are treated as TEXT rather than as 'AND' or 'OR'.
+ bool ConsumeAndOr();
+
+ // Try to match a string literal.
+ bool ConsumeStringLiteral();
+
+ // Try to match a non-text.
+ bool ConsumeNonText() {
+ return ConsumeWhitespace() || ConsumeSingleChar() ||
+ (language_ == Language::QUERY && ConsumeComparator()) ||
+ (language_ == Language::QUERY && ConsumeAndOr()) ||
+ ConsumeStringLiteral();
+ }
+
+ // Try to match TEXT, FUNCTION_NAME, 'AND', 'OR' and 'NOT'.
+ // REQUIRES: ConsumeNonText() must be called immediately before calling this
+ // function.
+ bool ConsumeText();
+
+ std::string_view query_;
+ std::string error_;
+ Language language_;
+ int32_t current_index_ = -1;
+ char current_char_ = '\0';
+ std::vector<LexerToken> tokens_;
+
+ // Stores whether the lexer is currently inspecting a TEXT segment while
+ // handling current_char_.
+ bool in_text_ = false;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_
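A hypothetical end-to-end usage sketch for the lexer, following the ExtractTokens() contract documented above. With the QUERY language, a query like "price > 10" lexes to TEXT("price"), COMPARATOR(">"), TEXT("10"). The wrapper name is illustrative only:

#include <string_view>
#include <vector>

#include "icing/query/advanced_query_parser/lexer.h"

namespace icing {
namespace lib {

// Lexes `query` with the QUERY language rules.
libtextclassifier3::StatusOr<std::vector<Lexer::LexerToken>> TokenizeQuery(
    std::string_view query) {
  Lexer lexer(query, Lexer::Language::QUERY);
  // Returns INVALID_ARGUMENT on syntax errors or too many tokens.
  return lexer.ExtractTokens();
}

}  // namespace lib
}  // namespace icing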
diff --git a/icing/query/advanced_query_parser/lexer_fuzz_test.cc b/icing/query/advanced_query_parser/lexer_fuzz_test.cc
new file mode 100644
index 0000000..f9190db
--- /dev/null
+++ b/icing/query/advanced_query_parser/lexer_fuzz_test.cc
@@ -0,0 +1,37 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <memory>
+#include <string_view>
+
+#include "icing/query/advanced_query_parser/lexer.h"
+
+namespace icing {
+namespace lib {
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ std::string_view text(reinterpret_cast<const char*>(data), size);
+
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>(text, Lexer::Language::QUERY);
+ lexer->ExtractTokens();
+
+ lexer = std::make_unique<Lexer>(text, Lexer::Language::SCORING);
+ lexer->ExtractTokens();
+ return 0;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/advanced_query_parser/lexer_test.cc b/icing/query/advanced_query_parser/lexer_test.cc
new file mode 100644
index 0000000..ec0e663
--- /dev/null
+++ b/icing/query/advanced_query_parser/lexer_test.cc
@@ -0,0 +1,698 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/advanced_query_parser/lexer.h"
+
+#include <memory>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+using ::testing::ElementsAre;
+
+MATCHER_P2(EqualsLexerToken, text, type, "") {
+ const Lexer::LexerToken& actual = arg;
+ *result_listener << "actual is {text=" << actual.text
+ << ", type=" << static_cast<int>(actual.type)
+ << "}, but expected was {text=" << text
+ << ", type=" << static_cast<int>(type) << "}.";
+ return actual.text == text && actual.type == type;
+}
+
+MATCHER_P(EqualsLexerToken, type, "") {
+ const Lexer::LexerToken& actual = arg;
+ *result_listener << "actual is {text=" << actual.text
+ << ", type=" << static_cast<int>(actual.type)
+ << "}, but expected was {text=(empty), type="
+ << static_cast<int>(type) << "}.";
+ return actual.text.empty() && actual.type == type;
+}
+
+TEST(LexerTest, SimpleQuery) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("foo", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("fooAND", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("fooAND", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("ORfoo", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("ORfoo", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("fooANDbar", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens, ElementsAre(EqualsLexerToken("fooANDbar",
+ Lexer::TokenType::TEXT)));
+}
+
+TEST(LexerTest, PrefixQuery) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("foo*", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken("", Lexer::TokenType::STAR)));
+
+ lexer = std::make_unique<Lexer>("fooAND*", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("fooAND", Lexer::TokenType::TEXT),
+ EqualsLexerToken("", Lexer::TokenType::STAR)));
+
+ lexer = std::make_unique<Lexer>("*ORfoo", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("", Lexer::TokenType::STAR),
+ EqualsLexerToken("ORfoo", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("fooANDbar*", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("fooANDbar", Lexer::TokenType::TEXT),
+ EqualsLexerToken("", Lexer::TokenType::STAR)));
+}
+
+TEST(LexerTest, SimpleStringQuery) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("\"foo\"", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::STRING)));
+
+ lexer = std::make_unique<Lexer>("\"fooAND\"", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens, ElementsAre(EqualsLexerToken("fooAND",
+ Lexer::TokenType::STRING)));
+
+ lexer = std::make_unique<Lexer>("\"ORfoo\"", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("ORfoo", Lexer::TokenType::STRING)));
+
+ lexer = std::make_unique<Lexer>("\"fooANDbar\"", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens, ElementsAre(EqualsLexerToken("fooANDbar",
+ Lexer::TokenType::STRING)));
+}
+
+TEST(LexerTest, TwoTermQuery) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("foo AND bar", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("bar", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("foo && bar", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("bar", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("foo&&bar", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("bar", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("foo OR \"bar\"", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::OR),
+ EqualsLexerToken("bar", Lexer::TokenType::STRING)));
+}
+
+TEST(LexerTest, QueryWithSpecialSymbol) {
+ // With escaping
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("foo\\ \\&\\&bar", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens, ElementsAre(EqualsLexerToken("foo &&bar",
+ Lexer::TokenType::TEXT)));
+ lexer = std::make_unique<Lexer>("foo\\&\\&bar&&baz", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo&&bar", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("baz", Lexer::TokenType::TEXT)));
+ lexer = std::make_unique<Lexer>("foo\\\"", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo\"", Lexer::TokenType::TEXT)));
+
+ // With quotation marks
+ lexer = std::make_unique<Lexer>("\"foo &&bar\"", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens, ElementsAre(EqualsLexerToken("foo &&bar",
+ Lexer::TokenType::STRING)));
+ lexer = std::make_unique<Lexer>("\"foo&&bar\"&&baz", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(
+ tokens,
+ ElementsAre(EqualsLexerToken("foo&&bar", Lexer::TokenType::STRING),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("baz", Lexer::TokenType::TEXT)));
+ lexer = std::make_unique<Lexer>("\"foo\\\"\"", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens, ElementsAre(EqualsLexerToken("foo\\\"",
+ Lexer::TokenType::STRING)));
+}
+
+TEST(LexerTest, TextInStringShouldBeOriginal) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("\"foo\\nbar\"", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens, ElementsAre(EqualsLexerToken("foo\\nbar",
+ Lexer::TokenType::STRING)));
+}
+
+TEST(LexerTest, QueryWithFunctionCalls) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("foo AND fun(bar)", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(
+ tokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("fun", Lexer::TokenType::FUNCTION_NAME),
+ EqualsLexerToken(Lexer::TokenType::LPAREN),
+ EqualsLexerToken("bar", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::RPAREN)));
+
+ // Not a function call
+ lexer = std::make_unique<Lexer>("foo AND fun (bar)", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("fun", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::LPAREN),
+ EqualsLexerToken("bar", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::RPAREN)));
+}
+
+TEST(LexerTest, QueryWithComparator) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("name: foo", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("name", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("foo", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("email.name:foo", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("email", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::DOT),
+ EqualsLexerToken("name", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("foo", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("age > 20", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("age", Lexer::TokenType::TEXT),
+ EqualsLexerToken(">", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("20", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("age>=20", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("age", Lexer::TokenType::TEXT),
+ EqualsLexerToken(">=", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("20", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("age <20", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("age", Lexer::TokenType::TEXT),
+ EqualsLexerToken("<", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("20", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("age<= 20", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("age", Lexer::TokenType::TEXT),
+ EqualsLexerToken("<=", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("20", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("age == 20", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("age", Lexer::TokenType::TEXT),
+ EqualsLexerToken("==", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("20", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("age != 20", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("age", Lexer::TokenType::TEXT),
+ EqualsLexerToken("!=", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("20", Lexer::TokenType::TEXT)));
+}
+
+TEST(LexerTest, ComplexQuery) {
+ std::unique_ptr<Lexer> lexer = std::make_unique<Lexer>(
+ "email.sender: (foo* AND bar OR pow(age, 2)>100) || (-baz foo) && "
+ "NOT verbatimSearch(\"hello world\")",
+ Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(
+ tokens,
+ ElementsAre(
+ EqualsLexerToken("email", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::DOT),
+ EqualsLexerToken("sender", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken(Lexer::TokenType::LPAREN),
+ EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken("", Lexer::TokenType::STAR),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("bar", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::OR),
+ EqualsLexerToken("pow", Lexer::TokenType::FUNCTION_NAME),
+ EqualsLexerToken(Lexer::TokenType::LPAREN),
+ EqualsLexerToken("age", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::COMMA),
+ EqualsLexerToken("2", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::RPAREN),
+ EqualsLexerToken(">", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("100", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::RPAREN),
+ EqualsLexerToken(Lexer::TokenType::OR),
+ EqualsLexerToken(Lexer::TokenType::LPAREN),
+ EqualsLexerToken(Lexer::TokenType::MINUS),
+ EqualsLexerToken("baz", Lexer::TokenType::TEXT),
+ EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::RPAREN),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken(Lexer::TokenType::NOT),
+ EqualsLexerToken("verbatimSearch", Lexer::TokenType::FUNCTION_NAME),
+ EqualsLexerToken(Lexer::TokenType::LPAREN),
+ EqualsLexerToken("hello world", Lexer::TokenType::STRING),
+ EqualsLexerToken(Lexer::TokenType::RPAREN)));
+}
+
+TEST(LexerTest, UTF8WhiteSpace) {
+ std::unique_ptr<Lexer> lexer = std::make_unique<Lexer>(
+ "\xe2\x80\x88"
+ "foo"
+ "\xe2\x80\x89"
+ "\xe2\x80\x89"
+ "bar"
+ "\xe2\x80\x8a",
+ Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken("bar", Lexer::TokenType::TEXT)));
+}
+
+TEST(LexerTest, CJKT) {
+ std::unique_ptr<Lexer> lexer = std::make_unique<Lexer>(
+ "我 && 每天 || 走路 OR 去 -上班", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("我", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("每天", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::OR),
+ EqualsLexerToken("走路", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::OR),
+ EqualsLexerToken("去", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::MINUS),
+ EqualsLexerToken("上班", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("私&& は ||毎日 AND 仕事 -に 歩い て い ます",
+ Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("私", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("は", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::OR),
+ EqualsLexerToken("毎日", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("仕事", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::MINUS),
+ EqualsLexerToken("に", Lexer::TokenType::TEXT),
+ EqualsLexerToken("歩い", Lexer::TokenType::TEXT),
+ EqualsLexerToken("て", Lexer::TokenType::TEXT),
+ EqualsLexerToken("い", Lexer::TokenType::TEXT),
+ EqualsLexerToken("ます", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("ញុំ&&ដើរទៅ||ធ្វើការ-រាល់ថ្ងៃ",
+ Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(
+ tokens,
+ ElementsAre(EqualsLexerToken("ញុំ", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::AND),
+ EqualsLexerToken("ដើរទៅ", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::OR),
+ EqualsLexerToken("ធ្វើការ-រាល់ថ្ងៃ", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>(
+ "나는"
+ "\xe2\x80\x88" // White Space
+ "매일"
+ "\xe2\x80\x89" // White Space
+ "출근합니다",
+ Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(
+ tokens,
+ ElementsAre(EqualsLexerToken("나는", Lexer::TokenType::TEXT),
+ EqualsLexerToken("매일", Lexer::TokenType::TEXT),
+ EqualsLexerToken("출근합니다", Lexer::TokenType::TEXT)));
+}
+
+TEST(LexerTest, SyntaxError) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("\"foo", Lexer::Language::QUERY);
+ EXPECT_THAT(lexer->ExtractTokens(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ lexer = std::make_unique<Lexer>("\"foo\\", Lexer::Language::QUERY);
+ EXPECT_THAT(lexer->ExtractTokens(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ lexer = std::make_unique<Lexer>("foo\\", Lexer::Language::QUERY);
+ EXPECT_THAT(lexer->ExtractTokens(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+// "!", "=", "&" and "|" should be treated as valid symbols in TEXT, if not
+// matched as "!=", "==", "&&", or "||".
+TEST(LexerTest, SpecialSymbolAsText) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("age=20", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("age=20", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("age !20", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("age", Lexer::TokenType::TEXT),
+ EqualsLexerToken("!20", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("foo& bar", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo&", Lexer::TokenType::TEXT),
+ EqualsLexerToken("bar", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("foo | bar", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken("|", Lexer::TokenType::TEXT),
+ EqualsLexerToken("bar", Lexer::TokenType::TEXT)));
+}
+
+TEST(LexerTest, ScoringArithmetic) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("1 + 2", Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("1", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::PLUS),
+ EqualsLexerToken("2", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("1+2*3/4", Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("1", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::PLUS),
+ EqualsLexerToken("2", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::TIMES),
+ EqualsLexerToken("3", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::DIV),
+ EqualsLexerToken("4", Lexer::TokenType::TEXT)));
+
+  // Arithmetic operators are not produced in the query language.
+ lexer = std::make_unique<Lexer>("1 + 2", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("1", Lexer::TokenType::TEXT),
+ EqualsLexerToken("+", Lexer::TokenType::TEXT),
+ EqualsLexerToken("2", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("1+2*3/4", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("1+2", Lexer::TokenType::TEXT),
+ EqualsLexerToken("", Lexer::TokenType::STAR),
+ EqualsLexerToken("3/4", Lexer::TokenType::TEXT)));
+}
+
+// Currently, in the scoring language, the lexer treats these logic operators
+// as TEXTs. In the future, they may be rejected instead.
+TEST(LexerTest, LogicOperatorNotInScoring) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("1 && 2", Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("1", Lexer::TokenType::TEXT),
+ EqualsLexerToken("&&", Lexer::TokenType::TEXT),
+ EqualsLexerToken("2", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("1&&2", Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("1&&2", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("1&&2 ||3", Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("1&&2", Lexer::TokenType::TEXT),
+ EqualsLexerToken("||3", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("1 AND 2 OR 3 AND NOT 4",
+ Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("1", Lexer::TokenType::TEXT),
+ EqualsLexerToken("AND", Lexer::TokenType::TEXT),
+ EqualsLexerToken("2", Lexer::TokenType::TEXT),
+ EqualsLexerToken("OR", Lexer::TokenType::TEXT),
+ EqualsLexerToken("3", Lexer::TokenType::TEXT),
+ EqualsLexerToken("AND", Lexer::TokenType::TEXT),
+ EqualsLexerToken("NOT", Lexer::TokenType::TEXT),
+ EqualsLexerToken("4", Lexer::TokenType::TEXT)));
+}
+
+TEST(LexerTest, ComparatorNotInScoring) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("1 > 2", Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("1", Lexer::TokenType::TEXT),
+ EqualsLexerToken(">", Lexer::TokenType::TEXT),
+ EqualsLexerToken("2", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("1>2", Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("1>2", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("1>2>=3 <= 4:5== 6<7<=8!= 9",
+ Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("1>2>=3", Lexer::TokenType::TEXT),
+ EqualsLexerToken("<=", Lexer::TokenType::TEXT),
+ EqualsLexerToken("4:5==", Lexer::TokenType::TEXT),
+ EqualsLexerToken("6<7<=8!=", Lexer::TokenType::TEXT),
+ EqualsLexerToken("9", Lexer::TokenType::TEXT)));
+
+  // Comparators are produced in the query language.
+ lexer = std::make_unique<Lexer>("1>2>=3 <= 4:5== 6<7<=8!= 9",
+ Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(tokens, lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("1", Lexer::TokenType::TEXT),
+ EqualsLexerToken(">", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("2", Lexer::TokenType::TEXT),
+ EqualsLexerToken(">=", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("3", Lexer::TokenType::TEXT),
+ EqualsLexerToken("<=", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("4", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("5", Lexer::TokenType::TEXT),
+ EqualsLexerToken("==", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("6", Lexer::TokenType::TEXT),
+ EqualsLexerToken("<", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("7", Lexer::TokenType::TEXT),
+ EqualsLexerToken("<=", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("8", Lexer::TokenType::TEXT),
+ EqualsLexerToken("!=", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("9", Lexer::TokenType::TEXT)));
+}
+
+TEST(LexerTest, ComplexScoring) {
+ std::unique_ptr<Lexer> lexer = std::make_unique<Lexer>(
+ "1/log( (CreationTimestamp(document) + LastUsedTimestamp(document)) / 2 "
+ ") * pow(2.3, DocumentScore())",
+ Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(
+ tokens,
+ ElementsAre(
+ EqualsLexerToken("1", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::DIV),
+ EqualsLexerToken("log", Lexer::TokenType::FUNCTION_NAME),
+ EqualsLexerToken(Lexer::TokenType::LPAREN),
+ EqualsLexerToken(Lexer::TokenType::LPAREN),
+ EqualsLexerToken("CreationTimestamp",
+ Lexer::TokenType::FUNCTION_NAME),
+ EqualsLexerToken(Lexer::TokenType::LPAREN),
+ EqualsLexerToken("document", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::RPAREN),
+ EqualsLexerToken(Lexer::TokenType::PLUS),
+ EqualsLexerToken("LastUsedTimestamp",
+ Lexer::TokenType::FUNCTION_NAME),
+ EqualsLexerToken(Lexer::TokenType::LPAREN),
+ EqualsLexerToken("document", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::RPAREN),
+ EqualsLexerToken(Lexer::TokenType::RPAREN),
+ EqualsLexerToken(Lexer::TokenType::DIV),
+ EqualsLexerToken("2", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::RPAREN),
+ EqualsLexerToken(Lexer::TokenType::TIMES),
+ EqualsLexerToken("pow", Lexer::TokenType::FUNCTION_NAME),
+ EqualsLexerToken(Lexer::TokenType::LPAREN),
+ EqualsLexerToken("2", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::DOT),
+ EqualsLexerToken("3", Lexer::TokenType::TEXT),
+ EqualsLexerToken(Lexer::TokenType::COMMA),
+ EqualsLexerToken("DocumentScore", Lexer::TokenType::FUNCTION_NAME),
+ EqualsLexerToken(Lexer::TokenType::LPAREN),
+ EqualsLexerToken(Lexer::TokenType::RPAREN),
+ EqualsLexerToken(Lexer::TokenType::RPAREN)));
+}
+
+// foo:bar:baz is considered an invalid query as proposed in
+// http://go/appsearch-advanced-query-impl-plan#bookmark=id.yoeyepokmbc5 ; this
+// ensures that the lexer consistently tokenizes colons independently.
+TEST(LexerTest, NoAmbiguousTokenizing) {
+ // This is an invalid query; the lexer doesn't treat `bar:baz` as one token.
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("foo:bar:baz", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> invalidQueryTokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(invalidQueryTokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("bar", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("baz", Lexer::TokenType::TEXT)));
+
+ lexer = std::make_unique<Lexer>("foo:\"bar:baz\"", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> validQueryTokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(
+ validQueryTokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("bar:baz", Lexer::TokenType::STRING)));
+}
+
+TEST(LexerTest, WhiteSpacesDoNotAffectColonTokenization) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("a:b c : d e: f g :h", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("a", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("b", Lexer::TokenType::TEXT),
+ EqualsLexerToken("c", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("d", Lexer::TokenType::TEXT),
+ EqualsLexerToken("e", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("f", Lexer::TokenType::TEXT),
+ EqualsLexerToken("g", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("h", Lexer::TokenType::TEXT)));
+}
+
+// For the "bar:baz" part to be treated as a TEXT token in a query like
+// foo:bar:baz, an explicit escape is required, so use foo:bar\:baz instead.
+TEST(LexerTest, ColonInTextRequiresExplicitEscaping) {
+ std::unique_ptr<Lexer> lexer =
+ std::make_unique<Lexer>("foo:bar\\:baz", Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> tokens,
+ lexer->ExtractTokens());
+ EXPECT_THAT(tokens,
+ ElementsAre(EqualsLexerToken("foo", Lexer::TokenType::TEXT),
+ EqualsLexerToken(":", Lexer::TokenType::COMPARATOR),
+ EqualsLexerToken("bar:baz", Lexer::TokenType::TEXT)));
+}
+
+TEST(LexerTest, QueryShouldRejectTokensBeyondLimit) {
+ std::string query;
+ for (int i = 0; i < Lexer::kMaxNumTokens + 1; ++i) {
+ query.push_back('(');
+ }
+ Lexer lexer(query, Lexer::Language::QUERY);
+ EXPECT_THAT(lexer.ExtractTokens(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(LexerTest, ScoringShouldRejectTokensBeyondLimit) {
+ std::string scoring;
+ for (int i = 0; i < Lexer::kMaxNumTokens + 1; ++i) {
+ scoring.push_back('(');
+ }
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ EXPECT_THAT(lexer.ExtractTokens(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/advanced_query_parser/param.h b/icing/query/advanced_query_parser/param.h
new file mode 100644
index 0000000..69c46be
--- /dev/null
+++ b/icing/query/advanced_query_parser/param.h
@@ -0,0 +1,57 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_PARAM_H_
+#define ICING_QUERY_ADVANCED_QUERY_PARSER_PARAM_H_
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/query/advanced_query_parser/pending-value.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+enum class Cardinality {
+ kRequired,
+ kOptional,
+ kVariable,
+};
+
+struct Param {
+ explicit Param(DataType data_type,
+ Cardinality cardinality = Cardinality::kRequired)
+ : data_type(data_type), cardinality(cardinality) {}
+
+ libtextclassifier3::Status Matches(PendingValue& arg) const {
+ bool matches = arg.data_type() == data_type;
+ // Values of type kText could also potentially be valid kLong values. If
+ // we're expecting a kLong and we have a kText, try to parse it as a kLong.
+ if (!matches && data_type == DataType::kLong &&
+ arg.data_type() == DataType::kText) {
+ ICING_RETURN_IF_ERROR(arg.ParseInt());
+ matches = true;
+ }
+ return matches ? libtextclassifier3::Status::OK
+ : absl_ports::InvalidArgumentError(
+ "Provided arg doesn't match required param type.");
+ }
+
+ DataType data_type;
+ Cardinality cardinality;
+};
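+
+// Illustrative note: Param(DataType::kLong).Matches(arg) accepts an arg of
+// type kLong directly, and also an arg of type kText whose content parses as
+// an integer (via PendingValue::ParseInt()).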
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_ADVANCED_QUERY_PARSER_PARAM_H_
diff --git a/icing/query/advanced_query_parser/parser.cc b/icing/query/advanced_query_parser/parser.cc
new file mode 100644
index 0000000..82576a1
--- /dev/null
+++ b/icing/query/advanced_query_parser/parser.cc
@@ -0,0 +1,449 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/advanced_query_parser/parser.h"
+
+#include <memory>
+#include <string_view>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
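+// Returns nullptr when `operands` is empty, the sole operand when there is
+// exactly one, and an NaryOperatorNode wrapping all operands otherwise.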
+std::unique_ptr<Node> CreateNaryNode(
+ std::string_view operator_text,
+ std::vector<std::unique_ptr<Node>>&& operands) {
+ if (operands.empty()) {
+ return nullptr;
+ }
+ if (operands.size() == 1) {
+ return std::move(operands.at(0));
+ }
+ return std::make_unique<NaryOperatorNode>(std::string(operator_text),
+ std::move(operands));
+}
+
+} // namespace
+
+libtextclassifier3::Status Parser::Consume(Lexer::TokenType token_type) {
+ if (!Match(token_type)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Unable to consume token %d.", static_cast<int>(token_type)));
+ }
+ ++current_token_;
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<TextNode>> Parser::ConsumeText() {
+ if (!Match(Lexer::TokenType::TEXT)) {
+ return absl_ports::InvalidArgumentError("Unable to consume token as TEXT.");
+ }
+ auto text_node = std::make_unique<TextNode>(std::move(current_token_->text),
+ current_token_->original_text);
+ ++current_token_;
+ return text_node;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<FunctionNameNode>>
+Parser::ConsumeFunctionName() {
+ if (!Match(Lexer::TokenType::FUNCTION_NAME)) {
+ return absl_ports::InvalidArgumentError(
+ "Unable to consume token as FUNCTION_NAME.");
+ }
+ auto function_name_node =
+ std::make_unique<FunctionNameNode>(std::move(current_token_->text));
+ ++current_token_;
+ return function_name_node;
+}
+
+// stringElement
+// : STRING STAR?
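+// For example, the query segment "bar"* produces a StringNode marked as a
+// prefix.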
+libtextclassifier3::StatusOr<std::unique_ptr<StringNode>>
+Parser::ConsumeStringElement() {
+ if (!Match(Lexer::TokenType::STRING)) {
+ return absl_ports::InvalidArgumentError(
+ "Unable to consume token as STRING.");
+ }
+ std::string text = std::move(current_token_->text);
+ std::string_view raw_text = current_token_->original_text;
+ ++current_token_;
+
+ bool is_prefix = false;
+ if (Match(Lexer::TokenType::STAR)) {
+ is_prefix = true;
+ ++current_token_;
+ }
+
+ return std::make_unique<StringNode>(std::move(text), raw_text, is_prefix);
+}
+
+libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() {
+ if (!Match(Lexer::TokenType::COMPARATOR)) {
+ return absl_ports::InvalidArgumentError(
+ "Unable to consume token as COMPARATOR.");
+ }
+ std::string comparator = std::move(current_token_->text);
+ ++current_token_;
+ return comparator;
+}
+
+// member
+// : TEXT (DOT TEXT)* (DOT function)?
+// | TEXT STAR
+// ;
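+// For example, "sender.name" yields a member node with two TEXT children,
+// while "foo*" yields a member node with a single prefix TEXT child.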
+libtextclassifier3::StatusOr<std::unique_ptr<MemberNode>>
+Parser::ConsumeMember() {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<TextNode> text_node, ConsumeText());
+ std::vector<std::unique_ptr<TextNode>> children;
+
+  // A member could be either `TEXT (DOT TEXT)* (DOT function)?` or `TEXT STAR`
+  // at this point, so check for 'STAR' to differentiate the two cases.
+ if (Match(Lexer::TokenType::STAR)) {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::STAR));
+ std::string_view raw_text = text_node->raw_value();
+ std::string text = std::move(*text_node).value();
+ text_node = std::make_unique<TextNode>(std::move(text), raw_text,
+ /*is_prefix=*/true);
+ children.push_back(std::move(text_node));
+ } else {
+ children.push_back(std::move(text_node));
+ while (Match(Lexer::TokenType::DOT)) {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::DOT));
+ if (MatchFunction()) {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNode> function_node,
+ ConsumeFunction());
+ // Once a function is matched, we should exit the current rule based on
+ // the grammar.
+ return std::make_unique<MemberNode>(std::move(children),
+ std::move(function_node));
+ }
+ ICING_ASSIGN_OR_RETURN(text_node, ConsumeText());
+ children.push_back(std::move(text_node));
+ }
+ }
+ return std::make_unique<MemberNode>(std::move(children),
+ /*function=*/nullptr);
+}
+
+// function
+// : FUNCTION_NAME LPAREN argList? RPAREN
+// ;
+libtextclassifier3::StatusOr<std::unique_ptr<FunctionNode>>
+Parser::ConsumeFunction() {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNameNode> function_name,
+ ConsumeFunctionName());
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN));
+
+ std::vector<std::unique_ptr<Node>> args;
+ if (Match(Lexer::TokenType::RPAREN)) {
+    // Got an empty argument list.
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
+ } else {
+ ICING_ASSIGN_OR_RETURN(args, ConsumeArgs());
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
+ }
+ return std::make_unique<FunctionNode>(std::move(function_name),
+ std::move(args));
+}
+
+// comparable
+// : stringElement
+// | member
+// | function
+// ;
+libtextclassifier3::StatusOr<std::unique_ptr<Node>>
+Parser::ConsumeComparable() {
+ if (Match(Lexer::TokenType::STRING)) {
+ return ConsumeStringElement();
+ } else if (MatchMember()) {
+ return ConsumeMember();
+ }
+  // The current token sequence isn't a STRING or a member, so the only
+  // remaining alternative is a function; ConsumeFunction() will return an
+  // INVALID_ARGUMENT error if it isn't one.
+ return ConsumeFunction();
+}
+
+// composite
+// : LPAREN expression RPAREN
+// ;
+libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeComposite() {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN));
+
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> expression, ConsumeExpression());
+
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN));
+ return expression;
+}
+
+// argList
+// : expression (COMMA expression)*
+// ;
+libtextclassifier3::StatusOr<std::vector<std::unique_ptr<Node>>>
+Parser::ConsumeArgs() {
+ std::vector<std::unique_ptr<Node>> args;
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> arg, ConsumeExpression());
+ args.push_back(std::move(arg));
+ while (Match(Lexer::TokenType::COMMA)) {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::COMMA));
+ ICING_ASSIGN_OR_RETURN(arg, ConsumeExpression());
+ args.push_back(std::move(arg));
+ }
+ return args;
+}
+
+// restriction
+// : comparable (COMPARATOR MINUS? (comparable | composite))?
+// ;
+// COMPARATOR tokens are not produced by the scoring lexer.
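+// For example, "subject:foo" produces an n-ary ':' node whose two children
+// are the 'subject' member and the 'foo' member.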
+libtextclassifier3::StatusOr<std::unique_ptr<Node>>
+Parser::ConsumeRestriction() {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> comparable, ConsumeComparable());
+
+ if (!Match(Lexer::TokenType::COMPARATOR)) {
+ return comparable;
+ }
+ ICING_ASSIGN_OR_RETURN(std::string operator_text, ConsumeComparator());
+
+ bool has_minus = Match(Lexer::TokenType::MINUS);
+ if (has_minus) {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
+ }
+
+ std::unique_ptr<Node> arg;
+ if (MatchComposite()) {
+ ICING_ASSIGN_OR_RETURN(arg, ConsumeComposite());
+ } else if (MatchComparable()) {
+ ICING_ASSIGN_OR_RETURN(arg, ConsumeComparable());
+ } else {
+ return absl_ports::InvalidArgumentError(
+ "ARG: must begin with LPAREN or FIRST(comparable)");
+ }
+
+ if (has_minus) {
+ arg = std::make_unique<UnaryOperatorNode>("MINUS", std::move(arg));
+ }
+
+ std::vector<std::unique_ptr<Node>> args;
+ args.push_back(std::move(comparable));
+ args.push_back(std::move(arg));
+ return std::make_unique<NaryOperatorNode>(std::move(operator_text),
+ std::move(args));
+}
+
+// simple
+// : restriction
+// | composite
+// ;
+libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSimple() {
+ if (MatchComposite()) {
+ return ConsumeComposite();
+ } else if (MatchRestriction()) {
+ return ConsumeRestriction();
+ }
+ return absl_ports::InvalidArgumentError(
+ "SIMPLE: must be a restriction or composite");
+}
+
+// term
+// : NOT? simple
+// | MINUS simple
+// ;
+// NOT tokens are not produced by the scoring lexer.
+libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeTerm() {
+ if (!Match(Lexer::TokenType::NOT) && !Match(Lexer::TokenType::MINUS)) {
+ return ConsumeSimple();
+ }
+ std::string operator_text;
+ if (language_ == Lexer::Language::SCORING) {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
+ operator_text = "MINUS";
+ } else {
+ if (Match(Lexer::TokenType::NOT)) {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::NOT));
+ operator_text = "NOT";
+ } else {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
+ operator_text = "MINUS";
+ }
+ }
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> simple, ConsumeSimple());
+ return std::make_unique<UnaryOperatorNode>(operator_text, std::move(simple));
+}
+
+// factor
+// : term (OR term)*
+// ;
+libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeFactor() {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> term, ConsumeTerm());
+ std::vector<std::unique_ptr<Node>> terms;
+ terms.push_back(std::move(term));
+
+ while (Match(Lexer::TokenType::OR)) {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::OR));
+ ICING_ASSIGN_OR_RETURN(term, ConsumeTerm());
+ terms.push_back(std::move(term));
+ }
+
+ return CreateNaryNode("OR", std::move(terms));
+}
+
+// sequence
+// : (factor)+
+// ;
+libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSequence() {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> factor, ConsumeFactor());
+ std::vector<std::unique_ptr<Node>> factors;
+ factors.push_back(std::move(factor));
+
+ while (MatchFactor()) {
+ ICING_ASSIGN_OR_RETURN(factor, ConsumeFactor());
+ factors.push_back(std::move(factor));
+ }
+
+ return CreateNaryNode("AND", std::move(factors));
+}
+
+// expression
+// : sequence (AND sequence)*
+// ;
+libtextclassifier3::StatusOr<std::unique_ptr<Node>>
+Parser::ConsumeQueryExpression() {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> sequence, ConsumeSequence());
+ std::vector<std::unique_ptr<Node>> sequences;
+ sequences.push_back(std::move(sequence));
+
+ while (Match(Lexer::TokenType::AND)) {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::AND));
+ ICING_ASSIGN_OR_RETURN(sequence, ConsumeSequence());
+ sequences.push_back(std::move(sequence));
+ }
+
+ return CreateNaryNode("AND", std::move(sequences));
+}
+
+// multExpr
+// : term ((TIMES | DIV) term)*
+// ;
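+// For example, "1 * 2 / 3" is grouped as DIV(TIMES(1, 2), 3): each run of the
+// same operator is collapsed into a single n-ary node before switching to the
+// other operator.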
+libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeMultExpr() {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeTerm());
+ std::vector<std::unique_ptr<Node>> stack;
+ stack.push_back(std::move(node));
+
+ while (Match(Lexer::TokenType::TIMES) || Match(Lexer::TokenType::DIV)) {
+ while (Match(Lexer::TokenType::TIMES)) {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::TIMES));
+ ICING_ASSIGN_OR_RETURN(node, ConsumeTerm());
+ stack.push_back(std::move(node));
+ }
+ node = CreateNaryNode("TIMES", std::move(stack));
+ stack.clear();
+ stack.push_back(std::move(node));
+
+ while (Match(Lexer::TokenType::DIV)) {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::DIV));
+ ICING_ASSIGN_OR_RETURN(node, ConsumeTerm());
+ stack.push_back(std::move(node));
+ }
+ node = CreateNaryNode("DIV", std::move(stack));
+ stack.clear();
+ stack.push_back(std::move(node));
+ }
+
+ return std::move(stack[0]);
+}
+
+// expression
+// : multExpr ((PLUS | MINUS) multExpr)*
+// ;
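+// For example, "1 + 2 - 3" is grouped as MINUS(PLUS(1, 2), 3), mirroring the
+// TIMES/DIV grouping in ConsumeMultExpr().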
+libtextclassifier3::StatusOr<std::unique_ptr<Node>>
+Parser::ConsumeScoringExpression() {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeMultExpr());
+ std::vector<std::unique_ptr<Node>> stack;
+ stack.push_back(std::move(node));
+
+ while (Match(Lexer::TokenType::PLUS) || Match(Lexer::TokenType::MINUS)) {
+ while (Match(Lexer::TokenType::PLUS)) {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::PLUS));
+ ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr());
+ stack.push_back(std::move(node));
+ }
+ node = CreateNaryNode("PLUS", std::move(stack));
+ stack.clear();
+ stack.push_back(std::move(node));
+
+ while (Match(Lexer::TokenType::MINUS)) {
+ ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS));
+ ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr());
+ stack.push_back(std::move(node));
+ }
+ node = CreateNaryNode("MINUS", std::move(stack));
+ stack.clear();
+ stack.push_back(std::move(node));
+ }
+
+ return std::move(stack[0]);
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<Node>>
+Parser::ConsumeExpression() {
+ switch (language_) {
+ case Lexer::Language::QUERY:
+ return ConsumeQueryExpression();
+ case Lexer::Language::SCORING:
+ return ConsumeScoringExpression();
+ }
+}
+
+// query
+// : expression? EOF
+// ;
+libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeQuery() {
+ language_ = Lexer::Language::QUERY;
+ std::unique_ptr<Node> node;
+ if (current_token_ != lexer_tokens_.end()) {
+ ICING_ASSIGN_OR_RETURN(node, ConsumeExpression());
+ }
+ if (current_token_ != lexer_tokens_.end()) {
+ return absl_ports::InvalidArgumentError(
+ "Error parsing Query. Must reach EOF after parsing Expression!");
+ }
+ return node;
+}
+
+// scoring
+// : expression EOF
+// ;
+libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeScoring() {
+ language_ = Lexer::Language::SCORING;
+ std::unique_ptr<Node> node;
+ if (current_token_ == lexer_tokens_.end()) {
+ return absl_ports::InvalidArgumentError("Got empty scoring expression!");
+ }
+ ICING_ASSIGN_OR_RETURN(node, ConsumeExpression());
+ if (current_token_ != lexer_tokens_.end()) {
+ return absl_ports::InvalidArgumentError(
+ "Error parsing the scoring expression. Must reach EOF after parsing "
+ "Expression!");
+ }
+ return node;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/advanced_query_parser/parser.h b/icing/query/advanced_query_parser/parser.h
new file mode 100644
index 0000000..a48c562
--- /dev/null
+++ b/icing/query/advanced_query_parser/parser.h
@@ -0,0 +1,141 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_PARSER_H_
+#define ICING_QUERY_ADVANCED_QUERY_PARSER_PARSER_H_
+
+#include <memory>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
+#include "icing/query/advanced_query_parser/lexer.h"
+
+namespace icing {
+namespace lib {
+
+class Parser {
+ public:
+ static Parser Create(std::vector<Lexer::LexerToken>&& lexer_tokens) {
+ return Parser(std::move(lexer_tokens));
+ }
+
+ // Returns:
+ // On success, pointer to the root node of the AST
+ // INVALID_ARGUMENT for input that does not conform to the grammar
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ConsumeQuery();
+
+ // Returns:
+ // On success, pointer to the root node of the AST
+ // INVALID_ARGUMENT for input that does not conform to the grammar
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ConsumeScoring();
+
+ private:
+ explicit Parser(std::vector<Lexer::LexerToken>&& lexer_tokens)
+ : lexer_tokens_(std::move(lexer_tokens)),
+ current_token_(lexer_tokens_.begin()) {}
+
+ // Match Functions
+  // These functions are used to test whether current_token_ matches a member
+ // of the FIRST set of a particular symbol in our grammar.
+ bool Match(Lexer::TokenType token_type) const {
+ return current_token_ != lexer_tokens_.end() &&
+ current_token_->type == token_type;
+ }
+
+ bool MatchMember() const { return Match(Lexer::TokenType::TEXT); }
+
+ bool MatchFunction() const { return Match(Lexer::TokenType::FUNCTION_NAME); }
+
+ bool MatchComparable() const {
+ return Match(Lexer::TokenType::STRING) || MatchMember() || MatchFunction();
+ }
+
+ bool MatchComposite() const { return Match(Lexer::TokenType::LPAREN); }
+
+ bool MatchRestriction() const { return MatchComparable(); }
+
+ bool MatchSimple() const { return MatchRestriction() || MatchComposite(); }
+
+ bool MatchTerm() const {
+ return MatchSimple() || Match(Lexer::TokenType::NOT) ||
+ Match(Lexer::TokenType::MINUS);
+ }
+
+ bool MatchFactor() const { return MatchTerm(); }
+
+ // Consume Functions
+ // These functions attempt to parse the token sequence starting at
+ // current_token_.
+ // Returns INVALID_ARGUMENT if unable to parse the token sequence starting at
+ // current_token_ as that particular grammar symbol. There are no guarantees
+  // about what state current_token_ and lexer_tokens_ are in when returning an
+ // error.
+ //
+ // Consume functions for terminal symbols. These are the only Consume
+ // functions that will directly modify current_token_.
+ // The Consume functions for terminals will guarantee not to modify
+ // current_token_ and lexer_tokens_ when returning an error.
+ libtextclassifier3::Status Consume(Lexer::TokenType token_type);
+
+ libtextclassifier3::StatusOr<std::unique_ptr<TextNode>> ConsumeText();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<FunctionNameNode>>
+ ConsumeFunctionName();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<StringNode>>
+ ConsumeStringElement();
+
+ libtextclassifier3::StatusOr<std::string> ConsumeComparator();
+
+ // Consume functions for non-terminal symbols.
+ libtextclassifier3::StatusOr<std::unique_ptr<MemberNode>> ConsumeMember();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<FunctionNode>> ConsumeFunction();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ConsumeComparable();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ConsumeComposite();
+
+ libtextclassifier3::StatusOr<std::vector<std::unique_ptr<Node>>>
+ ConsumeArgs();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ConsumeRestriction();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ConsumeSimple();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ConsumeTerm();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ConsumeFactor();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ConsumeSequence();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ConsumeQueryExpression();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ConsumeMultExpr();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>>
+ ConsumeScoringExpression();
+
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ConsumeExpression();
+
+ std::vector<Lexer::LexerToken> lexer_tokens_;
+ std::vector<Lexer::LexerToken>::const_iterator current_token_;
+ Lexer::Language language_ = Lexer::Language::QUERY;
+};
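+
+// A minimal usage sketch (mirroring the integration tests below):
+//
+//   Parser parser = Parser::Create(std::move(lexer_tokens));
+//   ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> root, parser.ConsumeQuery());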
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_ADVANCED_QUERY_PARSER_PARSER_H_
diff --git a/icing/query/advanced_query_parser/parser_integration_test.cc b/icing/query/advanced_query_parser/parser_integration_test.cc
new file mode 100644
index 0000000..fa1bd2e
--- /dev/null
+++ b/icing/query/advanced_query_parser/parser_integration_test.cc
@@ -0,0 +1,1012 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/query/advanced_query_parser/abstract-syntax-tree-test-utils.h"
+#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
+#include "icing/query/advanced_query_parser/lexer.h"
+#include "icing/query/advanced_query_parser/parser.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+using ::testing::IsNull;
+using ::testing::SizeIs;
+
+TEST(ParserIntegrationTest, EmptyQuery) {
+ std::string query = "";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+ EXPECT_THAT(tree_root, IsNull());
+}
+
+TEST(ParserIntegrationTest, EmptyScoring) {
+ std::string query = "";
+  Lexer lexer(query, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeScoring(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserIntegrationTest, SingleTerm) {
+ std::string query = "foo";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // member
+ // |
+ // text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember)));
+}
+
+TEST(ParserIntegrationTest, ImplicitAnd) {
+ std::string query = "foo bar";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // AND
+ // / \
+ // member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, text, member, AND }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("AND", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, Or) {
+ std::string query = "foo OR bar";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // OR
+ // / \
+ // member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, text, member, OR }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("OR", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, And) {
+ std::string query = "foo AND bar";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // AND
+ // / \
+ // member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, text, member, AND }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("AND", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, Not) {
+ std::string query = "NOT foo";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // NOT
+ // |
+ // member
+ // |
+ // text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, NOT }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("NOT", NodeType::kUnaryOperator)));
+}
+
+TEST(ParserIntegrationTest, Minus) {
+ std::string query = "-foo";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // MINUS
+ // |
+ // member
+ // |
+ // text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, MINUS }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("MINUS", NodeType::kUnaryOperator)));
+}
+
+TEST(ParserIntegrationTest, Has) {
+ std::string query = "subject:foo";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // :
+ // / \
+ // member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+  // { text, member, text, member, : }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("subject", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo(":", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, HasNested) {
+ std::string query = "sender.name:foo";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // :
+ // / \
+ // member member
+ // / \ |
+ // text text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+  // { text, text, member, text, member, : }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("sender", NodeType::kText),
+ EqualsNodeInfo("name", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo(":", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, EmptyFunction) {
+ std::string query = "foo()";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // function
+ // |
+ // function_name
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { function_name, function }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("", NodeType::kFunction)));
+}
+
+TEST(ParserIntegrationTest, FunctionSingleArg) {
+ std::string query = "foo(\"bar\")";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // function
+ // / \
+ // function_name string
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { function_name, string, function }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("bar", NodeType::kString),
+ EqualsNodeInfo("", NodeType::kFunction)));
+}
+
+TEST(ParserIntegrationTest, FunctionMultiArg) {
+ std::string query = "foo(\"bar\", \"baz\")";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // function
+ // / | \
+ // function_name string string
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { function_name, string, string, function }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("bar", NodeType::kString),
+ EqualsNodeInfo("baz", NodeType::kString),
+ EqualsNodeInfo("", NodeType::kFunction)));
+}
+
+TEST(ParserIntegrationTest, FunctionNested) {
+ std::string query = "foo(bar())";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // function
+ // / \
+ // function_name function
+ // |
+ // function_name
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { function_name, function_name, function, function }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("bar", NodeType::kFunctionName),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("", NodeType::kFunction)));
+}
+
+TEST(ParserIntegrationTest, FunctionWithTrailingSequence) {
+ std::string query = "foo() OR bar";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // OR
+ // / \
+ // function member
+ // | |
+ // function_name text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { function_name, function, text, member, OR }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("OR", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, Composite) {
+ std::string query = "foo OR (bar baz)";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // OR
+ // / \
+ // member AND
+ // | / \
+ // text member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, text, member, text, member, AND, OR }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("baz", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("AND", NodeType::kNaryOperator),
+ EqualsNodeInfo("OR", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, CompositeWithTrailingSequence) {
+ std::string query = "(bar baz) OR foo";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // OR
+ // / \
+ // AND member
+ // / \ |
+ // member member text
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, text, member, AND, text, member, OR }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("baz", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("AND", NodeType::kNaryOperator),
+ EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("OR", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, Complex) {
+ std::string query = "foo bar:baz OR pal(\"bat\")";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // AND
+ // / \
+ // member OR
+ // | / \
+ // text : function
+ // / \ / \
+ // member member function_name string
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, text, member, text, member, :, function_name, string,
+ // function, OR, AND }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("baz", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo(":", NodeType::kNaryOperator),
+ EqualsNodeInfo("pal", NodeType::kFunctionName),
+ EqualsNodeInfo("bat", NodeType::kString),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("OR", NodeType::kNaryOperator),
+ EqualsNodeInfo("AND", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, InvalidHas) {
+ std::string query = "foo:"; // No right hand operand to :
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserIntegrationTest, InvalidComposite) {
+ std::string query = "(foo bar"; // No terminating RPAREN
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserIntegrationTest, InvalidMember) {
+ std::string query = "foo."; // DOT must have succeeding TEXT
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserIntegrationTest, InvalidOr) {
+ std::string query = "foo OR"; // No right hand operand to OR
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserIntegrationTest, InvalidAnd) {
+ std::string query = "foo AND"; // No right hand operand to AND
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserIntegrationTest, InvalidNot) {
+ std::string query = "NOT"; // No right hand operand to NOT
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserIntegrationTest, InvalidMinus) {
+ std::string query = "-"; // No right hand operand to -
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserIntegrationTest, InvalidFunctionCallNoRparen) {
+ std::string query = "foo("; // No terminating RPAREN
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserIntegrationTest, InvalidFunctionArgsHangingComma) {
+ std::string query = "foo(\"bar\",)"; // no valid arg following COMMA
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserIntegrationTest, ScoringPlus) {
+ std::string scoring = "1 + 1 + 1";
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // PLUS
+ // / | \
+ // member member member
+ // | | |
+ // text text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("PLUS", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, ScoringMinus) {
+ std::string scoring = "1 - 1 - 1";
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // MINUS
+ // / | \
+ // member member member
+ // | | |
+ // text text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("MINUS", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, ScoringUnaryMinus) {
+ std::string scoring = "1 + -1 + 1";
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // PLUS
+ // / | \
+ // member MINUS member
+ // | | |
+ // text member text
+ // |
+ // text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("MINUS", NodeType::kUnaryOperator),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("PLUS", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, ScoringPlusMinus) {
+ std::string scoring = "11 + 12 - 13 + 14";
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // PLUS
+ // / \
+ // MINUS member
+ // / \ |
+ // PLUS member text
+ // / \ |
+ // member member text
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("11", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("12", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("PLUS", NodeType::kNaryOperator),
+ EqualsNodeInfo("13", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("MINUS", NodeType::kNaryOperator),
+ EqualsNodeInfo("14", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("PLUS", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, ScoringTimes) {
+ std::string scoring = "1 * 1 * 1";
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // TIMES
+ // / | \
+ // member member member
+ // | | |
+ // text text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("TIMES", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, ScoringDiv) {
+ std::string scoring = "1 / 1 / 1";
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // DIV
+ // / | \
+ // member member member
+ // | | |
+ // text text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("DIV", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, ScoringTimesDiv) {
+ std::string scoring = "11 / 12 * 13 / 14 / 15";
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // DIV
+ // / | \
+ // TIMES member member
+ // / \ | |
+ // DIV member text text
+ // / \ |
+ // member member text
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("11", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("12", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("DIV", NodeType::kNaryOperator),
+ EqualsNodeInfo("13", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("TIMES", NodeType::kNaryOperator),
+ EqualsNodeInfo("14", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("15", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("DIV", NodeType::kNaryOperator)));
+}
+
+TEST(ParserIntegrationTest, ComplexScoring) {
+ // With parentheses in function arguments.
+ std::string scoring = "1 + pow((2 * sin(3)), 4) + -5 / 6";
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ std::vector<NodeInfo> node = visitor.nodes();
+ EXPECT_THAT(node,
+ ElementsAre(EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("pow", NodeType::kFunctionName),
+ EqualsNodeInfo("2", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("sin", NodeType::kFunctionName),
+ EqualsNodeInfo("3", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("TIMES", NodeType::kNaryOperator),
+ EqualsNodeInfo("4", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("5", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("MINUS", NodeType::kUnaryOperator),
+ EqualsNodeInfo("6", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("DIV", NodeType::kNaryOperator),
+ EqualsNodeInfo("PLUS", NodeType::kNaryOperator)));
+
+ // Without parentheses in function arguments.
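+  // Redundant parentheses around a function argument should not change the
+  // AST, so this variant must produce the same node sequence as above.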
+ scoring = "1 + pow(2 * sin(3), 4) + -5 / 6";
+ lexer = Lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(lexer_tokens, lexer.ExtractTokens());
+ parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(tree_root, parser.ConsumeScoring());
+ visitor = SimpleVisitor();
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(), ElementsAreArray(node));
+}
+
+TEST(ParserIntegrationTest, ScoringMemberFunction) {
+ std::string scoring = "this.CreationTimestamp()";
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // member
+ // / \
+ // text function
+ // |
+ // function_name
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(
+ visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("this", NodeType::kText),
+ EqualsNodeInfo("CreationTimestamp", NodeType::kFunctionName),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("", NodeType::kMember)));
+}
+
+TEST(ParserIntegrationTest, QueryMemberFunction) {
+ std::string query = "this.foo()";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // member
+ // / \
+ // text function
+ // |
+ // function_name
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("this", NodeType::kText),
+ EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("", NodeType::kMember)));
+}
+
+TEST(ParserIntegrationTest, ScoringComplexMemberFunction) {
+ std::string scoring = "a.b.fun(c, d)";
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // member
+ // / | \
+ // text text function
+ // / | \
+ // function_name member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("a", NodeType::kText),
+ EqualsNodeInfo("b", NodeType::kText),
+ EqualsNodeInfo("fun", NodeType::kFunctionName),
+ EqualsNodeInfo("c", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("d", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("", NodeType::kMember)));
+}
+
+TEST(ParserIntegrationTest, QueryComplexMemberFunction) {
+ std::string query = "this.abc.fun(def, ghi)";
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // member
+ // / | \
+ // text text function
+ // / | \
+ // function_name member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("this", NodeType::kText),
+ EqualsNodeInfo("abc", NodeType::kText),
+ EqualsNodeInfo("fun", NodeType::kFunctionName),
+ EqualsNodeInfo("def", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("ghi", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("", NodeType::kMember)));
+}
+
+TEST(ParserTest, QueryShouldNotStackOverflowAtMaxNumTokens) {
+ // query = "(( ... (foo bar) ... ))"
+ std::string query;
+ for (int i = 0; i < Lexer::kMaxNumTokens / 2 - 1; ++i) {
+ query.push_back('(');
+ }
+ query.append("foo bar");
+ for (int i = 0; i < Lexer::kMaxNumTokens / 2 - 1; ++i) {
+ query.push_back(')');
+ }
+
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ EXPECT_THAT(lexer_tokens, SizeIs(Lexer::kMaxNumTokens));
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(), IsOk());
+}
+
+TEST(ParserTest, ScoringShouldNotStackOverflowAtMaxNumTokens) {
+ // scoring = "(( ... (-1) ... ))"
+ std::string scoring;
+ for (int i = 0; i < Lexer::kMaxNumTokens / 2 - 1; ++i) {
+ scoring.push_back('(');
+ }
+ scoring.append("-1");
+ for (int i = 0; i < Lexer::kMaxNumTokens / 2 - 1; ++i) {
+ scoring.push_back(')');
+ }
+
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ EXPECT_THAT(lexer_tokens, SizeIs(Lexer::kMaxNumTokens));
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeScoring(), IsOk());
+}
+
+TEST(ParserTest, InvalidQueryShouldNotStackOverflowAtMaxNumTokens) {
+ std::string query;
+ for (int i = 0; i < Lexer::kMaxNumTokens; ++i) {
+ query.push_back('(');
+ }
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ EXPECT_THAT(lexer_tokens, SizeIs(Lexer::kMaxNumTokens));
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserTest, InvalidScoringShouldNotStackOverflowAtMaxNumTokens) {
+ std::string scoring;
+ for (int i = 0; i < Lexer::kMaxNumTokens; ++i) {
+ scoring.push_back('(');
+ }
+ Lexer lexer(scoring, Lexer::Language::SCORING);
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ EXPECT_THAT(lexer_tokens, SizeIs(Lexer::kMaxNumTokens));
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeScoring(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/advanced_query_parser/parser_test.cc b/icing/query/advanced_query_parser/parser_test.cc
new file mode 100644
index 0000000..824c2ce
--- /dev/null
+++ b/icing/query/advanced_query_parser/parser_test.cc
@@ -0,0 +1,1087 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/advanced_query_parser/parser.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/query/advanced_query_parser/abstract-syntax-tree-test-utils.h"
+#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
+#include "icing/query/advanced_query_parser/lexer.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
+using ::testing::IsNull;
+
+TEST(ParserTest, EmptyQuery) {
+ std::vector<Lexer::LexerToken> lexer_tokens;
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+ EXPECT_THAT(tree_root, IsNull());
+}
+
+TEST(ParserTest, EmptyScoring) {
+ std::vector<Lexer::LexerToken> lexer_tokens;
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeScoring(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserTest, SingleTerm) {
+ std::string_view query = "foo";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query, Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // member
+ // |
+ // text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember)));
+}
+
+TEST(ParserTest, ImplicitAnd) {
+ std::string_view query = "foo bar";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::TEXT},
+ {"bar", query.substr(4, 3), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // AND
+ // / \
+ // member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, text, member, AND }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("AND", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, Or) {
+ std::string_view query = "foo OR bar";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::TEXT},
+ {"", query.substr(4, 2), Lexer::TokenType::OR},
+ {"bar", query.substr(7, 3), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // OR
+ // / \
+ // member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, text, member, OR }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("OR", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, And) {
+ std::string_view query = "foo AND bar";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::TEXT},
+ {"", query.substr(4, 3), Lexer::TokenType::AND},
+ {"bar", query.substr(8, 4), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // AND
+ // / \
+ // member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, text, member, AND }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("AND", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, Not) {
+ std::string_view query = "NOT foo";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"", query.substr(0, 3), Lexer::TokenType::NOT},
+ {"foo", query.substr(4, 3), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // NOT
+ // |
+ // member
+ // |
+ // text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, NOT }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("NOT", NodeType::kUnaryOperator)));
+}
+
+TEST(ParserTest, Minus) {
+ std::string_view query = "-foo";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"", query.substr(0, 1), Lexer::TokenType::MINUS},
+ {"foo", query.substr(1, 3), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // MINUS
+ // |
+ // member
+ // |
+ // text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, MINUS }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("MINUS", NodeType::kUnaryOperator)));
+}
+
+TEST(ParserTest, Has) {
+ std::string_view query = "subject:foo";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"subject", query.substr(0, 7), Lexer::TokenType::TEXT},
+ {":", query.substr(7, 1), Lexer::TokenType::COMPARATOR},
+ {"foo", query.substr(8, 3), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // :
+ // / \
+ // member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+  // { text, member, text, member, : }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("subject", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo(":", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, HasNested) {
+ std::string_view query = "sender.name:foo";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"sender", query.substr(0, 6), Lexer::TokenType::TEXT},
+ {"", query.substr(6, 1), Lexer::TokenType::DOT},
+ {"name", query.substr(7, 4), Lexer::TokenType::TEXT},
+ {":", query.substr(11, 1), Lexer::TokenType::COMPARATOR},
+ {"foo", query.substr(12, 3), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // :
+ // / \
+ // member member
+ // / \ |
+ // text text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+  // { text, text, member, text, member, : }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("sender", NodeType::kText),
+ EqualsNodeInfo("name", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo(":", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, EmptyFunction) {
+ std::string_view query = "foo()";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", query.substr(3, 1), Lexer::TokenType::LPAREN},
+ {"", query.substr(4, 1), Lexer::TokenType::RPAREN}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // function
+ // |
+ // function_name
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { function_name, function }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("", NodeType::kFunction)));
+}
+
+TEST(ParserTest, FunctionSingleArg) {
+ std::string_view query = "foo(\"bar\")";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", query.substr(3, 1), Lexer::TokenType::LPAREN},
+ {"bar", query.substr(5, 3), Lexer::TokenType::STRING},
+ {"", query.substr(8, 1), Lexer::TokenType::RPAREN}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // function
+ // / \
+ // function_name string
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { function_name, string, function }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("bar", NodeType::kString),
+ EqualsNodeInfo("", NodeType::kFunction)));
+}
+
+TEST(ParserTest, FunctionMultiArg) {
+ std::string_view query = "foo(\"bar\", \"baz\")";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", query.substr(3, 1), Lexer::TokenType::LPAREN},
+ {"bar", query.substr(5, 3), Lexer::TokenType::STRING},
+ {"", query.substr(9, 1), Lexer::TokenType::COMMA},
+ {"baz", query.substr(12, 3), Lexer::TokenType::STRING},
+ {"", query.substr(16, 1), Lexer::TokenType::RPAREN}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // function
+ // / | \
+ // function_name string string
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { function_name, string, string, function }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("bar", NodeType::kString),
+ EqualsNodeInfo("baz", NodeType::kString),
+ EqualsNodeInfo("", NodeType::kFunction)));
+}
+
+TEST(ParserTest, FunctionNested) {
+ std::string_view query = "foo(bar())";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", query.substr(3, 1), Lexer::TokenType::LPAREN},
+ {"bar", query.substr(4, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", query.substr(7, 1), Lexer::TokenType::LPAREN},
+ {"", query.substr(8, 1), Lexer::TokenType::RPAREN},
+ {"", query.substr(9, 1), Lexer::TokenType::RPAREN}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // function
+ // / \
+ // function_name function
+ // |
+ // function_name
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { function_name, function_name, function, function }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("bar", NodeType::kFunctionName),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("", NodeType::kFunction)));
+}
+
+TEST(ParserTest, FunctionWithTrailingSequence) {
+ std::string_view query = "foo() OR bar";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", query.substr(3, 1), Lexer::TokenType::LPAREN},
+ {"", query.substr(4, 1), Lexer::TokenType::RPAREN},
+ {"", query.substr(6, 2), Lexer::TokenType::OR},
+ {"bar", query.substr(9, 3), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // OR
+ // / \
+ // function member
+ // | |
+ // function_name text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { function_name, function, text, member, OR }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("OR", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, Composite) {
+ std::string_view query = "foo OR (bar baz)";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::TEXT},
+ {"", query.substr(4, 2), Lexer::TokenType::OR},
+ {"", query.substr(7, 1), Lexer::TokenType::LPAREN},
+ {"bar", query.substr(8, 3), Lexer::TokenType::TEXT},
+ {"baz", query.substr(12, 3), Lexer::TokenType::TEXT},
+ {"", query.substr(15, 1), Lexer::TokenType::RPAREN}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // OR
+ // / \
+ // member AND
+ // | / \
+ // text member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, text, member, text, member, AND, OR }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("baz", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("AND", NodeType::kNaryOperator),
+ EqualsNodeInfo("OR", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, CompositeWithTrailingSequence) {
+ std::string_view query = "(bar baz) OR foo";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"", query.substr(0, 1), Lexer::TokenType::LPAREN},
+ {"bar", query.substr(1, 3), Lexer::TokenType::TEXT},
+ {"baz", query.substr(5, 3), Lexer::TokenType::TEXT},
+ {"", query.substr(8, 1), Lexer::TokenType::RPAREN},
+ {"", query.substr(10, 2), Lexer::TokenType::OR},
+ {"foo", query.substr(13, 3), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // OR
+ // / \
+ // AND member
+ // / \ |
+ // member member text
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, text, member, AND, text, member, OR }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("baz", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("AND", NodeType::kNaryOperator),
+ EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("OR", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, Complex) {
+ std::string_view query = R"(foo bar:baz OR pal("bat"))";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::TEXT},
+ {"bar", query.substr(4, 3), Lexer::TokenType::TEXT},
+ {":", query.substr(7, 1), Lexer::TokenType::COMPARATOR},
+ {"baz", query.substr(8, 3), Lexer::TokenType::TEXT},
+ {"", query.substr(12, 2), Lexer::TokenType::OR},
+ {"pal", query.substr(15, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", query.substr(18, 1), Lexer::TokenType::LPAREN},
+ {"bat", query.substr(20, 3), Lexer::TokenType::STRING},
+ {"", query.substr(24, 1), Lexer::TokenType::RPAREN}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // AND
+ // / \
+ // member OR
+ // | / \
+ // text : function
+ // / \ / \
+ // member member function_name string
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ // SimpleVisitor ordering
+ // { text, member, text, member, text, member, :, function_name, string,
+ // function, OR, AND }
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("foo", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("bar", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("baz", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo(":", NodeType::kNaryOperator),
+ EqualsNodeInfo("pal", NodeType::kFunctionName),
+ EqualsNodeInfo("bat", NodeType::kString),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("OR", NodeType::kNaryOperator),
+ EqualsNodeInfo("AND", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, InvalidHas) {
+ std::string_view query = "foo:"; // No right hand operand to :
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::TEXT},
+ {":", query.substr(3, 1), Lexer::TokenType::COMPARATOR}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserTest, InvalidComposite) {
+ std::string_view query = "(foo bar"; // No terminating RPAREN
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"", query.substr(0, 1), Lexer::TokenType::LPAREN},
+ {"foo", query.substr(1, 3), Lexer::TokenType::TEXT},
+ {"bar", query.substr(5, 3), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserTest, InvalidMember) {
+ std::string_view query = "foo."; // DOT must have succeeding TEXT
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::TEXT},
+ {"", query.substr(3, 1), Lexer::TokenType::DOT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserTest, InvalidOr) {
+ std::string_view query = "foo OR"; // No right hand operand to OR
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::TEXT},
+ {"", query.substr(3, 2), Lexer::TokenType::OR}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserTest, InvalidAnd) {
+ std::string_view query = "foo AND"; // No right hand operand to AND
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::TEXT},
+ {"", query.substr(4, 3), Lexer::TokenType::AND}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserTest, InvalidNot) {
+ std::string_view query = "NOT"; // No right hand operand to NOT
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"", query.substr(0, 3), Lexer::TokenType::NOT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserTest, InvalidMinus) {
+ std::string_view query = "-"; // No right hand operand to -
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"", query.substr(0, 1), Lexer::TokenType::MINUS}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserTest, InvalidFunctionCallNoRparen) {
+ std::string_view query = "foo("; // No terminating RPAREN
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", query.substr(3, 0), Lexer::TokenType::LPAREN}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserTest, InvalidFunctionCallNoLparen) {
+ std::string_view query =
+ "foo bar"; // foo labeled FUNCTION_NAME despite no LPAREN
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"bar", query.substr(4, 3), Lexer::TokenType::FUNCTION_NAME}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserTest, InvalidFunctionArgsHangingComma) {
+ std::string_view query = R"(foo("bar",))"; // no valid arg following COMMA
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", query.substr(3, 1), Lexer::TokenType::LPAREN},
+ {"bar", query.substr(5, 3), Lexer::TokenType::STRING},
+ {"", query.substr(9, 1), Lexer::TokenType::COMMA},
+ {"", query.substr(10, 1), Lexer::TokenType::RPAREN}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeQuery(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ParserTest, ScoringPlus) {
+ std::string_view scoring_exp = "1 + 1 + 1";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS},
+ {"1", scoring_exp.substr(4, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(6, 1), Lexer::TokenType::PLUS},
+ {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // PLUS
+ // / | \
+ // member member member
+ // | | |
+ // text text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("PLUS", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, ScoringMinus) {
+ std::string_view scoring_exp = "1 - 1 - 1";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(2, 1), Lexer::TokenType::MINUS},
+ {"1", scoring_exp.substr(4, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(6, 1), Lexer::TokenType::MINUS},
+ {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // MINUS
+ // / | \
+ // member member member
+ // | | |
+ // text text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("MINUS", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, ScoringUnaryMinus) {
+ std::string_view scoring_exp = "1 + -1 + 1";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS},
+ {"", scoring_exp.substr(4, 1), Lexer::TokenType::MINUS},
+ {"1", scoring_exp.substr(5, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(7, 1), Lexer::TokenType::PLUS},
+ {"1", scoring_exp.substr(9, 1), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // PLUS
+ // / | \
+ // member MINUS member
+ // | | |
+ // text member text
+ // |
+ // text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("MINUS", NodeType::kUnaryOperator),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("PLUS", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, ScoringPlusMinus) {
+ std::string_view scoring_exp = "11 + 12 - 13 + 14";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"11", scoring_exp.substr(0, 2), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(3, 1), Lexer::TokenType::PLUS},
+ {"12", scoring_exp.substr(5, 2), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(6, 1), Lexer::TokenType::MINUS},
+ {"13", scoring_exp.substr(8, 2), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(11, 1), Lexer::TokenType::PLUS},
+ {"14", scoring_exp.substr(13, 2), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // PLUS
+ // / \
+ // MINUS member
+ // / \ |
+ // PLUS member text
+ // / \ |
+ // member member text
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("11", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("12", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("PLUS", NodeType::kNaryOperator),
+ EqualsNodeInfo("13", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("MINUS", NodeType::kNaryOperator),
+ EqualsNodeInfo("14", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("PLUS", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, ScoringTimes) {
+ std::string_view scoring_exp = "1 * 1 * 1";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(2, 1), Lexer::TokenType::TIMES},
+ {"1", scoring_exp.substr(4, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(6, 1), Lexer::TokenType::TIMES},
+ {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // TIMES
+ // / | \
+ // member member member
+ // | | |
+ // text text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("TIMES", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, ScoringDiv) {
+ std::string_view scoring_exp = "1 / 1 / 1";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(2, 1), Lexer::TokenType::DIV},
+ {"1", scoring_exp.substr(4, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(6, 1), Lexer::TokenType::DIV},
+ {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // DIV
+ // / | \
+ // member member member
+ // | | |
+ // text text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("DIV", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, ScoringTimesDiv) {
+ std::string_view scoring_exp = "11 / 12 * 13 / 14 / 15";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"11", scoring_exp.substr(0, 2), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(3, 1), Lexer::TokenType::DIV},
+ {"12", scoring_exp.substr(5, 2), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(8, 1), Lexer::TokenType::TIMES},
+ {"13", scoring_exp.substr(10, 2), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(13, 1), Lexer::TokenType::DIV},
+ {"14", scoring_exp.substr(15, 2), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(18, 1), Lexer::TokenType::DIV},
+ {"15", scoring_exp.substr(20, 2), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // DIV
+ // / | \
+ // TIMES member member
+ // / \ | |
+ // DIV member text text
+ // / \ |
+ // member member text
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("11", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("12", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("DIV", NodeType::kNaryOperator),
+ EqualsNodeInfo("13", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("TIMES", NodeType::kNaryOperator),
+ EqualsNodeInfo("14", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("15", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("DIV", NodeType::kNaryOperator)));
+}
+
+TEST(ParserTest, ComplexScoring) {
+ std::string_view scoring_exp = "1 + pow((2 * sin(3)), 4) + -5 / 6";
+ // With parentheses in function arguments.
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS},
+ {"pow", scoring_exp.substr(4, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", scoring_exp.substr(7, 1), Lexer::TokenType::LPAREN},
+ {"", scoring_exp.substr(8, 1), Lexer::TokenType::LPAREN},
+ {"2", scoring_exp.substr(9, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(11, 1), Lexer::TokenType::TIMES},
+ {"sin", scoring_exp.substr(13, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", scoring_exp.substr(16, 1), Lexer::TokenType::LPAREN},
+ {"3", scoring_exp.substr(17, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(18, 1), Lexer::TokenType::RPAREN},
+ {"", scoring_exp.substr(19, 1), Lexer::TokenType::RPAREN},
+ {"", scoring_exp.substr(20, 1), Lexer::TokenType::COMMA},
+ {"4", scoring_exp.substr(22, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(23, 1), Lexer::TokenType::RPAREN},
+ {"", scoring_exp.substr(25, 1), Lexer::TokenType::PLUS},
+ {"", scoring_exp.substr(27, 1), Lexer::TokenType::MINUS},
+ {"5", scoring_exp.substr(28, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(30, 1), Lexer::TokenType::DIV},
+ {"6", scoring_exp.substr(32, 1), Lexer::TokenType::TEXT},
+ };
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ std::vector<NodeInfo> node = visitor.nodes();
+ EXPECT_THAT(node,
+ ElementsAre(EqualsNodeInfo("1", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("pow", NodeType::kFunctionName),
+ EqualsNodeInfo("2", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("sin", NodeType::kFunctionName),
+ EqualsNodeInfo("3", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("TIMES", NodeType::kNaryOperator),
+ EqualsNodeInfo("4", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("5", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("MINUS", NodeType::kUnaryOperator),
+ EqualsNodeInfo("6", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("DIV", NodeType::kNaryOperator),
+ EqualsNodeInfo("PLUS", NodeType::kNaryOperator)));
+
+ scoring_exp = "1 + pow(2 * sin(3), 4) + -5 / 6";
+ // Without parentheses in function arguments.
+ lexer_tokens = {
+ {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS},
+ {"pow", scoring_exp.substr(4, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", scoring_exp.substr(7, 1), Lexer::TokenType::LPAREN},
+ {"2", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(10, 1), Lexer::TokenType::TIMES},
+ {"sin", scoring_exp.substr(12, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", scoring_exp.substr(15, 1), Lexer::TokenType::LPAREN},
+ {"3", scoring_exp.substr(16, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(17, 1), Lexer::TokenType::RPAREN},
+ {"", scoring_exp.substr(18, 1), Lexer::TokenType::COMMA},
+ {"4", scoring_exp.substr(20, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(21, 1), Lexer::TokenType::RPAREN},
+ {"", scoring_exp.substr(23, 1), Lexer::TokenType::PLUS},
+ {"", scoring_exp.substr(25, 1), Lexer::TokenType::MINUS},
+ {"5", scoring_exp.substr(26, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(28, 1), Lexer::TokenType::DIV},
+ {"6", scoring_exp.substr(30, 1), Lexer::TokenType::TEXT},
+ };
+ parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(tree_root, parser.ConsumeScoring());
+ visitor = SimpleVisitor();
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(), ElementsAreArray(node));
+}
+
+TEST(ParserTest, ScoringMemberFunction) {
+ std::string_view scoring_exp = "this.CreationTimestamp()";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"this", scoring_exp.substr(0, 4), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(4, 1), Lexer::TokenType::DOT},
+ {"CreationTimestamp", scoring_exp.substr(5, 17),
+ Lexer::TokenType::FUNCTION_NAME},
+ {"", scoring_exp.substr(22, 1), Lexer::TokenType::LPAREN},
+ {"", scoring_exp.substr(23, 1), Lexer::TokenType::RPAREN}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // member
+ // / \
+ // text function
+ // |
+ // function_name
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(
+ visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("this", NodeType::kText),
+ EqualsNodeInfo("CreationTimestamp", NodeType::kFunctionName),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("", NodeType::kMember)));
+}
+
+TEST(ParserTest, QueryMemberFunction) {
+ std::string_view query = "this.foo()";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"this", query.substr(0, 4), Lexer::TokenType::TEXT},
+ {"", query.substr(4, 1), Lexer::TokenType::DOT},
+ {"foo", query.substr(5, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", query.substr(8, 1), Lexer::TokenType::LPAREN},
+ {"", query.substr(9, 1), Lexer::TokenType::RPAREN}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // member
+ // / \
+ // text function
+ // |
+ // function_name
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("this", NodeType::kText),
+ EqualsNodeInfo("foo", NodeType::kFunctionName),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("", NodeType::kMember)));
+}
+
+TEST(ParserTest, ScoringComplexMemberFunction) {
+ std::string_view scoring_exp = "a.b.fun(c, d)";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"a", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(1, 1), Lexer::TokenType::DOT},
+ {"b", scoring_exp.substr(2, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(3, 1), Lexer::TokenType::DOT},
+ {"fun", scoring_exp.substr(4, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", scoring_exp.substr(7, 1), Lexer::TokenType::LPAREN},
+ {"c", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(9, 1), Lexer::TokenType::COMMA},
+ {"d", scoring_exp.substr(11, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(12, 1), Lexer::TokenType::RPAREN}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ // Expected AST:
+ // member
+ // / | \
+ // text text function
+ // / | \
+ // function_name member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("a", NodeType::kText),
+ EqualsNodeInfo("b", NodeType::kText),
+ EqualsNodeInfo("fun", NodeType::kFunctionName),
+ EqualsNodeInfo("c", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("d", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("", NodeType::kMember)));
+}
+
+TEST(ParserTest, QueryComplexMemberFunction) {
+ std::string_view query = "this.abc.fun(def, ghi)";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"this", query.substr(0, 4), Lexer::TokenType::TEXT},
+ {"", query.substr(4, 1), Lexer::TokenType::DOT},
+ {"abc", query.substr(5, 3), Lexer::TokenType::TEXT},
+ {"", query.substr(8, 1), Lexer::TokenType::DOT},
+ {"fun", query.substr(9, 3), Lexer::TokenType::FUNCTION_NAME},
+ {"", query.substr(12, 1), Lexer::TokenType::LPAREN},
+ {"def", query.substr(13, 3), Lexer::TokenType::TEXT},
+ {"", query.substr(16, 1), Lexer::TokenType::COMMA},
+ {"ghi", query.substr(17, 3), Lexer::TokenType::TEXT},
+ {"", query.substr(20, 1), Lexer::TokenType::RPAREN}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ // Expected AST:
+ // member
+ // / | \
+ // text text function
+ // / | \
+ // function_name member member
+ // | |
+ // text text
+ SimpleVisitor visitor;
+ tree_root->Accept(&visitor);
+ EXPECT_THAT(visitor.nodes(),
+ ElementsAre(EqualsNodeInfo("this", NodeType::kText),
+ EqualsNodeInfo("abc", NodeType::kText),
+ EqualsNodeInfo("fun", NodeType::kFunctionName),
+ EqualsNodeInfo("def", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("ghi", NodeType::kText),
+ EqualsNodeInfo("", NodeType::kMember),
+ EqualsNodeInfo("", NodeType::kFunction),
+ EqualsNodeInfo("", NodeType::kMember)));
+}
+
+TEST(ParserTest, InvalidScoringToken) {
+ std::string_view scoring_exp = "1 + NOT 1";
+ std::vector<Lexer::LexerToken> lexer_tokens = {
+ {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT},
+ {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS},
+ {"", scoring_exp.substr(4, 3), Lexer::TokenType::NOT},
+ {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}};
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ EXPECT_THAT(parser.ConsumeScoring(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/advanced_query_parser/pending-value.cc b/icing/query/advanced_query_parser/pending-value.cc
new file mode 100644
index 0000000..67bdc3a
--- /dev/null
+++ b/icing/query/advanced_query_parser/pending-value.cc
@@ -0,0 +1,44 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "icing/query/advanced_query_parser/pending-value.h"
+
+#include "icing/absl_ports/canonical_errors.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::Status PendingValue::ParseInt() {
+ if (data_type_ == DataType::kLong) {
+ return libtextclassifier3::Status::OK;
+ } else if (data_type_ != DataType::kText) {
+ return absl_ports::InvalidArgumentError("Cannot parse value as LONG");
+ }
+ if (query_term_.is_prefix_val) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Cannot use prefix operator '*' with numeric value: ",
+ query_term_.term));
+ }
+ char* value_end;
+ long_val_ = std::strtoll(query_term_.term.c_str(), &value_end, /*base=*/10);
+ if (value_end != query_term_.term.c_str() + query_term_.term.length()) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Unable to parse \"", query_term_.term, "\" as number."));
+ }
+ data_type_ = DataType::kLong;
+ query_term_ = {/*term=*/"", /*raw_term=*/"", /*is_prefix_val=*/false};
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/advanced_query_parser/pending-value.h b/icing/query/advanced_query_parser/pending-value.h
new file mode 100644
index 0000000..1a6717e
--- /dev/null
+++ b/icing/query/advanced_query_parser/pending-value.h
@@ -0,0 +1,160 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_PENDING_VALUE_H_
+#define ICING_QUERY_ADVANCED_QUERY_PARSER_PENDING_VALUE_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+enum class DataType {
+ kNone,
+ kLong,
+ kText,
+ kString,
+ kStringList,
+ kDocumentIterator,
+};
+
+struct QueryTerm {
+ std::string term;
+ std::string_view raw_term;
+ bool is_prefix_val;
+};
+
+// A holder for intermediate results when processing child nodes.
+struct PendingValue {
+ static PendingValue CreateStringPendingValue(QueryTerm str) {
+ return PendingValue(std::move(str), DataType::kString);
+ }
+
+ static PendingValue CreateTextPendingValue(QueryTerm text) {
+ return PendingValue(std::move(text), DataType::kText);
+ }
+
+ PendingValue() : data_type_(DataType::kNone) {}
+
+ explicit PendingValue(std::unique_ptr<DocHitInfoIterator> iterator)
+ : iterator_(std::move(iterator)),
+ data_type_(DataType::kDocumentIterator) {}
+
+ explicit PendingValue(std::vector<std::string> string_lists)
+ : string_vals_(std::move(string_lists)),
+ data_type_(DataType::kStringList) {}
+
+ PendingValue(const PendingValue&) = delete;
+ PendingValue(PendingValue&&) = default;
+
+ PendingValue& operator=(const PendingValue&) = delete;
+ PendingValue& operator=(PendingValue&&) = default;
+
+ // Placeholder is used to indicate where the children of a particular node
+ // begin.
+ bool is_placeholder() const { return data_type_ == DataType::kNone; }
+
+ libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
+ iterator() && {
+ ICING_RETURN_IF_ERROR(CheckDataType(DataType::kDocumentIterator));
+ return std::move(iterator_);
+ }
+
+ libtextclassifier3::StatusOr<const std::vector<std::string>*> string_vals()
+ const& {
+ ICING_RETURN_IF_ERROR(CheckDataType(DataType::kStringList));
+ return &string_vals_;
+ }
+ libtextclassifier3::StatusOr<std::vector<std::string>> string_vals() && {
+ ICING_RETURN_IF_ERROR(CheckDataType(DataType::kStringList));
+ return std::move(string_vals_);
+ }
+
+ libtextclassifier3::StatusOr<const QueryTerm*> string_val() const& {
+ ICING_RETURN_IF_ERROR(CheckDataType(DataType::kString));
+ return &query_term_;
+ }
+ libtextclassifier3::StatusOr<QueryTerm> string_val() && {
+ ICING_RETURN_IF_ERROR(CheckDataType(DataType::kString));
+ return std::move(query_term_);
+ }
+
+ libtextclassifier3::StatusOr<const QueryTerm*> text_val() const& {
+ ICING_RETURN_IF_ERROR(CheckDataType(DataType::kText));
+ return &query_term_;
+ }
+ libtextclassifier3::StatusOr<QueryTerm> text_val() && {
+ ICING_RETURN_IF_ERROR(CheckDataType(DataType::kText));
+ return std::move(query_term_);
+ }
+
+ libtextclassifier3::StatusOr<int64_t> long_val() {
+ ICING_RETURN_IF_ERROR(ParseInt());
+ return long_val_;
+ }
+
+  // Attempts to interpret the value as an int. A pending value can be parsed
+  // as an int under two circumstances:
+  //   1. It holds a kText value that can be parsed to an int
+  //   2. It already holds a kLong value
+  // If #1 applies, then the parsed value will be stored in long_val_ and
+  // data_type_ will be updated to kLong.
+ // RETURNS:
+ // - OK, if able to successfully parse the value into a long
+ // - INVALID_ARGUMENT if the value could not be parsed as a long
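+  //
+  // Illustrative example: a kText value holding "42" can be consumed as a
+  // long:
+  //   PendingValue v = PendingValue::CreateTextPendingValue(
+  //       {/*term=*/"42", /*raw_term=*/"42", /*is_prefix_val=*/false});
+  //   ICING_ASSIGN_OR_RETURN(int64_t val, v.long_val());  // val == 42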
+ libtextclassifier3::Status ParseInt();
+
+ DataType data_type() const { return data_type_; }
+
+ private:
+ explicit PendingValue(QueryTerm query_term, DataType data_type)
+ : query_term_(std::move(query_term)), data_type_(data_type) {}
+
+ libtextclassifier3::Status CheckDataType(DataType required_data_type) const {
+ if (data_type_ == required_data_type) {
+ return libtextclassifier3::Status::OK;
+ }
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("Unable to retrieve value of type '",
+ std::to_string(static_cast<int>(required_data_type)),
+ "' from pending value of type '",
+ std::to_string(static_cast<int>(data_type_)), "'"));
+ }
+
+ // iterator_ will be populated when data_type_ is kDocumentIterator.
+ std::unique_ptr<DocHitInfoIterator> iterator_;
+
+  // string_vals_ will be populated when data_type_ is kStringList.
+ std::vector<std::string> string_vals_;
+
+  // query_term_ will be populated when data_type_ is kString or kText.
+ QueryTerm query_term_;
+
+ // long_val_ will be populated when data_type_ is kLong - after a successful
+ // call to ParseInt.
+ int64_t long_val_;
+ DataType data_type_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_ADVANCED_QUERY_PARSER_PENDING_VALUE_H_
diff --git a/icing/query/advanced_query_parser/query-visitor.cc b/icing/query/advanced_query_parser/query-visitor.cc
new file mode 100644
index 0000000..31da959
--- /dev/null
+++ b/icing/query/advanced_query_parser/query-visitor.cc
@@ -0,0 +1,963 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/advanced_query_parser/query-visitor.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/index/iterator/doc-hit-info-iterator-all-document-id.h"
+#include "icing/index/iterator/doc-hit-info-iterator-and.h"
+#include "icing/index/iterator/doc-hit-info-iterator-none.h"
+#include "icing/index/iterator/doc-hit-info-iterator-not.h"
+#include "icing/index/iterator/doc-hit-info-iterator-or.h"
+#include "icing/index/iterator/doc-hit-info-iterator-property-in-document.h"
+#include "icing/index/iterator/doc-hit-info-iterator-property-in-schema.h"
+#include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/property-existence-indexing-handler.h"
+#include "icing/query/advanced_query_parser/lexer.h"
+#include "icing/query/advanced_query_parser/param.h"
+#include "icing/query/advanced_query_parser/parser.h"
+#include "icing/query/advanced_query_parser/pending-value.h"
+#include "icing/query/advanced_query_parser/util/string-util.h"
+#include "icing/query/query-features.h"
+#include "icing/schema/property-util.h"
+#include "icing/schema/section.h"
+#include "icing/tokenization/token.h"
+#include "icing/tokenization/tokenizer.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+struct CreateList {
+ libtextclassifier3::StatusOr<PendingValue> operator()(
+ std::vector<PendingValue>&& args) const {
+ std::vector<std::string> values;
+ values.reserve(args.size());
+ for (PendingValue& arg : args) {
+ QueryTerm string_val = std::move(arg).string_val().ValueOrDie();
+ values.push_back(std::move(string_val.term));
+ }
+ return PendingValue(std::move(values));
+ }
+};
+
+bool IsNumericComparator(std::string_view operator_text) {
+ if (operator_text.length() < 1 || operator_text.length() > 2) {
+ return false;
+ }
+ // TODO(tjbarron) decide how/if to support !=
+ return operator_text == "<" || operator_text == ">" ||
+ operator_text == "==" || operator_text == "<=" ||
+ operator_text == ">=";
+}
+
+bool IsSupportedNaryOperator(std::string_view operator_text) {
+ return IsNumericComparator(operator_text) || operator_text == "AND" ||
+ operator_text == "OR" || operator_text == ":";
+}
+
+struct Int64Range {
+ int64_t low;
+ int64_t high;
+};
+
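+// Computes the inclusive range [low, high] implied by applying operator_text
+// to int_value. Illustrative examples: ("<", 5) yields [INT64_MIN, 4];
+// ("==", 5) yields [5, 5]; (">", 5) yields [6, INT64_MAX].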
+libtextclassifier3::StatusOr<Int64Range> GetInt64Range(
+ std::string_view operator_text, int64_t int_value) {
+ Int64Range range = {std::numeric_limits<int64_t>::min(),
+ std::numeric_limits<int64_t>::max()};
+ if (operator_text == "<") {
+ if (int_value == std::numeric_limits<int64_t>::min()) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot specify < INT64_MIN in query expression.");
+ }
+ range.high = int_value - 1;
+ } else if (operator_text == "<=") {
+ range.high = int_value;
+ } else if (operator_text == "==") {
+ range.high = int_value;
+ range.low = int_value;
+ } else if (operator_text == ">=") {
+ range.low = int_value;
+ } else if (operator_text == ">") {
+ if (int_value == std::numeric_limits<int64_t>::max()) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot specify > INT64_MAX in query expression.");
+ }
+ range.low = int_value + 1;
+ }
+ return range;
+}
+
+} // namespace
+
+void QueryVisitor::PendingPropertyRestricts::AddValidRestricts(
+ std::set<std::string> new_restricts) {
+ if (!has_active_property_restricts()) {
+ pending_property_restricts_.push_back(std::move(new_restricts));
+ return;
+ }
+
+ // There is an active property restrict already in effect. To determine the
+ // updated active property restrict being applied at this level, we need to
+ // calculate the intersection of new_restricts and
+ // active_property_restricts.
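+  // Illustrative example: if the active restricts are {"body", "subject"} and
+  // new_restricts is {"body", "title"}, the set pushed below is the
+  // intersection {"body"}.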
+ const std::set<std::string>& active_restricts = active_property_restricts();
+ auto active_restricts_itr = active_restricts.begin();
+ for (auto new_restricts_itr = new_restricts.begin();
+ new_restricts_itr != new_restricts.end();) {
+ while (active_restricts_itr != active_restricts.end() &&
+ *active_restricts_itr < *new_restricts_itr) {
+      // active_restricts_itr is behind new_restricts_itr.
+ ++active_restricts_itr;
+ }
+ if (active_restricts_itr == active_restricts.end()) {
+      // There's nothing left in active restricts. Everything at
+      // new_restricts_itr and beyond should be removed.
+ new_restricts_itr =
+ new_restricts.erase(new_restricts_itr, new_restricts.end());
+ } else if (*active_restricts_itr > *new_restricts_itr) {
+      // new_restricts_itr points to an element not present in
+      // active_restricts.
+ new_restricts_itr = new_restricts.erase(new_restricts_itr);
+ } else {
+      // The element that new_restricts_itr points to is present in
+      // active_restricts.
+ ++new_restricts_itr;
+ }
+ }
+ pending_property_restricts_.push_back(std::move(new_restricts));
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
+QueryVisitor::CreateTermIterator(const QueryTerm& query_term) {
+ if (query_term.is_prefix_val) {
+ // '*' prefix operator was added in list filters
+ features_.insert(kListFilterQueryLanguageFeature);
+ }
+ TermMatchType::Code match_type = GetTermMatchType(query_term.is_prefix_val);
+ int unnormalized_term_start =
+ query_term.raw_term.data() - raw_query_text_.data();
+ if (!processing_not_) {
+ // 1. Add term to property_query_terms_map
+ if (pending_property_restricts_.has_active_property_restricts()) {
+ for (const std::string& property_restrict :
+ pending_property_restricts_.active_property_restricts()) {
+ property_query_terms_map_[property_restrict].insert(query_term.term);
+ }
+ } else {
+ property_query_terms_map_[""].insert(query_term.term);
+ }
+
+ // 2. If needed add term iterator to query_term_iterators_ map.
+ if (needs_term_frequency_info_) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<DocHitInfoIterator> term_iterator,
+ index_.GetIterator(query_term.term, unnormalized_term_start,
+ query_term.raw_term.length(), kSectionIdMaskAll,
+ match_type_, needs_term_frequency_info_));
+ query_term_iterators_[query_term.term] =
+ std::make_unique<DocHitInfoIteratorFilter>(
+ std::move(term_iterator), &document_store_, &schema_store_,
+ filter_options_, current_time_ms_);
+ }
+ }
+
+ // 3. Add the term iterator.
+ return index_.GetIterator(query_term.term, unnormalized_term_start,
+ query_term.raw_term.length(), kSectionIdMaskAll,
+ match_type, needs_term_frequency_info_);
+}
+
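+// Registers the built-in query functions. Illustrative example queries that
+// resolve to these functions:
+//   search("foo bar", createList("subject", "body"))
+//   propertyDefined("sender.name")
+//   hasProperty("price")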
+void QueryVisitor::RegisterFunctions() {
+ // std::vector<std::string> createList(std::string...);
+  Function create_list_function =
+ Function::Create(DataType::kStringList, "createList",
+ {Param(DataType::kString, Cardinality::kRequired),
+ Param(DataType::kString, Cardinality::kVariable)},
+ CreateList())
+ .ValueOrDie();
+ registered_functions_.insert(
+      {create_list_function.name(), std::move(create_list_function)});
+
+ // DocHitInfoIterator search(std::string);
+ // DocHitInfoIterator search(std::string, std::vector<std::string>);
+ auto search_eval = [this](std::vector<PendingValue>&& args) {
+ return this->SearchFunction(std::move(args));
+ };
+ Function search_function =
+ Function::Create(DataType::kDocumentIterator, "search",
+ {Param(DataType::kString),
+ Param(DataType::kStringList, Cardinality::kOptional)},
+ std::move(search_eval))
+ .ValueOrDie();
+ registered_functions_.insert(
+ {search_function.name(), std::move(search_function)});
+
+ // DocHitInfoIterator propertyDefined(std::string);
+ auto property_defined = [this](std::vector<PendingValue>&& args) {
+ return this->PropertyDefinedFunction(std::move(args));
+ };
+ Function property_defined_function =
+ Function::Create(DataType::kDocumentIterator, "propertyDefined",
+ {Param(DataType::kString)}, std::move(property_defined))
+ .ValueOrDie();
+ registered_functions_.insert(
+ {property_defined_function.name(), std::move(property_defined_function)});
+
+ // DocHitInfoIterator hasProperty(std::string);
+ auto has_property = [this](std::vector<PendingValue>&& args) {
+ return this->HasPropertyFunction(std::move(args));
+ };
+ Function has_property_function =
+ Function::Create(DataType::kDocumentIterator, "hasProperty",
+ {Param(DataType::kString)}, std::move(has_property))
+ .ValueOrDie();
+ registered_functions_.insert(
+ {has_property_function.name(), std::move(has_property_function)});
+}
+
+libtextclassifier3::StatusOr<PendingValue> QueryVisitor::SearchFunction(
+ std::vector<PendingValue>&& args) {
+ // The second arg (if present) is a list of sections to restrict to.
+ if (args.size() == 2) {
+ std::set<std::string> new_restricts;
+ std::vector<std::string> property_restricts =
+ std::move(args.at(1)).string_vals().ValueOrDie();
+ for (std::string& property_restrict : property_restricts) {
+ new_restricts.insert(std::move(property_restrict));
+ }
+ pending_property_restricts_.AddValidRestricts(std::move(new_restricts));
+ if (pending_property_restricts_.active_property_restricts().empty()) {
+ pending_property_restricts_.PopRestricts();
+ return PendingValue(std::make_unique<DocHitInfoIteratorNone>());
+ }
+ }
+
+ // The first arg is guaranteed to be a STRING at this point. It should be safe
+ // to call ValueOrDie.
+ const QueryTerm* query = args.at(0).string_val().ValueOrDie();
+ Lexer lexer(query->term, Lexer::Language::QUERY);
+ ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ std::unique_ptr<DocHitInfoIterator> iterator;
+ QueryResults query_result;
+ if (tree_root == nullptr) {
+ iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>(
+ document_store_.last_added_document_id());
+ } else {
+ QueryVisitor query_visitor(
+ &index_, &numeric_index_, &document_store_, &schema_store_,
+ &normalizer_, &tokenizer_, query->raw_term, filter_options_,
+ match_type_, needs_term_frequency_info_, pending_property_restricts_,
+ processing_not_, current_time_ms_);
+ tree_root->Accept(&query_visitor);
+ ICING_ASSIGN_OR_RETURN(query_result,
+ std::move(query_visitor).ConsumeResults());
+ iterator = std::move(query_result.root_iterator);
+ }
+
+ // Update members based on results of processing the query.
+ if (args.size() == 2 &&
+ pending_property_restricts_.has_active_property_restricts()) {
+ iterator = DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(iterator), &document_store_, &schema_store_,
+ pending_property_restricts_.active_property_restricts(),
+ current_time_ms_);
+ pending_property_restricts_.PopRestricts();
+ }
+ if (!processing_not_) {
+ std::move(
+ query_result.query_term_iterators.begin(),
+ query_result.query_term_iterators.end(),
+ std::inserter(query_term_iterators_, query_term_iterators_.end()));
+
+ std::move(query_result.query_terms.begin(), query_result.query_terms.end(),
+ std::inserter(property_query_terms_map_,
+ property_query_terms_map_.end()));
+ }
+ std::move(query_result.features_in_use.begin(),
+ query_result.features_in_use.end(),
+ std::inserter(features_, features_.end()));
+ return PendingValue(std::move(iterator));
+}
+
+libtextclassifier3::StatusOr<PendingValue>
+QueryVisitor::PropertyDefinedFunction(std::vector<PendingValue>&& args) {
+ // The first arg is guaranteed to be a STRING at this point. It should be safe
+ // to call ValueOrDie.
+ const QueryTerm* member = args.at(0).string_val().ValueOrDie();
+
+ std::unique_ptr<DocHitInfoIterator> all_docs_iterator =
+ std::make_unique<DocHitInfoIteratorAllDocumentId>(
+ document_store_.last_added_document_id());
+
+ std::set<std::string> target_sections = {std::move(member->term)};
+ std::unique_ptr<DocHitInfoIterator> property_in_schema_iterator =
+ std::make_unique<DocHitInfoIteratorPropertyInSchema>(
+ std::move(all_docs_iterator), &document_store_, &schema_store_,
+ std::move(target_sections), current_time_ms_);
+
+ features_.insert(kListFilterQueryLanguageFeature);
+
+ return PendingValue(std::move(property_in_schema_iterator));
+}
+
+libtextclassifier3::StatusOr<PendingValue> QueryVisitor::HasPropertyFunction(
+ std::vector<PendingValue>&& args) {
+ // The first arg is guaranteed to be a STRING at this point. It should be safe
+ // to call ValueOrDie.
+ const std::string& property_path = args.at(0).string_val().ValueOrDie()->term;
+
+ // Perform an exact search for the property existence metadata token.
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<DocHitInfoIterator> meta_hit_iterator,
+ index_.GetIterator(
+ absl_ports::StrCat(kPropertyExistenceTokenPrefix, property_path),
+ /*term_start_index=*/0,
+ /*unnormalized_term_length=*/0, kSectionIdMaskAll,
+ TermMatchType::EXACT_ONLY,
+ /*need_hit_term_frequency=*/false));
+
+ std::unique_ptr<DocHitInfoIterator> property_in_document_iterator =
+ std::make_unique<DocHitInfoIteratorPropertyInDocument>(
+ std::move(meta_hit_iterator));
+
+ features_.insert(kHasPropertyFunctionFeature);
+
+ return PendingValue(std::move(property_in_document_iterator));
+}
+
+libtextclassifier3::StatusOr<int64_t> QueryVisitor::PopPendingIntValue() {
+ if (pending_values_.empty()) {
+ return absl_ports::InvalidArgumentError("Unable to retrieve int value.");
+ }
+ ICING_ASSIGN_OR_RETURN(int64_t int_value, pending_values_.top().long_val());
+ pending_values_.pop();
+ return int_value;
+}
+
+libtextclassifier3::StatusOr<QueryTerm> QueryVisitor::PopPendingStringValue() {
+ if (pending_values_.empty()) {
+ return absl_ports::InvalidArgumentError("Unable to retrieve string value.");
+ }
+ ICING_ASSIGN_OR_RETURN(QueryTerm string_value,
+ std::move(pending_values_.top()).string_val());
+ pending_values_.pop();
+ return string_value;
+}
+
+libtextclassifier3::StatusOr<QueryTerm> QueryVisitor::PopPendingTextValue() {
+ if (pending_values_.empty()) {
+ return absl_ports::InvalidArgumentError("Unable to retrieve text value.");
+ }
+ ICING_ASSIGN_OR_RETURN(QueryTerm text_value,
+ std::move(pending_values_.top()).text_val());
+ pending_values_.pop();
+ return text_value;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
+QueryVisitor::PopPendingIterator() {
+ if (pending_values_.empty() || pending_values_.top().is_placeholder()) {
+ return absl_ports::InvalidArgumentError("Unable to retrieve iterator.");
+ }
+ if (pending_values_.top().data_type() == DataType::kDocumentIterator) {
+ std::unique_ptr<DocHitInfoIterator> iterator =
+ std::move(pending_values_.top()).iterator().ValueOrDie();
+ pending_values_.pop();
+ return iterator;
+ } else if (pending_values_.top().data_type() == DataType::kString) {
+ features_.insert(kVerbatimSearchFeature);
+ ICING_ASSIGN_OR_RETURN(QueryTerm string_value, PopPendingStringValue());
+ return CreateTermIterator(std::move(string_value));
+ } else {
+ ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> token_itr,
+ tokenizer_.Tokenize(text_value.term));
+ std::string normalized_term;
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ // The tokenizer will produce 1+ tokens out of the text. The prefix operator
+ // only applies to the final token.
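+    // Illustrative example: the text value "foo-bar*" would typically be
+    // segmented into tokens "foo" and "bar", with the prefix flag applying
+    // only to "bar"; the per-token iterators are then AND'd together below.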
+ bool reached_final_token = !token_itr->Advance();
+    // raw_text is the portion of text_value.raw_term that hasn't yet been
+    // matched to any of the tokens that we've processed. raw_token will hold
+    // the portion of raw_text that corresponds to the token currently being
+    // processed.
+ std::string_view raw_text = text_value.raw_term;
+ std::string_view raw_token;
+ while (!reached_final_token) {
+ std::vector<Token> tokens = token_itr->GetTokens();
+ if (tokens.size() > 1) {
+ // The tokenizer iterator iterates between token groups. In practice,
+ // the tokenizer used with QueryVisitor (PlainTokenizer) will always
+ // only produce a single token per token group.
+ return absl_ports::InvalidArgumentError(
+ "Encountered unexpected token group with >1 tokens.");
+ }
+
+ reached_final_token = !token_itr->Advance();
+ const Token& token = tokens.at(0);
+ if (reached_final_token && token.text.length() == raw_text.length()) {
+        // Unescaped tokens are strictly smaller than their escaped
+        // counterparts. This means that if we're at the final token and
+        // token.text.length() equals raw_text.length(), then all of raw_text
+        // must correspond to this token.
+ raw_token = raw_text;
+ } else {
+ ICING_ASSIGN_OR_RETURN(raw_token, string_util::FindEscapedToken(
+ raw_text, token.text));
+ }
+ normalized_term = normalizer_.NormalizeTerm(token.text);
+ QueryTerm term_value{std::move(normalized_term), raw_token,
+ reached_final_token && text_value.is_prefix_val};
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator,
+ CreateTermIterator(std::move(term_value)));
+ iterators.push_back(std::move(iterator));
+
+      // Remove raw_token from raw_text now that we've processed it.
+ const char* escaped_token_end = raw_token.data() + raw_token.length();
+ raw_text = raw_text.substr(escaped_token_end - raw_text.data());
+ }
+
+ // Finally, create an And Iterator. If there's only a single term here, then
+ // it will just return that term iterator. Otherwise, segmented text is
+ // treated as a group of terms AND'd together.
+ return CreateAndIterator(std::move(iterators));
+ }
+}
+
+libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DocHitInfoIterator>>>
+QueryVisitor::PopAllPendingIterators() {
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators;
+ while (!pending_values_.empty() && !pending_values_.top().is_placeholder()) {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> itr,
+ PopPendingIterator());
+ iterators.push_back(std::move(itr));
+ }
+ if (pending_values_.empty()) {
+ return absl_ports::InvalidArgumentError(
+ "Unable to retrieve expected iterators.");
+ }
+ // Iterators will be in reverse order because we retrieved them from the
+ // stack. Reverse them to get back to the original ordering.
+ std::reverse(iterators.begin(), iterators.end());
+ return iterators;
+}
+
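+// Illustrative example: for the expression `price < 10`, the first child
+// yields the property "price" and the second child the integer 10, producing
+// a numeric iterator over the range [INT64_MIN, 9] for property "price".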
+libtextclassifier3::Status QueryVisitor::ProcessNumericComparator(
+ const NaryOperatorNode* node) {
+ if (node->children().size() != 2) {
+ return absl_ports::InvalidArgumentError("Expected 2 children.");
+ }
+
+ // 1. Put in a placeholder PendingValue
+ pending_values_.push(PendingValue());
+
+ // 2. The first child is the property to restrict by.
+ node->children().at(0)->Accept(this);
+ if (has_pending_error()) {
+ return std::move(pending_error_);
+ }
+ ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
+
+ if (text_value.is_prefix_val) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot use prefix operator '*' with a property name!");
+ }
+
+  // If there is an active property restrict and this property is not present
+  // in the active restrict set, then it's not satisfiable.
+ if (pending_property_restricts_.has_active_property_restricts() &&
+ pending_property_restricts_.active_property_restricts().find(
+ text_value.term) ==
+ pending_property_restricts_.active_property_restricts().end()) {
+ // The property restrict can't be satisfiable. Pop the placeholder that was
+ // just added and push a FALSE iterator.
+ pending_values_.pop();
+ pending_values_.push(
+ PendingValue(std::make_unique<DocHitInfoIteratorNone>()));
+ return libtextclassifier3::Status::OK;
+ }
+
+ // 3. The second child should be parseable as an integer value.
+ expecting_numeric_arg_ = true;
+ node->children().at(1)->Accept(this);
+ expecting_numeric_arg_ = false;
+ ICING_ASSIGN_OR_RETURN(int64_t int_value, PopPendingIntValue());
+
+ // 4. Check for the placeholder.
+ if (!pending_values_.top().is_placeholder()) {
+ return absl_ports::InvalidArgumentError(
+ "Error processing arguments for node.");
+ }
+ pending_values_.pop();
+
+ // 5. Create the iterator and push it onto pending_values_.
+ ICING_ASSIGN_OR_RETURN(Int64Range range,
+ GetInt64Range(node->operator_text(), int_value));
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator,
+ numeric_index_.GetIterator(
+ text_value.term, range.low, range.high,
+ document_store_, schema_store_, current_time_ms_));
+
+ features_.insert(kNumericSearchFeature);
+ pending_values_.push(PendingValue(std::move(iterator)));
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<PendingValue> QueryVisitor::ProcessAndOperator(
+ const NaryOperatorNode* node) {
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators,
+ PopAllPendingIterators());
+ return PendingValue(CreateAndIterator(std::move(iterators)));
+}
+
+libtextclassifier3::StatusOr<PendingValue> QueryVisitor::ProcessOrOperator(
+ const NaryOperatorNode* node) {
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<std::unique_ptr<DocHitInfoIterator>> iterators,
+ PopAllPendingIterators());
+ return PendingValue(CreateOrIterator(std::move(iterators)));
+}
+
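+// Illustrative example: for the negated literal `-3`, the child visit pushes
+// the text value "3"; this function pops it, prepends "-", and reparses the
+// result as the long value -3.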
+libtextclassifier3::Status QueryVisitor::ProcessNegationOperator(
+ const UnaryOperatorNode* node) {
+ // 1. Put in a placeholder PendingValue
+ pending_values_.push(PendingValue());
+
+ // 2. Visit child
+ node->child()->Accept(this);
+ if (has_pending_error()) {
+ return std::move(pending_error_);
+ }
+
+ if (pending_values_.size() < 2) {
+ return absl_ports::InvalidArgumentError(
+ "Visit unary operator child didn't correctly add pending values.");
+ }
+
+ // 3. We want to preserve the original text of the integer value, append our
+ // minus and *then* parse as an int.
+ ICING_ASSIGN_OR_RETURN(QueryTerm int_text_val, PopPendingTextValue());
+ int_text_val.term = absl_ports::StrCat("-", int_text_val.term);
+ PendingValue pending_value =
+ PendingValue::CreateTextPendingValue(std::move(int_text_val));
+ ICING_RETURN_IF_ERROR(pending_value.long_val());
+
+ // We've parsed our integer value successfully. Pop our placeholder, push it
+ // on to the stack and return successfully.
+ if (!pending_values_.top().is_placeholder()) {
+ return absl_ports::InvalidArgumentError(
+ "Error processing arguments for node.");
+ }
+ pending_values_.pop();
+ pending_values_.push(std::move(pending_value));
+ return libtextclassifier3::Status::OK;
+}
+
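+// Illustrative example: for `NOT foo`, the child visit produces the term
+// iterator for "foo", which is then wrapped in a DocHitInfoIteratorNot over
+// all document ids up to last_added_document_id().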
+libtextclassifier3::Status QueryVisitor::ProcessNotOperator(
+ const UnaryOperatorNode* node) {
+ // TODO(b/265312785) Consider implementing query optimization when we run into
+ // nested NOTs. This would allow us to simplify a query like "NOT (-foo)" to
+ // just "foo". This would also require more complicate rewrites as we would
+ // need to do things like rewrite "NOT (-a OR b)" as "a AND -b" and
+ // "NOT (price < 5)" as "price >= 5".
+ // 1. Put in a placeholder PendingValue
+ pending_values_.push(PendingValue());
+ // Toggle whatever the current value of 'processing_not_' is before visiting
+ // the children.
+ processing_not_ = !processing_not_;
+
+ // 2. Visit child
+ node->child()->Accept(this);
+ if (has_pending_error()) {
+ return std::move(pending_error_);
+ }
+
+ if (pending_values_.size() < 2) {
+ return absl_ports::InvalidArgumentError(
+ "Visit unary operator child didn't correctly add pending values.");
+ }
+
+ // 3. Retrieve the delegate iterator
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> delegate,
+ PopPendingIterator());
+
+ // 4. Check for the placeholder.
+ if (!pending_values_.top().is_placeholder()) {
+ return absl_ports::InvalidArgumentError(
+ "Error processing arguments for node.");
+ }
+ pending_values_.pop();
+
+ pending_values_.push(PendingValue(std::make_unique<DocHitInfoIteratorNot>(
+ std::move(delegate), document_store_.last_added_document_id())));
+
+ // Untoggle whatever the current value of 'processing_not_' is now that we've
+ // finished processing this NOT.
+ processing_not_ = !processing_not_;
+ return libtextclassifier3::Status::OK;
+}
+
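+// Illustrative example: for `subject:foo`, the first child yields the
+// property "subject", which becomes an active restrict while the second child
+// ("foo") is evaluated; the resulting iterator is then section-restricted to
+// "subject".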
+libtextclassifier3::Status QueryVisitor::ProcessHasOperator(
+ const NaryOperatorNode* node) {
+ if (node->children().size() != 2) {
+ return absl_ports::InvalidArgumentError("Expected 2 children.");
+ }
+
+ // 1. Put in a placeholder PendingValue
+ pending_values_.push(PendingValue());
+
+ // 2. Visit the first child - the property.
+ node->children().at(0)->Accept(this);
+ if (has_pending_error()) {
+ return pending_error_;
+ }
+ ICING_ASSIGN_OR_RETURN(QueryTerm text_value, PopPendingTextValue());
+ if (text_value.is_prefix_val) {
+ return absl_ports::InvalidArgumentError(
+ "Cannot use prefix operator '*' with a property name!");
+ }
+ pending_property_restricts_.AddValidRestricts({text_value.term});
+
+  // Just added a restrict - if there are no active property restricts, then
+  // that must be because this restrict is unsatisfiable.
+ if (pending_property_restricts_.active_property_restricts().empty()) {
+ // The property restrict can't be satisfiable. Pop the placeholder that was
+ // just added and push a FALSE iterator.
+ pending_property_restricts_.PopRestricts();
+ pending_values_.pop();
+ pending_values_.push(
+ PendingValue(std::make_unique<DocHitInfoIteratorNone>()));
+ return libtextclassifier3::Status::OK;
+ }
+
+ // 3. Visit the second child - the argument.
+ node->children().at(1)->Accept(this);
+ if (has_pending_error()) {
+ return pending_error_;
+ }
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> delegate,
+ PopPendingIterator());
+
+ // 4. Check for the placeholder.
+ if (!pending_values_.top().is_placeholder()) {
+ return absl_ports::InvalidArgumentError(
+ "Error processing arguments for node.");
+ }
+ pending_values_.pop();
+ pending_property_restricts_.PopRestricts();
+
+ std::set<std::string> property_restricts = {std::move(text_value.term)};
+ pending_values_.push(
+ PendingValue(DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(delegate), &document_store_, &schema_store_,
+ std::move(property_restricts), current_time_ms_)));
+ return libtextclassifier3::Status::OK;
+}
+
+void QueryVisitor::VisitFunctionName(const FunctionNameNode* node) {
+ pending_error_ = absl_ports::UnimplementedError(
+ "Function Name node visiting not implemented yet.");
+}
+
+void QueryVisitor::VisitString(const StringNode* node) {
+ // A STRING node can only be a term. Create the iterator now.
+ auto unescaped_string_or = string_util::UnescapeStringValue(node->value());
+ if (!unescaped_string_or.ok()) {
+ pending_error_ = std::move(unescaped_string_or).status();
+ return;
+ }
+ std::string unescaped_string = std::move(unescaped_string_or).ValueOrDie();
+ QueryTerm val{std::move(unescaped_string), node->raw_value(),
+ node->is_prefix()};
+ pending_values_.push(PendingValue::CreateStringPendingValue(std::move(val)));
+}
+
+void QueryVisitor::VisitText(const TextNode* node) {
+ // TEXT nodes could either be a term (and will become DocHitInfoIteratorTerm)
+ // or a property name. As such, we just push the TEXT value into pending
+ // values and determine which it is at a later point.
+ QueryTerm val{std::move(node->value()), node->raw_value(), node->is_prefix()};
+ pending_values_.push(PendingValue::CreateTextPendingValue(std::move(val)));
+}
+
+void QueryVisitor::VisitMember(const MemberNode* node) {
+ if (node->children().empty()) {
+ pending_error_ =
+ absl_ports::InvalidArgumentError("Encountered malformed member node.");
+ return;
+ }
+
+ // 1. Put in a placeholder PendingValue
+ pending_values_.push(PendingValue());
+
+ // 2. Visit the children.
+ for (const std::unique_ptr<TextNode>& child : node->children()) {
+ child->Accept(this);
+ if (has_pending_error()) {
+ return;
+ }
+ }
+
+ // 3. Now process the results of the children and produce a single pending
+ // value representing this member.
+ PendingValue pending_value;
+ if (node->children().size() == 1) {
+    // 3a. If this member has only a single child, then the pending value
+    // produced by that child is the final value produced by this member.
+ pending_value = std::move(pending_values_.top());
+ pending_values_.pop();
+ } else {
+ // 3b. Retrieve the values of all children and concatenate them into a
+ // single value.
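+    // Illustrative example: a member node with child text nodes "sender" and
+    // "name" is concatenated into the single text value "sender.name".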
+ libtextclassifier3::StatusOr<QueryTerm> member_or;
+ std::vector<std::string> members;
+ QueryTerm text_val;
+ const char* start = nullptr;
+ const char* end = nullptr;
+ while (!pending_values_.empty() &&
+ !pending_values_.top().is_placeholder()) {
+ member_or = PopPendingTextValue();
+ if (!member_or.ok()) {
+ pending_error_ = std::move(member_or).status();
+ return;
+ }
+ text_val = std::move(member_or).ValueOrDie();
+ if (text_val.is_prefix_val) {
+ pending_error_ = absl_ports::InvalidArgumentError(
+ "Cannot use prefix operator '*' within a property name!");
+ return;
+ }
+ if (start == nullptr) {
+ start = text_val.raw_term.data();
+ end = text_val.raw_term.data() + text_val.raw_term.length();
+ } else {
+ start = std::min(start, text_val.raw_term.data());
+        end = std::max(
+            end, text_val.raw_term.data() + text_val.raw_term.length());
+ }
+ members.push_back(std::move(text_val.term));
+ }
+ QueryTerm member;
+ member.term = absl_ports::StrJoin(members.rbegin(), members.rend(),
+ property_util::kPropertyPathSeparator);
+ member.raw_term = std::string_view(start, end - start);
+ member.is_prefix_val = false;
+ pending_value = PendingValue::CreateTextPendingValue(std::move(member));
+ }
+
+  // 4. If pending_values_ is empty, then our placeholder somehow disappeared.
+ if (pending_values_.empty()) {
+ pending_error_ = absl_ports::InvalidArgumentError(
+ "Error processing arguments for member node.");
+ return;
+ }
+ pending_values_.pop();
+
+ pending_values_.push(std::move(pending_value));
+}
+
+void QueryVisitor::VisitFunction(const FunctionNode* node) {
+ // 1. Get the associated function.
+ auto itr = registered_functions_.find(node->function_name()->value());
+ if (itr == registered_functions_.end()) {
+ pending_error_ = absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Function ", node->function_name()->value(), " is not supported."));
+ return;
+ }
+
+ // 2. Put in a placeholder PendingValue
+ pending_values_.push(PendingValue());
+
+ // 3. Visit the children.
+ for (const std::unique_ptr<Node>& arg : node->args()) {
+ arg->Accept(this);
+ if (has_pending_error()) {
+ return;
+ }
+ }
+
+ // 4. Collect the arguments and evaluate the function.
+ std::vector<PendingValue> args;
+ while (!pending_values_.empty() && !pending_values_.top().is_placeholder()) {
+ args.push_back(std::move(pending_values_.top()));
+ pending_values_.pop();
+ }
+ std::reverse(args.begin(), args.end());
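+  // Args were popped off the stack in reverse; the reverse above restores
+  // source order, so e.g. createList("a", "b") receives {"a", "b"}.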
+ const Function& function = itr->second;
+ auto eval_result = function.Eval(std::move(args));
+ if (!eval_result.ok()) {
+ pending_error_ = std::move(eval_result).status();
+ return;
+ }
+
+ // 5. Pop placeholder in pending_values and add the result of our function.
+ pending_values_.pop();
+ pending_values_.push(std::move(eval_result).ValueOrDie());
+
+ // Support for custom functions was added in list filters.
+ features_.insert(kListFilterQueryLanguageFeature);
+}
+
+// TODO(b/265312785) Clarify handling of the interaction between HAS and NOT.
+// Currently, `prop1:(NOT foo bar)` will not match any documents. Likewise,
+// `search("NOT foo bar", createList("prop1"))` will not match any documents.
+//
+// We should either confirm that this is the desired behavior or consider
+// rewriting these queries so that they're interpreted as
+// `NOT prop1:foo AND prop1:bar` and
+// `NOT search("foo", createList("prop1"))
+// AND search("bar", createList("prop1"))`
+void QueryVisitor::VisitUnaryOperator(const UnaryOperatorNode* node) {
+ bool is_minus = node->operator_text() == "MINUS";
+ if (node->operator_text() != "NOT" && !is_minus) {
+ pending_error_ = absl_ports::UnimplementedError(
+ absl_ports::StrCat("Visiting for unary operator ",
+ node->operator_text(), " not implemented yet."));
+ return;
+ }
+
+ libtextclassifier3::Status status;
+ if (expecting_numeric_arg_ && is_minus) {
+ // If the operator is a MINUS ('-') and we're at the child of a numeric
+ // comparator, then this must be a negation ('-3')
+ status = ProcessNegationOperator(node);
+ } else {
+ status = ProcessNotOperator(node);
+ }
+
+ if (!status.ok()) {
+ pending_error_ = std::move(status);
+ }
+
+ if (!is_minus ||
+ pending_property_restricts_.has_active_property_restricts() ||
+ processing_not_) {
+ // 'NOT' operator was added in list filters.
+    // Likewise, mixing property restricts and NOTs was made valid in list
+ // filters.
+ features_.insert(kListFilterQueryLanguageFeature);
+ }
+}
+
+void QueryVisitor::VisitNaryOperator(const NaryOperatorNode* node) {
+ if (!IsSupportedNaryOperator(node->operator_text())) {
+ pending_error_ = absl_ports::UnimplementedError(
+ "No support for any non-numeric operators.");
+ return;
+ }
+
+ if (pending_property_restricts_.has_active_property_restricts() ||
+ processing_not_) {
+    // Mixing property restricts and NOT with compound statements was
+ // added in list filters.
+ features_.insert(kListFilterQueryLanguageFeature);
+ }
+
+ if (node->operator_text() == ":") {
+ libtextclassifier3::Status status = ProcessHasOperator(node);
+ if (!status.ok()) {
+ pending_error_ = std::move(status);
+ }
+ return;
+ } else if (IsNumericComparator(node->operator_text())) {
+ libtextclassifier3::Status status = ProcessNumericComparator(node);
+ if (!status.ok()) {
+ pending_error_ = std::move(status);
+ }
+ return;
+ }
+
+ // 1. Put in a placeholder PendingValue
+ pending_values_.push(PendingValue());
+
+ // 2. Visit the children.
+ for (int i = 0; i < node->children().size(); ++i) {
+ node->children().at(i)->Accept(this);
+ if (has_pending_error()) {
+ return;
+ }
+ }
+
+ // 3. Retrieve the pending value for this node.
+ libtextclassifier3::StatusOr<PendingValue> pending_value_or;
+ if (node->operator_text() == "AND") {
+ pending_value_or = ProcessAndOperator(node);
+ } else if (node->operator_text() == "OR") {
+ pending_value_or = ProcessOrOperator(node);
+ }
+ if (!pending_value_or.ok()) {
+ pending_error_ = std::move(pending_value_or).status();
+ return;
+ }
+ PendingValue pending_value = std::move(pending_value_or).ValueOrDie();
+
+ // 4. Check for the placeholder.
+ if (!pending_values_.top().is_placeholder()) {
+ pending_error_ = absl_ports::InvalidArgumentError(
+ "Error processing arguments for node.");
+ return;
+ }
+ pending_values_.pop();
+
+ pending_values_.push(std::move(pending_value));
+}
+
+libtextclassifier3::StatusOr<QueryResults> QueryVisitor::ConsumeResults() && {
+ if (has_pending_error()) {
+ return std::move(pending_error_);
+ }
+ if (pending_values_.size() != 1) {
+ return absl_ports::InvalidArgumentError(
+ "Visitor does not contain a single root iterator.");
+ }
+ auto iterator_or = PopPendingIterator();
+ if (!iterator_or.ok()) {
+ return std::move(iterator_or).status();
+ }
+
+ QueryResults results;
+ results.root_iterator = std::move(iterator_or).ValueOrDie();
+ results.query_term_iterators = std::move(query_term_iterators_);
+ results.query_terms = std::move(property_query_terms_map_);
+ results.features_in_use = std::move(features_);
+ return results;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/advanced_query_parser/query-visitor.h b/icing/query/advanced_query_parser/query-visitor.h
new file mode 100644
index 0000000..d090b3c
--- /dev/null
+++ b/icing/query/advanced_query_parser/query-visitor.h
@@ -0,0 +1,327 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_
+#define ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_
+
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <stack>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/index.h"
+#include "icing/index/iterator/doc-hit-info-iterator-filter.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
+#include "icing/query/advanced_query_parser/function.h"
+#include "icing/query/advanced_query_parser/pending-value.h"
+#include "icing/query/query-features.h"
+#include "icing/query/query-results.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-store.h"
+#include "icing/tokenization/tokenizer.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+// The Visitor used to create the DocHitInfoIterator tree from the AST output by
+// the parser.
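+//
+// Example usage (a sketch mirroring query-visitor_test.cc; assumes root_node
+// was produced by Parser::ConsumeQuery and that all dependencies outlive the
+// visitor):
+//
+//   QueryVisitor visitor(&index, &numeric_index, &document_store,
+//                        &schema_store, &normalizer, &tokenizer, query,
+//                        DocHitInfoIteratorFilter::Options(),
+//                        TermMatchType::PREFIX,
+//                        /*needs_term_frequency_info=*/true, current_time_ms);
+//   root_node->Accept(&visitor);
+//   ICING_ASSIGN_OR_RETURN(QueryResults results,
+//                          std::move(visitor).ConsumeResults());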
+class QueryVisitor : public AbstractSyntaxTreeVisitor {
+ public:
+ explicit QueryVisitor(Index* index,
+ const NumericIndex<int64_t>* numeric_index,
+ const DocumentStore* document_store,
+ const SchemaStore* schema_store,
+ const Normalizer* normalizer,
+ const Tokenizer* tokenizer,
+ std::string_view raw_query_text,
+ DocHitInfoIteratorFilter::Options filter_options,
+ TermMatchType::Code match_type,
+ bool needs_term_frequency_info, int64_t current_time_ms)
+ : QueryVisitor(index, numeric_index, document_store, schema_store,
+ normalizer, tokenizer, raw_query_text, filter_options,
+ match_type, needs_term_frequency_info,
+ PendingPropertyRestricts(),
+ /*processing_not=*/false, current_time_ms) {}
+
+ void VisitFunctionName(const FunctionNameNode* node) override;
+ void VisitString(const StringNode* node) override;
+ void VisitText(const TextNode* node) override;
+ void VisitMember(const MemberNode* node) override;
+ void VisitFunction(const FunctionNode* node) override;
+ void VisitUnaryOperator(const UnaryOperatorNode* node) override;
+ void VisitNaryOperator(const NaryOperatorNode* node) override;
+
+ // RETURNS:
+ // - the QueryResults reflecting the AST that was visited
+ // - INVALID_ARGUMENT if the AST does not conform to supported expressions
+ // - NOT_FOUND if the AST refers to a property that does not exist
+ libtextclassifier3::StatusOr<QueryResults> ConsumeResults() &&;
+
+ private:
+ // An internal class to help manage property restricts being applied at
+ // different levels.
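+ // For example, while processing `prop1:(prop2:foo)`, {"prop1"} is pushed
+ // when the outer HAS operator is entered and popped when it is exited, so
+ // the inner restrict on "prop2" is checked against the active set.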
+ class PendingPropertyRestricts {
+ public:
+ // Adds another set of property restricts. Elements of new_restricts that
+ // are not present in active_property_restricts() are dropped.
+ void AddValidRestricts(std::set<std::string> new_restricts);
+
+ // Pops the most recently added set of property restricts.
+ void PopRestricts() {
+ if (has_active_property_restricts()) {
+ pending_property_restricts_.pop_back();
+ }
+ }
+
+ bool has_active_property_restricts() const {
+ return !pending_property_restricts_.empty();
+ }
+
+ // The set of all property restrictions that are currently being applied.
+ const std::set<std::string>& active_property_restricts() const {
+ return pending_property_restricts_.back();
+ }
+
+ private:
+ std::vector<std::set<std::string>> pending_property_restricts_;
+ };
+
+ explicit QueryVisitor(
+ Index* index, const NumericIndex<int64_t>* numeric_index,
+ const DocumentStore* document_store, const SchemaStore* schema_store,
+ const Normalizer* normalizer, const Tokenizer* tokenizer,
+ std::string_view raw_query_text,
+ DocHitInfoIteratorFilter::Options filter_options,
+ TermMatchType::Code match_type, bool needs_term_frequency_info,
+ PendingPropertyRestricts pending_property_restricts, bool processing_not,
+ int64_t current_time_ms)
+ : index_(*index),
+ numeric_index_(*numeric_index),
+ document_store_(*document_store),
+ schema_store_(*schema_store),
+ normalizer_(*normalizer),
+ tokenizer_(*tokenizer),
+ raw_query_text_(raw_query_text),
+ filter_options_(std::move(filter_options)),
+ match_type_(match_type),
+ needs_term_frequency_info_(needs_term_frequency_info),
+ pending_property_restricts_(std::move(pending_property_restricts)),
+ processing_not_(processing_not),
+ expecting_numeric_arg_(false),
+ current_time_ms_(current_time_ms) {
+ RegisterFunctions();
+ }
+
+ bool has_pending_error() const { return !pending_error_.ok(); }
+
+ // Creates a DocHitInfoIterator reflecting the provided term and whether the
+ // prefix operator has been applied to this term. Also populates
+ // property_query_terms_map_ and query_term_iterators_ as appropriate.
+ // Returns:
+ // - On success, a DocHitInfoIterator for the provided term
+ // - INVALID_ARGUMENT if unable to create an iterator for the term.
+ libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
+ CreateTermIterator(const QueryTerm& term);
+
+ // Processes the PendingValue at the top of pending_values_, parses it into
+ // an int64_t and pops the top.
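+ // For example, if the top of pending_values_ holds the text "25", this pops
+ // it and returns the int64_t 25.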
+ // Returns:
+ // - On success, the int value stored in the text at the top
+ // - INVALID_ARGUMENT if pending_values_ is empty, doesn't hold a text or
+ // can't be parsed as an int.
+ libtextclassifier3::StatusOr<int64_t> PopPendingIntValue();
+
+ // Processes the PendingValue at the top of pending_values_ and pops the top.
+ // Returns:
+ // - On success, the string value stored in the text at the top and a bool
+ // indicating whether or not the string value has a prefix operator.
+ // - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a string.
+ libtextclassifier3::StatusOr<QueryTerm> PopPendingStringValue();
+
+ // Processes the PendingValue at the top of pending_values_ and pops the top.
+ // Returns:
+ // - On success, the string value stored in the text at the top and a bool
+ // indicating whether or not the string value has a prefix operator.
+ // - INVALID_ARGUMENT if pending_values_ is empty or doesn't hold a text.
+ libtextclassifier3::StatusOr<QueryTerm> PopPendingTextValue();
+
+ // Processes the PendingValue at the top of pending_values_ and pops the top.
+ // Returns:
+ // - On success, a DocHitInfoIterator representing the term at the top
+ // - INVALID_ARGUMENT if pending_values_ is empty or if unable to create an
+ // iterator for the term.
+ libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
+ PopPendingIterator();
+
+ // Processes all PendingValues at the top of pending_values_ until the first
+ // placeholder is encountered.
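+ // For example, when processing the children of an AND node, this pops each
+ // child's iterator until it reaches the node's placeholder.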
+ // Returns:
+ // - On success, a vector containing all DocHitInfoIterators representing
+ // the values at the top of pending_values_
+ // - INVALID_ARGUMENT if pending_values_ is empty or if unable to create an
+ // iterator for any of the terms at the top of pending_values_
+ libtextclassifier3::StatusOr<std::vector<std::unique_ptr<DocHitInfoIterator>>>
+ PopAllPendingIterators();
+
+ // Processes the unary operator node as a NOT operator. A NOT can have an
+ // operator type of "NOT" or "MINUS".
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT if any errors are encountered while processing
+ // node->child
+ libtextclassifier3::Status ProcessNotOperator(const UnaryOperatorNode* node);
+
+ // Processes the unary operator node as a negation operator. A negation
+ // operator should have an operator of type "MINUS" and its child must
+ // resolve to a numeric value.
+ //
+ // RETURNS:
+ // - OK on success
+ // - INVALID_ARGUMENT if the node->child can't be resolved to a numeric
+ // value.
+ libtextclassifier3::Status ProcessNegationOperator(
+ const UnaryOperatorNode* node);
+
+ // Processes the NumericComparator represented by node. This must be called
+ // *after* this node's children have been visited. The PendingValues added by
+ // this node's children will be consumed by this function and the
+ // PendingValue for this node will be pushed onto pending_values_.
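+ // For example, for `price < 2` the children leave the text values "price"
+ // and "2" on pending_values_; this function consumes both and pushes an
+ // iterator over the numeric index for values of "price" less than 2.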
+ // Returns:
+ // - On success, OK
+ // - INVALID_ARGUMENT if unable to retrieve string value or int value
+ // - NOT_FOUND if there is no entry in the numeric index for the property
+ libtextclassifier3::Status ProcessNumericComparator(
+ const NaryOperatorNode* node);
+
+ // Processes the AND operator represented by the node. This must be
+ // called *after* this node's children have been visited. The PendingValues
+ // added by this node's children will be consumed by this function and the
+ // PendingValue for this node will be returned.
+ // Returns:
+ // - On success, the PendingValue representing this node and its children.
+ // - INVALID_ARGUMENT if unable to retrieve iterators for any of this node's
+ // children.
+ libtextclassifier3::StatusOr<PendingValue> ProcessAndOperator(
+ const NaryOperatorNode* node);
+
+ // Processes the OR operator represented by the node. This must be called
+ // *after* this node's children have been visited. The PendingValues added by
+ // this node's children will be consumed by this function and the PendingValue
+ // for this node will be returned.
+ // Returns:
+ // - On success, the PendingValue representing this node and its children.
+ // - INVALID_ARGUMENT if unable to retrieve iterators for any of this node's
+ // children.
+ libtextclassifier3::StatusOr<PendingValue> ProcessOrOperator(
+ const NaryOperatorNode* node);
+
+ // Populates registered_functions_ with the currently supported set of
+ // functions.
+ void RegisterFunctions();
+
+ // Implementation of `search` custom function in the query language.
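+ // For example, `search("foo bar", createList("subject"))` evaluates the
+ // inner query "foo bar" restricted to the "subject" property.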
+ // Returns:
+ // - a PendingValue holding the DocHitInfoIterator reflecting the query
+ // provided to SearchFunction
+ // - any errors returned by Lexer::ExtractTokens, Parser::ConsumeQuery or
+ // QueryVisitor::ConsumeResults.
+ libtextclassifier3::StatusOr<PendingValue> SearchFunction(
+ std::vector<PendingValue>&& args);
+
+ // Implementation of the propertyDefined(property_path) custom function.
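+ // For example, `propertyDefined("subscription.price")` matches all documents
+ // whose schema type defines a "subscription.price" property.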
+ // Returns:
+ // - a PendingValue holding a DocHitInfoIterator that returns hits for all
+ // documents whose schema types have defined the property specified by
+ // property_path.
+ // - any errors returned by Lexer::ExtractTokens
+ libtextclassifier3::StatusOr<PendingValue> PropertyDefinedFunction(
+ std::vector<PendingValue>&& args);
+
+ // Implementation of the hasProperty(property_path) custom function.
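+ // For example, `hasProperty("subscription.price")` matches only documents
+ // that actually contain a value for the "subscription.price" property.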
+ // Returns:
+ // - a PendingValue holding a DocHitInfoIterator that returns hits for all
+ // documents that have the property specified by property_path.
+ // - any errors returned by Lexer::ExtractTokens
+ libtextclassifier3::StatusOr<PendingValue> HasPropertyFunction(
+ std::vector<PendingValue>&& args);
+
+ // Handles a NaryOperatorNode where the operator is HAS (':') and pushes an
+ // iterator with the proper section filter applied. If the current property
+ // restriction represented by pending_property_restricts and the first child
+ // of this node is unsatisfiable (ex. `prop1:(prop2:foo)`), then a NONE
+ // iterator is returned immediately and the subtree represented by the second
+ // child is not traversed.
+ //
+ // Returns:
+ // - OK on success
+ // - INVALID_ARGUMENT if the node does not have exactly two children or the
+ // two children cannot be resolved to a MEMBER or an iterator respectively.
+ libtextclassifier3::Status ProcessHasOperator(const NaryOperatorNode* node);
+
+ // Returns the correct match type to apply based on both the match type and
+ // whether the prefix operator is currently present.
+ TermMatchType::Code GetTermMatchType(bool is_prefix) const {
+ return (is_prefix) ? TermMatchType::PREFIX : match_type_;
+ }
+
+ std::stack<PendingValue> pending_values_;
+ libtextclassifier3::Status pending_error_;
+
+ // A map from function name to Function instance.
+ std::unordered_map<std::string, Function> registered_functions_;
+
+ SectionRestrictQueryTermsMap property_query_terms_map_;
+
+ QueryTermIteratorsMap query_term_iterators_;
+ // Set of features invoked in the query.
+ std::unordered_set<Feature> features_;
+
+ Index& index_; // Does not own!
+ const NumericIndex<int64_t>& numeric_index_; // Does not own!
+ const DocumentStore& document_store_; // Does not own!
+ const SchemaStore& schema_store_; // Does not own!
+ const Normalizer& normalizer_; // Does not own!
+ const Tokenizer& tokenizer_; // Does not own!
+
+ std::string_view raw_query_text_;
+ DocHitInfoIteratorFilter::Options filter_options_;
+ TermMatchType::Code match_type_;
+ // Whether or not term_frequency information is needed. This affects:
+ // - how DocHitInfoIteratorTerms are constructed
+ // - whether the QueryTermIteratorsMap is populated in the QueryResults.
+ bool needs_term_frequency_info_;
+
+ // The stack of property restricts currently being processed by the visitor.
+ PendingPropertyRestricts pending_property_restricts_;
+ bool processing_not_;
+
+ // Whether we are in the midst of processing a subtree that is expected to
+ // resolve to a numeric argument.
+ bool expecting_numeric_arg_;
+
+ int64_t current_time_ms_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_ADVANCED_QUERY_PARSER_QUERY_VISITOR_H_
diff --git a/icing/query/advanced_query_parser/query-visitor_test.cc b/icing/query/advanced_query_parser/query-visitor_test.cc
new file mode 100644
index 0000000..9455baa
--- /dev/null
+++ b/icing/query/advanced_query_parser/query-visitor_test.cc
@@ -0,0 +1,4112 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/advanced_query_parser/query-visitor.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/index.h"
+#include "icing/index/iterator/doc-hit-info-iterator-filter.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/dummy-numeric-index.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/index/property-existence-indexing-handler.h"
+#include "icing/jni/jni-cache.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/portable/platform.h"
+#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
+#include "icing/query/advanced_query_parser/lexer.h"
+#include "icing/query/advanced_query_parser/parser.h"
+#include "icing/query/query-features.h"
+#include "icing/query/query-results.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/store/namespace-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/tokenizer-factory.h"
+#include "icing/tokenization/tokenizer.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/clock.h"
+#include "icing/util/status-macros.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+using ::testing::UnorderedElementsAre;
+
+constexpr DocumentId kDocumentId0 = 0;
+constexpr DocumentId kDocumentId1 = 1;
+constexpr DocumentId kDocumentId2 = 2;
+
+constexpr SectionId kSectionId0 = 0;
+constexpr SectionId kSectionId1 = 1;
+constexpr SectionId kSectionId2 = 2;
+
+template <typename T, typename U>
+std::vector<T> ExtractKeys(const std::unordered_map<T, U>& map) {
+ std::vector<T> keys;
+ keys.reserve(map.size());
+ for (const auto& [key, value] : map) {
+ keys.push_back(key);
+ }
+ return keys;
+}
+
+enum class QueryType {
+ kPlain,
+ kSearch,
+};
+
+class QueryVisitorTest : public ::testing::TestWithParam<QueryType> {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/icing";
+ index_dir_ = test_dir_ + "/index";
+ numeric_index_dir_ = test_dir_ + "/numeric_index";
+ store_dir_ = test_dir_ + "/store";
+ schema_store_dir_ = test_dir_ + "/schema_store";
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(index_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+ jni_cache_ = GetTestJniCache();
+
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ // If we've specified using the reverse-JNI method for segmentation (i.e.
+ // not ICU), then we won't have the ICU data file included to set up.
+ // Technically, we could choose to use reverse-JNI for segmentation AND
+ // include an ICU data file, but that seems unlikely and our current BUILD
+ // setup doesn't do this.
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &clock_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, store_dir_, &clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(create_result.document_store);
+
+ Index::Options options(index_dir_.c_str(),
+ /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_, Index::Create(options, &filesystem_, &icing_filesystem_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ numeric_index_,
+ DummyNumericIndex<int64_t>::Create(filesystem_, numeric_index_dir_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ language_segmenter_factory::SegmenterOptions segmenter_options(
+ ULOC_US, jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(segmenter_options));
+
+ ICING_ASSERT_OK_AND_ASSIGN(tokenizer_,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter_.get()));
+ }
+
+ libtextclassifier3::StatusOr<std::unique_ptr<Node>> ParseQueryHelper(
+ std::string_view query) {
+ Lexer lexer(query, Lexer::Language::QUERY);
+ ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ return parser.ConsumeQuery();
+ }
+
+ std::string EscapeString(std::string_view str) {
+ std::string result;
+ result.reserve(str.size());
+ for (char c : str) {
+ if (c == '\\' || c == '"') {
+ result.push_back('\\');
+ }
+ result.push_back(c);
+ }
+ return result;
+ }
+
+ std::string CreateQuery(std::string query,
+ std::string property_restrict = "") {
+ switch (GetParam()) {
+ case QueryType::kPlain:
+ if (property_restrict.empty()) {
+ // CreateQuery("foo bar") returns `foo bar`
+ return query;
+ }
+ // CreateQuery("foo", "subject") returns `subject:foo`
+ return absl_ports::StrCat(property_restrict, ":", query);
+ case QueryType::kSearch:
+ query = EscapeString(query);
+ property_restrict = EscapeString(property_restrict);
+ if (property_restrict.empty()) {
+ // CreateQuery("foo bar") returns `search("foo bar")`
+ return absl_ports::StrCat("search(\"", query, "\")");
+ }
+ // CreateQuery("foo", "subject") returns
+ // `search("foo bar", createList("subject"))`
+ return absl_ports::StrCat("search(\"", query, "\", createList(\"",
+ property_restrict, "\"))");
+ }
+ }
+
+ Filesystem filesystem_;
+ IcingFilesystem icing_filesystem_;
+ std::string test_dir_;
+ std::string index_dir_;
+ std::string numeric_index_dir_;
+ std::string schema_store_dir_;
+ std::string store_dir_;
+ Clock clock_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> document_store_;
+ std::unique_ptr<Index> index_;
+ std::unique_ptr<DummyNumericIndex<int64_t>> numeric_index_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+ std::unique_ptr<Tokenizer> tokenizer_;
+ std::unique_ptr<const JniCache> jni_cache_;
+};
+
+TEST_P(QueryVisitorTest, SimpleLessThan) {
+ // Set up the numeric index with docs 0, 1 and 2 holding the values 0, 1 and 2
+ // respectively.
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(0));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1);
+ ICING_ASSERT_OK(editor->BufferKey(1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2);
+ ICING_ASSERT_OK(editor->BufferKey(2));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ std::string query = CreateQuery("price < 2");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature));
+ }
+ // "price" is a property restrict here and "2" isn't a "term" - its a numeric
+ // value. So QueryTermIterators should be empty.
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(query_results.query_terms, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, SimpleLessThanEq) {
+ // Set up the numeric index with docs 0, 1 and 2 holding the values 0, 1 and 2
+ // respectively.
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(0));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1);
+ ICING_ASSERT_OK(editor->BufferKey(1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2);
+ ICING_ASSERT_OK(editor->BufferKey(2));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ std::string query = CreateQuery("price <= 1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature));
+ }
+ // "price" is a property restrict here and "1" isn't a "term" - its a numeric
+ // value. So QueryTermIterators should be empty.
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(query_results.query_terms, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, SimpleEqual) {
+ // Set up the numeric index with docs 0, 1 and 2 holding the values 0, 1 and 2
+ // respectively.
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(0));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1);
+ ICING_ASSERT_OK(editor->BufferKey(1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2);
+ ICING_ASSERT_OK(editor->BufferKey(2));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ std::string query = CreateQuery("price == 2");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature));
+ }
+ // "price" is a property restrict here and "2" isn't a "term" - its a numeric
+ // value. So QueryTermIterators should be empty.
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(query_results.query_terms, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2));
+}
+
+TEST_P(QueryVisitorTest, SimpleGreaterThanEq) {
+ // Set up the numeric index with docs 0, 1 and 2 holding the values 0, 1 and 2
+ // respectively.
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(0));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1);
+ ICING_ASSERT_OK(editor->BufferKey(1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2);
+ ICING_ASSERT_OK(editor->BufferKey(2));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ std::string query = CreateQuery("price >= 1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature));
+ }
+ // "price" is a property restrict here and "1" isn't a "term" - its a numeric
+ // value. So QueryTermIterators should be empty.
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(query_results.query_terms, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2, kDocumentId1));
+}
+
+TEST_P(QueryVisitorTest, SimpleGreaterThan) {
+ // Set up the numeric index with docs 0, 1 and 2 holding the values 0, 1 and 2
+ // respectively.
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(0));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1);
+ ICING_ASSERT_OK(editor->BufferKey(1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2);
+ ICING_ASSERT_OK(editor->BufferKey(2));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ std::string query = CreateQuery("price > 1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature));
+ }
+ // "price" is a property restrict here and "1" isn't a "term" - its a numeric
+ // value. So QueryTermIterators should be empty.
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(query_results.query_terms, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2));
+}
+
+TEST_P(QueryVisitorTest, IntMinLessThanEqual) {
+ // Set up the numeric index with docs 0, 1 and 2 holding the values INT_MIN,
+ // INT_MAX and INT_MIN + 1 respectively.
+ int64_t int_min = std::numeric_limits<int64_t>::min();
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(int_min));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1);
+ ICING_ASSERT_OK(editor->BufferKey(std::numeric_limits<int64_t>::max()));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2);
+ ICING_ASSERT_OK(editor->BufferKey(int_min + 1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ std::string query = CreateQuery("price <= " + std::to_string(int_min));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature));
+ }
+ // "price" is a property restrict here and int_min isn't a "term" - its a
+ // numeric value. So QueryTermIterators should be empty.
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(query_results.query_terms, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, IntMaxGreaterThanEqual) {
+ // Set up the numeric index with docs 0, 1 and 2 holding the values INT_MIN,
+ // INT_MAX and INT_MAX - 1 respectively.
+ int64_t int_max = std::numeric_limits<int64_t>::max();
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(std::numeric_limits<int64_t>::min()));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1);
+ ICING_ASSERT_OK(editor->BufferKey(int_max));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2);
+ ICING_ASSERT_OK(editor->BufferKey(int_max - 1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ std::string query = CreateQuery("price >= " + std::to_string(int_max));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature));
+ }
+ // "price" is a property restrict here and int_max isn't a "term" - its a
+ // numeric value. So QueryTermIterators should be empty.
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(query_results.query_terms, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1));
+}
+
+TEST_P(QueryVisitorTest, NestedPropertyLessThan) {
+ // Set up the numeric index with docs 0, 1 and 2 holding the values 0, 1 and 2
+ // respectively.
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("subscription.price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(0));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor =
+ numeric_index_->Edit("subscription.price", kDocumentId1, kSectionId1);
+ ICING_ASSERT_OK(editor->BufferKey(1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor =
+ numeric_index_->Edit("subscription.price", kDocumentId2, kSectionId2);
+ ICING_ASSERT_OK(editor->BufferKey(2));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ std::string query = CreateQuery("subscription.price < 2");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature));
+ }
+ // "subscription.price" is a property restrict here and int_max isn't a "term"
+ // - its a numeric value. So QueryTermIterators should be empty.
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(query_results.query_terms, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, IntParsingError) {
+ std::string query = CreateQuery("subscription.price < fruit");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, NotEqualsUnsupported) {
+ std::string query = CreateQuery("subscription.price != 3");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::UNIMPLEMENTED));
+}
+
+TEST_P(QueryVisitorTest, LessThanTooManyOperandsInvalid) {
+ // Set up the numeric index with docs 0, 1 and 2 holding the values 0, 1 and 2
+ // respectively.
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("subscription.price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(0));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor =
+ numeric_index_->Edit("subscription.price", kDocumentId1, kSectionId1);
+ ICING_ASSERT_OK(editor->BufferKey(1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor =
+ numeric_index_->Edit("subscription.price", kDocumentId2, kSectionId2);
+ ICING_ASSERT_OK(editor->BufferKey(2));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ // Create an invalid AST for the query '3 < subscription.price 25' where '<'
+ // has three operands.
+ std::string_view query = "3 < subscription.price 25";
+ auto property_node =
+ std::make_unique<TextNode>("subscription", query.substr(4, 12));
+ auto subproperty_node =
+ std::make_unique<TextNode>("price", query.substr(17, 5));
+ std::vector<std::unique_ptr<TextNode>> member_args;
+ member_args.push_back(std::move(property_node));
+ member_args.push_back(std::move(subproperty_node));
+ auto member_node = std::make_unique<MemberNode>(std::move(member_args),
+ /*function=*/nullptr);
+
+ auto value_node = std::make_unique<TextNode>("3", query.substr(0, 1));
+ auto extra_value_node = std::make_unique<TextNode>("25", query.substr(23, 2));
+ std::vector<std::unique_ptr<Node>> args;
+ args.push_back(std::move(value_node));
+ args.push_back(std::move(member_node));
+ args.push_back(std::move(extra_value_node));
+ auto root_node = std::make_unique<NaryOperatorNode>("<", std::move(args));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, LessThanTooFewOperandsInvalid) {
+ // Create an invalid AST for the query 'subscription.price <' where '<'
+ // has a single operand.
+ std::string_view query = "subscription.price <";
+ auto property_node =
+ std::make_unique<TextNode>("subscription", query.substr(0, 12));
+ auto subproperty_node =
+ std::make_unique<TextNode>("price", query.substr(13, 5));
+ std::vector<std::unique_ptr<TextNode>> member_args;
+ member_args.push_back(std::move(property_node));
+ member_args.push_back(std::move(subproperty_node));
+ auto member_node = std::make_unique<MemberNode>(std::move(member_args),
+ /*function=*/nullptr);
+
+ std::vector<std::unique_ptr<Node>> args;
+ args.push_back(std::move(member_node));
+ auto root_node = std::make_unique<NaryOperatorNode>("<", std::move(args));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, LessThanNonExistentPropertyNotFound) {
+ // Set up the numeric index with docs 0, 1 and 2 holding the values 0, 1 and 2
+ // respectively.
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("subscription.price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(0));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor =
+ numeric_index_->Edit("subscription.price", kDocumentId1, kSectionId1);
+ ICING_ASSERT_OK(editor->BufferKey(1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor =
+ numeric_index_->Edit("subscription.price", kDocumentId2, kSectionId2);
+ ICING_ASSERT_OK(editor->BufferKey(2));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ std::string query = CreateQuery("time < 25");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature));
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(query_results.query_terms, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty());
+}
+
+TEST_P(QueryVisitorTest, NeverVisitedReturnsInvalid) {
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), "",
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, IntMinLessThanInvalid) {
+ // Set up the numeric index with docs 0, 1 and 2 holding the values INT_MIN,
+ // INT_MAX and INT_MIN + 1 respectively.
+ int64_t int_min = std::numeric_limits<int64_t>::min();
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(int_min));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1);
+ ICING_ASSERT_OK(editor->BufferKey(std::numeric_limits<int64_t>::max()));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2);
+ ICING_ASSERT_OK(editor->BufferKey(int_min + 1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ std::string query = CreateQuery("price <" + std::to_string(int_min));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, IntMaxGreaterThanInvalid) {
+ // Set up the numeric index with docs 0, 1 and 2 holding the values INT_MIN,
+ // INT_MAX and INT_MAX - 1 respectively.
+ int64_t int_max = std::numeric_limits<int64_t>::max();
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(std::numeric_limits<int64_t>::min()));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId1, kSectionId1);
+ ICING_ASSERT_OK(editor->BufferKey(int_max));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ editor = numeric_index_->Edit("price", kDocumentId2, kSectionId2);
+ ICING_ASSERT_OK(editor->BufferKey(int_max - 1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ std::string query = CreateQuery("price >" + std::to_string(int_max));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, NumericComparisonPropertyStringIsInvalid) {
+ // "price" is a STRING token, which cannot be a property name.
+ std::string query = CreateQuery(R"("price" > 7)");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, NumericComparatorDoesntAffectLaterTerms) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type"))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Index three documents:
+ // - Doc0: ["-2", "-1", "1", "2"] and [-2, -1, 1, 2]
+ // - Doc1: [-1]
+ // - Doc2: ["2"] and [-1]
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit("price", kDocumentId0, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(-2));
+ ICING_ASSERT_OK(editor->BufferKey(-1));
+ ICING_ASSERT_OK(editor->BufferKey(1));
+ ICING_ASSERT_OK(editor->BufferKey(2));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+ Index::Editor term_editor = index_->Edit(
+ kDocumentId0, kSectionId1, TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(term_editor.BufferTerm("-2"));
+ ICING_ASSERT_OK(term_editor.BufferTerm("-1"));
+ ICING_ASSERT_OK(term_editor.BufferTerm("1"));
+ ICING_ASSERT_OK(term_editor.BufferTerm("2"));
+ ICING_ASSERT_OK(term_editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = numeric_index_->Edit("price", kDocumentId1, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(-1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = numeric_index_->Edit("price", kDocumentId2, kSectionId0);
+ ICING_ASSERT_OK(editor->BufferKey(-1));
+ ICING_ASSERT_OK(std::move(*editor).IndexAllBufferedKeys());
+ term_editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(term_editor.BufferTerm("2"));
+ ICING_ASSERT_OK(term_editor.IndexAllBufferedTerms());
+
+ // Translating the MINUS chars that are interpreted as NOTs, this query is
+ // equivalent to `price == -1 AND NOT 2`.
+ // All three documents match `price == -1`, but docs 0 and 2 are excluded by
+ // the `NOT 2` clause. doc0 has both a text and a number entry for `-2`,
+ // neither of which should match.
+ std::string query = CreateQuery("price == -1 -2");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kNumericSearchFeature));
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(query_results.query_terms, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1));
+}
+
+TEST_P(QueryVisitorTest, SingleTermTermFrequencyEnabled) {
+ // Set up the index with docs 0, 1 and 2 holding the values "foo", "foo" and
+ // "bar" respectively.
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("foo");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+
+ ASSERT_THAT(query_results.root_iterator->Advance(), IsOk());
+ std::vector<TermMatchInfo> match_infos;
+ query_results.root_iterator->PopulateMatchedTermsStats(&match_infos);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{kSectionId1, 1}};
+ EXPECT_THAT(match_infos, ElementsAre(EqualsTermMatchInfo(
+ "foo", expected_section_ids_tf_map)));
+
+ ASSERT_THAT(query_results.root_iterator->Advance(), IsOk());
+ match_infos.clear();
+ query_results.root_iterator->PopulateMatchedTermsStats(&match_infos);
+ EXPECT_THAT(match_infos, ElementsAre(EqualsTermMatchInfo(
+ "foo", expected_section_ids_tf_map)));
+
+ EXPECT_THAT(query_results.root_iterator->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST_P(QueryVisitorTest, SingleTermTermFrequencyDisabled) {
+ // Set up the index with docs 0, 1 and 2 holding the values "foo", "foo" and
+ // "bar" respectively.
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("foo");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/false, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+
+ ASSERT_THAT(query_results.root_iterator->Advance(), IsOk());
+ std::vector<TermMatchInfo> match_infos;
+ query_results.root_iterator->PopulateMatchedTermsStats(&match_infos);
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{kSectionId1, 0}};
+ EXPECT_THAT(match_infos, ElementsAre(EqualsTermMatchInfo(
+ "foo", expected_section_ids_tf_map)));
+
+ ASSERT_THAT(query_results.root_iterator->Advance(), IsOk());
+ match_infos.clear();
+ query_results.root_iterator->PopulateMatchedTermsStats(&match_infos);
+ EXPECT_THAT(match_infos, ElementsAre(EqualsTermMatchInfo(
+ "foo", expected_section_ids_tf_map)));
+
+ EXPECT_THAT(query_results.root_iterator->Advance(),
+ StatusIs(libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED));
+}
+
+TEST_P(QueryVisitorTest, SingleTermPrefix) {
+ // Set up the index with docs 0, 1 and 2 holding the values "foo", "foo" and
+ // "bar" respectively.
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // An EXACT query for 'fo' won't match anything.
+ std::string query = CreateQuery("fo");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("fo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("fo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty());
+
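+ // With the `*` prefix operator, `fo` matches the docs containing "foo".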
+ query = CreateQuery("fo*");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("fo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("fo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, PrefixOperatorAfterPropertyReturnsInvalid) {
+ std::string query = "price* < 2";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, PrefixOperatorAfterNumericValueReturnsInvalid) {
+ std::string query = "price < 2*";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, PrefixOperatorAfterPropertyRestrictReturnsInvalid) {
+ std::string query = "subject*:foo";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, SegmentationWithPrefix) {
+ // Set up the index with docs 0, 1 and 2 holding the values ["foo", "ba"],
+ // ["foo", "ba"] and ["bar", "fo"] respectively.
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("ba"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("ba"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.BufferTerm("fo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // An EXACT query for `ba?fo` will be lexed into a single TEXT token.
+ // The visitor will tokenize it into `ba` and `fo` (`?` is dropped because it
+ // is punctuation). Each document will match one and only one of these exact
+ // tokens. Therefore, nothing will match this query.
+ std::string query = CreateQuery("ba?fo");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("ba", "fo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("ba", "fo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty());
+
+ // An EXACT query for `ba?fo*` will be lexed into a TEXT token and a TIMES
+ // token.
+ // The visitor will tokenize the TEXT into `ba` and `fo` (`?` is dropped
+ // because it is punctuation). The prefix operator should only apply to the
+ // final token `fo`. This will match docs 0 and 1, which contain `ba` (an
+ // exact match) and `foo` (a prefix match for `fo`). Doc 2 will not match
+ // because `ba` does not exactly match either `bar` or `fo`.
+ query = CreateQuery("ba?fo*");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("ba", "fo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("ba", "fo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, SingleVerbatimTerm) {
+ // Set up the index with docs 0, 1 and 2 holding the values "foo:bar(baz)",
+ // "foo:bar(baz)" and "bar:baz(foo)" respectively.
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo:bar(baz)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo:bar(baz)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar:baz(foo)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("\"foo:bar(baz)\"");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature));
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("foo:bar(baz)"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo:bar(baz)"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, SingleVerbatimTermPrefix) {
+ // Set up the index with docs 0, 1 and 2 holding the values "foo:bar(baz)",
+ // "foo:bar(abc)" and "bar:baz(foo)" respectively.
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo:bar(baz)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo:bar(abc)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar:baz(foo)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Query for `"foo:bar("*`. This should match docs 0 and 1.
+ std::string query = CreateQuery("\"foo:bar(\"*");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature,
+ kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo:bar("));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo:bar("));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
+// There are three primary cases to worry about for escaping:
+//
+// NOTE: The following comments use ` chars to denote the beginning and end of
+// the verbatim term rather than " chars to avoid confusion. Additionally, the
+ // raw chars themselves are shown. So `foobar\\` in actual C++ would be written
+// as std::string verbatim_term = "foobar\\\\";
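+ //
+ // (For reference, a plain C++ fact rather than anything icing-specific: the
+ // tests below spell raw chars with raw string literals, so the raw chars
+ // `foobar\"` appear in source as R"(foobar\")", which is equivalent to the
+ // escaped literal "foobar\\\"".)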
+//
+// 1. How does a user represent a quote char (") without terminating the
+// verbatim term?
+// Example: verbatim_term = `foobar"`
+// Answer: quote char must be escaped. verbatim_query = `foobar\"`
+TEST_P(QueryVisitorTest, VerbatimTermEscapingQuote) {
+ // Set up the index with docs 0, 1 and 2 holding the values `foobary`,
+ // `foobar\` and `foobar"` respectively (raw chars shown).
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_EXACT, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foobary)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_EXACT,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foobar\)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_EXACT,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foobar")"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // From the comment above, verbatim_term = `foobar"` and verbatim_query =
+ // `foobar\"`.
+ std::string query = CreateQuery(R"(("foobar\""))");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature));
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre(R"(foobar")"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre(R"(foobar")"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2));
+}
+
+ // 2. How does a user represent an escape char (\) that immediately precedes
+ //    the end of the verbatim term?
+// Example: verbatim_term = `foobar\`
+// Answer: escape chars can be escaped. verbatim_query = `foobar\\`
+TEST_P(QueryVisitorTest, VerbatimTermEscapingEscape) {
+ // Set up the index with docs 0, 1 and 2 holding the values `foobary`,
+ // `foobar\` and `foobar"` respectively (raw chars shown).
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_EXACT, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foobary)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_EXACT,
+ /*namespace_id=*/0);
+ // From the comment above, verbatim_term = `foobar\`.
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foobar\)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_EXACT,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foobar")"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Issue a query for the verbatim token `foobar\`.
+ std::string query = CreateQuery(R"(("foobar\\"))");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature));
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre(R"(foobar\)"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre(R"(foobar\)"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1));
+}
+
+ // 3. How do we handle other escaped chars?
+ //    Example: verbatim_query = `foobar\y`.
+ //    Answer: all chars preceded by an escape char are unescaped blindly (as
+ //            in, consume the escape char and keep the following char, just as
+ //            for the quote char). So the above query would match the
+ //            verbatim_term `foobary`.
+TEST_P(QueryVisitorTest, VerbatimTermEscapingNonSpecialChar) {
+ // Set up the index with docs 0, 1 and 2 holding the values `foobary`,
+ // `foobar\` and `foobar\y` respectively (raw chars shown).
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_EXACT, /*namespace_id=*/0);
+ // From the comment above, verbatim_term = `foobary`.
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foobary)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_EXACT,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foobar\)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_EXACT,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foobar\y)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Issue a query for the verbatim token `foobary`.
+ std::string query = CreateQuery(R"(("foobar\y"))");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature));
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre(R"(foobary)"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre(R"(foobary)"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId0));
+
+ // Issue a query for the verbatim token `foobar\y`.
+ query = CreateQuery(R"(("foobar\\y"))");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature));
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre(R"(foobar\y)"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre(R"(foobar\y)"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2));
+}
+
+ // This isn't a special case, but it is useful for demonstration. There are
+ // a number of escape sequences in C++, including the newline character '\n'.
+ // It is worth emphasizing that the newline character, like the other escape
+ // sequences in C++, is a single, distinct ASCII value. For a query
+ // `foobar` + '\n', the parser will see the character sequence [`f`, `o`, `o`,
+ // `b`, `a`, `r`, '\n'] - it *won't* ever see `\` and `n`.
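+ //
+ // For reference: in the test below, "foobar\n" is seven chars ending in a
+ // single newline char, while the raw string R"(foobar\n)" is eight chars
+ // ending in `\` and `n`.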
+TEST_P(QueryVisitorTest, VerbatimTermNewLine) {
+ // Set up the index with docs 0, 1 and 2 holding the values `foobar` + '\n',
+ // `foobar\` and `foobar\n` (raw chars) respectively.
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_EXACT, /*namespace_id=*/0);
+ // From the comment above, verbatim_term = `foobar` + '\n'.
+ ICING_ASSERT_OK(editor.BufferTerm("foobar\n"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_EXACT,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foobar\)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_EXACT,
+ /*namespace_id=*/0);
+ // verbatim_term = `foobar\n`. This is distinct from the term added above.
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foobar\n)"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Issue a query for the verbatim token `foobar` + '\n'.
+ std::string query = CreateQuery("\"foobar\n\"");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature));
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foobar\n"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foobar\n"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId0));
+
+ // Now, issue a query for the verbatim token `foobar\n`.
+ query = CreateQuery(R"(("foobar\\n"))");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature));
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre(R"(foobar\n)"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre(R"(foobar\n)"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2));
+}
+
+TEST_P(QueryVisitorTest, VerbatimTermEscapingComplex) {
+ // Set up the index with docs 0, 1 and 2 holding the values `foo\"bar\nbaz"`,
+ // `foo\\\"bar\\nbaz\"` and `foo\\"bar\\nbaz"` respectively (raw chars shown).
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_EXACT, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foo\"bar\nbaz")"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_EXACT,
+ /*namespace_id=*/0);
+ // Add the verbatim_term from doc 0, but with all of the escapes left in.
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foo\\\"bar\\nbaz\")"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_EXACT,
+ /*namespace_id=*/0);
+ // Add the verbatim_term from doc 0, but with the escapes for '\' chars left
+ // in.
+ ICING_ASSERT_OK(editor.BufferTerm(R"(foo\\"bar\\nbaz")"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Issue a query for the verbatim token `foo\"bar\nbaz"`.
+ std::string query = CreateQuery(R"(("foo\\\"bar\\nbaz\""))");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature,
+ kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kVerbatimSearchFeature));
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre(R"(foo\"bar\nbaz")"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre(R"(foo\"bar\nbaz")"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, SingleMinusTerm) {
+ // Set up the index with docs 0, 1 and 2 holding the values "foo", "foo" and
+ // "bar" respectively.
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type"))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("-foo");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), IsEmpty());
+ EXPECT_THAT(query_results.query_term_iterators, IsEmpty());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use, IsEmpty());
+ }
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2));
+}
+
+TEST_P(QueryVisitorTest, SingleNotTerm) {
+ // Set up the index with docs 0, 1 and 2 holding the values "foo", "foo" and
+ // "bar" respectively.
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type"))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("NOT foo");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.query_terms, IsEmpty());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(query_results.query_term_iterators, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2));
+}
+
+TEST_P(QueryVisitorTest, NestedNotTerms) {
+ // Set up the index with docs 0, 1 and 2 holding the values
+ // ["foo", "bar", "baz"], ["foo", "baz"] and ["bar", "baz"] respectively.
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type"))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.BufferTerm("baz"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("baz"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.BufferTerm("baz"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // By De Morgan's laws, the double negative could be rewritten as
+ // `(foo AND NOT bar) baz`.
+ std::string query = CreateQuery("NOT (-foo OR bar) baz");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("foo", "baz"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo", "baz"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1));
+}
+
+TEST_P(QueryVisitorTest, DeeplyNestedNotTerms) {
+ // Set up the index with docs 0, 1 and 2 holding the values
+ // ["foo", "bar", "baz"], ["foo", "baz"] and ["bar", "baz"] respectively.
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type"))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.BufferTerm("baz"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("baz"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.BufferTerm("baz"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Simplifying:
+ // NOT (-(NOT (foo -bar) baz) -bat) NOT bass
+ // NOT (-((-foo OR bar) baz) -bat) NOT bass
+ // NOT (((foo -bar) OR -baz) -bat) NOT bass
+ // (((-foo OR bar) baz) OR bat) NOT bass
+ //
+ // Doc 0 : (((-TRUE OR TRUE) TRUE) OR FALSE) NOT FALSE ->
+ //         ((FALSE OR TRUE) TRUE) TRUE -> ((TRUE) TRUE) TRUE -> TRUE
+ // Doc 1 : (((-TRUE OR FALSE) TRUE) OR FALSE) NOT FALSE ->
+ //         ((FALSE OR FALSE) TRUE) TRUE -> ((FALSE) TRUE) TRUE -> FALSE
+ // Doc 2 : (((-FALSE OR TRUE) TRUE) OR FALSE) NOT FALSE ->
+ //         ((TRUE OR TRUE) TRUE) TRUE -> ((TRUE) TRUE) TRUE -> TRUE
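+ //
+ // (Each rewrite above pushes a NOT inward via De Morgan's laws:
+ // NOT (a b) == (-a OR -b) and NOT (a OR b) == (-a -b).)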
+ std::string query = CreateQuery("NOT (-(NOT (foo -bar) baz) -bat) NOT bass");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("bar", "baz", "bat"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("bar", "baz", "bat"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, ImplicitAndTerms) {
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("foo bar");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use, IsEmpty());
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1));
+}
+
+TEST_P(QueryVisitorTest, ExplicitAndTerms) {
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("foo AND bar");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use, IsEmpty());
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1));
+}
+
+TEST_P(QueryVisitorTest, OrTerms) {
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("fo"));
+ ICING_ASSERT_OK(editor.BufferTerm("ba"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("foo OR bar");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use, IsEmpty());
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, AndOrTermPrecedence) {
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("baz"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Should be interpreted like `foo (bar OR baz)`
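+ // because OR binds more tightly than the implicit AND between terms.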
+ std::string query = CreateQuery("foo bar OR baz");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use, IsEmpty());
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("foo", "bar", "baz"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo", "bar", "baz"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2, kDocumentId1));
+
+ // Should be interpreted like `(bar OR baz) foo`
+ query = CreateQuery("bar OR baz foo");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use, IsEmpty());
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("foo", "bar", "baz"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo", "bar", "baz"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2, kDocumentId1));
+
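+ // Explicit grouping should produce the same interpretation and results.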
+ query = CreateQuery("(bar OR baz) foo");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_three(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_three);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_three).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use, IsEmpty());
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("foo", "bar", "baz"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo", "bar", "baz"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2, kDocumentId1));
+}
+
+TEST_P(QueryVisitorTest, AndOrNotPrecedence) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("baz"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Should be interpreted like `foo ((NOT bar) OR baz)`
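+ // since NOT binds only to the term immediately following it.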
+ std::string query = CreateQuery("foo NOT bar OR baz");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("foo", "baz"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo", "baz"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2, kDocumentId0));
+
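+ // With explicit grouping, NOT applies to the whole `(bar OR baz)` clause, so
+ // only doc 0 (which contains neither "bar" nor "baz") should match.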
+ query = CreateQuery("foo NOT (bar OR baz)");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ EXPECT_THAT(query_results.query_terms[""], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, PropertyFilter) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Section ids are assigned alphabetically.
+ SectionId prop1_section_id = 0;
+ SectionId prop2_section_id = 1;
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
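+ // Restricting the query to "prop1" should exclude doc 2, whose only hit is
+ // in "prop2".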
+ std::string query = CreateQuery("foo", /*property_restrict=*/"prop1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop1"));
+ EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use, IsEmpty());
+ }
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
+TEST_F(QueryVisitorTest, MultiPropertyFilter) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop3")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Section ids are assigned alphabetically.
+ SectionId prop1_section_id = 0;
+ SectionId prop2_section_id = 1;
+ SectionId prop3_section_id = 2;
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, prop2_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, prop3_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
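+ // The createList restrict covers "prop1" and "prop2", so doc 2 (whose only
+ // hit is in "prop3") should be excluded.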
+ std::string query = R"(search("foo", createList("prop1", "prop2")))";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop1", "prop2"));
+ EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop2"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, PropertyFilterStringIsInvalid) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // "prop1" is a STRING token, which cannot be a property name.
+ std::string query = CreateQuery(R"(("prop1":foo))");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, PropertyFilterNonNormalized) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("PROP1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("PROP2")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+ // Section ids are assigned alphabetically.
+ SectionId prop1_section_id = 0;
+ SectionId prop2_section_id = 1;
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("foo", /*property_restrict=*/"PROP1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("PROP1"));
+ EXPECT_THAT(query_results.query_terms["PROP1"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use, IsEmpty());
+ }
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, PropertyFilterWithGrouping) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Section ids are assigned alphabetically.
+ SectionId prop1_section_id = 0;
+ SectionId prop2_section_id = 1;
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query =
+ CreateQuery("(foo OR bar)", /*property_restrict=*/"prop1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop1"));
+ EXPECT_THAT(query_results.query_terms["prop1"],
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, ValidNestedPropertyFilter) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Section ids are assigned alphabetically.
+ SectionId prop1_section_id = 0;
+ SectionId prop2_section_id = 1;
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("(prop1:foo)", /*property_restrict=*/"prop1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop1"));
+ EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1));
+
+ query = CreateQuery("(prop1:(prop1:(prop1:(prop1:foo))))",
+ /*property_restrict=*/"prop1");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop1"));
+ EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId1));
+}
+
+TEST_P(QueryVisitorTest, InvalidNestedPropertyFilter) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Section ids are assigned alphabetically.
+ SectionId prop1_section_id = 0;
+ SectionId prop2_section_id = 1;
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("(prop2:foo)", /*property_restrict=*/"prop1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), IsEmpty());
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty());
+
+ // Resulting queries:
+ // - kPlain: `prop1:(prop2:(prop1:(prop2:(prop1:foo))))`
+ // - kSearch: `search("(prop2:(prop1:(prop2:(prop1:foo))))",
+ //   createList("prop1"))`
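+ // The effective restrict for the innermost terms is the intersection of all
+ // enclosing restricts, {prop1} ∩ {prop2} = {}, so no term can match and the
+ // results below are empty.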
+ query = CreateQuery("(prop2:(prop1:(prop2:(prop1:foo))))",
+ /*property_restrict=*/"prop1");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), IsEmpty());
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators), IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty());
+}
+
+TEST_P(QueryVisitorTest, NotWithPropertyFilter) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Section ids are assigned alphabetically.
+ SectionId prop1_section_id = 0;
+ SectionId prop2_section_id = 1;
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Resulting queries:
+ // - kPlain: `-prop1:(foo OR bar)`
+ // - kSearch: `-search("foo OR bar", createList("prop1"))`
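+ // Negating the restricted clause matches exactly the docs with no "foo" or
+ // "bar" hit in "prop1": here only doc2, whose "foo" is in "prop2".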
+ std::string query = absl_ports::StrCat(
+ "-", CreateQuery("(foo OR bar)", /*property_restrict=*/"prop1"));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), IsEmpty());
+ EXPECT_THAT(query_results.query_term_iterators, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2));
+
+ // Resulting queries:
+ // - kPlain: `NOT prop1:(foo OR bar)`
+ // - kSearch: `NOT search("foo OR bar", createList("prop1"))`
+ query = absl_ports::StrCat(
+ "NOT ", CreateQuery("(foo OR bar)", /*property_restrict=*/"prop1"));
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), IsEmpty());
+ EXPECT_THAT(query_results.query_term_iterators, IsEmpty());
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2));
+}
+
+TEST_P(QueryVisitorTest, PropertyFilterWithNot) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Section ids are assigned alphabetically.
+ SectionId prop1_section_id = 0;
+ SectionId prop2_section_id = 1;
+
+ // Create documents as follows:
+ // Doc0:
+ // prop1: "bar"
+ // prop2: ""
+ // Doc1:
+ // prop1: "foo"
+ // prop2: ""
+ // Doc2:
+ // prop1: ""
+ // prop2: "foo"
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Resulting queries:
+ // - kPlain: `prop1:(-foo OR bar)`
+ // - kSearch: `search("-foo OR bar", createList("prop1"))`
+ //
+ // The query is equivalent to `-prop1:foo OR prop1:bar`, thus doc0 and doc2
+ // will be matched.
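+ // Per doc: doc0 matches via `prop1:bar`, doc2 matches via `-prop1:foo` (its
+ // "foo" is in prop2), and doc1, with "foo" in prop1, fails both clauses.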
+ std::string query =
+ CreateQuery("(-foo OR bar)", /*property_restrict=*/"prop1");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop1"));
+ EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("bar"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("bar"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2, kDocumentId0));
+
+ // Resulting queries:
+ // - kPlain: `prop1:(NOT foo OR bar)`
+ // - kSearch: `search("NOT foo OR bar", createList("prop1"))`
+ //
+ // The query is equivalent to `-prop1:foo OR prop1:bar`, thus doc0 and doc2
+ // will be matched.
+ query = CreateQuery("(NOT foo OR bar)", /*property_restrict=*/"prop1");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop1"));
+ EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("bar"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("bar"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2, kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, SegmentationTest) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Section ids are assigned alphabetically.
+ SectionId prop1_section_id = 0;
+ SectionId prop2_section_id = 1;
+
+ // ICU segmentation will break this into "每天" and "上班".
+ // CFStringTokenizer (iOS) will break this into "每", "天" and "上班".
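+ // Only doc0, set up below, contains every query term across its properties,
+ // so it is the only expected match.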
+ std::string query = CreateQuery("每天上班");
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("上班"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ editor = index_->Edit(kDocumentId0, prop2_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ if (IsCfStringTokenization()) {
+ ICING_ASSERT_OK(editor.BufferTerm("每"));
+ ICING_ASSERT_OK(editor.BufferTerm("天"));
+ } else {
+ ICING_ASSERT_OK(editor.BufferTerm("每天"));
+ }
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("上班"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, prop2_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ if (IsCfStringTokenization()) {
+ ICING_ASSERT_OK(editor.BufferTerm("每"));
+ ICING_ASSERT_OK(editor.BufferTerm("天"));
+ } else {
+ ICING_ASSERT_OK(editor.BufferTerm("每天"));
+ }
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use, IsEmpty());
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms), UnorderedElementsAre(""));
+ if (IsCfStringTokenization()) {
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("每", "天", "上班"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("每", "天", "上班"));
+ } else {
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("每天", "上班"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("每天", "上班"));
+ }
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest, PropertyRestrictsPopCorrectly) {
+ PropertyConfigProto prop =
+ PropertyConfigBuilder()
+ .SetName("prop0")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(prop)
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop1"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop2")))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ SectionId prop0_id = 0;
+ SectionId prop1_id = 1;
+ SectionId prop2_id = 2;
+ NamespaceId ns_id = 0;
+
+ // Create the following docs:
+ // - Doc 0: Contains 'val0', 'val1', 'val2' in 'prop0'. Shouldn't match.
+ DocumentProto doc =
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid0, document_store_->Put(doc));
+ Index::Editor editor =
+ index_->Edit(docid0, prop0_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val0"));
+ ICING_ASSERT_OK(editor.BufferTerm("val1"));
+ ICING_ASSERT_OK(editor.BufferTerm("val2"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // - Doc 1: Contains 'val0', 'val1', 'val2' in 'prop1'. Should match.
+ doc = DocumentBuilder(doc).SetUri("uri1").Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid1, document_store_->Put(doc));
+ editor = index_->Edit(docid1, prop1_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val0"));
+ ICING_ASSERT_OK(editor.BufferTerm("val1"));
+ ICING_ASSERT_OK(editor.BufferTerm("val2"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // - Doc 2: Contains 'val0', 'val1', 'val2' in 'prop2'. Shouldn't match.
+ doc = DocumentBuilder(doc).SetUri("uri2").Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid2, document_store_->Put(doc));
+ editor = index_->Edit(docid2, prop2_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val0"));
+ ICING_ASSERT_OK(editor.BufferTerm("val1"));
+ ICING_ASSERT_OK(editor.BufferTerm("val2"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // - Doc 3: Contains 'val0' in 'prop0', 'val1' in 'prop1', 'val2' in 'prop2'.
+ //   Should match.
+ doc = DocumentBuilder(doc).SetUri("uri3").Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid3, document_store_->Put(doc));
+ editor = index_->Edit(docid3, prop0_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val0"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ editor = index_->Edit(docid3, prop1_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val1"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ editor = index_->Edit(docid3, prop2_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val2"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // - Doc 4: Contains 'val1' in 'prop0', 'val2' in 'prop1', 'val0' in 'prop2'.
+ // Shouldn't match.
+ doc = DocumentBuilder(doc).SetUri("uri4").Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid4, document_store_->Put(doc));
+ editor = index_->Edit(docid4, prop0_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val1"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ editor = index_->Edit(docid4, prop1_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val2"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ editor = index_->Edit(docid4, prop2_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val0"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Now issue a query with 'val1' restricted to 'prop1'. This should match only
+ // docs 1 and 3.
+ // Resulting queries:
+ // - kPlain: `val0 prop1:val1 val2`
+ // - kSearch: `val0 search("val1", createList("prop1")) val2`
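+ // The restrict only applies within its clause, so "val0" and "val2" remain
+ // unrestricted (keyed under "" in query_terms) while "val1" is keyed under
+ // "prop1"; i.e. the restrict pops correctly once its clause closes.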
+ std::string query = absl_ports::StrCat(
+ "val0 ", CreateQuery("val1", /*property_restrict=*/"prop1"), " val2");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ if (GetParam() == QueryType::kSearch) {
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ } else {
+ EXPECT_THAT(query_results.features_in_use, IsEmpty());
+ }
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("", "prop1"));
+ EXPECT_THAT(query_results.query_terms[""],
+ UnorderedElementsAre("val0", "val2"));
+ EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("val1"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("val0", "val1", "val2"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(docid3, docid1));
+}
+
+TEST_P(QueryVisitorTest, UnsatisfiablePropertyRestrictsPopCorrectly) {
+ PropertyConfigProto prop =
+ PropertyConfigBuilder()
+ .SetName("prop0")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(prop)
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop1"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop2")))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ SectionId prop0_id = 0;
+ SectionId prop1_id = 1;
+ SectionId prop2_id = 2;
+ NamespaceId ns_id = 0;
+
+ // Create the following docs:
+ // - Doc 0: Contains 'val0', 'val1', 'val2' in 'prop0'. Should match.
+ DocumentProto doc =
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid0, document_store_->Put(doc));
+ Index::Editor editor =
+ index_->Edit(docid0, prop0_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val0"));
+ ICING_ASSERT_OK(editor.BufferTerm("val1"));
+ ICING_ASSERT_OK(editor.BufferTerm("val2"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // - Doc 1: Contains 'val0', 'val1', 'val2' in 'prop1'. Shouldn't match.
+ doc = DocumentBuilder(doc).SetUri("uri1").Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid1, document_store_->Put(doc));
+ editor = index_->Edit(docid1, prop1_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val0"));
+ ICING_ASSERT_OK(editor.BufferTerm("val1"));
+ ICING_ASSERT_OK(editor.BufferTerm("val2"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // - Doc 2: Contains 'val0', 'val1', 'val2' in 'prop2'. Should match.
+ doc = DocumentBuilder(doc).SetUri("uri2").Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid2, document_store_->Put(doc));
+ editor = index_->Edit(docid2, prop2_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val0"));
+ ICING_ASSERT_OK(editor.BufferTerm("val1"));
+ ICING_ASSERT_OK(editor.BufferTerm("val2"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // - Doc 3: Contains 'val0' in 'prop0', 'val1' in 'prop1', 'val2' in 'prop2'.
+ //   Should match.
+ doc = DocumentBuilder(doc).SetUri("uri3").Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid3, document_store_->Put(doc));
+ editor = index_->Edit(docid3, prop0_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val0"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ editor = index_->Edit(docid3, prop1_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val1"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ editor = index_->Edit(docid3, prop2_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val2"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // - Doc 4: Contains 'val1' in 'prop0', 'val2' in 'prop1', 'val0' in 'prop2'.
+ // Shouldn't match.
+ doc = DocumentBuilder(doc).SetUri("uri4").Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid4, document_store_->Put(doc));
+ editor = index_->Edit(docid4, prop0_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val1"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ editor = index_->Edit(docid4, prop1_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val2"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+ editor = index_->Edit(docid4, prop2_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("val0"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Now issue a query with 'val1' restricted to 'prop2' nested inside a
+ // 'prop1' restrict. The nested restrict is unsatisfiable, so only the
+ // `prop0:val0` and `prop2:val2` clauses can produce matches: docs 0, 2 and 3.
+ // Resulting queries:
+ // - kPlain: `prop0:val0 OR prop1:(prop2:val1) OR prop2:val2`
+ // - kSearch: `prop0:val0 OR prop1:(search("val1", createList("prop2"))) OR
+ //   prop2:val2`
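+ // Per doc: doc0 matches via `prop0:val0`, doc2 via `prop2:val2`, doc3 via
+ // both; doc1 and doc4 satisfy neither outer clause.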
+ std::string query = absl_ports::StrCat(
+ "prop0:val0 OR prop1:(",
+ CreateQuery("val1", /*property_restrict=*/"prop2"), ") OR prop2:val2");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop0", "prop2"));
+ EXPECT_THAT(query_results.query_terms["prop0"], UnorderedElementsAre("val0"));
+ EXPECT_THAT(query_results.query_terms["prop2"], UnorderedElementsAre("val2"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("val0", "val2"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(docid3, docid2, docid0));
+}
+
+TEST_F(QueryVisitorTest, UnsupportedFunctionReturnsInvalidArgument) {
+ std::string query = "unsupportedFunction()";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QueryVisitorTest, SearchFunctionTooFewArgumentsReturnsInvalidArgument) {
+ std::string query = "search()";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QueryVisitorTest, SearchFunctionTooManyArgumentsReturnsInvalidArgument) {
+ std::string query = R"(search("foo", createList("subject"), "bar"))";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QueryVisitorTest,
+ SearchFunctionWrongFirstArgumentTypeReturnsInvalidArgument) {
+ // First argument type=TEXT, expected STRING.
+ std::string query = "search(7)";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // First argument type=string list, expected STRING.
+ query = R"(search(createList("subject")))";
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ EXPECT_THAT(std::move(query_visitor_two).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QueryVisitorTest,
+ SearchFunctionWrongSecondArgumentTypeReturnsInvalidArgument) {
+ // Second argument type=STRING, expected string list.
+ std::string query = R"(search("foo", "bar"))";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // Second argument type=TEXT, expected string list.
+ query = R"(search("foo", 7))";
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ EXPECT_THAT(std::move(query_visitor_two).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QueryVisitorTest,
+ SearchFunctionCreateListZeroPropertiesReturnsInvalidArgument) {
+ std::string query = R"(search("foo", createList()))";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QueryVisitorTest, SearchFunctionNestedFunctionCalls) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Section ids are assigned alphabetically.
+ SectionId prop1_section_id = 0;
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, prop1_section_id,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId1, prop1_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("type").Build()));
+ editor = index_->Edit(kDocumentId2, prop1_section_id, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Nested `search` calls are allowed, and their property restrict lists
+ // compose: a term only matches if it satisfies every enclosing restrict.
+ // Both levels here restrict to "prop1", so the query behaves like
+ // `prop1:foo prop1:bar` and only doc2, with both terms in "prop1", matches.
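+ // Assuming EscapeString simply backslash-escapes the embedded quotes, the
+ // constructed level-two query looks like:
+ //   search("search(\"foo\", createList(\"prop1\")) bar", createList("prop1"))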
+ std::string level_one_query = R"(search("foo", createList("prop1")) bar)";
+ std::string level_two_query =
+ absl_ports::StrCat(R"(search(")", EscapeString(level_one_query),
+ R"(", createList("prop1")))");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(level_two_query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_two_query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop1"));
+ EXPECT_THAT(query_results.query_terms["prop1"],
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2));
+
+ std::string level_three_query =
+ absl_ports::StrCat(R"(search(")", EscapeString(level_two_query),
+ R"(", createList("prop1")))");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(level_three_query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ level_three_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop1"));
+ EXPECT_THAT(query_results.query_terms["prop1"],
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2));
+
+ std::string level_four_query =
+ absl_ports::StrCat(R"(search(")", EscapeString(level_three_query),
+ R"(", createList("prop1")))");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(level_four_query));
+ QueryVisitor query_visitor_three(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ level_four_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_three);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_three).ConsumeResults());
+
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop1"));
+ EXPECT_THAT(query_results.query_terms["prop1"],
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo", "bar"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(kDocumentId2));
+}
+
+// This test will nest `search` calls together with the set of restricts
+// narrowing at each level so that the set of docs matching the query shrinks.
+TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsNarrowing) {
+ PropertyConfigProto prop =
+ PropertyConfigBuilder()
+ .SetName("prop0")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(prop)
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop1"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop2"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop3"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop4"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop5"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop6"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop7")))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Section ids are assigned alphabetically.
+ SectionId prop0_id = 0;
+ SectionId prop1_id = 1;
+ SectionId prop2_id = 2;
+ SectionId prop3_id = 3;
+ SectionId prop4_id = 4;
+ SectionId prop5_id = 5;
+ SectionId prop6_id = 6;
+ SectionId prop7_id = 7;
+
+ NamespaceId ns_id = 0;
+ DocumentProto doc =
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid0, document_store_->Put(doc));
+ Index::Editor editor =
+ index_->Edit(docid0, prop0_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid1,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build()));
+ editor = index_->Edit(docid1, prop1_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid2,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build()));
+ editor = index_->Edit(docid2, prop2_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid3,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build()));
+ editor = index_->Edit(docid3, prop3_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid4,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build()));
+ editor = index_->Edit(docid4, prop4_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid5,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build()));
+ editor = index_->Edit(docid5, prop5_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid6,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build()));
+ editor = index_->Edit(docid6, prop6_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid7,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build()));
+ editor = index_->Edit(docid7, prop7_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Nested `search` calls compose their property restrict lists by
+ // intersection, so each outer level can only narrow the effective set of
+ // searchable properties.
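+ // Effective restrict sets, level by level (outer ∩ inner):
+ //   level 1: all eight props               -> docs 0-7 match
+ //   level 2: {prop6, prop0, prop4, prop2}  -> docs 0, 2, 4, 6 match
+ //   level 3: {prop0, prop6}                -> docs 0, 6 match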
+ std::string level_one_query =
+ R"(search("foo", createList("prop2", "prop5", "prop1", "prop3", "prop0", "prop6", "prop4", "prop7")))";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(level_one_query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_one_query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop0", "prop1", "prop2", "prop3", "prop4",
+ "prop5", "prop6", "prop7"));
+ EXPECT_THAT(query_results.query_terms["prop0"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop1"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop2"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop3"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop4"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop5"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop6"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop7"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(docid7, docid6, docid5, docid4, docid3, docid2,
+ docid1, docid0));
+
+ std::string level_two_query = absl_ports::StrCat(
+ R"(search(")", EscapeString(level_one_query),
+ R"(", createList("prop6", "prop0", "prop4", "prop2")))");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(level_two_query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_two_query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop0", "prop2", "prop4", "prop6"));
+ EXPECT_THAT(query_results.query_terms["prop0"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop2"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop4"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop6"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(docid6, docid4, docid2, docid0));
+
+ std::string level_three_query =
+ absl_ports::StrCat(R"(search(")", EscapeString(level_two_query),
+ R"(", createList("prop0", "prop6")))");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(level_three_query));
+ QueryVisitor query_visitor_three(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ level_three_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info_=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_three);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_three).ConsumeResults());
+
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop0", "prop6"));
+ EXPECT_THAT(query_results.query_terms["prop0"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop6"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(docid6, docid0));
+}
+
+ // This test will nest `search` calls together with the set of restricts
+ // expanding at each level. The set of docs matching the query does not grow,
+ // because the innermost (narrowest) restrict set still applies.
+TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsExpanding) {
+ PropertyConfigProto prop =
+ PropertyConfigBuilder()
+ .SetName("prop0")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("type")
+ .AddProperty(prop)
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop1"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop2"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop3"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop4"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop5"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop6"))
+ .AddProperty(PropertyConfigBuilder(prop).SetName("prop7")))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Section ids are assigned alphabetically.
+ SectionId prop0_id = 0;
+ SectionId prop1_id = 1;
+ SectionId prop2_id = 2;
+ SectionId prop3_id = 3;
+ SectionId prop4_id = 4;
+ SectionId prop5_id = 5;
+ SectionId prop6_id = 6;
+ SectionId prop7_id = 7;
+
+ NamespaceId ns_id = 0;
+ DocumentProto doc =
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("type").Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId docid0, document_store_->Put(doc));
+ Index::Editor editor =
+ index_->Edit(docid0, prop0_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid1,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri1").Build()));
+ editor = index_->Edit(docid1, prop1_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid2,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri2").Build()));
+ editor = index_->Edit(docid2, prop2_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid3,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri3").Build()));
+ editor = index_->Edit(docid3, prop3_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid4,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri4").Build()));
+ editor = index_->Edit(docid4, prop4_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid5,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri5").Build()));
+ editor = index_->Edit(docid5, prop5_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid6,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri6").Build()));
+ editor = index_->Edit(docid6, prop6_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId docid7,
+ document_store_->Put(DocumentBuilder(doc).SetUri("uri7").Build()));
+ editor = index_->Edit(docid7, prop7_id, TERM_MATCH_PREFIX, ns_id);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+  // Level one: restrict "foo" to prop0 and prop6. Only docid0 (hit in prop0)
+  // and docid6 (hit in prop6) should match.
+ std::string level_one_query =
+ R"(search("foo", createList("prop0", "prop6")))";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(level_one_query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_one_query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+      /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop0", "prop6"));
+ EXPECT_THAT(query_results.query_terms["prop0"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop6"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(docid6, docid0));
+
+ std::string level_two_query = absl_ports::StrCat(
+ R"(search(")", EscapeString(level_one_query),
+ R"(", createList("prop6", "prop0", "prop4", "prop2")))");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(level_two_query));
+ QueryVisitor query_visitor_two(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_two_query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+      /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_two);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_two).ConsumeResults());
+
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop0", "prop6"));
+ EXPECT_THAT(query_results.query_terms["prop0"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop6"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(docid6, docid0));
+
+ std::string level_three_query =
+ absl_ports::StrCat(R"(search(")", EscapeString(level_two_query),
+ R"(", createList("prop2", "prop5", "prop1", "prop3",)",
+ R"( "prop0", "prop6", "prop4", "prop7")))");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(level_three_query));
+ QueryVisitor query_visitor_three(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(),
+ level_three_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+      /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor_three);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor_three).ConsumeResults());
+
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+ EXPECT_THAT(ExtractKeys(query_results.query_terms),
+ UnorderedElementsAre("prop0", "prop6"));
+ EXPECT_THAT(query_results.query_terms["prop0"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(query_results.query_terms["prop6"], UnorderedElementsAre("foo"));
+ EXPECT_THAT(ExtractKeys(query_results.query_term_iterators),
+ UnorderedElementsAre("foo"));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ ElementsAre(docid6, docid0));
+}
+
+TEST_F(QueryVisitorTest,
+ PropertyDefinedFunctionWithNoArgumentReturnsInvalidArgument) {
+ std::string query = "propertyDefined()";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+      /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(
+ QueryVisitorTest,
+ PropertyDefinedFunctionWithMoreThanOneTextArgumentReturnsInvalidArgument) {
+ std::string query = "propertyDefined(\"foo\", \"bar\")";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+      /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QueryVisitorTest,
+ PropertyDefinedFunctionWithTextArgumentReturnsInvalidArgument) {
+ // The argument type is TEXT, not STRING here.
+ std::string query = "propertyDefined(foo)";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+      /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QueryVisitorTest,
+ PropertyDefinedFunctionWithNonTextArgumentReturnsInvalidArgument) {
+ std::string query = "propertyDefined(1 < 2)";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+      /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, PropertyDefinedFunctionReturnsMatchingDocuments) {
+ // Set up two schemas, one with a "url" field and one without.
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("typeWithUrl")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("url")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("typeWithoutUrl"))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Document 0 has the term "foo" and its schema has the url property.
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("typeWithUrl").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Document 1 has the term "foo" and its schema DOESN'T have the url property.
+ ICING_ASSERT_OK(document_store_->Put(DocumentBuilder()
+ .SetKey("ns", "uri1")
+ .SetSchema("typeWithoutUrl")
+ .Build()));
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Document 2 has the term "bar" and its schema has the url property.
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("typeWithUrl").Build()));
+ editor = index_->Edit(kDocumentId2, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("foo propertyDefined(\"url\")");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+      /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ UnorderedElementsAre(kDocumentId0));
+}
+
+TEST_P(QueryVisitorTest,
+ PropertyDefinedFunctionReturnsNothingIfNoMatchingProperties) {
+ // Set up two schemas, one with a "url" field and one without.
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("typeWithUrl")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("url")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("typeWithoutUrl"))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Document 0 has the term "foo" and its schema has the url property.
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("typeWithUrl").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Document 1 has the term "foo" and its schema DOESN'T have the url property.
+ ICING_ASSERT_OK(document_store_->Put(DocumentBuilder()
+ .SetKey("ns", "uri1")
+ .SetSchema("typeWithoutUrl")
+ .Build()));
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Attempt to query a non-existent property.
+ std::string query = CreateQuery("propertyDefined(\"nonexistentproperty\")");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+      /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty());
+}
+
+TEST_P(QueryVisitorTest,
+ PropertyDefinedFunctionWithNegationMatchesDocsWithNoSuchProperty) {
+ // Set up two schemas, one with a "url" field and one without.
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("typeWithUrl")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("url")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("typeWithoutUrl"))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Document 0 has the term "foo" and its schema has the url property.
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("typeWithUrl").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId1,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Document 1 has the term "foo" and its schema DOESN'T have the url property.
+ ICING_ASSERT_OK(document_store_->Put(DocumentBuilder()
+ .SetKey("ns", "uri1")
+ .SetSchema("typeWithoutUrl")
+ .Build()));
+ editor = index_->Edit(kDocumentId1, kSectionId1, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ std::string query = CreateQuery("foo AND NOT propertyDefined(\"url\")");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+      /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kListFilterQueryLanguageFeature));
+
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ UnorderedElementsAre(kDocumentId1));
+}
+
+TEST_F(QueryVisitorTest,
+ HasPropertyFunctionWithNoArgumentReturnsInvalidArgument) {
+ std::string query = "hasProperty()";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QueryVisitorTest,
+ HasPropertyFunctionWithMoreThanOneStringArgumentReturnsInvalidArgument) {
+ std::string query = "hasProperty(\"foo\", \"bar\")";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QueryVisitorTest,
+ HasPropertyFunctionWithTextArgumentReturnsInvalidArgument) {
+ // The argument type is TEXT, not STRING here.
+ std::string query = "hasProperty(foo)";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(QueryVisitorTest,
+ HasPropertyFunctionWithNonStringArgumentReturnsInvalidArgument) {
+ std::string query = "hasProperty(1 < 2)";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ EXPECT_THAT(std::move(query_visitor).ConsumeResults(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryVisitorTest, HasPropertyFunctionReturnsMatchingDocuments) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Simple")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("price")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Document 0 has the term "foo" and has the "price" property.
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("Simple").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId0,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm(
+ absl_ports::StrCat(kPropertyExistenceTokenPrefix, "price").c_str()));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Document 1 has the term "foo" and doesn't have the "price" property.
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("Simple").Build()));
+ editor = index_->Edit(kDocumentId1, kSectionId0, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Document 2 has the term "bar" and has the "price" property.
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri2").SetSchema("Simple").Build()));
+ editor = index_->Edit(kDocumentId2, kSectionId0, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("bar"));
+ ICING_ASSERT_OK(editor.BufferTerm(
+ absl_ports::StrCat(kPropertyExistenceTokenPrefix, "price").c_str()));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Test that `foo hasProperty("price")` matches document 0 only.
+ std::string query = CreateQuery("foo hasProperty(\"price\")");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor1(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor1);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor1).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kHasPropertyFunctionFeature,
+ kListFilterQueryLanguageFeature));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ UnorderedElementsAre(kDocumentId0));
+
+ // Test that `bar OR NOT hasProperty("price")` matches document 1 and
+ // document 2.
+ query = CreateQuery("bar OR NOT hasProperty(\"price\")");
+ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query));
+ QueryVisitor query_visitor2(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor2);
+ ICING_ASSERT_OK_AND_ASSIGN(query_results,
+ std::move(query_visitor2).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kHasPropertyFunctionFeature,
+ kListFilterQueryLanguageFeature));
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()),
+ UnorderedElementsAre(kDocumentId1, kDocumentId2));
+}
+
+TEST_P(QueryVisitorTest,
+ HasPropertyFunctionReturnsNothingIfNoMatchingProperties) {
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Simple")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("price")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build(),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Document 0 has the term "foo" and has the "price" property.
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri0").SetSchema("Simple").Build()));
+ Index::Editor editor = index_->Edit(kDocumentId0, kSectionId0,
+ TERM_MATCH_PREFIX, /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.BufferTerm(
+ absl_ports::StrCat(kPropertyExistenceTokenPrefix, "price").c_str()));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Document 1 has the term "foo" and doesn't have the "price" property.
+ ICING_ASSERT_OK(document_store_->Put(
+ DocumentBuilder().SetKey("ns", "uri1").SetSchema("Simple").Build()));
+ editor = index_->Edit(kDocumentId1, kSectionId0, TERM_MATCH_PREFIX,
+ /*namespace_id=*/0);
+ ICING_ASSERT_OK(editor.BufferTerm("foo"));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
+
+ // Attempt to query a non-existent property.
+ std::string query = CreateQuery("hasProperty(\"nonexistentproperty\")");
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> root_node,
+ ParseQueryHelper(query));
+ QueryVisitor query_visitor(
+ index_.get(), numeric_index_.get(), document_store_.get(),
+ schema_store_.get(), normalizer_.get(), tokenizer_.get(), query,
+ DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX,
+ /*needs_term_frequency_info=*/true, clock_.GetSystemTimeMilliseconds());
+ root_node->Accept(&query_visitor);
+ ICING_ASSERT_OK_AND_ASSIGN(QueryResults query_results,
+ std::move(query_visitor).ConsumeResults());
+ EXPECT_THAT(query_results.features_in_use,
+ UnorderedElementsAre(kHasPropertyFunctionFeature,
+ kListFilterQueryLanguageFeature));
+
+ EXPECT_THAT(GetDocumentIds(query_results.root_iterator.get()), IsEmpty());
+}
+
+INSTANTIATE_TEST_SUITE_P(QueryVisitorTest, QueryVisitorTest,
+ testing::Values(QueryType::kPlain,
+ QueryType::kSearch));
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/advanced_query_parser/util/string-util.cc b/icing/query/advanced_query_parser/util/string-util.cc
new file mode 100644
index 0000000..9af2ed6
--- /dev/null
+++ b/icing/query/advanced_query_parser/util/string-util.cc
@@ -0,0 +1,106 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/advanced_query_parser/util/string-util.h"
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+
+namespace icing {
+namespace lib {
+
+namespace string_util {
+
+libtextclassifier3::StatusOr<std::string> UnescapeStringValue(
+ std::string_view value) {
+ std::string result;
+ bool in_escape = false;
+ for (char c : value) {
+ if (in_escape) {
+ in_escape = false;
+ } else if (c == '\\') {
+ in_escape = true;
+ continue;
+ } else if (c == '"') {
+ return absl_ports::InvalidArgumentError(
+ "Encountered an unescaped quotation mark!");
+ }
+ result += c;
+ }
+ return result;
+}
+
+libtextclassifier3::StatusOr<std::string_view> FindEscapedToken(
+ std::string_view escaped_string, std::string_view unescaped_token) {
+ if (unescaped_token.empty()) {
+    return absl_ports::InvalidArgumentError(
+        "Cannot find the escaped form of an empty unescaped token.");
+ }
+
+ // Find the start of unescaped_token within the escaped_string
+ const char* esc_string_end = escaped_string.data() + escaped_string.length();
+ size_t pos = escaped_string.find(unescaped_token[0]);
+ const char* esc_token_start = (pos == std::string_view::npos)
+ ? esc_string_end
+ : escaped_string.data() + pos;
+ const char* esc_token_cur = esc_token_start;
+ const char* possible_next_start = nullptr;
+ bool is_escaped = false;
+ int i = 0;
+ for (; i < unescaped_token.length() && esc_token_cur < esc_string_end;
+ ++esc_token_cur) {
+ if (esc_token_cur != esc_token_start &&
+ *esc_token_cur == unescaped_token[0] &&
+ possible_next_start == nullptr) {
+ possible_next_start = esc_token_cur;
+ }
+
+    // Every char in the escaped region should either be an escape or match
+    // the next char in unescaped_token.
+ if (!is_escaped && *esc_token_cur == '\\') {
+ is_escaped = true;
+ } else if (*esc_token_cur == unescaped_token[i]) {
+ is_escaped = false;
+ ++i;
+ } else {
+ // No match. If we don't have a possible_next_start, then try to find one.
+ if (possible_next_start == nullptr) {
+ pos = escaped_string.find(unescaped_token[0],
+ esc_token_cur - escaped_string.data());
+ if (pos == std::string_view::npos) {
+ break;
+ }
+ esc_token_start = escaped_string.data() + pos;
+ } else {
+ esc_token_start = possible_next_start;
+ possible_next_start = nullptr;
+ }
+      // esc_token_start has been reset to a char that equals
+      // unescaped_token[0]. The loop increment will advance esc_token_cur past
+      // it, so set i to 1.
+ i = 1;
+ esc_token_cur = esc_token_start;
+ }
+ }
+ if (i != unescaped_token.length()) {
+ return absl_ports::InvalidArgumentError(
+        absl_ports::StrCat("Couldn't match chars at token=", unescaped_token,
+                           " and raw_text=", escaped_string));
+ }
+ return std::string_view(esc_token_start, esc_token_cur - esc_token_start);
+}
+
+} // namespace string_util
+
+} // namespace lib
+} // namespace icing
\ No newline at end of file
diff --git a/icing/query/advanced_query_parser/util/string-util.h b/icing/query/advanced_query_parser/util/string-util.h
new file mode 100644
index 0000000..09fb451
--- /dev/null
+++ b/icing/query/advanced_query_parser/util/string-util.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER__STRING_UTIL_H_
+#define ICING_QUERY_ADVANCED_QUERY_PARSER__STRING_UTIL_H_
+
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+
+namespace icing {
+namespace lib {
+
+namespace string_util {
+
+// Returns:
+// - On success, value with the escapes removed.
+// - INVALID_ARGUMENT if a non-escaped quote is encountered.
+// Ex. "fo\\\\o" -> "fo\\o"
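+//     "foo\\"    -> "foo" (a trailing, unpaired escape is simply dropped)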
+libtextclassifier3::StatusOr<std::string> UnescapeStringValue(
+ std::string_view value);
+
+// Returns:
+// - On success, string_view pointing to the segment of escaped_string that,
+// if unescaped, would match unescaped_token.
+// - INVALID_ARGUMENT if unescaped_token is empty or no segment of
+//   escaped_string unescapes to it.
+// Ex. escaped_string="foo b\\a\\\"r baz", unescaped_token="ba\"r"
+// returns "b\\a\\\"r"
+libtextclassifier3::StatusOr<std::string_view> FindEscapedToken(
+ std::string_view escaped_string, std::string_view unescaped_token);
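+
+// A minimal usage sketch for walking successive matches, mirroring
+// FindEscapedTokenTraversesThroughEscapedText in string-util_test.cc:
+//
+//   ICING_ASSIGN_OR_RETURN(std::string_view match,
+//                          string_util::FindEscapedToken(escaped, token));
+//   const char* match_end = match.data() + match.length();
+//   escaped = escaped.substr(match_end - escaped.data());
+//   // Repeat; FindEscapedToken returns INVALID_ARGUMENT once no match
+//   // remains.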
+
+} // namespace string_util
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_ADVANCED_QUERY_PARSER__STRING_UTIL_H_
diff --git a/icing/query/advanced_query_parser/util/string-util_test.cc b/icing/query/advanced_query_parser/util/string-util_test.cc
new file mode 100644
index 0000000..a7ccf3e
--- /dev/null
+++ b/icing/query/advanced_query_parser/util/string-util_test.cc
@@ -0,0 +1,125 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/advanced_query_parser/util/string-util.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+TEST(StringUtilTest, UnescapeStringEmptyString) {
+ EXPECT_THAT(string_util::UnescapeStringValue(""), IsOkAndHolds(IsEmpty()));
+}
+
+TEST(StringUtilTest, UnescapeStringStringWithNoEscapes) {
+ EXPECT_THAT(string_util::UnescapeStringValue("foo"), IsOkAndHolds("foo"));
+ EXPECT_THAT(string_util::UnescapeStringValue("f o o"), IsOkAndHolds("f o o"));
+ EXPECT_THAT(string_util::UnescapeStringValue("f\to\to"),
+ IsOkAndHolds("f\to\to"));
+ EXPECT_THAT(string_util::UnescapeStringValue("f.o.o"), IsOkAndHolds("f.o.o"));
+}
+
+TEST(StringUtilTest, UnescapeStringStringWithEscapes) {
+ EXPECT_THAT(string_util::UnescapeStringValue("f\\oo"), IsOkAndHolds("foo"));
+ EXPECT_THAT(string_util::UnescapeStringValue("f\\\\oo"),
+ IsOkAndHolds("f\\oo"));
+ EXPECT_THAT(string_util::UnescapeStringValue("f\\\"oo"),
+ IsOkAndHolds("f\"oo"));
+ EXPECT_THAT(string_util::UnescapeStringValue("foo\\"), IsOkAndHolds("foo"));
+ EXPECT_THAT(string_util::UnescapeStringValue("foo b\\a\\\"r baz"),
+ IsOkAndHolds("foo ba\"r baz"));
+ EXPECT_THAT(string_util::UnescapeStringValue("bar b\\aar bar\\s bart"),
+ IsOkAndHolds("bar baar bars bart"));
+ EXPECT_THAT(string_util::UnescapeStringValue("\\\\\\\\a"),
+ IsOkAndHolds("\\\\a"));
+}
+
+TEST(StringUtilTest, UnescapeStringQuoteWithoutEscape) {
+ EXPECT_THAT(string_util::UnescapeStringValue("f\\o\"o"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(string_util::UnescapeStringValue("f\"oo"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(StringUtilTest, FindEscapedTokenEmptyUnescapedToken) {
+ EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", ""),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(StringUtilTest, FindEscapedTokenTokenNotPresent) {
+ EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "elephant"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "bat"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "taz"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "bazz"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(StringUtilTest, FindEscapedTokenMatchInMiddleToken) {
+ EXPECT_THAT(string_util::FindEscapedToken("babar", "bar"),
+ IsOkAndHolds("bar"));
+}
+
+TEST(StringUtilTest, FindEscapedTokenMatches) {
+ EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "ba\"r"),
+ IsOkAndHolds("b\\a\\\"r"));
+ EXPECT_THAT(string_util::FindEscapedToken("\\\\\\\\a", "\\\\a"),
+ IsOkAndHolds("\\\\\\\\a"));
+}
+
+TEST(StringUtilTest, FindEscapedTokenTraversesThroughEscapedText) {
+ std::string_view escaped_text = "bar b\\aar bar\\s bart";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::string_view result,
+ string_util::FindEscapedToken(escaped_text, "bar"));
+ // escaped_text = "bar b\\aar bar\\s bart";
+ // escaped_token ^ ^
+ EXPECT_THAT(result, Eq("bar"));
+
+ // escaped_text = "b\\aar bar\\s bart";
+ // escaped_token ^ ^
+ const char* result_end = result.data() + result.length();
+ escaped_text = escaped_text.substr(result_end - escaped_text.data());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ result, string_util::FindEscapedToken(escaped_text, "bar"));
+ EXPECT_THAT(result, Eq("bar"));
+
+ // escaped_text = "\\s bart";
+ // escaped_token ^ ^
+ result_end = result.data() + result.length();
+ escaped_text = escaped_text.substr(result_end - escaped_text.data());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ result, string_util::FindEscapedToken(escaped_text, "bar"));
+ EXPECT_THAT(result, Eq("bar"));
+
+ result_end = result.data() + result.length();
+ escaped_text = escaped_text.substr(result_end - escaped_text.data());
+ EXPECT_THAT(string_util::FindEscapedToken(escaped_text, "bar"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
\ No newline at end of file
diff --git a/icing/query/query-features.h b/icing/query/query-features.h
new file mode 100644
index 0000000..d829cd7
--- /dev/null
+++ b/icing/query/query-features.h
@@ -0,0 +1,63 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_QUERY_FEATURES_H_
+#define ICING_QUERY_QUERY_FEATURES_H_
+
+#include <string_view>
+#include <unordered_set>
+
+namespace icing {
+namespace lib {
+
+// A feature used in a query.
+// All feature values here must be kept in sync with their counterparts in:
+// androidx-main/frameworks/support/appsearch/appsearch/src/main/java/androidx/appsearch/app/Features.java
+using Feature = std::string_view;
+
+// This feature relates to the use of the numeric comparison operators in the
+// advanced query language. Ex. `price < 10`.
+constexpr Feature kNumericSearchFeature =
+ "NUMERIC_SEARCH"; // Features#NUMERIC_SEARCH
+
+// This feature relates to the use of the STRING terminal in the advanced query
+// language. Ex. `"foo?bar"` is treated as a single term - `foo?bar`.
+constexpr Feature kVerbatimSearchFeature =
+ "VERBATIM_SEARCH"; // Features#VERBATIM_SEARCH
+
+// This feature covers all additions (other than numeric search and verbatim
+// search) to the query language to bring it into better alignment with the list
+// filters spec.
+// This includes:
+// - support for function calls
+// - expanding support for negation and property restriction expressions
+// - prefix operator '*'
+// - 'NOT' operator
+// - propertyDefined("url")
+constexpr Feature kListFilterQueryLanguageFeature =
+ "LIST_FILTER_QUERY_LANGUAGE"; // Features#LIST_FILTER_QUERY_LANGUAGE
+
+// This feature relates to the use of the "hasProperty(property_path)" function.
+constexpr Feature kHasPropertyFunctionFeature =
+ "HAS_PROPERTY_FUNCTION"; // Features#HAS_PROPERTY_FUNCTION
+
+inline std::unordered_set<Feature> GetQueryFeaturesSet() {
+ return {kNumericSearchFeature, kVerbatimSearchFeature,
+ kListFilterQueryLanguageFeature, kHasPropertyFunctionFeature};
+}
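+
+// A minimal sketch of gating features, assuming a SearchSpecProto that lists
+// its enabled features (mirroring the check in QueryProcessor::ParseSearch):
+//
+//   std::unordered_set<Feature> enabled(spec.enabled_features().begin(),
+//                                       spec.enabled_features().end());
+//   if (enabled.find(kNumericSearchFeature) == enabled.end()) {
+//     // Reject the query: it uses a feature the caller didn't enable.
+//   }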
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_QUERY_FEATURES_H_
diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc
index 4d714f8..bbfbf3c 100644
--- a/icing/query/query-processor.cc
+++ b/icing/query/query-processor.cc
@@ -18,8 +18,8 @@
#include <memory>
#include <stack>
#include <string>
-#include <string_view>
#include <unordered_map>
+#include <unordered_set>
#include <utility>
#include <vector>
@@ -35,7 +35,15 @@
#include "icing/index/iterator/doc-hit-info-iterator-section-restrict.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/proto/search.pb.h"
+#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
+#include "icing/query/advanced_query_parser/lexer.h"
+#include "icing/query/advanced_query_parser/parser.h"
+#include "icing/query/advanced_query_parser/query-visitor.h"
+#include "icing/query/query-features.h"
+#include "icing/query/query-processor.h"
+#include "icing/query/query-results.h"
#include "icing/query/query-terms.h"
+#include "icing/query/query-utils.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
@@ -46,7 +54,6 @@
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
#include "icing/transform/normalizer.h"
-#include "icing/util/clock.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -70,7 +77,7 @@ struct ParserStateFrame {
// If the last independent token was a property/section filter, then we need
// to save the section name so we can create a section filter iterator.
- std::string_view section_restrict = "";
+ std::string section_restrict;
};
// Combines any OR and AND iterators together into one iterator.
@@ -101,63 +108,124 @@ std::unique_ptr<DocHitInfoIterator> ProcessParserStateFrame(
} // namespace
libtextclassifier3::StatusOr<std::unique_ptr<QueryProcessor>>
-QueryProcessor::Create(Index* index,
+QueryProcessor::Create(Index* index, const NumericIndex<int64_t>* numeric_index,
const LanguageSegmenter* language_segmenter,
const Normalizer* normalizer,
const DocumentStore* document_store,
- const SchemaStore* schema_store, const Clock* clock) {
+ const SchemaStore* schema_store) {
ICING_RETURN_ERROR_IF_NULL(index);
+ ICING_RETURN_ERROR_IF_NULL(numeric_index);
ICING_RETURN_ERROR_IF_NULL(language_segmenter);
ICING_RETURN_ERROR_IF_NULL(normalizer);
ICING_RETURN_ERROR_IF_NULL(document_store);
ICING_RETURN_ERROR_IF_NULL(schema_store);
- ICING_RETURN_ERROR_IF_NULL(clock);
return std::unique_ptr<QueryProcessor>(
- new QueryProcessor(index, language_segmenter, normalizer, document_store,
- schema_store, clock));
+ new QueryProcessor(index, numeric_index, language_segmenter, normalizer,
+ document_store, schema_store));
}
QueryProcessor::QueryProcessor(Index* index,
+ const NumericIndex<int64_t>* numeric_index,
const LanguageSegmenter* language_segmenter,
const Normalizer* normalizer,
const DocumentStore* document_store,
- const SchemaStore* schema_store,
- const Clock* clock)
+ const SchemaStore* schema_store)
: index_(*index),
+ numeric_index_(*numeric_index),
language_segmenter_(*language_segmenter),
normalizer_(*normalizer),
document_store_(*document_store),
- schema_store_(*schema_store),
- clock_(*clock) {}
-
-libtextclassifier3::StatusOr<QueryProcessor::QueryResults>
-QueryProcessor::ParseSearch(const SearchSpecProto& search_spec) {
- ICING_ASSIGN_OR_RETURN(QueryResults results, ParseRawQuery(search_spec));
-
- DocHitInfoIteratorFilter::Options options;
-
- if (search_spec.namespace_filters_size() > 0) {
- options.namespaces =
- std::vector<std::string_view>(search_spec.namespace_filters().begin(),
- search_spec.namespace_filters().end());
+ schema_store_(*schema_store) {}
+
+libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseSearch(
+ const SearchSpecProto& search_spec,
+ ScoringSpecProto::RankingStrategy::Code ranking_strategy,
+ int64_t current_time_ms) {
+ if (search_spec.search_type() == SearchSpecProto::SearchType::UNDEFINED) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Search type ",
+ SearchSpecProto::SearchType::Code_Name(search_spec.search_type()),
+ " is not supported."));
+ }
+ QueryResults results;
+ if (search_spec.search_type() ==
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ ICING_VLOG(1) << "Using EXPERIMENTAL_ICING_ADVANCED_QUERY parser!";
+ ICING_ASSIGN_OR_RETURN(
+ results,
+ ParseAdvancedQuery(search_spec, ranking_strategy, current_time_ms));
+ } else {
+ ICING_ASSIGN_OR_RETURN(
+ results, ParseRawQuery(search_spec, ranking_strategy, current_time_ms));
}
- if (search_spec.schema_type_filters_size() > 0) {
- options.schema_types =
- std::vector<std::string_view>(search_spec.schema_type_filters().begin(),
- search_spec.schema_type_filters().end());
+ // Check that all new features used in the search have been enabled in the
+ // SearchSpec.
+ const std::unordered_set<Feature> enabled_features(
+ search_spec.enabled_features().begin(),
+ search_spec.enabled_features().end());
+ for (const Feature feature : results.features_in_use) {
+ if (enabled_features.find(feature) == enabled_features.end()) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("Attempted use of unenabled feature ", feature));
+ }
}
+ DocHitInfoIteratorFilter::Options options = GetFilterOptions(search_spec);
results.root_iterator = std::make_unique<DocHitInfoIteratorFilter>(
std::move(results.root_iterator), &document_store_, &schema_store_,
- &clock_, options);
+ options, current_time_ms);
+ if (!search_spec.type_property_filters().empty()) {
+ results.root_iterator =
+ DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
+ std::move(results.root_iterator), &document_store_, &schema_store_,
+ search_spec, current_time_ms);
+ }
return results;
}
+libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseAdvancedQuery(
+ const SearchSpecProto& search_spec,
+ ScoringSpecProto::RankingStrategy::Code ranking_strategy,
+ int64_t current_time_ms) const {
+ QueryResults results;
+ Lexer lexer(search_spec.query(), Lexer::Language::QUERY);
+ ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeQuery());
+
+ if (tree_root == nullptr) {
+ results.root_iterator = std::make_unique<DocHitInfoIteratorAllDocumentId>(
+ document_store_.last_added_document_id());
+ return results;
+ }
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, &language_segmenter_));
+ DocHitInfoIteratorFilter::Options options = GetFilterOptions(search_spec);
+ bool needs_term_frequency_info =
+ ranking_strategy == ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE;
+ QueryVisitor query_visitor(&index_, &numeric_index_, &document_store_,
+ &schema_store_, &normalizer_,
+ plain_tokenizer.get(), search_spec.query(),
+ std::move(options), search_spec.term_match_type(),
+ needs_term_frequency_info, current_time_ms);
+ tree_root->Accept(&query_visitor);
+ return std::move(query_visitor).ConsumeResults();
+}
+
// TODO(cassiewang): Collect query stats to populate the SearchResultsProto
-libtextclassifier3::StatusOr<QueryProcessor::QueryResults>
-QueryProcessor::ParseRawQuery(const SearchSpecProto& search_spec) {
+libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseRawQuery(
+ const SearchSpecProto& search_spec,
+ ScoringSpecProto::RankingStrategy::Code ranking_strategy,
+ int64_t current_time_ms) {
+ DocHitInfoIteratorFilter::Options options = GetFilterOptions(search_spec);
+
// Tokenize the incoming raw query
//
// TODO(cassiewang): Consider caching/creating a tokenizer factory that will
@@ -173,14 +241,13 @@ QueryProcessor::ParseRawQuery(const SearchSpecProto& search_spec) {
std::stack<ParserStateFrame> frames;
frames.emplace();
-
QueryResults results;
// Process all the tokens
for (int i = 0; i < tokens.size(); i++) {
const Token& token = tokens.at(i);
std::unique_ptr<DocHitInfoIterator> result_iterator;
- // TODO(cassiewang): Handle negation tokens
+ // TODO(b/202076890): Handle negation tokens
switch (token.type) {
case Token::Type::QUERY_LEFT_PARENTHESES: {
frames.emplace(ParserStateFrame());
@@ -218,7 +285,7 @@ QueryProcessor::ParseRawQuery(const SearchSpecProto& search_spec) {
"Encountered empty stack of ParserStateFrames");
}
- frames.top().section_restrict = token.text;
+ frames.top().section_restrict = std::string(token.text);
break;
}
case Token::Type::REGULAR: {
@@ -252,18 +319,38 @@ QueryProcessor::ParseRawQuery(const SearchSpecProto& search_spec) {
// We do the same amount of disk reads, so it may be dependent on how
// big the schema is and/or how popular schema type filtering and
// section filtering is.
-
ICING_ASSIGN_OR_RETURN(
result_iterator,
- index_.GetIterator(normalized_text, kSectionIdMaskAll,
- search_spec.term_match_type()));
-
- // Add terms to match if this is not a negation term.
+ index_.GetIterator(
+ normalized_text,
+ token.text.data() - search_spec.query().c_str(),
+ token.text.length(), kSectionIdMaskAll,
+ search_spec.term_match_type(),
+ /*need_hit_term_frequency=*/ranking_strategy ==
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE));
+
+ // Add term iterator and terms to match if this is not a negation term.
// WARNING: setting query terms at this point is not compatible with
// group-level excludes, group-level sections restricts or excluded
// section restricts. Those are not currently supported. If they became
// supported, this handling for query terms would need to be altered.
if (!frames.top().saw_exclude) {
+ if (ranking_strategy ==
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<DocHitInfoIterator> term_iterator,
+ index_.GetIterator(
+ normalized_text,
+ token.text.data() - search_spec.query().c_str(),
+ token.text.length(), kSectionIdMaskAll,
+ search_spec.term_match_type(),
+ /*need_hit_term_frequency=*/ranking_strategy ==
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE));
+ results.query_term_iterators[normalized_text] =
+ std::make_unique<DocHitInfoIteratorFilter>(
+ std::move(term_iterator), &document_store_, &schema_store_,
+ options, current_time_ms);
+ }
results.query_terms[frames.top().section_restrict].insert(
std::move(normalized_text));
}
@@ -316,9 +403,11 @@ QueryProcessor::ParseRawQuery(const SearchSpecProto& search_spec) {
if (!frames.top().section_restrict.empty()) {
// We saw a section restrict earlier, wrap the result iterator in
// the section restrict
- result_iterator = std::make_unique<DocHitInfoIteratorSectionRestrict>(
+ std::set<std::string> section_restricts;
+ section_restricts.insert(std::move(frames.top().section_restrict));
+ result_iterator = DocHitInfoIteratorSectionRestrict::ApplyRestrictions(
std::move(result_iterator), &document_store_, &schema_store_,
- frames.top().section_restrict);
+ std::move(section_restricts), current_time_ms);
frames.top().section_restrict = "";
}
diff --git a/icing/query/query-processor.h b/icing/query/query-processor.h
index fa98627..d4c22dd 100644
--- a/icing/query/query-processor.h
+++ b/icing/query/query-processor.h
@@ -15,18 +15,21 @@
#ifndef ICING_QUERY_QUERY_PROCESSOR_H_
#define ICING_QUERY_QUERY_PROCESSOR_H_
+#include <cstdint>
#include <memory>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/index/index.h"
+#include "icing/index/iterator/doc-hit-info-iterator-filter.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/numeric-index.h"
#include "icing/proto/search.pb.h"
+#include "icing/query/query-results.h"
#include "icing/query/query-terms.h"
#include "icing/schema/schema-store.h"
#include "icing/store/document-store.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer.h"
-#include "icing/util/clock.h"
namespace icing {
namespace lib {
@@ -44,19 +47,18 @@ class QueryProcessor {
// An QueryProcessor on success
// FAILED_PRECONDITION if any of the pointers is null.
static libtextclassifier3::StatusOr<std::unique_ptr<QueryProcessor>> Create(
- Index* index, const LanguageSegmenter* language_segmenter,
- const Normalizer* normalizer, const DocumentStore* document_store,
- const SchemaStore* schema_store, const Clock* clock);
+ Index* index, const NumericIndex<int64_t>* numeric_index,
+ const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
+ const DocumentStore* document_store, const SchemaStore* schema_store);
- struct QueryResults {
- std::unique_ptr<DocHitInfoIterator> root_iterator;
- // A map from section names to sets of terms restricted to those sections.
- // Query terms that are not restricted are found at the entry with key "".
- SectionRestrictQueryTermsMap query_terms;
- };
// Parse the search configurations (including the query, any additional
// filters, etc.) in the SearchSpecProto into one DocHitInfoIterator.
//
+ // When ranking_strategy == RELEVANCE_SCORE, the root_iterator and the
+ // query_term_iterators returned will keep term frequency information
+ // internally, so that term frequency stats will be collected when calling
+  // PopulateMatchedTermsStats on the iterators.
+ //
// Returns:
// On success,
// - One iterator that represents the entire query
@@ -64,14 +66,29 @@ class QueryProcessor {
// INVALID_ARGUMENT if query syntax is incorrect and cannot be tokenized
// INTERNAL_ERROR on all other errors
libtextclassifier3::StatusOr<QueryResults> ParseSearch(
- const SearchSpecProto& search_spec);
+ const SearchSpecProto& search_spec,
+ ScoringSpecProto::RankingStrategy::Code ranking_strategy,
+ int64_t current_time_ms);
private:
explicit QueryProcessor(Index* index,
+ const NumericIndex<int64_t>* numeric_index,
const LanguageSegmenter* language_segmenter,
const Normalizer* normalizer,
const DocumentStore* document_store,
- const SchemaStore* schema_store, const Clock* clock);
+ const SchemaStore* schema_store);
+
+  // Parse the query into one DocHitInfoIterator that represents the root of a
+ // query tree in our new Advanced Query Language.
+ //
+ // Returns:
+ // On success,
+ // - One iterator that represents the entire query
+ // INVALID_ARGUMENT if query syntax is incorrect and cannot be tokenized
+ libtextclassifier3::StatusOr<QueryResults> ParseAdvancedQuery(
+ const SearchSpecProto& search_spec,
+ ScoringSpecProto::RankingStrategy::Code ranking_strategy,
+ int64_t current_time_ms) const;
  // Parse the query into one DocHitInfoIterator that represents the root of a
// query tree.
@@ -83,16 +100,18 @@ class QueryProcessor {
// INVALID_ARGUMENT if query syntax is incorrect and cannot be tokenized
// INTERNAL_ERROR on all other errors
libtextclassifier3::StatusOr<QueryResults> ParseRawQuery(
- const SearchSpecProto& search_spec);
+ const SearchSpecProto& search_spec,
+ ScoringSpecProto::RankingStrategy::Code ranking_strategy,
+ int64_t current_time_ms);
// Not const because we could modify/sort the hit buffer in the lite index at
// query time.
Index& index_;
+ const NumericIndex<int64_t>& numeric_index_;
const LanguageSegmenter& language_segmenter_;
const Normalizer& normalizer_;
const DocumentStore& document_store_;
const SchemaStore& schema_store_;
- const Clock& clock_;
};
} // namespace lib
diff --git a/icing/query/query-processor_benchmark.cc b/icing/query/query-processor_benchmark.cc
index 000bf3a..025e8e6 100644
--- a/icing/query/query-processor_benchmark.cc
+++ b/icing/query/query-processor_benchmark.cc
@@ -16,27 +16,32 @@
#include "gmock/gmock.h"
#include "third_party/absl/flags/flag.h"
#include "icing/document-builder.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/index.h"
+#include "icing/index/numeric/dummy-numeric-index.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/query/query-processor.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/transform/normalizer-factory.h"
+#include "icing/util/clock.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
// //icing/query:query-processor_benchmark
//
// $ blaze-bin/icing/query/query-processor_benchmark
-// --benchmarks=all
+// --benchmark_filter=all
//
// Run on an Android device:
// Make target //icing/tokenization:language-segmenter depend on
@@ -52,8 +57,8 @@
// $ adb push blaze-bin/icing/query/query-processor_benchmark
// /data/local/tmp/
//
-// $ adb shell /data/local/tmp/query-processor_benchmark --benchmarks=all
-// --adb
+// $ adb shell /data/local/tmp/query-processor_benchmark
+// --benchmark_filter=all --adb
// Flag to tell the benchmark that it'll be run on an Android device via adb;
// the benchmark will set up data files accordingly.
@@ -69,13 +74,17 @@ void AddTokenToIndex(Index* index, DocumentId document_id, SectionId section_id,
const std::string& token) {
Index::Editor editor =
index->Edit(document_id, section_id, term_match_type, /*namespace_id=*/0);
- ICING_ASSERT_OK(editor.AddHit(token.c_str()));
+ ICING_ASSERT_OK(editor.BufferTerm(token.c_str()));
+ ICING_ASSERT_OK(editor.IndexAllBufferedTerms());
}
-std::unique_ptr<Index> CreateIndex(const IcingFilesystem& filesystem,
+std::unique_ptr<Index> CreateIndex(const IcingFilesystem& icing_filesystem,
+ const Filesystem& filesystem,
const std::string& index_dir) {
- Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10);
- return Index::Create(options, &filesystem).ValueOrDie();
+ Index::Options options(index_dir, /*index_merge_size=*/1024 * 1024 * 10,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ return Index::Create(options, &filesystem, &icing_filesystem).ValueOrDie();
}
std::unique_ptr<Normalizer> CreateNormalizer() {
@@ -85,6 +94,18 @@ std::unique_ptr<Normalizer> CreateNormalizer() {
.ValueOrDie();
}
+libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore(
+ const Filesystem* filesystem, const std::string& base_dir,
+ const Clock* clock, const SchemaStore* schema_store) {
+ return DocumentStore::Create(
+ filesystem, base_dir, clock, schema_store,
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr);
+}
+
void BM_QueryOneTerm(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
@@ -96,6 +117,7 @@ void BM_QueryOneTerm(benchmark::State& state) {
Filesystem filesystem;
const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
const std::string index_dir = base_dir + "/index";
+ const std::string numeric_index_dir = base_dir + "/numeric_index";
const std::string schema_dir = base_dir + "/schema";
const std::string doc_store_dir = base_dir + "/store";
@@ -106,23 +128,35 @@ void BM_QueryOneTerm(benchmark::State& state) {
ICING_LOG(ERROR) << "Failed to create test directories";
}
- std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ std::unique_ptr<Index> index =
+ CreateIndex(icing_filesystem, filesystem, index_dir);
+ // TODO(b/249829533): switch to use persistent numeric index.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto numeric_index,
+ DummyNumericIndex<int64_t>::Create(filesystem, numeric_index_dir));
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
- FakeClock fake_clock;
SchemaProto schema;
auto type_config = schema.add_types();
type_config->set_schema_type("type1");
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem, schema_dir));
- ICING_ASSERT_OK(schema_store->SetSchema(schema));
-
- std::unique_ptr<DocumentStore> document_store =
- DocumentStore::Create(&filesystem, doc_store_dir, &fake_clock,
- schema_store.get())
+ Clock clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_dir, &clock));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ DocumentStore::CreateResult create_result =
+ CreateDocumentStore(&filesystem, doc_store_dir, &clock,
+ schema_store.get())
.ValueOrDie();
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
DocumentId document_id = document_store
->Put(DocumentBuilder()
@@ -137,17 +171,21 @@ void BM_QueryOneTerm(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index.get(), language_segmenter.get(),
- normalizer.get(), document_store.get(),
- schema_store.get(), &fake_clock));
+ QueryProcessor::Create(index.get(), numeric_index.get(),
+ language_segmenter.get(), normalizer.get(),
+ document_store.get(), schema_store.get()));
SearchSpecProto search_spec;
search_spec.set_query(input_string);
search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
for (auto _ : state) {
- QueryProcessor::QueryResults results =
- query_processor->ParseSearch(search_spec).ValueOrDie();
+ QueryResults results =
+ query_processor
+ ->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ clock.GetSystemTimeMilliseconds())
+ .ValueOrDie();
while (results.root_iterator->Advance().ok()) {
results.root_iterator->doc_hit_info();
}
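
ParseSearch now takes the ranking strategy and the current time in milliseconds in addition to the SearchSpecProto; judging by the test expectations later in this patch, the ranking strategy controls whether results.query_term_iterators is populated (RELEVANCE_SCORE) or left empty (NONE). A minimal sketch of the new call shape, assuming a configured QueryProcessor and a Clock:

ICING_ASSERT_OK_AND_ASSIGN(
    QueryResults results,
    query_processor->ParseSearch(
        search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
        clock.GetSystemTimeMilliseconds()));
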
@@ -208,6 +246,7 @@ void BM_QueryFiveTerms(benchmark::State& state) {
Filesystem filesystem;
const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
const std::string index_dir = base_dir + "/index";
+ const std::string numeric_index_dir = base_dir + "/numeric_index";
const std::string schema_dir = base_dir + "/schema";
const std::string doc_store_dir = base_dir + "/store";
@@ -218,23 +257,35 @@ void BM_QueryFiveTerms(benchmark::State& state) {
ICING_LOG(ERROR) << "Failed to create test directories";
}
- std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ std::unique_ptr<Index> index =
+ CreateIndex(icing_filesystem, filesystem, index_dir);
+ // TODO(b/249829533): switch to using the persistent numeric index.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto numeric_index,
+ DummyNumericIndex<int64_t>::Create(filesystem, numeric_index_dir));
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
- FakeClock fake_clock;
SchemaProto schema;
auto type_config = schema.add_types();
type_config->set_schema_type("type1");
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem, schema_dir));
- ICING_ASSERT_OK(schema_store->SetSchema(schema));
-
- std::unique_ptr<DocumentStore> document_store =
- DocumentStore::Create(&filesystem, doc_store_dir, &fake_clock,
- schema_store.get())
+ Clock clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_dir, &clock));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ DocumentStore::CreateResult create_result =
+ CreateDocumentStore(&filesystem, doc_store_dir, &clock,
+ schema_store.get())
.ValueOrDie();
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
DocumentId document_id = document_store
->Put(DocumentBuilder()
@@ -263,9 +314,9 @@ void BM_QueryFiveTerms(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index.get(), language_segmenter.get(),
- normalizer.get(), document_store.get(),
- schema_store.get(), &fake_clock));
+ QueryProcessor::Create(index.get(), numeric_index.get(),
+ language_segmenter.get(), normalizer.get(),
+ document_store.get(), schema_store.get()));
const std::string query_string = absl_ports::StrCat(
input_string_a, " ", input_string_b, " ", input_string_c, " ",
@@ -276,8 +327,12 @@ void BM_QueryFiveTerms(benchmark::State& state) {
search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
for (auto _ : state) {
- QueryProcessor::QueryResults results =
- query_processor->ParseSearch(search_spec).ValueOrDie();
+ QueryResults results =
+ query_processor
+ ->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ clock.GetSystemTimeMilliseconds())
+ .ValueOrDie();
while (results.root_iterator->Advance().ok()) {
results.root_iterator->doc_hit_info();
}
@@ -338,6 +393,7 @@ void BM_QueryDiacriticTerm(benchmark::State& state) {
Filesystem filesystem;
const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
const std::string index_dir = base_dir + "/index";
+ const std::string numeric_index_dir = base_dir + "/numeric_index";
const std::string schema_dir = base_dir + "/schema";
const std::string doc_store_dir = base_dir + "/store";
@@ -348,23 +404,35 @@ void BM_QueryDiacriticTerm(benchmark::State& state) {
ICING_LOG(ERROR) << "Failed to create test directories";
}
- std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ std::unique_ptr<Index> index =
+ CreateIndex(icing_filesystem, filesystem, index_dir);
+ // TODO(b/249829533): switch to using the persistent numeric index.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto numeric_index,
+ DummyNumericIndex<int64_t>::Create(filesystem, numeric_index_dir));
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
- FakeClock fake_clock;
SchemaProto schema;
auto type_config = schema.add_types();
type_config->set_schema_type("type1");
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem, schema_dir));
- ICING_ASSERT_OK(schema_store->SetSchema(schema));
-
- std::unique_ptr<DocumentStore> document_store =
- DocumentStore::Create(&filesystem, doc_store_dir, &fake_clock,
- schema_store.get())
+ Clock clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_dir, &clock));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ DocumentStore::CreateResult create_result =
+ CreateDocumentStore(&filesystem, doc_store_dir, &clock,
+ schema_store.get())
.ValueOrDie();
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
DocumentId document_id = document_store
->Put(DocumentBuilder()
@@ -382,17 +450,21 @@ void BM_QueryDiacriticTerm(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index.get(), language_segmenter.get(),
- normalizer.get(), document_store.get(),
- schema_store.get(), &fake_clock));
+ QueryProcessor::Create(index.get(), numeric_index.get(),
+ language_segmenter.get(), normalizer.get(),
+ document_store.get(), schema_store.get()));
SearchSpecProto search_spec;
search_spec.set_query(input_string);
search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
for (auto _ : state) {
- QueryProcessor::QueryResults results =
- query_processor->ParseSearch(search_spec).ValueOrDie();
+ QueryResults results =
+ query_processor
+ ->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ clock.GetSystemTimeMilliseconds())
+ .ValueOrDie();
while (results.root_iterator->Advance().ok()) {
results.root_iterator->doc_hit_info();
}
@@ -453,6 +525,7 @@ void BM_QueryHiragana(benchmark::State& state) {
Filesystem filesystem;
const std::string base_dir = GetTestTempDir() + "/query_processor_benchmark";
const std::string index_dir = base_dir + "/index";
+ const std::string numeric_index_dir = base_dir + "/numeric_index";
const std::string schema_dir = base_dir + "/schema";
const std::string doc_store_dir = base_dir + "/store";
@@ -463,23 +536,35 @@ void BM_QueryHiragana(benchmark::State& state) {
ICING_LOG(ERROR) << "Failed to create test directories";
}
- std::unique_ptr<Index> index = CreateIndex(icing_filesystem, index_dir);
+ std::unique_ptr<Index> index =
+ CreateIndex(icing_filesystem, filesystem, index_dir);
+ // TODO(b/249829533): switch to using the persistent numeric index.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto numeric_index,
+ DummyNumericIndex<int64_t>::Create(filesystem, numeric_index_dir));
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::unique_ptr<Normalizer> normalizer = CreateNormalizer();
- FakeClock fake_clock;
SchemaProto schema;
auto type_config = schema.add_types();
type_config->set_schema_type("type1");
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem, schema_dir));
- ICING_ASSERT_OK(schema_store->SetSchema(schema));
-
- std::unique_ptr<DocumentStore> document_store =
- DocumentStore::Create(&filesystem, doc_store_dir, &fake_clock,
- schema_store.get())
+ Clock clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_dir, &clock));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ DocumentStore::CreateResult create_result =
+ CreateDocumentStore(&filesystem, doc_store_dir, &clock,
+ schema_store.get())
.ValueOrDie();
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
DocumentId document_id = document_store
->Put(DocumentBuilder()
@@ -497,17 +582,21 @@ void BM_QueryHiragana(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index.get(), language_segmenter.get(),
- normalizer.get(), document_store.get(),
- schema_store.get(), &fake_clock));
+ QueryProcessor::Create(index.get(), numeric_index.get(),
+ language_segmenter.get(), normalizer.get(),
+ document_store.get(), schema_store.get()));
SearchSpecProto search_spec;
search_spec.set_query(input_string);
search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
for (auto _ : state) {
- QueryProcessor::QueryResults results =
- query_processor->ParseSearch(search_spec).ValueOrDie();
+ QueryResults results =
+ query_processor
+ ->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ clock.GetSystemTimeMilliseconds())
+ .ValueOrDie();
while (results.root_iterator->Advance().ok()) {
results.root_iterator->doc_hit_info();
}
diff --git a/icing/query/query-processor_test.cc b/icing/query/query-processor_test.cc
index dc94a72..53e3035 100644
--- a/icing/query/query-processor_test.cc
+++ b/icing/query/query-processor_test.cc
@@ -14,30 +14,38 @@
#include "icing/query/query-processor.h"
+#include <cstdint>
#include <memory>
#include <string>
+#include <vector>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/index.h"
#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/index/numeric/dummy-numeric-index.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/jni/jni-cache.h"
#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/portable/platform.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/search.pb.h"
#include "icing/proto/term.pb.h"
+#include "icing/query/query-features.h"
+#include "icing/query/query-results.h"
+#include "icing/schema-builder.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
@@ -55,62 +63,67 @@ namespace {
using ::testing::ElementsAre;
using ::testing::IsEmpty;
using ::testing::SizeIs;
-using ::testing::Test;
using ::testing::UnorderedElementsAre;
-SchemaTypeConfigProto* AddSchemaType(SchemaProto* schema,
- std::string schema_type) {
- SchemaTypeConfigProto* type_config = schema->add_types();
- type_config->set_schema_type(schema_type);
- return type_config;
-}
-
-void AddIndexedProperty(SchemaTypeConfigProto* type_config, std::string name) {
- PropertyConfigProto* property_config = type_config->add_properties();
- property_config->set_property_name(name);
- property_config->set_data_type(PropertyConfigProto::DataType::STRING);
- property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property_config->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- property_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore(
+ const Filesystem* filesystem, const std::string& base_dir,
+ const Clock* clock, const SchemaStore* schema_store) {
+ return DocumentStore::Create(
+ filesystem, base_dir, clock, schema_store,
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr);
}
-void AddUnindexedProperty(SchemaTypeConfigProto* type_config,
- std::string name) {
- PropertyConfigProto* property_config = type_config->add_properties();
- property_config->set_property_name(name);
- property_config->set_data_type(PropertyConfigProto::DataType::STRING);
-}
-
-class QueryProcessorTest : public Test {
+class QueryProcessorTest
+ : public ::testing::TestWithParam<SearchSpecProto::SearchType::Code> {
protected:
QueryProcessorTest()
: test_dir_(GetTestTempDir() + "/icing"),
store_dir_(test_dir_ + "/store"),
- index_dir_(test_dir_ + "/index") {}
+ schema_store_dir_(test_dir_ + "/schema_store"),
+ index_dir_(test_dir_ + "/index"),
+ numeric_index_dir_(test_dir_ + "/numeric_index") {}
void SetUp() override {
filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
filesystem_.CreateDirectoryRecursively(index_dir_.c_str());
filesystem_.CreateDirectoryRecursively(store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ // If we've specified using the reverse-JNI method for segmentation (i.e.
+ // not ICU), then we won't have the ICU data file included to set up.
+ // Technically, we could choose to use reverse-JNI for segmentation AND
+ // include an ICU data file, but that seems unlikely and our current BUILD
+ // setup doesn't do this.
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
-#ifndef ICING_REVERSE_JNI_SEGMENTATION
- // If we've specified using the reverse-JNI method for segmentation (i.e.
- // not ICU), then we won't have the ICU data file included to set up.
- // Technically, we could choose to use reverse-JNI for segmentation AND
- // include an ICU data file, but that seems unlikely and our current BUILD
- // setup doesn't do this.
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
-#endif // ICING_REVERSE_JNI_SEGMENTATION
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, store_dir_, &fake_clock_,
+ schema_store_.get()));
+ document_store_ = std::move(create_result.document_store);
Index::Options options(index_dir_,
- /*index_merge_size=*/1024 * 1024);
- ICING_ASSERT_OK_AND_ASSIGN(index_,
- Index::Create(options, &icing_filesystem_));
+ /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_, Index::Create(options, &filesystem_, &icing_filesystem_));
+ // TODO(b/249829533): switch to using the persistent numeric index.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ numeric_index_,
+ DummyNumericIndex<int64_t>::Create(filesystem_, numeric_index_dir_));
language_segmenter_factory::SegmenterOptions segmenter_options(
ULOC_US, jni_cache_.get());
@@ -120,6 +133,12 @@ class QueryProcessorTest : public Test {
ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
/*max_term_byte_size=*/1000));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ query_processor_,
+ QueryProcessor::Create(index_.get(), numeric_index_.get(),
+ language_segmenter_.get(), normalizer_.get(),
+ document_store_.get(), schema_store_.get()));
}
libtextclassifier3::Status AddTokenToIndex(
@@ -127,7 +146,18 @@ class QueryProcessorTest : public Test {
TermMatchType::Code term_match_type, const std::string& token) {
Index::Editor editor = index_->Edit(document_id, section_id,
term_match_type, /*namespace_id=*/0);
- return editor.AddHit(token.c_str());
+ auto status = editor.BufferTerm(token.c_str());
+ return status.ok() ? editor.IndexAllBufferedTerms() : status;
+ }
+
+ libtextclassifier3::Status AddToNumericIndex(DocumentId document_id,
+ const std::string& property,
+ SectionId section_id,
+ int64_t value) {
+ std::unique_ptr<NumericIndex<int64_t>::Editor> editor =
+ numeric_index_->Edit(property, document_id, section_id);
+ ICING_RETURN_IF_ERROR(editor->BufferKey(value));
+ return std::move(*editor).IndexAllBufferedKeys();
}
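
A usage sketch for the helper above; the property name and value are hypothetical, and the section ID is assumed to be valid for the document:

ICING_ASSERT_OK(AddToNumericIndex(document_id, /*property=*/"price",
                                  /*section_id=*/0, /*value=*/25));
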
void TearDown() override {
@@ -135,67 +165,70 @@ class QueryProcessorTest : public Test {
schema_store_.reset();
filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
}
-
Filesystem filesystem_;
const std::string test_dir_;
const std::string store_dir_;
+ const std::string schema_store_dir_;
+
+ private:
+ IcingFilesystem icing_filesystem_;
+ const std::string index_dir_;
+ const std::string numeric_index_dir_;
+
+ protected:
std::unique_ptr<Index> index_;
+ std::unique_ptr<NumericIndex<int64_t>> numeric_index_;
std::unique_ptr<LanguageSegmenter> language_segmenter_;
std::unique_ptr<Normalizer> normalizer_;
- std::unique_ptr<SchemaStore> schema_store_;
- std::unique_ptr<DocumentStore> document_store_;
FakeClock fake_clock_;
std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
-
- private:
- IcingFilesystem icing_filesystem_;
- const std::string index_dir_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> document_store_;
+ std::unique_ptr<QueryProcessor> query_processor_;
};
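
Because the fixture now derives from ::testing::TestWithParam<SearchSpecProto::SearchType::Code>, the suite needs a matching INSTANTIATE_TEST_SUITE_P call, which this excerpt does not show. A hedged sketch of what such an instantiation could look like; EXPERIMENTAL_ICING_ADVANCED_QUERY is confirmed by the tests below, while ICING_RAW_QUERY is an assumption:

INSTANTIATE_TEST_SUITE_P(
    QueryProcessorTest, QueryProcessorTest,
    testing::Values(
        SearchSpecProto::SearchType::ICING_RAW_QUERY,  // assumed enum value
        SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY));
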
-TEST_F(QueryProcessorTest, CreationWithNullPointerShouldFail) {
+TEST_P(QueryProcessorTest, CreationWithNullPointerShouldFail) {
EXPECT_THAT(
- QueryProcessor::Create(/*index=*/nullptr, language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_),
+ QueryProcessor::Create(/*index=*/nullptr, numeric_index_.get(),
+ language_segmenter_.get(), normalizer_.get(),
+ document_store_.get(), schema_store_.get()),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
EXPECT_THAT(
- QueryProcessor::Create(index_.get(), /*language_segmenter=*/nullptr,
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_),
+ QueryProcessor::Create(index_.get(), /*numeric_index=*/nullptr,
+ language_segmenter_.get(), normalizer_.get(),
+ document_store_.get(), schema_store_.get()),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
EXPECT_THAT(
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- /*normalizer=*/nullptr, document_store_.get(),
- schema_store_.get(), &fake_clock_),
+ QueryProcessor::Create(index_.get(), numeric_index_.get(),
+ /*language_segmenter=*/nullptr, normalizer_.get(),
+ document_store_.get(), schema_store_.get()),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
EXPECT_THAT(
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), /*document_store=*/nullptr,
- schema_store_.get(), &fake_clock_),
+ QueryProcessor::Create(
+ index_.get(), numeric_index_.get(), language_segmenter_.get(),
+ /*normalizer=*/nullptr, document_store_.get(), schema_store_.get()),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
- EXPECT_THAT(QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- /*schema_store=*/nullptr, &fake_clock_),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
- EXPECT_THAT(QueryProcessor::Create(index_.get(), language_segmenter_.get(),
+ EXPECT_THAT(
+ QueryProcessor::Create(index_.get(), numeric_index_.get(),
+ language_segmenter_.get(), normalizer_.get(),
+ /*document_store=*/nullptr, schema_store_.get()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(QueryProcessor::Create(index_.get(), numeric_index_.get(),
+ language_segmenter_.get(),
normalizer_.get(), document_store_.get(),
- schema_store_.get(), /*clock=*/nullptr),
+ /*schema_store=*/nullptr),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
-TEST_F(QueryProcessorTest, EmptyGroupMatchAllDocuments) {
+TEST_P(QueryProcessorTest, EmptyGroupMatchAllDocuments) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
@@ -210,39 +243,41 @@ TEST_F(QueryProcessorTest, EmptyGroupMatchAllDocuments) {
// We don't need to insert anything in the index since the empty query will
// match all DocumentIds from the DocumentStore
-
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("()");
+ search_spec.set_search_type(GetParam());
+ if (GetParam() !=
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
-
- // Descending order of valid DocumentIds
- EXPECT_THAT(GetDocumentIds(results.root_iterator.get()),
- ElementsAre(document_id2, document_id1));
- EXPECT_THAT(results.query_terms, IsEmpty());
+ // Descending order of valid DocumentIds
+ EXPECT_THAT(GetDocumentIds(results.root_iterator.get()),
+ ElementsAre(document_id2, document_id1));
+ EXPECT_THAT(results.query_terms, IsEmpty());
+ EXPECT_THAT(results.query_term_iterators, IsEmpty());
+ } else {
+ // TODO(b/208654892): Resolve the difference between RAW_QUERY and ADVANCED
+ // regarding empty composite expressions.
+ EXPECT_THAT(query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ }
}
-TEST_F(QueryProcessorTest, EmptyQueryMatchAllDocuments) {
+TEST_P(QueryProcessorTest, EmptyQueryMatchAllDocuments) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
@@ -257,39 +292,32 @@ TEST_F(QueryProcessorTest, EmptyQueryMatchAllDocuments) {
// We don't need to insert anything in the index since the empty query will
// match all DocumentIds from the DocumentStore
-
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("");
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
EXPECT_THAT(GetDocumentIds(results.root_iterator.get()),
ElementsAre(document_id2, document_id1));
EXPECT_THAT(results.query_terms, IsEmpty());
+ EXPECT_THAT(results.query_term_iterators, IsEmpty());
}
-TEST_F(QueryProcessorTest, QueryTermNormalized) {
+TEST_P(QueryProcessorTest, QueryTermNormalized) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -312,40 +340,45 @@ TEST_F(QueryProcessorTest, QueryTermNormalized) {
AddTokenToIndex(document_id, section_id, term_match_type, "world"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("hElLo WORLD");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(EqualsTermMatchInfo("hello", expected_section_ids_tf_map),
+ EqualsTermMatchInfo("world", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(2));
- // Descending order of valid DocumentIds
- EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id, section_id_mask)));
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello", "world"));
}
-TEST_F(QueryProcessorTest, OneTermPrefixMatch) {
+TEST_P(QueryProcessorTest, OneTermPrefixMatch) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -365,40 +398,101 @@ TEST_F(QueryProcessorTest, OneTermPrefixMatch) {
AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("he");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "he", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
- // Descending order of valid DocumentIds
- EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id, section_id_mask)));
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("he"));
}
-TEST_F(QueryProcessorTest, OneTermExactMatch) {
+TEST_P(QueryProcessorTest, OneTermPrefixMatchWithMaxSectionID) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ // These documents don't actually match to the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+ // Populate the index
+ SectionId section_id = kMaxSectionId;
+ SectionIdMask section_id_mask = UINT64_C(1) << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+ std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies{};
+ term_frequencies[kMaxSectionId] = 1;
+
+ EXPECT_THAT(
+ AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
+ IsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("he");
+ search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "he", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("he"));
+}
+
+TEST_P(QueryProcessorTest, OneTermExactMatch) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
// These documents don't actually match to the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -418,44 +512,105 @@ TEST_F(QueryProcessorTest, OneTermExactMatch) {
AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("hello");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "hello", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
- // Descending order of valid DocumentIds
- EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id, section_id_mask)));
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello"));
}
-TEST_F(QueryProcessorTest, AndTwoTermExactMatch) {
+TEST_P(QueryProcessorTest, AndSameTermExactMatch) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ // These documents don't actually match to the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+ EXPECT_THAT(
+ AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
+ IsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("hello hello");
+ search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "hello", expected_section_ids_tf_map)));
+
+ ASSERT_FALSE(results.root_iterator->Advance().ok());
+
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello"));
+}
+
+TEST_P(QueryProcessorTest, AndTwoTermExactMatch) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that the DocHitInfoIterators will see that the
- // document exists and not filter out the DocumentId as deleted.
+ // These documents don't actually match to the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -474,44 +629,107 @@ TEST_F(QueryProcessorTest, AndTwoTermExactMatch) {
AddTokenToIndex(document_id, section_id, term_match_type, "world"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("hello world");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(EqualsTermMatchInfo("hello", expected_section_ids_tf_map),
+ EqualsTermMatchInfo("world", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(2));
- // Descending order of valid DocumentIds
- EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id, section_id_mask)));
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello", "world"));
}
-TEST_F(QueryProcessorTest, AndTwoTermPrefixMatch) {
+TEST_P(QueryProcessorTest, AndSameTermPrefixMatch) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ // These documents don't actually match to the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+ TermMatchType::Code term_match_type = TermMatchType::PREFIX;
+
+ EXPECT_THAT(
+ AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
+ IsOk());
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+ SearchSpecProto search_spec;
+ search_spec.set_query("he he");
+ search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "he", expected_section_ids_tf_map)));
+
+ ASSERT_FALSE(results.root_iterator->Advance().ok());
+
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("he"));
+}
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that the DocHitInfoIterators will see that the
- // document exists and not filter out the DocumentId as deleted.
+TEST_P(QueryProcessorTest, AndTwoTermPrefixMatch) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ // These documents don't actually match to the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -530,44 +748,50 @@ TEST_F(QueryProcessorTest, AndTwoTermPrefixMatch) {
AddTokenToIndex(document_id, section_id, term_match_type, "world"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("he wo");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
- EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id, section_id_mask)));
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(EqualsTermMatchInfo("he", expected_section_ids_tf_map),
+ EqualsTermMatchInfo("wo", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(2));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("he", "wo"));
}
-TEST_F(QueryProcessorTest, AndTwoTermPrefixAndExactMatch) {
+TEST_P(QueryProcessorTest, AndTwoTermPrefixAndExactMatch) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that the DocHitInfoIterators will see that the
- // document exists and not filter out the DocumentId as deleted.
+ // These documents don't actually match to the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -586,44 +810,50 @@ TEST_F(QueryProcessorTest, AndTwoTermPrefixAndExactMatch) {
AddTokenToIndex(document_id, section_id, term_match_type, "world"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("hello wo");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
- EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id, section_id_mask)));
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(EqualsTermMatchInfo("hello", expected_section_ids_tf_map),
+ EqualsTermMatchInfo("wo", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(2));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello", "wo"));
}
-TEST_F(QueryProcessorTest, OrTwoTermExactMatch) {
+TEST_P(QueryProcessorTest, OrTwoTermExactMatch) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that the DocHitInfoIterators will see that the
- // document exists and not filter out the DocumentId as deleted.
+ // These documents don't actually match to the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -647,45 +877,58 @@ TEST_F(QueryProcessorTest, OrTwoTermExactMatch) {
AddTokenToIndex(document_id2, section_id, term_match_type, "world"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("hello OR world");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
- EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id2, section_id_mask),
- DocHitInfo(document_id1, section_id_mask)));
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id2);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "world", expected_section_ids_tf_map)));
+
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id1);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ matched_terms_stats.clear();
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "hello", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(2));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello", "world"));
}
-TEST_F(QueryProcessorTest, OrTwoTermPrefixMatch) {
+TEST_P(QueryProcessorTest, OrTwoTermPrefixMatch) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that the DocHitInfoIterators will see that the
- // document exists and not filter out the DocumentId as deleted.
+ // These documents don't actually match to the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -709,45 +952,58 @@ TEST_F(QueryProcessorTest, OrTwoTermPrefixMatch) {
AddTokenToIndex(document_id2, section_id, term_match_type, "world"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("he OR wo");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
- EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id2, section_id_mask),
- DocHitInfo(document_id1, section_id_mask)));
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id2);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "wo", expected_section_ids_tf_map)));
+
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id1);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ matched_terms_stats.clear();
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "he", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(2));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("he", "wo"));
}
-TEST_F(QueryProcessorTest, OrTwoTermPrefixAndExactMatch) {
+TEST_P(QueryProcessorTest, OrTwoTermPrefixAndExactMatch) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that the DocHitInfoIterators will see that the
- // document exists and not filter out the DocumentId as deleted.
+  // These documents don't actually match the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -770,45 +1026,57 @@ TEST_F(QueryProcessorTest, OrTwoTermPrefixAndExactMatch) {
AddTokenToIndex(document_id2, section_id, TermMatchType::PREFIX, "world"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("hello OR wo");
search_spec.set_term_match_type(TermMatchType::PREFIX);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
- EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id2, section_id_mask),
- DocHitInfo(document_id1, section_id_mask)));
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id2);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "wo", expected_section_ids_tf_map)));
+
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id1);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ matched_terms_stats.clear();
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
+ "hello", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(2));
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("hello", "wo"));
}
-TEST_F(QueryProcessorTest, CombinedAndOrTerms) {
+TEST_P(QueryProcessorTest, CombinedAndOrTerms) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that the DocHitInfoIterators will see that the
- // document exists and not filter out the DocumentId as deleted.
+  // These documents don't actually match the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -833,6 +1101,7 @@ TEST_F(QueryProcessorTest, CombinedAndOrTerms) {
IsOk());
EXPECT_THAT(AddTokenToIndex(document_id1, section_id, term_match_type, "dog"),
IsOk());
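+  // Merging flushes the hits added so far into the main index; Document 2's
+  // hits, added after the merge, remain in the lite index, so this test
+  // exercises both indexes.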
+ ICING_ASSERT_OK(index_->Merge());
// Document 2 has content "animal kitten cat"
EXPECT_THAT(
@@ -844,26 +1113,37 @@ TEST_F(QueryProcessorTest, CombinedAndOrTerms) {
EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
{
    // OR takes precedence over AND, so this is parsed as ((puppy OR kitten)
    // AND dog)
SearchSpecProto search_spec;
search_spec.set_query("puppy OR kitten dog");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Only Document 1 matches since it has puppy AND dog
- EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id1, section_id_mask)));
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(),
+ document_id1);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(EqualsTermMatchInfo("puppy", expected_section_ids_tf_map),
+ EqualsTermMatchInfo("dog", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(3));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""],
UnorderedElementsAre("puppy", "kitten", "dog"));
@@ -875,57 +1155,102 @@ TEST_F(QueryProcessorTest, CombinedAndOrTerms) {
SearchSpecProto search_spec;
search_spec.set_query("animal puppy OR kitten");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
- // Both Document 1 and 2 match since Document 1 has puppy AND dog, and
- // Document 2 has kitten
+ // Both Document 1 and 2 match since Document 1 has animal AND puppy, and
+ // Document 2 has animal AND kitten
// Descending order of valid DocumentIds
- EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id2, section_id_mask),
- DocHitInfo(document_id1, section_id_mask)));
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(),
+ document_id2);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(
+ EqualsTermMatchInfo("animal", expected_section_ids_tf_map),
+ EqualsTermMatchInfo("kitten", expected_section_ids_tf_map)));
+
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(),
+ document_id1);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ matched_terms_stats.clear();
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(EqualsTermMatchInfo("animal", expected_section_ids_tf_map),
+ EqualsTermMatchInfo("puppy", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(3));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""],
UnorderedElementsAre("animal", "puppy", "kitten"));
}
{
- // OR gets precedence over AND, this is parsed as (kitten AND ((foo OR bar)
- // OR cat))
+    // OR takes precedence over AND, so this is parsed as (kitten AND ((foo OR
+    // bar) OR cat))
SearchSpecProto search_spec;
search_spec.set_query("kitten foo OR bar OR cat");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Only Document 2 matches since it has both kitten and cat
- EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id2, section_id_mask)));
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(),
+ document_id2);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 1}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(EqualsTermMatchInfo("kitten", expected_section_ids_tf_map),
+ EqualsTermMatchInfo("cat", expected_section_ids_tf_map)));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(4));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""],
UnorderedElementsAre("kitten", "foo", "bar", "cat"));
}
}
-TEST_F(QueryProcessorTest, OneGroup) {
+TEST_P(QueryProcessorTest, OneGroup) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that the DocHitInfoIterators will see that the
- // document exists and not filter out the DocumentId as deleted.
+  // These documents don't actually match the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -939,7 +1264,6 @@ TEST_F(QueryProcessorTest, OneGroup) {
// Populate the index
SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Document 1 has content "puppy dog"
@@ -956,47 +1280,44 @@ TEST_F(QueryProcessorTest, OneGroup) {
EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
// Without grouping, this would be parsed as ((puppy OR kitten) AND foo) and
// no documents would match. But with grouping, Document 1 matches puppy
SearchSpecProto search_spec;
search_spec.set_query("puppy OR (kitten foo)");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
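+  // DocHitInfo(document_id) starts with an empty section mask;
+  // UpdateSection(0) sets bit 0, which is equivalent to the old
+  // (1U << section_id) mask.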
+ DocHitInfo expectedDocHitInfo(document_id1);
+ expectedDocHitInfo.UpdateSection(/*section_id=*/0);
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id1, section_id_mask)));
+ ElementsAre(expectedDocHitInfo));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(3));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""],
UnorderedElementsAre("puppy", "kitten", "foo"));
}
-TEST_F(QueryProcessorTest, TwoGroups) {
+TEST_P(QueryProcessorTest, TwoGroups) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that the DocHitInfoIterators will see that the
- // document exists and not filter out the DocumentId as deleted.
+  // These documents don't actually match the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -1010,7 +1331,6 @@ TEST_F(QueryProcessorTest, TwoGroups) {
// Populate the index
SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Document 1 has content "puppy dog"
@@ -1027,48 +1347,47 @@ TEST_F(QueryProcessorTest, TwoGroups) {
EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
// Without grouping, this would be parsed as (puppy AND (dog OR kitten) AND
// cat) and wouldn't match any documents. But with grouping, Document 1
  // matches (puppy AND dog) and Document 2 matches (kitten AND cat).
SearchSpecProto search_spec;
search_spec.set_query("(puppy dog) OR (kitten cat)");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
+ DocHitInfo expectedDocHitInfo1(document_id1);
+ expectedDocHitInfo1.UpdateSection(/*section_id=*/0);
+ DocHitInfo expectedDocHitInfo2(document_id2);
+ expectedDocHitInfo2.UpdateSection(/*section_id=*/0);
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id2, section_id_mask),
- DocHitInfo(document_id1, section_id_mask)));
+ ElementsAre(expectedDocHitInfo2, expectedDocHitInfo1));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(4));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""],
UnorderedElementsAre("puppy", "dog", "kitten", "cat"));
}
-TEST_F(QueryProcessorTest, ManyLevelNestedGrouping) {
+TEST_P(QueryProcessorTest, ManyLevelNestedGrouping) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that the DocHitInfoIterators will see that the
- // document exists and not filter out the DocumentId as deleted.
+  // These documents don't actually match the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -1082,7 +1401,6 @@ TEST_F(QueryProcessorTest, ManyLevelNestedGrouping) {
// Populate the index
SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Document 1 has content "puppy dog"
@@ -1099,47 +1417,44 @@ TEST_F(QueryProcessorTest, ManyLevelNestedGrouping) {
EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
// Without grouping, this would be parsed as ((puppy OR kitten) AND foo) and
// no documents would match. But with grouping, Document 1 matches puppy
SearchSpecProto search_spec;
search_spec.set_query("puppy OR ((((kitten foo))))");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
+ DocHitInfo expectedDocHitInfo(document_id1);
+ expectedDocHitInfo.UpdateSection(/*section_id=*/0);
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id1, section_id_mask)));
+ ElementsAre(expectedDocHitInfo));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(3));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""],
UnorderedElementsAre("puppy", "kitten", "foo"));
}
-TEST_F(QueryProcessorTest, OneLevelNestedGrouping) {
+TEST_P(QueryProcessorTest, OneLevelNestedGrouping) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that the DocHitInfoIterators will see that the
- // document exists and not filter out the DocumentId as deleted.
+  // These documents don't actually match the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -1153,7 +1468,6 @@ TEST_F(QueryProcessorTest, OneLevelNestedGrouping) {
// Populate the index
SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Document 1 has content "puppy dog"
@@ -1170,47 +1484,46 @@ TEST_F(QueryProcessorTest, OneLevelNestedGrouping) {
EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
  // Document 1 matches puppy and Document 2 matches (kitten AND (cat))
SearchSpecProto search_spec;
- search_spec.set_query("puppy OR (kitten(cat))");
+ // TODO(b/208654892) decide how we want to handle queries of the form foo(...)
+ search_spec.set_query("puppy OR (kitten (cat))");
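+  // The space before "(cat)" presumably keeps the parser from reading
+  // "kitten(cat)" as a function-style token; see the TODO above.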
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
+ DocHitInfo expectedDocHitInfo1(document_id1);
+ expectedDocHitInfo1.UpdateSection(/*section_id=*/0);
+ DocHitInfo expectedDocHitInfo2(document_id2);
+ expectedDocHitInfo2.UpdateSection(/*section_id=*/0);
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id2, section_id_mask),
- DocHitInfo(document_id1, section_id_mask)));
+ ElementsAre(expectedDocHitInfo2, expectedDocHitInfo1));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(3));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""],
UnorderedElementsAre("puppy", "kitten", "cat"));
}
-TEST_F(QueryProcessorTest, ExcludeTerm) {
+TEST_P(QueryProcessorTest, ExcludeTerm) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that they'll bump the last_added_document_id,
- // which will give us the proper exclusion results
+  // These documents don't actually match the tokens in the index. We're
+ // just inserting the documents so that they'll bump the
+ // last_added_document_id, which will give us the proper exclusion results
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -1233,45 +1546,39 @@ TEST_F(QueryProcessorTest, ExcludeTerm) {
AddTokenToIndex(document_id2, section_id, term_match_type, "world"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("-hello");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
- // We don't know have the section mask to indicate what section "world" came.
- // It doesn't matter which section it was in since the query doesn't care. It
- // just wanted documents that didn't have "hello"
+  // We don't have a section mask to indicate which section "world" came
+  // from. It doesn't matter which section it was in since the query doesn't
+  // care. It just wanted documents that didn't have "hello".
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
ElementsAre(DocHitInfo(document_id2, kSectionIdMaskNone)));
EXPECT_THAT(results.query_terms, IsEmpty());
+ EXPECT_THAT(results.query_term_iterators, IsEmpty());
}
-TEST_F(QueryProcessorTest, ExcludeNonexistentTerm) {
+TEST_P(QueryProcessorTest, ExcludeNonexistentTerm) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that they'll bump the last_added_document_id,
- // which will give us the proper exclusion results
+  // These documents don't actually match the tokens in the index. We're
+ // just inserting the documents so that they'll bump the
+ // last_added_document_id, which will give us the proper exclusion results
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -1293,44 +1600,38 @@ TEST_F(QueryProcessorTest, ExcludeNonexistentTerm) {
AddTokenToIndex(document_id2, section_id, term_match_type, "world"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("-foo");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
ElementsAre(DocHitInfo(document_id2, kSectionIdMaskNone),
DocHitInfo(document_id1, kSectionIdMaskNone)));
EXPECT_THAT(results.query_terms, IsEmpty());
+ EXPECT_THAT(results.query_term_iterators, IsEmpty());
}
-TEST_F(QueryProcessorTest, ExcludeAnd) {
+TEST_P(QueryProcessorTest, ExcludeAnd) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that they'll bump the last_added_document_id,
- // which will give us the proper exclusion results
+  // These documents don't actually match the tokens in the index. We're
+ // just inserting the documents so that they'll bump the
+ // last_added_document_id, which will give us the proper exclusion results
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -1360,25 +1661,24 @@ TEST_F(QueryProcessorTest, ExcludeAnd) {
ASSERT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
{
SearchSpecProto search_spec;
search_spec.set_query("-dog -cat");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
- // The query is interpreted as "exclude all documents that have animal, and
- // exclude all documents that have cat". Since both documents contain
+    // The query is interpreted as "exclude all documents that have dog, and
+    // exclude all documents that have cat". Since Document 1 contains dog and
+    // Document 2 contains cat, there are no results.
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
+ EXPECT_THAT(results.query_term_iterators, IsEmpty());
+
EXPECT_THAT(results.query_terms, IsEmpty());
}
@@ -1386,36 +1686,38 @@ TEST_F(QueryProcessorTest, ExcludeAnd) {
SearchSpecProto search_spec;
search_spec.set_query("-animal cat");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
- // The query is interpreted as "exclude all documents that have animal, and
- // include all documents that have cat". Since both documents contain
+ // The query is interpreted as "exclude all documents that have animal,
+ // and include all documents that have cat". Since both documents contain
// animal, there are no results.
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("cat"));
}
}
-TEST_F(QueryProcessorTest, ExcludeOr) {
+TEST_P(QueryProcessorTest, ExcludeOr) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- // These documents don't actually match to the tokens in the index. We're just
- // inserting the documents so that they'll bump the last_added_document_id,
- // which will give us the proper exclusion results
+  // These documents don't actually match the tokens in the index. We're
+ // just inserting the documents so that they'll bump the
+ // last_added_document_id, which will give us the proper exclusion results
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
@@ -1429,7 +1731,6 @@ TEST_F(QueryProcessorTest, ExcludeOr) {
// Populate the index
SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Document 1 has content "animal dog"
@@ -1446,26 +1747,25 @@ TEST_F(QueryProcessorTest, ExcludeOr) {
ASSERT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
{
SearchSpecProto search_spec;
search_spec.set_query("-animal OR -cat");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// We don't have a section mask indicating which sections in this document
// matched the query since it's not based on section-term matching. It's
// more based on the fact that the query excluded all the other documents.
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
ElementsAre(DocHitInfo(document_id1, kSectionIdMaskNone)));
+ EXPECT_THAT(results.query_term_iterators, IsEmpty());
+
EXPECT_THAT(results.query_terms, IsEmpty());
}
@@ -1473,32 +1773,136 @@ TEST_F(QueryProcessorTest, ExcludeOr) {
SearchSpecProto search_spec;
search_spec.set_query("animal OR -cat");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
+ DocHitInfo expectedDocHitInfo1(document_id1);
+ expectedDocHitInfo1.UpdateSection(/*section_id=*/0);
+ DocHitInfo expectedDocHitInfo2(document_id2);
+ expectedDocHitInfo2.UpdateSection(/*section_id=*/0);
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id2, section_id_mask),
- DocHitInfo(document_id1, section_id_mask)));
+ ElementsAre(expectedDocHitInfo2, expectedDocHitInfo1));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("animal"));
}
}
-TEST_F(QueryProcessorTest, DeletedFilter) {
+TEST_P(QueryProcessorTest, WithoutTermFrequency) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // just inserting the documents so that the DocHitInfoIterators will see
+ // that the document exists and not filter out the DocumentId as deleted.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("email")
+ .Build()));
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+ // Populate the index
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
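+  // SectionIdMask is a bitmask with one bit per section id; section 0
+  // corresponds to the lowest bit.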
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // Document 1 has content "animal puppy dog", which is added to the main
+ // index.
+ EXPECT_THAT(
+ AddTokenToIndex(document_id1, section_id, term_match_type, "animal"),
+ IsOk());
+ EXPECT_THAT(
+ AddTokenToIndex(document_id1, section_id, term_match_type, "puppy"),
+ IsOk());
+ EXPECT_THAT(AddTokenToIndex(document_id1, section_id, term_match_type, "dog"),
+ IsOk());
+ ASSERT_THAT(index_->Merge(), IsOk());
+
+ // Document 2 has content "animal kitten cat", which is added to the lite
+ // index.
+ EXPECT_THAT(
+ AddTokenToIndex(document_id2, section_id, term_match_type, "animal"),
+ IsOk());
+ EXPECT_THAT(
+ AddTokenToIndex(document_id2, section_id, term_match_type, "kitten"),
+ IsOk());
+ EXPECT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
+ IsOk());
+
+  // OR takes precedence over AND, so this is parsed as (animal AND (puppy OR
+  // kitten))
+ SearchSpecProto search_spec;
+ search_spec.set_query("animal puppy OR kitten");
+ search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ QueryResults results,
+ query_processor_->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ // Descending order of valid DocumentIds
+ // The first Document to match (Document 2) matches on 'animal' AND 'kitten'
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id2);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ // Since need_hit_term_frequency is false, the expected term frequency for
+ // the section with the hit should be 0.
+ std::unordered_map<SectionId, Hit::TermFrequency>
+ expected_section_ids_tf_map = {{section_id, 0}};
+ std::vector<TermMatchInfo> matched_terms_stats;
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(EqualsTermMatchInfo("animal", expected_section_ids_tf_map),
+ EqualsTermMatchInfo("kitten", expected_section_ids_tf_map)));
+
+ // The second Document to match (Document 1) matches on 'animal' AND 'puppy'
+ ASSERT_THAT(results.root_iterator->Advance(), IsOk());
+ EXPECT_EQ(results.root_iterator->doc_hit_info().document_id(), document_id1);
+ EXPECT_EQ(results.root_iterator->doc_hit_info().hit_section_ids_mask(),
+ section_id_mask);
+
+ matched_terms_stats.clear();
+ results.root_iterator->PopulateMatchedTermsStats(&matched_terms_stats);
+ EXPECT_THAT(
+ matched_terms_stats,
+ ElementsAre(EqualsTermMatchInfo("animal", expected_section_ids_tf_map),
+ EqualsTermMatchInfo("puppy", expected_section_ids_tf_map)));
+
+ // This should be empty because ranking_strategy != RELEVANCE_SCORE
+ EXPECT_THAT(results.query_term_iterators, IsEmpty());
+}
+
+TEST_P(QueryProcessorTest, DeletedFilter) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
  // These documents don't actually match the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1513,11 +1917,12 @@ TEST_F(QueryProcessorTest, DeletedFilter) {
.SetKey("namespace", "2")
.SetSchema("email")
.Build()));
- EXPECT_THAT(document_store_->Delete("namespace", "1"), IsOk());
+ EXPECT_THAT(document_store_->Delete("namespace", "1",
+ fake_clock_.GetSystemTimeMilliseconds()),
+ IsOk());
// Populate the index
SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Document 1 has content "animal dog"
@@ -1534,40 +1939,37 @@ TEST_F(QueryProcessorTest, DeletedFilter) {
ASSERT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("animal");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
+ DocHitInfo expectedDocHitInfo(document_id2);
+ expectedDocHitInfo.UpdateSection(/*section_id=*/0);
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id2, section_id_mask)));
+ ElementsAre(expectedDocHitInfo));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("animal"));
}
-TEST_F(QueryProcessorTest, NamespaceFilter) {
+TEST_P(QueryProcessorTest, NamespaceFilter) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
  // These documents don't actually match the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1585,7 +1987,6 @@ TEST_F(QueryProcessorTest, NamespaceFilter) {
// Populate the index
SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Document 1 has content "animal dog"
@@ -1602,42 +2003,40 @@ TEST_F(QueryProcessorTest, NamespaceFilter) {
ASSERT_THAT(AddTokenToIndex(document_id2, section_id, term_match_type, "cat"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("animal");
search_spec.set_term_match_type(term_match_type);
search_spec.add_namespace_filters("namespace1");
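  // Only documents stored under "namespace1" may match; the expectation below
  // is that Document 1 alone survives the filter.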
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
+ DocHitInfo expectedDocHitInfo(document_id1);
+ expectedDocHitInfo.UpdateSection(/*section_id=*/0);
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id1, section_id_mask)));
+ ElementsAre(expectedDocHitInfo));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("animal"));
}
-TEST_F(QueryProcessorTest, SchemaTypeFilter) {
+TEST_P(QueryProcessorTest, SchemaTypeFilter) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
- AddSchemaType(&schema, "message");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
  // These documents don't actually match the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1655,7 +2054,6 @@ TEST_F(QueryProcessorTest, SchemaTypeFilter) {
// Populate the index
SectionId section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Document 1 has content "animal dog"
@@ -1668,45 +2066,45 @@ TEST_F(QueryProcessorTest, SchemaTypeFilter) {
AddTokenToIndex(document_id2, section_id, term_match_type, "animal"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
search_spec.set_query("animal");
search_spec.set_term_match_type(term_match_type);
search_spec.add_schema_type_filters("email");
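  // Only documents of the "email" schema type may match; the expectation
  // below is Document 1 alone.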
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
+ DocHitInfo expectedDocHitInfo(document_id1);
+ expectedDocHitInfo.UpdateSection(/*section_id=*/0);
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id1, section_id_mask)));
+ ElementsAre(expectedDocHitInfo));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("animal"));
}
-TEST_F(QueryProcessorTest, SectionFilterForOneDocument) {
+TEST_P(QueryProcessorTest, PropertyFilterForOneDocument) {
// Create the schema and document store
- SchemaProto schema;
- SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email");
-
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
// First and only indexed property, so it gets a section_id of 0
- AddIndexedProperty(email_type, "subject");
int subject_section_id = 0;
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
  // These documents don't actually match the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1718,7 +2116,6 @@ TEST_F(QueryProcessorTest, SectionFilterForOneDocument) {
.Build()));
// Populate the index
- SectionIdMask section_id_mask = 1U << subject_section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Document has content "animal"
@@ -1726,50 +2123,61 @@ TEST_F(QueryProcessorTest, SectionFilterForOneDocument) {
"animal"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
// Create a section filter '<section name>:<query term>'
search_spec.set_query("subject:animal");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Descending order of valid DocumentIds
+ DocHitInfo expectedDocHitInfo(document_id);
+ expectedDocHitInfo.UpdateSection(/*section_id=*/0);
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id, section_id_mask)));
+ ElementsAre(expectedDocHitInfo));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms["subject"], UnorderedElementsAre("animal"));
}
-TEST_F(QueryProcessorTest, SectionFilterAcrossSchemaTypes) {
+TEST_P(QueryProcessorTest, PropertyFilterAcrossSchemaTypes) {
// Create the schema and document store
- SchemaProto schema;
- SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email");
- // SectionIds are assigned in ascending order per schema type, alphabetically.
- AddIndexedProperty(email_type, "a"); // Section "a" would get sectionId 0
- AddIndexedProperty(email_type, "foo");
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ // Section "a" would get sectionId 0
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // SectionIds are assigned in ascending order per schema type,
+ // alphabetically.
int email_foo_section_id = 1;
-
- SchemaTypeConfigProto* message_type = AddSchemaType(&schema, "message");
- // SectionIds are assigned in ascending order per schema type, alphabetically.
- AddIndexedProperty(message_type, "foo");
int message_foo_section_id = 0;
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
  // These documents don't actually match the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1786,8 +2194,6 @@ TEST_F(QueryProcessorTest, SectionFilterAcrossSchemaTypes) {
.Build()));
// Populate the index
- SectionIdMask email_section_id_mask = 1U << email_foo_section_id;
- SectionIdMask message_section_id_mask = 1U << message_foo_section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Email document has content "animal"
@@ -1800,52 +2206,52 @@ TEST_F(QueryProcessorTest, SectionFilterAcrossSchemaTypes) {
term_match_type, "animal"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
// Create a section filter '<section name>:<query term>'
search_spec.set_query("foo:animal");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Ordered by descending DocumentId, so message comes first since it was
// inserted last
- EXPECT_THAT(
- GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(message_document_id, message_section_id_mask),
- DocHitInfo(email_document_id, email_section_id_mask)));
+ DocHitInfo expectedDocHitInfo1(message_document_id);
+ expectedDocHitInfo1.UpdateSection(/*section_id=*/0);
+ DocHitInfo expectedDocHitInfo2(email_document_id);
+ expectedDocHitInfo2.UpdateSection(/*section_id=*/1);
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(expectedDocHitInfo1, expectedDocHitInfo2));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal"));
}
-TEST_F(QueryProcessorTest, SectionFilterWithinSchemaType) {
- // Create the schema and document store
- SchemaProto schema;
- SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email");
- // SectionIds are assigned in ascending order per schema type, alphabetically.
- AddIndexedProperty(email_type, "foo");
+TEST_P(QueryProcessorTest, PropertyFilterWithinSchemaType) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
int email_foo_section_id = 0;
-
- SchemaTypeConfigProto* message_type = AddSchemaType(&schema, "message");
- // SectionIds are assigned in ascending order per schema type, alphabetically.
- AddIndexedProperty(message_type, "foo");
int message_foo_section_id = 0;
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
// These documents don't actually match the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1862,7 +2268,6 @@ TEST_F(QueryProcessorTest, SectionFilterWithinSchemaType) {
.Build()));
// Populate the index
- SectionIdMask email_section_id_mask = 1U << email_foo_section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Email document has content "animal"
@@ -1875,53 +2280,135 @@ TEST_F(QueryProcessorTest, SectionFilterWithinSchemaType) {
term_match_type, "animal"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
- // Create a section filter '<section name>:<query term>', but only look within
- // documents of email schema
+ // Create a section filter '<section name>:<query term>', but only look
+ // within documents of email schema
search_spec.set_query("foo:animal");
search_spec.add_schema_type_filters("email");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Shouldn't include the message document since we're only looking at email
// types
- EXPECT_THAT(
- GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(email_document_id, email_section_id_mask)));
+ DocHitInfo expectedDocHitInfo(email_document_id);
+ expectedDocHitInfo.UpdateSection(/*section_id=*/0);
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(expectedDocHitInfo));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal"));
}
-TEST_F(QueryProcessorTest, SectionFilterRespectsDifferentSectionIds) {
+TEST_P(QueryProcessorTest, NestedPropertyFilter) {
// Create the schema and document store
- SchemaProto schema;
- SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email");
- // SectionIds are assigned in ascending order per schema type, alphabetically.
- AddIndexedProperty(email_type, "foo");
- int email_foo_section_id = 0;
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+                   // Add an indexed document property so we generate
+                   // section metadata on it
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeDocument(
+ "Foo", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Foo")
+                   // Add an indexed document property so we generate
+                   // section metadata on it
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("bar")
+ .SetDataTypeDocument(
+ "Bar", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Bar")
+                   // Add an indexed string property so we generate
+                   // section metadata on it
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("baz")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- SchemaTypeConfigProto* message_type = AddSchemaType(&schema, "message");
- // SectionIds are assigned in ascending order per schema type, alphabetically.
- AddIndexedProperty(message_type, "bar");
- int message_foo_section_id = 0;
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // schema types populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+
+ // Populate the index
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+ // Email document has content "animal"
+ ASSERT_THAT(AddTokenToIndex(email_document_id, /*section_id=*/0,
+ term_match_type, "animal"),
+ IsOk());
+
+ SearchSpecProto search_spec;
+  // Create a nested section filter '<property path>:<query term>'
+ search_spec.set_query("foo.bar.baz:animal");
+ search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+  // The email document should match since the token was indexed under the
+  // nested property path "foo.bar.baz"
+ DocHitInfo expectedDocHitInfo1(email_document_id);
+ expectedDocHitInfo1.UpdateSection(/*section_id=*/0);
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(expectedDocHitInfo1));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms["foo.bar.baz"],
+ UnorderedElementsAre("animal"));
+}
+
+TEST_P(QueryProcessorTest, PropertyFilterRespectsDifferentSectionIds) {
+ // Create the schema and document store
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("bar")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ int email_foo_section_id = 0;
+ int message_foo_section_id = 0;
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
// These documents don't actually match the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -1938,7 +2425,6 @@ TEST_F(QueryProcessorTest, SectionFilterRespectsDifferentSectionIds) {
.Build()));
// Populate the index
- SectionIdMask email_section_id_mask = 1U << email_foo_section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Email document has content "animal"
@@ -1946,51 +2432,47 @@ TEST_F(QueryProcessorTest, SectionFilterRespectsDifferentSectionIds) {
term_match_type, "animal"),
IsOk());
- // Message document has content "animal", but put in in the same section id as
- // the indexed email section id, the same id as indexed property "foo" in the
- // message type
+  // Message document has content "animal", but put it in the same section id
+  // as the indexed email section id, the same id as indexed property "foo" in
+  // the message type
ASSERT_THAT(AddTokenToIndex(message_document_id, message_foo_section_id,
term_match_type, "animal"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
- // Create a section filter '<section name>:<query term>', but only look within
- // documents of email schema
+  // Create a section filter '<section name>:<query term>'
search_spec.set_query("foo:animal");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Even though the section id is the same, we should be able to tell that it
// doesn't match the name of the section filter
- EXPECT_THAT(
- GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(email_document_id, email_section_id_mask)));
+ DocHitInfo expectedDocHitInfo(email_document_id);
+ expectedDocHitInfo.UpdateSection(/*section_id=*/0);
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(expectedDocHitInfo));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal"));
}
-TEST_F(QueryProcessorTest, NonexistentSectionFilterReturnsEmptyResults) {
+TEST_P(QueryProcessorTest, NonexistentPropertyFilterReturnsEmptyResults) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
// These documents don't actually match the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -2009,44 +2491,46 @@ TEST_F(QueryProcessorTest, NonexistentSectionFilterReturnsEmptyResults) {
term_match_type, "animal"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
- // Create a section filter '<section name>:<query term>', but only look within
- // documents of email schema
+  // Create a section filter '<section name>:<query term>' for a section that
+  // doesn't exist in the schema
search_spec.set_query("nonexistent:animal");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Even though the section id is the same, we should be able to tell that it
// doesn't match the name of the section filter
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms["nonexistent"],
UnorderedElementsAre("animal"));
}
-TEST_F(QueryProcessorTest, UnindexedSectionFilterReturnsEmptyResults) {
+TEST_P(QueryProcessorTest, UnindexedPropertyFilterReturnsEmptyResults) {
// Create the schema and document store
- SchemaProto schema;
- SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email");
- AddUnindexedProperty(email_type, "foo");
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ // Add an unindexed property so we generate section
+ // metadata on it
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
// These documents don't actually match the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -2065,50 +2549,49 @@ TEST_F(QueryProcessorTest, UnindexedSectionFilterReturnsEmptyResults) {
term_match_type, "animal"),
IsOk());
- // Perform query
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
- // Create a section filter '<section name>:<query term>', but only look within
- // documents of email schema
+  // Create a section filter '<section name>:<query term>' on a property that
+  // isn't indexed
search_spec.set_query("foo:animal");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Even though the section id is the same, we should be able to tell that it
// doesn't match the name of the section filter
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
EXPECT_THAT(results.query_terms, SizeIs(1));
EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal"));
}
-TEST_F(QueryProcessorTest, SectionFilterTermAndUnrestrictedTerm) {
+TEST_P(QueryProcessorTest, PropertyFilterTermAndUnrestrictedTerm) {
// Create the schema and document store
- SchemaProto schema;
- SchemaTypeConfigProto* email_type = AddSchemaType(&schema, "email");
- // SectionIds are assigned in ascending order per schema type, alphabetically.
- AddIndexedProperty(email_type, "foo");
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("message").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
int email_foo_section_id = 0;
-
- SchemaTypeConfigProto* message_type = AddSchemaType(&schema, "message");
- // SectionIds are assigned in ascending order per schema type, alphabetically.
- AddIndexedProperty(message_type, "foo");
int message_foo_section_id = 0;
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
-
- ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
// These documents don't actually match the tokens in the index. We're
// inserting the documents to get the appropriate number of documents and
@@ -2125,8 +2608,6 @@ TEST_F(QueryProcessorTest, SectionFilterTermAndUnrestrictedTerm) {
.Build()));
// Populate the index
- SectionIdMask email_section_id_mask = 1U << email_foo_section_id;
- SectionIdMask message_section_id_mask = 1U << message_foo_section_id;
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
// Email document has content "animal"
@@ -2142,106 +2623,379 @@ TEST_F(QueryProcessorTest, SectionFilterTermAndUnrestrictedTerm) {
term_match_type, "animal"),
IsOk());
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
-
SearchSpecProto search_spec;
// Create a section filter '<section name>:<query term>'
search_spec.set_query("cat OR foo:animal");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
// Ordered by descending DocumentId, so message comes first since it was
// inserted last
- EXPECT_THAT(
- GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(message_document_id, message_section_id_mask),
- DocHitInfo(email_document_id, email_section_id_mask)));
+ DocHitInfo expectedDocHitInfo1(message_document_id);
+ expectedDocHitInfo1.UpdateSection(/*section_id=*/0);
+ DocHitInfo expectedDocHitInfo2(email_document_id);
+ expectedDocHitInfo2.UpdateSection(/*section_id=*/0);
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(expectedDocHitInfo1, expectedDocHitInfo2));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(2));
+
EXPECT_THAT(results.query_terms, SizeIs(2));
EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("cat"));
EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal"));
}
-TEST_F(QueryProcessorTest, DocumentBeforeTtlNotFilteredOut) {
+TEST_P(QueryProcessorTest, TypePropertyFilter) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("bar")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("baz")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("bar")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("baz")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ // SectionIds are assigned in ascending order per schema type,
+ // alphabetically.
+ int email_bar_section_id = 0;
+ int email_baz_section_id = 1;
+ int email_foo_section_id = 2;
+ int message_bar_section_id = 0;
+ int message_baz_section_id = 1;
+ int message_foo_section_id = 2;
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // schema types populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("message")
+ .Build()));
+
+  // Populate the index
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // Email document has content "animal" in all sections
+ ASSERT_THAT(AddTokenToIndex(email_document_id, email_foo_section_id,
+ term_match_type, "animal"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(email_document_id, email_bar_section_id,
+ term_match_type, "animal"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(email_document_id, email_baz_section_id,
+ term_match_type, "animal"),
+ IsOk());
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+ // Message document has content "animal" in all sections
+ ASSERT_THAT(AddTokenToIndex(message_document_id, message_foo_section_id,
+ term_match_type, "animal"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(message_document_id, message_bar_section_id,
+ term_match_type, "animal"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(message_document_id, message_baz_section_id,
+ term_match_type, "animal"),
+ IsOk());
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("animal");
+ search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
+
+ // email has property filters for foo and baz properties
+ TypePropertyMask *email_mask = search_spec.add_type_property_filters();
+ email_mask->set_schema_type("email");
+ email_mask->add_paths("foo");
+ email_mask->add_paths("baz");
+
+ // message has property filters for bar and baz properties
+ TypePropertyMask *message_mask = search_spec.add_type_property_filters();
+ message_mask->set_schema_type("message");
+ message_mask->add_paths("bar");
+ message_mask->add_paths("baz");
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ // Ordered by descending DocumentId, so message comes first since it was
+ // inserted last
+ DocHitInfo expected_doc_hit_info1(message_document_id);
+ expected_doc_hit_info1.UpdateSection(message_bar_section_id);
+ expected_doc_hit_info1.UpdateSection(message_baz_section_id);
+ DocHitInfo expected_doc_hit_info2(email_document_id);
+ expected_doc_hit_info2.UpdateSection(email_foo_section_id);
+ expected_doc_hit_info2.UpdateSection(email_baz_section_id);
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(expected_doc_hit_info1, expected_doc_hit_info2));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms[""], UnorderedElementsAre("animal"));
+}
+
+TEST_P(QueryProcessorTest, TypePropertyFilterWithSectionRestrict) {
+ // Create the schema and document store
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("bar")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("baz")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("bar")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("baz")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ // SectionIds are assigned in ascending order per schema type,
+ // alphabetically.
+ int email_bar_section_id = 0;
+ int email_baz_section_id = 1;
+ int email_foo_section_id = 2;
+ int message_bar_section_id = 0;
+ int message_baz_section_id = 1;
+ int message_foo_section_id = 2;
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // schema types populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
document_store_->Put(DocumentBuilder()
.SetKey("namespace", "1")
.SetSchema("email")
- .SetCreationTimestampMs(0)
- .SetTtlMs(100)
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("message")
.Build()));
- // Populate the index
- int section_id = 0;
- SectionIdMask section_id_mask = 1U << section_id;
+  // Populate the index
TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
- EXPECT_THAT(
- AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
- IsOk());
+ // Email document has content "animal" in all sections
+ ASSERT_THAT(AddTokenToIndex(email_document_id, email_foo_section_id,
+ term_match_type, "animal"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(email_document_id, email_bar_section_id,
+ term_match_type, "animal"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(email_document_id, email_baz_section_id,
+ term_match_type, "animal"),
+ IsOk());
+
+ // Message document has content "animal" in all sections
+ ASSERT_THAT(AddTokenToIndex(message_document_id, message_foo_section_id,
+ term_match_type, "animal"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(message_document_id, message_bar_section_id,
+ term_match_type, "animal"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(message_document_id, message_baz_section_id,
+ term_match_type, "animal"),
+ IsOk());
+
+ SearchSpecProto search_spec;
+ // Create a section filter '<section name>:<query term>'
+ search_spec.set_query("foo:animal");
+ search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
+
+ // email has property filters for foo and baz properties
+ TypePropertyMask *email_mask = search_spec.add_type_property_filters();
+ email_mask->set_schema_type("email");
+ email_mask->add_paths("foo");
+ email_mask->add_paths("baz");
+
+ // message has property filters for bar and baz properties
+ TypePropertyMask *message_mask = search_spec.add_type_property_filters();
+ message_mask->set_schema_type("message");
+ message_mask->add_paths("bar");
+ message_mask->add_paths("baz");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ // Only hits in sections allowed by both the property filters and section
+ // restricts should be returned. Message document should not be returned since
+ // section foo specified in the section restrict is not allowed by the
+ // property filters.
+ DocHitInfo expected_doc_hit_info(email_document_id);
+ expected_doc_hit_info.UpdateSection(email_foo_section_id);
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(expected_doc_hit_info));
+ EXPECT_THAT(results.query_term_iterators, SizeIs(1));
+
+ EXPECT_THAT(results.query_terms, SizeIs(1));
+ EXPECT_THAT(results.query_terms["foo"], UnorderedElementsAre("animal"));
+}
+
+TEST_P(QueryProcessorTest, DocumentBeforeTtlNotFilteredOut) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
// Arbitrary value, just has to be less than the document's creation
// timestamp + ttl
FakeClock fake_clock;
fake_clock.SetSystemTimeMilliseconds(50);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, store_dir_, &fake_clock,
+ schema_store_.get()));
+ document_store_ = std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(10)
+ .SetTtlMs(100)
+ .Build()));
+
+ // Populate the index
+ int section_id = 0;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ EXPECT_THAT(
+ AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
+ IsOk());
+
// Perform query
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock_));
+ std::unique_ptr<QueryProcessor> local_query_processor,
+ QueryProcessor::Create(index_.get(), numeric_index_.get(),
+ language_segmenter_.get(), normalizer_.get(),
+ document_store_.get(), schema_store_.get()));
SearchSpecProto search_spec;
search_spec.set_query("hello");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ local_query_processor->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ DocHitInfo expectedDocHitInfo(document_id);
+ expectedDocHitInfo.UpdateSection(/*section_id=*/0);
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
- ElementsAre(DocHitInfo(document_id, section_id_mask)));
+ ElementsAre(expectedDocHitInfo));
}
-TEST_F(QueryProcessorTest, DocumentPastTtlFilteredOut) {
+TEST_P(QueryProcessorTest, DocumentPastTtlFilteredOut) {
// Create the schema and document store
- SchemaProto schema;
- AddSchemaType(&schema, "email");
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+ // Arbitrary value, just has to be greater than the document's creation
+ // timestamp + ttl
+ FakeClock fake_clock_local;
+ fake_clock_local.SetSystemTimeMilliseconds(200);
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, store_dir_, &fake_clock_local,
+ schema_store_.get()));
+ document_store_ = std::move(create_result.document_store);
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
- document_store_->Put(DocumentBuilder()
- .SetKey("namespace", "1")
- .SetSchema("email")
- .SetCreationTimestampMs(0)
- .SetTtlMs(100)
- .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(50)
+ .SetTtlMs(100)
+ .Build()));
// Populate the index
int section_id = 0;
@@ -2251,28 +3005,327 @@ TEST_F(QueryProcessorTest, DocumentPastTtlFilteredOut) {
AddTokenToIndex(document_id, section_id, term_match_type, "hello"),
IsOk());
- // Arbitrary value, just has to be greater than the document's creation
- // timestamp + ttl
- FakeClock fake_clock;
- fake_clock.SetSystemTimeMilliseconds(200);
-
// Perform query
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<QueryProcessor> query_processor,
- QueryProcessor::Create(index_.get(), language_segmenter_.get(),
- normalizer_.get(), document_store_.get(),
- schema_store_.get(), &fake_clock));
+ std::unique_ptr<QueryProcessor> local_query_processor,
+ QueryProcessor::Create(index_.get(), numeric_index_.get(),
+ language_segmenter_.get(), normalizer_.get(),
+ document_store_.get(), schema_store_.get()));
SearchSpecProto search_spec;
search_spec.set_query("hello");
search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ local_query_processor->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_local.GetSystemTimeMilliseconds()));
+
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
+}
+
+TEST_P(QueryProcessorTest, NumericFilter) {
+ if (GetParam() !=
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ GTEST_SKIP() << "Numeric filter is only supported in advanced query.";
+ }
+
+ // Create the schema and document store
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("transaction")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("price")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("cost")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ // SectionIds are assigned alphabetically
+ SectionId cost_section_id = 0;
+ SectionId price_section_id = 1;
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_one_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("transaction")
+ .AddInt64Property("price", 10)
+ .Build()));
+ ICING_ASSERT_OK(
+ AddToNumericIndex(document_one_id, "price", price_section_id, 10));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_two_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("transaction")
+ .AddInt64Property("price", 25)
+ .Build()));
+ ICING_ASSERT_OK(
+ AddToNumericIndex(document_two_id, "price", price_section_id, 25));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_three_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "3")
+ .SetSchema("transaction")
+ .AddInt64Property("cost", 2)
+ .Build()));
+ ICING_ASSERT_OK(
+ AddToNumericIndex(document_three_id, "cost", cost_section_id, 2));
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("price < 20");
+ search_spec.set_search_type(GetParam());
+ search_spec.add_enabled_features(std::string(kNumericSearchFeature));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(EqualsDocHitInfo(
+ document_one_id, std::vector<SectionId>{price_section_id})));
+
+ search_spec.set_query("price == 25");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ results, query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(EqualsDocHitInfo(
+ document_two_id, std::vector<SectionId>{price_section_id})));
+
+ search_spec.set_query("cost > 2");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ results, query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
+
+ search_spec.set_query("cost >= 2");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ results, query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(EqualsDocHitInfo(
+ document_three_id, std::vector<SectionId>{cost_section_id})));
+
+ search_spec.set_query("price <= 25");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ results, query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(
+ GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(EqualsDocHitInfo(document_two_id,
+ std::vector<SectionId>{price_section_id}),
+ EqualsDocHitInfo(document_one_id,
+ std::vector<SectionId>{price_section_id})));
+}
+
+TEST_P(QueryProcessorTest, NumericFilterWithoutEnablingFeatureFails) {
+ if (GetParam() !=
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ GTEST_SKIP() << "Numeric filter is only supported in advanced query.";
+ }
+
+ // Create the schema and document store
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("transaction")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("price")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ SectionId price_section_id = 0;
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_one_id,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("transaction")
+ .AddInt64Property("price", 10)
+ .Build()));
+ ICING_ASSERT_OK(
+ AddToNumericIndex(document_one_id, "price", price_section_id, 10));
+
+ SearchSpecProto search_spec;
+ search_spec.set_query("price < 20");
+ search_spec.set_search_type(GetParam());
+
+ libtextclassifier3::StatusOr<QueryResults> result_or =
+ query_processor_->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(result_or,
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(QueryProcessorTest, GroupingInSectionRestriction) {
+ if (GetParam() !=
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY) {
+ GTEST_SKIP() << "Grouping in section restriction is only supported in "
+ "advanced query.";
+ }
+
+ // Create the schema and document store
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ SectionId prop1_section_id = 0;
+ SectionId prop2_section_id = 1;
+ TermMatchType::Code term_match_type = TermMatchType::EXACT_ONLY;
+
+ // Create documents as follows:
+ // Doc0:
+ // prop1: "foo"
+ // prop2: "bar"
+ // Doc1:
+ // prop1: "bar"
+ // prop2: "foo"
+ // Doc2:
+ // prop1: "foo bar"
+ // prop2: ""
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id0,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "0")
+ .SetSchema("email")
+ .Build()));
+ EXPECT_THAT(
+ AddTokenToIndex(document_id0, prop1_section_id, term_match_type, "foo"),
+ IsOk());
+ EXPECT_THAT(
+ AddTokenToIndex(document_id0, prop2_section_id, term_match_type, "bar"),
+ IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .Build()));
+ EXPECT_THAT(
+ AddTokenToIndex(document_id1, prop1_section_id, term_match_type, "bar"),
+ IsOk());
+ EXPECT_THAT(
+ AddTokenToIndex(document_id1, prop2_section_id, term_match_type, "foo"),
+ IsOk());
- ICING_ASSERT_OK_AND_ASSIGN(QueryProcessor::QueryResults results,
- query_processor->ParseSearch(search_spec));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("email")
+ .Build()));
+ EXPECT_THAT(
+ AddTokenToIndex(document_id2, prop1_section_id, term_match_type, "foo"),
+ IsOk());
+ EXPECT_THAT(
+ AddTokenToIndex(document_id2, prop1_section_id, term_match_type, "bar"),
+ IsOk());
+
+ // prop1:(foo bar) <=> prop1:foo AND prop1:bar, which matches doc2.
+ SearchSpecProto search_spec;
+ search_spec.set_query("prop1:(foo bar)");
+ search_spec.set_term_match_type(term_match_type);
+ search_spec.set_search_type(GetParam());
+ search_spec.add_enabled_features(
+ std::string(kListFilterQueryLanguageFeature));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ QueryResults results,
+ query_processor_->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(EqualsDocHitInfo(
+ document_id2, std::vector<SectionId>{prop1_section_id})));
+ // prop2:(foo bar) <=> prop2:foo AND prop2:bar, which matches nothing.
+ search_spec.set_query("prop2:(foo bar)");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ results, query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()), IsEmpty());
+
+ // prop1:(foo -bar) <=> prop1:foo AND -prop1:bar, which matches doc0.
+ search_spec.set_query("prop1:(foo -bar)");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ results, query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(EqualsDocHitInfo(
+ document_id0, std::vector<SectionId>{prop1_section_id})));
+
+ // prop2:(-foo OR bar) <=> -prop2:foo OR prop2:bar, which matches doc0 and
+ // doc2.
+ search_spec.set_query("prop2:(-foo OR bar)");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ results, query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(
+ GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(EqualsDocHitInfo(document_id2, std::vector<SectionId>{}),
+ EqualsDocHitInfo(document_id0,
+ std::vector<SectionId>{prop2_section_id})));
+
+ // prop1:((foo AND bar) OR (foo AND -baz))
+ // <=> ((prop1:foo AND prop1:bar) OR (prop1:foo AND -prop1:baz)), which
+ // matches doc0 and doc2.
+ search_spec.set_query("prop1:((foo AND bar) OR (foo AND -baz))");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ results, query_processor_->ParseSearch(
+ search_spec, ScoringSpecProto::RankingStrategy::NONE,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(
+ GetDocHitInfos(results.root_iterator.get()),
+ ElementsAre(EqualsDocHitInfo(document_id2,
+ std::vector<SectionId>{prop1_section_id}),
+ EqualsDocHitInfo(document_id0,
+ std::vector<SectionId>{prop1_section_id})));
}
+INSTANTIATE_TEST_SUITE_P(
+ QueryProcessorTest, QueryProcessorTest,
+ testing::Values(
+ SearchSpecProto::SearchType::ICING_RAW_QUERY,
+ SearchSpecProto::SearchType::EXPERIMENTAL_ICING_ADVANCED_QUERY));
+
} // namespace
} // namespace lib
diff --git a/icing/query/query-results.h b/icing/query/query-results.h
new file mode 100644
index 0000000..52cdd71
--- /dev/null
+++ b/icing/query/query-results.h
@@ -0,0 +1,46 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_QUERY_RESULTS_H_
+#define ICING_QUERY_QUERY_RESULTS_H_
+
+#include <memory>
+#include <unordered_set>
+
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/query/query-terms.h"
+#include "icing/query/query-features.h"
+
+namespace icing {
+namespace lib {
+
+struct QueryResults {
+ std::unique_ptr<DocHitInfoIterator> root_iterator;
+ // A map from section names to sets of terms restricted to those sections.
+ // Query terms that are not restricted are found at the entry with key "".
+ SectionRestrictQueryTermsMap query_terms;
+ // Hit iterators for the text terms in the query. These query_term_iterators
+  // are completely separate from the iterators that make up the iterator tree
+ // beginning with root_iterator.
+ // This will only be populated when ranking_strategy == RELEVANCE_SCORE.
+ QueryTermIteratorsMap query_term_iterators;
+ // Features that are invoked during query execution.
+  // The list of possible features is defined in query-features.h.
+ std::unordered_set<Feature> features_in_use;
+};
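+
+// Illustrative example (based on the tests in this change): parsing the raw
+// query "cat OR foo:animal" would produce query_terms == {"" -> {"cat"},
+// "foo" -> {"animal"}}, with query_term_iterators holding one iterator per
+// term when ranking by RELEVANCE_SCORE.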
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_QUERY_RESULTS_H_
diff --git a/icing/query/query-terms.h b/icing/query/query-terms.h
index 1c5ce02..c4efe78 100644
--- a/icing/query/query-terms.h
+++ b/icing/query/query-terms.h
@@ -15,18 +15,24 @@
#ifndef ICING_QUERY_QUERY_TERMS_H_
#define ICING_QUERY_QUERY_TERMS_H_
+#include <memory>
#include <string>
-#include <string_view>
#include <unordered_map>
#include <unordered_set>
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+
namespace icing {
namespace lib {
// A map from section names to sets of terms restricted to those sections.
// Query terms that are not restricted are found at the entry with key "".
using SectionRestrictQueryTermsMap =
- std::unordered_map<std::string_view, std::unordered_set<std::string>>;
+ std::unordered_map<std::string, std::unordered_set<std::string>>;
+
+// A map from query terms to a DocHitInfoIterator for that term.
+using QueryTermIteratorsMap =
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>;
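+
+// e.g. after parsing "foo:animal" with RELEVANCE_SCORE ranking, this map
+// would hold a single entry mapping the term "animal" to an iterator over
+// that term's hits.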
} // namespace lib
} // namespace icing
diff --git a/icing/query/query-utils.cc b/icing/query/query-utils.cc
new file mode 100644
index 0000000..37c3600
--- /dev/null
+++ b/icing/query/query-utils.cc
@@ -0,0 +1,42 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/query-utils.h"
+
+#include <string_view>
+#include <vector>
+
+namespace icing {
+namespace lib {
+
+DocHitInfoIteratorFilter::Options GetFilterOptions(
+ const SearchSpecProto& search_spec) {
+ DocHitInfoIteratorFilter::Options options;
+
+ if (search_spec.namespace_filters_size() > 0) {
+ options.namespaces =
+ std::vector<std::string_view>(search_spec.namespace_filters().begin(),
+ search_spec.namespace_filters().end());
+ }
+
+ if (search_spec.schema_type_filters_size() > 0) {
+ options.schema_types =
+ std::vector<std::string_view>(search_spec.schema_type_filters().begin(),
+ search_spec.schema_type_filters().end());
+ }
+ return options;
+}
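+
+// Illustrative usage (hypothetical caller, for documentation only):
+//
+//   SearchSpecProto search_spec;
+//   search_spec.add_schema_type_filters("email");
+//   DocHitInfoIteratorFilter::Options options = GetFilterOptions(search_spec);
+//   // options.schema_types now contains {"email"}.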
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/query-utils.h b/icing/query/query-utils.h
new file mode 100644
index 0000000..d85cf3a
--- /dev/null
+++ b/icing/query/query-utils.h
@@ -0,0 +1,30 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_QUERY_UTILS_H_
+#define ICING_QUERY_QUERY_UTILS_H_
+
+#include "icing/index/iterator/doc-hit-info-iterator-filter.h"
+#include "icing/proto/search.pb.h"
+
+namespace icing {
+namespace lib {
+
+DocHitInfoIteratorFilter::Options GetFilterOptions(
+ const SearchSpecProto& search_spec);
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_QUERY_UTILS_H_
diff --git a/icing/query/suggestion-processor.cc b/icing/query/suggestion-processor.cc
new file mode 100644
index 0000000..eb86e3b
--- /dev/null
+++ b/icing/query/suggestion-processor.cc
@@ -0,0 +1,311 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/suggestion-processor.h"
+
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/query/query-processor.h"
+#include "icing/store/document-id.h"
+#include "icing/store/suggestion-result-checker-impl.h"
+#include "icing/tokenization/tokenizer-factory.h"
+#include "icing/tokenization/tokenizer.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<std::unique_ptr<SuggestionProcessor>>
+SuggestionProcessor::Create(Index* index,
+ const NumericIndex<int64_t>* numeric_index,
+ const LanguageSegmenter* language_segmenter,
+ const Normalizer* normalizer,
+ const DocumentStore* document_store,
+ const SchemaStore* schema_store) {
+ ICING_RETURN_ERROR_IF_NULL(index);
+ ICING_RETURN_ERROR_IF_NULL(numeric_index);
+ ICING_RETURN_ERROR_IF_NULL(language_segmenter);
+ ICING_RETURN_ERROR_IF_NULL(normalizer);
+ ICING_RETURN_ERROR_IF_NULL(document_store);
+ ICING_RETURN_ERROR_IF_NULL(schema_store);
+
+ return std::unique_ptr<SuggestionProcessor>(
+ new SuggestionProcessor(index, numeric_index, language_segmenter,
+ normalizer, document_store, schema_store));
+}
+
+libtextclassifier3::StatusOr<
+ std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>>
+PopulateDocumentIdFilters(
+ const DocumentStore* document_store,
+ const icing::lib::SuggestionSpecProto& suggestion_spec,
+ const std::unordered_set<NamespaceId>& namespace_ids) {
+ std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>
+ document_id_filter_map;
+ document_id_filter_map.reserve(suggestion_spec.document_uri_filters_size());
+ for (const NamespaceDocumentUriGroup& namespace_document_uri_group :
+ suggestion_spec.document_uri_filters()) {
+ auto namespace_id_or = document_store->GetNamespaceId(
+ namespace_document_uri_group.namespace_());
+ if (!namespace_id_or.ok()) {
+ // The current namespace doesn't exist.
+ continue;
+ }
+ NamespaceId namespace_id = namespace_id_or.ValueOrDie();
+ if (!namespace_ids.empty() &&
+ namespace_ids.find(namespace_id) == namespace_ids.end()) {
+ // The current namespace doesn't appear in the namespace filter.
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "The namespace : ", namespace_document_uri_group.namespace_(),
+ " appears in the document uri filter, but doesn't appear in the "
+ "namespace filter."));
+ }
+
+ if (namespace_document_uri_group.document_uris().empty()) {
+      // Clients should use the namespace filter to filter out all documents
+      // under a namespace.
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "The namespace : ", namespace_document_uri_group.namespace_(),
+ " has empty document uri in the document uri filter. Please use the "
+ "namespace filter to exclude a namespace instead of the document uri "
+ "filter."));
+ }
+
+    // Translate namespace document URIs into document ids
+ std::unordered_set<DocumentId> target_document_ids;
+ target_document_ids.reserve(
+ namespace_document_uri_group.document_uris_size());
+ for (std::string_view document_uri :
+ namespace_document_uri_group.document_uris()) {
+ auto document_id_or = document_store->GetDocumentId(
+ namespace_document_uri_group.namespace_(), document_uri);
+ if (!document_id_or.ok()) {
+ continue;
+ }
+ target_document_ids.insert(document_id_or.ValueOrDie());
+ }
+ document_id_filter_map.insert({namespace_id, target_document_ids});
+ }
+ return document_id_filter_map;
+}
+
+libtextclassifier3::StatusOr<std::unordered_map<SchemaTypeId, SectionIdMask>>
+PopulatePropertyFilters(
+ const SchemaStore* schema_store,
+ const icing::lib::SuggestionSpecProto& suggestion_spec,
+ const std::unordered_set<SchemaTypeId>& schema_type_ids) {
+ std::unordered_map<SchemaTypeId, SectionIdMask> property_filter_map;
+ property_filter_map.reserve(suggestion_spec.type_property_filters_size());
+ for (const TypePropertyMask& type_field_mask :
+ suggestion_spec.type_property_filters()) {
+ auto schema_type_id_or =
+ schema_store->GetSchemaTypeId(type_field_mask.schema_type());
+ if (!schema_type_id_or.ok()) {
+ // The current schema doesn't exist
+ continue;
+ }
+ SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
+
+ if (!schema_type_ids.empty() &&
+ schema_type_ids.find(schema_type_id) == schema_type_ids.end()) {
+ // The current schema type doesn't appear in the schema type filter.
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "The schema : ", type_field_mask.schema_type(),
+ " appears in the property filter, but doesn't appear in the schema"
+ " type filter."));
+ }
+
+ if (type_field_mask.paths().empty()) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "The schema type : ", type_field_mask.schema_type(),
+ " has empty path in the property filter. Please use the schema type"
+ " filter to exclude a schema type instead of the property filter."));
+ }
+
+    // Translate property paths into a section id mask.
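+    // For example (illustrative): if the requested path "subject" corresponds
+    // to section id 2 in this schema, the mask gets bit 2 set, i.e.
+    // UINT64_C(1) << 2 == 0b100.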
+ SectionIdMask section_mask = kSectionIdMaskNone;
+ auto section_metadata_list_or =
+ schema_store->GetSectionMetadata(type_field_mask.schema_type());
+ if (!section_metadata_list_or.ok()) {
+      // The current schema type doesn't have section metadata.
+ continue;
+ }
+ std::unordered_set<std::string> target_property_paths;
+ target_property_paths.reserve(type_field_mask.paths_size());
+ for (const std::string& target_property_path : type_field_mask.paths()) {
+ target_property_paths.insert(target_property_path);
+ }
+ const std::vector<SectionMetadata>* section_metadata_list =
+ section_metadata_list_or.ValueOrDie();
+ for (const SectionMetadata& section_metadata : *section_metadata_list) {
+ if (target_property_paths.find(section_metadata.path) !=
+ target_property_paths.end()) {
+ section_mask |= UINT64_C(1) << section_metadata.id;
+ }
+ }
+ property_filter_map.insert({schema_type_id, section_mask});
+ }
+ return property_filter_map;
+}
+
+libtextclassifier3::StatusOr<std::vector<TermMetadata>>
+SuggestionProcessor::QuerySuggestions(
+ const icing::lib::SuggestionSpecProto& suggestion_spec,
+ int64_t current_time_ms) {
+  // We use the query tokenizer to tokenize the given prefix, and we only use
+  // the last token as the suggestion prefix.
+
+ // Populate target namespace filter.
+ std::unordered_set<NamespaceId> namespace_ids;
+ namespace_ids.reserve(suggestion_spec.namespace_filters_size());
+ for (std::string_view name_space : suggestion_spec.namespace_filters()) {
+ auto namespace_id_or = document_store_.GetNamespaceId(name_space);
+ if (!namespace_id_or.ok()) {
+ // The current namespace doesn't exist.
+ continue;
+ }
+ namespace_ids.insert(namespace_id_or.ValueOrDie());
+ }
+ if (namespace_ids.empty() && !suggestion_spec.namespace_filters().empty()) {
+    // None of the desired namespaces exist; return early.
+ return std::vector<TermMetadata>();
+ }
+
+ // Populate target document id filter.
+ auto document_id_filter_map_or = PopulateDocumentIdFilters(
+ &document_store_, suggestion_spec, namespace_ids);
+ if (!document_id_filter_map_or.ok()) {
+ return std::move(document_id_filter_map_or).status();
+ }
+
+ std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>
+ document_id_filter_map = document_id_filter_map_or.ValueOrDie();
+ if (document_id_filter_map.empty() &&
+ !suggestion_spec.document_uri_filters().empty()) {
+    // None of the desired document ids exist; return early.
+ return std::vector<TermMetadata>();
+ }
+
+ // Populate target schema type filter.
+ std::unordered_set<SchemaTypeId> schema_type_ids;
+ schema_type_ids.reserve(suggestion_spec.schema_type_filters_size());
+ for (std::string_view schema_type : suggestion_spec.schema_type_filters()) {
+ auto schema_type_id_or = schema_store_.GetSchemaTypeId(schema_type);
+ if (!schema_type_id_or.ok()) {
+ continue;
+ }
+ schema_type_ids.insert(schema_type_id_or.ValueOrDie());
+ }
+ if (schema_type_ids.empty() &&
+ !suggestion_spec.schema_type_filters().empty()) {
+    // None of the desired schema types exist; return early.
+ return std::vector<TermMetadata>();
+ }
+
+ // Populate target properties filter.
+ auto property_filter_map_or =
+ PopulatePropertyFilters(&schema_store_, suggestion_spec, schema_type_ids);
+ if (!property_filter_map_or.ok()) {
+ return std::move(property_filter_map_or).status();
+ }
+ std::unordered_map<SchemaTypeId, SectionIdMask> property_filter_map =
+ property_filter_map_or.ValueOrDie();
+
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<QueryProcessor> query_processor,
+ QueryProcessor::Create(&index_, &numeric_index_, &language_segmenter_,
+ &normalizer_, &document_store_, &schema_store_));
+
+ SearchSpecProto search_spec;
+ search_spec.set_query(suggestion_spec.prefix());
+ search_spec.set_term_match_type(
+ suggestion_spec.scoring_spec().scoring_match_type());
+ ICING_ASSIGN_OR_RETURN(
+ QueryResults query_results,
+ query_processor->ParseSearch(search_spec,
+ ScoringSpecProto::RankingStrategy::NONE,
+ current_time_ms));
+
+ ICING_ASSIGN_OR_RETURN(
+ DocHitInfoIterator::TrimmedNode trimmed_node,
+ std::move(*query_results.root_iterator).TrimRightMostNode());
+
+  // If the last token does not end at the end of the prefix, there are
+  // operator tokens after it that the tokenizer ignored.
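+  // For example (illustrative): for the prefix "f ", the last TEXT token "f"
+  // ends before the trailing space, so it is not the last token and no
+  // suggestions are generated.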
+ bool is_last_token =
+ trimmed_node.term_start_index_ + trimmed_node.unnormalized_term_length_ >=
+ suggestion_spec.prefix().length();
+
+ if (!is_last_token || trimmed_node.term_.empty()) {
+    // We don't have a valid last token; return early.
+ return std::vector<TermMetadata>();
+ }
+
+  // Populate the search base of document ids.
+  // Suggestions are only generated for the very last term;
+  // trimmed_node.iterator_ tracks search results for all previous terms. If it
+  // is null, there is no previous term and we are generating a suggestion for
+  // a single term.
+ std::unordered_set<DocumentId> search_base;
+ if (trimmed_node.iterator_ != nullptr) {
+ while (trimmed_node.iterator_->Advance().ok()) {
+ search_base.insert(trimmed_node.iterator_->doc_hit_info().document_id());
+ }
+ if (search_base.empty()) {
+      // Nothing matches the previous terms in the query. There are no valid
+      // suggestions to make, so return early.
+ return std::vector<TermMetadata>();
+ }
+ }
+
+ // Create result checker based on given filters.
+ SuggestionResultCheckerImpl suggestion_result_checker_impl(
+ &document_store_, &schema_store_, std::move(namespace_ids),
+ std::move(document_id_filter_map), std::move(schema_type_ids),
+ std::move(property_filter_map), std::move(trimmed_node.target_section_),
+ std::move(search_base), current_time_ms);
+  // TODO(b/228240987): Support generating suggestions and appending the suffix
+  // for advanced queries and function calls.
+ std::string query_prefix =
+ suggestion_spec.prefix().substr(0, trimmed_node.term_start_index_);
+ // Run suggestion based on given SuggestionSpec.
+ // Normalize token text to lowercase since all tokens in the lexicon are
+ // lowercase.
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<TermMetadata> terms,
+ index_.FindTermsByPrefix(
+ trimmed_node.term_, suggestion_spec.num_to_return(),
+ suggestion_spec.scoring_spec().scoring_match_type(),
+ suggestion_spec.scoring_spec().rank_by(),
+ &suggestion_result_checker_impl));
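+  // Re-attach everything that preceded the last term. For example
+  // (illustrative): for the prefix "bar f" and the expanded term "foo", the
+  // returned suggestion is "bar foo".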
+ for (TermMetadata& term : terms) {
+ term.content = query_prefix + term.content;
+ }
+ return terms;
+}
+
+SuggestionProcessor::SuggestionProcessor(
+ Index* index, const NumericIndex<int64_t>* numeric_index,
+ const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
+ const DocumentStore* document_store, const SchemaStore* schema_store)
+ : index_(*index),
+ numeric_index_(*numeric_index),
+ language_segmenter_(*language_segmenter),
+ normalizer_(*normalizer),
+ document_store_(*document_store),
+ schema_store_(*schema_store) {}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/query/suggestion-processor.h b/icing/query/suggestion-processor.h
new file mode 100644
index 0000000..e100031
--- /dev/null
+++ b/icing/query/suggestion-processor.h
@@ -0,0 +1,78 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_SUGGESTION_PROCESSOR_H_
+#define ICING_QUERY_SUGGESTION_PROCESSOR_H_
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/index.h"
+#include "icing/index/numeric/numeric-index.h"
+#include "icing/proto/search.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-store.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+// Processes SuggestionSpecProtos and retrieves the TermMetadata that satisfy
+// the prefix and its restrictions. This also performs ranking and returns
+// TermMetadata ordered by hit count.
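+//
+// Example usage (an illustrative sketch; error handling omitted; the pointer
+// arguments are assumed to be valid, live components):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<SuggestionProcessor> processor,
+//       SuggestionProcessor::Create(index, numeric_index, language_segmenter,
+//                                   normalizer, document_store, schema_store));
+//   SuggestionSpecProto spec;
+//   spec.set_prefix("fo");
+//   spec.set_num_to_return(10);
+//   spec.mutable_scoring_spec()->set_scoring_match_type(TermMatchType::PREFIX);
+//   ICING_ASSIGN_OR_RETURN(
+//       std::vector<TermMetadata> terms,
+//       processor->QuerySuggestions(spec, current_time_ms));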
+class SuggestionProcessor {
+ public:
+  // Factory function to create a SuggestionProcessor. It does not take
+  // ownership of any input components; all pointers must refer to valid
+  // objects that outlive the created SuggestionProcessor instance.
+ //
+ // Returns:
+  //   A SuggestionProcessor on success
+ // FAILED_PRECONDITION if any of the pointers is null.
+ static libtextclassifier3::StatusOr<std::unique_ptr<SuggestionProcessor>>
+ Create(Index* index, const NumericIndex<int64_t>* numeric_index,
+ const LanguageSegmenter* language_segmenter,
+ const Normalizer* normalizer, const DocumentStore* document_store,
+ const SchemaStore* schema_store);
+
+ // Query suggestions based on the given SuggestionSpecProto.
+ //
+ // Returns:
+  //   On success,
+  //     - A vector of the TermMetadata that match the prefix and restrictions
+ // INTERNAL_ERROR on all other errors
+ libtextclassifier3::StatusOr<std::vector<TermMetadata>> QuerySuggestions(
+ const SuggestionSpecProto& suggestion_spec, int64_t current_time_ms);
+
+ private:
+ explicit SuggestionProcessor(Index* index,
+ const NumericIndex<int64_t>* numeric_index,
+ const LanguageSegmenter* language_segmenter,
+ const Normalizer* normalizer,
+ const DocumentStore* document_store,
+ const SchemaStore* schema_store);
+
+  // Not const because we may modify/sort the TermMetadata buffer in the lite
+  // index.
+ Index& index_;
+ const NumericIndex<int64_t>& numeric_index_;
+ const LanguageSegmenter& language_segmenter_;
+ const Normalizer& normalizer_;
+ const DocumentStore& document_store_;
+ const SchemaStore& schema_store_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_SUGGESTION_PROCESSOR_H_
diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc
new file mode 100644
index 0000000..9f9094d
--- /dev/null
+++ b/icing/query/suggestion-processor_test.cc
@@ -0,0 +1,722 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/query/suggestion-processor.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "icing/document-builder.h"
+#include "icing/index/numeric/dummy-numeric-index.h"
+#include "icing/index/term-metadata.h"
+#include "icing/schema-builder.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/transform/normalizer-factory.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::IsEmpty;
+using ::testing::Test;
+using ::testing::UnorderedElementsAre;
+
+std::vector<std::string> RetrieveSuggestionsText(
+ const std::vector<TermMetadata>& terms) {
+ std::vector<std::string> suggestions;
+ suggestions.reserve(terms.size());
+ for (const TermMetadata& term : terms) {
+ suggestions.push_back(term.content);
+ }
+ return suggestions;
+}
+
+class SuggestionProcessorTest : public Test {
+ protected:
+ SuggestionProcessorTest()
+ : test_dir_(GetTestTempDir() + "/icing"),
+ store_dir_(test_dir_ + "/store"),
+ schema_store_dir_(test_dir_ + "/schema_store"),
+ index_dir_(test_dir_ + "/index"),
+ numeric_index_dir_(test_dir_ + "/numeric_index") {}
+
+ void SetUp() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(index_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+      // If we've chosen a non-ICU segmentation method (e.g. reverse-JNI), we
+      // won't have an ICU data file included to set up. Technically, we could
+      // use reverse-JNI for segmentation AND include an ICU data file, but
+      // that seems unlikely and our current BUILD setup doesn't do this.
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, store_dir_, &fake_clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(create_result.document_store);
+
+ Index::Options options(index_dir_,
+ /*index_merge_size=*/1024 * 1024,
+ /*lite_index_sort_at_indexing=*/true,
+ /*lite_index_sort_size=*/1024 * 8);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ index_, Index::Create(options, &filesystem_, &icing_filesystem_));
+ // TODO(b/249829533): switch to use persistent numeric index.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ numeric_index_,
+ DummyNumericIndex<int64_t>::Create(filesystem_, numeric_index_dir_));
+
+ language_segmenter_factory::SegmenterOptions segmenter_options(
+ ULOC_US, jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(segmenter_options));
+
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ suggestion_processor_,
+ SuggestionProcessor::Create(
+ index_.get(), numeric_index_.get(), language_segmenter_.get(),
+ normalizer_.get(), document_store_.get(), schema_store_.get()));
+ }
+
+ libtextclassifier3::Status AddTokenToIndex(
+ DocumentId document_id, SectionId section_id,
+ TermMatchType::Code term_match_type, const std::string& token) {
+ Index::Editor editor = index_->Edit(document_id, section_id,
+ term_match_type, /*namespace_id=*/0);
+ auto status = editor.BufferTerm(token.c_str());
+ return status.ok() ? editor.IndexAllBufferedTerms() : status;
+ }
+
+ void TearDown() override {
+ document_store_.reset();
+ schema_store_.reset();
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ const std::string test_dir_;
+ const std::string store_dir_;
+ const std::string schema_store_dir_;
+
+ private:
+ IcingFilesystem icing_filesystem_;
+ const std::string index_dir_;
+ const std::string numeric_index_dir_;
+
+ protected:
+ std::unique_ptr<Index> index_;
+ std::unique_ptr<NumericIndex<int64_t>> numeric_index_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+ std::unique_ptr<Normalizer> normalizer_;
+ FakeClock fake_clock_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> document_store_;
+ std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
+ std::unique_ptr<SuggestionProcessor> suggestion_processor_;
+};
+
+constexpr SectionId kSectionId2 = 2;
+
+TEST_F(SuggestionProcessorTest, MultipleTermsTest_And) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId0,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId1,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "2")
+ .SetSchema("email")
+ .Build()));
+
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "foo"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "bar"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId1, kSectionId2,
+ TermMatchType::EXACT_ONLY, "fool"),
+ IsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("bar f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("bar foo"));
+}
+
+TEST_F(SuggestionProcessorTest, MultipleTermsTest_AndNary) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId0,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId1,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "2")
+ .SetSchema("email")
+ .Build()));
+
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "foo"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "bar"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "cat"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId1, kSectionId2,
+ TermMatchType::EXACT_ONLY, "fool"),
+ IsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("bar cat f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(RetrieveSuggestionsText(terms),
+ UnorderedElementsAre("bar cat foo"));
+}
+
+TEST_F(SuggestionProcessorTest, MultipleTermsTest_Or) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId0,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId1,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "2")
+ .SetSchema("email")
+ .Build()));
+
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "fo"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "bar"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId1, kSectionId2,
+ TermMatchType::EXACT_ONLY, "foo"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId1, kSectionId2,
+ TermMatchType::EXACT_ONLY, "cat"),
+ IsOk());
+
+ // Search for "(bar OR cat) AND f" both document1 "bar fo" and document2 "cat
+ // foo" could match.
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("bar OR cat f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(RetrieveSuggestionsText(terms),
+ UnorderedElementsAre("bar OR cat fo", "bar OR cat foo"));
+}
+
+TEST_F(SuggestionProcessorTest, MultipleTermsTest_OrNary) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId0,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId1,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "2")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId2,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "3")
+ .SetSchema("email")
+ .Build()));
+
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "fo"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "bar"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId1, kSectionId2,
+ TermMatchType::EXACT_ONLY, "foo"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId1, kSectionId2,
+ TermMatchType::EXACT_ONLY, "cat"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId2, kSectionId2,
+ TermMatchType::EXACT_ONLY, "fool"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId2, kSectionId2,
+ TermMatchType::EXACT_ONLY, "lot"),
+ IsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ // Search for "((bar OR cat) OR lot) AND f"
+ suggestion_spec.set_prefix("bar OR cat OR lot f");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ // "fo" in document1, "foo" in document2 and "fool" in document3 could match.
+ EXPECT_THAT(
+ RetrieveSuggestionsText(terms),
+ UnorderedElementsAre("bar OR cat OR lot fo", "bar OR cat OR lot foo",
+ "bar OR cat OR lot fool"));
+}
+
+TEST_F(SuggestionProcessorTest, MultipleTermsTest_NormalizedTerm) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId0,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId1,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "2")
+ .SetSchema("email")
+ .Build()));
+
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "foo"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "bar"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId1, kSectionId2,
+ TermMatchType::EXACT_ONLY, "fool"),
+ IsOk());
+ ASSERT_THAT(AddTokenToIndex(documentId1, kSectionId2,
+ TermMatchType::EXACT_ONLY, "bar"),
+ IsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ // Search for "bar AND FO"
+ suggestion_spec.set_prefix("bar FO");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ // The term is normalized.
+ EXPECT_THAT(RetrieveSuggestionsText(terms),
+ UnorderedElementsAre("bar foo", "bar fool"));
+
+ // Search for "bar AND ḞÖ"
+ suggestion_spec.set_prefix("bar ḞÖ");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ terms, suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ // The term is normalized.
+ EXPECT_THAT(RetrieveSuggestionsText(terms),
+ UnorderedElementsAre("bar foo", "bar fool"));
+}
+
+TEST_F(SuggestionProcessorTest, NonExistentPrefixTest) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId0,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "foo"),
+ IsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("nonExistTerm");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(terms, IsEmpty());
+}
+
+TEST_F(SuggestionProcessorTest, PrefixTrailingSpaceTest) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId0,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "foo"),
+ IsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f ");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(terms, IsEmpty());
+}
+
+TEST_F(SuggestionProcessorTest, NormalizePrefixTest) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId0,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "foo"),
+ IsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("F");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("foo"));
+
+ suggestion_spec.set_prefix("fO");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ terms, suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("foo"));
+
+ suggestion_spec.set_prefix("Fo");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ terms, suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("foo"));
+
+ suggestion_spec.set_prefix("FO");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ terms, suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("foo"));
+}
+
+TEST_F(SuggestionProcessorTest, ParenthesesOperatorPrefixTest) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId0,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "foo"),
+ IsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("{f}");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<TermMetadata> terms,
+ suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(terms, IsEmpty());
+
+ suggestion_spec.set_prefix("[f]");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ terms, suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(terms, IsEmpty());
+
+ suggestion_spec.set_prefix("(f)");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ terms, suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(terms, IsEmpty());
+}
+
+TEST_F(SuggestionProcessorTest, OtherSpecialPrefixTest) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId0,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "foo"),
+ IsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("f:");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ auto terms_or = suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds());
+ if (SearchSpecProto::default_instance().search_type() ==
+ SearchSpecProto::SearchType::ICING_RAW_QUERY) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, terms_or);
+ EXPECT_THAT(terms, IsEmpty());
+ } else {
+ EXPECT_THAT(terms_or,
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ }
+
+  // TODO(b/208654892): Update the handling of hyphens so that a hyphen is only
+  // treated as part of a TEXT token (rather than as a MINUS token) when it is
+  // surrounded on both sides by TEXT, rather than just preceded by TEXT.
+ suggestion_spec.set_prefix("f-");
+ terms_or = suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds());
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, terms_or);
+ EXPECT_THAT(terms, IsEmpty());
+
+ suggestion_spec.set_prefix("f OR");
+ terms_or = suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds());
+ if (SearchSpecProto::default_instance().search_type() ==
+ SearchSpecProto::SearchType::ICING_RAW_QUERY) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, terms_or);
+ EXPECT_THAT(terms, IsEmpty());
+ } else {
+ EXPECT_THAT(terms_or,
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ }
+}
+
+TEST_F(SuggestionProcessorTest, InvalidPrefixTest) {
+ // Create the schema and document store
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+  // These documents don't actually match the tokens in the index. We're
+ // inserting the documents to get the appropriate number of documents and
+ // namespaces populated.
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId documentId0,
+ document_store_->Put(DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .Build()));
+
+ ASSERT_THAT(AddTokenToIndex(documentId0, kSectionId2,
+ TermMatchType::EXACT_ONLY, "original"),
+ IsOk());
+
+ SuggestionSpecProto suggestion_spec;
+ suggestion_spec.set_prefix("OR OR - :");
+ suggestion_spec.set_num_to_return(10);
+ suggestion_spec.mutable_scoring_spec()->set_scoring_match_type(
+ TermMatchType::PREFIX);
+
+ auto terms_or = suggestion_processor_->QuerySuggestions(
+ suggestion_spec, fake_clock_.GetSystemTimeMilliseconds());
+ if (SearchSpecProto::default_instance().search_type() ==
+ SearchSpecProto::SearchType::ICING_RAW_QUERY) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, terms_or);
+ EXPECT_THAT(terms, IsEmpty());
+ } else {
+ EXPECT_THAT(terms_or,
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ }
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/page-result-state.h b/icing/result/page-result-state.h
index a26c44e..5932b56 100644
--- a/icing/result/page-result-state.h
+++ b/icing/result/page-result-state.h
@@ -18,6 +18,7 @@
#include <cstdint>
#include <vector>
+#include "icing/result/projection-tree.h"
#include "icing/result/snippet-context.h"
#include "icing/scoring/scored-document-hit.h"
@@ -29,11 +30,14 @@ struct PageResultState {
PageResultState(std::vector<ScoredDocumentHit> scored_document_hits_in,
uint64_t next_page_token_in,
SnippetContext snippet_context_in,
- int num_previously_returned_in)
+ std::unordered_map<std::string, ProjectionTree> tree_map,
+ int num_previously_returned_in, int num_per_page_in)
: scored_document_hits(std::move(scored_document_hits_in)),
next_page_token(next_page_token_in),
snippet_context(std::move(snippet_context_in)),
- num_previously_returned(num_previously_returned_in) {}
+ projection_tree_map(std::move(tree_map)),
+ num_previously_returned(num_previously_returned_in),
+ requested_page_size(num_per_page_in) {}
// Results of one page
std::vector<ScoredDocumentHit> scored_document_hits;
@@ -44,8 +48,15 @@ struct PageResultState {
// Information needed for snippeting.
SnippetContext snippet_context;
+ // Information needed for projection.
+ std::unordered_map<std::string, ProjectionTree> projection_tree_map;
+
// Number of results that have been returned in previous pages.
int num_previously_returned;
+
+ // The page size for this query. This should always be >=
+  // scored_document_hits.size().
+ int requested_page_size;
};
} // namespace lib
diff --git a/icing/result/page-result.h b/icing/result/page-result.h
new file mode 100644
index 0000000..6645593
--- /dev/null
+++ b/icing/result/page-result.h
@@ -0,0 +1,46 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_RESULT_PAGE_RESULT_H_
+#define ICING_RESULT_PAGE_RESULT_H_
+
+#include <vector>
+
+#include "icing/proto/search.pb.h"
+
+namespace icing {
+namespace lib {
+
+// Contains information of the search result of one page.
+struct PageResult {
+ PageResult(std::vector<SearchResultProto::ResultProto> results_in,
+ int num_results_with_snippets_in, int requested_page_size_in)
+ : results(std::move(results_in)),
+ num_results_with_snippets(num_results_with_snippets_in),
+ requested_page_size(requested_page_size_in) {}
+
+ // Results of one page
+ std::vector<SearchResultProto::ResultProto> results;
+
+ // Number of results with snippets.
+ int num_results_with_snippets;
+
+ // The page size for this query. This should always be >= results.size().
+ int requested_page_size;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_RESULT_PAGE_RESULT_H_
diff --git a/icing/result/projection-tree.cc b/icing/result/projection-tree.cc
new file mode 100644
index 0000000..9896491
--- /dev/null
+++ b/icing/result/projection-tree.cc
@@ -0,0 +1,50 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/projection-tree.h"
+
+#include <algorithm>
+
+#include "icing/proto/search.pb.h"
+#include "icing/schema/property-util.h"
+
+namespace icing {
+namespace lib {
+
+ProjectionTree::ProjectionTree(
+ const SchemaStore::ExpandedTypePropertyMask& type_field_mask) {
+ for (const std::string& field_mask : type_field_mask.paths) {
+ Node* current_node = &root_;
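+    // Walk down the tree one path component at a time; e.g. (illustrative)
+    // "sender.name" visits or creates the nodes "sender" and then "name".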
+ for (std::string_view sub_field_mask :
+ property_util::SplitPropertyPathExpr(field_mask)) {
+ current_node = AddChildNode(sub_field_mask, &current_node->children);
+ }
+ }
+}
+
+ProjectionTree::Node* ProjectionTree::AddChildNode(
+ std::string_view property_name, std::vector<Node>* current_children) {
+ auto itr = std::find_if(current_children->begin(), current_children->end(),
+ [&property_name](const Node& node) {
+ return node.name == property_name;
+ });
+ if (itr != current_children->end()) {
+ return &(*itr);
+ }
+ current_children->push_back(ProjectionTree::Node(std::string(property_name)));
+ return &current_children->back();
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/projection-tree.h b/icing/result/projection-tree.h
new file mode 100644
index 0000000..cdf268a
--- /dev/null
+++ b/icing/result/projection-tree.h
@@ -0,0 +1,61 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_RESULT_PROJECTION_TREE_H_
+#define ICING_RESULT_PROJECTION_TREE_H_
+
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/proto/search.pb.h"
+#include "icing/schema/schema-store.h"
+
+namespace icing {
+namespace lib {
+
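+// A tree built from a set of property path expressions, where paths sharing a
+// prefix share nodes. For example (illustrative), the paths {"sender.name",
+// "sender.emailAddress"} produce a root with a single child "sender", which in
+// turn has the children "name" and "emailAddress".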
+class ProjectionTree {
+ public:
+ struct Node {
+ explicit Node(std::string name = "") : name(std::move(name)) {}
+
+ std::string name;
+ std::vector<Node> children;
+
+ bool operator==(const Node& other) const {
+ return name == other.name && children == other.children;
+ }
+ };
+
+ explicit ProjectionTree(
+ const SchemaStore::ExpandedTypePropertyMask& type_field_mask);
+
+ const Node& root() const { return root_; }
+
+ bool operator==(const ProjectionTree& other) const {
+ return root_ == other.root_;
+ }
+
+ private:
+  // Adds a child node with property_name to current_children and returns a
+  // pointer to the new child node, or a pointer to the existing child if one
+  // with the same name is already present.
+ Node* AddChildNode(std::string_view property_name,
+ std::vector<Node>* current_children);
+
+ Node root_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_RESULT_PROJECTION_TREE_H_
diff --git a/icing/result/projection-tree_test.cc b/icing/result/projection-tree_test.cc
new file mode 100644
index 0000000..46d0c12
--- /dev/null
+++ b/icing/result/projection-tree_test.cc
@@ -0,0 +1,118 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/projection-tree.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/proto/search.pb.h"
+#include "icing/schema/schema-store.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+
+TEST(ProjectionTreeTest, CreateEmptyFieldMasks) {
+ ProjectionTree tree({});
+ EXPECT_THAT(tree.root().name, IsEmpty());
+ EXPECT_THAT(tree.root().children, IsEmpty());
+}
+
+TEST(ProjectionTreeTest, CreateTreeTopLevel) {
+ SchemaStore::ExpandedTypePropertyMask type_field_mask{"", {"subject"}};
+
+ ProjectionTree tree(type_field_mask);
+ EXPECT_THAT(tree.root().name, IsEmpty());
+ ASSERT_THAT(tree.root().children, SizeIs(1));
+ EXPECT_THAT(tree.root().children.at(0).name, Eq("subject"));
+ EXPECT_THAT(tree.root().children.at(0).children, IsEmpty());
+}
+
+TEST(ProjectionTreeTest, CreateTreeMultipleTopLevel) {
+ SchemaStore::ExpandedTypePropertyMask type_field_mask{"",
+ {"subject", "body"}};
+
+ ProjectionTree tree(type_field_mask);
+ EXPECT_THAT(tree.root().name, IsEmpty());
+ ASSERT_THAT(tree.root().children, SizeIs(2));
+
+ const ProjectionTree::Node* child0 = &tree.root().children.at(0);
+ const ProjectionTree::Node* child1 = &tree.root().children.at(1);
+ if (child0->name != "subject") {
+ std::swap(child0, child1);
+ }
+
+ EXPECT_THAT(child0->name, Eq("subject"));
+ EXPECT_THAT(child0->children, IsEmpty());
+ EXPECT_THAT(child1->name, Eq("body"));
+ EXPECT_THAT(child1->children, IsEmpty());
+}
+
+TEST(ProjectionTreeTest, CreateTreeNested) {
+ SchemaStore::ExpandedTypePropertyMask type_field_mask{
+ "", {"subject.body", "body"}};
+
+ ProjectionTree tree(type_field_mask);
+ EXPECT_THAT(tree.root().name, IsEmpty());
+ ASSERT_THAT(tree.root().children, SizeIs(2));
+
+ const ProjectionTree::Node* child0 = &tree.root().children.at(0);
+ const ProjectionTree::Node* child1 = &tree.root().children.at(1);
+ if (child0->name != "subject.body") {
+ std::swap(child0, child1);
+ }
+
+ EXPECT_THAT(child0->name, Eq("subject"));
+ ASSERT_THAT(child0->children, SizeIs(1));
+ EXPECT_THAT(child0->children.at(0).name, Eq("body"));
+ EXPECT_THAT(child0->children.at(0).children, IsEmpty());
+ EXPECT_THAT(child1->name, Eq("body"));
+ EXPECT_THAT(child1->children, IsEmpty());
+}
+
+TEST(ProjectionTreeTest, CreateTreeNestedSharedNode) {
+ SchemaStore::ExpandedTypePropertyMask type_field_mask{
+ "", {"sender.name.first", "sender.emailAddress"}};
+
+ ProjectionTree tree(type_field_mask);
+ EXPECT_THAT(tree.root().name, IsEmpty());
+ ASSERT_THAT(tree.root().children, SizeIs(1));
+ EXPECT_THAT(tree.root().children.at(0).name, Eq("sender"));
+ ASSERT_THAT(tree.root().children.at(0).children, SizeIs(2));
+
+ const ProjectionTree::Node* child0_child0 =
+ &tree.root().children.at(0).children.at(0);
+ const ProjectionTree::Node* child0_child1 =
+ &tree.root().children.at(0).children.at(1);
+ if (child0_child0->name != "name") {
+ std::swap(child0_child0, child0_child1);
+ }
+
+ EXPECT_THAT(child0_child0->name, Eq("name"));
+ ASSERT_THAT(child0_child0->children, SizeIs(1));
+ EXPECT_THAT(child0_child0->children.at(0).name, Eq("first"));
+ EXPECT_THAT(child0_child0->children.at(0).children, IsEmpty());
+ EXPECT_THAT(child0_child1->name, Eq("emailAddress"));
+ EXPECT_THAT(child0_child1->children, IsEmpty());
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/projector.cc b/icing/result/projector.cc
new file mode 100644
index 0000000..26478d2
--- /dev/null
+++ b/icing/result/projector.cc
@@ -0,0 +1,62 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/projector.h"
+
+#include <algorithm>
+
+#include "icing/proto/document.pb.h"
+
+namespace icing {
+namespace lib {
+
+namespace projector {
+
+void Project(const std::vector<ProjectionTree::Node>& projection_tree,
+ DocumentProto* document) {
+ int num_kept = 0;
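+  // Compact the kept properties to the front of the repeated field as we scan,
+  // then delete the unkept tail in one pass at the end.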
+ for (int cur_pos = 0; cur_pos < document->properties_size(); ++cur_pos) {
+ PropertyProto* prop = document->mutable_properties(cur_pos);
+ auto itr = std::find_if(projection_tree.begin(), projection_tree.end(),
+ [&prop](const ProjectionTree::Node& node) {
+ return node.name == prop->name();
+ });
+ if (itr == projection_tree.end()) {
+ // Property is not present in the projection tree. Just skip it.
+ continue;
+ }
+ // This property should be kept.
+ document->mutable_properties()->SwapElements(num_kept, cur_pos);
+ ++num_kept;
+ if (itr->children.empty()) {
+      // A field mask refers to this property but has no children, so we keep
+      // the entire property with all of its subproperties/values.
+ continue;
+ }
+ // The field mask refers to children of this property. Recurse through the
+ // document values that this property holds and project the children
+ // requested by this field mask.
+ for (DocumentProto& subproperty : *(prop->mutable_document_values())) {
+ Project(itr->children, &subproperty);
+ }
+ }
+ document->mutable_properties()->DeleteSubrange(
+ num_kept, document->properties_size() - num_kept);
+}
+
+} // namespace projector
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/projector.h b/icing/result/projector.h
new file mode 100644
index 0000000..43d9052
--- /dev/null
+++ b/icing/result/projector.h
@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_RESULT_PROJECTOR_H_
+#define ICING_RESULT_PROJECTOR_H_
+
+#include <vector>
+
+#include "icing/proto/document.pb.h"
+#include "icing/result/projection-tree.h"
+
+namespace icing {
+namespace lib {
+
+namespace projector {
+
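+// Prunes `document` in place, keeping only the properties referenced by the
+// given projection tree nodes (typically the children of a ProjectionTree
+// root). For example (illustrative): projecting with the tree built from
+// {"sender.name"} keeps only the "sender" property and, within each of its
+// document values, only the "name" subproperty.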
+void Project(const std::vector<ProjectionTree::Node>& projection_tree,
+ DocumentProto* document);
+
+} // namespace projector
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_RESULT_PROJECTOR_H_
diff --git a/icing/result/result-adjustment-info.cc b/icing/result/result-adjustment-info.cc
new file mode 100644
index 0000000..00ac379
--- /dev/null
+++ b/icing/result/result-adjustment-info.cc
@@ -0,0 +1,64 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/result-adjustment-info.h"
+
+#include <string>
+#include <unordered_map>
+
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/result/projection-tree.h"
+#include "icing/result/snippet-context.h"
+#include "icing/schema/schema-store.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+SnippetContext CreateSnippetContext(const SearchSpecProto& search_spec,
+ const ResultSpecProto& result_spec,
+ SectionRestrictQueryTermsMap query_terms) {
+ if (result_spec.snippet_spec().num_to_snippet() > 0 &&
+ result_spec.snippet_spec().num_matches_per_property() > 0) {
+ // Needs snippeting
+ return SnippetContext(std::move(query_terms), result_spec.snippet_spec(),
+ search_spec.term_match_type());
+ }
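+  // Snippeting is disabled; return an empty context.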
+ return SnippetContext(/*query_terms_in=*/{},
+ ResultSpecProto::SnippetSpecProto::default_instance(),
+ TermMatchType::UNKNOWN);
+}
+
+} // namespace
+
+ResultAdjustmentInfo::ResultAdjustmentInfo(
+ const SearchSpecProto& search_spec, const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec, const SchemaStore* schema_store,
+ SectionRestrictQueryTermsMap query_terms)
+ : snippet_context(CreateSnippetContext(search_spec, result_spec,
+ std::move(query_terms))),
+ remaining_num_to_snippet(snippet_context.snippet_spec.num_to_snippet()) {
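+  // Build one ProjectionTree per schema type named by the expanded type
+  // property masks.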
+ for (const SchemaStore::ExpandedTypePropertyMask& type_field_mask :
+ schema_store->ExpandTypePropertyMasks(
+ result_spec.type_property_masks())) {
+ projection_tree_map.insert(
+ {type_field_mask.schema_type, ProjectionTree(type_field_mask)});
+ }
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/result-adjustment-info.h b/icing/result/result-adjustment-info.h
new file mode 100644
index 0000000..e859492
--- /dev/null
+++ b/icing/result/result-adjustment-info.h
@@ -0,0 +1,53 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_RESULT_RESULT_ADJUSTMENT_INFO_H_
+#define ICING_RESULT_RESULT_ADJUSTMENT_INFO_H_
+
+#include <string>
+#include <unordered_map>
+
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/result/projection-tree.h"
+#include "icing/result/snippet-context.h"
+#include "icing/schema/schema-store.h"
+
+namespace icing {
+namespace lib {
+
+// A wrapper struct for the information used to adjust results during
+// retrieval:
+// - Snippeting
+// - Projection
+struct ResultAdjustmentInfo {
+ // Information needed for snippeting.
+ SnippetContext snippet_context;
+
+ // Remaining # of docs to snippet.
+ int remaining_num_to_snippet;
+
+ // Information needed for projection.
+ std::unordered_map<std::string, ProjectionTree> projection_tree_map;
+
+ explicit ResultAdjustmentInfo(const SearchSpecProto& search_spec,
+ const ScoringSpecProto& scoring_spec,
+ const ResultSpecProto& result_spec,
+ const SchemaStore* schema_store,
+ SectionRestrictQueryTermsMap query_terms);
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_RESULT_RESULT_ADJUSTMENT_INFO_H_
diff --git a/icing/result/result-adjustment-info_test.cc b/icing/result/result-adjustment-info_test.cc
new file mode 100644
index 0000000..cbce557
--- /dev/null
+++ b/icing/result/result-adjustment-info_test.cc
@@ -0,0 +1,198 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/result-adjustment-info.h"
+
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/result/projection-tree.h"
+#include "icing/result/snippet-context.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::AnyOf;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::Pair;
+using ::testing::UnorderedElementsAre;
+
+class ResultAdjustmentInfoTest : public testing::Test {
+ protected:
+ ResultAdjustmentInfoTest() : test_dir_(GetTestTempDir() + "/icing") {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ }
+
+ void SetUp() override {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Email"))
+ .AddType(SchemaTypeConfigBuilder().SetType("Phone"))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ const Filesystem filesystem_;
+ const std::string test_dir_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ FakeClock fake_clock_;
+};
+
+SearchSpecProto CreateSearchSpec(TermMatchType::Code match_type) {
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(match_type);
+ return search_spec;
+}
+
+ScoringSpecProto CreateScoringSpec(bool is_descending_order) {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_order_by(is_descending_order ? ScoringSpecProto::Order::DESC
+ : ScoringSpecProto::Order::ASC);
+ return scoring_spec;
+}
+
+ResultSpecProto CreateResultSpec(
+ int num_per_page, ResultSpecProto::ResultGroupingType result_group_type) {
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(result_group_type);
+ result_spec.set_num_per_page(num_per_page);
+ return result_spec;
+}
+
+TEST_F(ResultAdjustmentInfoTest,
+ ShouldConstructSnippetContextAccordingToSpecs) {
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE);
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5);
+
+ SectionRestrictQueryTermsMap query_terms_map;
+ query_terms_map.emplace("term1", std::unordered_set<std::string>());
+
+ ResultAdjustmentInfo result_adjustment_info(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true), result_spec,
+ schema_store_.get(), query_terms_map);
+  const SnippetContext& snippet_context =
+      result_adjustment_info.snippet_context;
+
+  // Snippet context should be derived from the specs above.
+  EXPECT_TRUE(snippet_context.query_terms.find("term1") !=
+              snippet_context.query_terms.end());
+  EXPECT_THAT(snippet_context.snippet_spec,
+              EqualsProto(result_spec.snippet_spec()));
+  EXPECT_THAT(snippet_context.match_type, Eq(TermMatchType::EXACT_ONLY));
+  EXPECT_THAT(result_adjustment_info.remaining_num_to_snippet, Eq(5));
+}
+
+TEST_F(ResultAdjustmentInfoTest,
+       NoSnippetingShouldReturnEmptySnippetContext) {
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE);
+ // Setting num_to_snippet to 0 so that snippeting info won't be
+ // stored.
+ result_spec.mutable_snippet_spec()->set_num_to_snippet(0);
+ result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
+ result_spec.mutable_snippet_spec()->set_max_window_utf32_length(5);
+
+ SectionRestrictQueryTermsMap query_terms_map;
+ query_terms_map.emplace("term1", std::unordered_set<std::string>());
+
+ ResultAdjustmentInfo result_adjustment_info(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true), result_spec,
+ schema_store_.get(), query_terms_map);
+
+ EXPECT_THAT(result_adjustment_info.snippet_context.query_terms, IsEmpty());
+ EXPECT_THAT(
+ result_adjustment_info.snippet_context.snippet_spec,
+ EqualsProto(ResultSpecProto::SnippetSpecProto::default_instance()));
+  EXPECT_THAT(result_adjustment_info.snippet_context.match_type,
+              Eq(TermMatchType::UNKNOWN));
+ EXPECT_THAT(result_adjustment_info.remaining_num_to_snippet, Eq(0));
+}
+
+TEST_F(ResultAdjustmentInfoTest,
+ ShouldConstructProjectionTreeMapAccordingToSpecs) {
+ // Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE);
+ TypePropertyMask* email_type_property_mask =
+ result_spec.add_type_property_masks();
+ email_type_property_mask->set_schema_type("Email");
+ email_type_property_mask->add_paths("sender.name");
+ email_type_property_mask->add_paths("sender.emailAddress");
+ TypePropertyMask* phone_type_property_mask =
+ result_spec.add_type_property_masks();
+ phone_type_property_mask->set_schema_type("Phone");
+ phone_type_property_mask->add_paths("caller");
+ TypePropertyMask* wildcard_type_property_mask =
+ result_spec.add_type_property_masks();
+ wildcard_type_property_mask->set_schema_type(
+ std::string(SchemaStore::kSchemaTypeWildcard));
+ wildcard_type_property_mask->add_paths("wild.card");
+
+ ResultAdjustmentInfo result_adjustment_info(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true), result_spec,
+ schema_store_.get(),
+ /*query_terms=*/{});
+
+ ProjectionTree email_projection_tree =
+ ProjectionTree({"Email", {"sender.name", "sender.emailAddress"}});
+ ProjectionTree alternative_email_projection_tree =
+ ProjectionTree({"Email", {"sender.emailAddress", "sender.name"}});
+ ProjectionTree phone_projection_tree = ProjectionTree({"Phone", {"caller"}});
+ ProjectionTree wildcard_projection_tree = ProjectionTree(
+ {std::string(SchemaStore::kSchemaTypeWildcard), {"wild.card"}});
+
+ EXPECT_THAT(result_adjustment_info.projection_tree_map,
+ UnorderedElementsAre(
+ Pair("Email", AnyOf(email_projection_tree,
+ alternative_email_projection_tree)),
+ Pair("Phone", phone_projection_tree),
+ Pair(std::string(SchemaStore::kSchemaTypeWildcard),
+ wildcard_projection_tree)));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/result-retriever-v2.cc b/icing/result/result-retriever-v2.cc
new file mode 100644
index 0000000..44fa602
--- /dev/null
+++ b/icing/result/result-retriever-v2.cc
@@ -0,0 +1,268 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/result-retriever-v2.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/mutex.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/result/page-result.h"
+#include "icing/result/projection-tree.h"
+#include "icing/result/projector.h"
+#include "icing/result/result-adjustment-info.h"
+#include "icing/result/result-state-v2.h"
+#include "icing/result/snippet-context.h"
+#include "icing/result/snippet-retriever.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-store.h"
+#include "icing/store/namespace-id.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+void ApplyProjection(const ResultAdjustmentInfo* adjustment_info,
+ DocumentProto* document) {
+ if (adjustment_info == nullptr) {
+ return;
+ }
+
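+  // Look up the projection tree registered for this document's schema type
+  // first; if there is none, fall back to the wildcard projection tree (if
+  // any) below.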
+ auto itr = adjustment_info->projection_tree_map.find(document->schema());
+ if (itr != adjustment_info->projection_tree_map.end()) {
+ projector::Project(itr->second.root().children, document);
+ } else {
+ auto wildcard_projection_tree_itr =
+ adjustment_info->projection_tree_map.find(
+ std::string(SchemaStore::kSchemaTypeWildcard));
+ if (wildcard_projection_tree_itr !=
+ adjustment_info->projection_tree_map.end()) {
+ projector::Project(wildcard_projection_tree_itr->second.root().children,
+ document);
+ }
+ }
+}
+
+bool ApplySnippet(ResultAdjustmentInfo* adjustment_info,
+ const SnippetRetriever& snippet_retriever,
+ const DocumentProto& document, SectionIdMask section_id_mask,
+ SearchResultProto::ResultProto* result) {
+ if (adjustment_info == nullptr) {
+ return false;
+ }
+
+ const SnippetContext& snippet_context = adjustment_info->snippet_context;
+ int& remaining_num_to_snippet = adjustment_info->remaining_num_to_snippet;
+
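+  // Only snippet if the spec requests per-property matches and this query
+  // still has snippet budget remaining.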
+ if (snippet_context.snippet_spec.num_matches_per_property() > 0 &&
+ remaining_num_to_snippet > 0) {
+ SnippetProto snippet_proto = snippet_retriever.RetrieveSnippet(
+ snippet_context.query_terms, snippet_context.match_type,
+ snippet_context.snippet_spec, document, section_id_mask);
+ *result->mutable_snippet() = std::move(snippet_proto);
+ --remaining_num_to_snippet;
+ return true;
+ }
+
+ return false;
+}
+
+} // namespace
+
+bool GroupResultLimiterV2::ShouldBeRemoved(
+ const ScoredDocumentHit& scored_document_hit,
+ const std::unordered_map<int32_t, int>& entry_id_group_id_map,
+ const DocumentStore& document_store, std::vector<int>& group_result_limits,
+ ResultSpecProto::ResultGroupingType result_group_type,
+ int64_t current_time_ms) const {
+ auto document_filter_data_optional =
+ document_store.GetAliveDocumentFilterData(
+ scored_document_hit.document_id(), current_time_ms);
+ if (!document_filter_data_optional) {
+ // The document doesn't exist.
+ return true;
+ }
+ NamespaceId namespace_id =
+ document_filter_data_optional.value().namespace_id();
+ SchemaTypeId schema_type_id =
+ document_filter_data_optional.value().schema_type_id();
+ auto entry_id_or = document_store.GetResultGroupingEntryId(
+ result_group_type, namespace_id, schema_type_id);
+ if (!entry_id_or.ok()) {
+ return false;
+ }
+ int32_t entry_id = entry_id_or.ValueOrDie();
+ auto iter = entry_id_group_id_map.find(entry_id);
+ if (iter == entry_id_group_id_map.end()) {
+ // If a ResultGrouping Entry Id isn't found in entry_id_group_id_map, then
+ // there are no limits placed on results from this entry id.
+ return false;
+ }
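+  // The entry belongs to a limited group: consume one unit of the group's
+  // remaining budget, removing the hit once the budget is exhausted.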
+ int& count = group_result_limits.at(iter->second);
+ if (count <= 0) {
+ return true;
+ }
+ --count;
+ return false;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<ResultRetrieverV2>>
+ResultRetrieverV2::Create(
+ const DocumentStore* doc_store, const SchemaStore* schema_store,
+ const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
+ std::unique_ptr<const GroupResultLimiterV2> group_result_limiter) {
+ ICING_RETURN_ERROR_IF_NULL(doc_store);
+ ICING_RETURN_ERROR_IF_NULL(schema_store);
+ ICING_RETURN_ERROR_IF_NULL(language_segmenter);
+ ICING_RETURN_ERROR_IF_NULL(normalizer);
+ ICING_RETURN_ERROR_IF_NULL(group_result_limiter);
+
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<SnippetRetriever> snippet_retriever,
+ SnippetRetriever::Create(schema_store, language_segmenter, normalizer));
+
+ return std::unique_ptr<ResultRetrieverV2>(
+ new ResultRetrieverV2(doc_store, std::move(snippet_retriever),
+ std::move(group_result_limiter)));
+}
+
+std::pair<PageResult, bool> ResultRetrieverV2::RetrieveNextPage(
+ ResultStateV2& result_state, int64_t current_time_ms) const {
+ absl_ports::unique_lock l(&result_state.mutex);
+
+  // Record the ranker size before retrieval so that the number of hits
+  // consumed by this page can be subtracted from the total afterwards.
+ int original_scored_document_hits_ranker_size =
+ result_state.scored_document_hits_ranker->size();
+ int num_results_with_snippets = 0;
+
+  // Retrieve results for the current page.
+ std::vector<SearchResultProto::ResultProto> results;
+ int32_t num_total_bytes = 0;
+ while (results.size() < result_state.num_per_page() &&
+ !result_state.scored_document_hits_ranker->empty()) {
+ JoinedScoredDocumentHit next_best_document_hit =
+ result_state.scored_document_hits_ranker->PopNext();
+ if (group_result_limiter_->ShouldBeRemoved(
+ next_best_document_hit.parent_scored_document_hit(),
+ result_state.entry_id_group_id_map(), doc_store_,
+ result_state.group_result_limits, result_state.result_group_type(),
+ current_time_ms)) {
+ continue;
+ }
+
+ libtextclassifier3::StatusOr<DocumentProto> document_or = doc_store_.Get(
+ next_best_document_hit.parent_scored_document_hit().document_id());
+ if (!document_or.ok()) {
+      // Skip the document on error.
+      ICING_LOG(WARNING) << "Failed to fetch document from document store: "
+ << document_or.status().error_message();
+ continue;
+ }
+
+ DocumentProto document = std::move(document_or).ValueOrDie();
+ // Apply parent projection
+ ApplyProjection(result_state.parent_adjustment_info(), &document);
+
+ SearchResultProto::ResultProto result;
+ // Add parent snippet if requested.
+ if (ApplySnippet(result_state.parent_adjustment_info(), *snippet_retriever_,
+ document,
+ next_best_document_hit.parent_scored_document_hit()
+ .hit_section_id_mask(),
+ &result)) {
+ ++num_results_with_snippets;
+ }
+
+    // Add the document itself.
+ *result.mutable_document() = std::move(document);
+ result.set_score(next_best_document_hit.final_score());
+
+ // Retrieve child documents
+ for (const ScoredDocumentHit& child_scored_document_hit :
+ next_best_document_hit.child_scored_document_hits()) {
+ if (result.joined_results_size() >=
+ result_state.max_joined_children_per_parent_to_return()) {
+ break;
+ }
+
+ libtextclassifier3::StatusOr<DocumentProto> child_document_or =
+ doc_store_.Get(child_scored_document_hit.document_id());
+ if (!child_document_or.ok()) {
+        // Skip the document on error.
+        ICING_LOG(WARNING)
+            << "Failed to fetch child document from document store: "
+ << child_document_or.status().error_message();
+ continue;
+ }
+
+ DocumentProto child_document = std::move(child_document_or).ValueOrDie();
+ ApplyProjection(result_state.child_adjustment_info(), &child_document);
+
+ SearchResultProto::ResultProto* child_result =
+ result.add_joined_results();
+ // Add child snippet if requested.
+ ApplySnippet(result_state.child_adjustment_info(), *snippet_retriever_,
+ child_document,
+ child_scored_document_hit.hit_section_id_mask(),
+ child_result);
+
+ *child_result->mutable_document() = std::move(child_document);
+ child_result->set_score(child_scored_document_hit.score());
+ }
+
+ size_t result_bytes = result.ByteSizeLong();
+ results.push_back(std::move(result));
+
+ // Check if num_total_bytes + result_bytes reaches or exceeds
+ // num_total_bytes_per_page_threshold. Use subtraction to avoid integer
+ // overflow.
+ if (result_bytes >=
+ result_state.num_total_bytes_per_page_threshold() - num_total_bytes) {
+ break;
+ }
+ num_total_bytes += result_bytes;
+ }
+
+ // Update numbers in ResultState
+ result_state.num_returned += results.size();
+ result_state.IncrementNumTotalHits(
+ result_state.scored_document_hits_ranker->size() -
+ original_scored_document_hits_ranker_size);
+
+ bool has_more_results = !result_state.scored_document_hits_ranker->empty();
+
+ return std::make_pair(
+ PageResult(std::move(results), num_results_with_snippets,
+ result_state.num_per_page()),
+ has_more_results);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/result-retriever-v2.h b/icing/result/result-retriever-v2.h
new file mode 100644
index 0000000..7b1a364
--- /dev/null
+++ b/icing/result/result-retriever-v2.h
@@ -0,0 +1,111 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_RESULT_RESULT_RETRIEVER_V2_H_
+#define ICING_RESULT_RESULT_RETRIEVER_V2_H_
+
+#include <cstdint>
+#include <memory>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/proto/search.pb.h"
+#include "icing/result/page-result.h"
+#include "icing/result/result-state-v2.h"
+#include "icing/result/snippet-retriever.h"
+#include "icing/schema/schema-store.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/store/document-store.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer.h"
+
+namespace icing {
+namespace lib {
+
+class GroupResultLimiterV2 {
+ public:
+  GroupResultLimiterV2() = default;
+
+ virtual ~GroupResultLimiterV2() = default;
+
+ // Returns true if the scored_document_hit should be removed.
+ virtual bool ShouldBeRemoved(
+ const ScoredDocumentHit& scored_document_hit,
+ const std::unordered_map<int32_t, int>& entry_id_group_id_map,
+ const DocumentStore& document_store,
+ std::vector<int>& group_result_limits,
+ ResultSpecProto::ResultGroupingType result_group_type,
+ int64_t current_time_ms) const;
+};
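+
+// Note: ResultRetrieverV2::Create() below accepts a custom
+// GroupResultLimiterV2 (e.g., a mock subclass in tests, since
+// ShouldBeRemoved() is virtual); by default it constructs this stock limiter.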
+
+class ResultRetrieverV2 {
+ public:
+  // Factory function to create a ResultRetrieverV2, which does not take
+  // ownership of any input components. All pointers must refer to valid
+  // objects that outlive the created ResultRetrieverV2 instance.
+ //
+ // Returns:
+ // A ResultRetrieverV2 on success
+ // FAILED_PRECONDITION on any null pointer input
+ static libtextclassifier3::StatusOr<std::unique_ptr<ResultRetrieverV2>>
+ Create(const DocumentStore* doc_store, const SchemaStore* schema_store,
+ const LanguageSegmenter* language_segmenter,
+ const Normalizer* normalizer,
+ std::unique_ptr<const GroupResultLimiterV2> group_result_limiter =
+ std::make_unique<const GroupResultLimiterV2>());
+
+  // Retrieves results (pairs of DocumentProtos and SnippetProtos) with the
+  // given ResultState, which holds document and snippet information. It pulls
+  // out the next top-ranked documents from ResultState, retrieves the
+  // documents from storage, updates ResultState, and finally wraps the
+  // results and other information into PageResult. The expected number of
+  // documents to return is min(num_per_page, the number of all scored
+  // document hits) inside ResultState.
+  //
+  // The number of snippets to return is based on the total number of snippets
+  // needed and the number of snippets that have already been returned for the
+  // same query. The order of the returned results is determined by
+  // scored_document_hit_comparator inside ResultState.
+  //
+  // An additional boolean value will be returned, indicating whether
+  // ResultState has remaining documents to be retrieved in the next round.
+  //
+  // All errors will be ignored; retrieval continues with the next document,
+  // and only valid documents are included in PageResult.
+ //
+ // Returns:
+ // std::pair<PageResult, bool>
+ std::pair<PageResult, bool> RetrieveNextPage(ResultStateV2& result_state,
+ int64_t current_time_ms) const;
+
+ private:
+ explicit ResultRetrieverV2(
+ const DocumentStore* doc_store,
+ std::unique_ptr<SnippetRetriever> snippet_retriever,
+ std::unique_ptr<const GroupResultLimiterV2> group_result_limiter)
+ : doc_store_(*doc_store),
+ snippet_retriever_(std::move(snippet_retriever)),
+ group_result_limiter_(std::move(group_result_limiter)) {}
+
+ const DocumentStore& doc_store_;
+ std::unique_ptr<SnippetRetriever> snippet_retriever_;
+ const std::unique_ptr<const GroupResultLimiterV2> group_result_limiter_;
+};
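+
+// A minimal usage sketch (illustrative only; `doc_store`, `schema_store`,
+// `language_segmenter`, and `normalizer` are assumed to be valid,
+// caller-owned components, and `result_state` an already-initialized
+// ResultStateV2):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<ResultRetrieverV2> retriever,
+//       ResultRetrieverV2::Create(doc_store, schema_store,
+//                                 language_segmenter, normalizer));
+//   auto [page_result, has_more_results] =
+//       retriever->RetrieveNextPage(result_state, current_time_ms);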
+
+} // namespace lib
+} // namespace icing
+
+#endif  // ICING_RESULT_RESULT_RETRIEVER_V2_H_
diff --git a/icing/result/result-retriever-v2_group-result-limiter_test.cc b/icing/result/result-retriever-v2_group-result-limiter_test.cc
new file mode 100644
index 0000000..2914a8d
--- /dev/null
+++ b/icing/result/result-retriever-v2_group-result-limiter_test.cc
@@ -0,0 +1,1163 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/result/page-result.h"
+#include "icing/result/result-retriever-v2.h"
+#include "icing/result/result-state-v2.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::Pair;
+using ::testing::Pointee;
+using ::testing::SizeIs;
+using ::testing::UnorderedElementsAre;
+
+class ResultRetrieverV2GroupResultLimiterTest : public testing::Test {
+ protected:
+ ResultRetrieverV2GroupResultLimiterTest()
+ : test_dir_(GetTestTempDir() + "/icing") {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ }
+
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/10000));
+
+ SchemaProto schema;
+ schema.add_types()->set_schema_type("Document");
+ schema.add_types()->set_schema_type("Message");
+ schema.add_types()->set_schema_type("Person");
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ std::move(schema), /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, test_dir_, &fake_clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(create_result.document_store);
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ const Filesystem filesystem_;
+ const std::string test_dir_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<DocumentStore> document_store_;
+ FakeClock fake_clock_;
+};
+
+ResultSpecProto CreateResultSpec(
+ int num_per_page, ResultSpecProto::ResultGroupingType result_group_type) {
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(result_group_type);
+ result_spec.set_num_per_page(num_per_page);
+ return result_spec;
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingShouldLimitResults) {
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score())};
+
+ // Create a ResultSpec that limits "namespace" to a single result.
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::NAMESPACE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("namespace");
+
+ // Creates a ResultState with 2 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+  // Only the top-ranked document in "namespace" (document2) should be
+  // returned.
+ auto [page_result, has_more_results] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ ASSERT_THAT(page_result.results, SizeIs(1));
+ EXPECT_THAT(page_result.results.at(0).document(), EqualsProto(document2));
+  // document1 was not returned because the GroupResultLimiter filtered it
+  // out, so there should be no more results.
+ EXPECT_FALSE(has_more_results);
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingHasEmptyFirstPage) {
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score())};
+
+ // Create a ResultSpec that limits "namespace" to 0 results.
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(0);
+ entry->set_namespace_("namespace");
+
+ // Creates a ResultState with 2 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // First page: empty page
+ auto [page_result, has_more_results] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ ASSERT_THAT(page_result.results, IsEmpty());
+ EXPECT_FALSE(has_more_results);
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingHasEmptyLastPage) {
+ // Creates 4 documents and ensures the relationship in terms of document
+ // score is: document1 < document2 < document3 < document4
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Document")
+ .SetScore(3)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store_->Put(document3));
+
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace", "uri/4")
+ .SetSchema("Document")
+ .SetScore(4)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store_->Put(document4));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score()),
+ ScoredDocumentHit(document_id3, kSectionIdMaskNone, document3.score()),
+ ScoredDocumentHit(document_id4, kSectionIdMaskNone, document4.score())};
+
+ // Create a ResultSpec that limits "namespace" to 2 results.
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(2);
+ entry->set_namespace_("namespace");
+
+ // Creates a ResultState with 4 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // First page: document4 and document3 should be returned.
+ auto [page_result1, has_more_results1] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ ASSERT_THAT(page_result1.results, SizeIs(2));
+ EXPECT_THAT(page_result1.results.at(0).document(), EqualsProto(document4));
+ EXPECT_THAT(page_result1.results.at(1).document(), EqualsProto(document3));
+ EXPECT_TRUE(has_more_results1);
+
+  // Second page: although there are still document hits in the result state,
+  // all of them will be filtered out by the group result limiter, so we
+  // should get an empty page.
+ auto [page_result2, has_more_results2] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(page_result2.results, SizeIs(0));
+ EXPECT_FALSE(has_more_results2);
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingDoesNotLimitOtherNamespaceResults) {
+ // Creates 4 documents and ensures the relationship in terms of document
+ // score is: document1 < document2 < document3 < document4
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace2", "uri/3")
+ .SetSchema("Document")
+ .SetScore(3)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store_->Put(document3));
+
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace2", "uri/4")
+ .SetSchema("Document")
+ .SetScore(4)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store_->Put(document4));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score()),
+ ScoredDocumentHit(document_id3, kSectionIdMaskNone, document3.score()),
+ ScoredDocumentHit(document_id4, kSectionIdMaskNone, document4.score())};
+
+ // Create a ResultSpec that limits "namespace1" to a single result, but
+ // doesn't limit "namespace2".
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::NAMESPACE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("namespace1");
+
+ // Creates a ResultState with 4 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // All documents in "namespace2" should be returned.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(3));
+ EXPECT_THAT(page_result.results.at(0).document(), EqualsProto(document4));
+ EXPECT_THAT(page_result.results.at(1).document(), EqualsProto(document3));
+ EXPECT_THAT(page_result.results.at(2).document(), EqualsProto(document2));
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingNonexistentNamespaceShouldBeIgnored) {
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score())};
+
+ // Create a ResultSpec that limits "namespace"+"nonExistentNamespace" to a
+ // single result.
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::NAMESPACE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("namespace");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("nonexistentNamespace");
+
+ // Creates a ResultState with 2 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+  // Only the top-ranked document in "namespace" (document2) should be
+  // returned. The presence of "nonexistentNamespace" in the same result
+  // grouping should have no effect.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(1));
+ EXPECT_THAT(page_result.results.at(0).document(), EqualsProto(document2));
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingNonexistentSchemaShouldBeIgnored) {
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score())};
+
+ // Create a ResultSpec that limits "Document"+"nonExistentSchema" to a
+ // single result.
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::SCHEMA_TYPE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_schema("Document");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_schema("nonexistentSchema");
+
+ // Creates a ResultState with 2 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+  // Only the top-ranked document in "Document" (document2) should be
+  // returned. The presence of "nonexistentSchema" in the same result
+  // grouping should have no effect.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(1));
+ EXPECT_THAT(page_result.results.at(0).document(), EqualsProto(document2));
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingMultiNamespaceGrouping) {
+ // Creates 6 documents and ensures the relationship in terms of document
+ // score is: document1 < document2 < document3 < document4 < document5 <
+ // document6
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace2", "uri/3")
+ .SetSchema("Document")
+ .SetScore(3)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store_->Put(document3));
+
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace2", "uri/4")
+ .SetSchema("Document")
+ .SetScore(4)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store_->Put(document4));
+
+ DocumentProto document5 = DocumentBuilder()
+ .SetKey("namespace3", "uri/5")
+ .SetSchema("Document")
+ .SetScore(5)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ document_store_->Put(document5));
+
+ DocumentProto document6 = DocumentBuilder()
+ .SetKey("namespace3", "uri/6")
+ .SetSchema("Document")
+ .SetScore(6)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id6,
+ document_store_->Put(document6));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score()),
+ ScoredDocumentHit(document_id3, kSectionIdMaskNone, document3.score()),
+ ScoredDocumentHit(document_id4, kSectionIdMaskNone, document4.score()),
+ ScoredDocumentHit(document_id5, kSectionIdMaskNone, document5.score()),
+ ScoredDocumentHit(document_id6, kSectionIdMaskNone, document6.score())};
+
+ // Create a ResultSpec that limits "namespace1" to a single result and limits
+ // "namespace2"+"namespace3" to a total of two results.
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::NAMESPACE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("namespace1");
+ result_grouping = result_spec.add_result_groupings();
+ result_grouping->set_max_results(2);
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace2");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace3");
+
+ // Creates a ResultState with 6 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Only the top-ranked result in "namespace1" (document2) should be returned.
+ // Only the top-ranked results across "namespace2" and "namespace3"
+ // (document6, document5) should be returned.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(3));
+ EXPECT_THAT(page_result.results.at(0).document(), EqualsProto(document6));
+ EXPECT_THAT(page_result.results.at(1).document(), EqualsProto(document5));
+ EXPECT_THAT(page_result.results.at(2).document(), EqualsProto(document2));
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingMultiSchemaGrouping) {
+ // Creates 6 documents and ensures the relationship in terms of document
+ // score is: document1 < document2 < document3 < document4 < document5 <
+ // document6
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Person")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Message")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace", "uri/3")
+ .SetSchema("Person")
+ .SetScore(3)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store_->Put(document3));
+
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace", "uri/4")
+ .SetSchema("Message")
+ .SetScore(4)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store_->Put(document4));
+
+ DocumentProto document5 = DocumentBuilder()
+ .SetKey("namespace", "uri/5")
+ .SetSchema("Document")
+ .SetScore(5)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ document_store_->Put(document5));
+
+ DocumentProto document6 = DocumentBuilder()
+ .SetKey("namespace", "uri/6")
+ .SetSchema("Document")
+ .SetScore(6)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id6,
+ document_store_->Put(document6));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score()),
+ ScoredDocumentHit(document_id3, kSectionIdMaskNone, document3.score()),
+ ScoredDocumentHit(document_id4, kSectionIdMaskNone, document4.score()),
+ ScoredDocumentHit(document_id5, kSectionIdMaskNone, document5.score()),
+ ScoredDocumentHit(document_id6, kSectionIdMaskNone, document6.score())};
+
+ // Create a ResultSpec that limits "namespace1" to a single result and limits
+ // "namespace2"+"namespace3" to a total of two results.
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::SCHEMA_TYPE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_schema("Document");
+ result_grouping = result_spec.add_result_groupings();
+ result_grouping->set_max_results(2);
+ entry = result_grouping->add_entry_groupings();
+ entry->set_schema("Message");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_schema("Person");
+
+ // Creates a ResultState with 6 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Only the top-ranked result in "Document" (document6) should be returned.
+ // Only the top-ranked results across "Message" and "Person"
+ // (document5, document3) should be returned.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(3));
+ EXPECT_THAT(page_result.results.at(0).document(), EqualsProto(document6));
+ EXPECT_THAT(page_result.results.at(1).document(), EqualsProto(document4));
+ EXPECT_THAT(page_result.results.at(2).document(), EqualsProto(document3));
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingMultiNamespaceAndSchemaGrouping) {
+ // Creates 6 documents and ensures the relationship in terms of document
+ // score is: document1 < document2 < document3 < document4 < document5 <
+ // document6
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace1", "uri/3")
+ .SetSchema("Document")
+ .SetScore(3)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store_->Put(document3));
+
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace2", "uri/4")
+ .SetSchema("Document")
+ .SetScore(4)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store_->Put(document4));
+
+ DocumentProto document5 = DocumentBuilder()
+ .SetKey("namespace3", "uri/5")
+ .SetSchema("Message")
+ .SetScore(5)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ document_store_->Put(document5));
+
+ DocumentProto document6 = DocumentBuilder()
+ .SetKey("namespace3", "uri/6")
+ .SetSchema("Message")
+ .SetScore(6)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id6,
+ document_store_->Put(document6));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score()),
+ ScoredDocumentHit(document_id3, kSectionIdMaskNone, document3.score()),
+ ScoredDocumentHit(document_id4, kSectionIdMaskNone, document4.score()),
+ ScoredDocumentHit(document_id5, kSectionIdMaskNone, document5.score()),
+ ScoredDocumentHit(document_id6, kSectionIdMaskNone, document6.score())};
+
+ // Create a ResultSpec that limits "namespace1" to a single result and limits
+ // "namespace2"+"namespace3" to a total of two results.
+ ResultSpecProto result_spec = CreateResultSpec(
+ /*num_per_page=*/5, ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("namespace1");
+ entry->set_schema("Document");
+ result_grouping = result_spec.add_result_groupings();
+ result_grouping->set_max_results(2);
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace2");
+ entry->set_schema("Document");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace3");
+ entry->set_schema("Message");
+
+ // Creates a ResultState with 6 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Only the top-ranked result in "namespace1xDocument" (document3)
+ // should be returned.
+ // Only the top-ranked results across "namespace2xDocument" and
+ // "namespace3xMessage" (document6, document5) should be returned.
+
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(3));
+ EXPECT_THAT(page_result.results.at(0).document(), EqualsProto(document6));
+ EXPECT_THAT(page_result.results.at(1).document(), EqualsProto(document5));
+ EXPECT_THAT(page_result.results.at(2).document(), EqualsProto(document3));
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingOnlyNonexistentNamespaces) {
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score())};
+
+ // Create a ResultSpec that limits "nonexistentNamespace" to a single result.
+ // but doesn't limit "namespace"
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::NAMESPACE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_namespace_("nonexistentNamespace");
+
+ // Creates a ResultState with 2 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // All documents in "namespace" should be returned. The presence of
+ // "nonexistentNamespace" should have no effect.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+ EXPECT_THAT(page_result.results.at(0).document(), EqualsProto(document2));
+ EXPECT_THAT(page_result.results.at(1).document(), EqualsProto(document1));
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ResultGroupingOnlyNonexistentSchemas) {
+ // Creates 2 documents and ensures the relationship in terms of document
+ // score is: document1 < document2
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score())};
+
+ // Create a ResultSpec that limits "nonexistentSchema" to a single result.
+ // but doesn't limit "Document"
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::SCHEMA_TYPE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(1);
+ entry->set_schema("nonexistentSchema");
+
+ // Creates a ResultState with 2 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // All documents in "Document" should be returned. The presence of
+ // "nonexistentDocument" should have no effect.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+ EXPECT_THAT(page_result.results.at(0).document(), EqualsProto(document2));
+ EXPECT_THAT(page_result.results.at(1).document(), EqualsProto(document1));
+}
+
+TEST_F(ResultRetrieverV2GroupResultLimiterTest,
+ ShouldUpdateResultStateCorrectlyWithGroupResultLimiter) {
+  // Creates 5 documents and ensures that, in terms of document score,
+  // document1 < document2 < document3 < document4 < document5.
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace2", "uri/1")
+ .SetSchema("Document")
+ .SetScore(1)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document1));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri/2")
+ .SetSchema("Document")
+ .SetScore(2)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document2));
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace1", "uri/3")
+ .SetSchema("Document")
+ .SetScore(3)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store_->Put(document3));
+
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace2", "uri/4")
+ .SetSchema("Document")
+ .SetScore(4)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store_->Put(document4));
+
+ DocumentProto document5 = DocumentBuilder()
+ .SetKey("namespace2", "uri/5")
+ .SetSchema("Document")
+ .SetScore(5)
+ .SetCreationTimestampMs(1000)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ document_store_->Put(document5));
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ ScoredDocumentHit(document_id1, kSectionIdMaskNone, document1.score()),
+ ScoredDocumentHit(document_id2, kSectionIdMaskNone, document2.score()),
+ ScoredDocumentHit(document_id3, kSectionIdMaskNone, document3.score()),
+ ScoredDocumentHit(document_id4, kSectionIdMaskNone, document4.score()),
+ ScoredDocumentHit(document_id5, kSectionIdMaskNone, document5.score())};
+
+ // Create a ResultSpec that limits "namespace1" to 3 results and "namespace2"
+ // to a single result.
+ ResultSpecProto::ResultGroupingType result_grouping_type =
+ ResultSpecProto::NAMESPACE;
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/2, result_grouping_type);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(3);
+ entry->set_namespace_("namespace1");
+ result_grouping = result_spec.add_result_groupings();
+ result_grouping->set_max_results(1);
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace2");
+
+ // Get corpus ids.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ CorpusId corpus_id1, document_store_->GetResultGroupingEntryId(
+ result_grouping_type, "namespace1", "Document"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ CorpusId corpus_id2, document_store_->GetResultGroupingEntryId(
+ result_grouping_type, "namespace2", "Document"));
+
+ // Creates a ResultState with 5 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *document_store_);
+ {
+ absl_ports::shared_lock l(&result_state.mutex);
+
+ ASSERT_THAT(result_state.entry_id_group_id_map(),
+ UnorderedElementsAre(Pair(corpus_id1, 0), Pair(corpus_id2, 1)));
+ ASSERT_THAT(result_state.group_result_limits, ElementsAre(3, 1));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+  // document5, document4, document1 belong to namespace2 (with max_results =
+  // 1).
+  // document3, document2 belong to namespace1 (with max_results = 3).
+  // Since num_per_page is 2, we expect to get document5 and document3 in the
+  // first page.
+ auto [page_result1, has_more_results1] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ ASSERT_THAT(page_result1.results, SizeIs(2));
+ ASSERT_THAT(page_result1.results.at(0).document(), EqualsProto(document5));
+ ASSERT_THAT(page_result1.results.at(1).document(), EqualsProto(document3));
+ ASSERT_TRUE(has_more_results1);
+ {
+ absl_ports::shared_lock l(&result_state.mutex);
+
+    // Should remove document5, document4 and document3 from
+    // scored_document_hits. More than num_per_page documents are removed
+    // because document4 is filtered out by GroupResultLimiter, so
+    // ResultRetriever keeps fetching until it has returned num_per_page
+    // documents or scored_document_hits is exhausted.
+ ScoredDocumentHit scored_document_hit1(document_id1, kSectionIdMaskNone,
+ document1.score());
+ ScoredDocumentHit scored_document_hit2(document_id2, kSectionIdMaskNone,
+ document2.score());
+ EXPECT_THAT(result_state.scored_document_hits_ranker, Pointee(SizeIs(2)));
+
+    // Even though we removed 3 document hits from scored_document_hits this
+    // round, num_returned should still be 2, since document4 was "filtered
+    // out" and should not be counted toward num_returned.
+ EXPECT_THAT(result_state.num_returned, Eq(2));
+    // entry_id_group_id_map should be unchanged.
+ EXPECT_THAT(result_state.entry_id_group_id_map(),
+ UnorderedElementsAre(Pair(corpus_id1, 0), Pair(corpus_id2, 1)));
+    // GroupResultLimiter should decrement the counts in group_result_limits.
+ EXPECT_THAT(result_state.group_result_limits, ElementsAre(2, 0));
+ }
+
+  // Although document2 and document1 remain, namespace2 has already reached
+  // its max results, so document1 should be excluded from the second page.
+ auto [page_result2, has_more_results2] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ ASSERT_THAT(page_result2.results, SizeIs(1));
+ ASSERT_THAT(page_result2.results.at(0).document(), EqualsProto(document2));
+ ASSERT_FALSE(has_more_results2);
+ {
+ absl_ports::shared_lock l(&result_state.mutex);
+
+ // Should remove document2 and document1 from scored_document_hits.
+ EXPECT_THAT(result_state.scored_document_hits_ranker, Pointee(IsEmpty()));
+    // Even though we removed 2 document hits from scored_document_hits this
+    // round, num_returned should only be incremented by 1 (and thus become 3),
+    // since document1 was "filtered out" and should not be counted toward
+    // num_returned.
+ EXPECT_THAT(result_state.num_returned, Eq(3));
+    // entry_id_group_id_map should be unchanged.
+ EXPECT_THAT(result_state.entry_id_group_id_map(),
+ UnorderedElementsAre(Pair(corpus_id1, 0), Pair(corpus_id2, 1)));
+    // GroupResultLimiter should decrement the counts in group_result_limits.
+ EXPECT_THAT(result_state.group_result_limits, ElementsAre(1, 0));
+ }
+}
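+
+// A comment-only recap of the bookkeeping this test asserts (values taken
+// straight from the assertions above, not an independent spec):
+//   group_result_limits starts as {3, 1} for {namespace1, namespace2}.
+//   Page 1 returns document5 (namespace2) and document3 (namespace1), and
+//   drops document4 because namespace2's limit is exhausted -> {2, 0}.
+//   Page 2 returns document2 (namespace1) and drops document1 -> {1, 0},
+//   leaving num_returned at 3.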
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/result-retriever-v2_projection_test.cc b/icing/result/result-retriever-v2_projection_test.cc
new file mode 100644
index 0000000..1a75631
--- /dev/null
+++ b/icing/result/result-retriever-v2_projection_test.cc
@@ -0,0 +1,1957 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <limits>
+#include <memory>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/result/page-result.h"
+#include "icing/result/projection-tree.h"
+#include "icing/result/result-adjustment-info.h"
+#include "icing/result/result-retriever-v2.h"
+#include "icing/result/result-state-v2.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::SizeIs;
+
+class ResultRetrieverV2ProjectionTest : public testing::Test {
+ protected:
+ ResultRetrieverV2ProjectionTest() : test_dir_(GetTestTempDir() + "/icing") {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ }
+
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/10000));
+
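+    // A brief summary of the schema built below (for readability only):
+    //   Email  { name, body, sender -> Person }
+    //   Person { name, emailAddress }
+    //   Artist : Person;  Musician : Artist
+    //   WithPhone { phoneNumber, phoneModel }
+    //   PersonWithPhone : Person, WithPhone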
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Artist")
+ .AddParentType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Musician")
+ .AddParentType("Artist")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("WithPhone")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("phoneNumber")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("phoneModel")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("PersonWithPhone")
+ .AddParentType("Person")
+ .AddParentType("WithPhone")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("phoneNumber")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("phoneModel")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build())
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, test_dir_, &fake_clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(create_result.document_store);
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ SectionId GetSectionId(const std::string& type, const std::string& property) {
+ auto type_id_or = schema_store_->GetSchemaTypeId(type);
+ if (!type_id_or.ok()) {
+ return kInvalidSectionId;
+ }
+ SchemaTypeId type_id = type_id_or.ValueOrDie();
+ for (SectionId section_id = 0; section_id <= kMaxSectionId; ++section_id) {
+ auto metadata_or = schema_store_->GetSectionMetadata(type_id, section_id);
+ if (!metadata_or.ok()) {
+ break;
+ }
+ const SectionMetadata* metadata = metadata_or.ValueOrDie();
+ if (metadata->path == property) {
+ return metadata->id;
+ }
+ }
+ return kInvalidSectionId;
+ }
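+
+  // Note: GetSectionId above does a linear scan over the type's section
+  // metadata and falls back to kInvalidSectionId for unknown types or
+  // properties; the tests below only look up properties defined in the
+  // schema, so the fallback is never hit here.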
+
+ const Filesystem filesystem_;
+ const std::string test_dir_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<DocumentStore> document_store_;
+ FakeClock fake_clock_;
+};
+
+SectionIdMask CreateSectionIdMask(const std::vector<SectionId>& section_ids) {
+ SectionIdMask mask = 0;
+ for (SectionId section_id : section_ids) {
+ mask |= (UINT64_C(1) << section_id);
+ }
+ return mask;
+}
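+
+// Worked example (hypothetical ids): for section_ids {0, 2} the loop above
+// yields (UINT64_C(1) << 0) | (UINT64_C(1) << 2) == 0b101, i.e. one bit set
+// per hit section in a 64-bit SectionIdMask.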
+
+SearchSpecProto CreateSearchSpec(TermMatchType::Code match_type) {
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(match_type);
+ return search_spec;
+}
+
+ScoringSpecProto CreateScoringSpec(bool is_descending_order) {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_order_by(is_descending_order ? ScoringSpecProto::Order::DESC
+ : ScoringSpecProto::Order::ASC);
+ return scoring_spec;
+}
+
+ResultSpecProto CreateResultSpec(int num_per_page) {
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(num_per_page);
+ return result_spec;
+}
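+
+// These three helpers build the minimal specs each projection test needs;
+// every test then attaches TypePropertyMask entries to the ResultSpec to
+// select which properties survive projection.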
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionTopLevelLeafNodeFieldPath) {
+ // 1. Add two Email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+  // 2. Set up the scored results.
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* type_property_mask = result_spec.add_type_property_masks();
+ type_property_mask->set_schema_type("Email");
+ type_property_mask->add_paths("name");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned results only contain the 'name' property.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Goodnight Moon!")
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionNestedLeafNodeFieldPath) {
+ // 1. Add two Email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
+ .Build())
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .AddStringProperty("name", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+  // 2. Set up the scored results.
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* type_property_mask = result_spec.add_type_property_masks();
+ type_property_mask->set_schema_type("Email");
+ type_property_mask->add_paths("sender.name");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned results only contain the 'sender.name'
+ // property.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty("sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .Build())
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty("sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .Build())
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionIntermediateNodeFieldPath) {
+ // 1. Add two Email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
+ .Build())
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .AddStringProperty("name", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+  // 2. Set up the scored results.
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* type_property_mask = result_spec.add_type_property_masks();
+ type_property_mask->set_schema_type("Email");
+ type_property_mask->add_paths("sender");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned results only contain the 'sender'
+ // property and all of the subproperties of 'sender'.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
+ .Build())
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionMultipleNestedFieldPaths) {
+ // 1. Add two Email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
+ .Build())
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .AddStringProperty("name", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+  // 2. Set up the scored results.
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* type_property_mask = result_spec.add_type_property_masks();
+ type_property_mask->set_schema_type("Email");
+ type_property_mask->add_paths("sender.name");
+ type_property_mask->add_paths("sender.emailAddress");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+  // 5. Verify that the returned results only contain the 'sender.name' and
+  // 'sender.emailAddress' properties.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
+ .Build())
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty(
+ "sender", DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Tom Hanks")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build())
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionEmptyFieldPath) {
+ // 1. Add two Email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+  // 2. Set up the scored results.
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* type_property_mask = result_spec.add_type_property_masks();
+ type_property_mask->set_schema_type("Email");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned results contain *no* properties.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionInvalidFieldPath) {
+ // 1. Add two Email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+  // 2. Set up the scored results.
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* type_property_mask = result_spec.add_type_property_masks();
+ type_property_mask->set_schema_type("Email");
+ type_property_mask->add_paths("nonExistentProperty");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned results contain *no* properties.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one = DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionValidAndInvalidFieldPath) {
+ // 1. Add two Email documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+  // 2. Set up the scored results.
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* type_property_mask = result_spec.add_type_property_masks();
+ type_property_mask->set_schema_type("Email");
+ type_property_mask->add_paths("name");
+ type_property_mask->add_paths("nonExistentProperty");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned results only contain the 'name' property.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Goodnight Moon!")
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionMultipleTypesNoWildcards) {
+ // 1. Add two documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+  // 2. Set up the scored results.
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* type_property_mask = result_spec.add_type_property_masks();
+ type_property_mask->set_schema_type("Email");
+ type_property_mask->add_paths("name");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+      /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned Email results only contain the 'name'
+ // property and the returned Person results have all of their properties.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionMultipleTypesWildcard) {
+ // 1. Add two documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+  // 2. Set up the scored results.
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* wildcard_type_property_mask =
+ result_spec.add_type_property_masks();
+ wildcard_type_property_mask->set_schema_type(
+ std::string(SchemaStore::kSchemaTypeWildcard));
+ wildcard_type_property_mask->add_paths("name");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned Email results only contain the 'name'
+ // property and the returned Person results only contain the 'name' property.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest,
+ ProjectionMultipleTypesWildcardWithOneOverride) {
+ // 1. Add two documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+  // 2. Set up the scored results.
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* email_type_property_mask =
+ result_spec.add_type_property_masks();
+ email_type_property_mask->set_schema_type("Email");
+ email_type_property_mask->add_paths("body");
+ TypePropertyMask* wildcard_type_property_mask =
+ result_spec.add_type_property_masks();
+ wildcard_type_property_mask->set_schema_type(
+ std::string(SchemaStore::kSchemaTypeWildcard));
+ wildcard_type_property_mask->add_paths("name");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned Email results only contain the 'body'
+ // property and the returned Person results only contain the 'name' property.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
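+
+// As the two wildcard tests above show, a type-specific TypePropertyMask
+// overrides the wildcard entry: "Email" documents use the explicit "Email"
+// mask while every other type falls back to the wildcard mask.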
+
+TEST_F(ResultRetrieverV2ProjectionTest,
+ ProjectionSingleTypesWildcardAndOverride) {
+ // 1. Add two documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Mr. Body")
+ .AddStringProperty("emailAddress", "mr.body123@gmail.com")
+ .Build())
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+  // 2. Set up the scored results.
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* email_type_property_mask =
+ result_spec.add_type_property_masks();
+ email_type_property_mask->set_schema_type("Email");
+ email_type_property_mask->add_paths("sender.name");
+ TypePropertyMask* wildcard_type_property_mask =
+ result_spec.add_type_property_masks();
+ wildcard_type_property_mask->set_schema_type(
+ std::string(SchemaStore::kSchemaTypeWildcard));
+ wildcard_type_property_mask->add_paths("name");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned Email results only contain the 'sender.name'
+ // property and the returned Person results only contain the 'name' property.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty("sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Mr. Body")
+ .Build())
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest,
+ ProjectionSingleTypesWildcardAndOverrideNestedProperty) {
+ // 1. Add two documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .AddDocumentProperty(
+ "sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Mr. Body")
+ .AddStringProperty("emailAddress", "mr.body123@gmail.com")
+ .Build())
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+  // 2. Set up the scored results.
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* email_type_property_mask =
+ result_spec.add_type_property_masks();
+ email_type_property_mask->set_schema_type("Email");
+ email_type_property_mask->add_paths("sender.name");
+ TypePropertyMask* wildcard_type_property_mask =
+ result_spec.add_type_property_masks();
+ wildcard_type_property_mask->set_schema_type(
+ std::string(SchemaStore::kSchemaTypeWildcard));
+ wildcard_type_property_mask->add_paths("sender");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned Email results only contain the 'sender.name'
+ // property and the returned Person results contain no properties.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddDocumentProperty("sender",
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetSchema("Person")
+ .AddStringProperty("name", "Mr. Body")
+ .Build())
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two = DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionJoinDocuments) {
+ // 1. Add one Person document
+ DocumentProto person_document =
+ DocumentBuilder()
+ .SetKey("namespace", "Person/1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId person_document_id,
+ document_store_->Put(person_document));
+
+ // 2. Add two Email documents
+ DocumentProto email_document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "Email/1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Hello World!")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id1,
+ document_store_->Put(email_document1));
+
+ DocumentProto email_document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "Email/2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Goodnight Moon!")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id2,
+ document_store_->Put(email_document2));
+
+ // 3. Setup the joined scored results.
+ std::vector<SectionId> person_hit_section_ids = {
+ GetSectionId("Person", "name")};
+ std::vector<SectionId> email_hit_section_ids = {
+ GetSectionId("Email", "name"), GetSectionId("Email", "body")};
+ SectionIdMask person_hit_section_id_mask =
+ CreateSectionIdMask(person_hit_section_ids);
+ SectionIdMask email_hit_section_id_mask =
+ CreateSectionIdMask(email_hit_section_ids);
+
+ ScoredDocumentHit person_scored_doc_hit(
+ person_document_id, person_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit email1_scored_doc_hit(
+ email_document_id1, email_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit email2_scored_doc_hit(
+ email_document_id2, email_hit_section_id_mask, /*score=*/0);
+ // Create JoinedScoredDocumentHits mapping Person to Email1 and Email2
+ std::vector<JoinedScoredDocumentHit> joined_scored_document_hits = {
+ JoinedScoredDocumentHit(
+ /*final_score=*/0,
+ /*parent_scored_document_hit=*/person_scored_doc_hit,
+ /*child_scored_document_hits=*/
+ {email1_scored_doc_hit, email2_scored_doc_hit})};
+
+ // 4. Create parent ResultSpec with type property mask.
+ ResultSpecProto parent_result_spec = CreateResultSpec(/*num_per_page=*/2);
+ parent_result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int>::max());
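+  // Return all joined children so that both Email documents appear under the
+  // Person parent.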
+ TypePropertyMask* type_property_mask =
+ parent_result_spec.add_type_property_masks();
+ type_property_mask->set_schema_type("Person");
+ type_property_mask->add_paths("name");
+
+ // 5. Create child ResultSpec with type property mask.
+ ResultSpecProto child_result_spec;
+ type_property_mask = child_result_spec.add_type_property_masks();
+ type_property_mask->set_schema_type("Email");
+ type_property_mask->add_paths("body");
+
+ // 6. Create ResultState with custom ResultSpecs.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<JoinedScoredDocumentHit>>(
+ std::move(joined_scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), parent_result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), child_result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ parent_result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 7. Verify that the returned results:
+ // - Person docs only contain the "name" property.
+ // - Email docs only contain the "body" property.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(1));
+
+ // Check parent doc.
+ DocumentProto projected_person_document =
+ DocumentBuilder()
+ .SetKey("namespace", "Person/1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_person_document));
+
+ // Check child docs.
+ ASSERT_THAT(page_result.results.at(0).joined_results(), SizeIs(2));
+ // Check Email1
+ DocumentProto projected_email_document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "Email/1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty(
+ "body", "Oh what a beautiful morning! Oh what a beautiful day!")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).joined_results(0).document(),
+ EqualsProto(projected_email_document1));
+ // Check Email2
+ DocumentProto projected_email_document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "Email/2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("body",
+ "Count all the sheep and tell them 'Hello'.")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).joined_results(1).document(),
+ EqualsProto(projected_email_document2));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionPolymorphism) {
+ // 1. Add two documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Artist")
+ .AddStringProperty("name", "Joe Artist")
+ .AddStringProperty("emailAddress", "artist@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+ // 2. Setup the scored results.
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, kSectionIdMaskAll, /*score=*/0},
+ {document_id2, kSectionIdMaskAll, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ // Since Artist is a child type of Person, the TypePropertyMask for Person
+ // also applies to Artist.
+ TypePropertyMask* person_type_property_mask =
+ result_spec.add_type_property_masks();
+ person_type_property_mask->set_schema_type("Person");
+ person_type_property_mask->add_paths("name");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned Person and Artist results only contain the
+ // 'name' property.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Artist")
+ .AddStringProperty("name", "Joe Artist")
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionTransitivePolymorphism) {
+ // 1. Add two documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Musician")
+ .AddStringProperty("name", "Joe Musician")
+ .AddStringProperty("emailAddress", "Musician@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+ // 2. Setup the scored results.
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, kSectionIdMaskAll, /*score=*/0},
+ {document_id2, kSectionIdMaskAll, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ // Since Musician is a transitive child type of Person, the TypePropertyMask
+ // for Person also applies to Musician.
+ TypePropertyMask* person_type_property_mask =
+ result_spec.add_type_property_masks();
+ person_type_property_mask->set_schema_type("Person");
+ person_type_property_mask->add_paths("name");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned Person and Musician results only contain the
+ // 'name' property.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Musician")
+ .AddStringProperty("name", "Joe Musician")
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest,
+ ProjectionPolymorphismChildMissingProperty) {
+  // 1. Add an Artist document that is missing 'emailAddress', which is
+  // allowed since 'emailAddress' is defined as optional in the parent type
+  // 'Person'.
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Artist")
+ .AddStringProperty("name", "Joe Artist")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(document));
+
+ // 2. Setup the scored results.
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id, kSectionIdMaskAll, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask for the missing property
+ // 'emailAddress' in the Person type. Since Artist is a child type of Person,
+ // the TypePropertyMask for Person also applies to Artist.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* person_type_property_mask =
+ result_spec.add_type_property_masks();
+ person_type_property_mask->set_schema_type("Person");
+ person_type_property_mask->add_paths("emailAddress");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+  // 5. Verify that the returned Artist document does not contain any
+  // properties, since 'emailAddress' is missing from the document.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(1));
+ DocumentProto projected_document = DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Artist")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionPolymorphismMerge) {
+ // 1. Add two documents
+ DocumentProto document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store_->Put(document_one));
+
+ DocumentProto document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Artist")
+ .AddStringProperty("name", "Joe Artist")
+ .AddStringProperty("emailAddress", "artist@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store_->Put(document_two));
+
+ // 2. Setup the scored results.
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, kSectionIdMaskAll, /*score=*/0},
+ {document_id2, kSectionIdMaskAll, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
+ TypePropertyMask* person_type_property_mask =
+ result_spec.add_type_property_masks();
+ person_type_property_mask->set_schema_type("Person");
+ person_type_property_mask->add_paths("name");
+ // Since Artist is a child type of Person, the TypePropertyMask for Person
+ // will be merged to Artist's TypePropertyMask by polymorphism, so that 'name'
+ // will also show in Artist's projection results.
+ TypePropertyMask* artist_type_property_mask =
+ result_spec.add_type_property_masks();
+ artist_type_property_mask->set_schema_type("Artist");
+ artist_type_property_mask->add_paths("emailAddress");
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // 5. Verify that the returned Person results only contain the 'name'
+ // property and the returned Artist results contain both the 'name' and
+ // 'emailAddress' properties.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ DocumentProto projected_document_one =
+ DocumentBuilder()
+ .SetKey("namespace", "uri1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document_one));
+
+ DocumentProto projected_document_two =
+ DocumentBuilder()
+ .SetKey("namespace", "uri2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Artist")
+ .AddStringProperty("name", "Joe Artist")
+ .AddStringProperty("emailAddress", "artist@aol.com")
+ .Build();
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(projected_document_two));
+}
+
+TEST_F(ResultRetrieverV2ProjectionTest, ProjectionMultipleParentPolymorphism) {
+ // 1. Add a document
+ DocumentProto document = DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("PersonWithPhone")
+ .AddStringProperty("name", "name")
+ .AddStringProperty("emailAddress", "email")
+ .AddStringProperty("phoneNumber", "12345")
+ .AddStringProperty("phoneModel", "pixel")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(document));
+
+ // 2. Setup the scored results.
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id, kSectionIdMaskAll, /*score=*/0}};
+
+ // 3. Create a ResultSpec with type property mask.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
+  // Since PersonWithPhone is a child type of Person, the TypePropertyMask for
+  // Person also applies to PersonWithPhone.
+ TypePropertyMask* person_type_property_mask =
+ result_spec.add_type_property_masks();
+ person_type_property_mask->set_schema_type("Person");
+ person_type_property_mask->add_paths("name");
+  // Since PersonWithPhone is also a child type of WithPhone, the
+  // TypePropertyMask for WithPhone also applies to PersonWithPhone.
+ TypePropertyMask* with_phone_type_property_mask =
+ result_spec.add_type_property_masks();
+ with_phone_type_property_mask->set_schema_type("WithPhone");
+ with_phone_type_property_mask->add_paths("phoneNumber");
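+  // Both parent masks apply, so their projections are unioned: 'name' comes
+  // from the Person mask and 'phoneNumber' from the WithPhone mask.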
+
+ // 4. Create ResultState with custom ResultSpec.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+  // 5. Verify that the returned document only contains the 'name' and
+  // 'phoneNumber' properties.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(1));
+
+ DocumentProto projected_document =
+ DocumentBuilder()
+ .SetKey("namespace", "uri")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("PersonWithPhone")
+ .AddStringProperty("name", "name")
+ .AddStringProperty("phoneNumber", "12345")
+ .Build();
+ EXPECT_THAT(page_result.results.at(0).document(),
+ EqualsProto(projected_document));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/result-retriever-v2_snippet_test.cc b/icing/result/result-retriever-v2_snippet_test.cc
new file mode 100644
index 0000000..440d31c
--- /dev/null
+++ b/icing/result/result-retriever-v2_snippet_test.cc
@@ -0,0 +1,1162 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <limits>
+#include <memory>
+#include <string_view>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/result/page-result.h"
+#include "icing/result/result-adjustment-info.h"
+#include "icing/result/result-retriever-v2.h"
+#include "icing/result/result-state-v2.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/snippet-helpers.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::Not;
+using ::testing::SizeIs;
+
+class ResultRetrieverV2SnippetTest : public testing::Test {
+ protected:
+ ResultRetrieverV2SnippetTest() : test_dir_(GetTestTempDir() + "/icing") {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ }
+
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/10000));
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, test_dir_, &fake_clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(create_result.document_store);
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
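+  // Returns the SectionId assigned to `property` of schema `type` by linearly
+  // scanning the type's section metadata, or kInvalidSectionId if no match is
+  // found.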
+ SectionId GetSectionId(const std::string& type, const std::string& property) {
+ auto type_id_or = schema_store_->GetSchemaTypeId(type);
+ if (!type_id_or.ok()) {
+ return kInvalidSectionId;
+ }
+ SchemaTypeId type_id = type_id_or.ValueOrDie();
+ for (SectionId section_id = 0; section_id <= kMaxSectionId; ++section_id) {
+ auto metadata_or = schema_store_->GetSectionMetadata(type_id, section_id);
+ if (!metadata_or.ok()) {
+ break;
+ }
+ const SectionMetadata* metadata = metadata_or.ValueOrDie();
+ if (metadata->path == property) {
+ return metadata->id;
+ }
+ }
+ return kInvalidSectionId;
+ }
+
+ const Filesystem filesystem_;
+ const std::string test_dir_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<DocumentStore> document_store_;
+ FakeClock fake_clock_;
+};
+
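+// Returns a snippet spec that requests snippeting for effectively all results
+// and all matches (both limits set to INT_MAX), with a 1024-codepoint snippet
+// window.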
+ResultSpecProto::SnippetSpecProto CreateSnippetSpec() {
+ ResultSpecProto::SnippetSpecProto snippet_spec;
+ snippet_spec.set_num_to_snippet(std::numeric_limits<int>::max());
+ snippet_spec.set_num_matches_per_property(std::numeric_limits<int>::max());
+ snippet_spec.set_max_window_utf32_length(1024);
+ return snippet_spec;
+}
+
+DocumentProto CreateEmailDocument(int id) {
+ return DocumentBuilder()
+ .SetKey("icing", "Email/" + std::to_string(id))
+ .SetSchema("Email")
+ .AddStringProperty("subject", "subject foo " + std::to_string(id))
+ .AddStringProperty("body", "body bar " + std::to_string(id))
+ .SetCreationTimestampMs(1574365086666 + id)
+ .Build();
+}
+
+DocumentProto CreatePersonDocument(int id) {
+ return DocumentBuilder()
+ .SetKey("icing", "Person/" + std::to_string(id))
+ .SetSchema("Person")
+ .AddStringProperty("name", "person " + std::to_string(id))
+ .SetCreationTimestampMs(1574365086666 + id)
+ .Build();
+}
+
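+// Builds a bitmask with bit `section_id` set for each section in
+// `section_ids`, marking which sections of a document contained query hits.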
+SectionIdMask CreateSectionIdMask(const std::vector<SectionId>& section_ids) {
+ SectionIdMask mask = 0;
+ for (SectionId section_id : section_ids) {
+ mask |= (UINT64_C(1) << section_id);
+ }
+ return mask;
+}
+
+SearchSpecProto CreateSearchSpec(TermMatchType::Code match_type) {
+ SearchSpecProto search_spec;
+ search_spec.set_term_match_type(match_type);
+ return search_spec;
+}
+
+ScoringSpecProto CreateScoringSpec(bool is_descending_order) {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_order_by(is_descending_order ? ScoringSpecProto::Order::DESC
+ : ScoringSpecProto::Order::ASC);
+ return scoring_spec;
+}
+
+ResultSpecProto CreateResultSpec(int num_per_page) {
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(num_per_page);
+ return result_spec;
+}
+
+TEST_F(ResultRetrieverV2SnippetTest,
+ DefaultSnippetSpecShouldDisableSnippeting) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(CreateEmailDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(CreateEmailDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id3,
+ document_store_->Put(CreateEmailDocument(/*id=*/3)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "subject"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0},
+ {document_id3, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/3);
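+  // result_spec.snippet_spec is left at its default instance, which requests
+  // zero snippets, so no result on this page should be snippeted.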
+
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/true), result_spec,
+ schema_store_.get(), SectionRestrictQueryTermsMap()),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(3));
+ EXPECT_THAT(page_result.results.at(0).snippet(),
+ EqualsProto(SnippetProto::default_instance()));
+ EXPECT_THAT(page_result.results.at(1).snippet(),
+ EqualsProto(SnippetProto::default_instance()));
+ EXPECT_THAT(page_result.results.at(2).snippet(),
+ EqualsProto(SnippetProto::default_instance()));
+ EXPECT_THAT(page_result.num_results_with_snippets, Eq(0));
+}
+
+TEST_F(ResultRetrieverV2SnippetTest, SimpleSnippeted) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(CreateEmailDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(CreateEmailDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id3,
+ document_store_->Put(CreateEmailDocument(/*id=*/3)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "subject"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0},
+ {document_id3, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Create ResultSpec with custom snippet spec.
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/3);
+ *result_spec.mutable_snippet_spec() = CreateSnippetSpec();
+
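+  // The "" key below holds query terms with no section restrict, so "foo" and
+  // "bar" can produce snippet matches in any property.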
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(),
+ SectionRestrictQueryTermsMap({{"", {"foo", "bar"}}})),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(3));
+ EXPECT_THAT(page_result.num_results_with_snippets, Eq(3));
+
+ const DocumentProto& result_document_one =
+ page_result.results.at(0).document();
+ const SnippetProto& result_snippet_one = page_result.results.at(0).snippet();
+ EXPECT_THAT(result_document_one, EqualsProto(CreateEmailDocument(/*id=*/1)));
+ EXPECT_THAT(result_snippet_one.entries(), SizeIs(2));
+ EXPECT_THAT(result_snippet_one.entries(0).property_name(), Eq("body"));
+ std::string_view content = GetString(
+ &result_document_one, result_snippet_one.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_snippet_one.entries(0)),
+ ElementsAre("body bar 1"));
+ EXPECT_THAT(GetMatches(content, result_snippet_one.entries(0)),
+ ElementsAre("bar"));
+ EXPECT_THAT(result_snippet_one.entries(1).property_name(), Eq("subject"));
+ content = GetString(&result_document_one,
+ result_snippet_one.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, result_snippet_one.entries(1)),
+ ElementsAre("subject foo 1"));
+ EXPECT_THAT(GetMatches(content, result_snippet_one.entries(1)),
+ ElementsAre("foo"));
+
+ const DocumentProto& result_document_two =
+ page_result.results.at(1).document();
+ const SnippetProto& result_snippet_two = page_result.results.at(1).snippet();
+ EXPECT_THAT(result_document_two, EqualsProto(CreateEmailDocument(/*id=*/2)));
+ EXPECT_THAT(result_snippet_two.entries(), SizeIs(2));
+ EXPECT_THAT(result_snippet_two.entries(0).property_name(), Eq("body"));
+ content = GetString(&result_document_two,
+ result_snippet_two.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_snippet_two.entries(0)),
+ ElementsAre("body bar 2"));
+ EXPECT_THAT(GetMatches(content, result_snippet_two.entries(0)),
+ ElementsAre("bar"));
+ EXPECT_THAT(result_snippet_two.entries(1).property_name(), Eq("subject"));
+ content = GetString(&result_document_two,
+ result_snippet_two.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, result_snippet_two.entries(1)),
+ ElementsAre("subject foo 2"));
+ EXPECT_THAT(GetMatches(content, result_snippet_two.entries(1)),
+ ElementsAre("foo"));
+
+ const DocumentProto& result_document_three =
+ page_result.results.at(2).document();
+ const SnippetProto& result_snippet_three =
+ page_result.results.at(2).snippet();
+ EXPECT_THAT(result_document_three,
+ EqualsProto(CreateEmailDocument(/*id=*/3)));
+ EXPECT_THAT(result_snippet_three.entries(), SizeIs(2));
+ EXPECT_THAT(result_snippet_three.entries(0).property_name(), Eq("body"));
+ content = GetString(&result_document_three,
+ result_snippet_three.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_snippet_three.entries(0)),
+ ElementsAre("body bar 3"));
+ EXPECT_THAT(GetMatches(content, result_snippet_three.entries(0)),
+ ElementsAre("bar"));
+ EXPECT_THAT(result_snippet_three.entries(1).property_name(), Eq("subject"));
+ content = GetString(&result_document_three,
+ result_snippet_three.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, result_snippet_three.entries(1)),
+ ElementsAre("subject foo 3"));
+ EXPECT_THAT(GetMatches(content, result_snippet_three.entries(1)),
+ ElementsAre("foo"));
+}
+
+TEST_F(ResultRetrieverV2SnippetTest, OnlyOneDocumentSnippeted) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(CreateEmailDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(CreateEmailDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id3,
+ document_store_->Put(CreateEmailDocument(/*id=*/3)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "subject"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0},
+ {document_id3, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Create ResultSpec with custom snippet spec.
+ ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
+ snippet_spec.set_num_to_snippet(1);
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/3);
+ *result_spec.mutable_snippet_spec() = std::move(snippet_spec);
+
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(),
+ SectionRestrictQueryTermsMap({{"", {"foo", "bar"}}})),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(3));
+ EXPECT_THAT(page_result.num_results_with_snippets, Eq(1));
+
+ const DocumentProto& result_document = page_result.results.at(0).document();
+ const SnippetProto& result_snippet = page_result.results.at(0).snippet();
+ EXPECT_THAT(result_document, EqualsProto(CreateEmailDocument(/*id=*/1)));
+ EXPECT_THAT(result_snippet.entries(), SizeIs(2));
+ EXPECT_THAT(result_snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&result_document, result_snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_snippet.entries(0)),
+ ElementsAre("body bar 1"));
+ EXPECT_THAT(GetMatches(content, result_snippet.entries(0)),
+ ElementsAre("bar"));
+ EXPECT_THAT(result_snippet.entries(1).property_name(), Eq("subject"));
+ content =
+ GetString(&result_document, result_snippet.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, result_snippet.entries(1)),
+ ElementsAre("subject foo 1"));
+ EXPECT_THAT(GetMatches(content, result_snippet.entries(1)),
+ ElementsAre("foo"));
+
+ EXPECT_THAT(page_result.results.at(1).document(),
+ EqualsProto(CreateEmailDocument(/*id=*/2)));
+ EXPECT_THAT(page_result.results.at(1).snippet(),
+ EqualsProto(SnippetProto::default_instance()));
+
+ EXPECT_THAT(page_result.results.at(2).document(),
+ EqualsProto(CreateEmailDocument(/*id=*/3)));
+ EXPECT_THAT(page_result.results.at(2).snippet(),
+ EqualsProto(SnippetProto::default_instance()));
+}
+
+TEST_F(ResultRetrieverV2SnippetTest, ShouldSnippetAllResults) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(CreateEmailDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(CreateEmailDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id3,
+ document_store_->Put(CreateEmailDocument(/*id=*/3)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "subject"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0},
+ {document_id3, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Create ResultSpec with custom snippet spec.
+ ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
+ snippet_spec.set_num_to_snippet(5);
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/3);
+ *result_spec.mutable_snippet_spec() = std::move(snippet_spec);
+
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(),
+ SectionRestrictQueryTermsMap({{"", {"foo", "bar"}}})),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+  // num_to_snippet = 5, num_previously_returned = 0, so we can return at
+  // most 5 - 0 = 5 snippets. Since this page has only 3 results, all 3 are
+  // snippeted.
+ ASSERT_THAT(page_result.results, SizeIs(3));
+ EXPECT_THAT(page_result.results.at(0).snippet().entries(), Not(IsEmpty()));
+ EXPECT_THAT(page_result.results.at(1).snippet().entries(), Not(IsEmpty()));
+ EXPECT_THAT(page_result.results.at(2).snippet().entries(), Not(IsEmpty()));
+ EXPECT_THAT(page_result.num_results_with_snippets, Eq(3));
+}
+
+TEST_F(ResultRetrieverV2SnippetTest, ShouldSnippetSomeResults) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(CreateEmailDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(CreateEmailDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id3,
+ document_store_->Put(CreateEmailDocument(/*id=*/3)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "subject"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0},
+ {document_id3, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Create ResultSpec with custom snippet spec.
+ ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
+ snippet_spec.set_num_to_snippet(5);
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/3);
+ *result_spec.mutable_snippet_spec() = std::move(snippet_spec);
+
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(),
+ SectionRestrictQueryTermsMap({{"", {"foo", "bar"}}})),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+ {
+ absl_ports::unique_lock l(&result_state.mutex);
+
+ // Set remaining_num_to_snippet = 2
+ result_state.parent_adjustment_info()->remaining_num_to_snippet = 2;
+ }
+
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(3));
+ EXPECT_THAT(page_result.results.at(0).snippet().entries(), Not(IsEmpty()));
+ EXPECT_THAT(page_result.results.at(1).snippet().entries(), Not(IsEmpty()));
+ EXPECT_THAT(page_result.results.at(2).snippet().entries(), IsEmpty());
+ EXPECT_THAT(page_result.num_results_with_snippets, Eq(2));
+}
+
+TEST_F(ResultRetrieverV2SnippetTest, ShouldNotSnippetAnyResults) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(CreateEmailDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(CreateEmailDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id3,
+ document_store_->Put(CreateEmailDocument(/*id=*/3)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "subject"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0},
+ {document_id3, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Create ResultSpec with custom snippet spec.
+ ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
+ snippet_spec.set_num_to_snippet(5);
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/3);
+ *result_spec.mutable_snippet_spec() = std::move(snippet_spec);
+
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(),
+ SectionRestrictQueryTermsMap({{"", {"foo", "bar"}}})),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+ {
+ absl_ports::unique_lock l(&result_state.mutex);
+
+ // Set remaining_num_to_snippet = 0
+ result_state.parent_adjustment_info()->remaining_num_to_snippet = 0;
+ }
+
+ // We can't return any snippets for this page.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(3));
+ EXPECT_THAT(page_result.results.at(0).snippet().entries(), IsEmpty());
+ EXPECT_THAT(page_result.results.at(1).snippet().entries(), IsEmpty());
+ EXPECT_THAT(page_result.results.at(2).snippet().entries(), IsEmpty());
+ EXPECT_THAT(page_result.num_results_with_snippets, Eq(0));
+}
+
+TEST_F(ResultRetrieverV2SnippetTest,
+ ShouldNotSnippetAnyResultsForNonPositiveNumMatchesPerProperty) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store_->Put(CreateEmailDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store_->Put(CreateEmailDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id3,
+ document_store_->Put(CreateEmailDocument(/*id=*/3)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "subject"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0},
+ {document_id3, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Create ResultSpec with custom snippet spec.
+ ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
+ snippet_spec.set_num_to_snippet(5);
+ ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/3);
+ *result_spec.mutable_snippet_spec() = std::move(snippet_spec);
+
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), result_spec,
+ schema_store_.get(),
+ SectionRestrictQueryTermsMap({{"", {"foo", "bar"}}})),
+ /*child_adjustment_info=*/nullptr, result_spec, *document_store_);
+
+ {
+ absl_ports::unique_lock l(&result_state.mutex);
+
+    // Set num_matches_per_property = 0
+ result_state.parent_adjustment_info()
+ ->snippet_context.snippet_spec.set_num_matches_per_property(0);
+ }
+
+ // We can't return any snippets for this page even though num_to_snippet > 0.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(3));
+ EXPECT_THAT(page_result.results.at(0).snippet().entries(), IsEmpty());
+ EXPECT_THAT(page_result.results.at(1).snippet().entries(), IsEmpty());
+ EXPECT_THAT(page_result.results.at(2).snippet().entries(), IsEmpty());
+ EXPECT_THAT(page_result.num_results_with_snippets, Eq(0));
+}
+
+TEST_F(ResultRetrieverV2SnippetTest, JoinSnippeted) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId person_document_id1,
+ document_store_->Put(CreatePersonDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId person_document_id2,
+ document_store_->Put(CreatePersonDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId person_document_id3,
+ document_store_->Put(CreatePersonDocument(/*id=*/3)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId email_document_id1,
+ document_store_->Put(CreateEmailDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId email_document_id2,
+ document_store_->Put(CreateEmailDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId email_document_id3,
+ document_store_->Put(CreateEmailDocument(/*id=*/3)));
+
+ std::vector<SectionId> person_hit_section_ids = {
+ GetSectionId("Person", "name")};
+ std::vector<SectionId> email_hit_section_ids = {
+ GetSectionId("Email", "subject"), GetSectionId("Email", "body")};
+ SectionIdMask person_hit_section_id_mask =
+ CreateSectionIdMask(person_hit_section_ids);
+ SectionIdMask email_hit_section_id_mask =
+ CreateSectionIdMask(email_hit_section_ids);
+
+ ScoredDocumentHit person1_scored_doc_hit(
+ person_document_id1, person_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit person2_scored_doc_hit(
+ person_document_id2, person_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit person3_scored_doc_hit(
+ person_document_id3, person_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit email1_scored_doc_hit(
+ email_document_id1, email_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit email2_scored_doc_hit(
+ email_document_id2, email_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit email3_scored_doc_hit(
+ email_document_id3, email_hit_section_id_mask, /*score=*/0);
+
+ // Create JoinedScoredDocumentHits mapping:
+ // - Person1 to Email1 and Email2
+ // - Person2 to empty
+ // - Person3 to Email3
+ JoinedScoredDocumentHit joined_scored_document_hit1(
+ /*final_score=*/0, /*parent_scored_document_hit=*/person1_scored_doc_hit,
+ /*child_scored_document_hits=*/
+ {email1_scored_doc_hit, email2_scored_doc_hit});
+ JoinedScoredDocumentHit joined_scored_document_hit2(
+ /*final_score=*/0, /*parent_scored_document_hit=*/person2_scored_doc_hit,
+ /*child_scored_document_hits=*/{});
+ JoinedScoredDocumentHit joined_scored_document_hit3(
+ /*final_score=*/0, /*parent_scored_document_hit=*/person3_scored_doc_hit,
+ /*child_scored_document_hits=*/{email3_scored_doc_hit});
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Create parent ResultSpec with custom snippet spec.
+ ResultSpecProto parent_result_spec = CreateResultSpec(/*num_per_page=*/3);
+ parent_result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+ *parent_result_spec.mutable_snippet_spec() = CreateSnippetSpec();
+
+ // Create child ResultSpec with custom snippet spec.
+ ResultSpecProto child_result_spec;
+ *child_result_spec.mutable_snippet_spec() = CreateSnippetSpec();
+
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<JoinedScoredDocumentHit>>(
+ std::vector<JoinedScoredDocumentHit>{joined_scored_document_hit1,
+ joined_scored_document_hit2,
+ joined_scored_document_hit3},
+ /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), parent_result_spec,
+ schema_store_.get(),
+ SectionRestrictQueryTermsMap({{"", {"person"}}})),
+ /*child_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), child_result_spec,
+ schema_store_.get(),
+ SectionRestrictQueryTermsMap({{"", {"foo", "bar"}}})),
+ parent_result_spec, *document_store_);
+
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(3));
+ EXPECT_THAT(page_result.num_results_with_snippets, Eq(3));
+
+ // Result1: Person1 for parent and [Email1, Email2] for children.
+ // Check parent doc (Person1).
+ const DocumentProto& result_parent_document_one =
+ page_result.results.at(0).document();
+ const SnippetProto& result_parent_snippet_one =
+ page_result.results.at(0).snippet();
+ EXPECT_THAT(result_parent_document_one,
+ EqualsProto(CreatePersonDocument(/*id=*/1)));
+ ASSERT_THAT(result_parent_snippet_one.entries(), SizeIs(1));
+ EXPECT_THAT(result_parent_snippet_one.entries(0).property_name(), Eq("name"));
+ std::string_view content =
+ GetString(&result_parent_document_one,
+ result_parent_snippet_one.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_parent_snippet_one.entries(0)),
+ ElementsAre("person 1"));
+ EXPECT_THAT(GetMatches(content, result_parent_snippet_one.entries(0)),
+ ElementsAre("person"));
+
+ // Check child docs.
+ ASSERT_THAT(page_result.results.at(0).joined_results(), SizeIs(2));
+ // Check Email1.
+ const DocumentProto& result_child_document_one =
+ page_result.results.at(0).joined_results(0).document();
+ const SnippetProto& result_child_snippet_one =
+ page_result.results.at(0).joined_results(0).snippet();
+ EXPECT_THAT(result_child_document_one,
+ EqualsProto(CreateEmailDocument(/*id=*/1)));
+ ASSERT_THAT(result_child_snippet_one.entries(), SizeIs(2));
+ EXPECT_THAT(result_child_snippet_one.entries(0).property_name(), Eq("body"));
+ content = GetString(&result_child_document_one,
+ result_child_snippet_one.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_child_snippet_one.entries(0)),
+ ElementsAre("body bar 1"));
+ EXPECT_THAT(GetMatches(content, result_child_snippet_one.entries(0)),
+ ElementsAre("bar"));
+ EXPECT_THAT(result_child_snippet_one.entries(1).property_name(),
+ Eq("subject"));
+ content = GetString(&result_child_document_one,
+ result_child_snippet_one.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, result_child_snippet_one.entries(1)),
+ ElementsAre("subject foo 1"));
+ EXPECT_THAT(GetMatches(content, result_child_snippet_one.entries(1)),
+ ElementsAre("foo"));
+ // Check Email2.
+ const DocumentProto& result_child_document_two =
+ page_result.results.at(0).joined_results(1).document();
+ const SnippetProto& result_child_snippet_two =
+ page_result.results.at(0).joined_results(1).snippet();
+ EXPECT_THAT(result_child_document_two,
+ EqualsProto(CreateEmailDocument(/*id=*/2)));
+ ASSERT_THAT(result_child_snippet_two.entries(), SizeIs(2));
+ EXPECT_THAT(result_child_snippet_two.entries(0).property_name(), Eq("body"));
+ content = GetString(&result_child_document_two,
+ result_child_snippet_two.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_child_snippet_two.entries(0)),
+ ElementsAre("body bar 2"));
+ EXPECT_THAT(GetMatches(content, result_child_snippet_two.entries(0)),
+ ElementsAre("bar"));
+ EXPECT_THAT(result_child_snippet_two.entries(1).property_name(),
+ Eq("subject"));
+ content = GetString(&result_child_document_two,
+ result_child_snippet_two.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, result_child_snippet_two.entries(1)),
+ ElementsAre("subject foo 2"));
+ EXPECT_THAT(GetMatches(content, result_child_snippet_two.entries(1)),
+ ElementsAre("foo"));
+
+ // Result2: Person2 for parent and [] for children.
+  // Check parent doc (Person2).
+ const DocumentProto& result_parent_document_two =
+ page_result.results.at(1).document();
+ const SnippetProto& result_parent_snippet_two =
+ page_result.results.at(1).snippet();
+ EXPECT_THAT(result_parent_document_two,
+ EqualsProto(CreatePersonDocument(/*id=*/2)));
+ ASSERT_THAT(result_parent_snippet_two.entries(), SizeIs(1));
+ EXPECT_THAT(result_parent_snippet_two.entries(0).property_name(), Eq("name"));
+ content = GetString(&result_parent_document_two,
+ result_parent_snippet_two.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_parent_snippet_two.entries(0)),
+ ElementsAre("person 2"));
+ EXPECT_THAT(GetMatches(content, result_parent_snippet_two.entries(0)),
+ ElementsAre("person"));
+ // Check child docs.
+ ASSERT_THAT(page_result.results.at(1).joined_results(), IsEmpty());
+
+ // Result3: Person3 for parent and [Email3] for children.
+ // Check parent doc (Person3).
+ const DocumentProto& result_parent_document_three =
+ page_result.results.at(2).document();
+ const SnippetProto& result_parent_snippet_three =
+ page_result.results.at(2).snippet();
+ EXPECT_THAT(result_parent_document_three,
+ EqualsProto(CreatePersonDocument(/*id=*/3)));
+ ASSERT_THAT(result_parent_snippet_three.entries(), SizeIs(1));
+ EXPECT_THAT(result_parent_snippet_three.entries(0).property_name(),
+ Eq("name"));
+ content = GetString(&result_parent_document_three,
+ result_parent_snippet_three.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_parent_snippet_three.entries(0)),
+ ElementsAre("person 3"));
+ EXPECT_THAT(GetMatches(content, result_parent_snippet_three.entries(0)),
+ ElementsAre("person"));
+
+ // Check child docs.
+ ASSERT_THAT(page_result.results.at(2).joined_results(), SizeIs(1));
+ // Check Email3.
+ const DocumentProto& result_child_document_three =
+ page_result.results.at(2).joined_results(0).document();
+ const SnippetProto& result_child_snippet_three =
+ page_result.results.at(2).joined_results(0).snippet();
+ EXPECT_THAT(result_child_document_three,
+ EqualsProto(CreateEmailDocument(/*id=*/3)));
+ ASSERT_THAT(result_child_snippet_three.entries(), SizeIs(2));
+ EXPECT_THAT(result_child_snippet_three.entries(0).property_name(),
+ Eq("body"));
+ content = GetString(&result_child_document_three,
+ result_child_snippet_three.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, result_child_snippet_three.entries(0)),
+ ElementsAre("body bar 3"));
+ EXPECT_THAT(GetMatches(content, result_child_snippet_three.entries(0)),
+ ElementsAre("bar"));
+ EXPECT_THAT(result_child_snippet_three.entries(1).property_name(),
+ Eq("subject"));
+ content = GetString(&result_child_document_three,
+ result_child_snippet_three.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, result_child_snippet_three.entries(1)),
+ ElementsAre("subject foo 3"));
+ EXPECT_THAT(GetMatches(content, result_child_snippet_three.entries(1)),
+ ElementsAre("foo"));
+}
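+
+// Note on the snippet helpers exercised above (behavior as used in this
+// file; the helpers themselves are defined elsewhere): GetString() returns
+// the document's content for the given property, GetWindows() extracts the
+// snippet window strings recorded in an entry, and GetMatches() extracts the
+// exact matched terms within that content.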
+
+TEST_F(ResultRetrieverV2SnippetTest, ShouldSnippetAllJoinedResults) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId person_document_id1,
+ document_store_->Put(CreatePersonDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId person_document_id2,
+ document_store_->Put(CreatePersonDocument(/*id=*/2)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId email_document_id1,
+ document_store_->Put(CreateEmailDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId email_document_id2,
+ document_store_->Put(CreateEmailDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId email_document_id3,
+ document_store_->Put(CreateEmailDocument(/*id=*/3)));
+
+ std::vector<SectionId> person_hit_section_ids = {
+ GetSectionId("Person", "name")};
+ std::vector<SectionId> email_hit_section_ids = {
+ GetSectionId("Email", "subject"), GetSectionId("Email", "body")};
+ SectionIdMask person_hit_section_id_mask =
+ CreateSectionIdMask(person_hit_section_ids);
+ SectionIdMask email_hit_section_id_mask =
+ CreateSectionIdMask(email_hit_section_ids);
+
+ ScoredDocumentHit person1_scored_doc_hit(
+ person_document_id1, person_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit person2_scored_doc_hit(
+ person_document_id2, person_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit email1_scored_doc_hit(
+ email_document_id1, email_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit email2_scored_doc_hit(
+ email_document_id2, email_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit email3_scored_doc_hit(
+ email_document_id3, email_hit_section_id_mask, /*score=*/0);
+
+ // Create JoinedScoredDocumentHits mapping:
+ // - Person1 to Email1
+ // - Person2 to Email2, Email3
+ JoinedScoredDocumentHit joined_scored_document_hit1(
+ /*final_score=*/0, /*parent_scored_document_hit=*/person1_scored_doc_hit,
+ /*child_scored_document_hits=*/
+ {email1_scored_doc_hit});
+ JoinedScoredDocumentHit joined_scored_document_hit2(
+ /*final_score=*/0, /*parent_scored_document_hit=*/person2_scored_doc_hit,
+ /*child_scored_document_hits=*/
+ {email2_scored_doc_hit, email3_scored_doc_hit});
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Create parent ResultSpec with custom snippet spec.
+ ResultSpecProto::SnippetSpecProto parent_snippet_spec = CreateSnippetSpec();
+ parent_snippet_spec.set_num_to_snippet(1);
+ ResultSpecProto parent_result_spec = CreateResultSpec(/*num_per_page=*/3);
+ parent_result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+ *parent_result_spec.mutable_snippet_spec() = std::move(parent_snippet_spec);
+
+ // Create child ResultSpec with custom snippet spec.
+ ResultSpecProto::SnippetSpecProto child_snippet_spec = CreateSnippetSpec();
+ child_snippet_spec.set_num_to_snippet(3);
+ ResultSpecProto child_result_spec;
+ *child_result_spec.mutable_snippet_spec() = std::move(child_snippet_spec);
+
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<JoinedScoredDocumentHit>>(
+ std::vector<JoinedScoredDocumentHit>{joined_scored_document_hit1,
+ joined_scored_document_hit2},
+ /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), parent_result_spec,
+ schema_store_.get(),
+ SectionRestrictQueryTermsMap({{"", {"person"}}})),
+ /*child_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), child_result_spec,
+ schema_store_.get(),
+ SectionRestrictQueryTermsMap({{"", {"foo", "bar"}}})),
+ parent_result_spec, *document_store_);
+
+ // Only 1 parent document should be snippeted, but all of the child documents
+ // should be snippeted.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ // Result1: Person1 for parent and [Email1] for children.
+ // Check parent doc (Person1).
+ EXPECT_THAT(page_result.results.at(0).snippet().entries(), Not(IsEmpty()));
+ // Check child docs.
+ ASSERT_THAT(page_result.results.at(0).joined_results(), SizeIs(1));
+ EXPECT_THAT(page_result.results.at(0).joined_results(0).snippet().entries(),
+ Not(IsEmpty()));
+
+ // Result2: Person2 for parent and [Email2, Email3] for children.
+ // Check parent doc (Person2).
+ EXPECT_THAT(page_result.results.at(1).snippet().entries(), IsEmpty());
+ // Check child docs.
+ ASSERT_THAT(page_result.results.at(1).joined_results(), SizeIs(2));
+ EXPECT_THAT(page_result.results.at(1).joined_results(0).snippet().entries(),
+ Not(IsEmpty()));
+ EXPECT_THAT(page_result.results.at(1).joined_results(1).snippet().entries(),
+ Not(IsEmpty()));
+
+ EXPECT_THAT(page_result.num_results_with_snippets, Eq(1));
+}
+
+TEST_F(ResultRetrieverV2SnippetTest, ShouldSnippetSomeJoinedResults) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId person_document_id1,
+ document_store_->Put(CreatePersonDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId person_document_id2,
+ document_store_->Put(CreatePersonDocument(/*id=*/2)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId email_document_id1,
+ document_store_->Put(CreateEmailDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId email_document_id2,
+ document_store_->Put(CreateEmailDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId email_document_id3,
+ document_store_->Put(CreateEmailDocument(/*id=*/3)));
+
+ std::vector<SectionId> person_hit_section_ids = {
+ GetSectionId("Person", "name")};
+ std::vector<SectionId> email_hit_section_ids = {
+ GetSectionId("Email", "subject"), GetSectionId("Email", "body")};
+ SectionIdMask person_hit_section_id_mask =
+ CreateSectionIdMask(person_hit_section_ids);
+ SectionIdMask email_hit_section_id_mask =
+ CreateSectionIdMask(email_hit_section_ids);
+
+ ScoredDocumentHit person1_scored_doc_hit(
+ person_document_id1, person_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit person2_scored_doc_hit(
+ person_document_id2, person_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit email1_scored_doc_hit(
+ email_document_id1, email_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit email2_scored_doc_hit(
+ email_document_id2, email_hit_section_id_mask, /*score=*/0);
+ ScoredDocumentHit email3_scored_doc_hit(
+ email_document_id3, email_hit_section_id_mask, /*score=*/0);
+
+ // Create JoinedScoredDocumentHits mapping:
+ // - Person1 to Email1
+ // - Person2 to Email2, Email3
+ JoinedScoredDocumentHit joined_scored_document_hit1(
+ /*final_score=*/0, /*parent_scored_document_hit=*/person1_scored_doc_hit,
+ /*child_scored_document_hits=*/
+ {email1_scored_doc_hit});
+ JoinedScoredDocumentHit joined_scored_document_hit2(
+ /*final_score=*/0, /*parent_scored_document_hit=*/person2_scored_doc_hit,
+ /*child_scored_document_hits=*/
+ {email2_scored_doc_hit, email3_scored_doc_hit});
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Create parent ResultSpec with custom snippet spec.
+ ResultSpecProto::SnippetSpecProto parent_snippet_spec = CreateSnippetSpec();
+ parent_snippet_spec.set_num_to_snippet(3);
+ ResultSpecProto parent_result_spec = CreateResultSpec(/*num_per_page=*/3);
+ parent_result_spec.set_max_joined_children_per_parent_to_return(
+ std::numeric_limits<int32_t>::max());
+ *parent_result_spec.mutable_snippet_spec() = std::move(parent_snippet_spec);
+
+ // Create child ResultSpec with custom snippet spec.
+ ResultSpecProto::SnippetSpecProto child_snippet_spec = CreateSnippetSpec();
+ child_snippet_spec.set_num_to_snippet(2);
+ ResultSpecProto child_result_spec;
+ *child_result_spec.mutable_snippet_spec() = std::move(child_snippet_spec);
+
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<JoinedScoredDocumentHit>>(
+ std::vector<JoinedScoredDocumentHit>{joined_scored_document_hit1,
+ joined_scored_document_hit2},
+ /*is_descending=*/false),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), parent_result_spec,
+ schema_store_.get(),
+ SectionRestrictQueryTermsMap({{"", {"person"}}})),
+ /*child_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(
+ CreateSearchSpec(TermMatchType::EXACT_ONLY),
+ CreateScoringSpec(/*is_descending_order=*/false), child_result_spec,
+ schema_store_.get(),
+ SectionRestrictQueryTermsMap({{"", {"foo", "bar"}}})),
+ parent_result_spec, *document_store_);
+
+  // All parent documents should be snippeted, but only 2 child documents
+  // should be snippeted.
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result.results, SizeIs(2));
+
+ // Result1: Person1 for parent and [Email1] for children.
+ // Check parent doc (Person1).
+ EXPECT_THAT(page_result.results.at(0).snippet().entries(), Not(IsEmpty()));
+ // Check child docs.
+ ASSERT_THAT(page_result.results.at(0).joined_results(), SizeIs(1));
+ EXPECT_THAT(page_result.results.at(0).joined_results(0).snippet().entries(),
+ Not(IsEmpty()));
+
+ // Result2: Person2 for parent and [Email2, Email3] for children.
+ // Check parent doc (Person2).
+ EXPECT_THAT(page_result.results.at(1).snippet().entries(), Not(IsEmpty()));
+ // Check child docs.
+ ASSERT_THAT(page_result.results.at(1).joined_results(), SizeIs(2));
+ EXPECT_THAT(page_result.results.at(1).joined_results(0).snippet().entries(),
+ Not(IsEmpty()));
+ EXPECT_THAT(page_result.results.at(1).joined_results(1).snippet().entries(),
+ IsEmpty());
+
+ EXPECT_THAT(page_result.num_results_with_snippets, Eq(2));
+}
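+
+// Summary of the two tests above (no new behavior asserted here): the parent
+// and child snippet specs apply independently. The parent spec's
+// num_to_snippet caps how many top-level results carry a parent snippet,
+// while the child spec's num_to_snippet caps how many joined child documents
+// are snippeted across the page, counted in retrieval order.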
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/result-retriever-v2_test.cc b/icing/result/result-retriever-v2_test.cc
new file mode 100644
index 0000000..0bd40cc
--- /dev/null
+++ b/icing/result/result-retriever-v2_test.cc
@@ -0,0 +1,1012 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/result-retriever-v2.h"
+
+#include <atomic>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/mutex.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/result/page-result.h"
+#include "icing/result/result-state-v2.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/clock.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::DoDefault;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::IsEmpty;
+using ::testing::Pointee;
+using ::testing::Return;
+using ::testing::SizeIs;
+using EntryIdMap = std::unordered_map<int32_t, int>;
+
+// Mock the behavior of GroupResultLimiterV2::ShouldBeRemoved.
+class MockGroupResultLimiter : public GroupResultLimiterV2 {
+ public:
+ MockGroupResultLimiter() : GroupResultLimiterV2() {
+ ON_CALL(*this, ShouldBeRemoved).WillByDefault(Return(false));
+ }
+
+ MOCK_METHOD(bool, ShouldBeRemoved,
+ (const ScoredDocumentHit&, const EntryIdMap&,
+ const DocumentStore&, std::vector<int>&,
+ ResultSpecProto::ResultGroupingType, int64_t),
+ (const, override));
+};
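+
+// Usage sketch (illustrative only; this expectation is hypothetical and not
+// set anywhere in this test suite): a test can override the default to make
+// the limiter drop every candidate hit.
+//
+//   auto limiter = std::make_unique<MockGroupResultLimiter>();
+//   EXPECT_CALL(*limiter, ShouldBeRemoved).WillRepeatedly(Return(true));
+//   // Pass std::move(limiter) as the last argument to
+//   // ResultRetrieverV2::Create() so every hit is removed before retrieval.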
+
+class ResultRetrieverV2Test : public ::testing::Test {
+ protected:
+ ResultRetrieverV2Test() : test_dir_(GetTestTempDir() + "/icing") {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ }
+
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/10000));
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ "Person", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ num_total_hits_ = 0;
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ SectionId GetSectionId(const std::string& type, const std::string& property) {
+ auto type_id_or = schema_store_->GetSchemaTypeId(type);
+ if (!type_id_or.ok()) {
+ return kInvalidSectionId;
+ }
+ SchemaTypeId type_id = type_id_or.ValueOrDie();
+ for (SectionId section_id = 0; section_id <= kMaxSectionId; ++section_id) {
+ auto metadata_or = schema_store_->GetSectionMetadata(type_id, section_id);
+ if (!metadata_or.ok()) {
+ break;
+ }
+ const SectionMetadata* metadata = metadata_or.ValueOrDie();
+ if (metadata->path == property) {
+ return metadata->id;
+ }
+ }
+ return kInvalidSectionId;
+ }
+
+ const Filesystem filesystem_;
+ const std::string test_dir_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::atomic<int> num_total_hits_;
+ FakeClock fake_clock_;
+};
+
+DocumentProto CreateDocument(int id) {
+ return DocumentBuilder()
+ .SetKey("icing", "Email/" + std::to_string(id))
+ .SetSchema("Email")
+ .AddStringProperty("name", "subject foo " + std::to_string(id))
+ .AddStringProperty("body", "body bar " + std::to_string(id))
+ .SetCreationTimestampMs(1574365086666 + id)
+ .Build();
+}
+
+SectionIdMask CreateSectionIdMask(const std::vector<SectionId>& section_ids) {
+ SectionIdMask mask = 0;
+ for (SectionId section_id : section_ids) {
+ mask |= (UINT64_C(1) << section_id);
+ }
+ return mask;
+}
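+
+// For illustration (example values assumed, not taken from any test below):
+// with section ids {0, 2}, the loop above sets bits 0 and 2, so
+//   CreateSectionIdMask({0, 2}) == (UINT64_C(1) << 0) | (UINT64_C(1) << 2)
+//                               == 0b101 == 5.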
+
+ResultSpecProto CreateResultSpec(
+ int num_per_page, ResultSpecProto::ResultGroupingType result_group_type) {
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(result_group_type);
+ result_spec.set_num_per_page(num_per_page);
+ return result_spec;
+}
+
+libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore(
+ const Filesystem* filesystem, const std::string& base_dir,
+ const Clock* clock, const SchemaStore* schema_store) {
+ return DocumentStore::Create(
+ filesystem, base_dir, clock, schema_store,
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr);
+}
+
+TEST_F(ResultRetrieverV2Test, CreationWithNullPointerShouldFail) {
+ EXPECT_THAT(
+ ResultRetrieverV2::Create(/*doc_store=*/nullptr, schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ EXPECT_THAT(
+ ResultRetrieverV2::Create(doc_store.get(), /*schema_store=*/nullptr,
+ language_segmenter_.get(), normalizer_.get()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ /*language_segmenter=*/nullptr,
+ normalizer_.get()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(),
+ /*normalizer=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_F(ResultRetrieverV2Test, ShouldRetrieveSimpleResults) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ doc_store->Put(CreateDocument(/*id=*/3)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ doc_store->Put(CreateDocument(/*id=*/4)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ doc_store->Put(CreateDocument(/*id=*/5)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/19},
+ {document_id2, hit_section_id_mask, /*score=*/12},
+ {document_id3, hit_section_id_mask, /*score=*/8},
+ {document_id4, hit_section_id_mask, /*score=*/3},
+ {document_id5, hit_section_id_mask, /*score=*/1}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ SearchResultProto::ResultProto result1;
+ *result1.mutable_document() = CreateDocument(/*id=*/1);
+ result1.set_score(19);
+ SearchResultProto::ResultProto result2;
+ *result2.mutable_document() = CreateDocument(/*id=*/2);
+ result2.set_score(12);
+ SearchResultProto::ResultProto result3;
+ *result3.mutable_document() = CreateDocument(/*id=*/3);
+ result3.set_score(8);
+ SearchResultProto::ResultProto result4;
+ *result4.mutable_document() = CreateDocument(/*id=*/4);
+ result4.set_score(3);
+ SearchResultProto::ResultProto result5;
+ *result5.mutable_document() = CreateDocument(/*id=*/5);
+ result5.set_score(1);
+
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE),
+ *doc_store);
+
+ // First page, 2 results
+ auto [page_result1, has_more_results1] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(page_result1.results,
+ ElementsAre(EqualsProto(result1), EqualsProto(result2)));
+ // num_results_with_snippets is 0 when there is no snippet.
+ EXPECT_THAT(page_result1.num_results_with_snippets, Eq(0));
+ // Requested page size is same as num_per_page.
+ EXPECT_THAT(page_result1.requested_page_size, Eq(2));
+ // Has more results.
+ EXPECT_TRUE(has_more_results1);
+
+ // Second page, 2 results
+ auto [page_result2, has_more_results2] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(page_result2.results,
+ ElementsAre(EqualsProto(result3), EqualsProto(result4)));
+ // num_results_with_snippets is 0 when there is no snippet.
+ EXPECT_THAT(page_result2.num_results_with_snippets, Eq(0));
+ // Requested page size is same as num_per_page.
+ EXPECT_THAT(page_result2.requested_page_size, Eq(2));
+ // Has more results.
+ EXPECT_TRUE(has_more_results2);
+
+ // Third page, 1 result
+ auto [page_result3, has_more_results3] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(page_result3.results, ElementsAre(EqualsProto(result5)));
+ // num_results_with_snippets is 0 when there is no snippet.
+ EXPECT_THAT(page_result3.num_results_with_snippets, Eq(0));
+ // Requested page size is same as num_per_page.
+ EXPECT_THAT(page_result3.requested_page_size, Eq(2));
+ // No more results.
+ EXPECT_FALSE(has_more_results3);
+}
+
+TEST_F(ResultRetrieverV2Test, ShouldIgnoreNonInternalErrors) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+
+ DocumentId invalid_document_id = -1;
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/12},
+ {document_id2, hit_section_id_mask, /*score=*/4},
+ {invalid_document_id, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get(),
+ std::make_unique<MockGroupResultLimiter>()));
+
+ SearchResultProto::ResultProto result1;
+ *result1.mutable_document() = CreateDocument(/*id=*/1);
+ result1.set_score(12);
+ SearchResultProto::ResultProto result2;
+ *result2.mutable_document() = CreateDocument(/*id=*/2);
+ result2.set_score(4);
+
+ ResultStateV2 result_state1(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/3, ResultSpecProto::NAMESPACE),
+ *doc_store);
+ PageResult page_result1 =
+ result_retriever
+ ->RetrieveNextPage(result_state1,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ EXPECT_THAT(page_result1.results,
+ ElementsAre(EqualsProto(result1), EqualsProto(result2)));
+
+ DocumentId non_existing_document_id = 4;
+ scored_document_hits = {
+ {non_existing_document_id, hit_section_id_mask, /*score=*/15},
+ {document_id1, hit_section_id_mask, /*score=*/12},
+ {document_id2, hit_section_id_mask, /*score=*/4}};
+ ResultStateV2 result_state2(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/3, ResultSpecProto::NAMESPACE),
+ *doc_store);
+ PageResult page_result2 =
+ result_retriever
+ ->RetrieveNextPage(result_state2,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ EXPECT_THAT(page_result2.results,
+ ElementsAre(EqualsProto(result1), EqualsProto(result2)));
+}
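+
+// Note (behavior demonstrated above): non-internal lookup failures, such as
+// those for the invalid id (-1) and the non-existing id (4), are silently
+// skipped, so the page is built from the remaining valid documents.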
+
+TEST_F(ResultRetrieverV2Test,
+ ShouldLimitNumChildDocumentsByMaxJoinedChildPerParent) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+  // 1. Add 2 Person documents
+ DocumentProto person_document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "Person/1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Joe Fox")
+ .AddStringProperty("emailAddress", "ny152@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId person_document_id1,
+ doc_store->Put(person_document1));
+
+ DocumentProto person_document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "Person/2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Person")
+ .AddStringProperty("name", "Meg Ryan")
+ .AddStringProperty("emailAddress", "shopgirl@aol.com")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId person_document_id2,
+ doc_store->Put(person_document2));
+
+ // 2. Add 4 Email documents
+ DocumentProto email_document1 = DocumentBuilder()
+ .SetKey("namespace", "Email/1")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Test 1")
+ .AddStringProperty("body", "Test 1")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id1,
+ doc_store->Put(email_document1));
+
+ DocumentProto email_document2 = DocumentBuilder()
+ .SetKey("namespace", "Email/2")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Test 2")
+ .AddStringProperty("body", "Test 2")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id2,
+ doc_store->Put(email_document2));
+
+ DocumentProto email_document3 = DocumentBuilder()
+ .SetKey("namespace", "Email/3")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Test 3")
+ .AddStringProperty("body", "Test 3")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id3,
+ doc_store->Put(email_document3));
+
+ DocumentProto email_document4 = DocumentBuilder()
+ .SetKey("namespace", "Email/4")
+ .SetCreationTimestampMs(1000)
+ .SetSchema("Email")
+ .AddStringProperty("name", "Test 4")
+ .AddStringProperty("body", "Test 4")
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id4,
+ doc_store->Put(email_document4));
+
+ // 3. Setup the joined scored results.
+ std::vector<SectionId> person_hit_section_ids = {
+ GetSectionId("Person", "name")};
+ std::vector<SectionId> email_hit_section_ids = {
+ GetSectionId("Email", "name"), GetSectionId("Email", "body")};
+ SectionIdMask person_hit_section_id_mask =
+ CreateSectionIdMask(person_hit_section_ids);
+ SectionIdMask email_hit_section_id_mask =
+ CreateSectionIdMask(email_hit_section_ids);
+
+ ScoredDocumentHit person1_scored_doc_hit(
+ person_document_id1, person_hit_section_id_mask, /*score=*/1);
+ ScoredDocumentHit person2_scored_doc_hit(
+ person_document_id2, person_hit_section_id_mask, /*score=*/2);
+ ScoredDocumentHit email1_scored_doc_hit(
+ email_document_id1, email_hit_section_id_mask, /*score=*/3);
+ ScoredDocumentHit email2_scored_doc_hit(
+ email_document_id2, email_hit_section_id_mask, /*score=*/4);
+ ScoredDocumentHit email3_scored_doc_hit(
+ email_document_id3, email_hit_section_id_mask, /*score=*/5);
+ ScoredDocumentHit email4_scored_doc_hit(
+ email_document_id4, email_hit_section_id_mask, /*score=*/6);
+ // Create JoinedScoredDocumentHits mapping:
+ // - Person1 to Email1
+ // - Person2 to Email2, Email3, Email4
+ std::vector<JoinedScoredDocumentHit> joined_scored_document_hits = {
+ JoinedScoredDocumentHit(
+ /*final_score=*/1,
+ /*parent_scored_document_hit=*/person1_scored_doc_hit,
+ /*child_scored_document_hits=*/{email1_scored_doc_hit}),
+ JoinedScoredDocumentHit(
+ /*final_score=*/3,
+ /*parent_scored_document_hit=*/person2_scored_doc_hit,
+ /*child_scored_document_hits=*/
+ {email4_scored_doc_hit, email3_scored_doc_hit,
+ email2_scored_doc_hit})};
+
+ // 4. Retrieve result with max_joined_children_per_parent_to_return = 2.
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE);
+ result_spec.set_max_joined_children_per_parent_to_return(2);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<JoinedScoredDocumentHit>>(
+ std::move(joined_scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *doc_store);
+
+ // Result1: person2 with child docs = [email4, email3]
+ SearchResultProto::ResultProto result1;
+ *result1.mutable_document() = person_document2;
+ result1.set_score(3);
+ SearchResultProto::ResultProto* child1 = result1.add_joined_results();
+ *child1->mutable_document() = email_document4;
+ child1->set_score(6);
+ SearchResultProto::ResultProto* child2 = result1.add_joined_results();
+ *child2->mutable_document() = email_document3;
+ child2->set_score(5);
+
+ // Result2: person1 with child docs = [email1]
+ SearchResultProto::ResultProto result2;
+ *result2.mutable_document() = person_document1;
+ result2.set_score(1);
+ SearchResultProto::ResultProto* child3 = result2.add_joined_results();
+ *child3->mutable_document() = email_document1;
+ child3->set_score(3);
+
+ auto [page_result, has_more_results] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(page_result.results,
+ ElementsAre(EqualsProto(result1), EqualsProto(result2)));
+ // No more results.
+ EXPECT_FALSE(has_more_results);
+}
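+
+// Note (derived from the expectations above): with
+// max_joined_children_per_parent_to_return set to 2, only the first two child
+// hits supplied for person2 (email4 and email3) are returned, and email2 is
+// trimmed from the joined results.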
+
+TEST_F(ResultRetrieverV2Test, ShouldIgnoreInternalErrors) {
+ MockFilesystem mock_filesystem;
+ EXPECT_CALL(mock_filesystem,
+ PRead(A<int>(), A<void*>(), A<size_t>(), A<off_t>()))
+ .WillOnce(Return(false))
+ .WillRepeatedly(DoDefault());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&mock_filesystem, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get(),
+ std::make_unique<MockGroupResultLimiter>()));
+
+ SearchResultProto::ResultProto result1;
+ *result1.mutable_document() = CreateDocument(/*id=*/1);
+ result1.set_score(0);
+
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE),
+ *doc_store);
+ PageResult page_result =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+  // We mocked the filesystem to return an internal error when retrieving doc2,
+ // so doc2 should be skipped and doc1 should still be returned.
+ EXPECT_THAT(page_result.results, ElementsAre(EqualsProto(result1)));
+}
+
+TEST_F(ResultRetrieverV2Test, ShouldUpdateResultState) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ doc_store->Put(CreateDocument(/*id=*/3)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ doc_store->Put(CreateDocument(/*id=*/4)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ doc_store->Put(CreateDocument(/*id=*/5)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0},
+ {document_id3, hit_section_id_mask, /*score=*/0},
+ {document_id4, hit_section_id_mask, /*score=*/0},
+ {document_id5, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE),
+ *doc_store);
+
+ // First page, 2 results
+ PageResult page_result1 =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result1.results, SizeIs(2));
+ {
+ absl_ports::shared_lock l(&result_state.mutex);
+
+ // num_returned = size of first page
+ EXPECT_THAT(result_state.num_returned, Eq(2));
+ // Should remove the 2 returned docs from scored_document_hits and only
+ // contain the remaining 3.
+ EXPECT_THAT(result_state.scored_document_hits_ranker, Pointee(SizeIs(3)));
+ }
+
+ // Second page, 2 results
+ PageResult page_result2 =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result2.results, SizeIs(2));
+ {
+ absl_ports::shared_lock l(&result_state.mutex);
+
+ // num_returned = size of first and second pages
+ EXPECT_THAT(result_state.num_returned, Eq(4));
+ // Should remove the 2 returned docs from scored_document_hits and only
+ // contain the remaining 1.
+ EXPECT_THAT(result_state.scored_document_hits_ranker, Pointee(SizeIs(1)));
+ }
+
+ // Third page, 1 result
+ PageResult page_result3 =
+ result_retriever
+ ->RetrieveNextPage(result_state,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result3.results, SizeIs(1));
+ {
+ absl_ports::shared_lock l(&result_state.mutex);
+
+ // num_returned = size of first, second and third pages
+ EXPECT_THAT(result_state.num_returned, Eq(5));
+ // Should remove the 1 returned doc from scored_document_hits and become
+ // empty.
+ EXPECT_THAT(result_state.scored_document_hits_ranker, Pointee(IsEmpty()));
+ }
+}
+
+TEST_F(ResultRetrieverV2Test, ShouldUpdateNumTotalHits) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+ std::vector<ScoredDocumentHit> scored_document_hits1 = {
+ {document_id1, hit_section_id_mask, /*score=*/0},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+ std::shared_ptr<ResultStateV2> result_state1 =
+ std::make_shared<ResultStateV2>(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ *doc_store);
+ {
+ absl_ports::unique_lock l(&result_state1->mutex);
+
+ result_state1->RegisterNumTotalHits(&num_total_hits_);
+ ASSERT_THAT(num_total_hits_, Eq(2));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ doc_store->Put(CreateDocument(/*id=*/3)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ doc_store->Put(CreateDocument(/*id=*/4)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ doc_store->Put(CreateDocument(/*id=*/5)));
+ std::vector<ScoredDocumentHit> scored_document_hits2 = {
+ {document_id3, hit_section_id_mask, /*score=*/0},
+ {document_id4, hit_section_id_mask, /*score=*/0},
+ {document_id5, hit_section_id_mask, /*score=*/0}};
+ std::shared_ptr<ResultStateV2> result_state2 =
+ std::make_shared<ResultStateV2>(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE),
+ *doc_store);
+ {
+ absl_ports::unique_lock l(&result_state2->mutex);
+
+ result_state2->RegisterNumTotalHits(&num_total_hits_);
+ ASSERT_THAT(num_total_hits_, Eq(5));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ // Should get 1 doc in the first page of result_state1, and num_total_hits
+ // should be decremented by 1.
+ PageResult page_result1 =
+ result_retriever
+ ->RetrieveNextPage(*result_state1,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result1.results, SizeIs(1));
+ EXPECT_THAT(num_total_hits_, Eq(4));
+
+ // Should get 2 docs in the first page of result_state2, and num_total_hits
+ // should be decremented by 2.
+ PageResult page_result2 =
+ result_retriever
+ ->RetrieveNextPage(*result_state2,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result2.results, SizeIs(2));
+ EXPECT_THAT(num_total_hits_, Eq(2));
+
+ // Should get 1 doc in the second page of result_state2 (although num_per_page
+ // is 2, there is only 1 doc left), and num_total_hits should be decremented
+ // by 1.
+ PageResult page_result3 =
+ result_retriever
+ ->RetrieveNextPage(*result_state2,
+ fake_clock_.GetSystemTimeMilliseconds())
+ .first;
+ ASSERT_THAT(page_result3.results, SizeIs(1));
+ EXPECT_THAT(num_total_hits_, Eq(1));
+
+ // Destruct result_state1. There is 1 doc left, so num_total_hits should be
+ // decremented by 1 when destructing it.
+ result_state1.reset();
+ EXPECT_THAT(num_total_hits_, Eq(0));
+
+  // Destruct result_state2. There are no docs left, so num_total_hits should
+  // be unchanged when destructing it.
+  result_state2.reset();
+ EXPECT_THAT(num_total_hits_, Eq(0));
+}
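+
+// Note (accounting demonstrated above): each ResultStateV2 registers its
+// remaining hit count with the shared num_total_hits_ counter. Retrieving a
+// page decrements the counter by the number of results returned, and
+// destructing a state releases whatever count remains.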
+
+TEST_F(ResultRetrieverV2Test, ShouldLimitNumTotalBytesPerPage) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/5},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ SearchResultProto::ResultProto result1;
+ *result1.mutable_document() = CreateDocument(/*id=*/1);
+ result1.set_score(5);
+ SearchResultProto::ResultProto result2;
+ *result2.mutable_document() = CreateDocument(/*id=*/2);
+ result2.set_score(0);
+
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE);
+ result_spec.set_num_total_bytes_per_page_threshold(result1.ByteSizeLong());
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *doc_store);
+
+  // First page. Only result1 should be returned: its byte size alone reaches
+  // num_total_bytes_per_page_threshold, so ResultRetriever terminates early
+  // even though the number of results is still below num_per_page.
+ auto [page_result1, has_more_results1] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(page_result1.results, ElementsAre(EqualsProto(result1)));
+ // Has more results.
+ EXPECT_TRUE(has_more_results1);
+
+ // Second page, result2.
+ auto [page_result2, has_more_results2] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(page_result2.results, ElementsAre(EqualsProto(result2)));
+ // No more results.
+ EXPECT_FALSE(has_more_results2);
+}
+
+TEST_F(ResultRetrieverV2Test,
+ ShouldReturnSingleLargeResultAboveNumTotalBytesPerPageThreshold) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/5},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ SearchResultProto::ResultProto result1;
+ *result1.mutable_document() = CreateDocument(/*id=*/1);
+ result1.set_score(5);
+ SearchResultProto::ResultProto result2;
+ *result2.mutable_document() = CreateDocument(/*id=*/2);
+ result2.set_score(0);
+
+ int threshold = 1;
+ ASSERT_THAT(result1.ByteSizeLong(), Gt(threshold));
+
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE);
+ result_spec.set_num_total_bytes_per_page_threshold(threshold);
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *doc_store);
+
+ // First page. Should return single result1 even though its byte size exceeds
+ // num_total_bytes_per_page_threshold.
+ auto [page_result1, has_more_results1] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(page_result1.results, ElementsAre(EqualsProto(result1)));
+ // Has more results.
+ EXPECT_TRUE(has_more_results1);
+
+ // Second page, result2.
+ auto [page_result2, has_more_results2] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(page_result2.results, ElementsAre(EqualsProto(result2)));
+ // No more results.
+ EXPECT_FALSE(has_more_results2);
+}
+
+TEST_F(ResultRetrieverV2Test,
+ ShouldRetrieveNextResultWhenBelowNumTotalBytesPerPageThreshold) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, test_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ doc_store->Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ doc_store->Put(CreateDocument(/*id=*/2)));
+
+ std::vector<SectionId> hit_section_ids = {GetSectionId("Email", "name"),
+ GetSectionId("Email", "body")};
+ SectionIdMask hit_section_id_mask = CreateSectionIdMask(hit_section_ids);
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, hit_section_id_mask, /*score=*/5},
+ {document_id2, hit_section_id_mask, /*score=*/0}};
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(doc_store.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+
+ SearchResultProto::ResultProto result1;
+ *result1.mutable_document() = CreateDocument(/*id=*/1);
+ result1.set_score(5);
+ SearchResultProto::ResultProto result2;
+ *result2.mutable_document() = CreateDocument(/*id=*/2);
+ result2.set_score(0);
+
+ int threshold = result1.ByteSizeLong() + 1;
+ ASSERT_THAT(result1.ByteSizeLong() + result2.ByteSizeLong(), Gt(threshold));
+
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE);
+ result_spec.set_num_total_bytes_per_page_threshold(threshold);
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, *doc_store);
+
+  // After retrieving result1, the total bytes are still below the threshold
+  // and the number of results is still below num_per_page, so ResultRetriever
+  // should continue retrieval and include result2 in this page, even though
+  // the combined bytes of result1 + result2 end up exceeding the threshold.
+ auto [page_result, has_more_results] = result_retriever->RetrieveNextPage(
+ result_state, fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(page_result.results,
+ ElementsAre(EqualsProto(result1), EqualsProto(result2)));
+ // No more results.
+ EXPECT_FALSE(has_more_results);
+}
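+
+// Summary of the three byte-threshold tests above (no new behavior asserted
+// here): num_total_bytes_per_page_threshold is a soft cut-off. Retrieval
+// stops once the accumulated page size reaches the threshold, but a result is
+// never split: the result that crosses the threshold is returned whole, so a
+// page always contains at least one result.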
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/result-retriever.cc b/icing/result/result-retriever.cc
deleted file mode 100644
index f09d834..0000000
--- a/icing/result/result-retriever.cc
+++ /dev/null
@@ -1,96 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/result/result-retriever.h"
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/proto/search.pb.h"
-#include "icing/proto/term.pb.h"
-#include "icing/result/page-result-state.h"
-#include "icing/result/snippet-context.h"
-#include "icing/util/status-macros.h"
-
-namespace icing {
-namespace lib {
-libtextclassifier3::StatusOr<std::unique_ptr<ResultRetriever>>
-ResultRetriever::Create(const DocumentStore* doc_store,
- const SchemaStore* schema_store,
- const LanguageSegmenter* language_segmenter,
- const Normalizer* normalizer,
- bool ignore_bad_document_ids) {
- ICING_RETURN_ERROR_IF_NULL(doc_store);
- ICING_RETURN_ERROR_IF_NULL(schema_store);
- ICING_RETURN_ERROR_IF_NULL(language_segmenter);
-
- ICING_ASSIGN_OR_RETURN(
- std::unique_ptr<SnippetRetriever> snippet_retriever,
- SnippetRetriever::Create(schema_store, language_segmenter, normalizer));
-
- return std::unique_ptr<ResultRetriever>(new ResultRetriever(
- doc_store, std::move(snippet_retriever), ignore_bad_document_ids));
-}
-
-libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>>
-ResultRetriever::RetrieveResults(
- const PageResultState& page_result_state) const {
- std::vector<SearchResultProto::ResultProto> search_results;
- search_results.reserve(page_result_state.scored_document_hits.size());
-
- const SnippetContext& snippet_context = page_result_state.snippet_context;
- // Calculates how many snippets to return for this page.
- int remaining_num_to_snippet = snippet_context.snippet_spec.num_to_snippet() -
- page_result_state.num_previously_returned;
-
- if (remaining_num_to_snippet < 0) {
- remaining_num_to_snippet = 0;
- }
-
- for (const auto& scored_document_hit :
- page_result_state.scored_document_hits) {
- libtextclassifier3::StatusOr<DocumentProto> document_or =
- doc_store_.Get(scored_document_hit.document_id());
-
- if (!document_or.ok()) {
- // Internal errors from document store are IO errors, return directly.
- if (absl_ports::IsInternal(document_or.status())) {
- return document_or.status();
- }
-
- if (ignore_bad_document_ids_) {
- continue;
- } else {
- return document_or.status();
- }
- }
-
- SearchResultProto::ResultProto result;
- // Add the snippet if requested.
- if (snippet_context.snippet_spec.num_matches_per_property() > 0 &&
- remaining_num_to_snippet > search_results.size()) {
- SnippetProto snippet_proto = snippet_retriever_->RetrieveSnippet(
- snippet_context.query_terms, snippet_context.match_type,
- snippet_context.snippet_spec, document_or.ValueOrDie(),
- scored_document_hit.hit_section_id_mask());
- *result.mutable_snippet() = std::move(snippet_proto);
- }
-
- // Add the document, itself.
- *result.mutable_document() = std::move(document_or).ValueOrDie();
- search_results.push_back(std::move(result));
- }
- return search_results;
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/result/result-retriever.h b/icing/result/result-retriever.h
deleted file mode 100644
index 6f33eef..0000000
--- a/icing/result/result-retriever.h
+++ /dev/null
@@ -1,95 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_RESULT_RETRIEVER_H_
-#define ICING_RESULT_RETRIEVER_H_
-
-#include <utility>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/proto/search.pb.h"
-#include "icing/proto/term.pb.h"
-#include "icing/query/query-terms.h"
-#include "icing/result/page-result-state.h"
-#include "icing/result/snippet-context.h"
-#include "icing/result/snippet-retriever.h"
-#include "icing/schema/schema-store.h"
-#include "icing/schema/section.h"
-#include "icing/scoring/scored-document-hit.h"
-#include "icing/store/document-id.h"
-#include "icing/store/document-store.h"
-#include "icing/tokenization/language-segmenter.h"
-#include "icing/transform/normalizer.h"
-
-namespace icing {
-namespace lib {
-
-class ResultRetriever {
- public:
- // Factory function to create a ResultRetriever which does not take ownership
- // of any input components, and all pointers must refer to valid objects that
- // outlive the created ResultRetriever instance.
- //
- // Returns:
- // A ResultRetriever on success
- // FAILED_PRECONDITION on any null pointer input
- static libtextclassifier3::StatusOr<std::unique_ptr<ResultRetriever>> Create(
- const DocumentStore* doc_store, const SchemaStore* schema_store,
- const LanguageSegmenter* language_segmenter, const Normalizer* normalizer,
- bool ignore_bad_document_ids = true);
-
- // Retrieves results (pairs of DocumentProtos and SnippetProtos) with the
- // given document and snippet information. The expected number of documents to
- // return is the number of all scored document hits inside PageResultState.
- // The number of snippets to return is based on the total number of snippets
- // needed and number of snippets that have already been returned previously
- // for the same query. The order of results returned is the same as the order
- // of scored document hits inside PageResultState.
- //
- // "ignore_bad_document_ids" from constructor indicates whether to ignore
- // invalid and non-existing document ids. If it's true, errors on some
- // document ids will be ignored and valid documents will be returned,
- // otherwise any error will be returned immediately. Note that IO errors will
- // always be returned.
- //
- // Returns when ignore_bad_document_ids is true:
- // A list of ResultProto on success
- // INTERNAL_ERROR on IO error
- //
- // Returns when ignore_bad_document_ids is false:
- // A list of ResultProto on success
- // INVALID_ARGUMENT if any document_id < 0
- // NOT_FOUND if any doc doesn't exist or has been deleted
- // INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<std::vector<SearchResultProto::ResultProto>>
- RetrieveResults(const PageResultState& page_result_state) const;
-
- private:
- explicit ResultRetriever(const DocumentStore* doc_store,
- std::unique_ptr<SnippetRetriever> snippet_retriever,
- bool ignore_bad_document_ids)
- : doc_store_(*doc_store),
- snippet_retriever_(std::move(snippet_retriever)),
- ignore_bad_document_ids_(ignore_bad_document_ids) {}
-
- const DocumentStore& doc_store_;
- std::unique_ptr<SnippetRetriever> snippet_retriever_;
- const bool ignore_bad_document_ids_;
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_RESULT_RETRIEVER_H_
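Taken together, the deleted header above defined a two-step flow: build the retriever through the factory, then fetch one page at a time. A hedged usage sketch based only on the deleted signatures (all pointers are assumed valid and to outlive the retriever, per the factory contract; error propagation uses the icing macros seen elsewhere in this diff):

// Sketch only: doc_store, schema_store, segmenter, normalizer and
// page_result_state are assumed to be valid, already-initialized objects.
ICING_ASSIGN_OR_RETURN(
    std::unique_ptr<ResultRetriever> retriever,
    ResultRetriever::Create(doc_store, schema_store, segmenter, normalizer,
                            /*ignore_bad_document_ids=*/true));
ICING_ASSIGN_OR_RETURN(
    std::vector<SearchResultProto::ResultProto> results,
    retriever->RetrieveResults(page_result_state));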
diff --git a/icing/result/result-retriever_test.cc b/icing/result/result-retriever_test.cc
deleted file mode 100644
index 36dbfd9..0000000
--- a/icing/result/result-retriever_test.cc
+++ /dev/null
@@ -1,586 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/result/result-retriever.h"
-
-#include <limits>
-#include <memory>
-
-#include "gtest/gtest.h"
-#include "icing/document-builder.h"
-#include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
-#include "icing/portable/equals-proto.h"
-#include "icing/proto/document.pb.h"
-#include "icing/proto/schema.pb.h"
-#include "icing/proto/search.pb.h"
-#include "icing/proto/term.pb.h"
-#include "icing/schema/schema-store.h"
-#include "icing/store/document-id.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/testing/fake-clock.h"
-#include "icing/testing/snippet-helpers.h"
-#include "icing/testing/test-data.h"
-#include "icing/testing/tmp-directory.h"
-#include "icing/tokenization/language-segmenter-factory.h"
-#include "icing/transform/normalizer-factory.h"
-#include "icing/transform/normalizer.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-using ::icing::lib::portable_equals_proto::EqualsProto;
-using ::testing::ElementsAre;
-using ::testing::Eq;
-using ::testing::IsEmpty;
-using ::testing::Return;
-using ::testing::SizeIs;
-
-class ResultRetrieverTest : public testing::Test {
- protected:
- ResultRetrieverTest() : test_dir_(GetTestTempDir() + "/icing") {
- filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
- }
-
- void SetUp() override {
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
- ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
- language_segmenter_factory::Create());
-
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
- /*max_term_byte_size=*/10000));
-
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
- PropertyConfigProto* prop_config = type_config->add_properties();
- prop_config->set_property_name("subject");
- prop_config->set_data_type(PropertyConfigProto::DataType::STRING);
- prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- prop_config->mutable_indexing_config()->set_term_match_type(
- TermMatchType::PREFIX);
- prop_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- prop_config = type_config->add_properties();
- prop_config->set_property_name("body");
- prop_config->set_data_type(PropertyConfigProto::DataType::STRING);
- prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- prop_config->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- prop_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
- }
-
- void TearDown() override {
- filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
- }
-
- const Filesystem filesystem_;
- const std::string test_dir_;
- std::unique_ptr<LanguageSegmenter> language_segmenter_;
- std::unique_ptr<SchemaStore> schema_store_;
- std::unique_ptr<Normalizer> normalizer_;
- FakeClock fake_clock_;
-};
-
-ResultSpecProto::SnippetSpecProto CreateSnippetSpec() {
- ResultSpecProto::SnippetSpecProto snippet_spec;
- snippet_spec.set_num_to_snippet(std::numeric_limits<int>::max());
- snippet_spec.set_num_matches_per_property(std::numeric_limits<int>::max());
- snippet_spec.set_max_window_bytes(1024);
- return snippet_spec;
-}
-
-DocumentProto CreateDocument(int id) {
- return DocumentBuilder()
- .SetKey("icing", "email/" + std::to_string(id))
- .SetSchema("email")
- .AddStringProperty("subject", "subject foo " + std::to_string(id))
- .AddStringProperty("body", "body bar " + std::to_string(id))
- .SetCreationTimestampMs(1574365086666 + id)
- .Build();
-}
-
-TEST_F(ResultRetrieverTest, CreationWithNullPointerShouldFail) {
- EXPECT_THAT(
- ResultRetriever::Create(/*doc_store=*/nullptr, schema_store_.get(),
- language_segmenter_.get(), normalizer_.get()),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
-
- EXPECT_THAT(
- ResultRetriever::Create(doc_store.get(), /*schema_store=*/nullptr,
- language_segmenter_.get(), normalizer_.get()),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
- EXPECT_THAT(ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- /*language_segmenter=*/nullptr,
- normalizer_.get()),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
- EXPECT_THAT(ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(),
- /*normalizer=*/nullptr),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-}
-
-TEST_F(ResultRetrieverTest, ShouldRetrieveSimpleResults) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(CreateDocument(/*id=*/1)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(CreateDocument(/*id=*/2)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- doc_store->Put(CreateDocument(/*id=*/3)));
-
- std::vector<ScoredDocumentHit> scored_document_hits = {
- {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ResultRetriever> result_retriever,
- ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get()));
-
- SearchResultProto::ResultProto result1;
- *result1.mutable_document() = CreateDocument(/*id=*/1);
- SearchResultProto::ResultProto result2;
- *result2.mutable_document() = CreateDocument(/*id=*/2);
- SearchResultProto::ResultProto result3;
- *result3.mutable_document() = CreateDocument(/*id=*/3);
-
- SnippetContext snippet_context(
- /*query_terms_in=*/{},
- ResultSpecProto::SnippetSpecProto::default_instance(),
- TermMatchType::EXACT_ONLY);
- PageResultState page_result_state(
- std::move(scored_document_hits), /*next_page_token_in=*/1,
- std::move(snippet_context), /*num_previously_returned_in=*/0);
- EXPECT_THAT(
- result_retriever->RetrieveResults(page_result_state),
- IsOkAndHolds(ElementsAre(EqualsProto(result1), EqualsProto(result2),
- EqualsProto(result3))));
-}
-
-TEST_F(ResultRetrieverTest, IgnoreErrors) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(CreateDocument(/*id=*/1)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(CreateDocument(/*id=*/2)));
-
- DocumentId invalid_document_id = -1;
- std::vector<ScoredDocumentHit> scored_document_hits = {
- {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {invalid_document_id, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ResultRetriever> result_retriever,
- ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get(),
- /*ignore_bad_document_ids=*/true));
-
- SearchResultProto::ResultProto result1;
- *result1.mutable_document() = CreateDocument(/*id=*/1);
- SearchResultProto::ResultProto result2;
- *result2.mutable_document() = CreateDocument(/*id=*/2);
-
- SnippetContext snippet_context(
- /*query_terms_in=*/{},
- ResultSpecProto::SnippetSpecProto::default_instance(),
- TermMatchType::EXACT_ONLY);
- PageResultState page_result_state(
- std::move(scored_document_hits), /*next_page_token_in=*/1,
- std::move(snippet_context), /*num_previously_returned_in=*/0);
- EXPECT_THAT(
- result_retriever->RetrieveResults(page_result_state),
- IsOkAndHolds(ElementsAre(EqualsProto(result1), EqualsProto(result2))));
-}
-
-TEST_F(ResultRetrieverTest, NotIgnoreErrors) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(CreateDocument(/*id=*/1)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(CreateDocument(/*id=*/2)));
-
- DocumentId invalid_document_id = -1;
- std::vector<ScoredDocumentHit> scored_document_hits = {
- {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {invalid_document_id, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ResultRetriever> result_retriever,
- ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get(),
- /*ignore_bad_document_ids=*/false));
-
- SnippetContext snippet_context(
- /*query_terms_in=*/{},
- ResultSpecProto::SnippetSpecProto::default_instance(),
- TermMatchType::EXACT_ONLY);
- PageResultState page_result_state(
- std::move(scored_document_hits), /*next_page_token_in=*/1,
- std::move(snippet_context), /*num_previously_returned_in=*/0);
- EXPECT_THAT(result_retriever->RetrieveResults(page_result_state),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-
- DocumentId non_existing_document_id = 4;
- page_result_state.scored_document_hits = {
- {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {non_existing_document_id, /*hit_section_id_mask=*/0b00000011,
- /*score=*/0}};
- EXPECT_THAT(result_retriever->RetrieveResults(page_result_state),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-}
-
-TEST_F(ResultRetrieverTest, IOErrorShouldReturnInternalError) {
- MockFilesystem mock_filesystem;
- ON_CALL(mock_filesystem, OpenForRead(_)).WillByDefault(Return(false));
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&mock_filesystem, test_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(CreateDocument(/*id=*/1)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(CreateDocument(/*id=*/2)));
-
- std::vector<ScoredDocumentHit> scored_document_hits = {
- {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ResultRetriever> result_retriever,
- ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get(),
- /*ignore_bad_document_ids=*/true));
-
- SnippetContext snippet_context(
- /*query_terms_in=*/{},
- ResultSpecProto::SnippetSpecProto::default_instance(),
- TermMatchType::EXACT_ONLY);
- PageResultState page_result_state(
- std::move(scored_document_hits), /*next_page_token_in=*/1,
- std::move(snippet_context), /*num_previously_returned_in=*/0);
- EXPECT_THAT(result_retriever->RetrieveResults(page_result_state),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
-}
-
-TEST_F(ResultRetrieverTest, DefaultSnippetSpecShouldDisableSnippeting) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(CreateDocument(/*id=*/1)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(CreateDocument(/*id=*/2)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- doc_store->Put(CreateDocument(/*id=*/3)));
-
- std::vector<ScoredDocumentHit> scored_document_hits = {
- {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ResultRetriever> result_retriever,
- ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get()));
-
- SnippetContext snippet_context(
- /*query_terms_in=*/{},
- ResultSpecProto::SnippetSpecProto::default_instance(),
- TermMatchType::EXACT_ONLY);
- PageResultState page_result_state(
- std::move(scored_document_hits), /*next_page_token_in=*/1,
- std::move(snippet_context), /*num_previously_returned_in=*/0);
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<SearchResultProto::ResultProto> results,
- result_retriever->RetrieveResults(page_result_state));
- ASSERT_THAT(results, SizeIs(3));
- EXPECT_THAT(results.at(0).snippet(),
- EqualsProto(SnippetProto::default_instance()));
- EXPECT_THAT(results.at(1).snippet(),
- EqualsProto(SnippetProto::default_instance()));
- EXPECT_THAT(results.at(2).snippet(),
- EqualsProto(SnippetProto::default_instance()));
-}
-
-TEST_F(ResultRetrieverTest, SimpleSnippeted) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(CreateDocument(/*id=*/1)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(CreateDocument(/*id=*/2)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- doc_store->Put(CreateDocument(/*id=*/3)));
-
- std::vector<ScoredDocumentHit> scored_document_hits = {
- {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ResultRetriever> result_retriever,
- ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get()));
-
- SnippetContext snippet_context(
- /*query_terms_in=*/{{"", {"foo", "bar"}}}, CreateSnippetSpec(),
- TermMatchType::EXACT_ONLY);
- PageResultState page_result_state(
- std::move(scored_document_hits), /*next_page_token_in=*/1,
- std::move(snippet_context), /*num_previously_returned_in=*/0);
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<SearchResultProto::ResultProto> result,
- result_retriever->RetrieveResults(page_result_state));
- EXPECT_THAT(result, SizeIs(3));
- EXPECT_THAT(result[0].document(), EqualsProto(CreateDocument(/*id=*/1)));
- EXPECT_THAT(
- GetWindow(result[0].document(), result[0].snippet(), "subject", 0),
- Eq("subject foo 1"));
- EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "subject", 0),
- Eq("foo"));
- EXPECT_THAT(GetWindow(result[0].document(), result[0].snippet(), "body", 0),
- Eq("body bar 1"));
- EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "body", 0),
- Eq("bar"));
-
- EXPECT_THAT(result[1].document(), EqualsProto(CreateDocument(/*id=*/2)));
- EXPECT_THAT(
- GetWindow(result[1].document(), result[1].snippet(), "subject", 0),
- Eq("subject foo 2"));
- EXPECT_THAT(GetMatch(result[1].document(), result[1].snippet(), "subject", 0),
- Eq("foo"));
- EXPECT_THAT(GetWindow(result[1].document(), result[1].snippet(), "body", 0),
- Eq("body bar 2"));
- EXPECT_THAT(GetMatch(result[1].document(), result[1].snippet(), "body", 0),
- Eq("bar"));
-
- EXPECT_THAT(result[2].document(), EqualsProto(CreateDocument(/*id=*/3)));
- EXPECT_THAT(
- GetWindow(result[2].document(), result[2].snippet(), "subject", 0),
- Eq("subject foo 3"));
- EXPECT_THAT(GetMatch(result[2].document(), result[2].snippet(), "subject", 0),
- Eq("foo"));
- EXPECT_THAT(GetWindow(result[2].document(), result[2].snippet(), "body", 0),
- Eq("body bar 3"));
- EXPECT_THAT(GetMatch(result[2].document(), result[2].snippet(), "body", 0),
- Eq("bar"));
-}
-
-TEST_F(ResultRetrieverTest, OnlyOneDocumentSnippeted) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(CreateDocument(/*id=*/1)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(CreateDocument(/*id=*/2)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- doc_store->Put(CreateDocument(/*id=*/3)));
-
- ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
- snippet_spec.set_num_to_snippet(1);
-
- std::vector<ScoredDocumentHit> scored_document_hits = {
- {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ResultRetriever> result_retriever,
- ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get()));
-
- SnippetContext snippet_context(/*query_terms_in=*/{{"", {"foo", "bar"}}},
- snippet_spec, TermMatchType::EXACT_ONLY);
- PageResultState page_result_state(
- std::move(scored_document_hits), /*next_page_token_in=*/1,
- std::move(snippet_context), /*num_previously_returned_in=*/0);
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<SearchResultProto::ResultProto> result,
- result_retriever->RetrieveResults(page_result_state));
- EXPECT_THAT(result, SizeIs(3));
- EXPECT_THAT(result[0].document(), EqualsProto(CreateDocument(/*id=*/1)));
- EXPECT_THAT(
- GetWindow(result[0].document(), result[0].snippet(), "subject", 0),
- Eq("subject foo 1"));
- EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "subject", 0),
- Eq("foo"));
- EXPECT_THAT(GetWindow(result[0].document(), result[0].snippet(), "body", 0),
- Eq("body bar 1"));
- EXPECT_THAT(GetMatch(result[0].document(), result[0].snippet(), "body", 0),
- Eq("bar"));
-
- EXPECT_THAT(result[1].document(), EqualsProto(CreateDocument(/*id=*/2)));
- EXPECT_THAT(result[1].snippet(),
- EqualsProto(SnippetProto::default_instance()));
-
- EXPECT_THAT(result[2].document(), EqualsProto(CreateDocument(/*id=*/3)));
- EXPECT_THAT(result[2].snippet(),
- EqualsProto(SnippetProto::default_instance()));
-}
-
-TEST_F(ResultRetrieverTest, ShouldSnippetAllResults) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(CreateDocument(/*id=*/1)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(CreateDocument(/*id=*/2)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- doc_store->Put(CreateDocument(/*id=*/3)));
-
- std::vector<ScoredDocumentHit> scored_document_hits = {
- {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ResultRetriever> result_retriever,
- ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get()));
-
- ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
- snippet_spec.set_num_to_snippet(5);
- SnippetContext snippet_context(
- /*query_terms_in=*/{{"", {"foo", "bar"}}}, std::move(snippet_spec),
- TermMatchType::EXACT_ONLY);
- PageResultState page_result_state(
- std::move(scored_document_hits), /*next_page_token_in=*/1,
- std::move(snippet_context), /*num_previously_returned_in=*/0);
-
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<SearchResultProto::ResultProto> result,
- result_retriever->RetrieveResults(page_result_state));
- // num_to_snippet = 5, num_previously_returned_in = 0,
- // We can return 5 - 0 = 5 snippets at most. We're able to return all 3
- // snippets here.
- ASSERT_THAT(result, SizeIs(3));
- EXPECT_THAT(result[0].snippet().entries(), Not(IsEmpty()));
- EXPECT_THAT(result[1].snippet().entries(), Not(IsEmpty()));
- EXPECT_THAT(result[2].snippet().entries(), Not(IsEmpty()));
-}
-
-TEST_F(ResultRetrieverTest, ShouldSnippetSomeResults) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(CreateDocument(/*id=*/1)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(CreateDocument(/*id=*/2)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- doc_store->Put(CreateDocument(/*id=*/3)));
-
- std::vector<ScoredDocumentHit> scored_document_hits = {
- {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ResultRetriever> result_retriever,
- ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get()));
-
- ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
- snippet_spec.set_num_to_snippet(5);
- SnippetContext snippet_context(
- /*query_terms_in=*/{{"", {"foo", "bar"}}}, std::move(snippet_spec),
- TermMatchType::EXACT_ONLY);
- PageResultState page_result_state(
- std::move(scored_document_hits), /*next_page_token_in=*/1,
- std::move(snippet_context), /*num_previously_returned_in=*/3);
-
- // num_to_snippet = 5, num_previously_returned_in = 3,
- // We can return 5 - 3 = 2 snippets.
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<SearchResultProto::ResultProto> result,
- result_retriever->RetrieveResults(page_result_state));
- ASSERT_THAT(result, SizeIs(3));
- EXPECT_THAT(result[0].snippet().entries(), Not(IsEmpty()));
- EXPECT_THAT(result[1].snippet().entries(), Not(IsEmpty()));
- EXPECT_THAT(result[2].snippet().entries(), IsEmpty());
-}
-
-TEST_F(ResultRetrieverTest, ShouldNotSnippetAnyResults) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, test_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
- doc_store->Put(CreateDocument(/*id=*/1)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
- doc_store->Put(CreateDocument(/*id=*/2)));
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
- doc_store->Put(CreateDocument(/*id=*/3)));
-
- std::vector<ScoredDocumentHit> scored_document_hits = {
- {document_id1, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id2, /*hit_section_id_mask=*/0b00000011, /*score=*/0},
- {document_id3, /*hit_section_id_mask=*/0b00000011, /*score=*/0}};
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<ResultRetriever> result_retriever,
- ResultRetriever::Create(doc_store.get(), schema_store_.get(),
- language_segmenter_.get(), normalizer_.get()));
-
- ResultSpecProto::SnippetSpecProto snippet_spec = CreateSnippetSpec();
- snippet_spec.set_num_to_snippet(5);
- SnippetContext snippet_context(
- /*query_terms_in=*/{{"", {"foo", "bar"}}}, std::move(snippet_spec),
- TermMatchType::EXACT_ONLY);
- PageResultState page_result_state(
- std::move(scored_document_hits), /*next_page_token_in=*/1,
- std::move(snippet_context), /*num_previously_returned_in=*/6);
-
- // num_to_snippet = 5, num_previously_returned_in = 6,
- // We can't return any snippets for this page.
- ICING_ASSERT_OK_AND_ASSIGN(
- std::vector<SearchResultProto::ResultProto> result,
- result_retriever->RetrieveResults(page_result_state));
- ASSERT_THAT(result, SizeIs(3));
- EXPECT_THAT(result[0].snippet().entries(), IsEmpty());
- EXPECT_THAT(result[1].snippet().entries(), IsEmpty());
- EXPECT_THAT(result[2].snippet().entries(), IsEmpty());
-}
-
-} // namespace
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/result/result-state-manager.cc b/icing/result/result-state-manager.cc
index e9ae0ab..382f7db 100644
--- a/icing/result/result-state-manager.cc
+++ b/icing/result/result-state-manager.cc
@@ -14,95 +14,134 @@
#include "icing/result/result-state-manager.h"
-#include "icing/proto/search.pb.h"
+#include <memory>
+#include <queue>
+#include <utility>
+
+#include "icing/result/page-result.h"
+#include "icing/result/result-adjustment-info.h"
+#include "icing/result/result-retriever-v2.h"
+#include "icing/result/result-state-v2.h"
+#include "icing/scoring/scored-document-hits-ranker.h"
#include "icing/util/clock.h"
+#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
-ResultStateManager::ResultStateManager(int max_hits_per_query,
- int max_result_states)
- : max_hits_per_query_(max_hits_per_query),
- max_result_states_(max_result_states),
+ResultStateManager::ResultStateManager(int max_total_hits,
+ const DocumentStore& document_store)
+ : document_store_(document_store),
+ max_total_hits_(max_total_hits),
+ num_total_hits_(0),
random_generator_(GetSteadyTimeNanoseconds()) {}
-libtextclassifier3::StatusOr<PageResultState>
-ResultStateManager::RankAndPaginate(ResultState result_state) {
- if (!result_state.HasMoreResults()) {
- return absl_ports::InvalidArgumentError("ResultState has no results");
+libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
+ResultStateManager::CacheAndRetrieveFirstPage(
+ std::unique_ptr<ScoredDocumentHitsRanker> ranker,
+ std::unique_ptr<ResultAdjustmentInfo> parent_adjustment_info,
+ std::unique_ptr<ResultAdjustmentInfo> child_adjustment_info,
+ const ResultSpecProto& result_spec, const DocumentStore& document_store,
+ const ResultRetrieverV2& result_retriever, int64_t current_time_ms) {
+ if (ranker == nullptr) {
+ return absl_ports::InvalidArgumentError("Should not provide null ranker");
}
- // Truncates scored document hits so that they don't take up too much space.
- result_state.TruncateHitsTo(max_hits_per_query_);
-
- // Gets the number before calling GetNextPage() because num_returned() may
- // change after returning more results.
- int num_previously_returned = result_state.num_returned();
-
- std::vector<ScoredDocumentHit> page_result_document_hits =
- result_state.GetNextPage();
-
- if (!result_state.HasMoreResults()) {
+ // Create a shared pointer of ResultState.
+ // ResultState should be created by ResultStateManager only.
+ std::shared_ptr<ResultStateV2> result_state = std::make_shared<ResultStateV2>(
+ std::move(ranker), std::move(parent_adjustment_info),
+ std::move(child_adjustment_info), result_spec, document_store);
+
+ // Retrieve docs outside of ResultStateManager critical section.
+ // Will enter ResultState critical section inside ResultRetriever.
+ auto [page_result, has_more_results] =
+ result_retriever.RetrieveNextPage(*result_state, current_time_ms);
+ if (!has_more_results) {
// No more pages, won't store ResultState, returns directly
- return PageResultState(
- std::move(page_result_document_hits), kInvalidNextPageToken,
- result_state.snippet_context(), num_previously_returned);
+ return std::make_pair(kInvalidNextPageToken, std::move(page_result));
}
- absl_ports::unique_lock l(&mutex_);
-
// ResultState has multiple pages, storing it
- SnippetContext snippet_context_copy = result_state.snippet_context();
- uint64_t next_page_token = Add(std::move(result_state));
+ int num_hits_to_add = 0;
+ {
+ // ResultState critical section
+ absl_ports::unique_lock l(&result_state->mutex);
+
+ result_state->scored_document_hits_ranker->TruncateHitsTo(max_total_hits_);
+ result_state->RegisterNumTotalHits(&num_total_hits_);
+ num_hits_to_add = result_state->scored_document_hits_ranker->size();
+ }
- return PageResultState(std::move(page_result_document_hits), next_page_token,
- std::move(snippet_context_copy),
- num_previously_returned);
-}
+ // It is fine to exit the ResultState critical section here: the state was
+ // just created above and only this thread (this call stack) has access to
+ // it, so it cannot change during the gap before we enter the
+ // ResultStateManager critical section.
+ uint64_t next_page_token = kInvalidNextPageToken;
+ {
+ // ResultStateManager critical section
+ absl_ports::unique_lock l(&mutex_);
+
+ // Remove expired result states first.
+ InternalInvalidateExpiredResultStates(kDefaultResultStateTtlInMs,
+ current_time_ms);
+ // Remove states to make room for this new state.
+ RemoveStatesIfNeeded(num_hits_to_add);
+ // Generate a new unique token and add it into result_state_map_.
+ next_page_token = Add(std::move(result_state), current_time_ms);
+ }
-uint64_t ResultStateManager::Add(ResultState result_state) {
- RemoveStatesIfNeeded();
+ return std::make_pair(next_page_token, std::move(page_result));
+}
+uint64_t ResultStateManager::Add(std::shared_ptr<ResultStateV2> result_state,
+ int64_t current_time_ms) {
uint64_t new_token = GetUniqueToken();
result_state_map_.emplace(new_token, std::move(result_state));
// Tracks the insertion order
- token_queue_.push(new_token);
+ token_queue_.push(std::make_pair(new_token, current_time_ms));
return new_token;
}
-libtextclassifier3::StatusOr<PageResultState> ResultStateManager::GetNextPage(
- uint64_t next_page_token) {
- absl_ports::unique_lock l(&mutex_);
-
- const auto& state_iterator = result_state_map_.find(next_page_token);
- if (state_iterator == result_state_map_.end()) {
- return absl_ports::NotFoundError("next_page_token not found");
+libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
+ResultStateManager::GetNextPage(uint64_t next_page_token,
+ const ResultRetrieverV2& result_retriever,
+ int64_t current_time_ms) {
+ std::shared_ptr<ResultStateV2> result_state = nullptr;
+ {
+ // ResultStateManager critical section
+ absl_ports::unique_lock l(&mutex_);
+
+ // Remove expired result states before fetching
+ InternalInvalidateExpiredResultStates(kDefaultResultStateTtlInMs,
+ current_time_ms);
+
+ const auto& state_iterator = result_state_map_.find(next_page_token);
+ if (state_iterator == result_state_map_.end()) {
+ return absl_ports::NotFoundError("next_page_token not found");
+ }
+ result_state = state_iterator->second;
}
- int num_returned = state_iterator->second.num_returned();
- std::vector<ScoredDocumentHit> result_of_page =
- state_iterator->second.GetNextPage();
- if (result_of_page.empty()) {
- // This shouldn't happen, all our active states should contain results, but
- // a sanity check here in case of any data inconsistency.
- InternalInvalidateResultState(next_page_token);
- return absl_ports::NotFoundError(
- "No more results, token has been invalidated.");
- }
+ // Retrieve docs outside of ResultStateManager critical section.
+ // Will enter ResultState critical section inside ResultRetriever.
+ auto [page_result, has_more_results] =
+ result_retriever.RetrieveNextPage(*result_state, current_time_ms);
- // Copies the SnippetContext in case the ResultState is invalidated.
- SnippetContext snippet_context_copy =
- state_iterator->second.snippet_context();
+ if (!has_more_results) {
+ {
+ // ResultStateManager critical section
+ absl_ports::unique_lock l(&mutex_);
- if (!state_iterator->second.HasMoreResults()) {
- InternalInvalidateResultState(next_page_token);
- }
+ InternalInvalidateResultState(next_page_token);
+ }
- return PageResultState(result_of_page, next_page_token,
- std::move(snippet_context_copy), num_returned);
+ next_page_token = kInvalidNextPageToken;
+ }
+ return std::make_pair(next_page_token, std::move(page_result));
}
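The two entry points above compose into a straightforward caller-side pagination loop. A hedged sketch of the intended usage (assumes a ready ranker, result_spec, document_store, result_retriever and clock; it mirrors the calls made in the tests later in this diff):

// Illustrative pagination over the V2 ResultStateManager API.
ICING_ASSIGN_OR_RETURN(
    auto page_info,  // std::pair<uint64_t, PageResult>
    result_state_manager.CacheAndRetrieveFirstPage(
        std::move(ranker), /*parent_adjustment_info=*/nullptr,
        /*child_adjustment_info=*/nullptr, result_spec, document_store,
        result_retriever, clock.GetSystemTimeMilliseconds()));
while (page_info.first != kInvalidNextPageToken) {
  // ... consume page_info.second.results ...
  ICING_ASSIGN_OR_RETURN(
      page_info,
      result_state_manager.GetNextPage(page_info.first, result_retriever,
                                       clock.GetSystemTimeMilliseconds()));
}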
void ResultStateManager::InvalidateResultState(uint64_t next_page_token) {
@@ -117,10 +156,16 @@ void ResultStateManager::InvalidateResultState(uint64_t next_page_token) {
void ResultStateManager::InvalidateAllResultStates() {
absl_ports::unique_lock l(&mutex_);
+ InternalInvalidateAllResultStates();
+}
+void ResultStateManager::InternalInvalidateAllResultStates() {
+ // We don't have to reset num_total_hits_ (to 0) here, since clearing
+ // result_state_map_ will "eventually" invoke the destructor of ResultState
+ // (which decrements num_total_hits_) and num_total_hits_ will become 0.
result_state_map_.clear();
invalidated_token_set_.clear();
- token_queue_ = {};
+ token_queue_ = std::queue<std::pair<uint64_t, int64_t>>();
}
uint64_t ResultStateManager::GetUniqueToken() {
@@ -136,24 +181,41 @@ uint64_t ResultStateManager::GetUniqueToken() {
return new_token;
}
-void ResultStateManager::RemoveStatesIfNeeded() {
+void ResultStateManager::RemoveStatesIfNeeded(int num_hits_to_add) {
if (result_state_map_.empty() || token_queue_.empty()) {
return;
}
- // Removes any tokens that were previously invalidated.
+ // 1. Check if this new result_state would take up the entire result state
+ // manager budget.
+ if (num_hits_to_add > max_total_hits_) {
+ // This single result state will exceed our budget. Drop everything else to
+ // accommodate it.
+ InternalInvalidateAllResultStates();
+ return;
+ }
+
+ // 2. Remove any tokens that were previously invalidated.
while (!token_queue_.empty() &&
- invalidated_token_set_.find(token_queue_.front()) !=
+ invalidated_token_set_.find(token_queue_.front().first) !=
invalidated_token_set_.end()) {
- invalidated_token_set_.erase(token_queue_.front());
+ invalidated_token_set_.erase(token_queue_.front().first);
token_queue_.pop();
}
- // Removes the oldest state
- if (result_state_map_.size() >= max_result_states_ && !token_queue_.empty()) {
- result_state_map_.erase(token_queue_.front());
+ // 3. If we're over budget, remove states from oldest to newest until we fit
+ // into our budget.
+ // Note: num_total_hits_ may not be decremented immediately after invalidating
+ // a result state, since other threads may still hold the shared pointer.
+ // Thus, we have to check if token_queue_ is empty or not, since it is
+ // possible that num_total_hits_ is non-zero and still greater than
+ // max_total_hits_ when token_queue_ is empty. It will still be decremented
+ // "eventually", once the last thread releases the shared pointer.
+ while (!token_queue_.empty() && num_total_hits_ > max_total_hits_) {
+ InternalInvalidateResultState(token_queue_.front().first);
token_queue_.pop();
}
+ invalidated_token_set_.clear();
}
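Stripped of locking, shared ownership, and TTLs, the eviction policy above is a budget check over a FIFO of tokens. A simplified, self-contained model (all names are hypothetical; in icing the decrement happens in ResultStateV2's destructor rather than inline):

#include <cstdint>
#include <queue>
#include <unordered_map>

void EvictForBudget(int num_hits_to_add, int max_total_hits,
                    int& num_total_hits,
                    std::queue<std::uint64_t>& tokens,
                    std::unordered_map<std::uint64_t, int>& state_sizes) {
  if (num_hits_to_add > max_total_hits) {
    // One oversized incoming state: drop everything to accommodate it.
    tokens = {};
    state_sizes.clear();
    num_total_hits = 0;
    return;
  }
  // Evict oldest first until the current total fits under the budget; the
  // incoming state's hits are accounted for by the caller afterwards.
  while (!tokens.empty() && num_total_hits > max_total_hits) {
    auto it = state_sizes.find(tokens.front());
    if (it != state_sizes.end()) {
      num_total_hits -= it->second;  // icing defers this to a destructor
      state_sizes.erase(it);
    }
    tokens.pop();
  }
}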
void ResultStateManager::InternalInvalidateResultState(uint64_t token) {
@@ -161,10 +223,35 @@ void ResultStateManager::InternalInvalidateResultState(uint64_t token) {
// invalidated_token_set_. The entry in token_queue_ can't be easily removed
// right now (may need O(n) time), so we leave it there and later completely
// remove the token in RemoveStatesIfNeeded().
- if (result_state_map_.erase(token) > 0) {
+ auto itr = result_state_map_.find(token);
+ if (itr != result_state_map_.end()) {
+ // We don't have to decrement num_total_hits_ here, since erasing the shared
+ // ptr instance will "eventually" invoke the destructor of ResultState and
+ // it will handle this.
+ result_state_map_.erase(itr);
invalidated_token_set_.insert(token);
}
}
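The "eventually" wording in these comments leans on shared ownership: the global hit counter drops only when the last std::shared_ptr<ResultStateV2> owner releases the state and its destructor runs. A minimal sketch of that accounting pattern (hypothetical CountedState type, not the real ResultStateV2):

#include <atomic>

// Hypothetical stand-in for ResultStateV2's hit accounting: the shared
// counter is decremented only when the last shared_ptr owner goes away.
class CountedState {
 public:
  explicit CountedState(int num_hits) : num_hits_(num_hits) {}
  ~CountedState() {
    if (counter_ != nullptr) counter_->fetch_sub(num_hits_);
  }
  void RegisterNumTotalHits(std::atomic<int>* counter) {
    counter_ = counter;
    counter_->fetch_add(num_hits_);
  }
 private:
  std::atomic<int>* counter_ = nullptr;
  const int num_hits_;
};

Erasing the map entry therefore removes only one owner; a concurrent GetNextPage still holding the shared_ptr keeps the hits counted until it returns.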
+void ResultStateManager::InternalInvalidateExpiredResultStates(
+ int64_t result_state_ttl, int64_t current_time_ms) {
+ while (!token_queue_.empty() &&
+ current_time_ms - token_queue_.front().second >= result_state_ttl) {
+ auto itr = result_state_map_.find(token_queue_.front().first);
+ if (itr != result_state_map_.end()) {
+ // We don't have to decrement num_total_hits_ here, since erasing the
+ // shared ptr instance will "eventually" invoke the destructor of
+ // ResultState and it will handle this.
+ result_state_map_.erase(itr);
+ } else {
+ // Since result_state_map_ and invalidated_token_set_ are mutually
+ // exclusive, we remove the token from invalidated_token_set_ only if it
+ // isn't present in result_state_map_.
+ invalidated_token_set_.erase(token_queue_.front().first);
+ }
+ token_queue_.pop();
+ }
+}
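As a concrete check of the expiry condition above: with kDefaultResultStateTtlInMs = 1LL * 60 * 60 * 1000 = 3,600,000 ms, a token pushed at current_time_ms = 1,000 satisfies current_time_ms - 1,000 >= 3,600,000 only at t >= 3,601,000 ms, so it is pruned on the first call at or after that time.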
+
} // namespace lib
} // namespace icing
diff --git a/icing/result/result-state-manager.h b/icing/result/result-state-manager.h
index eaf9eb5..a64ae2c 100644
--- a/icing/result/result-state-manager.h
+++ b/icing/result/result-state-manager.h
@@ -15,6 +15,8 @@
#ifndef ICING_RESULT_RESULT_STATE_MANAGER_H_
#define ICING_RESULT_RESULT_STATE_MANAGER_H_
+#include <atomic>
+#include <memory>
#include <queue>
#include <random>
#include <unordered_map>
@@ -22,10 +24,13 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/mutex.h"
-#include "icing/proto/scoring.pb.h"
#include "icing/proto/search.pb.h"
-#include "icing/result/page-result-state.h"
-#include "icing/result/result-state.h"
+#include "icing/result/page-result.h"
+#include "icing/result/result-adjustment-info.h"
+#include "icing/result/result-retriever-v2.h"
+#include "icing/result/result-state-v2.h"
+#include "icing/scoring/scored-document-hits-ranker.h"
+#include "icing/util/clock.h"
namespace icing {
namespace lib {
@@ -34,38 +39,62 @@ namespace lib {
// SearchResultProto.next_page_token.
inline constexpr uint64_t kInvalidNextPageToken = 0;
+// 1 hr as the default ttl for a ResultState after being pushed into
+// token_queue_.
+inline constexpr int64_t kDefaultResultStateTtlInMs = 1LL * 60 * 60 * 1000;
+
// Used to store and manage ResultState.
class ResultStateManager {
public:
- explicit ResultStateManager(int max_hits_per_query, int max_result_states);
+ explicit ResultStateManager(int max_total_hits,
+ const DocumentStore& document_store);
ResultStateManager(const ResultStateManager&) = delete;
ResultStateManager& operator=(const ResultStateManager&) = delete;
- // Ranks the results and returns the first page of them. The result object
- // PageResultState contains a next_page_token which can be used to fetch more
- // pages later. It will be set to a default value 0 if there're no more pages.
+ // Creates a new result state, retrieves and returns PageResult for the first
+ // page. Also caches the new result state and returns a next_page_token which
+ // can be used to fetch more pages from the same result state later. Before
+ // caching the result state, adjusts (truncates) the size and evicts some old
+ // result states if the cache size limit is exceeded. next_page_token will be
+ // set to the default value kInvalidNextPageToken if there are no more pages.
+ //
+ // NOTE: parent_adjustment_info and child_adjustment_info can be nullptr if
+ // there is no requirement to apply adjustment (snippet, projection) to
+ // them.
//
- // NOTE: it's caller's responsibility not to call this method with the same
- // ResultState more than once, otherwise duplicate states will be stored
- // internally.
+ // NOTE: it is possible to have an empty result for the first page even if the
+ // ranker was not empty before the retrieval, since GroupResultLimiter
+ // may filter out all docs. In this case, the first page is also the
+ // last page and next_page_token will be set to kInvalidNextPageToken.
//
// Returns:
- // A PageResultState on success
- // INVALID_ARGUMENT if the input state contains no results
- libtextclassifier3::StatusOr<PageResultState> RankAndPaginate(
- ResultState result_state) ICING_LOCKS_EXCLUDED(mutex_);
+ // A token and PageResult wrapped by std::pair on success
+ // INVALID_ARGUMENT if the input ranker is null or contains no results
+ libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>>
+ CacheAndRetrieveFirstPage(
+ std::unique_ptr<ScoredDocumentHitsRanker> ranker,
+ std::unique_ptr<ResultAdjustmentInfo> parent_adjustment_info,
+ std::unique_ptr<ResultAdjustmentInfo> child_adjustment_info,
+ const ResultSpecProto& result_spec, const DocumentStore& document_store,
+ const ResultRetrieverV2& result_retriever, int64_t current_time_ms)
+ ICING_LOCKS_EXCLUDED(mutex_);
- // Retrieves and returns the next page of results wrapped in PageResultState.
+ // Retrieves and returns PageResult for the next page.
// The returned results won't exist in ResultStateManager anymore. If the
// query has no more pages after this retrieval, the input token will be
// invalidated.
//
+ // NOTE: it is possible to have an empty result for the last page even if the
+ //       ranker was not empty before the retrieval, since GroupResultLimiter
+ //       may filter out all remaining docs.
+ //
// Returns:
- // PageResultState on success, guaranteed to have non-empty results
+ // A token and PageResult wrapped by std::pair on success
// NOT_FOUND if failed to find any more results
- libtextclassifier3::StatusOr<PageResultState> GetNextPage(
- uint64_t next_page_token) ICING_LOCKS_EXCLUDED(mutex_);
+ libtextclassifier3::StatusOr<std::pair<uint64_t, PageResult>> GetNextPage(
+ uint64_t next_page_token, const ResultRetrieverV2& result_retriever,
+ int64_t current_time_ms) ICING_LOCKS_EXCLUDED(mutex_);
// Invalidates the result state associated with the given next-page token.
void InvalidateResultState(uint64_t next_page_token)
@@ -74,23 +103,30 @@ class ResultStateManager {
// Invalidates all result states / tokens currently in ResultStateManager.
void InvalidateAllResultStates() ICING_LOCKS_EXCLUDED(mutex_);
+ int num_total_hits() const { return num_total_hits_; }
+
private:
absl_ports::shared_mutex mutex_;
- // The maximum number of scored document hits to return for a query. When we
- // have more than the maximum number, extra hits will be truncated.
- const int max_hits_per_query_;
+ const DocumentStore& document_store_;
+
+ // The maximum number of scored document hits that all result states may
+ // have. When a new result state is added such that num_total_hits_ would
+ // exceed max_total_hits_, the oldest result states are evicted until
+ // num_total_hits_ is below max_total_hits_.
+ const int max_total_hits_;
- // The maximum number of result states. When we have more than the maximum
- // number, the oldest / firstly added result state will be removed.
- const int max_result_states_;
+ // The total number of scored document hits across all result states
+ // currently held by the result state manager.
+ std::atomic<int> num_total_hits_;
// A hash map of (next-page token -> result state)
- std::unordered_map<uint64_t, ResultState> result_state_map_
+ std::unordered_map<uint64_t, std::shared_ptr<ResultStateV2>> result_state_map_
ICING_GUARDED_BY(mutex_);
- // A queue used to track the insertion order of tokens
- std::queue<uint64_t> token_queue_ ICING_GUARDED_BY(mutex_);
+ // A queue used to track the insertion order of tokens with pushed timestamps.
+ std::queue<std::pair<uint64_t, int64_t>> token_queue_
+ ICING_GUARDED_BY(mutex_);
// A set to temporarily store the invalidated tokens before they're finally
// removed from token_queue_. We store the invalidated tokens to ensure the
@@ -105,20 +141,37 @@ class ResultStateManager {
// currently valid tokens. When the maximum number of result states is
// reached, the oldest / firstly added result state will be removed to make
// room for the new state.
- uint64_t Add(ResultState result_state) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ uint64_t Add(std::shared_ptr<ResultStateV2> result_state,
+ int64_t current_time_ms) ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Helper method to generate a next-page token that is unique among all
// existing tokens in token_queue_.
uint64_t GetUniqueToken() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
- // Helper method to remove old states to make room for incoming states.
- void RemoveStatesIfNeeded() ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+ // Helper method to remove old states to make room for an incoming state
+ // with num_hits_to_add hits.
+ void RemoveStatesIfNeeded(int num_hits_to_add)
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
// Helper method to remove a result state from result_state_map_, the token
// will then be temporarily kept in invalidated_token_set_ until it's finally
// removed from token_queue_.
void InternalInvalidateResultState(uint64_t token)
ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
+
+ // Internal method to invalidate all result states / tokens currently in
+ // ResultStateManager. We need this separate method so that other public
+ // methods don't need to call InvalidateAllResultStates(). Public methods
+ // calling each other may cause deadlock issues.
+ void InternalInvalidateAllResultStates()
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
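The deadlock the comment above warns about is the standard non-reentrant-mutex self-call; a minimal hypothetical illustration with std::mutex (icing's absl_ports mutex behaves analogously):

#include <mutex>

// Hypothetical: why public methods here avoid calling each other. With a
// non-reentrant mutex, the inner call blocks on the lock the outer call
// already holds.
class Manager {
 public:
  void PublicA() {
    std::lock_guard<std::mutex> l(mu_);
    PublicB();  // deadlocks: mu_ is already held by this thread
  }
  void PublicB() { std::lock_guard<std::mutex> l(mu_); }
 private:
  std::mutex mu_;
};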
+
+ // Internal method to invalidate and remove expired result states / tokens
+ // currently in ResultStateManager that were created before
+ // current_time - result_state_ttl.
+ void InternalInvalidateExpiredResultStates(int64_t result_state_ttl,
+ int64_t current_time_ms)
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex_);
};
} // namespace lib
diff --git a/icing/result/result-state-manager_test.cc b/icing/result/result-state-manager_test.cc
index 6defa6f..75d1d93 100644
--- a/icing/result/result-state-manager_test.cc
+++ b/icing/result/result-state-manager_test.cc
@@ -14,22 +14,39 @@
#include "icing/result/result-state-manager.h"
+#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
#include "icing/portable/equals-proto.h"
+#include "icing/result/page-result.h"
+#include "icing/result/result-adjustment-info.h"
+#include "icing/result/result-retriever-v2.h"
+#include "icing/schema/schema-store.h"
+#include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
+#include "icing/scoring/scored-document-hits-ranker.h"
+#include "icing/store/document-store.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/clock.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
namespace {
+
using ::icing::lib::portable_equals_proto::EqualsProto;
-using ::testing::ElementsAre;
using ::testing::Eq;
-using ::testing::Gt;
using ::testing::IsEmpty;
-
-ScoredDocumentHit CreateScoredDocumentHit(DocumentId document_id) {
- return ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1);
-}
+using ::testing::Not;
+using ::testing::SizeIs;
+using PageResultInfo = std::pair<uint64_t, PageResult>;
ScoringSpecProto CreateScoringSpec() {
ScoringSpecProto scoring_spec;
@@ -37,441 +54,1521 @@ ScoringSpecProto CreateScoringSpec() {
return scoring_spec;
}
-ResultSpecProto CreateResultSpec(int num_per_page) {
+ResultSpecProto CreateResultSpec(
+ int num_per_page, ResultSpecProto::ResultGroupingType result_group_type) {
ResultSpecProto result_spec;
+ result_spec.set_result_group_type(result_group_type);
result_spec.set_num_per_page(num_per_page);
return result_spec;
}
-ResultState CreateResultState(
- const std::vector<ScoredDocumentHit>& scored_document_hits,
- int num_per_page) {
- return ResultState(scored_document_hits, /*query_terms=*/{},
- SearchSpecProto::default_instance(), CreateScoringSpec(),
- CreateResultSpec(num_per_page));
+DocumentProto CreateDocument(int id) {
+ return DocumentBuilder()
+ .SetNamespace("namespace")
+ .SetUri(std::to_string(id))
+ .SetSchema("Document")
+ .SetCreationTimestampMs(1574365086666 + id)
+ .SetScore(1)
+ .Build();
}
-TEST(ResultStateManagerTest, ShouldRankAndPaginateOnePage) {
- ResultState original_result_state =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/2),
- CreateScoredDocumentHit(/*document_id=*/3)},
- /*num_per_page=*/10);
+class ResultStateManagerTest : public testing::Test {
+ protected:
+ ResultStateManagerTest() : test_dir_(GetTestTempDir() + "/icing") {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ }
+
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ clock_ = std::make_unique<FakeClock>();
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, clock_.get()));
+ SchemaProto schema;
+ schema.add_types()->set_schema_type("Document");
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ std::move(schema), /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/10000));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult result,
+ DocumentStore::Create(
+ &filesystem_, test_dir_, clock_.get(), schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ result_retriever_, ResultRetrieverV2::Create(
+ document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ clock_.reset();
+ }
+
+ std::pair<ScoredDocumentHit, DocumentProto> AddScoredDocument(
+ DocumentId document_id) {
+ DocumentProto document;
+ document.set_namespace_("namespace");
+ document.set_uri(std::to_string(document_id));
+ document.set_schema("Document");
+ document.set_creation_timestamp_ms(1574365086666 + document_id);
+ document_store_->Put(document);
+ return std::make_pair(
+ ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1),
+ std::move(document));
+ }
+
+ std::pair<std::vector<ScoredDocumentHit>, std::vector<DocumentProto>>
+ AddScoredDocuments(const std::vector<DocumentId>& document_ids) {
+ std::vector<ScoredDocumentHit> scored_document_hits;
+ std::vector<DocumentProto> document_protos;
+
+ for (DocumentId document_id : document_ids) {
+ std::pair<ScoredDocumentHit, DocumentProto> pair =
+ AddScoredDocument(document_id);
+ scored_document_hits.emplace_back(std::move(pair.first));
+ document_protos.emplace_back(std::move(pair.second));
+ }
+
+ std::reverse(document_protos.begin(), document_protos.end());
+
+ return std::make_pair(std::move(scored_document_hits),
+ std::move(document_protos));
+ }
+
+ FakeClock* clock() { return clock_.get(); }
+ const FakeClock* clock() const { return clock_.get(); }
+
+ DocumentStore& document_store() { return *document_store_; }
+ const DocumentStore& document_store() const { return *document_store_; }
+
+ SchemaStore& schema_store() { return *schema_store_; }
+ const SchemaStore& schema_store() const { return *schema_store_; }
+
+ const ResultRetrieverV2& result_retriever() const {
+ return *result_retriever_;
+ }
+
+ private:
+ Filesystem filesystem_;
+ const std::string test_dir_;
+ std::unique_ptr<FakeClock> clock_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<DocumentStore> document_store_;
+ std::unique_ptr<ResultRetrieverV2> result_retriever_;
+};
+
+TEST_F(ResultStateManagerTest, ShouldCacheAndRetrieveFirstPageOnePage) {
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store().Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store().Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store().Put(CreateDocument(/*id=*/3)));
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, kSectionIdMaskNone, /*score=*/1},
+ {document_id2, kSectionIdMaskNone, /*score=*/1},
+ {document_id3, kSectionIdMaskNone, /*score=*/1}};
+ std::unique_ptr<ScoredDocumentHitsRanker> ranker = std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true);
ResultStateManager result_state_manager(
- /*max_hits_per_query=*/std::numeric_limits<int>::max(),
- /*max_result_states=*/std::numeric_limits<int>::max());
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state,
- result_state_manager.RankAndPaginate(std::move(original_result_state)));
-
- EXPECT_THAT(page_result_state.next_page_token, Eq(kInvalidNextPageToken));
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store());
- // Should get the original scored document hits
- EXPECT_THAT(
- page_result_state.scored_document_hits,
- ElementsAre(
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/1))));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::move(ranker), /*parent_adjustment_info=*/nullptr,
+ /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/10, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
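+  // PageResultInfo pairs the next-page token (first) with the retrieved page
+  // (second); kInvalidNextPageToken indicates there are no more pages.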
+ EXPECT_THAT(page_result_info.first, Eq(kInvalidNextPageToken));
+
+  // Should get all three documents on the single page.
+ ASSERT_THAT(page_result_info.second.results, SizeIs(3));
+ EXPECT_THAT(page_result_info.second.results.at(0).document(),
+ EqualsProto(CreateDocument(/*id=*/3)));
+ EXPECT_THAT(page_result_info.second.results.at(1).document(),
+ EqualsProto(CreateDocument(/*id=*/2)));
+ EXPECT_THAT(page_result_info.second.results.at(2).document(),
+ EqualsProto(CreateDocument(/*id=*/1)));
}
-TEST(ResultStateManagerTest, ShouldRankAndPaginateMultiplePages) {
- ResultState original_result_state =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/2),
- CreateScoredDocumentHit(/*document_id=*/3),
- CreateScoredDocumentHit(/*document_id=*/4),
- CreateScoredDocumentHit(/*document_id=*/5)},
- /*num_per_page=*/2);
+TEST_F(ResultStateManagerTest, ShouldCacheAndRetrieveFirstPageMultiplePages) {
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store().Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store().Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store().Put(CreateDocument(/*id=*/3)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store().Put(CreateDocument(/*id=*/4)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ document_store().Put(CreateDocument(/*id=*/5)));
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, kSectionIdMaskNone, /*score=*/1},
+ {document_id2, kSectionIdMaskNone, /*score=*/1},
+ {document_id3, kSectionIdMaskNone, /*score=*/1},
+ {document_id4, kSectionIdMaskNone, /*score=*/1},
+ {document_id5, kSectionIdMaskNone, /*score=*/1}};
+ std::unique_ptr<ScoredDocumentHitsRanker> ranker = std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true);
ResultStateManager result_state_manager(
- /*max_hits_per_query=*/std::numeric_limits<int>::max(),
- /*max_result_states=*/std::numeric_limits<int>::max());
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store());
// First page, 2 results
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(original_result_state)));
- EXPECT_THAT(
- page_result_state1.scored_document_hits,
- ElementsAre(
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4))));
-
- uint64_t next_page_token = page_result_state1.next_page_token;
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::move(ranker), /*parent_adjustment_info=*/nullptr,
+ /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ EXPECT_THAT(page_result_info1.first, Not(Eq(kInvalidNextPageToken)));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(2));
+ EXPECT_THAT(page_result_info1.second.results.at(0).document(),
+ EqualsProto(CreateDocument(/*id=*/5)));
+ EXPECT_THAT(page_result_info1.second.results.at(1).document(),
+ EqualsProto(CreateDocument(/*id=*/4)));
+
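+  // The same token is used to fetch every remaining page of this state.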
+ uint64_t next_page_token = page_result_info1.first;
// Second page, 2 results
- ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state2,
- result_state_manager.GetNextPage(next_page_token));
- EXPECT_THAT(
- page_result_state2.scored_document_hits,
- ElementsAre(
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2))));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info2,
+ result_state_manager.GetNextPage(next_page_token, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ EXPECT_THAT(page_result_info2.first, Eq(next_page_token));
+ ASSERT_THAT(page_result_info2.second.results, SizeIs(2));
+ EXPECT_THAT(page_result_info2.second.results.at(0).document(),
+ EqualsProto(CreateDocument(/*id=*/3)));
+ EXPECT_THAT(page_result_info2.second.results.at(1).document(),
+ EqualsProto(CreateDocument(/*id=*/2)));
// Third page, 1 result
- ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state3,
- result_state_manager.GetNextPage(next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(
- CreateScoredDocumentHit(/*document_id=*/1))));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info3,
+ result_state_manager.GetNextPage(next_page_token, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ EXPECT_THAT(page_result_info3.first, Eq(kInvalidNextPageToken));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(CreateDocument(/*id=*/1)));
// No results
- EXPECT_THAT(result_state_manager.GetNextPage(next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(
+ result_state_manager.GetNextPage(next_page_token, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST(ResultStateManagerTest, EmptyStateShouldReturnError) {
- ResultState empty_result_state = CreateResultState({}, /*num_per_page=*/1);
-
+TEST_F(ResultStateManagerTest, NullRankerShouldReturnError) {
ResultStateManager result_state_manager(
- /*max_hits_per_query=*/std::numeric_limits<int>::max(),
- /*max_result_states=*/std::numeric_limits<int>::max());
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store());
+
EXPECT_THAT(
- result_state_manager.RankAndPaginate(std::move(empty_result_state)),
+ result_state_manager.CacheAndRetrieveFirstPage(
+ /*ranker=*/nullptr, /*parent_adjustment_info=*/nullptr,
+ /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST(ResultStateManagerTest, ShouldInvalidateOneToken) {
- ResultState result_state1 =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/2),
- CreateScoredDocumentHit(/*document_id=*/3)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/4),
- CreateScoredDocumentHit(/*document_id=*/5),
- CreateScoredDocumentHit(/*document_id=*/6)},
- /*num_per_page=*/1);
+TEST_F(ResultStateManagerTest, EmptyRankerShouldReturnEmptyFirstPage) {
+ ResultStateManager result_state_manager(
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::vector<ScoredDocumentHit>(), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ EXPECT_THAT(page_result_info.first, Eq(kInvalidNextPageToken));
+ EXPECT_THAT(page_result_info.second.results, IsEmpty());
+}
+
+TEST_F(ResultStateManagerTest, ShouldAllowEmptyFirstPage) {
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store().Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store().Put(CreateDocument(/*id=*/2)));
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, kSectionIdMaskNone, /*score=*/1},
+ {document_id2, kSectionIdMaskNone, /*score=*/1}};
ResultStateManager result_state_manager(
- /*max_hits_per_query=*/std::numeric_limits<int>::max(),
- /*max_result_states=*/std::numeric_limits<int>::max());
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store());
+
+ // Create a ResultSpec that limits "namespace" to 0 results.
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(0);
+ entry->set_namespace_("namespace");
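+  // All of the test documents live in "namespace", so this grouping filters
+  // out every hit.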
+
+ // First page, no result.
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ // If the first page has no result, then it should be the last page.
+ EXPECT_THAT(page_result_info.first, Eq(kInvalidNextPageToken));
+ EXPECT_THAT(page_result_info.second.results, IsEmpty());
+}
+
+TEST_F(ResultStateManagerTest, ShouldAllowEmptyLastPage) {
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store().Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store().Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store().Put(CreateDocument(/*id=*/3)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store().Put(CreateDocument(/*id=*/4)));
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ {document_id1, kSectionIdMaskNone, /*score=*/1},
+ {document_id2, kSectionIdMaskNone, /*score=*/1},
+ {document_id3, kSectionIdMaskNone, /*score=*/1},
+ {document_id4, kSectionIdMaskNone, /*score=*/1}};
+
+ ResultStateManager result_state_manager(
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store());
+
+ // Create a ResultSpec that limits "namespace" to 2 results.
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(2);
+ entry->set_namespace_("namespace");
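+  // All four hits live in "namespace", so at most two results total survive
+  // the group result limiter.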
+
+ // First page, 2 results.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ EXPECT_THAT(page_result_info1.first, Not(Eq(kInvalidNextPageToken)));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(2));
+ EXPECT_THAT(page_result_info1.second.results.at(0).document(),
+ EqualsProto(CreateDocument(/*id=*/4)));
+ EXPECT_THAT(page_result_info1.second.results.at(1).document(),
+ EqualsProto(CreateDocument(/*id=*/3)));
+
+ uint64_t next_page_token = page_result_info1.first;
+
+  // Second page: all remaining documents will be filtered out by the group
+  // result limiter, so we should get an empty page.
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.GetNextPage(next_page_token, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ EXPECT_THAT(page_result_info2.first, Eq(kInvalidNextPageToken));
+ EXPECT_THAT(page_result_info2.second.results, IsEmpty());
+}
- result_state_manager.InvalidateResultState(
- page_result_state1.next_page_token);
+TEST_F(ResultStateManagerTest,
+ ShouldInvalidateExpiredTokensWhenCacheAndRetrieveFirstPage) {
+ auto [scored_document_hits1, document_protos1] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2});
+ auto [scored_document_hits2, document_protos2] = AddScoredDocuments(
+ {/*document_id=*/3, /*document_id=*/4, /*document_id=*/5});
- // page_result_state1.next_page_token() shouldn't be found
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ ResultStateManager result_state_manager(
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store());
- // page_result_state2.next_page_token() should still exist
+ SectionRestrictQueryTermsMap query_terms;
+ SearchSpecProto search_spec;
+ ScoringSpecProto scoring_spec = CreateScoringSpec();
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE);
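+  // search_spec, scoring_spec and query_terms only feed the
+  // ResultAdjustmentInfo attached to each cached state below.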
+
+  // Set time to 1s and add state 1.
+ clock()->SetSystemTimeMilliseconds(1000);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(search_spec, scoring_spec,
+ result_spec, &schema_store(),
+ query_terms),
+ /*child_adjustment_info=*/nullptr, result_spec, document_store(),
+ result_retriever(), clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info1.first, Not(Eq(kInvalidNextPageToken)));
+
+  // Set time to 1hr1s (state 1's TTL has elapsed) and add state 2.
+ clock()->SetSystemTimeMilliseconds(kDefaultResultStateTtlInMs + 1000);
ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state2,
- result_state_manager.GetNextPage(page_result_state2.next_page_token));
- EXPECT_THAT(page_result_state2.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(
- CreateScoredDocumentHit(/*document_id=*/5))));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*parent_adjustment_info=*/
+ std::make_unique<ResultAdjustmentInfo>(search_spec, scoring_spec,
+ result_spec, &schema_store(),
+ query_terms),
+ /*child_adjustment_info=*/nullptr, result_spec, document_store(),
+ result_retriever(), clock()->GetSystemTimeMilliseconds()));
+
+ // Calling CacheAndRetrieveFirstPage() on state 2 should invalidate the
+ // expired state 1 internally.
+ //
+ // We test the behavior by setting time back to 1s, to make sure the
+ // invalidation of state 1 was done by the previous
+ // CacheAndRetrieveFirstPage() instead of the following GetNextPage().
+ clock()->SetSystemTimeMilliseconds(1000);
+ // page_result_info1's token (page_result_info1.first) shouldn't be found.
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST(ResultStateManagerTest, ShouldInvalidateAllTokens) {
- ResultState result_state1 =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/2),
- CreateScoredDocumentHit(/*document_id=*/3)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/4),
- CreateScoredDocumentHit(/*document_id=*/5),
- CreateScoredDocumentHit(/*document_id=*/6)},
- /*num_per_page=*/1);
+TEST_F(ResultStateManagerTest,
+ ShouldInvalidateExpiredTokensWhenGetNextPageOnOthers) {
+ auto [scored_document_hits1, document_protos1] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2});
+ auto [scored_document_hits2, document_protos2] = AddScoredDocuments(
+ {/*document_id=*/3, /*document_id=*/4, /*document_id=*/5});
ResultStateManager result_state_manager(
- /*max_hits_per_query=*/std::numeric_limits<int>::max(),
- /*max_result_states=*/std::numeric_limits<int>::max());
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store());
+
+  // Set time to 1s and add state 1.
+ clock()->SetSystemTimeMilliseconds(1000);
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info1.first, Not(Eq(kInvalidNextPageToken)));
+
+  // Set time to 2s and add state 2.
+ clock()->SetSystemTimeMilliseconds(2000);
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info2.first, Not(Eq(kInvalidNextPageToken)));
+
+  // 1. Set time to 1hr1s.
+ // 2. Call GetNextPage() on state 2. It should correctly invalidate the
+ // expired state 1.
+ // 3. Then calling GetNextPage() on state 1 shouldn't get anything.
+ clock()->SetSystemTimeMilliseconds(kDefaultResultStateTtlInMs + 1000);
+  // page_result_info2's token (page_result_info2.first) should be found.
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info2,
+ result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ // We test the behavior by setting time back to 2s, to make sure the
+ // invalidation of state 1 was done by the previous GetNextPage() instead of
+ // the following GetNextPage().
+ clock()->SetSystemTimeMilliseconds(2000);
+ // page_result_info1's token (page_result_info1.first) shouldn't be found.
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
- result_state_manager.InvalidateAllResultStates();
+TEST_F(ResultStateManagerTest,
+ ShouldInvalidateExpiredTokensWhenGetNextPageOnItself) {
+ auto [scored_document_hits1, document_protos1] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2});
+ auto [scored_document_hits2, document_protos2] = AddScoredDocuments(
+ {/*document_id=*/3, /*document_id=*/4, /*document_id=*/5});
- // page_result_state1.next_page_token() shouldn't be found
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ ResultStateManager result_state_manager(
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store());
- // page_result_state2.next_page_token() shouldn't be found
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state2.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  // Set time to 1s and add the state.
+ clock()->SetSystemTimeMilliseconds(1000);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info.first, Not(Eq(kInvalidNextPageToken)));
+
+  // 1. Set time to 1hr1s.
+ // 2. Then calling GetNextPage() on the state shouldn't get anything.
+ clock()->SetSystemTimeMilliseconds(kDefaultResultStateTtlInMs + 1000);
+ // page_result_info's token (page_result_info.first) shouldn't be found.
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST(ResultStateManagerTest, ShouldRemoveOldestResultState) {
- ResultState result_state1 =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/2)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/3),
- CreateScoredDocumentHit(/*document_id=*/4)},
- /*num_per_page=*/1);
- ResultState result_state3 =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/5),
- CreateScoredDocumentHit(/*document_id=*/6)},
- /*num_per_page=*/1);
+TEST_F(ResultStateManagerTest, ShouldInvalidateOneToken) {
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store().Put(CreateDocument(/*id=*/1)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store().Put(CreateDocument(/*id=*/2)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store().Put(CreateDocument(/*id=*/3)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id4,
+ document_store().Put(CreateDocument(/*id=*/4)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id5,
+ document_store().Put(CreateDocument(/*id=*/5)));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id6,
+ document_store().Put(CreateDocument(/*id=*/6)));
+ std::vector<ScoredDocumentHit> scored_document_hits1 = {
+ {document_id1, kSectionIdMaskNone, /*score=*/1},
+ {document_id2, kSectionIdMaskNone, /*score=*/1},
+ {document_id3, kSectionIdMaskNone, /*score=*/1}};
+ std::vector<ScoredDocumentHit> scored_document_hits2 = {
+ {document_id4, kSectionIdMaskNone, /*score=*/1},
+ {document_id5, kSectionIdMaskNone, /*score=*/1},
+ {document_id6, kSectionIdMaskNone, /*score=*/1}};
ResultStateManager result_state_manager(
- /*max_hits_per_query=*/std::numeric_limits<int>::max(),
- /*max_result_states=*/2);
- ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store());
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
- // Adding state 3 should cause state 1 to be removed.
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state3,
- result_state_manager.RankAndPaginate(std::move(result_state3)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+  // Invalidate the first result state by its token.
+ result_state_manager.InvalidateResultState(page_result_info1.first);
+
+  // page_result_info1's token (page_result_info1.first) shouldn't be found.
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state1.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+  // page_result_info2's token (page_result_info2.first) should still exist.
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info2,
+ result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+  // Should get the next page of state 2 (its second-ranked document).
+ ASSERT_THAT(page_result_info2.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info2.second.results.at(0).document(),
+ EqualsProto(CreateDocument(/*id=*/5)));
+}
+
+TEST_F(ResultStateManagerTest, ShouldInvalidateAllTokens) {
+ auto [scored_document_hits1, document_protos1] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2});
+ auto [scored_document_hits2, document_protos2] = AddScoredDocuments(
+ {/*document_id=*/3, /*document_id=*/4, /*document_id=*/5});
+
+ ResultStateManager result_state_manager(
+ /*max_total_hits=*/std::numeric_limits<int>::max(), document_store());
ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state2,
- result_state_manager.GetNextPage(page_result_state2.next_page_token));
- EXPECT_THAT(page_result_state2.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit(
- /*document_id=*/3))));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state3,
- result_state_manager.GetNextPage(page_result_state3.next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit(
- /*document_id=*/5))));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ result_state_manager.InvalidateAllResultStates();
+
+  // page_result_info1's token (page_result_info1.first) shouldn't be found.
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+  // page_result_info2's token (page_result_info2.first) shouldn't be found.
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST(ResultStateManagerTest,
- PreviouslyInvalidatedResultStateShouldNotBeCounted) {
- ResultState result_state1 =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/2)},
- /*num_per_page=*/1);
- ResultState result_state2 =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/3),
- CreateScoredDocumentHit(/*document_id=*/4)},
- /*num_per_page=*/1);
- ResultState result_state3 =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/5),
- CreateScoredDocumentHit(/*document_id=*/6)},
- /*num_per_page=*/1);
- ResultState result_state4 =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/7),
- CreateScoredDocumentHit(/*document_id=*/8)},
- /*num_per_page=*/1);
+TEST_F(ResultStateManagerTest, ShouldRemoveOldestResultState) {
+ auto [scored_document_hits1, document_protos1] =
+ AddScoredDocuments({/*document_id=*/0, /*document_id=*/1});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/2, /*document_id=*/3});
+ auto [scored_document_hits3, document_protos3] =
+ AddScoredDocuments({/*document_id=*/4, /*document_id=*/5});
+
+ ResultStateManager result_state_manager(/*max_total_hits=*/2,
+ document_store());
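+
+  // Each result state caches one hit after its first page is returned, so a
+  // max_total_hits of 2 can hold at most two states at a time.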
- ResultStateManager result_state_manager(
- /*max_hits_per_query=*/std::numeric_limits<int>::max(),
- /*max_result_states=*/3);
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(result_state1)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state2,
- result_state_manager.RankAndPaginate(std::move(result_state2)));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ // Adding state 3 should cause state 1 to be removed.
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state3,
- result_state_manager.RankAndPaginate(std::move(result_state3)));
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info2,
+ result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info2.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info2.second.results.at(0).document(),
+ EqualsProto(document_protos2.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info3,
+ result_state_manager.GetNextPage(
+ page_result_info3.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(1)));
+}
- // Invalidates state 2, so that the number of valid tokens becomes 2.
- result_state_manager.InvalidateResultState(
- page_result_state2.next_page_token);
+TEST_F(ResultStateManagerTest,
+ InvalidatedResultStateShouldDecreaseCurrentHitsCount) {
+ auto [scored_document_hits1, document_protos1] =
+ AddScoredDocuments({/*document_id=*/0, /*document_id=*/1});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/2, /*document_id=*/3});
+ auto [scored_document_hits3, document_protos3] =
+ AddScoredDocuments({/*document_id=*/4, /*document_id=*/5});
+
+ // Add the first three states. Remember, the first page for each result state
+ // won't be cached (since it is returned immediately from
+ // CacheAndRetrieveFirstPage). Each result state has a page size of 1 and a
+  // result set of 2 hits, so each result state will take up one hit of our
+  // three-hit budget.
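+  // (Three states x one cached hit each exactly fills the three-hit budget.)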
+ ResultStateManager result_state_manager(/*max_total_hits=*/3,
+ document_store());
- // Adding state 4 shouldn't affect rest of the states
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state4,
- result_state_manager.RankAndPaginate(std::move(result_state4)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state1,
- result_state_manager.GetNextPage(page_result_state1.next_page_token));
- EXPECT_THAT(page_result_state1.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit(
- /*document_id=*/1))));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
- EXPECT_THAT(
- result_state_manager.GetNextPage(page_result_state2.next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+  // Invalidates state 2, so that the number of hits currently cached is
+  // decremented to 2.
+ result_state_manager.InvalidateResultState(page_result_info2.first);
+
+ // If invalidating state 2 correctly decremented the current hit count to 2,
+ // then adding state 4 should still be within our budget and no other result
+ // states should be evicted.
+ auto [scored_document_hits4, document_protos4] =
+ AddScoredDocuments({/*document_id=*/6, /*document_id=*/7});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info4,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits4), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info1,
+ result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info1.second.results.at(0).document(),
+ EqualsProto(document_protos1.at(1)));
+
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info3,
+ result_state_manager.GetNextPage(
+ page_result_info3.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info4,
+ result_state_manager.GetNextPage(
+ page_result_info4.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info4.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info4.second.results.at(0).document(),
+ EqualsProto(document_protos4.at(1)));
+}
+
+TEST_F(ResultStateManagerTest,
+ InvalidatedAllResultStatesShouldResetCurrentHitCount) {
+ auto [scored_document_hits1, document_protos1] =
+ AddScoredDocuments({/*document_id=*/0, /*document_id=*/1});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/2, /*document_id=*/3});
+ auto [scored_document_hits3, document_protos3] =
+ AddScoredDocuments({/*document_id=*/4, /*document_id=*/5});
+
+ // Add the first three states. Remember, the first page for each result state
+ // won't be cached (since it is returned immediately from
+ // CacheAndRetrieveFirstPage). Each result state has a page size of 1 and a
+  // result set of 2 hits, so each result state will take up one hit of our
+  // three-hit budget.
+ ResultStateManager result_state_manager(/*max_total_hits=*/3,
+ document_store());
ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state3,
- result_state_manager.GetNextPage(page_result_state3.next_page_token));
- EXPECT_THAT(page_result_state3.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit(
- /*document_id=*/5))));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
ICING_ASSERT_OK_AND_ASSIGN(
- page_result_state4,
- result_state_manager.GetNextPage(page_result_state4.next_page_token));
- EXPECT_THAT(page_result_state4.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(CreateScoredDocumentHit(
- /*document_id=*/7))));
-}
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
-TEST(ResultStateManagerTest, ShouldGetSnippetContext) {
- ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
- result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
- result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
- result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ // Invalidates all states so that the current hit count will be 0.
+ result_state_manager.InvalidateAllResultStates();
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ // If invalidating all states correctly reset the current hit count to 0,
+  // then adding states 4, 5, and 6 should still be within our budget and no
+  // other result states should be evicted.
+ auto [scored_document_hits4, document_protos4] =
+ AddScoredDocuments({/*document_id=*/6, /*document_id=*/7});
+ auto [scored_document_hits5, document_protos5] =
+ AddScoredDocuments({/*document_id=*/8, /*document_id=*/9});
+ auto [scored_document_hits6, document_protos6] =
+ AddScoredDocuments({/*document_id=*/10, /*document_id=*/11});
- SectionRestrictQueryTermsMap query_terms_map;
- query_terms_map.emplace("term1", std::unordered_set<std::string>());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info4,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits4), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
- ResultState original_result_state = ResultState(
- /*scored_document_hits=*/{CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/2)},
- query_terms_map, search_spec, CreateScoringSpec(), result_spec);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info5,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits5), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
- ResultStateManager result_state_manager(
- /*max_hits_per_query=*/std::numeric_limits<int>::max(),
- /*max_result_states=*/std::numeric_limits<int>::max());
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state,
- result_state_manager.RankAndPaginate(std::move(original_result_state)));
+ PageResultInfo page_result_info6,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits6), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- ASSERT_THAT(page_result_state.next_page_token, Gt(kInvalidNextPageToken));
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info3.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- EXPECT_THAT(page_result_state.snippet_context.match_type,
- Eq(TermMatchType::EXACT_ONLY));
- EXPECT_TRUE(page_result_state.snippet_context.query_terms.find("term1") !=
- page_result_state.snippet_context.query_terms.end());
- EXPECT_THAT(page_result_state.snippet_context.snippet_spec,
- EqualsProto(result_spec.snippet_spec()));
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info4,
+ result_state_manager.GetNextPage(
+ page_result_info4.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info4.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info4.second.results.at(0).document(),
+ EqualsProto(document_protos4.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info5,
+ result_state_manager.GetNextPage(
+ page_result_info5.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info5.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info5.second.results.at(0).document(),
+ EqualsProto(document_protos5.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info6,
+ result_state_manager.GetNextPage(
+ page_result_info6.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info6.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info6.second.results.at(0).document(),
+ EqualsProto(document_protos6.at(1)));
}
-TEST(ResultStateManagerTest, ShouldGetDefaultSnippetContext) {
- ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/1);
- // 0 indicates no snippeting
- result_spec.mutable_snippet_spec()->set_num_to_snippet(0);
- result_spec.mutable_snippet_spec()->set_num_matches_per_property(0);
- result_spec.mutable_snippet_spec()->set_max_window_bytes(0);
+TEST_F(
+ ResultStateManagerTest,
+ InvalidatedResultStateShouldDecreaseCurrentHitsCountByExactStateHitCount) {
+ auto [scored_document_hits1, document_protos1] =
+ AddScoredDocuments({/*document_id=*/0, /*document_id=*/1});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/2, /*document_id=*/3});
+ auto [scored_document_hits3, document_protos3] =
+ AddScoredDocuments({/*document_id=*/4, /*document_id=*/5});
+
+ // Add the first three states. Remember, the first page for each result state
+ // won't be cached (since it is returned immediately from
+ // CacheAndRetrieveFirstPage). Each result state has a page size of 1 and a
+  // result set of 2 hits, so each result state will take up one hit of our
+  // three-hit budget.
+ ResultStateManager result_state_manager(/*max_total_hits=*/3,
+ document_store());
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(TermMatchType::EXACT_ONLY);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
- SectionRestrictQueryTermsMap query_terms_map;
- query_terms_map.emplace("term1", std::unordered_set<std::string>());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
- ResultState original_result_state = ResultState(
- /*scored_document_hits=*/{CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/2)},
- query_terms_map, search_spec, CreateScoringSpec(), result_spec);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+  // Invalidates state 2, so that the number of hits currently cached is
+  // decremented to 2.
+ result_state_manager.InvalidateResultState(page_result_info2.first);
+
+ // If invalidating state 2 correctly decremented the current hit count to 2,
+ // then adding state 4 should still be within our budget and no other result
+ // states should be evicted.
+ auto [scored_document_hits4, document_protos4] =
+ AddScoredDocuments({/*document_id=*/6, /*document_id=*/7});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info4,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits4), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ // If invalidating result state 2 correctly decremented the current hit count
+ // to 2 and adding state 4 correctly incremented it to 3, then adding this
+ // result state should trigger the eviction of state 1.
+ auto [scored_document_hits5, document_protos5] =
+ AddScoredDocuments({/*document_id=*/8, /*document_id=*/9});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info5,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits5), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info3,
+ result_state_manager.GetNextPage(
+ page_result_info3.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info4,
+ result_state_manager.GetNextPage(
+ page_result_info4.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info4.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info4.second.results.at(0).document(),
+ EqualsProto(document_protos4.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info5,
+ result_state_manager.GetNextPage(
+ page_result_info5.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info5.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info5.second.results.at(0).document(),
+ EqualsProto(document_protos5.at(1)));
+}
+
+TEST_F(ResultStateManagerTest, GetNextPageShouldDecreaseCurrentHitsCount) {
+ auto [scored_document_hits1, document_protos1] =
+ AddScoredDocuments({/*document_id=*/0, /*document_id=*/1});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/2, /*document_id=*/3});
+ auto [scored_document_hits3, document_protos3] =
+ AddScoredDocuments({/*document_id=*/4, /*document_id=*/5});
+
+ // Add the first three states. Remember, the first page for each result state
+ // won't be cached (since it is returned immediately from
+ // CacheAndRetrieveFirstPage). Each result state has a page size of 1 and a
+  // result set of 2 hits, so each result state will take up one hit of our
+  // three-hit budget.
+ ResultStateManager result_state_manager(/*max_total_hits=*/3,
+ document_store());
- ResultStateManager result_state_manager(
- /*max_hits_per_query=*/std::numeric_limits<int>::max(),
- /*max_result_states=*/std::numeric_limits<int>::max());
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state,
- result_state_manager.RankAndPaginate(std::move(original_result_state)));
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
- ASSERT_THAT(page_result_state.next_page_token, Gt(kInvalidNextPageToken));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
- EXPECT_THAT(page_result_state.snippet_context.query_terms, IsEmpty());
- EXPECT_THAT(
- page_result_state.snippet_context.snippet_spec,
- EqualsProto(ResultSpecProto::SnippetSpecProto::default_instance()));
- EXPECT_THAT(page_result_state.snippet_context.match_type,
- Eq(TermMatchType::UNKNOWN));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ // GetNextPage for result state 1 should return its result and decrement the
+ // number of cached hits to 2.
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info1,
+ result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info1.second.results.at(0).document(),
+ EqualsProto(document_protos1.at(1)));
+
+ // If retrieving the next page for result state 1 correctly decremented the
+ // current hit count to 2, then adding state 4 should still be within our
+ // budget and no other result states should be evicted.
+ auto [scored_document_hits4, document_protos4] =
+ AddScoredDocuments({/*document_id=*/6, /*document_id=*/7});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info4,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits4), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info2,
+ result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info2.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info2.second.results.at(0).document(),
+ EqualsProto(document_protos2.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info3,
+ result_state_manager.GetNextPage(
+ page_result_info3.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info4,
+ result_state_manager.GetNextPage(
+ page_result_info4.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info4.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info4.second.results.at(0).document(),
+ EqualsProto(document_protos4.at(1)));
}
-TEST(ResultStateManagerTest, ShouldGetCorrectNumPreviouslyReturned) {
- ResultState original_result_state =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/2),
- CreateScoredDocumentHit(/*document_id=*/3),
- CreateScoredDocumentHit(/*document_id=*/4),
- CreateScoredDocumentHit(/*document_id=*/5)},
- /*num_per_page=*/2);
+TEST_F(ResultStateManagerTest,
+ GetNextPageShouldDecreaseCurrentHitsCountByExactlyOnePage) {
+ auto [scored_document_hits1, document_protos1] =
+ AddScoredDocuments({/*document_id=*/0, /*document_id=*/1});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/2, /*document_id=*/3});
+ auto [scored_document_hits3, document_protos3] =
+ AddScoredDocuments({/*document_id=*/4, /*document_id=*/5});
+
+ // Add the first three states. Remember, the first page for each result state
+ // won't be cached (since it is returned immediately from
+ // CacheAndRetrieveFirstPage). Each result state has a page size of 1 and a
+  // result set of 2 hits, so each state takes up one hit of our three-hit
+  // budget.
+ ResultStateManager result_state_manager(/*max_total_hits=*/3,
+ document_store());
- ResultStateManager result_state_manager(
- /*max_hits_per_query=*/std::numeric_limits<int>::max(),
- /*max_result_states=*/std::numeric_limits<int>::max());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
- // First page, 2 results
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(original_result_state)));
- ASSERT_THAT(page_result_state1.scored_document_hits.size(), Eq(2));
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
- // No previously returned results
- EXPECT_THAT(page_result_state1.num_previously_returned, Eq(0));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ // GetNextPage for result state 1 should return its result and decrement the
+ // number of cached hits to 2.
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info1,
+ result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info1.second.results.at(0).document(),
+ EqualsProto(document_protos1.at(1)));
+
+ // If retrieving the next page for result state 1 correctly decremented the
+ // current hit count to 2, then adding state 4 should still be within our
+ // budget and no other result states should be evicted.
+ auto [scored_document_hits4, document_protos4] =
+ AddScoredDocuments({/*document_id=*/6, /*document_id=*/7});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info4,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits4), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ // If retrieving the next page for result state 1 correctly decremented the
+ // current hit count to 2 and adding state 4 correctly incremented it to 3,
+ // then adding this result state should trigger the eviction of state 2.
+ auto [scored_document_hits5, document_protos5] =
+ AddScoredDocuments({/*document_id=*/8, /*document_id=*/9});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info5,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits5), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
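+  // Result state 1 was already exhausted, and result state 2 should have been
+  // evicted to make room for state 5, so both now return NOT_FOUND.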
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- uint64_t next_page_token = page_result_state1.next_page_token;
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- // Second page, 2 results
- ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state2,
- result_state_manager.GetNextPage(next_page_token));
- ASSERT_THAT(page_result_state2.scored_document_hits.size(), Eq(2));
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info3,
+ result_state_manager.GetNextPage(
+ page_result_info3.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info4,
+ result_state_manager.GetNextPage(
+ page_result_info4.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info4.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info4.second.results.at(0).document(),
+ EqualsProto(document_protos4.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info5,
+ result_state_manager.GetNextPage(
+ page_result_info5.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info5.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info5.second.results.at(0).document(),
+ EqualsProto(document_protos5.at(1)));
+}
- // num_previously_returned = size of first page
- EXPECT_THAT(page_result_state2.num_previously_returned, Eq(2));
+TEST_F(ResultStateManagerTest,
+ AddingOverBudgetResultStateShouldEvictAllStates) {
+ auto [scored_document_hits1, document_protos1] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2});
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/3, /*document_id=*/4});
- // Third page, 1 result
- ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state3,
- result_state_manager.GetNextPage(next_page_token));
- ASSERT_THAT(page_result_state3.scored_document_hits.size(), Eq(1));
+ // Add the first two states. Remember, the first page for each result state
+ // won't be cached (since it is returned immediately from
+  // CacheAndRetrieveFirstPage). Each result state has a page size of 1, so 3
+  // hits (2 from state 1 and 1 from state 2) will remain cached.
+ ResultStateManager result_state_manager(/*max_total_hits=*/4,
+ document_store());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
- // num_previously_returned = size of first and second pages
- EXPECT_THAT(page_result_state3.num_previously_returned, Eq(4));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ // Add a result state that is larger than the entire budget. This should
+ // result in all previous result states being evicted, the first hit from
+  // result state 3 being returned, and the next four hits being cached (the
+  // last hit is dropped because it exceeds the budget).
+ auto [scored_document_hits3, document_protos3] = AddScoredDocuments(
+ {/*document_id=*/5, /*document_id=*/6, /*document_id=*/7,
+ /*document_id=*/8, /*document_id=*/9, /*document_id=*/10});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info3,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits3), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ EXPECT_THAT(page_result_info3.first, Not(Eq(kInvalidNextPageToken)));
+
+ // GetNextPage for result state 1 and 2 should return NOT_FOUND.
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- // No more results
- EXPECT_THAT(result_state_manager.GetNextPage(next_page_token),
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ // Only the next four results in state 3 should be retrievable.
+ uint64_t next_page_token3 = page_result_info3.first;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_info3,
+ result_state_manager.GetNextPage(next_page_token3, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ EXPECT_THAT(page_result_info3.first, Eq(next_page_token3));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(1)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_info3,
+ result_state_manager.GetNextPage(next_page_token3, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ EXPECT_THAT(page_result_info3.first, Eq(next_page_token3));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(2)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_info3,
+ result_state_manager.GetNextPage(next_page_token3, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ EXPECT_THAT(page_result_info3.first, Eq(next_page_token3));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(3)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ page_result_info3,
+ result_state_manager.GetNextPage(next_page_token3, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+  // The final document should have been dropped because it exceeded the
+  // budget, so the token returned with this last page of results should be
+  // kInvalidNextPageToken.
+ EXPECT_THAT(page_result_info3.first, Eq(kInvalidNextPageToken));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos3.at(4)));
+
+ // Double check that next_page_token3 is not retrievable anymore.
+ EXPECT_THAT(
+ result_state_manager.GetNextPage(next_page_token3, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST(ResultStateManagerTest, ShouldStoreMaxNumberOfScoredDocumentHits) {
- ResultState original_result_state =
- CreateResultState({CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/2),
- CreateScoredDocumentHit(/*document_id=*/3),
- CreateScoredDocumentHit(/*document_id=*/4),
- CreateScoredDocumentHit(/*document_id=*/5)},
- /*num_per_page=*/2);
+TEST_F(ResultStateManagerTest,
+ AddingResultStateShouldEvictOverBudgetResultState) {
+ // Add a result state that is larger than the entire budget. The entire result
+  // state will still be cached.
+ auto [scored_document_hits1, document_protos1] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2,
+ /*document_id=*/3, /*document_id=*/4, /*document_id=*/5});
- ResultStateManager result_state_manager(
- /*max_hits_per_query=*/3,
- /*max_result_states=*/std::numeric_limits<int>::max());
+ ResultStateManager result_state_manager(/*max_total_hits=*/4,
+ document_store());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+  // Add a second result state. Because state1 + state2 together exceed the
+  // budget, state1 should be evicted.
+ auto [scored_document_hits2, document_protos2] =
+ AddScoredDocuments({/*document_id=*/6, /*document_id=*/7});
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info2,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/1, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+
+ // state1 should have been evicted and state2 should still be retrievable.
+ EXPECT_THAT(result_state_manager.GetNextPage(
+ page_result_info1.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- // The 5 input scored document hits will be truncated to 3.
+ ICING_ASSERT_OK_AND_ASSIGN(page_result_info2,
+ result_state_manager.GetNextPage(
+ page_result_info2.first, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info2.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info2.second.results.at(0).document(),
+ EqualsProto(document_protos2.at(1)));
+}
+
+TEST_F(ResultStateManagerTest,
+       AddingResultStateShouldNotBeTruncatedAfterFirstPage) {
+  // Add a result state that is larger than the entire budget, but fits within
+  // the budget once the first page has been returned. The remaining hits will
+  // still be cached and not truncated.
+ auto [scored_document_hits, document_protos] = AddScoredDocuments(
+ {/*document_id=*/0, /*document_id=*/1, /*document_id=*/2,
+ /*document_id=*/3, /*document_id=*/4});
+
+ ResultStateManager result_state_manager(/*max_total_hits=*/4,
+ document_store());
+
+ // The 5 input scored document hits will not be truncated. The first page of
+ // two hits will be returned immediately and the other three hits will fit
+ // within our caching budget.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE),
+ document_store(), result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
// First page, 2 results
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(2));
+ EXPECT_THAT(page_result_info1.second.results.at(0).document(),
+ EqualsProto(document_protos.at(0)));
+ EXPECT_THAT(page_result_info1.second.results.at(1).document(),
+ EqualsProto(document_protos.at(1)));
+
+ uint64_t next_page_token = page_result_info1.first;
+
+ // Second page, 2 results.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info2,
+ result_state_manager.GetNextPage(next_page_token, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info2.second.results, SizeIs(2));
+ EXPECT_THAT(page_result_info2.second.results.at(0).document(),
+ EqualsProto(document_protos.at(2)));
+ EXPECT_THAT(page_result_info2.second.results.at(1).document(),
+ EqualsProto(document_protos.at(3)));
+
+ // Third page, 1 result.
ICING_ASSERT_OK_AND_ASSIGN(
- PageResultState page_result_state1,
- result_state_manager.RankAndPaginate(std::move(original_result_state)));
+ PageResultInfo page_result_info3,
+ result_state_manager.GetNextPage(next_page_token, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info3.second.results, SizeIs(1));
+ EXPECT_THAT(page_result_info3.second.results.at(0).document(),
+ EqualsProto(document_protos.at(4)));
+
+ // Fourth page, 0 results.
EXPECT_THAT(
- page_result_state1.scored_document_hits,
- ElementsAre(
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4))));
-
- uint64_t next_page_token = page_result_state1.next_page_token;
-
- // Second page, 1 results.
- ICING_ASSERT_OK_AND_ASSIGN(PageResultState page_result_state2,
- result_state_manager.GetNextPage(next_page_token));
- EXPECT_THAT(page_result_state2.scored_document_hits,
- ElementsAre(EqualsScoredDocumentHit(
- CreateScoredDocumentHit(/*document_id=*/3))));
-
- // No third page.
- EXPECT_THAT(result_state_manager.GetNextPage(next_page_token),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ result_state_manager.GetNextPage(next_page_token, result_retriever(),
+ clock()->GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
} // namespace
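The tests above all exercise the same invariant in ResultStateManager: every hit cached across all result states counts against max_total_hits, returning a page releases that page's hits, and caching a new state evicts the oldest states until the total fits again. Below is a minimal, self-contained sketch of that bookkeeping, not Icing's actual implementation: all names are illustrative, FIFO eviction is assumed, and truncation of an over-budget state's tail is omitted for brevity.

#include <algorithm>
#include <cstdint>
#include <deque>
#include <iostream>

struct ToyState {
  uint64_t token;
  int remaining_hits;  // hits still cached for this state
};

class ToyBudgetCache {
 public:
  explicit ToyBudgetCache(int max_total_hits)
      : max_total_hits_(max_total_hits) {}

  // Caches a state, evicting the oldest states until the budget is respected
  // again (the newly added state itself is never evicted here).
  void Add(ToyState state) {
    total_hits_ += state.remaining_hits;
    states_.push_back(state);
    while (total_hits_ > max_total_hits_ && states_.size() > 1) {
      total_hits_ -= states_.front().remaining_hits;
      states_.pop_front();  // the oldest state is evicted first
    }
  }

  // Consumes one page (up to page_size hits) for token and drops the state
  // once exhausted. Returns the number of hits consumed; 0 means NOT_FOUND.
  int GetNextPage(uint64_t token, int page_size) {
    for (auto it = states_.begin(); it != states_.end(); ++it) {
      if (it->token != token) continue;
      int consumed = std::min(page_size, it->remaining_hits);
      it->remaining_hits -= consumed;
      total_hits_ -= consumed;
      if (it->remaining_hits == 0) states_.erase(it);  // exhausted
      return consumed;
    }
    return 0;  // evicted, exhausted, or never cached
  }

  int total_hits() const { return total_hits_; }

 private:
  int max_total_hits_;
  int total_hits_ = 0;
  std::deque<ToyState> states_;  // front == oldest
};

int main() {
  // Mirrors GetNextPageShouldDecreaseCurrentHitsCount: a three-hit budget,
  // three one-hit states, one page retrieved, then a fourth state fits
  // without any eviction.
  ToyBudgetCache cache(/*max_total_hits=*/3);
  cache.Add({/*token=*/1, /*remaining_hits=*/1});
  cache.Add({/*token=*/2, /*remaining_hits=*/1});
  cache.Add({/*token=*/3, /*remaining_hits=*/1});
  cache.GetNextPage(/*token=*/1, /*page_size=*/1);  // frees one hit
  cache.Add({/*token=*/4, /*remaining_hits=*/1});   // fits: no eviction
  std::cout << cache.total_hits() << "\n";          // prints 3
}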
diff --git a/icing/result/result-state-manager_thread-safety_test.cc b/icing/result/result-state-manager_thread-safety_test.cc
new file mode 100644
index 0000000..7e7e13c
--- /dev/null
+++ b/icing/result/result-state-manager_thread-safety_test.cc
@@ -0,0 +1,458 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <optional>
+#include <thread> // NOLINT
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/result/page-result.h"
+#include "icing/result/result-retriever-v2.h"
+#include "icing/result/result-state-manager.h"
+#include "icing/schema/schema-store.h"
+#include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/transform/normalizer.h"
+#include "icing/util/clock.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::Eq;
+using ::testing::Ge;
+using ::testing::Not;
+using ::testing::SizeIs;
+using PageResultInfo = std::pair<uint64_t, PageResult>;
+
+ResultSpecProto CreateResultSpec(int num_per_page) {
+ ResultSpecProto result_spec;
+ result_spec.set_num_per_page(num_per_page);
+ return result_spec;
+}
+
+DocumentProto CreateDocument(int document_id) {
+ return DocumentBuilder()
+ .SetNamespace("namespace")
+ .SetUri(std::to_string(document_id))
+ .SetSchema("Document")
+ .SetCreationTimestampMs(1574365086666 + document_id)
+ .SetScore(document_id)
+ .Build();
+}
+
+class ResultStateManagerThreadSafetyTest : public testing::Test {
+ protected:
+ ResultStateManagerThreadSafetyTest()
+ : test_dir_(GetTestTempDir() + "/icing") {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ }
+
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ clock_ = std::make_unique<FakeClock>();
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, clock_.get()));
+ SchemaProto schema;
+ schema.add_types()->set_schema_type("Document");
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ std::move(schema), /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
+ /*max_term_byte_size=*/10000));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult result,
+ DocumentStore::Create(
+ &filesystem_, test_dir_, clock_.get(), schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ result_retriever_, ResultRetrieverV2::Create(
+ document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(), normalizer_.get()));
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ clock_.reset();
+ }
+
+ Filesystem filesystem_;
+ const std::string test_dir_;
+ std::unique_ptr<FakeClock> clock_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<DocumentStore> document_store_;
+ std::unique_ptr<ResultRetrieverV2> result_retriever_;
+};
+
+TEST_F(ResultStateManagerThreadSafetyTest,
+ RequestSameResultStateSimultaneously) {
+ // Create several threads to send GetNextPage requests with the same
+ // ResultState.
+ //
+  // This test verifies the per-instance lock on ResultState. Only one thread
+  // is allowed to access a ResultState at a time, so there should be no crash
+  // and the result documents within a single page should be contiguous (i.e.
+  // no interleaving).
+
+ // Prepare documents.
+ constexpr int kNumDocuments = 10000;
+ std::vector<ScoredDocumentHit> scored_document_hits;
+ for (int i = 0; i < kNumDocuments; ++i) {
+ // Put a document with id and score = i.
+ ICING_ASSERT_OK(document_store_->Put(CreateDocument(/*document_id=*/i)));
+ scored_document_hits.push_back(
+ ScoredDocumentHit(/*document_id=*/i, kSectionIdMaskNone, /*score=*/i));
+ }
+
+ constexpr int kNumPerPage = 100;
+ ResultStateManager result_state_manager(/*max_total_hits=*/kNumDocuments,
+ *document_store_);
+
+ // Retrieve the first page.
+ // Documents are ordered by score *ascending*, so the first page should
+ // contain documents with scores [0, 1, 2, ..., kNumPerPage - 1].
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(kNumPerPage), *document_store_, *result_retriever_,
+ clock_->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(kNumPerPage));
+ for (int i = 0; i < kNumPerPage; ++i) {
+ ASSERT_THAT(page_result_info1.second.results[i].score(), Eq(i));
+ }
+
+ uint64_t next_page_token = page_result_info1.first;
+ ASSERT_THAT(next_page_token, Not(Eq(kInvalidNextPageToken)));
+
+ // Create kNumThreads threads to call GetNextPage() with the same token at the
+ // same time. Each thread should get a valid result.
+  // Use page_results to store each thread's result.
+ constexpr int kNumThreads = 50;
+ std::vector<std::optional<PageResultInfo>> page_results(kNumThreads);
+ auto callable = [&](int thread_id) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(),
+ normalizer_.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info,
+ result_state_manager.GetNextPage(next_page_token, *result_retriever,
+ clock_->GetSystemTimeMilliseconds()));
+ page_results[thread_id] =
+ std::make_optional<PageResultInfo>(std::move(page_result_info));
+ };
+
+ // Spawn threads for GetNextPage().
+ std::vector<std::thread> thread_objs;
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs.emplace_back(callable, /*thread_id=*/i);
+ }
+
+ // Join threads.
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs[i].join();
+ EXPECT_THAT(page_results[i], Not(Eq(std::nullopt)));
+ EXPECT_THAT(page_results[i]->second.results, SizeIs(kNumPerPage));
+ }
+
+  // Since ResultState has a per-instance lock, only one thread is allowed to
+  // access it at a time. Therefore, every thread should get consecutive scores
+  // instead of interleaved scores, regardless of the execution order. In other
+  // words, within a particular page the scores of all results should be
+  // ordered as [N, N+1, N+2, N+3, ...], where N depends on the execution
+  // order. Also there should be no crash.
+ std::vector<int> first_doc_scores;
+ for (const auto& page_result_info : page_results) {
+ first_doc_scores.push_back(page_result_info->second.results[0].score());
+ for (int i = 1; i < kNumPerPage; ++i) {
+ EXPECT_THAT(page_result_info->second.results[i].score(),
+ Eq(page_result_info->second.results[i - 1].score() + 1));
+ }
+ }
+
+  // Verify that the first doc score of every page result is correct: they
+  // should be kNumPerPage * 1, kNumPerPage * 2, ..., etc.
+ // Note: the first score of the first page retrieved via GetNextPage should be
+ // kNumPerPage because the *actual* first page with first score = 0 was
+ // retrieved during CacheAndRetrieveFirstPage.
+ std::sort(first_doc_scores.begin(), first_doc_scores.end());
+ for (int i = 0; i < kNumThreads; ++i) {
+ EXPECT_THAT(first_doc_scores[i], Eq(kNumPerPage * (i + 1)));
+ }
+}
+
+TEST_F(ResultStateManagerThreadSafetyTest, InvalidateResultStateWhileUsing) {
+ // Create several threads to send GetNextPage requests with the same
+ // ResultState and another single thread to invalidate this ResultState.
+ //
+ // This test verifies the usage of std::shared_ptr. Even after invalidating
+ // the original copy of std::shared_ptr in the cache, the ResultState instance
+ // should be still valid and no crash should occur in threads that are still
+ // holding a copy of std::shared_ptr pointing to the same ResultState
+ // instance.
+
+ // Prepare documents.
+ constexpr int kNumDocuments = 10000;
+ std::vector<ScoredDocumentHit> scored_document_hits;
+ for (int i = 0; i < kNumDocuments; ++i) {
+ // Put a document with id and score = i.
+ ICING_ASSERT_OK(document_store_->Put(CreateDocument(/*document_id=*/i)));
+ scored_document_hits.push_back(
+ ScoredDocumentHit(/*document_id=*/i, kSectionIdMaskNone, /*score=*/i));
+ }
+
+ constexpr int kNumPerPage = 100;
+ ResultStateManager result_state_manager(/*max_total_hits=*/kNumDocuments,
+ *document_store_);
+
+ // Retrieve the first page.
+ // Documents are ordered by score *ascending*, so the first page should
+ // contain documents with scores [0, 1, 2, ..., kNumPerPage - 1].
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits), /*is_descending=*/false),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(kNumPerPage), *document_store_, *result_retriever_,
+ clock_->GetSystemTimeMilliseconds()));
+ ASSERT_THAT(page_result_info1.second.results, SizeIs(kNumPerPage));
+ for (int i = 0; i < kNumPerPage; ++i) {
+ ASSERT_THAT(page_result_info1.second.results[i].score(), Eq(i));
+ }
+
+ uint64_t next_page_token = page_result_info1.first;
+ ASSERT_THAT(next_page_token, Not(Eq(kInvalidNextPageToken)));
+
+ // Create kNumThreads threads to call GetNextPage() with the same token at the
+  // same time. The ResultState might have been invalidated by then, so it is
+  // normal to get a NOT_FOUND error.
+  // Use page_results to store each thread's result.
+ constexpr int kNumThreads = 50;
+ std::vector<std::optional<PageResultInfo>> page_results(kNumThreads);
+ auto callable = [&](int thread_id) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(),
+ normalizer_.get()));
+
+ libtextclassifier3::StatusOr<PageResultInfo> page_result_info_or =
+ result_state_manager.GetNextPage(next_page_token, *result_retriever,
+ clock_->GetSystemTimeMilliseconds());
+ if (page_result_info_or.ok()) {
+ page_results[thread_id] = std::make_optional<PageResultInfo>(
+ std::move(page_result_info_or).ValueOrDie());
+ } else {
+ EXPECT_THAT(page_result_info_or,
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ }
+ };
+
+ // Spawn threads for GetNextPage().
+ std::vector<std::thread> thread_objs;
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs.emplace_back(callable, /*thread_id=*/i);
+ }
+
+ // Spawn another single thread to invalidate the ResultState.
+ std::thread invalidating_thread([&]() -> void {
+ result_state_manager.InvalidateResultState(next_page_token);
+ });
+
+ // Join threads.
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs[i].join();
+ if (page_results[i] != std::nullopt) {
+ EXPECT_THAT(page_results[i]->second.results, SizeIs(kNumPerPage));
+ }
+ }
+ invalidating_thread.join();
+
+ // Threads fetching ResultState before invalidation will get normal results,
+  // while the others will get a NOT_FOUND error.
+ std::vector<int> first_doc_scores;
+ for (const auto& page_result_info : page_results) {
+ if (page_result_info == std::nullopt) {
+ continue;
+ }
+
+ first_doc_scores.push_back(page_result_info->second.results[0].score());
+ for (int i = 1; i < kNumPerPage; ++i) {
+ EXPECT_THAT(page_result_info->second.results[i].score(),
+ Eq(page_result_info->second.results[i - 1].score() + 1));
+ }
+ }
+
+  // Verify that the first doc score of every page result is correct: they
+  // should be kNumPerPage * 1, kNumPerPage * 2, ..., etc.
+ std::sort(first_doc_scores.begin(), first_doc_scores.end());
+ for (int i = 0; i < first_doc_scores.size(); ++i) {
+ EXPECT_THAT(first_doc_scores[i], Eq(kNumPerPage * (i + 1)));
+ }
+
+  // Verify that num_total_hits has been decremented correctly.
+ EXPECT_THAT(result_state_manager.num_total_hits(), Eq(0));
+}
+
+TEST_F(ResultStateManagerThreadSafetyTest, MultipleResultStates) {
+ // Create several threads to send GetNextPage requests with different
+ // ResultStates.
+ //
+  // This test verifies that each ResultState works independently and
+  // correctly in its own thread. It also verifies that there is no race
+  // condition on num_total_hits, which is incremented/decremented by multiple
+  // threads.
+
+ // Prepare documents.
+ constexpr int kNumDocuments = 2000;
+ std::vector<ScoredDocumentHit> scored_document_hits;
+ for (int i = 0; i < kNumDocuments; ++i) {
+ // Put a document with id and score = i.
+ ICING_ASSERT_OK(document_store_->Put(CreateDocument(/*document_id=*/i)));
+ scored_document_hits.push_back(
+ ScoredDocumentHit(/*document_id=*/i, kSectionIdMaskNone, /*score=*/i));
+ }
+
+ constexpr int kNumThreads = 50;
+ constexpr int kNumPerPage = 30;
+ ResultStateManager result_state_manager(
+ /*max_total_hits=*/kNumDocuments * kNumThreads, *document_store_);
+
+ // Create kNumThreads threads to:
+ // - Call CacheAndRetrieveFirstPage() once to create its own ResultState.
+ // - Call GetNextPage() on its own ResultState for thread_id times.
+ //
+ // Each thread will get (thread_id + 1) pages, i.e. kNumPerPage *
+ // (thread_id + 1) docs.
+ ASSERT_THAT(kNumDocuments, Ge(kNumPerPage * kNumThreads));
+ auto callable = [&](int thread_id) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ResultRetrieverV2> result_retriever,
+ ResultRetrieverV2::Create(document_store_.get(), schema_store_.get(),
+ language_segmenter_.get(),
+ normalizer_.get()));
+
+ // Retrieve the first page.
+ // Documents are ordered by score *ascending*, so the first page should
+ // contain documents with scores [0, 1, 2, ..., kNumPerPage - 1].
+ std::vector<ScoredDocumentHit> scored_document_hits_copy(
+ scored_document_hits);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ PageResultInfo page_result_info1,
+ result_state_manager.CacheAndRetrieveFirstPage(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits_copy), /*is_descending=*/false),
+ /*parent_adjustment_info=*/nullptr,
+ /*child_adjustment_info=*/nullptr, CreateResultSpec(kNumPerPage),
+ *document_store_, *result_retriever,
+ clock_->GetSystemTimeMilliseconds()));
+ EXPECT_THAT(page_result_info1.second.results, SizeIs(kNumPerPage));
+ for (int i = 0; i < kNumPerPage; ++i) {
+ EXPECT_THAT(page_result_info1.second.results[i].score(), Eq(i));
+ }
+
+ uint64_t next_page_token = page_result_info1.first;
+ ASSERT_THAT(next_page_token, Not(Eq(kInvalidNextPageToken)));
+
+    // Retrieve some of the subsequent pages. We use thread_id as the number
+    // of subsequent pages to retrieve (i.e. how many times GetNextPage is
+    // called) in each thread, in order to:
+ // - Vary the number of pages that we're retrieving in each thread.
+ // - Still make the total number of hits remaining (num_total_hits) a
+ // predictable number.
+ // Then, including the first page (retrieved by CacheAndRetrieveFirstPage),
+ // each thread should retrieve 1, 2, 3, ..., kNumThreads pages.
+ int num_subsequent_pages_to_retrieve = thread_id;
+ for (int i = 0; i < num_subsequent_pages_to_retrieve; ++i) {
+ ICING_ASSERT_OK_AND_ASSIGN(PageResultInfo page_result_info,
+ result_state_manager.GetNextPage(
+ next_page_token, *result_retriever,
+ clock_->GetSystemTimeMilliseconds()));
+ EXPECT_THAT(page_result_info.second.results, SizeIs(kNumPerPage));
+ for (int j = 0; j < kNumPerPage; ++j) {
+ EXPECT_THAT(page_result_info.second.results[j].score(),
+ Eq(kNumPerPage * (i + 1) + j));
+ }
+ }
+ };
+
+ // Spawn threads.
+ std::vector<std::thread> thread_objs;
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs.emplace_back(callable, /*thread_id=*/i);
+ }
+
+ // Join threads.
+ for (int i = 0; i < kNumThreads; ++i) {
+ thread_objs[i].join();
+ }
+
+  // kNumThreads * kNumDocuments ScoredDocumentHits are created at the
+  // beginning, and kNumPerPage * (1 + 2 + ... + kNumThreads) docs are returned
+  // by retrieval, since the threads retrieve 1, 2, 3, ..., kNumThreads pages
+  // respectively. All retrieved ScoredDocumentHits should be removed from the
+  // cache and num_total_hits should be decremented accordingly: with the
+  // constants above, 50 * 2000 - 30 * (50 * 51 / 2) = 100000 - 38250 = 61750
+  // hits should remain.
+ int expected_remaining_hits =
+ kNumThreads * kNumDocuments -
+ kNumPerPage * (kNumThreads * (kNumThreads + 1) / 2);
+ EXPECT_THAT(result_state_manager.num_total_hits(),
+ Eq(expected_remaining_hits));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
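The InvalidateResultStateWhileUsing test above leans on a standard std::shared_ptr property: erasing the cache's copy of a shared_ptr only drops one reference, so a thread that already took its own copy can keep using the object safely. A minimal sketch of that pattern with toy types (this is not the actual ResultStateManager cache):

#include <cstdint>
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <unordered_map>

int main() {
  std::unordered_map<uint64_t, std::shared_ptr<int>> cache;
  std::mutex mutex;
  cache[42] = std::make_shared<int>(7);

  std::thread reader([&] {
    std::shared_ptr<int> mine;
    {
      std::lock_guard<std::mutex> lock(mutex);
      auto it = cache.find(42);
      if (it != cache.end()) mine = it->second;  // take our own copy
    }
    // Even if the cache entry is erased at this point, *mine stays valid
    // because our copy keeps the reference count above zero.
    if (mine != nullptr) std::cout << *mine << "\n";
  });

  std::thread invalidator([&] {
    std::lock_guard<std::mutex> lock(mutex);
    cache.erase(42);  // drops only the cache's reference
  });

  reader.join();
  invalidator.join();
}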
diff --git a/icing/result/result-state-v2.cc b/icing/result/result-state-v2.cc
new file mode 100644
index 0000000..3aa9359
--- /dev/null
+++ b/icing/result/result-state-v2.cc
@@ -0,0 +1,84 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/result-state-v2.h"
+
+#include <atomic>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "icing/proto/search.pb.h"
+#include "icing/result/result-adjustment-info.h"
+#include "icing/scoring/scored-document-hits-ranker.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+ResultStateV2::ResultStateV2(
+ std::unique_ptr<ScoredDocumentHitsRanker> scored_document_hits_ranker_in,
+ std::unique_ptr<ResultAdjustmentInfo> parent_adjustment_info,
+ std::unique_ptr<ResultAdjustmentInfo> child_adjustment_info,
+ const ResultSpecProto& result_spec, const DocumentStore& document_store)
+ : scored_document_hits_ranker(std::move(scored_document_hits_ranker_in)),
+ num_returned(0),
+ parent_adjustment_info_(std::move(parent_adjustment_info)),
+ child_adjustment_info_(std::move(child_adjustment_info)),
+ num_per_page_(result_spec.num_per_page()),
+ num_total_bytes_per_page_threshold_(
+ result_spec.num_total_bytes_per_page_threshold()),
+ max_joined_children_per_parent_to_return_(
+ result_spec.max_joined_children_per_parent_to_return()),
+ num_total_hits_(nullptr),
+ result_group_type_(result_spec.result_group_type()) {
+ for (const ResultSpecProto::ResultGrouping& result_grouping :
+ result_spec.result_groupings()) {
+ int group_id = group_result_limits.size();
+ group_result_limits.push_back(result_grouping.max_results());
+ for (const ResultSpecProto::ResultGrouping::Entry& entry :
+ result_grouping.entry_groupings()) {
+ const std::string& name_space = entry.namespace_();
+ const std::string& schema = entry.schema();
+ auto entry_id_or = document_store.GetResultGroupingEntryId(
+ result_group_type_, name_space, schema);
+ if (!entry_id_or.ok()) {
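+        // Skip entries whose namespace/schema pair has no result grouping
+        // entry id, e.g. namespaces that don't exist in the document store.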
+ continue;
+ }
+ int32_t entry_id = entry_id_or.ValueOrDie();
+ entry_id_group_id_map_.insert({entry_id, group_id});
+ }
+ }
+}
+
+ResultStateV2::~ResultStateV2() {
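+  // Give this state's remaining hits back to the global counter (a no-op if
+  // RegisterNumTotalHits was never called).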
+ IncrementNumTotalHits(-1 * scored_document_hits_ranker->size());
+}
+
+void ResultStateV2::RegisterNumTotalHits(std::atomic<int>* num_total_hits) {
+ // Decrement the original num_total_hits_ before registering a new one.
+ IncrementNumTotalHits(-1 * scored_document_hits_ranker->size());
+ num_total_hits_ = num_total_hits;
+ IncrementNumTotalHits(scored_document_hits_ranker->size());
+}
+
+void ResultStateV2::IncrementNumTotalHits(int increment_by) {
+ if (num_total_hits_ != nullptr) {
+ *num_total_hits_ += increment_by;
+ }
+}
+
+} // namespace lib
+} // namespace icing
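RegisterNumTotalHits, IncrementNumTotalHits, and the destructor above form a small accounting contract: a state adds its ranker's size to a shared atomic counter on registration and gives it back on re-registration or destruction. A reduced sketch of the same RAII pattern with illustrative names (not Icing code):

#include <atomic>
#include <iostream>

class Counted {
 public:
  // Adds this object's size to the shared counter on construction.
  Counted(int size, std::atomic<int>* total) : size_(size), total_(total) {
    if (total_ != nullptr) *total_ += size_;
  }

  // Gives the size back on destruction, mirroring the constructor.
  ~Counted() {
    if (total_ != nullptr) *total_ -= size_;
  }

 private:
  int size_;
  std::atomic<int>* total_;  // not owned
};

int main() {
  std::atomic<int> total(0);
  {
    Counted a(/*size=*/5, &total);
    Counted b(/*size=*/2, &total);
    std::cout << total.load() << "\n";  // prints 7
  }
  std::cout << total.load() << "\n";  // prints 0 after both destructors run
}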
diff --git a/icing/result/result-state-v2.h b/icing/result/result-state-v2.h
new file mode 100644
index 0000000..919710e
--- /dev/null
+++ b/icing/result/result-state-v2.h
@@ -0,0 +1,175 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_RESULT_RESULT_STATE_V2_H_
+#define ICING_RESULT_RESULT_STATE_V2_H_
+
+#include <atomic>
+#include <cstdint>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "icing/absl_ports/mutex.h"
+#include "icing/absl_ports/thread_annotations.h"
+#include "icing/proto/search.pb.h"
+#include "icing/result/result-adjustment-info.h"
+#include "icing/scoring/scored-document-hits-ranker.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+// Used to hold information needed across multiple pagination requests of the
+// same query. Stored in ResultStateManager.
+class ResultStateV2 {
+ public:
+ explicit ResultStateV2(
+ std::unique_ptr<ScoredDocumentHitsRanker> scored_document_hits_ranker_in,
+ std::unique_ptr<ResultAdjustmentInfo> parent_adjustment_info,
+ std::unique_ptr<ResultAdjustmentInfo> child_adjustment_info,
+ const ResultSpecProto& result_spec, const DocumentStore& document_store);
+
+ ~ResultStateV2();
+
+  // Registers num_total_hits and adds the current
+  // scored_document_hits_ranker.size() to it. When re-registering, the size is
+  // first subtracted from the previously registered counter.
+ void RegisterNumTotalHits(std::atomic<int>* num_total_hits)
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex);
+
+ // Increment the global counter num_total_hits_ by increment_by, if
+ // num_total_hits_ has been registered (is not nullptr).
+ // Note that providing a negative value for increment_by is a valid usage,
+ // which will actually decrement num_total_hits_.
+ //
+  // It must be called whenever scored_document_hits_ranker changes.
+ void IncrementNumTotalHits(int increment_by)
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex);
+
+ // Returns a nullable pointer to parent adjustment info.
+ ResultAdjustmentInfo* parent_adjustment_info()
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex) {
+ return parent_adjustment_info_.get();
+ }
+
+ // Returns a nullable pointer to parent adjustment info.
+ const ResultAdjustmentInfo* parent_adjustment_info() const
+ ICING_SHARED_LOCKS_REQUIRED(mutex) {
+ return parent_adjustment_info_.get();
+ }
+
+ // Returns a nullable pointer to child adjustment info.
+ ResultAdjustmentInfo* child_adjustment_info()
+ ICING_EXCLUSIVE_LOCKS_REQUIRED(mutex) {
+ return child_adjustment_info_.get();
+ }
+
+ // Returns a nullable pointer to child adjustment info.
+ const ResultAdjustmentInfo* child_adjustment_info() const
+ ICING_SHARED_LOCKS_REQUIRED(mutex) {
+ return child_adjustment_info_.get();
+ }
+
+ const std::unordered_map<int32_t, int>& entry_id_group_id_map() const
+ ICING_SHARED_LOCKS_REQUIRED(mutex) {
+ return entry_id_group_id_map_;
+ }
+
+ int32_t num_per_page() const ICING_SHARED_LOCKS_REQUIRED(mutex) {
+ return num_per_page_;
+ }
+
+ int32_t num_total_bytes_per_page_threshold() const
+ ICING_SHARED_LOCKS_REQUIRED(mutex) {
+ return num_total_bytes_per_page_threshold_;
+ }
+
+ int32_t max_joined_children_per_parent_to_return() const
+ ICING_SHARED_LOCKS_REQUIRED(mutex) {
+ return max_joined_children_per_parent_to_return_;
+ }
+
+ ResultSpecProto::ResultGroupingType result_group_type()
+ ICING_SHARED_LOCKS_REQUIRED(mutex) {
+ return result_group_type_;
+ }
+
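+  // Guards all members below. Exposed publicly so that callers can hold the
+  // lock across multiple accessor calls, as the annotations below require.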
+ absl_ports::shared_mutex mutex;
+
+  // When evaluating the next top K hits from scored_document_hits_ranker, some
+  // of them may be filtered out by group_result_limits and won't be returned
+  // to the client, so they shouldn't be counted in num_returned. Also, the
+  // logic of group result limiting depends on retrieval, so it is impossible
+  // for ResultState itself to correctly modify these fields. Thus, we make
+  // them public so that users of this class can modify them directly.
+
+ // The scored document hits ranker.
+ std::unique_ptr<ScoredDocumentHitsRanker> scored_document_hits_ranker
+ ICING_GUARDED_BY(mutex);
+
+ // The count of remaining results to return for a group where group id is the
+ // index.
+ std::vector<int> group_result_limits ICING_GUARDED_BY(mutex);
+
+ // Number of results that have already been returned.
+ int num_returned ICING_GUARDED_BY(mutex);
+
+ private:
+ // Adjustment information for parent documents, including snippet and
+ // projection. Can be nullptr if there is no adjustment info for parent
+ // documents.
+ std::unique_ptr<ResultAdjustmentInfo> parent_adjustment_info_
+ ICING_GUARDED_BY(mutex);
+
+ // Adjustment information for child documents, including snippet and
+ // projection. This is only used for join query. Can be nullptr if there is no
+ // adjustment info for child documents.
+ std::unique_ptr<ResultAdjustmentInfo> child_adjustment_info_
+ ICING_GUARDED_BY(mutex);
+
+ // A map between result grouping entry id and the id of the group that it
+ // appears in.
+ std::unordered_map<int32_t, int> entry_id_group_id_map_
+ ICING_GUARDED_BY(mutex);
+
+ // Number of results to return in each page.
+ int32_t num_per_page_ ICING_GUARDED_BY(mutex);
+
+  // The cutoff threshold for the total bytes of all documents in a single
+  // page, used to limit the number of bytes returned per page. Note that it
+  // doesn't guarantee the resulting number of bytes will be smaller than,
+  // equal to, or larger than the threshold; it is just a cutoff point and only
+  // guarantees that the total bytes of search results won't exceed the
+  // threshold by too much.
+ int32_t num_total_bytes_per_page_threshold_ ICING_GUARDED_BY(mutex);
+
+  // Max # of joined child documents to attach to each parent document in the
+  // result.
+ int32_t max_joined_children_per_parent_to_return_ ICING_GUARDED_BY(mutex);
+
+ // Pointer to a global counter to sum up the size of scored_document_hits in
+ // all ResultStates.
+ // Does not own.
+ std::atomic<int>* num_total_hits_ ICING_GUARDED_BY(mutex);
+
+  // The value that the search results are grouped by.
+ ResultSpecProto::ResultGroupingType result_group_type_
+ ICING_GUARDED_BY(mutex);
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_RESULT_RESULT_STATE_V2_H_
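A small usage sketch of the locking convention this header encodes: callers take ResultStateV2's public mutex themselves, shared for const accessors and exclusive for mutation. Toy standard-library types stand in below; the ICING_* macros are assumed to wrap the corresponding Clang thread-safety attributes.

#include <iostream>
#include <shared_mutex>

struct ToyResultState {
  mutable std::shared_mutex mutex;  // public, like ResultStateV2::mutex
  int num_returned = 0;             // conceptually guarded by mutex
};

// A const accessor only needs a shared (read) lock.
int ReadNumReturned(const ToyResultState& state) {
  std::shared_lock<std::shared_mutex> lock(state.mutex);
  return state.num_returned;
}

// Mutation requires an exclusive (write) lock.
void RecordReturned(ToyResultState& state, int n) {
  std::unique_lock<std::shared_mutex> lock(state.mutex);
  state.num_returned += n;
}

int main() {
  ToyResultState state;
  RecordReturned(state, 5);
  std::cout << ReadNumReturned(state) << "\n";  // prints 5
}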
diff --git a/icing/result/result-state-v2_test.cc b/icing/result/result-state-v2_test.cc
new file mode 100644
index 0000000..0f88023
--- /dev/null
+++ b/icing/result/result-state-v2_test.cc
@@ -0,0 +1,409 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/result/result-state-v2.h"
+
+#include <atomic>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/mutex.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/clock.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Pair;
+using ::testing::UnorderedElementsAre;
+
+ResultSpecProto CreateResultSpec(
+ int num_per_page, ResultSpecProto::ResultGroupingType result_group_type) {
+ ResultSpecProto result_spec;
+ result_spec.set_result_group_type(result_group_type);
+ result_spec.set_num_per_page(num_per_page);
+ return result_spec;
+}
+
+class ResultStateV2Test : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ schema_store_base_dir_ = GetTestTempDir() + "/schema_store";
+ filesystem_.CreateDirectoryRecursively(schema_store_base_dir_.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_base_dir_, &clock_));
+ SchemaProto schema;
+ schema.add_types()->set_schema_type("Document");
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ std::move(schema), /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ doc_store_base_dir_ = GetTestTempDir() + "/document_store";
+ filesystem_.CreateDirectoryRecursively(doc_store_base_dir_.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult result,
+ DocumentStore::Create(
+ &filesystem_, doc_store_base_dir_, &clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(result.document_store);
+
+ num_total_hits_ = 0;
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(doc_store_base_dir_.c_str());
+ filesystem_.DeleteDirectoryRecursively(schema_store_base_dir_.c_str());
+ }
+
+ ScoredDocumentHit AddScoredDocument(DocumentId document_id) {
+ DocumentProto document;
+ document.set_namespace_("namespace");
+ document.set_uri(std::to_string(document_id));
+ document.set_schema("Document");
+ document_store_->Put(std::move(document));
+ return ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1);
+ }
+
+ DocumentStore& document_store() { return *document_store_; }
+
+ std::atomic<int>& num_total_hits() { return num_total_hits_; }
+
+ const std::atomic<int>& num_total_hits() const { return num_total_hits_; }
+
+ private:
+ Filesystem filesystem_;
+ std::string doc_store_base_dir_;
+ std::string schema_store_base_dir_;
+ Clock clock_;
+ std::unique_ptr<DocumentStore> document_store_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::atomic<int> num_total_hits_;
+};
+
+TEST_F(ResultStateV2Test, ShouldInitializeValuesAccordingToSpecs) {
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/2, ResultSpecProto::NAMESPACE);
+ result_spec.set_num_total_bytes_per_page_threshold(4096);
+ result_spec.set_max_joined_children_per_parent_to_return(2048);
+
+ // Adjustment info is not important in this test.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::vector<ScoredDocumentHit>(), /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, document_store());
+
+ absl_ports::shared_lock l(&result_state.mutex);
+
+ EXPECT_THAT(result_state.num_returned, Eq(0));
+ EXPECT_THAT(result_state.num_per_page(), Eq(result_spec.num_per_page()));
+ EXPECT_THAT(result_state.num_total_bytes_per_page_threshold(),
+ Eq(result_spec.num_total_bytes_per_page_threshold()));
+ EXPECT_THAT(result_state.max_joined_children_per_parent_to_return(),
+ Eq(result_spec.max_joined_children_per_parent_to_return()));
+}
+
+TEST_F(ResultStateV2Test, ShouldInitializeValuesAccordingToDefaultSpecs) {
+ ResultSpecProto default_result_spec = ResultSpecProto::default_instance();
+ ASSERT_THAT(default_result_spec.num_per_page(), Eq(10));
+ ASSERT_THAT(default_result_spec.num_total_bytes_per_page_threshold(),
+ Eq(std::numeric_limits<int32_t>::max()));
+
+ // Adjustment info is not important in this test.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::vector<ScoredDocumentHit>(),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ default_result_spec, document_store());
+
+ absl_ports::shared_lock l(&result_state.mutex);
+
+ EXPECT_THAT(result_state.num_returned, Eq(0));
+ EXPECT_THAT(result_state.num_per_page(),
+ Eq(default_result_spec.num_per_page()));
+ EXPECT_THAT(result_state.num_total_bytes_per_page_threshold(),
+ Eq(default_result_spec.num_total_bytes_per_page_threshold()));
+ EXPECT_THAT(
+ result_state.max_joined_children_per_parent_to_return(),
+ Eq(default_result_spec.max_joined_children_per_parent_to_return()));
+}
+
+TEST_F(ResultStateV2Test,
+ ShouldConstructNamespaceGroupIdMapAndGroupResultLimitsAccordingToSpecs) {
+ // Create 3 docs under namespace1, namespace2, namespace3.
+ DocumentProto document1;
+ document1.set_namespace_("namespace1");
+ document1.set_uri("uri/1");
+ document1.set_schema("Document");
+ ICING_ASSERT_OK(document_store().Put(std::move(document1)));
+
+ DocumentProto document2;
+ document2.set_namespace_("namespace2");
+ document2.set_uri("uri/2");
+ document2.set_schema("Document");
+ ICING_ASSERT_OK(document_store().Put(std::move(document2)));
+
+ DocumentProto document3;
+ document3.set_namespace_("namespace3");
+ document3.set_uri("uri/3");
+ document3.set_schema("Document");
+ ICING_ASSERT_OK(document_store().Put(std::move(document3)));
+
+ // Create a ResultSpec that limits "namespace1" to 3 results and limits
+ // "namespace2"+"namespace3" to a total of 2 results. Also add
+ // "nonexistentNamespace1" and "nonexistentNamespace2" to test the behavior.
+ ResultSpecProto::ResultGroupingType result_grouping_type =
+ ResultSpecProto::NAMESPACE;
+ ResultSpecProto result_spec =
+ CreateResultSpec(/*num_per_page=*/5, result_grouping_type);
+ ResultSpecProto::ResultGrouping* result_grouping =
+ result_spec.add_result_groupings();
+ ResultSpecProto::ResultGrouping::Entry* entry =
+ result_grouping->add_entry_groupings();
+ result_grouping->set_max_results(3);
+ entry->set_namespace_("namespace1");
+ result_grouping = result_spec.add_result_groupings();
+ result_grouping->set_max_results(5);
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("nonexistentNamespace2");
+ result_grouping = result_spec.add_result_groupings();
+ result_grouping->set_max_results(2);
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace2");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("namespace3");
+ entry = result_grouping->add_entry_groupings();
+ entry->set_namespace_("nonexistentNamespace1");
+
+ // Get entry ids.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ int32_t entry_id1, document_store().GetResultGroupingEntryId(
+ result_grouping_type, "namespace1", "Document"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ int32_t entry_id2, document_store().GetResultGroupingEntryId(
+ result_grouping_type, "namespace2", "Document"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ int32_t entry_id3, document_store().GetResultGroupingEntryId(
+ result_grouping_type, "namespace3", "Document"));
+
+ // Adjustment info is not important in this test.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::vector<ScoredDocumentHit>(),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ result_spec, document_store());
+
+ absl_ports::shared_lock l(&result_state.mutex);
+
+ // "namespace1" should be in group 0, and "namespace2"+"namespace3" should be
+ // in group 2.
+ // "nonexistentNamespace1" and "nonexistentNamespace2" shouldn't exist.
+ EXPECT_THAT(result_state.entry_id_group_id_map(),
+ UnorderedElementsAre(Pair(entry_id1, 0), Pair(entry_id2, 2),
+ Pair(entry_id3, 2)));
+
+ // group_result_limits should contain 3 (at index 0 for group 0), 5 (at index
+ // 1 for group 1), 2 (at index 2 for group 2), even though there is no valid
+ // namespace in group 1.
+ EXPECT_THAT(result_state.group_result_limits, ElementsAre(3, 5, 2));
+}
+
+TEST_F(ResultStateV2Test, ShouldUpdateNumTotalHits) {
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ AddScoredDocument(/*document_id=*/1),
+ AddScoredDocument(/*document_id=*/0),
+ AddScoredDocument(/*document_id=*/2),
+ AddScoredDocument(/*document_id=*/4),
+ AddScoredDocument(/*document_id=*/3)};
+
+ // Adjustment info is not important in this test.
+ // Creates a ResultState with 5 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::NAMESPACE),
+ document_store());
+
+ absl_ports::unique_lock l(&result_state.mutex);
+
+ EXPECT_THAT(num_total_hits(), Eq(0));
+ result_state.RegisterNumTotalHits(&num_total_hits());
+ EXPECT_THAT(num_total_hits(), Eq(5));
+ result_state.IncrementNumTotalHits(500);
+ EXPECT_THAT(num_total_hits(), Eq(505));
+}
+
+TEST_F(ResultStateV2Test, ShouldUpdateNumTotalHitsWhenDestructed) {
+ std::vector<ScoredDocumentHit> scored_document_hits1 = {
+ AddScoredDocument(/*document_id=*/1),
+ AddScoredDocument(/*document_id=*/0),
+ AddScoredDocument(/*document_id=*/2),
+ AddScoredDocument(/*document_id=*/4),
+ AddScoredDocument(/*document_id=*/3)};
+
+ std::vector<ScoredDocumentHit> scored_document_hits2 = {
+ AddScoredDocument(/*document_id=*/6),
+ AddScoredDocument(/*document_id=*/5)};
+
+ num_total_hits() = 2;
+ {
+ // Adjustment info is not important in this test.
+ // Creates a ResultState with 5 ScoredDocumentHits.
+ ResultStateV2 result_state1(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits1),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::NAMESPACE),
+ document_store());
+
+ absl_ports::unique_lock l(&result_state1.mutex);
+
+ result_state1.RegisterNumTotalHits(&num_total_hits());
+ ASSERT_THAT(num_total_hits(), Eq(7));
+
+ {
+ // Adjustment info is not important in this test.
+ // Creates another ResultState with 2 ScoredDocumentHits.
+ ResultStateV2 result_state2(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits2),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::NAMESPACE),
+ document_store());
+
+ absl_ports::unique_lock l(&result_state2.mutex);
+
+ result_state2.RegisterNumTotalHits(&num_total_hits());
+ ASSERT_THAT(num_total_hits(), Eq(9));
+ }
+
+ EXPECT_THAT(num_total_hits(), Eq(7));
+ }
+ EXPECT_THAT(num_total_hits(), Eq(2));
+}
+
+TEST_F(ResultStateV2Test, ShouldNotUpdateNumTotalHitsWhenNotRegistered) {
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ AddScoredDocument(/*document_id=*/1),
+ AddScoredDocument(/*document_id=*/0),
+ AddScoredDocument(/*document_id=*/2),
+ AddScoredDocument(/*document_id=*/4),
+ AddScoredDocument(/*document_id=*/3)};
+
+ // Creates a ResultState with 5 ScoredDocumentHits.
+ {
+ // Adjustment info is not important in this test.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::NAMESPACE),
+ document_store());
+
+ {
+ absl_ports::unique_lock l(&result_state.mutex);
+
+ EXPECT_THAT(num_total_hits(), Eq(0));
+ result_state.IncrementNumTotalHits(500);
+ EXPECT_THAT(num_total_hits(), Eq(0));
+ }
+ }
+ EXPECT_THAT(num_total_hits(), Eq(0));
+}
+
+TEST_F(ResultStateV2Test, ShouldDecrementOriginalNumTotalHitsWhenReregister) {
+ std::atomic<int> another_num_total_hits = 11;
+
+ std::vector<ScoredDocumentHit> scored_document_hits = {
+ AddScoredDocument(/*document_id=*/1),
+ AddScoredDocument(/*document_id=*/0),
+ AddScoredDocument(/*document_id=*/2),
+ AddScoredDocument(/*document_id=*/4),
+ AddScoredDocument(/*document_id=*/3)};
+
+ // Adjustment info is not important in this test.
+ // Creates a ResultState with 5 ScoredDocumentHits.
+ ResultStateV2 result_state(
+ std::make_unique<
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>>(
+ std::move(scored_document_hits),
+ /*is_descending=*/true),
+ /*parent_adjustment_info=*/nullptr, /*child_adjustment_info=*/nullptr,
+ CreateResultSpec(/*num_per_page=*/5, ResultSpecProto::NAMESPACE),
+ document_store());
+
+ absl_ports::unique_lock l(&result_state.mutex);
+
+ num_total_hits() = 7;
+ result_state.RegisterNumTotalHits(&num_total_hits());
+ EXPECT_THAT(num_total_hits(), Eq(12));
+
+ result_state.RegisterNumTotalHits(&another_num_total_hits);
+ // The original num_total_hits should be decremented after re-registration.
+ EXPECT_THAT(num_total_hits(), Eq(7));
+ // another_num_total_hits should be incremented after re-registration.
+ EXPECT_THAT(another_num_total_hits, Eq(16));
+
+ result_state.IncrementNumTotalHits(500);
+ // The original num_total_hits should be unchanged.
+ EXPECT_THAT(num_total_hits(), Eq(7));
+ // Increment should be done on another_num_total_hits.
+ EXPECT_THAT(another_num_total_hits, Eq(516));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/result-state.cc b/icing/result/result-state.cc
deleted file mode 100644
index bf28f52..0000000
--- a/icing/result/result-state.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/result/result-state.h"
-
-#include "icing/scoring/ranker.h"
-#include "icing/util/logging.h"
-
-namespace icing {
-namespace lib {
-
-SnippetContext CreateSnippetContext(SectionRestrictQueryTermsMap query_terms,
- const SearchSpecProto& search_spec,
- const ResultSpecProto& result_spec) {
- if (result_spec.snippet_spec().num_to_snippet() > 0 &&
- result_spec.snippet_spec().num_matches_per_property() > 0) {
- // Needs snippeting
- return SnippetContext(std::move(query_terms), result_spec.snippet_spec(),
- search_spec.term_match_type());
- }
- return SnippetContext(/*query_terms_in=*/{},
- ResultSpecProto::SnippetSpecProto::default_instance(),
- TermMatchType::UNKNOWN);
-}
-
-ResultState::ResultState(std::vector<ScoredDocumentHit> scored_document_hits,
- SectionRestrictQueryTermsMap query_terms,
- const SearchSpecProto& search_spec,
- const ScoringSpecProto& scoring_spec,
- const ResultSpecProto& result_spec)
- : scored_document_hits_(std::move(scored_document_hits)),
- snippet_context_(CreateSnippetContext(std::move(query_terms), search_spec,
- result_spec)),
- num_per_page_(result_spec.num_per_page()),
- num_returned_(0),
- scored_document_hit_comparator_(scoring_spec.order_by() ==
- ScoringSpecProto::Order::DESC) {
- BuildHeapInPlace(&scored_document_hits_, scored_document_hit_comparator_);
-}
-
-std::vector<ScoredDocumentHit> ResultState::GetNextPage() {
- std::vector<ScoredDocumentHit> scored_document_hits = PopTopResultsFromHeap(
- &scored_document_hits_, num_per_page_, scored_document_hit_comparator_);
- num_returned_ += scored_document_hits.size();
- return scored_document_hits;
-}
-
-void ResultState::TruncateHitsTo(int new_size) {
- if (new_size < 0 || scored_document_hits_.size() <= new_size) {
- return;
- }
-
- // Copying the best new_size results.
- scored_document_hits_ = PopTopResultsFromHeap(
- &scored_document_hits_, new_size, scored_document_hit_comparator_);
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/result/result-state.h b/icing/result/result-state.h
deleted file mode 100644
index 82e783b..0000000
--- a/icing/result/result-state.h
+++ /dev/null
@@ -1,81 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_RESULT_RESULT_STATE_H_
-#define ICING_RESULT_RESULT_STATE_H_
-
-#include <vector>
-
-#include "icing/proto/scoring.pb.h"
-#include "icing/proto/search.pb.h"
-#include "icing/result/snippet-context.h"
-#include "icing/scoring/scored-document-hit.h"
-
-namespace icing {
-namespace lib {
-
-// Used to hold information needed across multiple pagination requests of the
-// same query. Stored in ResultStateManager.
-class ResultState {
- public:
- explicit ResultState(std::vector<ScoredDocumentHit> scored_document_hits,
- SectionRestrictQueryTermsMap query_terms,
- const SearchSpecProto& search_spec,
- const ScoringSpecProto& scoring_spec,
- const ResultSpecProto& result_spec);
-
- // Returns the next page of results. The size of page is passed in from
- // ResultSpecProto in constructor. Calling this method could increase the
- // value of num_returned(), so be careful of the order of calling these
- // methods.
- std::vector<ScoredDocumentHit> GetNextPage();
-
- // Truncates the vector of ScoredDocumentHits to the given size. The best
- // ScoredDocumentHits are kept.
- void TruncateHitsTo(int new_size);
-
- // Returns if the current state has more results to return.
- bool HasMoreResults() const { return !scored_document_hits_.empty(); }
-
- // Returns a SnippetContext generated from the specs passed in via
- // constructor.
- const SnippetContext& snippet_context() const { return snippet_context_; }
-
- // The number of results that have already been returned. This number is
- // increased when GetNextPage() is called.
- int num_returned() const { return num_returned_; }
-
- private:
- // The scored document hits. It represents a heap data structure when ranking
- // is required so that we can get top K hits in O(KlgN) time. If no ranking is
- // required, it's just a vector of ScoredDocumentHits in the original order.
- std::vector<ScoredDocumentHit> scored_document_hits_;
-
- // Information needed for snippeting.
- SnippetContext snippet_context_;
-
- // Number of results to return in each page.
- int num_per_page_;
-
- // Number of results that have already been returned.
- int num_returned_;
-
- // Used to compare two scored document hits.
- ScoredDocumentHitComparator scored_document_hit_comparator_;
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_RESULT_RESULT_STATE_H_
diff --git a/icing/result/result-state_test.cc b/icing/result/result-state_test.cc
deleted file mode 100644
index 85cb242..0000000
--- a/icing/result/result-state_test.cc
+++ /dev/null
@@ -1,214 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/result/result-state.h"
-
-#include "gtest/gtest.h"
-#include "icing/portable/equals-proto.h"
-#include "icing/scoring/scored-document-hit.h"
-#include "icing/testing/common-matchers.h"
-
-namespace icing {
-namespace lib {
-namespace {
-using ::icing::lib::portable_equals_proto::EqualsProto;
-using ::testing::ElementsAre;
-using ::testing::Eq;
-using ::testing::IsEmpty;
-
-ScoredDocumentHit CreateScoredDocumentHit(DocumentId document_id) {
- return ScoredDocumentHit(document_id, kSectionIdMaskNone, /*score=*/1);
-}
-
-SearchSpecProto CreateSearchSpec(TermMatchType::Code match_type) {
- SearchSpecProto search_spec;
- search_spec.set_term_match_type(match_type);
- return search_spec;
-}
-
-ScoringSpecProto CreateScoringSpec(bool is_descending_order) {
- ScoringSpecProto scoring_spec;
- scoring_spec.set_order_by(is_descending_order ? ScoringSpecProto::Order::DESC
- : ScoringSpecProto::Order::ASC);
- return scoring_spec;
-}
-
-ResultSpecProto CreateResultSpec(int num_per_page) {
- ResultSpecProto result_spec;
- result_spec.set_num_per_page(num_per_page);
- return result_spec;
-}
-
-// ResultState::ResultState() and ResultState::GetNextPage() are calling
-// Ranker::BuildHeapInPlace() and Ranker::PopTopResultsFromHeap() directly, so
-// we don't need to test much on what order is returned as that is tested in
-// Ranker's tests. Here we just need one sanity test to make sure that the
-// correct functions are called.
-TEST(ResultStateTest, ShouldReturnNextPage) {
- std::vector<ScoredDocumentHit> scored_document_hits = {
- CreateScoredDocumentHit(/*document_id=*/2),
- CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/3),
- CreateScoredDocumentHit(/*document_id=*/5),
- CreateScoredDocumentHit(/*document_id=*/4)};
-
- ResultState result_state(scored_document_hits, /*query_terms=*/{},
- CreateSearchSpec(TermMatchType::EXACT_ONLY),
- CreateScoringSpec(/*is_descending_order=*/true),
- CreateResultSpec(/*num_per_page=*/2));
-
- EXPECT_THAT(
- result_state.GetNextPage(),
- ElementsAre(
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4))));
-
- EXPECT_THAT(
- result_state.GetNextPage(),
- ElementsAre(
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2))));
-
- EXPECT_THAT(result_state.GetNextPage(),
- ElementsAre(EqualsScoredDocumentHit(
- CreateScoredDocumentHit(/*document_id=*/1))));
-}
-
-TEST(ResultStateTest, ShouldReturnSnippetContextAccordingToSpecs) {
- ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
- result_spec.mutable_snippet_spec()->set_num_to_snippet(5);
- result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
- result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
-
- SectionRestrictQueryTermsMap query_terms_map;
- query_terms_map.emplace("term1", std::unordered_set<std::string>());
-
- ResultState result_state(
- /*scored_document_hits=*/{}, query_terms_map,
- CreateSearchSpec(TermMatchType::EXACT_ONLY),
- CreateScoringSpec(/*is_descending_order=*/true), result_spec);
-
- const SnippetContext& snippet_context = result_state.snippet_context();
-
- // Snippet context should be derived from the specs above.
- EXPECT_TRUE(snippet_context.query_terms.find("term1") !=
- snippet_context.query_terms.end());
- EXPECT_THAT(snippet_context.snippet_spec,
- EqualsProto(result_spec.snippet_spec()));
- EXPECT_THAT(snippet_context.match_type, Eq(TermMatchType::EXACT_ONLY));
-
- // The same copy can be fetched multiple times.
- const SnippetContext& snippet_context2 = result_state.snippet_context();
- EXPECT_TRUE(snippet_context2.query_terms.find("term1") !=
- snippet_context2.query_terms.end());
- EXPECT_THAT(snippet_context2.snippet_spec,
- EqualsProto(result_spec.snippet_spec()));
- EXPECT_THAT(snippet_context2.match_type, Eq(TermMatchType::EXACT_ONLY));
-}
-
-TEST(ResultStateTest, NoSnippetingShouldReturnNull) {
- ResultSpecProto result_spec = CreateResultSpec(/*num_per_page=*/2);
- // Setting num_to_snippet to 0 so that snippeting info won't be
- // stored.
- result_spec.mutable_snippet_spec()->set_num_to_snippet(0);
- result_spec.mutable_snippet_spec()->set_num_matches_per_property(5);
- result_spec.mutable_snippet_spec()->set_max_window_bytes(5);
-
- SectionRestrictQueryTermsMap query_terms_map;
- query_terms_map.emplace("term1", std::unordered_set<std::string>());
-
- ResultState result_state(/*scored_document_hits=*/{}, query_terms_map,
- CreateSearchSpec(TermMatchType::EXACT_ONLY),
- CreateScoringSpec(/*is_descending_order=*/true),
- result_spec);
-
- const SnippetContext& snippet_context = result_state.snippet_context();
- EXPECT_THAT(snippet_context.query_terms, IsEmpty());
- EXPECT_THAT(
- snippet_context.snippet_spec,
- EqualsProto(ResultSpecProto::SnippetSpecProto::default_instance()));
- EXPECT_THAT(snippet_context.match_type, TermMatchType::UNKNOWN);
-}
-
-TEST(ResultStateTest, ShouldTruncateToNewSize) {
- std::vector<ScoredDocumentHit> scored_document_hits = {
- CreateScoredDocumentHit(/*document_id=*/2),
- CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/3),
- CreateScoredDocumentHit(/*document_id=*/5),
- CreateScoredDocumentHit(/*document_id=*/4)};
-
- // Creates a ResultState with 5 ScoredDocumentHits.
- ResultState result_state(scored_document_hits, /*query_terms=*/{},
- CreateSearchSpec(TermMatchType::EXACT_ONLY),
- CreateScoringSpec(/*is_descending_order=*/true),
- CreateResultSpec(/*num_per_page=*/5));
-
- result_state.TruncateHitsTo(/*new_size=*/3);
- // The best 3 are left.
- EXPECT_THAT(
- result_state.GetNextPage(),
- ElementsAre(
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3))));
-}
-
-TEST(ResultStateTest, ShouldTruncateToZero) {
- std::vector<ScoredDocumentHit> scored_document_hits = {
- CreateScoredDocumentHit(/*document_id=*/2),
- CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/3),
- CreateScoredDocumentHit(/*document_id=*/5),
- CreateScoredDocumentHit(/*document_id=*/4)};
-
- // Creates a ResultState with 5 ScoredDocumentHits.
- ResultState result_state(scored_document_hits, /*query_terms=*/{},
- CreateSearchSpec(TermMatchType::EXACT_ONLY),
- CreateScoringSpec(/*is_descending_order=*/true),
- CreateResultSpec(/*num_per_page=*/5));
-
- result_state.TruncateHitsTo(/*new_size=*/0);
- EXPECT_THAT(result_state.GetNextPage(), IsEmpty());
-}
-
-TEST(ResultStateTest, ShouldNotTruncateToNegative) {
- std::vector<ScoredDocumentHit> scored_document_hits = {
- CreateScoredDocumentHit(/*document_id=*/2),
- CreateScoredDocumentHit(/*document_id=*/1),
- CreateScoredDocumentHit(/*document_id=*/3),
- CreateScoredDocumentHit(/*document_id=*/5),
- CreateScoredDocumentHit(/*document_id=*/4)};
-
- // Creates a ResultState with 5 ScoredDocumentHits.
- ResultState result_state(scored_document_hits, /*query_terms=*/{},
- CreateSearchSpec(TermMatchType::EXACT_ONLY),
- CreateScoringSpec(/*is_descending_order=*/true),
- CreateResultSpec(/*num_per_page=*/5));
-
- result_state.TruncateHitsTo(/*new_size=*/-1);
- // Results are not affected.
- EXPECT_THAT(
- result_state.GetNextPage(),
- ElementsAre(
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/5)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/4)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/3)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/2)),
- EqualsScoredDocumentHit(CreateScoredDocumentHit(/*document_id=*/1))));
-}
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/result/snippet-retriever-test-jni-layer.cc b/icing/result/snippet-retriever-test-jni-layer.cc
new file mode 100644
index 0000000..707d9ee
--- /dev/null
+++ b/icing/result/snippet-retriever-test-jni-layer.cc
@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_jni_SnippetRetrieverJniTest_testsMain(JNIEnv* env, jclass ignored) {
+ g_jenv = env;
+
+ std::vector<char*> my_argv;
+ char arg[] = "jni-test-lib";
+ my_argv.push_back(arg);
+ int argc = 1;
+ char** argv = &(my_argv[0]);
+ testing::InitGoogleTest(&argc, argv);
+ testing::UnitTest::GetInstance()->listeners().Append(
+ new icing::lib::LoggingEventListener());
+ return RUN_ALL_TESTS() == 0;
+}
diff --git a/icing/result/snippet-retriever.cc b/icing/result/snippet-retriever.cc
index 09d0f7a..fcaba4c 100644
--- a/icing/result/snippet-retriever.cc
+++ b/icing/result/snippet-retriever.cc
@@ -15,6 +15,7 @@
#include "icing/result/snippet-retriever.h"
#include <algorithm>
+#include <iterator>
#include <memory>
#include <string>
#include <string_view>
@@ -25,8 +26,12 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/search.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/query/query-terms.h"
+#include "icing/schema/property-util.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/store/document-filter-data.h"
@@ -35,7 +40,9 @@
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
#include "icing/transform/normalizer.h"
+#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
+#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -43,10 +50,166 @@ namespace lib {
namespace {
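+// Returns property_path with the value's index appended when the property
+// holds multiple values. Illustrative sketch of the intended behavior of
+// AddIndexToPath below (assuming property_util emits "[i]" index
+// expressions, as in the "bcc[1]" example later in this file):
+// AddIndexToPath(/*values_size=*/3, /*index=*/1, "recipient.address")
+// -> "recipient.address[1]"
+// AddIndexToPath(/*values_size=*/1, /*index=*/0, "subject") -> "subject"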
+inline std::string AddIndexToPath(int values_size, int index,
+ const std::string& property_path) {
+ if (values_size == 1) {
+ return property_path;
+ }
+ return absl_ports::StrCat(
+ property_path, property_util::ConvertToPropertyExprIndexStr(index));
+}
+
+// Returns a string of the normalized text of the input Token. Normalization
+// is applied based on the Token's type.
+std::string NormalizeToken(const Normalizer& normalizer, const Token& token) {
+ switch (token.type) {
+ case Token::Type::RFC822_NAME:
+ [[fallthrough]];
+ case Token::Type::RFC822_COMMENT:
+ [[fallthrough]];
+ case Token::Type::RFC822_LOCAL_ADDRESS:
+ [[fallthrough]];
+ case Token::Type::RFC822_HOST_ADDRESS:
+ [[fallthrough]];
+ case Token::Type::RFC822_ADDRESS:
+ [[fallthrough]];
+ case Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL:
+ [[fallthrough]];
+ case Token::Type::RFC822_ADDRESS_COMPONENT_HOST:
+ [[fallthrough]];
+ case Token::Type::RFC822_TOKEN:
+ [[fallthrough]];
+ case Token::Type::URL_SCHEME:
+ [[fallthrough]];
+ case Token::Type::URL_USERNAME:
+ [[fallthrough]];
+ case Token::Type::URL_PASSWORD:
+ [[fallthrough]];
+ case Token::Type::URL_HOST_COMMON_PART:
+ [[fallthrough]];
+ case Token::Type::URL_HOST_SIGNIFICANT_PART:
+ [[fallthrough]];
+ case Token::Type::URL_PORT:
+ [[fallthrough]];
+ case Token::Type::URL_PATH_PART:
+ [[fallthrough]];
+ case Token::Type::URL_QUERY:
+ [[fallthrough]];
+ case Token::Type::URL_REF:
+ [[fallthrough]];
+ case Token::Type::URL_SUFFIX:
+ [[fallthrough]];
+ case Token::Type::URL_SUFFIX_INNERMOST:
+ [[fallthrough]];
+ case Token::Type::REGULAR:
+ return normalizer.NormalizeTerm(token.text);
+ case Token::Type::VERBATIM:
+ return std::string(token.text);
+ case Token::Type::QUERY_EXCLUSION:
+ [[fallthrough]];
+ case Token::Type::QUERY_LEFT_PARENTHESES:
+ [[fallthrough]];
+ case Token::Type::QUERY_RIGHT_PARENTHESES:
+ [[fallthrough]];
+ case Token::Type::QUERY_OR:
+ [[fallthrough]];
+ case Token::Type::QUERY_PROPERTY:
+ [[fallthrough]];
+ case Token::Type::INVALID:
+ ICING_LOG(WARNING) << "Unable to normalize token of type: "
+ << static_cast<int>(token.type);
+ return std::string(token.text);
+ }
+}
+
+// Returns a CharacterIterator into token's text, positioned one past the
+// last character that matches the query term.
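+// For example (illustrative): for a REGULAR token with text "foobar" and a
+// matched query term "foo", the returned iterator would sit at index 3 in
+// every encoding, one past the last matched character.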
+CharacterIterator FindMatchEnd(const Normalizer& normalizer, const Token& token,
+ const std::string& match_query_term) {
+ switch (token.type) {
+ case Token::Type::VERBATIM: {
+ // VERBATIM tokens are not normalized. This means the non-normalized
+ // matched query term must be either equal to or a prefix of the token's
+ // text. Therefore, the match must end at the end of the matched query
+ // term.
+ CharacterIterator verbatim_match_end =
+ CharacterIterator(token.text, 0, 0, 0);
+ verbatim_match_end.AdvanceToUtf8(match_query_term.length());
+ return verbatim_match_end;
+ }
+ case Token::Type::QUERY_EXCLUSION:
+ [[fallthrough]];
+ case Token::Type::QUERY_LEFT_PARENTHESES:
+ [[fallthrough]];
+ case Token::Type::QUERY_RIGHT_PARENTHESES:
+ [[fallthrough]];
+ case Token::Type::QUERY_OR:
+ [[fallthrough]];
+ case Token::Type::QUERY_PROPERTY:
+ [[fallthrough]];
+ case Token::Type::INVALID:
+ ICING_LOG(WARNING)
+ << "Unexpected Token type " << static_cast<int>(token.type)
+ << " found when finding match end of query term and token.";
+ [[fallthrough]];
+ case Token::Type::RFC822_NAME:
+ [[fallthrough]];
+ case Token::Type::RFC822_COMMENT:
+ [[fallthrough]];
+ case Token::Type::RFC822_LOCAL_ADDRESS:
+ [[fallthrough]];
+ case Token::Type::RFC822_HOST_ADDRESS:
+ [[fallthrough]];
+ case Token::Type::RFC822_ADDRESS:
+ [[fallthrough]];
+ case Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL:
+ [[fallthrough]];
+ case Token::Type::RFC822_ADDRESS_COMPONENT_HOST:
+ [[fallthrough]];
+ case Token::Type::RFC822_TOKEN:
+ [[fallthrough]];
+ case Token::Type::URL_SCHEME:
+ [[fallthrough]];
+ case Token::Type::URL_USERNAME:
+ [[fallthrough]];
+ case Token::Type::URL_PASSWORD:
+ [[fallthrough]];
+ case Token::Type::URL_HOST_COMMON_PART:
+ [[fallthrough]];
+ case Token::Type::URL_HOST_SIGNIFICANT_PART:
+ [[fallthrough]];
+ case Token::Type::URL_PORT:
+ [[fallthrough]];
+ case Token::Type::URL_QUERY:
+ [[fallthrough]];
+ case Token::Type::URL_PATH_PART:
+ [[fallthrough]];
+ case Token::Type::URL_REF:
+ [[fallthrough]];
+ case Token::Type::URL_SUFFIX:
+ [[fallthrough]];
+ case Token::Type::URL_SUFFIX_INNERMOST:
+ [[fallthrough]];
+ case Token::Type::REGULAR:
+ return normalizer.FindNormalizedMatchEndPosition(token.text,
+ match_query_term);
+ }
+}
+
class TokenMatcher {
public:
virtual ~TokenMatcher() = default;
- virtual bool Matches(Token token) const = 0;
+
+ // Returns a CharacterIterator pointing just past the end of the substring in
+ // token.text that matches a query term. Note that the utf* indices will be
+ // in relation to token.text's start.
+ //
+ // If there is no match, then it will construct a CharacterIterator with all
+ // of its indices set to -1.
+ //
+ // Ex. With an exact matcher, query terms=["foo","bar"] and token.text="bar",
+ // Matches will return a CharacterIterator(u8:3, u16:3, u32:3).
+ virtual CharacterIterator Matches(Token token) const = 0;
};
class TokenMatcherExact : public TokenMatcher {
@@ -59,10 +222,18 @@ class TokenMatcherExact : public TokenMatcher {
restricted_query_terms_(restricted_query_terms),
normalizer_(normalizer) {}
- bool Matches(Token token) const override {
- std::string s = normalizer_.NormalizeTerm(token.text);
- return (unrestricted_query_terms_.count(s) > 0) ||
- (restricted_query_terms_.count(s) > 0);
+ CharacterIterator Matches(Token token) const override {
+ std::string s = NormalizeToken(normalizer_, token);
+ // Look the normalized token up in each set separately; comparing an
+ // iterator against the end() of a container it doesn't belong to is
+ // undefined behavior.
+ auto itr = unrestricted_query_terms_.find(s);
+ if (itr != unrestricted_query_terms_.end()) {
+ return FindMatchEnd(normalizer_, token, *itr);
+ }
+ itr = restricted_query_terms_.find(s);
+ if (itr != restricted_query_terms_.end()) {
+ return FindMatchEnd(normalizer_, token, *itr);
+ }
+ return CharacterIterator(token.text, -1, -1, -1);
}
private:
@@ -81,22 +252,21 @@ class TokenMatcherPrefix : public TokenMatcher {
restricted_query_terms_(restricted_query_terms),
normalizer_(normalizer) {}
- bool Matches(Token token) const override {
- std::string s = normalizer_.NormalizeTerm(token.text);
- if (std::any_of(unrestricted_query_terms_.begin(),
- unrestricted_query_terms_.end(),
- [&s](const std::string& term) {
- return term.length() <= s.length() &&
- s.compare(0, term.length(), term) == 0;
- })) {
- return true;
+ CharacterIterator Matches(Token token) const override {
+ std::string s = NormalizeToken(normalizer_, token);
+ for (const std::string& query_term : unrestricted_query_terms_) {
+ if (query_term.length() <= s.length() &&
+ s.compare(0, query_term.length(), query_term) == 0) {
+ return FindMatchEnd(normalizer_, token, query_term);
+ }
}
- return std::any_of(restricted_query_terms_.begin(),
- restricted_query_terms_.end(),
- [&s](const std::string& term) {
- return term.length() <= s.length() &&
- s.compare(0, term.length(), term) == 0;
- });
+ for (const std::string& query_term : restricted_query_terms_) {
+ if (query_term.length() <= s.length() &&
+ s.compare(0, query_term.length(), query_term) == 0) {
+ return FindMatchEnd(normalizer_, token, query_term);
+ }
+ }
+ return CharacterIterator(token.text, -1, -1, -1);
}
private:
@@ -124,115 +294,165 @@ libtextclassifier3::StatusOr<std::unique_ptr<TokenMatcher>> CreateTokenMatcher(
}
}
-// Returns true if token matches any of the terms in query terms according to
-// the provided match type.
+// Finds the start position of a valid token that is after
+// window_start_min_exclusive_utf32.
//
// Returns:
// the position of the window start if successful
// INTERNAL_ERROR - if a tokenizer error is encountered
-libtextclassifier3::StatusOr<int> DetermineWindowStart(
+libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowStart(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
- std::string_view value, int match_mid, Tokenizer::Iterator* iterator) {
- int window_start_min = (match_mid - snippet_spec.max_window_bytes() / 2) - 1;
- if (window_start_min < 0) {
- return 0;
- }
- if (!iterator->ResetToTokenAfter(window_start_min)) {
+ std::string_view value, int window_start_min_exclusive_utf32,
+ Tokenizer::Iterator* iterator) {
+ if (!iterator->ResetToTokenStartingAfter(window_start_min_exclusive_utf32)) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
}
- return iterator->GetToken().text.data() - value.data();
+ return iterator->CalculateTokenStart();
}
// Increments window_end_exclusive so long as the character at the position
// of window_end_exclusive is punctuation and does not exceed
-// window_end_max_exclusive.
-int IncludeTrailingPunctuation(std::string_view value, int window_end_exclusive,
- int window_end_max_exclusive) {
- while (window_end_exclusive < window_end_max_exclusive) {
+// window_end_max_exclusive_utf32.
+CharacterIterator IncludeTrailingPunctuation(
+ std::string_view value, CharacterIterator window_end_exclusive,
+ int window_end_max_exclusive_utf32) {
+ size_t max_search_index = value.length() - 1;
+ while (window_end_exclusive.utf8_index() <= max_search_index &&
+ window_end_exclusive.utf32_index() < window_end_max_exclusive_utf32) {
int char_len = 0;
- if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive, &char_len)) {
- break;
- }
- if (window_end_exclusive + char_len > window_end_max_exclusive) {
- // This is punctuation, but it goes beyond the window end max. Don't
- // include it.
+ if (!i18n_utils::IsPunctuationAt(value, window_end_exclusive.utf8_index(),
+ &char_len)) {
break;
}
// Expand window by char_len and check the next character.
- window_end_exclusive += char_len;
+ window_end_exclusive.AdvanceToUtf32(window_end_exclusive.utf32_index() + 1);
}
return window_end_exclusive;
}
+// Finds the end position of a valid token that is before the
+// window_end_max_exclusive_utf32.
+//
// Returns:
// the position of the window end if successful
// INTERNAL_ERROR - if a tokenizer error is encountered
-libtextclassifier3::StatusOr<int> DetermineWindowEnd(
+libtextclassifier3::StatusOr<CharacterIterator> DetermineWindowEnd(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
- std::string_view value, int match_mid, Tokenizer::Iterator* iterator) {
- int window_end_max_exclusive =
- match_mid + snippet_spec.max_window_bytes() / 2;
- if (window_end_max_exclusive >= value.length()) {
- return value.length();
- }
- if (!iterator->ResetToTokenBefore(window_end_max_exclusive)) {
+ std::string_view value, int window_end_max_exclusive_utf32,
+ Tokenizer::Iterator* iterator) {
+ if (!iterator->ResetToTokenEndingBefore(window_end_max_exclusive_utf32)) {
return absl_ports::InternalError(
"Couldn't reset tokenizer to determine snippet window!");
}
- int window_end_exclusive = iterator->GetToken().text.data() - value.data() +
- iterator->GetToken().text.length();
- return IncludeTrailingPunctuation(value, window_end_exclusive,
- window_end_max_exclusive);
+ ICING_ASSIGN_OR_RETURN(CharacterIterator end_exclusive,
+ iterator->CalculateTokenEndExclusive());
+ return IncludeTrailingPunctuation(value, end_exclusive,
+ window_end_max_exclusive_utf32);
}
struct SectionData {
std::string_view section_name;
std::string_view section_subcontent;
- // Identifies which subsection of the section content, section_subcontent has
- // come from.
- // Ex. "recipient.address" :
- // ["foo@google.com", "bar@google.com", "baz@google.com"]
- // The subcontent_index of "bar@google.com" is 1.
- int subcontent_index;
};
+// Creates a snippet match proto for the match between start_itr and end_itr
+// in the section content pointed to by the iterator.
+// Returns:
+// a SnippetMatchProto for the match if successful
+// INTERNAL_ERROR - if a tokenizer error is encountered and iterator is left
+// in an invalid state
+// ABORTED_ERROR - if an invalid utf-8 sequence is encountered
libtextclassifier3::StatusOr<SnippetMatchProto> RetrieveMatch(
const ResultSpecProto::SnippetSpecProto& snippet_spec,
- const SectionData& value, Tokenizer::Iterator* iterator) {
+ const SectionData& value, Tokenizer::Iterator* iterator,
+ const CharacterIterator& start_itr, const CharacterIterator& end_itr) {
SnippetMatchProto snippet_match;
- snippet_match.set_values_index(value.subcontent_index);
-
- Token match = iterator->GetToken();
- int match_pos = match.text.data() - value.section_subcontent.data();
- int match_mid = match_pos + match.text.length() / 2;
-
- snippet_match.set_exact_match_position(match_pos);
- snippet_match.set_exact_match_bytes(match.text.length());
-
- if (snippet_spec.max_window_bytes() > match.text.length()) {
+ // When finding boundaries, we have a few cases:
+ //
+ // Case 1:
+ // If we have an odd length match and an odd length window, the window surrounds
+ // the match perfectly.
+ // match = "bar" in "foo bar baz"
+ // window = |---|
+ //
+ // Case 2:
+ // If we have an even length match with an even length window, the window
+ // surrounds the match perfectly.
+ // match = "baar" in "foo baar baz"
+ // window = |----|
+ //
+ // Case 3:
+ // If we have an odd length match with an even length window, we allocate
+ // the extra window character to the beginning.
+ // match = "bar" in "foo bar baz"
+ // window = |----|
+ //
+ // Case 4:
+ // If we have an even length match with an odd length window, we allocate
+ // the extra window character to the end.
+ // match = "baar" in "foo baar baz"
+ // window = |-----|
+ //
+ // We have to do the +1/-1 below to get the math to match up.
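+ //
+ // Worked example (illustrative): for value "foo bar baz" with match "bar",
+ // match_pos_utf32=4, match_len_utf32=3, match_mid_utf32=5. With
+ // max_window_utf32_length()=4 (Case 3), window_start_min_exclusive_utf32 =
+ // (5 - 2) - 1 = 2 and window_end_max_exclusive_utf32 = 5 + 2 = 7, bounding
+ // the window to utf32 indices 3..6 (" bar") before it is snapped to token
+ // boundaries below.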
+ int match_pos_utf32 = start_itr.utf32_index();
+ int match_len_utf32 = end_itr.utf32_index() - match_pos_utf32;
+ int match_mid_utf32 = match_pos_utf32 + match_len_utf32 / 2;
+ int window_start_min_exclusive_utf32 =
+ (match_mid_utf32 - snippet_spec.max_window_utf32_length() / 2) - 1;
+ int window_end_max_exclusive_utf32 =
+ match_mid_utf32 + (snippet_spec.max_window_utf32_length() + 1) / 2;
+
+ snippet_match.set_exact_match_byte_position(start_itr.utf8_index());
+ snippet_match.set_exact_match_utf16_position(start_itr.utf16_index());
+ snippet_match.set_exact_match_byte_length(end_itr.utf8_index() -
+ start_itr.utf8_index());
+ snippet_match.set_exact_match_utf16_length(end_itr.utf16_index() -
+ start_itr.utf16_index());
+
+ // Only include a window if it will at least contain the matched text.
+ // Otherwise, it would just be an empty string anyway.
+ if (snippet_spec.max_window_utf32_length() >= match_len_utf32) {
// Find the beginning of the window.
ICING_ASSIGN_OR_RETURN(
- int window_start,
- DetermineWindowStart(snippet_spec, value.section_subcontent, match_mid,
- iterator));
- snippet_match.set_window_position(window_start);
+ CharacterIterator window_start,
+ DetermineWindowStart(snippet_spec, value.section_subcontent,
+ window_start_min_exclusive_utf32, iterator));
+
+ // Check: did we get fewer characters than we requested? If so, then add
+ // the leftover space on to the window end.
+ int extra_window_space =
+ window_start.utf32_index() - 1 - window_start_min_exclusive_utf32;
+ window_end_max_exclusive_utf32 += extra_window_space;
// Find the end of the window.
ICING_ASSIGN_OR_RETURN(
- int window_end_exclusive,
- DetermineWindowEnd(snippet_spec, value.section_subcontent, match_mid,
- iterator));
- snippet_match.set_window_bytes(window_end_exclusive - window_start);
-
- // DetermineWindowStart/End may change the position of the iterator. So,
- // reset the iterator back to the original position.
- bool success = (match_pos > 0) ? iterator->ResetToTokenAfter(match_pos - 1)
- : iterator->ResetToStart();
- if (!success) {
- return absl_ports::InternalError(
- "Couldn't reset tokenizer to determine snippet window!");
+ CharacterIterator window_end,
+ DetermineWindowEnd(snippet_spec, value.section_subcontent,
+ window_end_max_exclusive_utf32, iterator));
+
+ // Check one more time. Did we get fewer characters than we requested? If
+ // so, then see if we can push the start back again.
+ extra_window_space =
+ window_end_max_exclusive_utf32 - window_end.utf32_index();
+ if (extra_window_space > 0) {
+ window_start_min_exclusive_utf32 =
+ window_start.utf32_index() - 1 - extra_window_space;
+ ICING_ASSIGN_OR_RETURN(
+ window_start,
+ DetermineWindowStart(snippet_spec, value.section_subcontent,
+ window_start_min_exclusive_utf32, iterator));
}
+
+ snippet_match.set_window_byte_position(window_start.utf8_index());
+ snippet_match.set_window_utf16_position(window_start.utf16_index());
+ snippet_match.set_window_byte_length(window_end.utf8_index() -
+ window_start.utf8_index());
+ snippet_match.set_window_utf16_length(window_end.utf16_index() -
+ window_start.utf16_index());
+
+ // DetermineWindowStart/End may change the position of the iterator, but it
+ // will be reset once the entire batch of tokens is checked.
}
return snippet_match;
@@ -243,33 +463,184 @@ struct MatchOptions {
int max_matches_remaining;
};
-libtextclassifier3::StatusOr<SnippetProto::EntryProto> RetrieveMatches(
- const TokenMatcher* matcher, const MatchOptions& match_options,
- const SectionData& value, const Tokenizer* tokenizer) {
- SnippetProto::EntryProto snippet_entry;
- snippet_entry.set_property_name(std::string(value.section_name));
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
- tokenizer->Tokenize(value.section_subcontent));
- while (iterator->Advance()) {
- if (snippet_entry.snippet_matches_size() >=
- match_options.max_matches_remaining) {
- break;
+// Retrieves snippets in the string values of current_property.
+// Tokenizer is provided to tokenize string content and matcher is provided to
+// indicate when a token matches content in the query.
+//
+// current_property is the property with the string values to snippet.
+// property_path is the path in the document to current_property.
+//
+// MatchOptions holds the snippet spec and number of desired matches remaining.
+// Each call to GetEntriesFromProperty will decrement max_matches_remaining
+// by the number of matches that it adds to snippet_proto.
+//
+// The SnippetEntries found for matched content will be added to snippet_proto.
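+//
+// For example (illustrative): if property_path="body" holds two string values
+// and matches are found only in the second one, a single EntryProto with
+// property_name "body[1]" is added to snippet_proto.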
+void GetEntriesFromProperty(const PropertyProto* current_property,
+ const std::string& property_path,
+ const TokenMatcher* matcher,
+ const Tokenizer* tokenizer,
+ MatchOptions* match_options,
+ SnippetProto* snippet_proto) {
+ // Snippet each of this property's string values.
+ for (int i = 0; i < current_property->string_values_size(); ++i) {
+ SnippetProto::EntryProto snippet_entry;
+ snippet_entry.set_property_name(AddIndexToPath(
+ current_property->string_values_size(), /*index=*/i, property_path));
+ std::string_view value = current_property->string_values(i);
+ std::unique_ptr<Tokenizer::Iterator> iterator =
+ tokenizer->Tokenize(value).ValueOrDie();
+ // All iterators move through positions sequentially, and constructing an
+ // iterator resets it to the beginning of the string. This means that, for t
+ // tokens in a string of n chars, each MoveToUtf8 call from the beginning of
+ // the string costs O(n/2) on average, whereas calling MoveToUtf8 from the
+ // token immediately prior to the desired one costs O(n/t). Constructing the
+ // iterators once, outside of the while-loop, keeps the total cost at
+ // O(t * (n/t)) = O(n) rather than O(t * n / 2).
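+ // For instance (illustrative numbers): with n=10000 and t=100, sequential
+ // moves cost ~100 characters each (~10000 total), while restarting from
+ // index 0 for every token would average ~5000 each (~500000 total).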
+ CharacterIterator start_itr(value);
+ CharacterIterator end_itr(value);
+ CharacterIterator reset_itr(value);
+ bool encountered_error = false;
+ while (iterator->Advance()) {
+ std::vector<Token> batch_tokens = iterator->GetTokens();
+ if (batch_tokens.empty()) {
+ continue;
+ }
+
+ bool needs_reset = false;
+ reset_itr.MoveToUtf8(batch_tokens.at(0).text.begin() - value.begin());
+ start_itr = reset_itr;
+ end_itr = start_itr;
+ // Use a distinct index so we don't shadow the string_values index above.
+ for (int j = 0; j < batch_tokens.size(); ++j) {
+ const Token& token = batch_tokens.at(j);
+ CharacterIterator submatch_end = matcher->Matches(token);
+ // If the token matched a query term, then submatch_end will point to an
+ // actual position within token.text.
+ if (submatch_end.utf8_index() == -1) {
+ continue;
+ }
+ // Because snippet matching may move the iterator around, we saved a reset
+ // iterator above so that we can restore the initial iterator state and
+ // continue advancing in order in the next round.
+ if (!start_itr.MoveToUtf8(token.text.begin() - value.begin())) {
+ encountered_error = true;
+ break;
+ }
+ if (!end_itr.MoveToUtf8(token.text.end() - value.begin())) {
+ encountered_error = true;
+ break;
+ }
+ SectionData data = {property_path, value};
+ auto match_or = RetrieveMatch(match_options->snippet_spec, data,
+ iterator.get(), start_itr, end_itr);
+ if (!match_or.ok()) {
+ if (absl_ports::IsAborted(match_or.status())) {
+ // Only an ABORTED error. We can't get this match, but we might be
+ // able to retrieve others. Just continue.
+ continue;
+ } else {
+ encountered_error = true;
+ break;
+ }
+ }
+ SnippetMatchProto match = std::move(match_or).ValueOrDie();
+ if (match.window_byte_length() > 0) {
+ needs_reset = true;
+ }
+ // submatch_end refers to a position *within* token.text.
+ // This, conveniently enough, means that the index that submatch_end
+ // points to is the length of the submatch (because the submatch
+ // starts at 0 in token.text).
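+ // E.g. (illustrative): for token.text="barcelona" with a prefix match on
+ // query term "bar", submatch_end.utf8_index() == 3, which is exactly the
+ // submatch's length in bytes.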
+ match.set_submatch_byte_length(submatch_end.utf8_index());
+ match.set_submatch_utf16_length(submatch_end.utf16_index());
+ // Add the values for the submatch.
+ snippet_entry.mutable_snippet_matches()->Add(std::move(match));
+
+ if (--match_options->max_matches_remaining <= 0) {
+ *snippet_proto->add_entries() = std::move(snippet_entry);
+ return;
+ }
+ }
+
+ if (encountered_error) {
+ break;
+ }
+
+ // RetrieveMatch may call DetermineWindowStart/End if windowing is
+ // requested, which may change the position of the iterator. So, reset the
+ // iterator back to the original position. The first token of the token
+ // batch will be the token to reset to.
+ if (needs_reset) {
+ if (reset_itr.utf8_index() > 0) {
+ encountered_error =
+ !iterator->ResetToTokenStartingAfter(reset_itr.utf32_index() - 1);
+ } else {
+ encountered_error = !iterator->ResetToStart();
+ }
+ }
+ if (encountered_error) {
+ break;
+ }
}
- Token token = iterator->GetToken();
- if (matcher->Matches(token)) {
- // If there was an error while retrieving the match, the tokenizer
- // iterator is probably in an invalid state. There's nothing we can do
- // here, so just return.
- ICING_ASSIGN_OR_RETURN(
- SnippetMatchProto match,
- RetrieveMatch(match_options.snippet_spec, value, iterator.get()));
- snippet_entry.mutable_snippet_matches()->Add(std::move(match));
+ if (!snippet_entry.snippet_matches().empty()) {
+ *snippet_proto->add_entries() = std::move(snippet_entry);
}
}
- if (snippet_entry.snippet_matches().empty()) {
- return absl_ports::NotFoundError("No matches found in value!");
+}
+
+// Retrieves snippets in document from content at section_path.
+// Tokenizer is provided to tokenize string content and matcher is provided to
+// indicate when a token matches content in the query.
+//
+// section_path_index identifies the element of section_path that document
+// currently corresponds to.
+// current_path is equivalent to the first section_path_index values in
+// section_path, but with value indices present.
+//
+// For example, suppose that a hit appeared somewhere in the "bcc.emailAddress".
+// The arguments for RetrieveSnippetForSection might be
+// {section_path=["bcc", "emailAddress"], section_path_index=0, current_path=""}
+// on the first call and
+// {section_path=["bcc", "emailAddress"], section_path_index=1,
+// current_path="bcc[1]"} on the second recursive call.
+//
+// MatchOptions holds the snippet spec and number of desired matches remaining.
+// Each call to RetrieveSnippetForSection will decrement max_matches_remaining
+// by the number of matches that it adds to snippet_proto.
+//
+// The SnippetEntries found for matched content will be added to snippet_proto.
+void RetrieveSnippetForSection(
+ const DocumentProto& document, const TokenMatcher* matcher,
+ const Tokenizer* tokenizer,
+ const std::vector<std::string_view>& section_path, int section_path_index,
+ const std::string& current_path, MatchOptions* match_options,
+ SnippetProto* snippet_proto) {
+ std::string_view next_property_name = section_path.at(section_path_index);
+ const PropertyProto* current_property =
+ property_util::GetPropertyProto(document, next_property_name);
+ if (current_property == nullptr) {
+ ICING_VLOG(1) << "No property " << next_property_name << " found at path "
+ << current_path;
+ return;
+ }
+ std::string property_path = property_util::ConcatenatePropertyPathExpr(
+ current_path, next_property_name);
+ if (section_path_index == section_path.size() - 1) {
+ // We're at the end. Let's check our values.
+ GetEntriesFromProperty(current_property, property_path, matcher, tokenizer,
+ match_options, snippet_proto);
+ } else {
+ // Still got more to go. Let's look through our subdocuments.
+ std::vector<SnippetProto::EntryProto> entries;
+ for (int i = 0; i < current_property->document_values_size(); ++i) {
+ std::string new_path = AddIndexToPath(
+ current_property->document_values_size(), /*index=*/i, property_path);
+ RetrieveSnippetForSection(current_property->document_values(i), matcher,
+ tokenizer, section_path, section_path_index + 1,
+ new_path, match_options, snippet_proto);
+ if (match_options->max_matches_remaining <= 0) {
+ break;
+ }
+ }
}
- return snippet_entry;
}
} // namespace
@@ -300,9 +671,13 @@ SnippetProto SnippetRetriever::RetrieveSnippet(
const std::unordered_set<std::string>& unrestricted_set =
(itr != query_terms.end()) ? itr->second : empty_set;
while (section_id_mask != kSectionIdMaskNone) {
- SectionId section_id = __builtin_ctz(section_id_mask);
+ SectionId section_id = __builtin_ctzll(section_id_mask);
// Remove this section from the mask.
- section_id_mask &= ~(1u << section_id);
+ section_id_mask &= ~(UINT64_C(1) << section_id);
+
+ MatchOptions match_options = {snippet_spec};
+ match_options.max_matches_remaining =
+ snippet_spec.num_matches_per_property();
// Determine the section name and match type.
auto section_metadata_or =
@@ -311,7 +686,9 @@ SnippetProto SnippetRetriever::RetrieveSnippet(
continue;
}
const SectionMetadata* metadata = section_metadata_or.ValueOrDie();
- MatchOptions match_options = {snippet_spec};
+ std::vector<std::string_view> section_path =
+ property_util::SplitPropertyPathExpr(metadata->path);
+
// Match type must be as restrictive as possible. Prefix matches for a
// snippet should only be included if both the query is Prefix and the
// section has prefixes enabled.
@@ -330,37 +707,18 @@ SnippetProto SnippetRetriever::RetrieveSnippet(
if (!matcher_or.ok()) {
continue;
}
- match_options.max_matches_remaining =
- snippet_spec.num_matches_per_property();
+ std::unique_ptr<TokenMatcher> matcher = std::move(matcher_or).ValueOrDie();
- // Retrieve values and snippet them.
- auto values_or = schema_store_.GetSectionContent(document, metadata->path);
- if (!values_or.ok()) {
- continue;
- }
auto tokenizer_or = tokenizer_factory::CreateIndexingTokenizer(
metadata->tokenizer, &language_segmenter_);
if (!tokenizer_or.ok()) {
// If we couldn't create the tokenizer properly, just skip this section.
continue;
}
- std::vector<std::string> values = values_or.ValueOrDie();
- for (int value_index = 0; value_index < values.size(); ++value_index) {
- if (match_options.max_matches_remaining <= 0) {
- break;
- }
- SectionData value = {metadata->path, values.at(value_index), value_index};
- auto entry_or =
- RetrieveMatches(matcher_or.ValueOrDie().get(), match_options, value,
- tokenizer_or.ValueOrDie().get());
-
- // Drop any entries that encountered errors or didn't find any matches.
- if (entry_or.ok()) {
- match_options.max_matches_remaining -=
- entry_or.ValueOrDie().snippet_matches_size();
- snippet_proto.mutable_entries()->Add(std::move(entry_or).ValueOrDie());
- }
- }
+ std::unique_ptr<Tokenizer> tokenizer = std::move(tokenizer_or).ValueOrDie();
+ RetrieveSnippetForSection(
+ document, matcher.get(), tokenizer.get(), section_path,
+ /*section_path_index=*/0, "", &match_options, &snippet_proto);
}
return snippet_proto;
}
diff --git a/icing/result/snippet-retriever_benchmark.cc b/icing/result/snippet-retriever_benchmark.cc
new file mode 100644
index 0000000..e574325
--- /dev/null
+++ b/icing/result/snippet-retriever_benchmark.cc
@@ -0,0 +1,333 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "third_party/absl/flags/flag.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/result/snippet-retriever.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/random-string.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/transform/normalizer-factory.h"
+#include "icing/util/clock.h"
+#include "icing/util/logging.h"
+#include "unicode/uloc.h"
+
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/result:snippet-retriever_benchmark
+//
+// $ blaze-bin/icing/result/snippet-retriever_benchmark
+// --benchmark_filter=all
+//
+// Run on an Android device:
+// Make target //icing/tokenization:language-segmenter depend on
+// //third_party/icu
+//
+// Make target //icing/transform:normalizer depend on
+// //third_party/icu
+//
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/result:snippet-retriever_benchmark
+//
+// $ adb push blaze-bin/icing/result/snippet-retriever_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/snippet-retriever_benchmark
+// --benchmark_filter=all --adb
+
+// Flag to tell the benchmark that it'll be run on an Android device via adb;
+// the benchmark will set up data files accordingly.
+ABSL_FLAG(bool, adb, false, "run benchmark via ADB on an Android device");
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::SizeIs;
+
+void BM_SnippetOneProperty(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ const std::string base_dir = GetTestTempDir() + "/snippet_retriever_benchmark";
+ const std::string schema_dir = base_dir + "/schema";
+ Filesystem filesystem;
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+ if (!filesystem.CreateDirectoryRecursively(schema_dir.c_str())) {
+ ICING_LOG(ERROR) << "Failed to create test directories";
+ }
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ std::unique_ptr<LanguageSegmenter> language_segmenter =
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
+ std::unique_ptr<Normalizer> normalizer =
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max())
+ .ValueOrDie();
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type1").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ Clock clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_dir, &clock));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ auto snippet_retriever =
+ SnippetRetriever::Create(schema_store.get(), language_segmenter.get(),
+ normalizer.get())
+ .ValueOrDie();
+
+ int num_matches = state.range(0);
+ int total_terms = state.range(1);
+
+ std::default_random_engine random;
+ std::vector<std::string> language =
+ CreateLanguages(/*language_size=*/1000, &random);
+ std::uniform_int_distribution<size_t> uniform(0u, language.size() - 1);
+ std::uniform_real_distribution<double> uniform_double(0.0, 1.0);
+
+ std::string text;
+ int num_actual_matches = 0;
+ double match_chance;
+ while (total_terms-- > 0) {
+ std::string term;
+ match_chance = static_cast<double>(num_matches) / total_terms;
+ if (uniform_double(random) <= match_chance) {
+ --num_matches;
+ ++num_actual_matches;
+ term = "foo";
+ } else {
+ term = language.at(uniform(random));
+ }
+ absl_ports::StrAppend(&text, " ", term);
+ }
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema("type1")
+ .AddStringProperty("prop1", text)
+ .Build();
+ SectionRestrictQueryTermsMap query_terms = {{"", {"foo"}}};
+ ResultSpecProto::SnippetSpecProto snippet_spec;
+ snippet_spec.set_num_to_snippet(100000);
+ snippet_spec.set_num_matches_per_property(100000);
+ snippet_spec.set_max_window_utf32_length(64);
+
+ SectionIdMask section_id_mask = 0x01;
+ SnippetProto snippet_proto;
+ for (auto _ : state) {
+ snippet_proto = snippet_retriever->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec, document,
+ section_id_mask);
+ ASSERT_THAT(snippet_proto.entries(), SizeIs(1));
+ ASSERT_THAT(snippet_proto.entries(0).snippet_matches(),
+ SizeIs(num_actual_matches));
+ }
+
+ // Destroy the schema store before the whole directory is removed because it
+ // persists data in its destructor.
+ schema_store.reset();
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+}
+BENCHMARK(BM_SnippetOneProperty)
+ // Arguments: num_matches, total_terms
+ ->ArgPair(1, 1)
+ ->ArgPair(1, 16) // single match
+ ->ArgPair(2, 16) // ~10% matches
+ ->ArgPair(3, 16) // ~20% matches
+ ->ArgPair(8, 16) // 50% matches
+ ->ArgPair(16, 16) // 100% matches
+ ->ArgPair(1, 128) // single match
+ ->ArgPair(13, 128) // ~10% matches
+ ->ArgPair(26, 128) // ~20% matches
+ ->ArgPair(64, 128) // 50% matches
+ ->ArgPair(128, 128) // 100% matches
+ ->ArgPair(1, 512) // single match
+ ->ArgPair(51, 512) // ~10% matches
+ ->ArgPair(102, 512) // ~20% matches
+ ->ArgPair(256, 512) // 50% matches
+ ->ArgPair(512, 512) // 100% matches
+ ->ArgPair(1, 1024) // single match
+ ->ArgPair(102, 1024) // ~10% matches
+ ->ArgPair(205, 1024) // ~20% matches
+ ->ArgPair(512, 1024) // 50% matches
+ ->ArgPair(1024, 1024) // 100% matches
+ ->ArgPair(1, 4096) // single match
+ ->ArgPair(410, 4096) // ~10% matches
+ ->ArgPair(819, 4096) // ~20% matches
+ ->ArgPair(2048, 4096) // 50% matches
+ ->ArgPair(4096, 4096) // 100% matches
+ ->ArgPair(1, 16384) // single match
+ ->ArgPair(1638, 16384) // ~10% matches
+ ->ArgPair(3277, 16384) // ~20% matches
+ ->ArgPair(8192, 16384) // 50% matches
+ ->ArgPair(16384, 16384); // 100% matches
+
+void BM_SnippetRfcOneProperty(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ const std::string base_dir = GetTestTempDir() + "/snippet_retriever_benchmark";
+ const std::string schema_dir = base_dir + "/schema";
+ Filesystem filesystem;
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+ if (!filesystem.CreateDirectoryRecursively(schema_dir.c_str())) {
+ ICING_LOG(ERROR) << "Failed to create test directories";
+ }
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ std::unique_ptr<LanguageSegmenter> language_segmenter =
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
+ std::unique_ptr<Normalizer> normalizer =
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max())
+ .ValueOrDie();
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type1").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ Clock clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_dir, &clock));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ auto snippet_retriever =
+ SnippetRetriever::Create(schema_store.get(), language_segmenter.get(),
+ normalizer.get())
+ .ValueOrDie();
+
+ int num_matches = state.range(0);
+ int total_terms = state.range(1);
+
+ std::default_random_engine random;
+ std::vector<std::string> language =
+ CreateLanguages(/*language_size=*/1000, &random);
+ std::uniform_int_distribution<size_t> uniform(0u, language.size() - 1);
+ std::uniform_real_distribution<double> uniform_double(0.0, 1.0);
+
+ std::string text;
+ int num_actual_matches = 0;
+ double match_chance;
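+ // Same corpus construction as BM_SnippetOneProperty, except each term is an
+ // email-like token ("<term>@google.com") and terms are joined with commas,
+ // approximating an address-list property value.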
+ while (total_terms-- > 0) {
+ std::string term;
+ match_chance = static_cast<double>(num_matches) / total_terms;
+ if (uniform_double(random) <= match_chance) {
+ --num_matches;
+ ++num_actual_matches;
+ term = "foo@google.com";
+ } else {
+ term = absl_ports::StrCat(language.at(uniform(random)), "@google.com");
+ }
+ absl_ports::StrAppend(&text, ",", term);
+ }
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema("type1")
+ .AddStringProperty("prop1", text)
+ .Build();
+ SectionRestrictQueryTermsMap query_terms = {{"", {"foo"}}};
+ ResultSpecProto::SnippetSpecProto snippet_spec;
+ snippet_spec.set_num_to_snippet(100000);
+ snippet_spec.set_num_matches_per_property(100000);
+ snippet_spec.set_max_window_utf32_length(64);
+
+ SectionIdMask section_id_mask = 0x01;
+ SnippetProto snippet_proto;
+ for (auto _ : state) {
+ snippet_proto = snippet_retriever->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec, document,
+ section_id_mask);
+ ASSERT_THAT(snippet_proto.entries(), SizeIs(1));
+ ASSERT_THAT(snippet_proto.entries(0).snippet_matches(),
+ SizeIs(num_actual_matches));
+ }
+
+ // Destroy the schema store before the whole directory is removed because it
+ // persists data in its destructor.
+ schema_store.reset();
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+}
+BENCHMARK(BM_SnippetRfcOneProperty)
+ // Arguments: num_matches, total_terms
+ ->ArgPair(1, 1)
+ ->ArgPair(1, 16) // single match
+ ->ArgPair(2, 16) // ~10% matches
+ ->ArgPair(3, 16) // ~20% matches
+ ->ArgPair(8, 16) // 50% matches
+ ->ArgPair(16, 16) // 100% matches
+ ->ArgPair(1, 128) // single match
+ ->ArgPair(13, 128) // ~10% matches
+ ->ArgPair(26, 128) // ~20% matches
+ ->ArgPair(64, 128) // 50% matches
+ ->ArgPair(128, 128) // 100% matches
+ ->ArgPair(1, 512) // single match
+ ->ArgPair(51, 512) // ~10% matches
+ ->ArgPair(102, 512) // ~20% matches
+ ->ArgPair(256, 512) // 50% matches
+ ->ArgPair(512, 512) // 100% matches
+ ->ArgPair(1, 1024) // single match
+ ->ArgPair(102, 1024) // ~10% matches
+ ->ArgPair(205, 1024) // ~20% matches
+ ->ArgPair(512, 1024) // 50% matches
+ ->ArgPair(1024, 1024) // 100% matches
+ ->ArgPair(1, 4096) // single match
+ ->ArgPair(410, 4096) // ~10% matches
+ ->ArgPair(819, 4096) // ~20% matches
+ ->ArgPair(2048, 4096) // 50% matches
+ ->ArgPair(4096, 4096) // 100% matches
+ ->ArgPair(1, 16384) // single match
+ ->ArgPair(1638, 16384) // ~10% matches
+ ->ArgPair(3277, 16384) // ~20% matches
+ ->ArgPair(8192, 16384) // 50% matches
+ ->ArgPair(16384, 16384); // 100% matches
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/result/snippet-retriever_test.cc b/icing/result/snippet-retriever_test.cc
index 3b3bf61..8d81b43 100644
--- a/icing/result/snippet-retriever_test.cc
+++ b/icing/result/snippet-retriever_test.cc
@@ -22,70 +22,100 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/mock-filesystem.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/search.pb.h"
+#include "icing/proto/term.pb.h"
#include "icing/query/query-terms.h"
+#include "icing/schema-builder.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section-manager.h"
#include "icing/store/document-id.h"
#include "icing/store/key-mapper.h"
#include "icing/testing/common-matchers.h"
-#include "icing/testing/snippet-helpers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/transform/map/map-normalizer.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "icing/util/snippet-helpers.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
namespace {
+using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::IsEmpty;
using ::testing::SizeIs;
+// TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export
+// to Android. Also move it to schema-builder.h
+#ifdef ENABLE_URL_TOKENIZER
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL =
+ StringIndexingConfig::TokenizerType::URL;
+#endif // ENABLE_URL_TOKENIZER
+
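+// Returns the property path of every entry in the snippet, in order, so tests
+// can assert on the full set with ElementsAre().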
+std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) {
+ std::vector<std::string_view> paths;
+ for (const SnippetProto::EntryProto& entry : snippet.entries()) {
+ paths.push_back(entry.property_name());
+ }
+ return paths;
+}
+
class SnippetRetrieverTest : public testing::Test {
protected:
void SetUp() override {
test_dir_ = GetTestTempDir() + "/icing";
filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
- ICING_ASSERT_OK_AND_ASSIGN(language_segmenter_,
- language_segmenter_factory::Create());
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ jni_cache_ = GetTestJniCache();
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
// Setup the schema
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
- SchemaProto schema;
- SchemaTypeConfigProto* type_config = schema.add_types();
- type_config->set_schema_type("email");
- PropertyConfigProto* prop_config = type_config->add_properties();
- prop_config->set_property_name("subject");
- prop_config->set_data_type(PropertyConfigProto::DataType::STRING);
- prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- prop_config->mutable_indexing_config()->set_term_match_type(
- TermMatchType::PREFIX);
- prop_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- prop_config = type_config->add_properties();
- prop_config->set_property_name("body");
- prop_config->set_data_type(PropertyConfigProto::DataType::STRING);
- prop_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- prop_config->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- prop_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- ICING_ASSERT_OK(schema_store_->SetSchema(schema));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
/*max_term_byte_size=*/10000));
@@ -99,7 +129,7 @@ class SnippetRetrieverTest : public testing::Test {
snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max());
snippet_spec_.set_num_matches_per_property(
std::numeric_limits<int32_t>::max());
- snippet_spec_.set_max_window_bytes(64);
+ snippet_spec_.set_max_window_utf32_length(64);
}
void TearDown() override {
@@ -107,10 +137,12 @@ class SnippetRetrieverTest : public testing::Test {
}
Filesystem filesystem_;
+ FakeClock fake_clock_;
std::unique_ptr<SchemaStore> schema_store_;
std::unique_ptr<LanguageSegmenter> language_segmenter_;
std::unique_ptr<SnippetRetriever> snippet_retriever_;
std::unique_ptr<Normalizer> normalizer_;
+ std::unique_ptr<const JniCache> jni_cache_;
ResultSpecProto::SnippetSpecProto snippet_spec_;
std::string test_dir_;
};
@@ -144,13 +176,67 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) {
// Window starts at the beginning of "three" and ends in the middle of
// "three". len=4, orig_window= "thre"
- snippet_spec_.set_max_window_bytes(4);
+ snippet_spec_.set_max_window_utf32_length(4);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre(""));
+}
+
+TEST_F(SnippetRetrieverTest,
+ SnippetingWindowMaxWindowSizeEqualToMatch_OddLengthMatch) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
+
+ // Window starts at the beginning of "three" and at the exact end of
+ // "three". len=5, orig_window= "three"
+ snippet_spec_.set_max_window_utf32_length(5);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
- Eq(""));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("three"));
+}
+
+TEST_F(SnippetRetrieverTest,
+ SnippetingWindowMaxWindowSizeEqualToMatch_EvenLengthMatch) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};
+
+ // Window starts at the beginning of "four" and at the exact end of
+ // "four". len=4, orig_window= "four"
+ snippet_spec_.set_max_window_utf32_length(4);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("four"));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) {
@@ -165,16 +251,25 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
- // Window starts at the space between "one" and "two". Window ends in the
- // middle of "four".
- // len=14, orig_window=" two three fou"
- snippet_spec_.set_max_window_bytes(14);
+ // String: "one two three four.... five"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 23 27
+ // UTF-32 idx: 0 4 8 14 23 27
+ //
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (2,17).
+ // 2. trimmed, no-shifting window [4,13) "two three"
+ // 3. trimmed, shifted window [4,18) "two three four"
+ snippet_spec_.set_max_window_utf32_length(14);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
- Eq("two three"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("two three four"));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
@@ -189,15 +284,25 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
- // Window starts in the middle of "one" and ends at the end of "four".
- // len=16, orig_window="e two three four"
- snippet_spec_.set_max_window_bytes(16);
+ // String: "one two three four.... five"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 23 27
+ // UTF-32 idx: 0 4 8 14 23 27
+ //
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (1,18).
+ // 2. trimmed, no-shifting window [4,18) "two three four"
+ // 3. trimmed, shifted window [4,20) "two three four.."
+ snippet_spec_.set_max_window_utf32_length(16);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
- Eq("two three four"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("two three four.."));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
@@ -214,17 +319,20 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
// Window ends in the middle of all the punctuation and window starts at 0.
// len=20, orig_window="one two three four.."
- snippet_spec_.set_max_window_bytes(20);
+ snippet_spec_.set_max_window_utf32_length(20);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
- Eq("one two three four.."));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("one two three four.."));
}
TEST_F(SnippetRetrieverTest,
- SnippetingWindowMaxWindowEndsInMiddleOfMultiBytePunctuation) {
+ SnippetingWindowMaxWindowEndsMultiBytePunctuation) {
DocumentProto document =
DocumentBuilder()
.SetKey("icing", "email/1")
@@ -238,18 +346,21 @@ TEST_F(SnippetRetrieverTest,
SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};
// Window ends in the middle of all the punctuation and window starts at 0.
- // len=26, orig_window="pside down in Australia\xC2"
- snippet_spec_.set_max_window_bytes(24);
+ // len=26, orig_window="pside down in Australia¿"
+ snippet_spec_.set_max_window_utf32_length(24);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
- Eq("down in Australia"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("down in Australia¿"));
}
TEST_F(SnippetRetrieverTest,
- SnippetingWindowMaxWindowEndsInMultiBytePunctuation) {
+ SnippetingWindowMaxWindowBeyondMultiBytePunctuation) {
DocumentProto document =
DocumentBuilder()
.SetKey("icing", "email/1")
@@ -263,14 +374,17 @@ TEST_F(SnippetRetrieverTest,
SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};
// Window ends in the middle of all the punctuation and window starts at 0.
- // len=26, orig_window="upside down in Australia\xC2\xBF"
- snippet_spec_.set_max_window_bytes(26);
+ // len=26, orig_window="upside down in Australia¿ "
+ snippet_spec_.set_max_window_utf32_length(26);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
- Eq("upside down in Australia¿"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("upside down in Australia¿"));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) {
@@ -285,15 +399,25 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
- // Window starts before 0.
- // len=22, orig_window="one two three four..."
- snippet_spec_.set_max_window_bytes(22);
+ // String: "one two three four.... five"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 23 27
+ // UTF-32 idx: 0 4 8 14 23 27
+ //
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (-2,21).
+ // 2. trimmed, no-shifting window [0,21) "one two three four..."
+ // 3. trimmed, shifted window [0,22) "one two three four...."
+ snippet_spec_.set_max_window_utf32_length(22);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
- Eq("one two three four..."));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("one two three four...."));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
@@ -310,13 +434,16 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
// Window ends before "five" but after all the punctuation
// len=26, orig_window="one two three four.... "
- snippet_spec_.set_max_window_bytes(26);
+ snippet_spec_.set_max_window_utf32_length(26);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
- Eq("one two three four...."));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("one two three four...."));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) {
@@ -331,15 +458,25 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};
- // Window ends in the middle of "five"
- // len=32, orig_window="one two three four.... fiv"
- snippet_spec_.set_max_window_bytes(32);
+ // String: "one two three four.... five"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 23 27
+ // UTF-32 idx: 0 4 8 14 23 27
+ //
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (-7,26).
+ // 2. trimmed, no-shifting window [0,26) "one two three four...."
+ // 3. trimmed, shifted window [0,27) "one two three four.... five"
+ snippet_spec_.set_max_window_utf32_length(32);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
- Eq("one two three four...."));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("one two three four.... five"));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
@@ -356,13 +493,16 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
// Max window size equals the size of the value.
// len=34, orig_window="one two three four.... five"
- snippet_spec_.set_max_window_bytes(34);
+ snippet_spec_.set_max_window_utf32_length(34);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
- Eq("one two three four.... five"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("one two three four.... five"));
}
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) {
@@ -379,13 +519,152 @@ TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) {
// Max window size exceeds the size of the value.
// len=36, orig_window="one two three four.... five"
- snippet_spec_.set_max_window_bytes(36);
+ snippet_spec_.set_max_window_utf32_length(36);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("one two three four.... five"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five six")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};
+
+ // String: "one two three four.... five six"
+ // ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 23 28 31
+ // UTF-32 idx: 0 4 8 14 23 28 31
+ //
+ // The window will extend past the start of the value.
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (-10,19).
+ // 2. trimmed, no-shifting window [0,19) "one two three four."
+ // 3. trimmed, shifted window [0,27) "one two three four.... five"
+ snippet_spec_.set_max_window_utf32_length(28);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("one two three four.... five"));
+}
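+
+// A minimal sketch, assuming whitespace-only token boundaries, of the
+// three-step window selection that the comments in these tests trace:
+// (1) center an untrimmed window of max_len around the match, (2) clamp it to
+// the value and trim any token split at either edge, (3) grow the trimmed
+// window over neighboring whole tokens while budget remains. The production
+// retriever works on UTF-32 offsets and gets token boundaries from the
+// language segmenter (where punctuation forms its own tokens), so this sketch
+// will not reproduce the punctuation traces in these tests. SketchWindow is
+// illustrative only and is not part of the library (uses <algorithm>,
+// <string_view>, <utility>).
+[[maybe_unused]] std::pair<int, int> SketchWindow(std::string_view value,
+                                                  int match_start,
+                                                  int match_end, int max_len) {
+  const int len = static_cast<int>(value.length());
+  // 1. Untrimmed, no-shifting window centered on the match; the bounds may
+  //    fall outside [0, len).
+  int start = (match_start + match_end) / 2 - max_len / 2;
+  int end = start + max_len;
+  start = std::max(start, 0);
+  end = std::min(end, len);
+  // 2. Trim: drop any token split at an edge, then strip separators so the
+  //    window begins and ends on token characters.
+  while (start > 0 && start < end && value[start - 1] != ' ') ++start;
+  while (start < end && value[start] == ' ') ++start;
+  while (end > start && end < len && value[end] != ' ' &&
+         value[end - 1] != ' ') {
+    --end;
+  }
+  while (end > start && value[end - 1] == ' ') --end;
+  // 3. Shift: extend over whole neighboring tokens while they still fit,
+  //    first toward the end of the value, then toward the start.
+  auto next_token_end = [&](int pos) {
+    while (pos < len && value[pos] == ' ') ++pos;
+    while (pos < len && value[pos] != ' ') ++pos;
+    return pos;
+  };
+  auto prev_token_start = [&](int pos) {
+    while (pos > 0 && value[pos - 1] == ' ') --pos;
+    while (pos > 0 && value[pos - 1] != ' ') --pos;
+    return pos;
+  };
+  for (int e = next_token_end(end); e > end && e - start <= max_len;
+       e = next_token_end(end)) {
+    end = e;
+  }
+  for (int s = prev_token_start(start); s < start && end - s <= max_len;
+       s = prev_token_start(start)) {
+    start = s;
+  }
+  return {start, end};
+}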
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four.... five six")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"five"}}};
+
+ // String: "one two three four.... five six"
+ // ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 23 28 31
+ // UTF-32 idx: 0 4 8 14 23 28 31
+ //
+ // The window will extend past the end of the value.
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (10,39).
+ // 2. trimmed, no-shifting window [14,31) "four.... five six"
+ // 3. trimmed, shifted window [4,31) "two three four.... five six"
+ snippet_spec_.set_max_window_utf32_length(28);
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("two three four.... five six"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four....")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};
+
+ // String: "one two three four...."
+ // ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 22
+ // UTF-32 idx: 0 4 8 14 22
+ //
+ // The window will extend past the start of the value.
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (-10,19).
+ // 2. trimmed, no-shifting window [0, 19) "one two three four."
+ // 3. trimmed, shifted window [0, 22) "one two three four...."
+ snippet_spec_.set_max_window_utf32_length(28);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("one two three four...."));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "counting")
+ .AddStringProperty("body", "one two three four....")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};
+
+ // String: "one two three four...."
+ // ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 4 8 14 22
+ // UTF-32 idx: 0 4 8 14 22
+ //
+ // The window will extend past the end of the value.
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (1,30).
+ // 2. trimmed, no-shifting window [4, 22) "two three four...."
+ // 3. trimmed, shifted window [0, 22) "one two three four...."
+ snippet_spec_.set_max_window_utf32_length(28);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "body", /*snippet_index=*/0),
- Eq("one two three four.... five"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("one two three four...."));
}
TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
@@ -399,14 +678,18 @@ TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"f"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::PREFIX, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Check the snippets. 'f' should match prefix-enabled property 'subject', but
// not exact-only property 'body'
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("subject foo"));
- EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("foo"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("subject foo"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f"));
}
TEST_F(SnippetRetrieverTest, ExactSnippeting) {
@@ -421,8 +704,7 @@ TEST_F(SnippetRetrieverTest, ExactSnippeting) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"f"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
// Check the snippets
EXPECT_THAT(snippet.entries(), IsEmpty());
@@ -437,18 +719,21 @@ TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
.AddStringProperty("body", "Only a fool would match this content.")
.Build();
- snippet_spec_.set_max_window_bytes(0);
+ snippet_spec_.set_max_window_utf32_length(0);
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
// Check the snippets
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "subject", 0), IsEmpty());
- EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("foo"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre(""));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
}
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
@@ -461,23 +746,49 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
"Concerning the subject of foo, we need to begin "
"considering our options regarding body bar.")
.Build();
+ // String: "Concerning the subject of foo, we need to begin considering "
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 11 15 23 26 31 34 39 42 48
+ // UTF-32 idx: 0 11 15 23 26 31 34 39 42 48
+ //
+ // String ctd: "our options regarding body bar."
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 60 64 72 82 87 91
+ // UTF-32 idx: 60 64 72 82 87 91
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::PREFIX, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Check the snippets
EXPECT_THAT(snippet.entries(), SizeIs(2));
- EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("subject foo"));
- EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("foo"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ // The first window will be:
+ // 1. untrimmed, no-shifting window will be (-6,59).
+ // 2. trimmed, no-shifting window [0, 59) "Concerning... considering".
+ // 3. trimmed, shifted window [0, 63) "Concerning... our"
+ // The second window will be:
+ // 1. untrimmed, no-shifting window will be (54,91).
+ // 2. trimmed, no-shifting window [60, 91) "our... bar.".
+ // 3. trimmed, shifted window [31, 91) "we... bar."
EXPECT_THAT(
- GetWindow(document, snippet, "body", 0),
- Eq("Concerning the subject of foo, we need to begin considering"));
- EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("foo"));
- EXPECT_THAT(GetWindow(document, snippet, "body", 1),
- Eq("our options regarding body bar."));
- EXPECT_THAT(GetMatch(document, snippet, "body", 1), Eq("bar"));
+ GetWindows(content, snippet.entries(0)),
+ ElementsAre(
+ "Concerning the subject of foo, we need to begin considering our",
+ "we need to begin considering our options regarding body bar."));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("foo", "bar"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("foo", "bar"));
+
+ EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
+ content = GetString(&document, snippet.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(1)),
+ ElementsAre("subject foo"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
}
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
@@ -490,23 +801,45 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
"Concerning the subject of foo, we need to begin "
"considering our options regarding body bar.")
.Build();
+ // String: "Concerning the subject of foo, we need to begin considering "
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 11 15 23 26 31 34 39 42 48
+ // UTF-32 idx: 0 11 15 23 26 31 34 39 42 48
+ //
+ // String ctd: "our options regarding body bar."
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 60 64 72 82 87 91
+ // UTF-32 idx: 60 64 72 82 87 91
+ //
// Section 1 "subject" is not in the section_mask, so no snippet information
// from that section should be returned by the SnippetRetriever.
SectionIdMask section_mask = 0b00000001;
SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::PREFIX, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Check the snippets
EXPECT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ // The first window will be:
+ // 1. untrimmed, no-shifting window will be (-6,59).
+ // 2. trimmed, no-shifting window [0, 59) "Concerning... considering".
+ // 3. trimmed, shifted window [0, 63) "Concerning... our"
+ // The second window will be:
+ // 1. untrimmed, no-shifting window will be (54,91).
+ // 2. trimmed, no-shifting window [60, 91) "our... bar.".
+ // 3. trimmed, shifted window [31, 91) "we... bar."
EXPECT_THAT(
- GetWindow(document, snippet, "body", 0),
- Eq("Concerning the subject of foo, we need to begin considering"));
- EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("foo"));
- EXPECT_THAT(GetWindow(document, snippet, "body", 1),
- Eq("our options regarding body bar."));
- EXPECT_THAT(GetMatch(document, snippet, "body", 1), Eq("bar"));
+ GetWindows(content, snippet.entries(0)),
+ ElementsAre(
+ "Concerning the subject of foo, we need to begin considering our",
+ "we need to begin considering our options regarding body bar."));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("foo", "bar"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("foo", "bar"));
}
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
@@ -519,6 +852,15 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
"Concerning the subject of foo, we need to begin "
"considering our options regarding body bar.")
.Build();
+ // String: "Concerning the subject of foo, we need to begin considering "
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 11 15 23 26 31 34 39 42 48
+ // UTF-32 idx: 0 11 15 23 26 31 34 39 42 48
+ //
+ // String ctd: "our options regarding body bar."
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 60 64 72 82 87 91
+ // UTF-32 idx: 60 64 72 82 87 91
SectionIdMask section_mask = 0b00000011;
// "subject" should match in both sections, but "foo" is restricted to "body"
// so it should only match in the 'body' section and not the 'subject'
@@ -526,25 +868,38 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
SectionRestrictQueryTermsMap query_terms{{"", {"subject"}},
{"body", {"foo"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::PREFIX, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Check the snippets
EXPECT_THAT(snippet.entries(), SizeIs(2));
- // 'subject' section should only have the one match for "subject".
- EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("subject foo"));
- EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("subject"));
- EXPECT_THAT(GetWindow(document, snippet, "subject", 1), IsEmpty());
- EXPECT_THAT(GetMatch(document, snippet, "subject", 1), IsEmpty());
-
- // 'body' section should have matches for "subject" and "foo".
- EXPECT_THAT(GetWindow(document, snippet, "body", 0),
- Eq("Concerning the subject of foo, we need to begin"));
- EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("subject"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ // The first window will be:
+ // 1. untrimmed, no-shifting window will be (-15,50).
+ // 2. trimmed, no-shifting window [0, 47) "Concerning... begin".
+ // 3. trimmed, shifted window [0, 63) "Concerning... our"
+ // The second window will be:
+ // 1. untrimmed, no-shifting window will be (-6,59).
+ // 2. trimmed, no-shifting window [0, 59) "Concerning... considering".
+ // 3. trimmed, shifted window [0, 63) "Concerning... our"
EXPECT_THAT(
- GetWindow(document, snippet, "body", 1),
- Eq("Concerning the subject of foo, we need to begin considering"));
- EXPECT_THAT(GetMatch(document, snippet, "body", 1), Eq("foo"));
+ GetWindows(content, snippet.entries(0)),
+ ElementsAre(
+ "Concerning the subject of foo, we need to begin considering our",
+ "Concerning the subject of foo, we need to begin considering our"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("subject", "foo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("subject", "foo"));
+
+ EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
+ content = GetString(&document, snippet.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(1)),
+ ElementsAre("subject foo"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("subject"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
+ ElementsAre("subject"));
}
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
@@ -558,24 +913,44 @@ TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
"considering our options regarding body bar.")
.Build();
+ // String: "Concerning the subject of foo, we need to begin considering "
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 11 15 23 26 31 34 39 42 48
+ // UTF-32 idx: 0 11 15 23 26 31 34 39 42 48
+ //
+ // String ctd: "our options regarding body bar."
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 60 64 72 82 87 91
+ // UTF-32 idx: 60 64 72 82 87 91
snippet_spec_.set_num_matches_per_property(1);
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::PREFIX, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
// Check the snippets
EXPECT_THAT(snippet.entries(), SizeIs(2));
- EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("subject foo"));
- EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("foo"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ // The window will be:
+ // 1. untrimmed, no-shifting window will be (-6,59).
+ // 2. trimmed, no-shifting window [0, 59) "Concerning... considering".
+ // 3. trimmed, shifted window [0, 63) "Concerning... our"
EXPECT_THAT(
- GetWindow(document, snippet, "body", 0),
- Eq("Concerning the subject of foo, we need to begin considering"));
- EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("foo"));
- EXPECT_THAT(GetWindow(document, snippet, "body", 1), IsEmpty());
- EXPECT_THAT(GetMatch(document, snippet, "body", 1), IsEmpty());
+ GetWindows(content, snippet.entries(0)),
+ ElementsAre(
+ "Concerning the subject of foo, we need to begin considering our"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
+
+ EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
+ content = GetString(&document, snippet.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(1)),
+ ElementsAre("subject foo"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
}
TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
@@ -589,12 +964,15 @@ TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"md"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::PREFIX, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "subject", 0), Eq("MDI team"));
- EXPECT_THAT(GetMatch(document, snippet, "subject", 0), Eq("MDI"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("MDI team"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("MDI"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD"));
}
TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
@@ -609,14 +987,1025 @@ TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
SectionIdMask section_mask = 0b00000011;
SectionRestrictQueryTermsMap query_terms{{"", {"zurich"}}};
SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
- query_terms, TermMatchType::EXACT_ONLY, snippet_spec_, document,
- section_mask);
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
EXPECT_THAT(snippet.entries(), SizeIs(1));
- EXPECT_THAT(GetWindow(document, snippet, "body", 0),
- Eq("Some members are in Zürich."));
- EXPECT_THAT(GetMatch(document, snippet, "body", 0), Eq("Zürich"));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("Some members are in Zürich."));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("Zürich"));
+
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("Zürich"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("SingleLevelType")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("X")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Y")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Z")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"};
+ DocumentProto document;
+ document.set_schema("SingleLevelType");
+ PropertyProto* prop = document.add_properties();
+ prop->set_name("X");
+ for (const std::string& s : string_values) {
+ prop->add_string_values(s);
+ }
+ prop = document.add_properties();
+ prop->set_name("Y");
+ for (const std::string& s : string_values) {
+ prop->add_string_values(s);
+ }
+ prop = document.add_properties();
+ prop->set_name("Z");
+ for (const std::string& s : string_values) {
+ prop->add_string_values(s);
+ }
+
+ SectionIdMask section_mask = 0b00000111;
+ SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
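+ // "polo" sits at indices 1 and 3 of each repeated value, so X, Y, and Z
+ // each contribute two entries, addressed as "<property>[<value index>]".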
+ EXPECT_THAT(snippet.entries(), SizeIs(6));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("X[1]"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
+
+ EXPECT_THAT(snippet.entries(1).property_name(), Eq("X[3]"));
+ content = GetString(&document, snippet.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
+
+ EXPECT_THAT(GetPropertyPaths(snippet),
+ ElementsAre("X[1]", "X[3]", "Y[1]", "Y[3]", "Z[1]", "Z[3]"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("SingleLevelType")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("X")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Y")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Z")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("MultiLevelType")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("A")
+ .SetDataTypeDocument(
+ "SingleLevelType",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("B")
+ .SetDataTypeDocument(
+ "SingleLevelType",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("C")
+ .SetDataTypeDocument(
+ "SingleLevelType",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"};
+ DocumentProto subdocument;
+ PropertyProto* prop = subdocument.add_properties();
+ prop->set_name("X");
+ for (const std::string& s : string_values) {
+ prop->add_string_values(s);
+ }
+ prop = subdocument.add_properties();
+ prop->set_name("Y");
+ for (const std::string& s : string_values) {
+ prop->add_string_values(s);
+ }
+ prop = subdocument.add_properties();
+ prop->set_name("Z");
+ for (const std::string& s : string_values) {
+ prop->add_string_values(s);
+ }
+
+ DocumentProto document;
+ document.set_schema("MultiLevelType");
+ prop = document.add_properties();
+ prop->set_name("A");
+ *prop->add_document_values() = subdocument;
+
+ prop = document.add_properties();
+ prop->set_name("B");
+ *prop->add_document_values() = subdocument;
+
+ prop = document.add_properties();
+ prop->set_name("C");
+ *prop->add_document_values() = subdocument;
+
+ SectionIdMask section_mask = 0b111111111;
+ SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(18));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("A.X[1]"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
+
+ EXPECT_THAT(snippet.entries(1).property_name(), Eq("A.X[3]"));
+ content = GetString(&document, snippet.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
+
+ EXPECT_THAT(
+ GetPropertyPaths(snippet),
+ ElementsAre("A.X[1]", "A.X[3]", "A.Y[1]", "A.Y[3]", "A.Z[1]", "A.Z[3]",
+ "B.X[1]", "B.X[3]", "B.Y[1]", "B.Y[3]", "B.Z[1]", "B.Z[3]",
+ "C.X[1]", "C.X[3]", "C.Y[1]", "C.Y[3]", "C.Z[1]", "C.Z[3]"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("SingleLevelType")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("X")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Y")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Z")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("MultiLevelType")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("A")
+ .SetDataTypeDocument(
+ "SingleLevelType",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("B")
+ .SetDataTypeDocument(
+ "SingleLevelType",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("C")
+ .SetDataTypeDocument(
+ "SingleLevelType",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"};
+ DocumentProto subdocument;
+ PropertyProto* prop = subdocument.add_properties();
+ prop->set_name("X");
+ for (const std::string& s : string_values) {
+ prop->add_string_values(s);
+ }
+ prop = subdocument.add_properties();
+ prop->set_name("Y");
+ for (const std::string& s : string_values) {
+ prop->add_string_values(s);
+ }
+ prop = subdocument.add_properties();
+ prop->set_name("Z");
+ for (const std::string& s : string_values) {
+ prop->add_string_values(s);
+ }
+
+ DocumentProto document;
+ document.set_schema("MultiLevelType");
+ prop = document.add_properties();
+ prop->set_name("A");
+ *prop->add_document_values() = subdocument;
+ *prop->add_document_values() = subdocument;
+
+ prop = document.add_properties();
+ prop->set_name("B");
+ *prop->add_document_values() = subdocument;
+ *prop->add_document_values() = subdocument;
+
+ prop = document.add_properties();
+ prop->set_name("C");
+ *prop->add_document_values() = subdocument;
+ *prop->add_document_values() = subdocument;
+
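+ // Enable all nine indexed sections (one per nested string property).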
+ SectionIdMask section_mask = 0b111111111;
+ SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(36));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X[1]"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
+
+ EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[0].X[3]"));
+ content = GetString(&document, snippet.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
+
+ EXPECT_THAT(GetPropertyPaths(snippet),
+ ElementsAre("A[0].X[1]", "A[0].X[3]", "A[1].X[1]", "A[1].X[3]",
+ "A[0].Y[1]", "A[0].Y[3]", "A[1].Y[1]", "A[1].Y[3]",
+ "A[0].Z[1]", "A[0].Z[3]", "A[1].Z[1]", "A[1].Z[3]",
+ "B[0].X[1]", "B[0].X[3]", "B[1].X[1]", "B[1].X[3]",
+ "B[0].Y[1]", "B[0].Y[3]", "B[1].Y[1]", "B[1].Y[3]",
+ "B[0].Z[1]", "B[0].Z[3]", "B[1].Z[1]", "B[1].Z[3]",
+ "C[0].X[1]", "C[0].X[3]", "C[1].X[1]", "C[1].X[3]",
+ "C[0].Y[1]", "C[0].Y[3]", "C[1].Y[1]", "C[1].Y[3]",
+ "C[0].Z[1]", "C[0].Z[3]", "C[1].Z[1]", "C[1].Z[3]"));
+}
+
+TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("SingleLevelType")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("X")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Y")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Z")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("MultiLevelType")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("A")
+ .SetDataTypeDocument(
+ "SingleLevelType",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("B")
+ .SetDataTypeDocument(
+ "SingleLevelType",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("C")
+ .SetDataTypeDocument(
+ "SingleLevelType",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ DocumentProto subdocument;
+ PropertyProto* prop = subdocument.add_properties();
+ prop->set_name("X");
+ prop->add_string_values("polo");
+ prop = subdocument.add_properties();
+ prop->set_name("Y");
+ prop->add_string_values("marco");
+ prop = subdocument.add_properties();
+ prop->set_name("Z");
+ prop->add_string_values("polo");
+
+ DocumentProto document;
+ document.set_schema("MultiLevelType");
+ prop = document.add_properties();
+ prop->set_name("A");
+ *prop->add_document_values() = subdocument;
+ *prop->add_document_values() = subdocument;
+
+ prop = document.add_properties();
+ prop->set_name("B");
+ *prop->add_document_values() = subdocument;
+ *prop->add_document_values() = subdocument;
+
+ prop = document.add_properties();
+ prop->set_name("C");
+ *prop->add_document_values() = subdocument;
+ *prop->add_document_values() = subdocument;
+
+ SectionIdMask section_mask = 0b111111111;
+ SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ EXPECT_THAT(snippet.entries(), SizeIs(12));
+ EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
+
+ EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[1].X"));
+ content = GetString(&document, snippet.entries(1).property_name());
+ EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
+
+ EXPECT_THAT(
+ GetPropertyPaths(snippet),
+ ElementsAre("A[0].X", "A[1].X", "A[0].Z", "A[1].Z", "B[0].X", "B[1].X",
+ "B[0].Z", "B[1].Z", "C[0].X", "C[1].X", "C[0].Z", "C[1].Z"));
+}
+
+TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) {
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF8 idx: 0 3 9 15 18
+ // UTF16 idx: 0 1 3 5 6
+ // Breaks into segments: "我", "每天", "走路", "去", "上班"
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", kChinese)
+ .AddStringProperty("body",
+ "Concerning the subject of foo, we need to begin "
+ "considering our options regarding body bar.")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
+
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ // Ensure that one and only one property was matched and it was "subject"
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &snippet.entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("subject"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+
+ // Ensure that there is one and only one match within "subject"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ // Ensure that the match is correct.
+ EXPECT_THAT(GetMatches(content, *entry), ElementsAre("走路"));
+ EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("走"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(3));
+ EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(2));
+ EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(1));
+}
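+
+// Illustrative sketch (not library code): the UTF-16 values asserted above
+// can be reproduced with ICU, assuming icu::UnicodeString is available:
+//
+//   icu::UnicodeString ustr = icu::UnicodeString::fromUTF8(kChinese);
+//   int32_t num_code_units = ustr.length();  // 9: one code unit per character
+//   // "走路" spans UTF-8 bytes [9, 15) but UTF-16 code units [3, 5), which
+//   // matches exact_match_utf16_position() == 3 and a length of 2.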
+
+TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF8 idx: 0 3 9 15 18
+ // UTF16 idx: 0 1 3 5 6
+ // UTF32 idx: 0 1 3 5 6
+ // Breaks into segments: "我", "每天", "走路", "去", "上班"
+ constexpr std::string_view kChinese = "我每天走路去上班。";
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", kChinese)
+ .AddStringProperty("body",
+ "Concerning the subject of foo, we need to begin "
+ "considering our options regarding body bar.")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
+
+ // The window will be:
+ // 1. the untrimmed, unshifted window (0, 7).
+ // 2. the trimmed, unshifted window [1, 6) "每天走路去".
+ // 3. the trimmed, shifted window [0, 6) "我每天走路去".
+ snippet_spec_.set_max_window_utf32_length(6);
+
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ // Ensure that one and only one property was matched and it was "subject"
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &snippet.entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("subject"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+
+ // Ensure that there is one and only one match within "subject"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ // Ensure that the match is correct.
+ EXPECT_THAT(GetWindows(content, *entry), ElementsAre("我每天走路去"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.window_utf16_position(), Eq(0));
+ EXPECT_THAT(match_proto.window_utf16_length(), Eq(6));
+}
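+
+// Illustrative sketch (not library code) of the three-step windowing above,
+// using hypothetical helper names for each step:
+//
+//   Window w = UntrimmedWindowAround(match);  // step 1: (0, 7)
+//   w = TrimToSegmentBoundaries(w);           // step 2: [1, 6) "每天走路去"
+//   w = ShiftToUseBudget(w);                  // step 3: [0, 6) "我每天走路去"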
+
+TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
+ // The following string has four-byte UTF-8 characters. Most importantly,
+ // each character is also two code units in UTF-16.
+ // String: "𐀀𐀁 𐀂𐀃 𐀄"
+ // ^ ^ ^
+ // UTF8 idx: 0 9 18
+ // UTF16 idx: 0 5 10
+ // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
+ constexpr std::string_view kText = "𐀀𐀁 𐀂𐀃 𐀄";
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", kText)
+ .AddStringProperty("body",
+ "Concerning the subject of foo, we need to begin "
+ "considering our options regarding body bar.")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}};
+
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ // Ensure that one and only one property was matched and it was "subject"
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &snippet.entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("subject"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+
+ // Ensure that there is one and only one match within "subject"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ // Ensure that the match is correct.
+ EXPECT_THAT(GetMatches(content, *entry), ElementsAre("𐀂𐀃"));
+ EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("𐀂"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5));
+ EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4));
+ EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
+}
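+
+// Illustrative note: every character above is a supplementary code point, so
+// UTF-16 encodes it as a surrogate pair. For example, 𐀀 (U+10000) becomes
+// the two code units 0xD800 0xDC00, which is why the two-character match
+// "𐀂𐀃" has exact_match_utf16_length() == 4.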
+
+TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
+ // The following string has four-byte UTF-8 characters. Most importantly,
+ // each character is also two code units in UTF-16.
+ // String: "𐀀𐀁 𐀂𐀃 𐀄"
+ // ^ ^ ^
+ // UTF8 idx: 0 9 18
+ // UTF16 idx: 0 5 10
+ // UTF32 idx: 0 3 6
+ // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
+ constexpr std::string_view kText = "𐀀𐀁 𐀂𐀃 𐀄";
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", kText)
+ .AddStringProperty("body",
+ "Concerning the subject of foo, we need to begin "
+ "considering our options regarding body bar.")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000011;
+ SectionRestrictQueryTermsMap query_terms{{"", {"𐀂"}}};
+
+ // Set a six-character window. This will produce a window like this:
+ // String: "𐀀𐀁 𐀂𐀃 𐀄"
+ // ^ ^
+ // UTF8 idx: 9 22
+ // UTF16 idx: 5 12
+ // UTF32 idx: 3 7
+ snippet_spec_.set_max_window_utf32_length(6);
+
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ // Ensure that one and only one property was matched and it was "subject"
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ const SnippetProto::EntryProto* entry = &snippet.entries(0);
+ EXPECT_THAT(entry->property_name(), Eq("subject"));
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+
+ // Ensure that there is one and only one match within "subject"
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+
+ // Ensure that the match is correct.
+ EXPECT_THAT(GetWindows(content, *entry), ElementsAre("𐀂𐀃 𐀄"));
+
+ // Ensure that the utf-16 values are also as expected
+ EXPECT_THAT(match_proto.window_utf16_position(), Eq(5));
+ EXPECT_THAT(match_proto.window_utf16_length(), Eq(7));
+}
+
+TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("verbatimType")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("verbatim")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "verbatim/1")
+ .SetSchema("verbatimType")
+ .AddStringProperty("verbatim", "Hello, world!")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000001;
+ SectionRestrictQueryTermsMap query_terms{{"", {"Hello, world!"}}};
+
+ snippet_spec_.set_max_window_utf32_length(13);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
+
+ // There should only be one snippet entry and match, the verbatim token in its
+ // entirety.
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+
+ const SnippetProto::EntryProto* entry = &snippet.entries(0);
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ ASSERT_THAT(entry->property_name(), "verbatim");
+
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+ // We expect the match to begin at position 0, and to span the entire token
+ // which contains 13 characters.
+ EXPECT_THAT(match_proto.window_byte_position(), Eq(0));
+ EXPECT_THAT(match_proto.window_utf16_length(), Eq(13));
+
+ // We expect the submatch to begin at position 0 of the verbatim token and
+ // span the length of our query term "Hello, world!", which has utf-16 length
+ // of 13. The submatch length equals the window length because the snippet
+ // is retrieved with an exact term match.
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0));
+ EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(13));
+}
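+
+// Illustrative note: TOKENIZER_VERBATIM emits the entire property value as a
+// single token, so the only exact match for this property is the full string
+// "Hello, world!"; a plain-tokenized query term like "Hello" would not match.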
+
+TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("verbatimType")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("verbatim")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF8 idx: 0 3 9 15 18
+ // UTF16 idx: 0 1 3 5 6
+ // UTF32 idx: 0 1 3 5 6
+ // Breaks into segments: "我", "每天", "走路", "去", "上班"
+ std::string chinese_string = "我每天走路去上班。";
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "verbatim/1")
+ .SetSchema("verbatimType")
+ .AddStringProperty("verbatim", chinese_string)
+ .Build();
+
+ SectionIdMask section_mask = 0b00000001;
+ SectionRestrictQueryTermsMap query_terms{{"", {"我每"}}};
+
+ snippet_spec_.set_max_window_utf32_length(9);
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ // There should only be one snippet entry and match, the verbatim token in its
+ // entirety.
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+
+ const SnippetProto::EntryProto* entry = &snippet.entries(0);
+ ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
+ ASSERT_THAT(entry->property_name(), "verbatim");
+
+ const SnippetMatchProto& match_proto = entry->snippet_matches(0);
+ // We expect the match to begin at position 0, and to span the entire token
+ // which has utf-16 length of 9.
+ EXPECT_THAT(match_proto.window_byte_position(), Eq(0));
+ EXPECT_THAT(match_proto.window_utf16_length(), Eq(9));
+
+ // We expect the submatch to begin at position 0 of the verbatim token and
+ // span the length of our query term "我每", which has utf-16 length of 2.
+ EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0));
+ EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
+}
+
+TEST_F(SnippetRetrieverTest, SnippettingRfc822Ascii) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("rfc822Type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("rfc822")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_RFC822)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "rfc822/1")
+ .SetSchema("rfc822Type")
+ .AddStringProperty("rfc822",
+ "Alexander Sav <tom.bar@google.com>, Very Long "
+ "Name Example <tjbarron@google.com>")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000001;
+
+ // This should match both the first-name token and the entire RFC822 token.
+ SectionRestrictQueryTermsMap query_terms{{"", {"alexand"}}};
+
+ snippet_spec_.set_max_window_utf32_length(35);
+
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "rfc822");
+
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("Alexander Sav <tom.bar@google.com>,",
+ "Alexander Sav <tom.bar@google.com>,"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("Alexander Sav <tom.bar@google.com>", "Alexander"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("Alexand", "Alexand"));
+
+ // "tom" should match the local component, local address, and address tokens.
+ query_terms = SectionRestrictQueryTermsMap{{"", {"tom"}}};
+ snippet_spec_.set_max_window_utf32_length(36);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "rfc822");
+
+ content = GetString(&document, snippet.entries(0).property_name());
+
+ // TODO(b/248362902) Stop returning duplicate matches.
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("Alexander Sav <tom.bar@google.com>,",
+ "Alexander Sav <tom.bar@google.com>,",
+ "Alexander Sav <tom.bar@google.com>,"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("tom.bar", "tom.bar@google.com", "tom"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("tom", "tom", "tom"));
+}
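+
+// Illustrative note (derived from the expectations above): the RFC822
+// tokenizer emits several overlapping tokens per address -- a name token
+// ("Alexander"), the local component ("tom"), the local address ("tom.bar"),
+// the address ("tom.bar@google.com"), and the full RFC822 token -- so a
+// single query term can produce multiple matches over the same text.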
+
+TEST_F(SnippetRetrieverTest, SnippettingRfc822CJK) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("rfc822Type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("rfc822")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_RFC822)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ std::string chinese_string = "我, 每天@走路, 去@上班";
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "rfc822/1")
+ .SetSchema("rfc822Type")
+ .AddStringProperty("rfc822", chinese_string)
+ .Build();
+
+ SectionIdMask section_mask = 0b00000001;
+
+ SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
+
+ snippet_spec_.set_max_window_utf32_length(8);
+
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ // There should only be one snippet entry and match: the local component
+ // token.
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "rfc822");
+
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+
+ // The local component, address, local address, and token will all match. The
+ // windows for address and token are "" as the snippet window is too small.
+ // TODO(b/248362902) Stop returning duplicate matches.
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("每天@走路,", "每天@走路,"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("走路", "走路"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("走", "走"));
+}
+
+#ifdef ENABLE_URL_TOKENIZER
+TEST_F(SnippetRetrieverTest, SnippettingUrlAscii) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("urlType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("url")
+ .SetDataTypeString(MATCH_PREFIX, TOKENIZER_URL)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ snippet_retriever_,
+ SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
+ normalizer_.get()));
+
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "url/1")
+ .SetSchema("urlType")
+ .AddStringProperty("url", "https://mail.google.com/calendar/google/")
+ .Build();
+
+ SectionIdMask section_mask = 0b00000001;
+
+ // Query with single url split-token match
+ SectionRestrictQueryTermsMap query_terms{{"", {"com"}}};
+ // 40 is the length of the url.
+ // Window that is the size of the url should return entire url.
+ snippet_spec_.set_max_window_utf32_length(40);
+
+ SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "url");
+
+ std::string_view content =
+ GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("https://mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("com"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("com"));
+
+ // Query with single url suffix-token match
+ query_terms = SectionRestrictQueryTermsMap{{"", {"mail.goo"}}};
+ snippet_spec_.set_max_window_utf32_length(40);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "url");
+
+ content = GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("https://mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("mail.goo"));
+
+ // Query with multiple url split-token matches
+ query_terms = SectionRestrictQueryTermsMap{{"", {"goog"}}};
+ snippet_spec_.set_max_window_utf32_length(40);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "url");
+
+ content = GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("https://mail.google.com/calendar/google/",
+ "https://mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("google", "google"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("goog", "goog"));
+
+ // Query with both url split-token and suffix-token matches
+ query_terms = SectionRestrictQueryTermsMap{{"", {"mail"}}};
+ snippet_spec_.set_max_window_utf32_length(40);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "url");
+
+ content = GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("https://mail.google.com/calendar/google/",
+ "https://mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("mail", "mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("mail", "mail"));
+
+ // Prefix query with both url split-token and suffix-token matches
+ query_terms = SectionRestrictQueryTermsMap{{"", {"http"}}};
+ snippet_spec_.set_max_window_utf32_length(40);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "url");
+
+ content = GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("https://mail.google.com/calendar/google/",
+ "https://mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("https", "https://mail.google.com/calendar/google/"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("http", "http"));
+
+ // Window that's smaller than the input size should not return any matches.
+ query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}};
+ snippet_spec_.set_max_window_utf32_length(10);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(0));
+
+ // Test case with more than two matches
+ document =
+ DocumentBuilder()
+ .SetKey("icing", "url/1")
+ .SetSchema("urlType")
+ .AddStringProperty("url", "https://www.google.com/calendar/google/")
+ .Build();
+
+ // Prefix query with both url split-token and suffix-token matches
+ query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}};
+ snippet_spec_.set_max_window_utf32_length(39);
+
+ snippet = snippet_retriever_->RetrieveSnippet(
+ query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
+
+ ASSERT_THAT(snippet.entries(), SizeIs(1));
+ EXPECT_THAT(snippet.entries(0).property_name(), "url");
+
+ content = GetString(&document, snippet.entries(0).property_name());
+
+ EXPECT_THAT(GetWindows(content, snippet.entries(0)),
+ ElementsAre("https://www.google.com/calendar/google/",
+ "https://www.google.com/calendar/google/",
+ "https://www.google.com/calendar/google/"));
+ EXPECT_THAT(GetMatches(content, snippet.entries(0)),
+ ElementsAre("google", "google", "google.com/calendar/google/"));
+ EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
+ ElementsAre("google", "google", "google"));
}
+#endif // ENABLE_URL_TOKENIZER
} // namespace
diff --git a/icing/schema-builder.h b/icing/schema-builder.h
new file mode 100644
index 0000000..c74505e
--- /dev/null
+++ b/icing/schema-builder.h
@@ -0,0 +1,227 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCHEMA_BUILDER_H_
+#define ICING_SCHEMA_BUILDER_H_
+
+#include <cstdint>
+#include <initializer_list>
+#include <string>
+#include <string_view>
+#include <utility>
+
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+
+namespace icing {
+namespace lib {
+
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_UNKNOWN =
+ PropertyConfigProto::Cardinality::UNKNOWN;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REPEATED =
+ PropertyConfigProto::Cardinality::REPEATED;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_OPTIONAL =
+ PropertyConfigProto::Cardinality::OPTIONAL;
+constexpr PropertyConfigProto::Cardinality::Code CARDINALITY_REQUIRED =
+ PropertyConfigProto::Cardinality::REQUIRED;
+
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_NONE =
+ StringIndexingConfig::TokenizerType::NONE;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_PLAIN =
+ StringIndexingConfig::TokenizerType::PLAIN;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_VERBATIM =
+ StringIndexingConfig::TokenizerType::VERBATIM;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_RFC822 =
+ StringIndexingConfig::TokenizerType::RFC822;
+constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL =
+ StringIndexingConfig::TokenizerType::URL;
+
+constexpr TermMatchType::Code TERM_MATCH_UNKNOWN = TermMatchType::UNKNOWN;
+constexpr TermMatchType::Code TERM_MATCH_EXACT = TermMatchType::EXACT_ONLY;
+constexpr TermMatchType::Code TERM_MATCH_PREFIX = TermMatchType::PREFIX;
+
+constexpr IntegerIndexingConfig::NumericMatchType::Code NUMERIC_MATCH_UNKNOWN =
+ IntegerIndexingConfig::NumericMatchType::UNKNOWN;
+constexpr IntegerIndexingConfig::NumericMatchType::Code NUMERIC_MATCH_RANGE =
+ IntegerIndexingConfig::NumericMatchType::RANGE;
+
+constexpr PropertyConfigProto::DataType::Code TYPE_UNKNOWN =
+ PropertyConfigProto::DataType::UNKNOWN;
+constexpr PropertyConfigProto::DataType::Code TYPE_STRING =
+ PropertyConfigProto::DataType::STRING;
+constexpr PropertyConfigProto::DataType::Code TYPE_INT64 =
+ PropertyConfigProto::DataType::INT64;
+constexpr PropertyConfigProto::DataType::Code TYPE_DOUBLE =
+ PropertyConfigProto::DataType::DOUBLE;
+constexpr PropertyConfigProto::DataType::Code TYPE_BOOLEAN =
+ PropertyConfigProto::DataType::BOOLEAN;
+constexpr PropertyConfigProto::DataType::Code TYPE_BYTES =
+ PropertyConfigProto::DataType::BYTES;
+constexpr PropertyConfigProto::DataType::Code TYPE_DOCUMENT =
+ PropertyConfigProto::DataType::DOCUMENT;
+
+constexpr JoinableConfig::ValueType::Code JOINABLE_VALUE_TYPE_NONE =
+ JoinableConfig::ValueType::NONE;
+constexpr JoinableConfig::ValueType::Code JOINABLE_VALUE_TYPE_QUALIFIED_ID =
+ JoinableConfig::ValueType::QUALIFIED_ID;
+
+class PropertyConfigBuilder {
+ public:
+ PropertyConfigBuilder() = default;
+ explicit PropertyConfigBuilder(PropertyConfigProto property)
+ : property_(std::move(property)) {}
+
+ PropertyConfigBuilder& SetName(std::string_view name) {
+ property_.set_property_name(std::string(name));
+ return *this;
+ }
+
+ PropertyConfigBuilder& SetDataType(
+ PropertyConfigProto::DataType::Code data_type) {
+ property_.set_data_type(data_type);
+ return *this;
+ }
+
+ PropertyConfigBuilder& SetDataTypeString(
+ TermMatchType::Code match_type,
+ StringIndexingConfig::TokenizerType::Code tokenizer) {
+ property_.set_data_type(PropertyConfigProto::DataType::STRING);
+ property_.mutable_string_indexing_config()->set_term_match_type(match_type);
+ property_.mutable_string_indexing_config()->set_tokenizer_type(tokenizer);
+ return *this;
+ }
+
+ PropertyConfigBuilder& SetDataTypeJoinableString(
+ JoinableConfig::ValueType::Code join_value_type,
+ TermMatchType::Code match_type = TERM_MATCH_UNKNOWN,
+ StringIndexingConfig::TokenizerType::Code tokenizer = TOKENIZER_NONE) {
+ property_.set_data_type(PropertyConfigProto::DataType::STRING);
+ property_.mutable_joinable_config()->set_value_type(join_value_type);
+ property_.mutable_string_indexing_config()->set_term_match_type(match_type);
+ property_.mutable_string_indexing_config()->set_tokenizer_type(tokenizer);
+ return *this;
+ }
+
+ PropertyConfigBuilder& SetDataTypeInt64(
+ IntegerIndexingConfig::NumericMatchType::Code numeric_match_type) {
+ property_.set_data_type(PropertyConfigProto::DataType::INT64);
+ property_.mutable_integer_indexing_config()->set_numeric_match_type(
+ numeric_match_type);
+ return *this;
+ }
+
+ PropertyConfigBuilder& SetDataTypeDocument(std::string_view schema_type,
+ bool index_nested_properties) {
+ property_.set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ property_.set_schema_type(std::string(schema_type));
+ property_.mutable_document_indexing_config()->set_index_nested_properties(
+ index_nested_properties);
+ property_.mutable_document_indexing_config()
+ ->clear_indexable_nested_properties_list();
+ return *this;
+ }
+
+ PropertyConfigBuilder& SetDataTypeDocument(
+ std::string_view schema_type,
+ std::initializer_list<std::string> indexable_nested_properties_list) {
+ property_.set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+ property_.set_schema_type(std::string(schema_type));
+ property_.mutable_document_indexing_config()->set_index_nested_properties(
+ false);
+ for (const std::string& property : indexable_nested_properties_list) {
+ property_.mutable_document_indexing_config()
+ ->add_indexable_nested_properties_list(property);
+ }
+ return *this;
+ }
+
+ PropertyConfigBuilder& SetJoinable(
+ JoinableConfig::ValueType::Code join_value_type, bool propagate_delete) {
+ property_.mutable_joinable_config()->set_value_type(join_value_type);
+ property_.mutable_joinable_config()->set_propagate_delete(propagate_delete);
+ return *this;
+ }
+
+ PropertyConfigBuilder& SetCardinality(
+ PropertyConfigProto::Cardinality::Code cardinality) {
+ property_.set_cardinality(cardinality);
+ return *this;
+ }
+
+ PropertyConfigProto Build() const { return std::move(property_); }
+
+ private:
+ PropertyConfigProto property_;
+};
+
+class SchemaTypeConfigBuilder {
+ public:
+ SchemaTypeConfigBuilder() = default;
+ SchemaTypeConfigBuilder(SchemaTypeConfigProto type_config)
+ : type_config_(std::move(type_config)) {}
+
+ SchemaTypeConfigBuilder& SetType(std::string_view type) {
+ type_config_.set_schema_type(std::string(type));
+ return *this;
+ }
+
+ SchemaTypeConfigBuilder& AddParentType(std::string_view parent_type) {
+ type_config_.add_parent_types(std::string(parent_type));
+ return *this;
+ }
+
+ SchemaTypeConfigBuilder& SetVersion(int version) {
+ type_config_.set_version(version);
+ return *this;
+ }
+
+ SchemaTypeConfigBuilder& AddProperty(PropertyConfigProto property) {
+ *type_config_.add_properties() = std::move(property);
+ return *this;
+ }
+ SchemaTypeConfigBuilder& AddProperty(PropertyConfigBuilder property_builder) {
+ *type_config_.add_properties() = property_builder.Build();
+ return *this;
+ }
+
+ SchemaTypeConfigProto Build() { return std::move(type_config_); }
+
+ private:
+ SchemaTypeConfigProto type_config_;
+};
+
+class SchemaBuilder {
+ public:
+ SchemaBuilder() = default;
+ SchemaBuilder(SchemaProto schema) : schema_(std::move(schema)) {}
+
+ SchemaBuilder& AddType(SchemaTypeConfigProto type) {
+ *schema_.add_types() = std::move(type);
+ return *this;
+ }
+ SchemaBuilder& AddType(SchemaTypeConfigBuilder type_builder) {
+ *schema_.add_types() = type_builder.Build();
+ return *this;
+ }
+
+ SchemaProto Build() { return std::move(schema_); }
+
+ private:
+ SchemaProto schema_;
+};
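+
+// Example usage (illustrative only):
+//
+//   SchemaProto schema =
+//       SchemaBuilder()
+//           .AddType(SchemaTypeConfigBuilder().SetType("Email").AddProperty(
+//               PropertyConfigBuilder()
+//                   .SetName("subject")
+//                   .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+//                   .SetCardinality(CARDINALITY_OPTIONAL)))
+//           .Build();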
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCHEMA_BUILDER_H_
diff --git a/icing/schema/backup-schema-producer.cc b/icing/schema/backup-schema-producer.cc
new file mode 100644
index 0000000..d0a0554
--- /dev/null
+++ b/icing/schema/backup-schema-producer.cc
@@ -0,0 +1,164 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/backup-schema-producer.h"
+
+#include <string_view>
+#include <unordered_map>
+#include <vector>
+
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema/property-util.h"
+#include "icing/schema/section.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Creates a map from each top-level property to the number of indexed
+// section ids it consumes, based on the list of indexed sections provided by
+// metadata_list.
+// For all non-document properties, the value will always be 1.
+// For document properties, the value will be the number of nested properties
+// that are indexed with that document type.
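+// For example (illustrative): for indexed section paths {"name",
+// "sender.name", "sender.address"}, the result is {"name": 1, "sender": 2}.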
+std::unordered_map<std::string_view, int> CreateIndexedIdCountMap(
+ const std::vector<SectionMetadata>* metadata_list) {
+ std::unordered_map<std::string_view, int> property_indexed_id_count_map;
+ for (const SectionMetadata& metadata : *metadata_list) {
+ std::string_view top_level_property;
+ size_t separator_pos =
+ metadata.path.find(property_util::kPropertyPathSeparator);
+ if (separator_pos == std::string::npos) {
+ top_level_property = metadata.path;
+ } else {
+ top_level_property =
+ std::string_view(metadata.path.c_str(), separator_pos);
+ }
+ int& count = property_indexed_id_count_map[top_level_property];
+ ++count;
+ }
+ return property_indexed_id_count_map;
+}
+
+// Returns the indices (within schema.types()) of all types that are rollback
+// incompatible (old code cannot handle these types if they are unmodified).
+//
+// Currently, this means types that:
+// 1. Use RFC822 tokenization for any properties
+// 2. Use more than 16 indexed properties
+libtextclassifier3::StatusOr<std::vector<int>>
+GetRollbackIncompatibleTypeIndices(const SchemaProto& schema,
+ const SectionManager& type_manager) {
+ std::vector<int> invalid_type_indices;
+ for (int i = 0; i < schema.types_size(); ++i) {
+ const SchemaTypeConfigProto& type = schema.types(i);
+ bool rollback_incompatible = false;
+ for (const PropertyConfigProto& property : type.properties()) {
+ if (property.string_indexing_config().tokenizer_type() ==
+ StringIndexingConfig::TokenizerType::RFC822) {
+ rollback_incompatible = true;
+ break;
+ }
+ }
+ if (rollback_incompatible) {
+ invalid_type_indices.push_back(i);
+ continue;
+ }
+
+ ICING_ASSIGN_OR_RETURN(const std::vector<SectionMetadata>* metadata_list,
+ type_manager.GetMetadataList(type.schema_type()));
+ if (metadata_list->size() > kOldTotalNumSections) {
+ invalid_type_indices.push_back(i);
+ }
+ }
+ return invalid_type_indices;
+}
+
+} // namespace
+
+/* static */ libtextclassifier3::StatusOr<BackupSchemaProducer>
+BackupSchemaProducer::Create(const SchemaProto& schema,
+ const SectionManager& type_manager) {
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<int> invalid_type_indices,
+ GetRollbackIncompatibleTypeIndices(schema, type_manager));
+ if (invalid_type_indices.empty()) {
+ return BackupSchemaProducer();
+ }
+
+ SchemaProto backup_schema(schema);
+ std::unordered_map<std::string_view, int> type_indexed_property_count;
+ for (int i : invalid_type_indices) {
+ SchemaTypeConfigProto* type = backup_schema.mutable_types(i);
+
+ // This should never cause an error - every type should have an entry in the
+ // type_manager.
+ ICING_ASSIGN_OR_RETURN(const std::vector<SectionMetadata>* metadata_list,
+ type_manager.GetMetadataList(type->schema_type()));
+ int num_indexed_sections = metadata_list->size();
+ std::unordered_map<std::string_view, int> property_indexed_id_count_map;
+ if (num_indexed_sections > kOldTotalNumSections) {
+ property_indexed_id_count_map = CreateIndexedIdCountMap(metadata_list);
+ }
+
+ // Step 1. Mark all properties that use the RFC822 tokenizer as unindexed.
+ for (PropertyConfigProto& property : *type->mutable_properties()) {
+ // If the property uses the RFC822 tokenizer, clear its string indexing
+ // config; this resets the tokenizer to NONE and the match type to UNKNOWN.
+ if (property.string_indexing_config().tokenizer_type() ==
+ StringIndexingConfig::TokenizerType::RFC822) {
+ property.clear_string_indexing_config();
+ --num_indexed_sections;
+ property_indexed_id_count_map.erase(property.property_name());
+ }
+ }
+
+ // Step 2. If there are any types that exceed the old indexed property
+ // limit, then mark indexed properties as unindexed until we're back under
+ // the limit.
+ if (num_indexed_sections <= kOldTotalNumSections) {
+ continue;
+ }
+
+ // We expect that the last properties are the ones added most recently and
+ // are the least crucial, so we remove them in reverse order. This is a bit
+ // arbitrary, but we don't really have sufficient information to make this
+ // judgment anyway.
+ for (auto itr = type->mutable_properties()->rbegin();
+ itr != type->mutable_properties()->rend(); ++itr) {
+ auto indexed_count_itr =
+ property_indexed_id_count_map.find(itr->property_name());
+ if (indexed_count_itr == property_indexed_id_count_map.end()) {
+ continue;
+ }
+
+ // Mark this property as unindexed and subtract all indexed property ids
+ // consumed by this property.
+ PropertyConfigProto& property = *itr;
+ property.clear_document_indexing_config();
+ property.clear_string_indexing_config();
+ property.clear_integer_indexing_config();
+ num_indexed_sections -= indexed_count_itr->second;
+ if (num_indexed_sections <= kOldTotalNumSections) {
+ break;
+ }
+ }
+ }
+ return BackupSchemaProducer(std::move(backup_schema));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/backup-schema-producer.h b/icing/schema/backup-schema-producer.h
new file mode 100644
index 0000000..61dcde6
--- /dev/null
+++ b/icing/schema/backup-schema-producer.h
@@ -0,0 +1,55 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCHEMA_BACKUP_SCHEMA_PRODUCER_H_
+#define ICING_SCHEMA_BACKUP_SCHEMA_PRODUCER_H_
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema/section-manager.h"
+#include "icing/schema/section.h"
+
+namespace icing {
+namespace lib {
+
+class BackupSchemaProducer {
+ public:
+ // Creates a BackupSchemaProducer based on `schema`.
+ // If schema doesn't require a backup schema (because it is fully
+ // rollback-proof) then no copies will be made and `is_backup_necessary` will
+ // return false.
+ // If schema *does* require a backup schema, then `is_backup_necessary` will
+ // return true and the backup schema can be retrieved by calling `Produce`.
+ // Returns:
+ // - On success, a BackupSchemaProducer
+ // - INTERNAL_ERROR if the schema is inconsistent with the type_manager.
+ static libtextclassifier3::StatusOr<BackupSchemaProducer> Create(
+ const SchemaProto& schema, const SectionManager& type_manager);
+
+ SchemaProto Produce() && { return std::move(cached_schema_); }
+
+ bool is_backup_necessary() const { return !cached_schema_.types().empty(); }
+
+ private:
+ BackupSchemaProducer() = default;
+ explicit BackupSchemaProducer(SchemaProto&& schema)
+ : cached_schema_(std::move(schema)) {}
+
+ SchemaProto cached_schema_;
+};
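+
+// Example usage (illustrative only):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       BackupSchemaProducer producer,
+//       BackupSchemaProducer::Create(schema, type_manager));
+//   if (producer.is_backup_necessary()) {
+//     SchemaProto backup = std::move(producer).Produce();
+//   }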
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCHEMA_BACKUP_SCHEMA_PRODUCER_H_
diff --git a/icing/schema/backup-schema-producer_test.cc b/icing/schema/backup-schema-producer_test.cc
new file mode 100644
index 0000000..dbd033f
--- /dev/null
+++ b/icing/schema/backup-schema-producer_test.cc
@@ -0,0 +1,737 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/backup-schema-producer.h"
+
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-type-manager.h"
+#include "icing/schema/schema-util.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/dynamic-trie-key-mapper.h"
+#include "icing/store/key-mapper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::Pointee;
+using ::testing::SizeIs;
+
+class BackupSchemaProducerTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/icing";
+ schema_store_dir_ = test_dir_ + "/schema_store";
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+ }
+
+ void TearDown() override {
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ Filesystem filesystem_;
+ std::string test_dir_;
+ std::string schema_store_dir_;
+};
+
+TEST_F(BackupSchemaProducerTest, EmptySchema) {
+ SchemaProto empty;
+ SchemaUtil::TypeConfigMap type_config_map;
+ SchemaUtil::BuildTypeConfigMap(empty, &type_config_map);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DynamicTrieKeyMapper<SchemaTypeId>> type_id_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, schema_store_dir_,
+ /*maximum_size_bytes=*/10000));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, type_id_mapper.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ BackupSchemaProducer backup_producer,
+ BackupSchemaProducer::Create(empty,
+ schema_type_manager->section_manager()));
+ EXPECT_THAT(backup_producer.is_backup_necessary(), Eq(false));
+}
+
+TEST_F(BackupSchemaProducerTest, NoIndexedPropertySchema) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_STRING))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .SetDataType(TYPE_INT64)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeB")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop3")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument(
+ "TypeA", /*index_nested_properties=*/false))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop4")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataType(TYPE_STRING)))
+ .Build();
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ SchemaUtil::BuildTypeConfigMap(schema, &type_config_map);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DynamicTrieKeyMapper<SchemaTypeId>> type_id_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, schema_store_dir_,
+ /*maximum_size_bytes=*/10000));
+ ASSERT_THAT(type_id_mapper->Put("TypeA", 0), IsOk());
+ ASSERT_THAT(type_id_mapper->Put("TypeB", 1), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, type_id_mapper.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ BackupSchemaProducer backup_producer,
+ BackupSchemaProducer::Create(schema,
+ schema_type_manager->section_manager()));
+ EXPECT_THAT(backup_producer.is_backup_necessary(), Eq(false));
+}
+
+TEST_F(BackupSchemaProducerTest, RollbackCompatibleSchema) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("TypeB")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop3")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument(
+ "TypeA", /*index_nested_properties=*/true))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop4")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_VERBATIM)))
+ .Build();
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ SchemaUtil::BuildTypeConfigMap(schema, &type_config_map);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DynamicTrieKeyMapper<SchemaTypeId>> type_id_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, schema_store_dir_,
+ /*maximum_size_bytes=*/10000));
+ ASSERT_THAT(type_id_mapper->Put("TypeA", 0), IsOk());
+ ASSERT_THAT(type_id_mapper->Put("TypeB", 1), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, type_id_mapper.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ BackupSchemaProducer backup_producer,
+ BackupSchemaProducer::Create(schema,
+ schema_type_manager->section_manager()));
+ EXPECT_THAT(backup_producer.is_backup_necessary(), Eq(false));
+}
+
+TEST_F(BackupSchemaProducerTest, RemoveRfc822) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("TypeA").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822)))
+ .Build();
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ SchemaUtil::BuildTypeConfigMap(schema, &type_config_map);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DynamicTrieKeyMapper<SchemaTypeId>> type_id_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, schema_store_dir_,
+ /*maximum_size_bytes=*/10000));
+ ASSERT_THAT(type_id_mapper->Put("TypeA", 0), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, type_id_mapper.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ BackupSchemaProducer backup_producer,
+ BackupSchemaProducer::Create(schema,
+ schema_type_manager->section_manager()));
+ EXPECT_THAT(backup_producer.is_backup_necessary(), Eq(true));
+ SchemaProto backup = std::move(backup_producer).Produce();
+
+ SchemaProto expected_backup =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("TypeA").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_STRING)))
+ .Build();
+ EXPECT_THAT(backup, portable_equals_proto::EqualsProto(expected_backup));
+}
+
+TEST_F(BackupSchemaProducerTest, MakeExtraStringIndexedPropertiesUnindexed) {
+ PropertyConfigBuilder indexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN);
+ SchemaTypeConfigProto type =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(indexed_string_property_builder.SetName("prop1"))
+ .AddProperty(indexed_string_property_builder.SetName("prop2"))
+ .AddProperty(indexed_string_property_builder.SetName("prop3"))
+ .AddProperty(indexed_string_property_builder.SetName("prop4"))
+ .AddProperty(indexed_string_property_builder.SetName("prop5"))
+ .AddProperty(indexed_string_property_builder.SetName("prop6"))
+ .AddProperty(indexed_string_property_builder.SetName("prop7"))
+ .AddProperty(indexed_string_property_builder.SetName("prop8"))
+ .AddProperty(indexed_string_property_builder.SetName("prop9"))
+ .AddProperty(indexed_string_property_builder.SetName("prop10"))
+ .AddProperty(indexed_string_property_builder.SetName("prop11"))
+ .AddProperty(indexed_string_property_builder.SetName("prop12"))
+ .AddProperty(indexed_string_property_builder.SetName("prop13"))
+ .AddProperty(indexed_string_property_builder.SetName("prop14"))
+ .AddProperty(indexed_string_property_builder.SetName("prop15"))
+ .AddProperty(indexed_string_property_builder.SetName("prop16"))
+ .AddProperty(indexed_string_property_builder.SetName("prop17"))
+ .AddProperty(indexed_string_property_builder.SetName("prop18"))
+ .AddProperty(indexed_string_property_builder.SetName("prop19"))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type).Build();
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ SchemaUtil::BuildTypeConfigMap(schema, &type_config_map);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DynamicTrieKeyMapper<SchemaTypeId>> type_id_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, schema_store_dir_,
+ /*maximum_size_bytes=*/10000));
+ ASSERT_THAT(type_id_mapper->Put("TypeA", 0), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, type_id_mapper.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ BackupSchemaProducer backup_producer,
+ BackupSchemaProducer::Create(schema,
+ schema_type_manager->section_manager()));
+ EXPECT_THAT(backup_producer.is_backup_necessary(), Eq(true));
+ SchemaProto backup = std::move(backup_producer).Produce();
+
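+ // With 20 indexed string properties and an old limit of 16 indexed
+ // sections, the four properties added last (prop16-prop19) are expected to
+ // become unindexed.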
+ PropertyConfigBuilder unindexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_STRING);
+ SchemaTypeConfigProto expected_type =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(indexed_string_property_builder.SetName("prop1"))
+ .AddProperty(indexed_string_property_builder.SetName("prop2"))
+ .AddProperty(indexed_string_property_builder.SetName("prop3"))
+ .AddProperty(indexed_string_property_builder.SetName("prop4"))
+ .AddProperty(indexed_string_property_builder.SetName("prop5"))
+ .AddProperty(indexed_string_property_builder.SetName("prop6"))
+ .AddProperty(indexed_string_property_builder.SetName("prop7"))
+ .AddProperty(indexed_string_property_builder.SetName("prop8"))
+ .AddProperty(indexed_string_property_builder.SetName("prop9"))
+ .AddProperty(indexed_string_property_builder.SetName("prop10"))
+ .AddProperty(indexed_string_property_builder.SetName("prop11"))
+ .AddProperty(indexed_string_property_builder.SetName("prop12"))
+ .AddProperty(indexed_string_property_builder.SetName("prop13"))
+ .AddProperty(indexed_string_property_builder.SetName("prop14"))
+ .AddProperty(indexed_string_property_builder.SetName("prop15"))
+ .AddProperty(unindexed_string_property_builder.SetName("prop16"))
+ .AddProperty(unindexed_string_property_builder.SetName("prop17"))
+ .AddProperty(unindexed_string_property_builder.SetName("prop18"))
+ .AddProperty(unindexed_string_property_builder.SetName("prop19"))
+ .Build();
+ SchemaProto expected_backup = SchemaBuilder().AddType(expected_type).Build();
+ EXPECT_THAT(backup, portable_equals_proto::EqualsProto(expected_backup));
+}
+
+TEST_F(BackupSchemaProducerTest, MakeExtraIntIndexedPropertiesUnindexed) {
+ PropertyConfigBuilder indexed_int_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE);
+ SchemaTypeConfigProto type =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(indexed_int_property_builder.SetName("prop0"))
+ .AddProperty(indexed_int_property_builder.SetName("prop1"))
+ .AddProperty(indexed_int_property_builder.SetName("prop2"))
+ .AddProperty(indexed_int_property_builder.SetName("prop3"))
+ .AddProperty(indexed_int_property_builder.SetName("prop4"))
+ .AddProperty(indexed_int_property_builder.SetName("prop5"))
+ .AddProperty(indexed_int_property_builder.SetName("prop6"))
+ .AddProperty(indexed_int_property_builder.SetName("prop7"))
+ .AddProperty(indexed_int_property_builder.SetName("prop8"))
+ .AddProperty(indexed_int_property_builder.SetName("prop9"))
+ .AddProperty(indexed_int_property_builder.SetName("prop10"))
+ .AddProperty(indexed_int_property_builder.SetName("prop11"))
+ .AddProperty(indexed_int_property_builder.SetName("prop12"))
+ .AddProperty(indexed_int_property_builder.SetName("prop13"))
+ .AddProperty(indexed_int_property_builder.SetName("prop14"))
+ .AddProperty(indexed_int_property_builder.SetName("prop15"))
+ .AddProperty(indexed_int_property_builder.SetName("prop16"))
+ .AddProperty(indexed_int_property_builder.SetName("prop17"))
+ .AddProperty(indexed_int_property_builder.SetName("prop18"))
+ .AddProperty(indexed_int_property_builder.SetName("prop19"))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type).Build();
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ SchemaUtil::BuildTypeConfigMap(schema, &type_config_map);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DynamicTrieKeyMapper<SchemaTypeId>> type_id_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, schema_store_dir_,
+ /*maximum_size_bytes=*/10000));
+ ASSERT_THAT(type_id_mapper->Put("TypeA", 0), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, type_id_mapper.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ BackupSchemaProducer backup_producer,
+ BackupSchemaProducer::Create(schema,
+ schema_type_manager->section_manager()));
+ EXPECT_THAT(backup_producer.is_backup_necessary(), Eq(true));
+ SchemaProto backup = std::move(backup_producer).Produce();
+
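+  // As with the string case above, "prop16" through "prop19" are expected to
+  // become unindexed so that the total # of sections stays <= 16.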
+ PropertyConfigBuilder unindexed_int_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_INT64);
+ SchemaTypeConfigProto expected_type =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(indexed_int_property_builder.SetName("prop0"))
+ .AddProperty(indexed_int_property_builder.SetName("prop1"))
+ .AddProperty(indexed_int_property_builder.SetName("prop2"))
+ .AddProperty(indexed_int_property_builder.SetName("prop3"))
+ .AddProperty(indexed_int_property_builder.SetName("prop4"))
+ .AddProperty(indexed_int_property_builder.SetName("prop5"))
+ .AddProperty(indexed_int_property_builder.SetName("prop6"))
+ .AddProperty(indexed_int_property_builder.SetName("prop7"))
+ .AddProperty(indexed_int_property_builder.SetName("prop8"))
+ .AddProperty(indexed_int_property_builder.SetName("prop9"))
+ .AddProperty(indexed_int_property_builder.SetName("prop10"))
+ .AddProperty(indexed_int_property_builder.SetName("prop11"))
+ .AddProperty(indexed_int_property_builder.SetName("prop12"))
+ .AddProperty(indexed_int_property_builder.SetName("prop13"))
+ .AddProperty(indexed_int_property_builder.SetName("prop14"))
+ .AddProperty(indexed_int_property_builder.SetName("prop15"))
+ .AddProperty(unindexed_int_property_builder.SetName("prop16"))
+ .AddProperty(unindexed_int_property_builder.SetName("prop17"))
+ .AddProperty(unindexed_int_property_builder.SetName("prop18"))
+ .AddProperty(unindexed_int_property_builder.SetName("prop19"))
+ .Build();
+ SchemaProto expected_backup = SchemaBuilder().AddType(expected_type).Build();
+ EXPECT_THAT(backup, portable_equals_proto::EqualsProto(expected_backup));
+}
+
+TEST_F(BackupSchemaProducerTest, MakeExtraDocumentIndexedPropertiesUnindexed) {
+ PropertyConfigBuilder indexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN);
+ SchemaTypeConfigProto typeB =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeB")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(indexed_string_property_builder.SetName("prop1"))
+ .AddProperty(indexed_string_property_builder.SetName("prop2"))
+ .AddProperty(indexed_string_property_builder.SetName("prop3"))
+ .AddProperty(indexed_string_property_builder.SetName("prop4"))
+ .AddProperty(indexed_string_property_builder.SetName("prop5"))
+ .AddProperty(indexed_string_property_builder.SetName("prop6"))
+ .AddProperty(indexed_string_property_builder.SetName("prop7"))
+ .AddProperty(indexed_string_property_builder.SetName("prop8"))
+ .AddProperty(indexed_string_property_builder.SetName("prop9"))
+ .Build();
+
+ PropertyConfigBuilder indexed_document_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("TypeB", /*index_nested_properties=*/true);
+ SchemaTypeConfigProto typeA =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(indexed_document_property_builder.SetName("propA"))
+ .AddProperty(indexed_document_property_builder.SetName("propB"))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(typeA).AddType(typeB).Build();
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ SchemaUtil::BuildTypeConfigMap(schema, &type_config_map);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DynamicTrieKeyMapper<SchemaTypeId>> type_id_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, schema_store_dir_,
+ /*maximum_size_bytes=*/10000));
+ ASSERT_THAT(type_id_mapper->Put("TypeA", 0), IsOk());
+ ASSERT_THAT(type_id_mapper->Put("TypeB", 1), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, type_id_mapper.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ BackupSchemaProducer backup_producer,
+ BackupSchemaProducer::Create(schema,
+ schema_type_manager->section_manager()));
+ EXPECT_THAT(backup_producer.is_backup_necessary(), Eq(true));
+ SchemaProto backup = std::move(backup_producer).Produce();
+
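+  // "propA" and "propB" each contribute 10 nested sections (TypeB's "prop0"
+  // through "prop9"), exceeding the 16-section limit, so "propB"'s indexing
+  // config is expected to be dropped.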
+ PropertyConfigProto unindexed_document_property =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_DOCUMENT)
+ .Build();
+ unindexed_document_property.set_schema_type("TypeB");
+ PropertyConfigBuilder unindexed_document_property_builder(
+ unindexed_document_property);
+ SchemaTypeConfigProto expected_typeA =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(indexed_document_property_builder.SetName("propA"))
+ .AddProperty(unindexed_document_property_builder.SetName("propB"))
+ .Build();
+ SchemaProto expected_backup =
+ SchemaBuilder().AddType(expected_typeA).AddType(typeB).Build();
+ EXPECT_THAT(backup, portable_equals_proto::EqualsProto(expected_backup));
+}
+
+TEST_F(
+ BackupSchemaProducerTest,
+ MakeExtraDocumentIndexedPropertiesWithIndexableNestedPropertiesListUnindexed) {
+ PropertyConfigBuilder indexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN);
+ PropertyConfigBuilder indexed_int_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE);
+ SchemaTypeConfigProto typeB =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeB")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(indexed_int_property_builder.SetName("prop1"))
+ .AddProperty(indexed_string_property_builder.SetName("prop2"))
+ .AddProperty(indexed_int_property_builder.SetName("prop3"))
+ .AddProperty(indexed_string_property_builder.SetName("prop4"))
+ .AddProperty(indexed_int_property_builder.SetName("prop5"))
+ .AddProperty(indexed_string_property_builder.SetName("prop6"))
+ .AddProperty(indexed_int_property_builder.SetName("prop7"))
+ .AddProperty(indexed_string_property_builder.SetName("prop8"))
+ .AddProperty(indexed_int_property_builder.SetName("prop9"))
+ .Build();
+
+  // Create an indexed document property using an indexable nested properties
+  // list.
+ PropertyConfigBuilder indexed_document_property_with_list_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument(
+ "TypeB", /*indexable_nested_properties_list=*/{
+ "prop0", "prop1", "prop2", "prop3", "prop4", "prop5",
+ "unknown1", "unknown2", "unknown3"});
+ SchemaTypeConfigProto typeA =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(
+ indexed_document_property_with_list_builder.SetName("propA"))
+ .AddProperty(
+ indexed_document_property_with_list_builder.SetName("propB"))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(typeA).AddType(typeB).Build();
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ SchemaUtil::BuildTypeConfigMap(schema, &type_config_map);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DynamicTrieKeyMapper<SchemaTypeId>> type_id_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, schema_store_dir_,
+ /*maximum_size_bytes=*/10000));
+ ASSERT_THAT(type_id_mapper->Put("TypeA", 0), IsOk());
+ ASSERT_THAT(type_id_mapper->Put("TypeB", 1), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, type_id_mapper.get()));
+ ASSERT_THAT(schema_type_manager->section_manager().GetMetadataList("TypeA"),
+ IsOkAndHolds(Pointee(SizeIs(18))));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ BackupSchemaProducer backup_producer,
+ BackupSchemaProducer::Create(schema,
+ schema_type_manager->section_manager()));
+ EXPECT_THAT(backup_producer.is_backup_necessary(), Eq(true));
+ SchemaProto backup = std::move(backup_producer).Produce();
+
+ PropertyConfigProto unindexed_document_property =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_DOCUMENT)
+ .Build();
+ unindexed_document_property.set_schema_type("TypeB");
+ PropertyConfigBuilder unindexed_document_property_builder(
+ unindexed_document_property);
+
+ // "propA" and "propB" both have 9 sections respectively, so we have to drop
+ // "propB" indexing config to make total # of sections <= 16.
+ SchemaTypeConfigProto expected_typeA =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(
+ indexed_document_property_with_list_builder.SetName("propA"))
+ .AddProperty(unindexed_document_property_builder.SetName("propB"))
+ .Build();
+ SchemaProto expected_backup =
+ SchemaBuilder().AddType(expected_typeA).AddType(typeB).Build();
+ EXPECT_THAT(backup, portable_equals_proto::EqualsProto(expected_backup));
+}
+
+TEST_F(BackupSchemaProducerTest, MakeRfcPropertiesUnindexedFirst) {
+ PropertyConfigBuilder indexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN);
+ SchemaTypeConfigProto typeA =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(indexed_string_property_builder.SetName("prop1"))
+ .AddProperty(indexed_string_property_builder.SetName("prop2"))
+ .AddProperty(indexed_string_property_builder.SetName("prop3"))
+ .AddProperty(indexed_string_property_builder.SetName("prop4"))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .AddProperty(indexed_string_property_builder.SetName("prop6"))
+ .AddProperty(indexed_string_property_builder.SetName("prop7"))
+ .AddProperty(indexed_string_property_builder.SetName("prop8"))
+ .AddProperty(indexed_string_property_builder.SetName("prop9"))
+ .AddProperty(indexed_string_property_builder.SetName("prop10"))
+ .AddProperty(indexed_string_property_builder.SetName("prop11"))
+ .AddProperty(indexed_string_property_builder.SetName("prop12"))
+ .AddProperty(indexed_string_property_builder.SetName("prop13"))
+ .AddProperty(indexed_string_property_builder.SetName("prop14"))
+ .AddProperty(indexed_string_property_builder.SetName("prop15"))
+ .AddProperty(indexed_string_property_builder.SetName("prop16"))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(typeA).Build();
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ SchemaUtil::BuildTypeConfigMap(schema, &type_config_map);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DynamicTrieKeyMapper<SchemaTypeId>> type_id_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, schema_store_dir_,
+ /*maximum_size_bytes=*/10000));
+ ASSERT_THAT(type_id_mapper->Put("TypeA", 0), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, type_id_mapper.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ BackupSchemaProducer backup_producer,
+ BackupSchemaProducer::Create(schema,
+ schema_type_manager->section_manager()));
+ EXPECT_THAT(backup_producer.is_backup_necessary(), Eq(true));
+ SchemaProto backup = std::move(backup_producer).Produce();
+
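+  // There are 17 indexed properties, so at least one indexing config must be
+  // dropped. RFC822-tokenized properties are dropped first, so only "propRfc"
+  // is expected to become unindexed.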
+ SchemaTypeConfigProto expected_typeA =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(indexed_string_property_builder.SetName("prop1"))
+ .AddProperty(indexed_string_property_builder.SetName("prop2"))
+ .AddProperty(indexed_string_property_builder.SetName("prop3"))
+ .AddProperty(indexed_string_property_builder.SetName("prop4"))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_STRING))
+ .AddProperty(indexed_string_property_builder.SetName("prop6"))
+ .AddProperty(indexed_string_property_builder.SetName("prop7"))
+ .AddProperty(indexed_string_property_builder.SetName("prop8"))
+ .AddProperty(indexed_string_property_builder.SetName("prop9"))
+ .AddProperty(indexed_string_property_builder.SetName("prop10"))
+ .AddProperty(indexed_string_property_builder.SetName("prop11"))
+ .AddProperty(indexed_string_property_builder.SetName("prop12"))
+ .AddProperty(indexed_string_property_builder.SetName("prop13"))
+ .AddProperty(indexed_string_property_builder.SetName("prop14"))
+ .AddProperty(indexed_string_property_builder.SetName("prop15"))
+ .AddProperty(indexed_string_property_builder.SetName("prop16"))
+ .Build();
+ SchemaProto expected_backup = SchemaBuilder().AddType(expected_typeA).Build();
+ EXPECT_THAT(backup, portable_equals_proto::EqualsProto(expected_backup));
+}
+
+TEST_F(BackupSchemaProducerTest, MakeExtraPropertiesUnindexedMultipleTypes) {
+ PropertyConfigBuilder indexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN);
+ PropertyConfigBuilder indexed_int_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE);
+ SchemaTypeConfigProto typeB =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeB")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(indexed_int_property_builder.SetName("prop1"))
+ .AddProperty(indexed_string_property_builder.SetName("prop2"))
+ .AddProperty(indexed_int_property_builder.SetName("prop3"))
+ .AddProperty(indexed_string_property_builder.SetName("prop4"))
+ .Build();
+
+ PropertyConfigBuilder indexed_document_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("TypeB", /*index_nested_properties=*/true);
+ PropertyConfigBuilder indexed_document_property_with_list_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument(
+ "TypeB", /*indexable_nested_properties_list=*/{
+ "prop0", "prop4", "unknown1", "unknown2", "unknown3"});
+ SchemaTypeConfigProto typeA =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(indexed_string_property_builder.SetName("propA"))
+ .AddProperty(
+ indexed_document_property_with_list_builder.SetName("propB"))
+ .AddProperty(indexed_string_property_builder.SetName("propC"))
+ .AddProperty(indexed_document_property_builder.SetName("propD"))
+ .AddProperty(indexed_string_property_builder.SetName("propE"))
+ .AddProperty(indexed_int_property_builder.SetName("propF"))
+ .AddProperty(indexed_document_property_builder.SetName("propG"))
+ .AddProperty(indexed_string_property_builder.SetName("propH"))
+ .AddProperty(indexed_int_property_builder.SetName("propI"))
+ .AddProperty(
+ indexed_document_property_with_list_builder.SetName("propJ"))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(typeA).AddType(typeB).Build();
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ SchemaUtil::BuildTypeConfigMap(schema, &type_config_map);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<DynamicTrieKeyMapper<SchemaTypeId>> type_id_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, schema_store_dir_,
+ /*maximum_size_bytes=*/10000));
+ ASSERT_THAT(type_id_mapper->Put("TypeA", 0), IsOk());
+ ASSERT_THAT(type_id_mapper->Put("TypeB", 1), IsOk());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, type_id_mapper.get()));
+ ASSERT_THAT(schema_type_manager->section_manager().GetMetadataList("TypeA"),
+ IsOkAndHolds(Pointee(SizeIs(26))));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ BackupSchemaProducer backup_producer,
+ BackupSchemaProducer::Create(schema,
+ schema_type_manager->section_manager()));
+ EXPECT_THAT(backup_producer.is_backup_necessary(), Eq(true));
+ SchemaProto backup = std::move(backup_producer).Produce();
+
+ PropertyConfigBuilder unindexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_STRING);
+ PropertyConfigBuilder unindexed_int_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_INT64);
+ PropertyConfigProto unindexed_document_property =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_DOCUMENT)
+ .Build();
+ unindexed_document_property.set_schema_type("TypeB");
+ PropertyConfigBuilder unindexed_document_property_builder(
+ unindexed_document_property);
+
+ // On version 0 (Android T):
+ // - Only "propA", "propC", "propD.prop0", "propD.prop1", "propD.prop2",
+ // "propD.prop3", "propD.prop4", "propE", "propF" will be assigned sections.
+  // - Unlike version 2, "propB.prop0", "propB.prop4", "propB.unknown1",
+  //   "propB.unknown2", "propB.unknown3" will be ignored because version 0
+  //   doesn't recognize the indexable nested properties list.
+  // - So there will be only 9 sections on version 0. We could potentially
+  //   avoid dropping the "propG", "propH", "propI" indexing configs on
+  //   version 0 (in which case there would be 16 sections), but it is ok to
+  //   keep the logic simple as long as the total # of sections is <= 16.
+ SchemaTypeConfigProto expected_typeA =
+ SchemaTypeConfigBuilder()
+ .SetType("TypeA")
+ .AddProperty(indexed_string_property_builder.SetName("propA"))
+ .AddProperty(
+ indexed_document_property_with_list_builder.SetName("propB"))
+ .AddProperty(indexed_string_property_builder.SetName("propC"))
+ .AddProperty(indexed_document_property_builder.SetName("propD"))
+ .AddProperty(indexed_string_property_builder.SetName("propE"))
+ .AddProperty(indexed_int_property_builder.SetName("propF"))
+ .AddProperty(unindexed_document_property_builder.SetName("propG"))
+ .AddProperty(unindexed_string_property_builder.SetName("propH"))
+ .AddProperty(unindexed_int_property_builder.SetName("propI"))
+ .AddProperty(unindexed_document_property_builder.SetName("propJ"))
+ .Build();
+ SchemaProto expected_backup =
+ SchemaBuilder().AddType(expected_typeA).AddType(typeB).Build();
+ EXPECT_THAT(backup, portable_equals_proto::EqualsProto(expected_backup));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/joinable-property-manager-builder_test.cc b/icing/schema/joinable-property-manager-builder_test.cc
new file mode 100644
index 0000000..ac48faa
--- /dev/null
+++ b/icing/schema/joinable-property-manager-builder_test.cc
@@ -0,0 +1,446 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/joinable-property-manager.h"
+#include "icing/store/dynamic-trie-key-mapper.h"
+#include "icing/store/key-mapper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::Pointee;
+
+class JoinablePropertyManagerBuilderTest : public ::testing::Test {
+ protected:
+ void SetUp() override { test_dir_ = GetTestTempDir() + "/icing"; }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ std::string test_dir_;
+};
+
+TEST_F(JoinablePropertyManagerBuilderTest, Build) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put("SchemaTypeOne", 0));
+ ICING_ASSERT_OK(schema_type_mapper->Put("SchemaTypeTwo", 1));
+
+ PropertyConfigProto prop_foo =
+ PropertyConfigBuilder()
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+ PropertyConfigProto prop_bar =
+ PropertyConfigBuilder()
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+ PropertyConfigProto prop_baz =
+ PropertyConfigBuilder()
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+
+ JoinablePropertyManager::Builder builder(*schema_type_mapper);
+ // Add "foo" and "bar" to "SchemaTypeOne" (schema_type_id = 0).
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, prop_foo, /*property_path=*/"foo"));
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, prop_bar, /*property_path=*/"bar"));
+ // Add "baz" to "SchemaTypeTwo" (schema_type_id = 1).
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/1, prop_baz, /*property_path=*/"baz"));
+
+ std::unique_ptr<JoinablePropertyManager> joinable_property_manager =
+ std::move(builder).Build();
+ // Check "SchemaTypeOne"
+ EXPECT_THAT(
+ joinable_property_manager->GetMetadataList("SchemaTypeOne"),
+ IsOkAndHolds(Pointee(ElementsAre(
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/0, /*expected_property_path=*/"foo", prop_foo),
+ EqualsJoinablePropertyMetadata(/*expected_id=*/1,
+ /*expected_property_path=*/"bar",
+ prop_bar)))));
+ // Check "SchemaTypeTwo"
+ EXPECT_THAT(
+ joinable_property_manager->GetMetadataList("SchemaTypeTwo"),
+ IsOkAndHolds(Pointee(ElementsAre(EqualsJoinablePropertyMetadata(
+ /*expected_id=*/0, /*expected_property_path=*/"baz", prop_baz)))));
+}
+
+TEST_F(JoinablePropertyManagerBuilderTest, TooManyPropertiesShouldFail) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put("SchemaType", 0));
+
+ JoinablePropertyManager::Builder builder(*schema_type_mapper);
+  // Add kTotalNumJoinableProperties joinable properties.
+ for (int i = 0; i < kTotalNumJoinableProperties; i++) {
+ PropertyConfigProto property_config =
+ PropertyConfigBuilder()
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, property_config,
+ /*property_path=*/"property" + std::to_string(i)));
+ }
+
+ // Add another joinable property. This should fail.
+ PropertyConfigProto property_config =
+ PropertyConfigBuilder()
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+ EXPECT_THAT(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, property_config,
+ /*property_path=*/"propertyExceed"),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE,
+ HasSubstr("Too many properties")));
+}
+
+TEST_F(JoinablePropertyManagerBuilderTest, InvalidSchemaTypeIdShouldFail) {
+  // Create a valid schema type mapper, then try to process a property config
+  // with an invalid (negative) schema type id. This should fail.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put("SchemaType", 0));
+
+ PropertyConfigProto property_config =
+ PropertyConfigBuilder()
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+
+ JoinablePropertyManager::Builder builder(*schema_type_mapper);
+ EXPECT_THAT(
+ builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/-1, property_config, /*property_path=*/"property"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(JoinablePropertyManagerBuilderTest,
+ SchemaTypeIdInconsistentWithSchemaTypeMapperSizeShouldFail) {
+  // Create a schema type mapper that contains schema type id = 2 even though
+  // the mapper holds only 2 entries. Since JoinablePropertyManager::Builder
+  // expects the 2 schema type ids to be [0, 1], processing a property with
+  // schema type id = 2 should fail even though id = 2 is present in the
+  // schema type mapper.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put("SchemaTypeOne", 0));
+ ICING_ASSERT_OK(schema_type_mapper->Put("SchemaTypeTwo", 2));
+
+ PropertyConfigProto property_config =
+ PropertyConfigBuilder()
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+
+ JoinablePropertyManager::Builder builder(*schema_type_mapper);
+ EXPECT_THAT(
+ builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/2, property_config, /*property_path=*/"property"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(JoinablePropertyManagerBuilderTest,
+ NonStringPropertiesWithQualifiedIdJoinableConfigShouldNotProcess) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put("SchemaTypeOne", 0));
+ ICING_ASSERT_OK(schema_type_mapper->Put("SchemaTypeTwo", 1));
+
+ // Create non-string properties with QUALIFIED_ID joinable value type.
+ std::vector<PropertyConfigProto> properties = {
+ PropertyConfigBuilder()
+ .SetName("int1")
+ .SetDataType(TYPE_INT64)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("int2")
+ .SetDataType(TYPE_INT64)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("double1")
+ .SetDataType(TYPE_DOUBLE)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("double2")
+ .SetDataType(TYPE_DOUBLE)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("boolean1")
+ .SetDataType(TYPE_BOOLEAN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("boolean2")
+ .SetDataType(TYPE_BOOLEAN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("bytes1")
+ .SetDataType(TYPE_BYTES)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("bytes2")
+ .SetDataType(TYPE_BYTES)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("document1")
+ .SetDataTypeDocument(/*schema_type=*/"SchemaTypeTwo",
+ /*index_nested_properties=*/true)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("document2")
+ .SetDataTypeDocument(/*schema_type=*/"SchemaTypeTwo",
+ /*index_nested_properties=*/true)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build()};
+
+ JoinablePropertyManager::Builder builder(*schema_type_mapper);
+ for (const PropertyConfigProto& property_config : properties) {
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, property_config,
+ std::string(property_config.property_name())));
+ }
+
+ std::unique_ptr<JoinablePropertyManager> joinable_property_manager =
+ std::move(builder).Build();
+ EXPECT_THAT(joinable_property_manager->GetMetadataList("SchemaTypeOne"),
+ IsOkAndHolds(Pointee(IsEmpty())));
+}
+
+class JoinablePropertyManagerBuilderWithJoinablePropertyTest
+ : public JoinablePropertyManagerBuilderTest,
+ public ::testing::WithParamInterface<PropertyConfigProto> {};
+
+TEST_P(JoinablePropertyManagerBuilderWithJoinablePropertyTest, Build) {
+ static constexpr std::string_view kSchemaType = "type";
+ static constexpr std::string_view kPropertyPath = "foo.bar";
+ const PropertyConfigProto& property_config = GetParam();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put(kSchemaType, 0));
+
+ JoinablePropertyManager::Builder builder(*schema_type_mapper);
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, property_config, std::string(kPropertyPath)));
+
+ std::unique_ptr<JoinablePropertyManager> joinable_property_manager =
+ std::move(builder).Build();
+ EXPECT_THAT(
+ joinable_property_manager->GetMetadataList(std::string(kSchemaType)),
+ IsOkAndHolds(Pointee(ElementsAre(EqualsJoinablePropertyMetadata(
+ /*expected_id=*/0, kPropertyPath, property_config)))));
+}
+
+// The following type is considered joinable:
+// - String with QUALIFIED_ID joinable value type
+INSTANTIATE_TEST_SUITE_P(
+ JoinablePropertyManagerBuilderWithJoinablePropertyTest,
+ JoinablePropertyManagerBuilderWithJoinablePropertyTest,
+ testing::Values(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+                    // An indexable string can be configured as joinable as
+                    // well. For convenience, just test one indexing config.
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build()));
+
+class JoinablePropertyManagerBuilderWithNonJoinablePropertyTest
+ : public JoinablePropertyManagerBuilderTest,
+ public ::testing::WithParamInterface<PropertyConfigProto> {};
+
+TEST_P(JoinablePropertyManagerBuilderWithNonJoinablePropertyTest, Build) {
+ static constexpr std::string_view kSchemaType = "type";
+ static constexpr std::string_view kPropertyPath = "foo.bar";
+ const PropertyConfigProto& property_config = GetParam();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put(kSchemaType, 0));
+
+ JoinablePropertyManager::Builder builder(*schema_type_mapper);
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, property_config, std::string(kPropertyPath)));
+
+ std::unique_ptr<JoinablePropertyManager> joinable_property_manager =
+ std::move(builder).Build();
+ EXPECT_THAT(
+ joinable_property_manager->GetMetadataList(std::string(kSchemaType)),
+ IsOkAndHolds(Pointee(IsEmpty())));
+}
+
+// All types without JoinableConfig (i.e. joinable value type = NONE by default)
+// are considered non-joinable. Other mismatching types (e.g. non-string
+// properties with QUALIFIED_ID joinable value type) were tested individually
+// above.
+INSTANTIATE_TEST_SUITE_P(
+ JoinablePropertyManagerBuilderWithNonJoinablePropertyTest,
+ JoinablePropertyManagerBuilderWithNonJoinablePropertyTest,
+ testing::Values(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ // Indexable but non-joinable string
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_DOUBLE)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_BOOLEAN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_BYTES)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeDocument("anotherSchema",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeDocument("anotherSchema",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build()));
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/joinable-property-manager.cc b/icing/schema/joinable-property-manager.cc
new file mode 100644
index 0000000..1606abb
--- /dev/null
+++ b/icing/schema/joinable-property-manager.cc
@@ -0,0 +1,203 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/joinable-property-manager.h"
+
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/schema/property-util.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Helper function to append a new joinable property metadata to the given
+// metadata list wrapper and update its reverse lookup map.
+libtextclassifier3::Status AppendNewJoinablePropertyMetadata(
+ JoinablePropertyManager::JoinablePropertyMetadataListWrapper*
+ metadata_list_wrapper,
+ std::string&& concatenated_path,
+ PropertyConfigProto::DataType::Code data_type,
+ JoinableConfig::ValueType::Code value_type) {
+  // Validates the next joinable property id and makes sure that it equals the
+  // list index, so that any joinable property metadata can later be looked up
+  // by id in O(1).
+ JoinablePropertyId new_id = static_cast<JoinablePropertyId>(
+ metadata_list_wrapper->metadata_list.size());
+ if (!IsJoinablePropertyIdValid(new_id)) {
+ // Max number of joinable properties reached
+ return absl_ports::OutOfRangeError(
+ IcingStringUtil::StringPrintf("Too many properties to be joinable, max "
+ "number of properties allowed: %d",
+ kTotalNumJoinableProperties));
+ }
+
+  // Creates the joinable property metadata and updates the reverse lookup
+  // map.
+ metadata_list_wrapper->metadata_list.push_back(JoinablePropertyMetadata(
+ new_id, data_type, value_type, std::move(concatenated_path)));
+ metadata_list_wrapper->property_path_to_id_map.insert(
+ {metadata_list_wrapper->metadata_list.back().path, new_id});
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename T>
+void AppendJoinablePropertyContent(
+ JoinablePropertyMetadata joinable_property_metadata,
+ libtextclassifier3::StatusOr<std::vector<T>>&& joinable_property_content_or,
+ std::vector<JoinableProperty<T>>& joinable_property_out) {
+ if (!joinable_property_content_or.ok()) {
+ return;
+ }
+
+ std::vector<T> joinable_property_content =
+ std::move(joinable_property_content_or).ValueOrDie();
+ if (!joinable_property_content.empty()) {
+ // Adds to result vector if joinable property is found in document
+ joinable_property_out.emplace_back(std::move(joinable_property_metadata),
+ std::move(joinable_property_content));
+ }
+}
+
+} // namespace
+
+libtextclassifier3::Status
+JoinablePropertyManager::Builder::ProcessSchemaTypePropertyConfig(
+ SchemaTypeId schema_type_id, const PropertyConfigProto& property_config,
+ std::string&& property_path) {
+ if (schema_type_id < 0 ||
+ schema_type_id >=
+ static_cast<int64_t>(joinable_property_metadata_cache_.size())) {
+ return absl_ports::InvalidArgumentError("Invalid schema type id");
+ }
+
+ switch (property_config.data_type()) {
+ case PropertyConfigProto::DataType::STRING: {
+ if (property_config.joinable_config().value_type() ==
+ JoinableConfig::ValueType::QUALIFIED_ID) {
+ ICING_RETURN_IF_ERROR(AppendNewJoinablePropertyMetadata(
+ &joinable_property_metadata_cache_[schema_type_id],
+ std::move(property_path), PropertyConfigProto::DataType::STRING,
+ JoinableConfig::ValueType::QUALIFIED_ID));
+ }
+ break;
+ }
+ default: {
+ // Skip other data types.
+ break;
+ }
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<JoinablePropertyGroup>
+JoinablePropertyManager::ExtractJoinableProperties(
+ const DocumentProto& document) const {
+ ICING_ASSIGN_OR_RETURN(
+ const std::vector<JoinablePropertyMetadata>* metadata_list,
+ GetMetadataList(document.schema()));
+ JoinablePropertyGroup joinable_property_group;
+ for (const JoinablePropertyMetadata& joinable_property_metadata :
+ *metadata_list) {
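+    // Only string properties with QUALIFIED_ID joinable value type are
+    // extracted for now; all other data types fall through to the default
+    // case and are skipped.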
+ switch (joinable_property_metadata.data_type) {
+ case PropertyConfigProto::DataType::STRING: {
+ if (joinable_property_metadata.value_type ==
+ JoinableConfig::ValueType::QUALIFIED_ID) {
+ AppendJoinablePropertyContent(
+ joinable_property_metadata,
+ property_util::ExtractPropertyValuesFromDocument<
+ std::string_view>(document, joinable_property_metadata.path),
+ joinable_property_group.qualified_id_properties);
+ }
+ break;
+ }
+ default: {
+ // Skip other data types.
+ break;
+ }
+ }
+ }
+ return joinable_property_group;
+}
+
+libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
+JoinablePropertyManager::GetJoinablePropertyMetadata(
+ SchemaTypeId schema_type_id, const std::string& property_path) const {
+ if (schema_type_id < 0 ||
+ schema_type_id >=
+ static_cast<int64_t>(joinable_property_metadata_cache_.size())) {
+ return absl_ports::InvalidArgumentError("Invalid schema type id");
+ }
+
+ const auto iter = joinable_property_metadata_cache_[schema_type_id]
+ .property_path_to_id_map.find(property_path);
+ if (iter == joinable_property_metadata_cache_[schema_type_id]
+ .property_path_to_id_map.end()) {
+ return nullptr;
+ }
+
+ JoinablePropertyId joinable_property_id = iter->second;
+ return &joinable_property_metadata_cache_[schema_type_id]
+ .metadata_list[joinable_property_id];
+}
+
+libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
+JoinablePropertyManager::GetJoinablePropertyMetadata(
+ SchemaTypeId schema_type_id,
+ JoinablePropertyId joinable_property_id) const {
+ if (schema_type_id < 0 ||
+ schema_type_id >=
+ static_cast<int64_t>(joinable_property_metadata_cache_.size())) {
+ return absl_ports::InvalidArgumentError("Invalid schema type id");
+ }
+ if (!IsJoinablePropertyIdValid(joinable_property_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Invalid joinable property id %d", joinable_property_id));
+ }
+
+ const std::vector<JoinablePropertyMetadata>& metadata_list =
+ joinable_property_metadata_cache_[schema_type_id].metadata_list;
+ if (joinable_property_id >= metadata_list.size()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Joinable property with id %d doesn't exist in type config id %d",
+ joinable_property_id, schema_type_id));
+ }
+
+ // The index of metadata list is the same as the joinable property id, so we
+ // can use joinable property id as the index.
+ return &metadata_list[joinable_property_id];
+}
+
+libtextclassifier3::StatusOr<const std::vector<JoinablePropertyMetadata>*>
+JoinablePropertyManager::GetMetadataList(
+ const std::string& type_config_name) const {
+ ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
+ schema_type_mapper_.Get(type_config_name));
+ return &joinable_property_metadata_cache_.at(schema_type_id).metadata_list;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/joinable-property-manager.h b/icing/schema/joinable-property-manager.h
new file mode 100644
index 0000000..3ee5963
--- /dev/null
+++ b/icing/schema/joinable-property-manager.h
@@ -0,0 +1,160 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCHEMA_JOINABLE_PROPERTY_MANAGER_H_
+#define ICING_SCHEMA_JOINABLE_PROPERTY_MANAGER_H_
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/proto/document.pb.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/key-mapper.h"
+
+namespace icing {
+namespace lib {
+
+// This class provides joinable-property-related operations. It assigns
+// joinable property ids to properties according to their JoinableConfig and
+// extracts joinable property values from documents.
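+//
+// Instances are created via JoinablePropertyManager::Builder (or, as the unit
+// tests do, via the SchemaTypeManager factory) and are immutable once built.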
+class JoinablePropertyManager {
+ public:
+  // A wrapper struct that contains a vector of joinable property metadata and
+  // a property-path-to-JoinablePropertyId reverse lookup map.
+ struct JoinablePropertyMetadataListWrapper {
+ std::vector<JoinablePropertyMetadata> metadata_list;
+ std::unordered_map<std::string, JoinablePropertyId> property_path_to_id_map;
+ };
+
+  // Builder class to create a JoinablePropertyManager. The builder does not
+  // take ownership of any input components; all inputs must refer to valid
+  // objects that outlive the created JoinablePropertyManager instance.
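+  //
+  // A minimal usage sketch, mirroring the unit tests ("schema_type_mapper"
+  // and "property_config" are assumed to be valid, pre-populated inputs):
+  //
+  //   JoinablePropertyManager::Builder builder(*schema_type_mapper);
+  //   ICING_RETURN_IF_ERROR(builder.ProcessSchemaTypePropertyConfig(
+  //       /*schema_type_id=*/0, property_config, /*property_path=*/"foo"));
+  //   std::unique_ptr<JoinablePropertyManager> manager =
+  //       std::move(builder).Build();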
+ class Builder {
+ public:
+ explicit Builder(const KeyMapper<SchemaTypeId>& schema_type_mapper)
+ : schema_type_mapper_(schema_type_mapper),
+ joinable_property_metadata_cache_(schema_type_mapper.num_keys()) {}
+
+ // Checks and appends a new JoinablePropertyMetadata for the schema type id
+ // if the given property config is joinable.
+ //
+ // Returns:
+ // - OK on success
+ // - INVALID_ARGUMENT_ERROR if schema type id is invalid (not in range [0,
+ // schema_type_mapper_.num_keys() - 1])
+ // - OUT_OF_RANGE_ERROR if # of joinable properties in a single Schema
+ // exceeds the threshold (kTotalNumJoinableProperties)
+ libtextclassifier3::Status ProcessSchemaTypePropertyConfig(
+ SchemaTypeId schema_type_id, const PropertyConfigProto& property_config,
+ std::string&& property_path);
+
+ // Builds and returns a JoinablePropertyManager instance.
+ std::unique_ptr<JoinablePropertyManager> Build() && {
+ return std::unique_ptr<JoinablePropertyManager>(
+ new JoinablePropertyManager(
+ schema_type_mapper_,
+ std::move(joinable_property_metadata_cache_)));
+ }
+
+ private:
+ const KeyMapper<SchemaTypeId>& schema_type_mapper_; // Does not own.
+ std::vector<JoinablePropertyMetadataListWrapper>
+ joinable_property_metadata_cache_;
+ };
+
+ JoinablePropertyManager(const JoinablePropertyManager&) = delete;
+ JoinablePropertyManager& operator=(const JoinablePropertyManager&) = delete;
+
+  // Extracts all joinable property contents of different types from the given
+  // document and groups them by joinable value type.
+ // - Joinable properties are sorted by joinable property id in ascending
+ // order.
+ // - Joinable property ids start from 0.
+ // - Joinable properties with empty content won't be returned.
+ //
+ // Returns:
+ // - A JoinablePropertyGroup instance on success
+ // - NOT_FOUND_ERROR if the type config name of document is not present in
+ // schema_type_mapper_
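+  //
+  // Example (illustrative; "manager" and "document" are assumed names):
+  //   ICING_ASSIGN_OR_RETURN(JoinablePropertyGroup group,
+  //                          manager.ExtractJoinableProperties(document));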
+ libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties(
+ const DocumentProto& document) const;
+
+  // Returns the JoinablePropertyMetadata associated with property_path for
+  // the schema type with the given SchemaTypeId.
+ //
+ // Returns:
+ // - Valid pointer to JoinablePropertyMetadata on success
+ // - nullptr if property_path doesn't exist (or is not joinable) in the
+ // joinable metadata list of the schema
+ // - INVALID_ARGUMENT_ERROR if schema type id is invalid
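+  //
+  // Note that, unlike the id-based overload below, an unknown property_path
+  // yields a nullptr value wrapped in an OK status rather than an error.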
+ libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
+ GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,
+ const std::string& property_path) const;
+
+  // Returns the JoinablePropertyMetadata associated with the given
+  // JoinablePropertyId for the schema type with the given SchemaTypeId.
+ //
+ // Returns:
+ // - Valid pointer to JoinablePropertyMetadata on success
+ // - INVALID_ARGUMENT_ERROR if schema type id or JoinablePropertyId is
+ // invalid
+ libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
+ GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,
+ JoinablePropertyId joinable_property_id) const;
+
+ // Returns:
+  //   - On success, the joinable property metadata list for the specified
+  //     type
+ // - NOT_FOUND_ERROR if the type config name is not present in
+ // schema_type_mapper_
+ libtextclassifier3::StatusOr<const std::vector<JoinablePropertyMetadata>*>
+ GetMetadataList(const std::string& type_config_name) const;
+
+ private:
+ explicit JoinablePropertyManager(
+ const KeyMapper<SchemaTypeId>& schema_type_mapper,
+ std::vector<JoinablePropertyMetadataListWrapper>&&
+ joinable_property_metadata_cache)
+ : schema_type_mapper_(schema_type_mapper),
+        joinable_property_metadata_cache_(
+            std::move(joinable_property_metadata_cache)) {}
+
+ // Maps schema types to a densely-assigned unique id.
+ const KeyMapper<SchemaTypeId>& schema_type_mapper_; // Does not own
+
+ // The index of joinable_property_metadata_cache_ corresponds to a schema
+ // type's SchemaTypeId. At that SchemaTypeId index, we store a
+ // JoinablePropertyMetadataListWrapper instance. The metadata list's index
+ // corresponds to a joinable property's JoinablePropertyId. At the
+ // JoinablePropertyId index, we store the JoinablePropertyMetadata of that
+ // joinable property.
+ //
+ // For example, suppose "email" has a SchemaTypeId of 0 and it has a joinable
+ // property called "senderQualifiedId" with a JoinablePropertyId of 1. Then
+ // the "senderQualifiedId" property's JoinablePropertyMetadata will be at
+ // joinable_property_metadata_cache_[0].metadata_list[1], and
+ // joinable_property_metadata_cache_[0]
+ // .property_path_to_id_map["senderQualifiedId"]
+ // will be 1.
+ const std::vector<JoinablePropertyMetadataListWrapper>
+ joinable_property_metadata_cache_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCHEMA_JOINABLE_PROPERTY_MANAGER_H_
diff --git a/icing/schema/joinable-property-manager_test.cc b/icing/schema/joinable-property-manager_test.cc
new file mode 100644
index 0000000..ceaaa18
--- /dev/null
+++ b/icing/schema/joinable-property-manager_test.cc
@@ -0,0 +1,519 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/joinable-property-manager.h"
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/schema/schema-type-manager.h"
+#include "icing/schema/schema-util.h"
+#include "icing/store/dynamic-trie-key-mapper.h"
+#include "icing/store/key-mapper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::IsNull;
+using ::testing::Pointee;
+using ::testing::SizeIs;
+
+// type and property names of Email
+static constexpr char kTypeEmail[] = "Email";
+// joinable
+static constexpr char kPropertyReceiverQualifiedId[] = "receiverQualifiedId";
+static constexpr char kPropertySenderQualifiedId[] = "senderQualifiedId";
+// non-joinable
+static constexpr char kPropertyAttachment[] = "attachment";
+static constexpr char kPropertySubject[] = "subject";
+static constexpr char kPropertyText[] = "text";
+static constexpr char kPropertyTimestamp[] = "timestamp";
+
+// type and property names of Conversation
+static constexpr char kTypeConversation[] = "Conversation";
+// joinable
+static constexpr char kPropertyEmails[] = "emails";
+static constexpr char kPropertyGroupQualifiedId[] = "groupQualifiedId";
+// non-joinable
+static constexpr char kPropertyName[] = "name";
+static constexpr char kPropertyNumber[] = "number";
+
+constexpr int64_t kDefaultTimestamp = 1663274901;
+
+PropertyConfigProto CreateSenderQualifiedIdPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertySenderQualifiedId)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+}
+
+PropertyConfigProto CreateReceiverQualifiedIdPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyReceiverQualifiedId)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+}
+
+PropertyConfigProto CreateGroupQualifiedIdPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyGroupQualifiedId)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+}
+
+SchemaTypeConfigProto CreateEmailTypeConfig() {
+ return SchemaTypeConfigBuilder()
+ .SetType(kTypeEmail)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertySubject)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyText)
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyAttachment)
+ .SetDataType(TYPE_BYTES)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyTimestamp)
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(CreateSenderQualifiedIdPropertyConfig())
+ .AddProperty(CreateReceiverQualifiedIdPropertyConfig())
+ .Build();
+}
+
+SchemaTypeConfigProto CreateConversationTypeConfig() {
+ return SchemaTypeConfigBuilder()
+ .SetType(kTypeConversation)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyName)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyNumber)
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(CreateGroupQualifiedIdPropertyConfig())
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyEmails)
+ .SetDataTypeDocument(kTypeEmail, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+}
+
+class JoinablePropertyManagerTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/icing";
+
+ type_config_map_.emplace(kTypeEmail, CreateEmailTypeConfig());
+ type_config_map_.emplace(kTypeConversation, CreateConversationTypeConfig());
+
+ email_document_ =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema(kTypeEmail)
+ .AddStringProperty(kPropertySubject, "the subject")
+ .AddStringProperty(kPropertyText, "the text")
+ .AddStringProperty(kPropertySenderQualifiedId, "pkg$db/ns#Person1")
+ .AddStringProperty(kPropertyReceiverQualifiedId,
+ "pkg$db/ns#Person2")
+ .AddBytesProperty(kPropertyAttachment, "attachment")
+ .AddInt64Property(kPropertyTimestamp, kDefaultTimestamp)
+ .Build();
+
+ conversation_document_ =
+ DocumentBuilder()
+ .SetKey("icing", "conversation/1")
+ .SetSchema(kTypeConversation)
+ .AddStringProperty(kPropertyName, "the conversation")
+ .AddInt64Property(kPropertyNumber, 2)
+ .AddDocumentProperty(kPropertyEmails,
+ DocumentProto(email_document_))
+ .AddStringProperty(kPropertyGroupQualifiedId,
+ "pkg$db/ns#GroupQualifiedId1")
+ .Build();
+
+    // DynamicTrieKeyMapper uses 3 internal arrays for bookkeeping. Give each
+    // one 128KiB so the total DynamicTrieKeyMapper should get 384KiB.
+ int key_mapper_size = 3 * 128 * 1024;
+ ICING_ASSERT_OK_AND_ASSIGN(schema_type_mapper_,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_, key_mapper_size));
+ ICING_ASSERT_OK(schema_type_mapper_->Put(kTypeEmail, 0));
+ ICING_ASSERT_OK(schema_type_mapper_->Put(kTypeConversation, 1));
+ }
+
+ void TearDown() override {
+ schema_type_mapper_.reset();
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ std::string test_dir_;
+ SchemaUtil::TypeConfigMap type_config_map_;
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_;
+
+ DocumentProto email_document_;
+ DocumentProto conversation_document_;
+};
+
+TEST_F(JoinablePropertyManagerTest, ExtractJoinableProperties) {
+ // Use SchemaTypeManager factory method to instantiate
+ // JoinablePropertyManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+  // Extracts all joinable properties from the 'Email' document.
+ ICING_ASSERT_OK_AND_ASSIGN(JoinablePropertyGroup joinable_property_group,
+ schema_type_manager->joinable_property_manager()
+ .ExtractJoinableProperties(email_document_));
+
+ // Qualified Id joinable properties
+ EXPECT_THAT(joinable_property_group.qualified_id_properties, SizeIs(2));
+
+ EXPECT_THAT(
+ joinable_property_group.qualified_id_properties[0].metadata,
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/0, /*expected_property_path=*/"receiverQualifiedId",
+ CreateReceiverQualifiedIdPropertyConfig()));
+ EXPECT_THAT(joinable_property_group.qualified_id_properties[0].values,
+ ElementsAre("pkg$db/ns#Person2"));
+
+ EXPECT_THAT(
+ joinable_property_group.qualified_id_properties[1].metadata,
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/1, /*expected_property_path=*/"senderQualifiedId",
+ CreateSenderQualifiedIdPropertyConfig()));
+ EXPECT_THAT(joinable_property_group.qualified_id_properties[1].values,
+ ElementsAre("pkg$db/ns#Person1"));
+}
+
+TEST_F(JoinablePropertyManagerTest, ExtractJoinablePropertiesNested) {
+ // Use SchemaTypeManager factory method to instantiate
+ // JoinablePropertyManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+  // Extracts all joinable properties from the 'Conversation' document.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ JoinablePropertyGroup joinable_property_group,
+ schema_type_manager->joinable_property_manager()
+ .ExtractJoinableProperties(conversation_document_));
+
+ // Qualified Id joinable properties
+ EXPECT_THAT(joinable_property_group.qualified_id_properties, SizeIs(3));
+
+ EXPECT_THAT(joinable_property_group.qualified_id_properties[0].metadata,
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/0,
+ /*expected_property_path=*/"emails.receiverQualifiedId",
+ CreateReceiverQualifiedIdPropertyConfig()));
+ EXPECT_THAT(joinable_property_group.qualified_id_properties[0].values,
+ ElementsAre("pkg$db/ns#Person2"));
+
+ EXPECT_THAT(joinable_property_group.qualified_id_properties[1].metadata,
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/1,
+ /*expected_property_path=*/"emails.senderQualifiedId",
+ CreateSenderQualifiedIdPropertyConfig()));
+ EXPECT_THAT(joinable_property_group.qualified_id_properties[1].values,
+ ElementsAre("pkg$db/ns#Person1"));
+
+ EXPECT_THAT(
+ joinable_property_group.qualified_id_properties[2].metadata,
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/2, /*expected_property_path=*/"groupQualifiedId",
+ CreateGroupQualifiedIdPropertyConfig()));
+ EXPECT_THAT(joinable_property_group.qualified_id_properties[2].values,
+ ElementsAre("pkg$db/ns#GroupQualifiedId1"));
+}
+
+TEST_F(JoinablePropertyManagerTest,
+ ExtractJoinablePropertiesShouldIgnoreEmptyContents) {
+ // Use SchemaTypeManager factory method to instantiate
+ // JoinablePropertyManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+ // Create an email document without receiverQualifiedId.
+ DocumentProto another_email_document =
+ DocumentBuilder()
+ .SetKey("icing", "email/2")
+ .SetSchema(kTypeEmail)
+ .AddStringProperty(kPropertySubject, "the subject")
+ .AddStringProperty(kPropertyText, "the text")
+ .AddBytesProperty(kPropertyAttachment, "attachment")
+ .AddStringProperty(kPropertySenderQualifiedId, "pkg$db/ns#Person1")
+ .AddInt64Property(kPropertyTimestamp, kDefaultTimestamp)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ JoinablePropertyGroup joinable_property_group,
+ schema_type_manager->joinable_property_manager()
+ .ExtractJoinableProperties(another_email_document));
+
+  // ExtractJoinableProperties should ignore receiverQualifiedId and not
+  // append a JoinableProperty instance for it to the vector.
+ EXPECT_THAT(joinable_property_group.qualified_id_properties, SizeIs(1));
+ EXPECT_THAT(
+ joinable_property_group.qualified_id_properties[0].metadata,
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/1, /*expected_property_path=*/"senderQualifiedId",
+ CreateSenderQualifiedIdPropertyConfig()));
+ EXPECT_THAT(joinable_property_group.qualified_id_properties[0].values,
+ ElementsAre("pkg$db/ns#Person1"));
+}
+
+TEST_F(JoinablePropertyManagerTest, GetJoinablePropertyMetadata) {
+ // Use SchemaTypeManager factory method to instantiate
+ // JoinablePropertyManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+ // Email (joinable property id -> joinable property path):
+ // 0 -> receiverQualifiedId
+ // 1 -> senderQualifiedId
+ EXPECT_THAT(
+ schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/0,
+ /*joinable_property_id=*/0),
+ IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+ /*expected_id=*/0, /*expected_property_path=*/"receiverQualifiedId",
+ CreateReceiverQualifiedIdPropertyConfig()))));
+ EXPECT_THAT(
+ schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/0,
+ /*joinable_property_id=*/1),
+ IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+ /*expected_id=*/1, /*expected_property_path=*/"senderQualifiedId",
+ CreateSenderQualifiedIdPropertyConfig()))));
+
+ // Conversation (joinable property id -> joinable property path):
+ // 0 -> emails.receiverQualifiedId
+ // 1 -> emails.senderQualifiedId
+ // 2 -> groupQualifiedId
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/1,
+ /*joinable_property_id=*/0),
+ IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+ /*expected_id=*/0,
+ /*expected_property_path=*/"emails.receiverQualifiedId",
+ CreateReceiverQualifiedIdPropertyConfig()))));
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/1,
+ /*joinable_property_id=*/1),
+ IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+ /*expected_id=*/1,
+ /*expected_property_path=*/"emails.senderQualifiedId",
+ CreateSenderQualifiedIdPropertyConfig()))));
+ EXPECT_THAT(
+ schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/1,
+ /*joinable_property_id=*/2),
+ IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+ /*expected_id=*/2, /*expected_property_path=*/"groupQualifiedId",
+ CreateGroupQualifiedIdPropertyConfig()))));
+}
+
+TEST_F(JoinablePropertyManagerTest,
+ GetJoinablePropertyMetadataInvalidSchemaTypeId) {
+ // Use SchemaTypeManager factory method to instantiate
+ // JoinablePropertyManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+ ASSERT_THAT(type_config_map_, SizeIs(2));
+
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/-1,
+ /*joinable_property_id=*/0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/2,
+ /*joinable_property_id=*/0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(JoinablePropertyManagerTest,
+ GetJoinablePropertyMetadataInvalidJoinablePropertyId) {
+ // Use SchemaTypeManager factory method to instantiate
+ // JoinablePropertyManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+ // Email (joinable property id -> joinable property path):
+ // 0 -> receiverQualifiedId
+ // 1 -> senderQualifiedId
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/0,
+ /*joinable_property_id=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/0,
+ /*joinable_property_id=*/2),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // Conversation (joinable property id -> joinable property path):
+ // 0 -> emails.receiverQualifiedId
+ // 1 -> emails.senderQualifiedId
+ // 2 -> groupQualifiedId
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/1,
+ /*joinable_property_id=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/1,
+ /*joinable_property_id=*/3),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(JoinablePropertyManagerTest, GetJoinablePropertyMetadataByPath) {
+ // Use SchemaTypeManager factory method to instantiate
+ // JoinablePropertyManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+ // Email (joinable property id -> joinable property path):
+ // 0 -> receiverQualifiedId
+ // 1 -> senderQualifiedId
+ EXPECT_THAT(
+ schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/0,
+ "receiverQualifiedId"),
+ IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+ /*expected_id=*/0, /*expected_property_path=*/"receiverQualifiedId",
+ CreateReceiverQualifiedIdPropertyConfig()))));
+ EXPECT_THAT(
+ schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/0,
+ "senderQualifiedId"),
+ IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+ /*expected_id=*/1, /*expected_property_path=*/"senderQualifiedId",
+ CreateSenderQualifiedIdPropertyConfig()))));
+
+ // Conversation (joinable property id -> joinable property path):
+ // 0 -> emails.receiverQualifiedId
+ // 1 -> emails.senderQualifiedId
+ // 2 -> groupQualifiedId
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/1,
+ "emails.receiverQualifiedId"),
+ IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+ /*expected_id=*/0,
+ /*expected_property_path=*/"emails.receiverQualifiedId",
+ CreateReceiverQualifiedIdPropertyConfig()))));
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/1,
+ "emails.senderQualifiedId"),
+ IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+ /*expected_id=*/1,
+ /*expected_property_path=*/"emails.senderQualifiedId",
+ CreateSenderQualifiedIdPropertyConfig()))));
+ EXPECT_THAT(
+ schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/1,
+ "groupQualifiedId"),
+ IsOkAndHolds(Pointee(EqualsJoinablePropertyMetadata(
+ /*expected_id=*/2, /*expected_property_path=*/"groupQualifiedId",
+ CreateGroupQualifiedIdPropertyConfig()))));
+}
+
+TEST_F(JoinablePropertyManagerTest,
+ GetJoinablePropertyMetadataByPathInvalidSchemaTypeId) {
+ // Use SchemaTypeManager factory method to instantiate
+ // JoinablePropertyManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+ ASSERT_THAT(type_config_map_, SizeIs(2));
+
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/-1,
+ "receiverQualifiedId"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/2,
+ "receiverQualifiedId"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(JoinablePropertyManagerTest, GetJoinablePropertyMetadataByPathNotExist) {
+ // Use SchemaTypeManager factory method to instantiate
+ // JoinablePropertyManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+ EXPECT_THAT(
+ schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/0, "nonExistingPath"),
+ IsOkAndHolds(IsNull()));
+ EXPECT_THAT(schema_type_manager->joinable_property_manager()
+ .GetJoinablePropertyMetadata(/*schema_type_id=*/1,
+ "emails.nonExistingPath"),
+ IsOkAndHolds(IsNull()));
+}
+
+// Note: the success case of GetMetadataList has been tested in
+// JoinablePropertyManagerBuildTest.
+TEST_F(JoinablePropertyManagerTest, GetMetadataListInvalidSchemaTypeName) {
+ // Use SchemaTypeManager factory method to instantiate
+ // JoinablePropertyManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+ EXPECT_THAT(schema_type_manager->joinable_property_manager().GetMetadataList(
+ "NonExistingSchemaTypeName"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/joinable-property.h b/icing/schema/joinable-property.h
new file mode 100644
index 0000000..057bb74
--- /dev/null
+++ b/icing/schema/joinable-property.h
@@ -0,0 +1,132 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCHEMA_JOINABLE_PROPERTY_H_
+#define ICING_SCHEMA_JOINABLE_PROPERTY_H_
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/proto/schema.pb.h"
+
+namespace icing {
+namespace lib {
+
+using JoinablePropertyId = int8_t;
+
+// 6 bits for 64 values.
+inline constexpr int kJoinablePropertyIdBits = 6;
+inline constexpr JoinablePropertyId kTotalNumJoinableProperties =
+ (INT8_C(1) << kJoinablePropertyIdBits);
+inline constexpr JoinablePropertyId kInvalidJoinablePropertyId =
+ kTotalNumJoinableProperties;
+inline constexpr JoinablePropertyId kMaxJoinablePropertyId =
+ kTotalNumJoinableProperties - 1;
+inline constexpr JoinablePropertyId kMinJoinablePropertyId = 0;
+
+constexpr bool IsJoinablePropertyIdValid(
+ JoinablePropertyId joinable_property_id) {
+ return joinable_property_id >= kMinJoinablePropertyId &&
+ joinable_property_id <= kMaxJoinablePropertyId;
+}
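+
+// Illustrative compile-time checks of the id range above (derived from the
+// constants in this file): with 6 id bits, ids 0 through 63 are valid, and
+// kInvalidJoinablePropertyId (64) is not.
+static_assert(IsJoinablePropertyIdValid(kMinJoinablePropertyId),
+              "kMinJoinablePropertyId must be valid");
+static_assert(IsJoinablePropertyIdValid(kMaxJoinablePropertyId),
+              "kMaxJoinablePropertyId must be valid");
+static_assert(!IsJoinablePropertyIdValid(kInvalidJoinablePropertyId),
+              "kInvalidJoinablePropertyId must not be valid");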
+
+static_assert(
+ kJoinablePropertyIdBits < 8 * sizeof(JoinablePropertyId),
+ "Cannot exhaust all bits of JoinablePropertyId since it is a signed "
+ "integer and the most significant bit should be preserved.");
+
+struct JoinablePropertyMetadata {
+  // Dot-joined property names, representing the location of the joinable
+  // property inside a document. E.g. "property1.property2".
+ std::string path;
+
+  // A unique id of the joinable property.
+ JoinablePropertyId id;
+
+  // Data type of this joinable property's values. Currently we only support
+  // STRING.
+ PropertyConfigProto::DataType::Code data_type;
+
+ // How values will be used as a joining matcher.
+ //
+ // JoinableConfig::ValueType::QUALIFIED_ID:
+ // Value in this property is a joinable (string) qualified id. Qualified id
+ // is composed of namespace and uri, and it will be used as the identifier
+ // of the parent document. Note: it is invalid to use this value type with
+ // non-string DataType.
+ JoinableConfig::ValueType::Code value_type;
+
+ explicit JoinablePropertyMetadata(
+ JoinablePropertyId id_in,
+ PropertyConfigProto::DataType::Code data_type_in,
+ JoinableConfig::ValueType::Code value_type_in, std::string&& path_in)
+ : path(std::move(path_in)),
+ id(id_in),
+ data_type(data_type_in),
+ value_type(value_type_in) {}
+
+ JoinablePropertyMetadata(const JoinablePropertyMetadata& other) = default;
+ JoinablePropertyMetadata& operator=(const JoinablePropertyMetadata& other) =
+ default;
+
+ JoinablePropertyMetadata(JoinablePropertyMetadata&& other) = default;
+ JoinablePropertyMetadata& operator=(JoinablePropertyMetadata&& other) =
+ default;
+
+ bool operator==(const JoinablePropertyMetadata& rhs) const {
+ return path == rhs.path && id == rhs.id && data_type == rhs.data_type &&
+ value_type == rhs.value_type;
+ }
+};
+
+// JoinableProperty is an icing internal concept similar to document property
+// values (contents), but with extra metadata. The data type of the values is
+// specified by the template parameter.
+//
+// Current supported data types:
+// - std::string_view (PropertyConfigProto::DataType::STRING)
+template <typename T>
+struct JoinableProperty {
+ JoinablePropertyMetadata metadata;
+ std::vector<T> values;
+
+ explicit JoinableProperty(JoinablePropertyMetadata&& metadata_in,
+ std::vector<T>&& values_in)
+ : metadata(std::move(metadata_in)), values(std::move(values_in)) {}
+
+ PropertyConfigProto::DataType::Code data_type() const {
+ return metadata.data_type;
+ }
+
+ JoinableConfig::ValueType::Code value_type() const {
+ return metadata.value_type;
+ }
+};
+
+// Groups joinable properties by data type. Callers can access the joinable
+// property types they want without going through undesired ones.
+//
+// REQUIRES: the lifecycle of the underlying property must be longer than this
+// object, since we use std::string_view for extracting its string_values.
+struct JoinablePropertyGroup {
+ std::vector<JoinableProperty<std::string_view>> qualified_id_properties;
+};
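+
+// A minimal usage sketch (hypothetical caller code, assuming a populated
+// JoinablePropertyGroup named `group`):
+//
+//   for (const JoinableProperty<std::string_view>& property :
+//        group.qualified_id_properties) {
+//     // Use property.metadata.path, property.metadata.id, property.values.
+//   }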
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCHEMA_JOINABLE_PROPERTY_H_
diff --git a/icing/schema/property-util.cc b/icing/schema/property-util.cc
new file mode 100644
index 0000000..67ff748
--- /dev/null
+++ b/icing/schema/property-util.cc
@@ -0,0 +1,137 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/property-util.h"
+
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/absl_ports/str_join.h"
+#include "icing/proto/document.pb.h"
+
+namespace icing {
+namespace lib {
+
+namespace property_util {
+
+std::string ConvertToPropertyExprIndexStr(int index) {
+ if (index == kWildcardPropertyIndex) {
+ return "";
+ }
+ return absl_ports::StrCat(kLBracket, std::to_string(index), kRBracket);
+}
+
+std::string ConcatenatePropertyPathExpr(std::string_view property_path_expr1,
+ std::string_view property_path_expr2) {
+ if (property_path_expr1.empty()) {
+ return std::string(property_path_expr2);
+ }
+ if (property_path_expr2.empty()) {
+ return std::string(property_path_expr1);
+ }
+ return absl_ports::StrCat(property_path_expr1, kPropertyPathSeparator,
+ property_path_expr2);
+}
+
+std::vector<std::string_view> SplitPropertyPathExpr(
+ std::string_view property_path_expr) {
+ return absl_ports::StrSplit(property_path_expr, kPropertyPathSeparator);
+}
+
+PropertyInfo ParsePropertyNameExpr(std::string_view property_name_expr) {
+ size_t l_bracket = property_name_expr.find(kLBracket);
+ if (l_bracket == std::string_view::npos ||
+ l_bracket >= property_name_expr.length()) {
+ return PropertyInfo(std::string(property_name_expr),
+ kWildcardPropertyIndex);
+ }
+ size_t r_bracket = property_name_expr.find(kRBracket, l_bracket);
+ if (r_bracket == std::string_view::npos || r_bracket - l_bracket < 2) {
+ return PropertyInfo(std::string(property_name_expr),
+ kWildcardPropertyIndex);
+ }
+ std::string index_string = std::string(
+ property_name_expr.substr(l_bracket + 1, r_bracket - l_bracket - 1));
+ return PropertyInfo(std::string(property_name_expr.substr(0, l_bracket)),
+ std::stoi(index_string));
+}
+
+std::vector<PropertyInfo> ParsePropertyPathExpr(
+ std::string_view property_path_expr) {
+ std::vector<std::string_view> property_name_exprs =
+ SplitPropertyPathExpr(property_path_expr);
+
+ std::vector<PropertyInfo> property_infos;
+ property_infos.reserve(property_name_exprs.size());
+ for (std::string_view property_name_expr : property_name_exprs) {
+ property_infos.push_back(ParsePropertyNameExpr(property_name_expr));
+ }
+ return property_infos;
+}
+
+bool IsParentPropertyPath(std::string_view property_path_expr1,
+ std::string_view property_path_expr2) {
+ if (property_path_expr2.length() < property_path_expr1.length()) {
+ return false;
+ }
+ if (property_path_expr1 !=
+ property_path_expr2.substr(0, property_path_expr1.length())) {
+ return false;
+ }
+ if (property_path_expr2.length() > property_path_expr1.length() &&
+ property_path_expr2[property_path_expr1.length()] !=
+ kPropertyPathSeparator[0]) {
+ return false;
+ }
+ return true;
+}
+
+const PropertyProto* GetPropertyProto(const DocumentProto& document,
+ std::string_view property_name) {
+ for (const PropertyProto& property : document.properties()) {
+ if (property.name() == property_name) {
+ return &property;
+ }
+ }
+ return nullptr;
+}
+
+template <>
+libtextclassifier3::StatusOr<std::vector<std::string>>
+ExtractPropertyValues<std::string>(const PropertyProto& property) {
+ return std::vector<std::string>(property.string_values().begin(),
+ property.string_values().end());
+}
+
+template <>
+libtextclassifier3::StatusOr<std::vector<std::string_view>>
+ExtractPropertyValues<std::string_view>(const PropertyProto& property) {
+ return std::vector<std::string_view>(property.string_values().begin(),
+ property.string_values().end());
+}
+
+template <>
+libtextclassifier3::StatusOr<std::vector<int64_t>>
+ExtractPropertyValues<int64_t>(const PropertyProto& property) {
+ return std::vector<int64_t>(property.int64_values().begin(),
+ property.int64_values().end());
+}
+
+} // namespace property_util
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/property-util.h b/icing/schema/property-util.h
new file mode 100644
index 0000000..7557879
--- /dev/null
+++ b/icing/schema/property-util.h
@@ -0,0 +1,212 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCHEMA_PROPERTY_UTIL_H_
+#define ICING_SCHEMA_PROPERTY_UTIL_H_
+
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/proto/document.pb.h"
+
+namespace icing {
+namespace lib {
+
+namespace property_util {
+
+// Definition:
+// - Expr (short for expression): with or without index.
+// - property_name: one level of property name without index. E.g. "abc", "def".
+// - property_name_expr: one level of property name with or without index. E.g.
+// "abc", "abc[0]", "def[1]".
+// - property_path: multiple levels (including one) of property names without
+// indices. E.g. "abc", "abc.def".
+// - property_path_expr: multiple levels (including one) of property name
+// expressions. E.g. "abc", "abc[0]", "abc.def",
+// "abc[0].def", "abc[0].def[1]".
+//
+// Set relationship graph (A -> B: A is a subset of B):
+//
+// property_path -> property_path_expr
+// ^ ^
+// | |
+// property_name -> property_name_expr
+inline constexpr std::string_view kPropertyPathSeparator = ".";
+inline constexpr std::string_view kLBracket = "[";
+inline constexpr std::string_view kRBracket = "]";
+
+inline constexpr int kWildcardPropertyIndex = -1;
+
+struct PropertyInfo {
+ std::string name;
+ int index;
+
+ explicit PropertyInfo(std::string name_in, int index_in)
+ : name(std::move(name_in)), index(index_in) {}
+};
+
+// Converts a property (value) index to string, wrapped by kLBracket and
+// kRBracket.
+//
+// REQUIRES: index should be valid or kWildcardPropertyIndex.
+//
+// Returns:
+// - "" if index is kWildcardPropertyIndex.
+// - kLBracket + std::to_string(index) + kRBracket for all non
+// kWildcardPropertyIndex indices.
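+//
+// For example, ConvertToPropertyExprIndexStr(5) would return "[5]".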
+std::string ConvertToPropertyExprIndexStr(int index);
+
+// Concatenates 2 property path expressions.
+//
+// Returns:
+// - property_path_expr1 + "." + property_path_expr2 if both are not empty.
+// - property_path_expr1 if property_path_expr2 is empty.
+// - property_path_expr2 if property_path_expr1 is empty.
+// - "" if both are empty.
+std::string ConcatenatePropertyPathExpr(std::string_view property_path_expr1,
+ std::string_view property_path_expr2);
+
+// Splits a property path expression into multiple property name expressions.
+//
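+// For example, SplitPropertyPathExpr("abc[0].def") would return {"abc[0]",
+// "def"}; index expressions are kept intact within each name expression.
+//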
+// Returns: a vector of property name expressions.
+std::vector<std::string_view> SplitPropertyPathExpr(
+ std::string_view property_path_expr);
+
+// Parses a property name expression into (property name, property index). If
+// the index expression is missing, then the returned property index will be
+// kWildcardPropertyIndex.
+//
+// Examples:
+// - ParsePropertyNameExpr("foo") will return ("foo",
+// kWildcardPropertyIndex).
+// - ParsePropertyNameExpr("foo[5]") will return ("foo", 5).
+//
+// Returns: a PropertyInfo instance.
+PropertyInfo ParsePropertyNameExpr(std::string_view property_name_expr);
+
+// Parses a property path expression into multiple (property name, property
+// index) pairs. It is similar to ParsePropertyNameExpr, except that a property
+// path expression can contain multiple name expressions.
+//
+// Examples:
+// - ParsePropertyPathExpr("foo") will return [("foo",
+// kWildcardPropertyIndex)].
+// - ParsePropertyPathExpr("foo[5]") will return [("foo", 5)].
+// - ParsePropertyPathExpr("foo.bar[2]") will return [("foo",
+// kWildcardPropertyIndex), ("bar", 2)]
+//
+// Returns: a vector of PropertyInfo instances.
+std::vector<PropertyInfo> ParsePropertyPathExpr(
+ std::string_view property_path_expr);
+
+// A property path property_path_expr1 is considered a parent of another
+// property path property_path_expr2 if:
+// 1. property_path_expr2 == property_path_expr1, OR
+// 2. property_path_expr2 consists of the entire path of property_path_expr1
+// + "." + [some other property path].
+//
+// Note that this can only be used for property name strings that do not
+// contain the property index.
+//
+// Examples:
+// - IsParentPropertyPath("foo", "foo") will return true.
+// - IsParentPropertyPath("foo", "foo.bar") will return true.
+// - IsParentPropertyPath("foo", "bar.foo") will return false.
+// - IsParentPropertyPath("foo.bar", "foo.foo.bar") will return false.
+//
+// Returns: true if property_path_expr1 is a parent property path of
+// property_path_expr2.
+bool IsParentPropertyPath(std::string_view property_path_expr1,
+ std::string_view property_path_expr2);
+
+// Gets the desired PropertyProto from the document by the given property name.
+// Since the input parameter is property name, this function only deals with
+// the first level of properties in the document and cannot deal with nested
+// documents.
+//
+// Returns:
+//   - const PropertyProto* if the property name exists in the document.
+//   - nullptr if the property name is not found.
+const PropertyProto* GetPropertyProto(const DocumentProto& document,
+ std::string_view property_name);
+
+template <typename T>
+libtextclassifier3::StatusOr<std::vector<T>> ExtractPropertyValues(
+ const PropertyProto& property) {
+ return absl_ports::UnimplementedError(
+ "Unimplemented template type for ExtractPropertyValues");
+}
+
+template <>
+libtextclassifier3::StatusOr<std::vector<std::string>>
+ExtractPropertyValues<std::string>(const PropertyProto& property);
+
+template <>
+libtextclassifier3::StatusOr<std::vector<std::string_view>>
+ExtractPropertyValues<std::string_view>(const PropertyProto& property);
+
+template <>
+libtextclassifier3::StatusOr<std::vector<int64_t>>
+ExtractPropertyValues<int64_t>(const PropertyProto& property);
+
+template <typename T>
+libtextclassifier3::StatusOr<std::vector<T>> ExtractPropertyValuesFromDocument(
+ const DocumentProto& document, std::string_view property_path) {
+  // Finds the first property name in property_path.
+ size_t separator_position = property_path.find(kPropertyPathSeparator);
+ std::string_view current_property_name =
+ (separator_position == std::string::npos)
+ ? property_path
+ : property_path.substr(0, separator_position);
+
+ const PropertyProto* property_proto =
+ GetPropertyProto(document, current_property_name);
+ if (property_proto == nullptr) {
+    // Property name not found. It could be one of the following 2 cases:
+    // 1. The property is optional and it's not in the document.
+    // 2. The property name is invalid.
+ return std::vector<T>();
+ }
+
+ if (separator_position == std::string::npos) {
+ // Current property name is the last one in property path.
+ return ExtractPropertyValues<T>(*property_proto);
+ }
+
+  // Extracts property values recursively.
+ std::string_view sub_property_path =
+ property_path.substr(separator_position + 1);
+ std::vector<T> nested_document_content;
+ for (const DocumentProto& nested_document :
+ property_proto->document_values()) {
+ auto content_or = ExtractPropertyValuesFromDocument<T>(nested_document,
+ sub_property_path);
+ if (content_or.ok()) {
+ std::vector<T> content = std::move(content_or).ValueOrDie();
+ std::move(content.begin(), content.end(),
+ std::back_inserter(nested_document_content));
+ }
+ }
+ return nested_document_content;
+}
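+
+// A minimal usage sketch for ExtractPropertyValuesFromDocument (the property
+// path "sender.name" here is hypothetical): values from all nested documents
+// matching the path are concatenated in document order.
+//
+//   libtextclassifier3::StatusOr<std::vector<std::string_view>> values_or =
+//       ExtractPropertyValuesFromDocument<std::string_view>(document,
+//                                                           "sender.name");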
+
+} // namespace property_util
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCHEMA_PROPERTY_UTIL_H_
diff --git a/icing/schema/property-util_test.cc b/icing/schema/property-util_test.cc
new file mode 100644
index 0000000..eddcc84
--- /dev/null
+++ b/icing/schema/property-util_test.cc
@@ -0,0 +1,253 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/property-util.h"
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/proto/document.pb.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+
+static constexpr std::string_view kTypeTest = "Test";
+static constexpr std::string_view kPropertySingleString = "singleString";
+static constexpr std::string_view kPropertyRepeatedString = "repeatedString";
+static constexpr std::string_view kPropertySingleInteger = "singleInteger";
+static constexpr std::string_view kPropertyRepeatedInteger = "repeatedInteger";
+
+static constexpr std::string_view kTypeNestedTest = "NestedTest";
+static constexpr std::string_view kPropertyStr = "str";
+static constexpr std::string_view kPropertyNestedDocument = "nestedDocument";
+
+TEST(PropertyUtilTest, IsParentPropertyPath) {
+ EXPECT_TRUE(property_util::IsParentPropertyPath("foo", "foo"));
+ EXPECT_TRUE(property_util::IsParentPropertyPath("foo", "foo.bar"));
+ EXPECT_TRUE(property_util::IsParentPropertyPath("foo", "foo.bar.foo"));
+ EXPECT_TRUE(property_util::IsParentPropertyPath("foo", "foo.foo.bar"));
+ EXPECT_TRUE(property_util::IsParentPropertyPath("foo.bar", "foo.bar.foo"));
+
+ EXPECT_FALSE(property_util::IsParentPropertyPath("foo", "foofoo.bar"));
+ EXPECT_FALSE(property_util::IsParentPropertyPath("foo.bar", "foo.foo.bar"));
+ EXPECT_FALSE(property_util::IsParentPropertyPath("foo.bar", "foofoo.bar"));
+ EXPECT_FALSE(property_util::IsParentPropertyPath("foo.bar.foo", "foo"));
+ EXPECT_FALSE(property_util::IsParentPropertyPath("foo.bar.foo", "foo.bar"));
+ EXPECT_FALSE(
+ property_util::IsParentPropertyPath("foo.foo.bar", "foo.bar.foo"));
+ EXPECT_FALSE(property_util::IsParentPropertyPath("foo", "foo#bar.foo"));
+}
+
+TEST(PropertyUtilTest, ExtractPropertyValuesTypeString) {
+ PropertyProto property;
+ property.mutable_string_values()->Add("Hello, world");
+ property.mutable_string_values()->Add("Foo");
+ property.mutable_string_values()->Add("Bar");
+
+ EXPECT_THAT(property_util::ExtractPropertyValues<std::string>(property),
+ IsOkAndHolds(ElementsAre("Hello, world", "Foo", "Bar")));
+
+ EXPECT_THAT(property_util::ExtractPropertyValues<std::string_view>(property),
+ IsOkAndHolds(ElementsAre("Hello, world", "Foo", "Bar")));
+}
+
+TEST(PropertyUtilTest, ExtractPropertyValuesTypeInteger) {
+ PropertyProto property;
+ property.mutable_int64_values()->Add(123);
+ property.mutable_int64_values()->Add(-456);
+ property.mutable_int64_values()->Add(0);
+
+ EXPECT_THAT(property_util::ExtractPropertyValues<int64_t>(property),
+ IsOkAndHolds(ElementsAre(123, -456, 0)));
+}
+
+TEST(PropertyUtilTest, ExtractPropertyValuesMismatchedType) {
+ PropertyProto property;
+ property.mutable_int64_values()->Add(123);
+ property.mutable_int64_values()->Add(-456);
+ property.mutable_int64_values()->Add(0);
+
+ EXPECT_THAT(property_util::ExtractPropertyValues<std::string_view>(property),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST(PropertyUtilTest, ExtractPropertyValuesEmpty) {
+ PropertyProto property;
+ EXPECT_THAT(property_util::ExtractPropertyValues<std::string>(property),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(property_util::ExtractPropertyValues<std::string_view>(property),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(property_util::ExtractPropertyValues<int64_t>(property),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST(PropertyUtilTest, ExtractPropertyValuesTypeUnimplemented) {
+ PropertyProto property;
+ EXPECT_THAT(property_util::ExtractPropertyValues<int32_t>(property),
+ StatusIs(libtextclassifier3::StatusCode::UNIMPLEMENTED));
+}
+
+TEST(PropertyUtilTest, ExtractPropertyValuesFromDocument) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "test/1")
+ .SetSchema(std::string(kTypeTest))
+ .AddStringProperty(std::string(kPropertySingleString), "single")
+ .AddStringProperty(std::string(kPropertyRepeatedString), "repeated1",
+ "repeated2", "repeated3")
+ .AddInt64Property(std::string(kPropertySingleInteger), 123)
+ .AddInt64Property(std::string(kPropertyRepeatedInteger), 1, 2, 3)
+ .Build();
+
+ // Single string
+ EXPECT_THAT(
+ property_util::ExtractPropertyValuesFromDocument<std::string_view>(
+ document, /*property_path=*/kPropertySingleString),
+ IsOkAndHolds(ElementsAre("single")));
+ // Repeated string
+ EXPECT_THAT(
+ property_util::ExtractPropertyValuesFromDocument<std::string_view>(
+ document, /*property_path=*/kPropertyRepeatedString),
+ IsOkAndHolds(ElementsAre("repeated1", "repeated2", "repeated3")));
+ // Single integer
+ EXPECT_THAT(property_util::ExtractPropertyValuesFromDocument<int64_t>(
+ document, /*property_path=*/kPropertySingleInteger),
+ IsOkAndHolds(ElementsAre(123)));
+ // Repeated integer
+ EXPECT_THAT(property_util::ExtractPropertyValuesFromDocument<int64_t>(
+ document, /*property_path=*/kPropertyRepeatedInteger),
+ IsOkAndHolds(ElementsAre(1, 2, 3)));
+}
+
+TEST(PropertyUtilTest, ExtractPropertyValuesFromDocumentNested) {
+ DocumentProto nested_document =
+ DocumentBuilder()
+ .SetKey("icing", "nested/1")
+ .SetSchema(std::string(kTypeNestedTest))
+ .AddStringProperty(std::string(kPropertyStr), "a", "b", "c")
+ .AddDocumentProperty(
+ std::string(kPropertyNestedDocument),
+ DocumentBuilder()
+ .SetSchema(std::string(kTypeTest))
+ .AddStringProperty(std::string(kPropertySingleString),
+ "single1")
+ .AddStringProperty(std::string(kPropertyRepeatedString),
+ "repeated1", "repeated2", "repeated3")
+ .AddInt64Property(std::string(kPropertySingleInteger), 123)
+ .AddInt64Property(std::string(kPropertyRepeatedInteger), 1, 2,
+ 3)
+ .Build(),
+ DocumentBuilder()
+ .SetSchema(std::string(kTypeTest))
+ .AddStringProperty(std::string(kPropertySingleString),
+ "single2")
+ .AddStringProperty(std::string(kPropertyRepeatedString),
+ "repeated4", "repeated5", "repeated6")
+ .AddInt64Property(std::string(kPropertySingleInteger), 456)
+ .AddInt64Property(std::string(kPropertyRepeatedInteger), 4, 5,
+ 6)
+ .Build())
+ .Build();
+
+  // Since there are 2 nested documents, all of the values at the leaves will
+  // be returned.
+ EXPECT_THAT(
+ property_util::ExtractPropertyValuesFromDocument<std::string_view>(
+ nested_document, /*property_path=*/"nestedDocument.singleString"),
+ IsOkAndHolds(ElementsAre("single1", "single2")));
+ EXPECT_THAT(
+ property_util::ExtractPropertyValuesFromDocument<std::string_view>(
+ nested_document, /*property_path=*/"nestedDocument.repeatedString"),
+ IsOkAndHolds(ElementsAre("repeated1", "repeated2", "repeated3",
+ "repeated4", "repeated5", "repeated6")));
+ EXPECT_THAT(
+ property_util::ExtractPropertyValuesFromDocument<int64_t>(
+ nested_document, /*property_path=*/"nestedDocument.singleInteger"),
+ IsOkAndHolds(ElementsAre(123, 456)));
+ EXPECT_THAT(
+ property_util::ExtractPropertyValuesFromDocument<int64_t>(
+ nested_document, /*property_path=*/"nestedDocument.repeatedInteger"),
+ IsOkAndHolds(ElementsAre(1, 2, 3, 4, 5, 6)));
+
+  // Test the property at the first level.
+ EXPECT_THAT(
+ property_util::ExtractPropertyValuesFromDocument<std::string_view>(
+ nested_document, kPropertyStr),
+ IsOkAndHolds(ElementsAre("a", "b", "c")));
+}
+
+TEST(PropertyUtilTest, ExtractPropertyValuesFromDocumentNonExistingPaths) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "test/1")
+ .SetSchema(std::string(kTypeTest))
+ .AddStringProperty(std::string(kPropertySingleString), "single")
+ .AddStringProperty(std::string(kPropertyRepeatedString), "repeated1",
+ "repeated2", "repeated3")
+ .AddInt64Property(std::string(kPropertySingleInteger), 123)
+ .AddInt64Property(std::string(kPropertyRepeatedInteger), 1, 2, 3)
+ .Build();
+ EXPECT_THAT(
+ property_util::ExtractPropertyValuesFromDocument<std::string_view>(
+ document, /*property_path=*/"invalid"),
+ IsOkAndHolds(IsEmpty()));
+
+ DocumentProto nested_document =
+ DocumentBuilder()
+ .SetKey("icing", "nested/1")
+ .SetSchema(std::string(kTypeNestedTest))
+ .AddStringProperty(std::string(kPropertyStr), "a", "b", "c")
+ .AddDocumentProperty(std::string(kPropertyNestedDocument),
+ DocumentProto(document), DocumentProto(document))
+ .Build();
+ EXPECT_THAT(
+ property_util::ExtractPropertyValuesFromDocument<std::string_view>(
+ nested_document, /*property_path=*/kPropertySingleString),
+ IsOkAndHolds(IsEmpty()));
+ EXPECT_THAT(
+ property_util::ExtractPropertyValuesFromDocument<std::string_view>(
+ nested_document, /*property_path=*/"nestedDocument.invalid"),
+ IsOkAndHolds(IsEmpty()));
+}
+
+TEST(PropertyUtilTest, ExtractPropertyValuesFromDocumentTypeUnimplemented) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "test/1")
+ .SetSchema(std::string(kTypeTest))
+ .AddStringProperty(std::string(kPropertySingleString), "single")
+ .AddStringProperty(std::string(kPropertyRepeatedString), "repeated1",
+ "repeated2", "repeated3")
+ .AddInt64Property(std::string(kPropertySingleInteger), 123)
+ .AddInt64Property(std::string(kPropertyRepeatedInteger), 1, 2, 3)
+ .Build();
+ EXPECT_THAT(property_util::ExtractPropertyValuesFromDocument<int32_t>(
+ document, /*property_path=*/kPropertySingleString),
+ StatusIs(libtextclassifier3::StatusCode::UNIMPLEMENTED));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/schema-property-iterator.cc b/icing/schema/schema-property-iterator.cc
new file mode 100644
index 0000000..8fc245c
--- /dev/null
+++ b/icing/schema/schema-property-iterator.cc
@@ -0,0 +1,198 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/schema-property-iterator.h"
+
+#include <algorithm>
+#include <string>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema/property-util.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::Status SchemaPropertyIterator::Advance() {
+ while (!levels_.empty()) {
+ if (!levels_.back().Advance()) {
+ // When finishing iterating all properties of the current level, pop it
+ // from the stack (levels_), return to the previous level and resume the
+ // iteration.
+ parent_type_config_names_.erase(
+ parent_type_config_names_.find(levels_.back().GetSchemaTypeName()));
+ levels_.pop_back();
+ continue;
+ }
+
+ const PropertyConfigProto& curr_property_config =
+ levels_.back().GetCurrentPropertyConfig();
+ std::string curr_property_path = levels_.back().GetCurrentPropertyPath();
+
+ // Iterate through the sorted_top_level_indexable_nested_properties_ in
+ // order until we find the first element that is >= curr_property_path.
+ while (current_top_level_indexable_nested_properties_idx_ <
+ sorted_top_level_indexable_nested_properties_.size() &&
+ sorted_top_level_indexable_nested_properties_.at(
+ current_top_level_indexable_nested_properties_idx_) <
+ curr_property_path) {
+ // If an element in sorted_top_level_indexable_nested_properties_ < the
+ // current property path, it means that we've already iterated past the
+ // possible position for it without seeing it.
+ // It's not a valid property path in our schema definition. Add it to
+      // unknown_indexable_nested_property_paths_ and advance
+ // current_top_level_indexable_nested_properties_idx_.
+ unknown_indexable_nested_property_paths_.push_back(
+ sorted_top_level_indexable_nested_properties_.at(
+ current_top_level_indexable_nested_properties_idx_));
+ ++current_top_level_indexable_nested_properties_idx_;
+ }
+
+ if (curr_property_config.data_type() !=
+ PropertyConfigProto::DataType::DOCUMENT) {
+ // We've advanced to a leaf property.
+ // Set whether this property is indexable according to its level's
+ // indexable config. If this property is declared in
+ // indexable_nested_properties_list of the top-level schema, it is also
+ // nested indexable.
+ std::string* current_indexable_nested_prop =
+ current_top_level_indexable_nested_properties_idx_ <
+ sorted_top_level_indexable_nested_properties_.size()
+ ? &sorted_top_level_indexable_nested_properties_.at(
+ current_top_level_indexable_nested_properties_idx_)
+ : nullptr;
+ if (current_indexable_nested_prop == nullptr ||
+ *current_indexable_nested_prop > curr_property_path) {
+ // Current property is not in the indexable list. Set it as indexable if
+ // its schema level is indexable AND it is an indexable property.
+ bool is_property_indexable =
+ levels_.back().GetLevelNestedIndexable() &&
+ SchemaUtil::IsIndexedProperty(curr_property_config);
+ levels_.back().SetCurrentPropertyIndexable(is_property_indexable);
+ } else if (*current_indexable_nested_prop == curr_property_path) {
+ // Current property is in the indexable list. Set its indexable config
+ // to true. This property will consume a sectionId regardless of whether
+ // or not it is actually indexable.
+ levels_.back().SetCurrentPropertyIndexable(true);
+ ++current_top_level_indexable_nested_properties_idx_;
+ }
+ return libtextclassifier3::Status::OK;
+ }
+
+    // - When advancing to a TYPE_DOCUMENT property, it means it is a nested
+    //   schema and we need to traverse the next level. Look up the
+    //   SchemaTypeConfig (by the schema name) in type_config_map_, and push a
+    //   new level into levels_.
+    // - Each level has to record the index of the property it is currently at,
+    //   so we can resume the iteration when returning to it. Other essential
+    //   info is maintained in LevelInfo as well.
+ auto nested_type_config_iter =
+ type_config_map_.find(curr_property_config.schema_type());
+ if (nested_type_config_iter == type_config_map_.end()) {
+ // This should never happen because our schema should already be
+ // validated by this point.
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ "Type config not found: ", curr_property_config.schema_type()));
+ }
+ const SchemaTypeConfigProto& nested_type_config =
+ nested_type_config_iter->second;
+
+ if (levels_.back().GetLevelNestedIndexable()) {
+ // We should set sorted_top_level_indexable_nested_properties_ to the list
+ // defined by the current level.
+ // GetLevelNestedIndexable() is true either because:
+ // 1. We're looking at a document property of the top-level schema --
+ // The first LevelInfo for the iterator is initialized with
+ // all_nested_properties_indexable_ = true.
+ // 2. All previous levels set index_nested_properties = true:
+      //    This indicates that upper-level schema types want to follow the
+      //    nested property definitions of their document subtypes. If this is
+      //    the first subtype level that defines a list, we should set it as
+      //    sorted_top_level_indexable_nested_properties_ for the current
+      //    top-level schema.
+ sorted_top_level_indexable_nested_properties_.clear();
+ sorted_top_level_indexable_nested_properties_.reserve(
+ curr_property_config.document_indexing_config()
+ .indexable_nested_properties_list()
+ .size());
+ for (const std::string& property :
+ curr_property_config.document_indexing_config()
+ .indexable_nested_properties_list()) {
+ // Concat the current property name to each property to get the full
+ // property path expression for each indexable nested property.
+ sorted_top_level_indexable_nested_properties_.push_back(
+ property_util::ConcatenatePropertyPathExpr(curr_property_path,
+ property));
+ }
+ current_top_level_indexable_nested_properties_idx_ = 0;
+      // Sort elements and dedupe.
+ std::sort(sorted_top_level_indexable_nested_properties_.begin(),
+ sorted_top_level_indexable_nested_properties_.end());
+ auto last =
+ std::unique(sorted_top_level_indexable_nested_properties_.begin(),
+ sorted_top_level_indexable_nested_properties_.end());
+ sorted_top_level_indexable_nested_properties_.erase(
+ last, sorted_top_level_indexable_nested_properties_.end());
+ }
+
+ bool is_cycle =
+ parent_type_config_names_.find(nested_type_config.schema_type()) !=
+ parent_type_config_names_.end();
+ bool is_parent_property_path =
+ current_top_level_indexable_nested_properties_idx_ <
+ sorted_top_level_indexable_nested_properties_.size() &&
+ property_util::IsParentPropertyPath(
+ curr_property_path,
+ sorted_top_level_indexable_nested_properties_.at(
+ current_top_level_indexable_nested_properties_idx_));
+ if (is_cycle && !is_parent_property_path) {
+ // Cycle detected. The schema definition is guaranteed to be valid here
+ // since it must have already been validated during SchemaUtil::Validate,
+ // which would have rejected any schema with bad cycles.
+ //
+ // There are no properties in the indexable_nested_properties_list that
+ // are a part of this circular reference.
+ // We do not need to iterate this type further so we simply move on to
+ // other properties in the parent type.
+ continue;
+ }
+
+ bool all_nested_properties_indexable =
+ levels_.back().GetLevelNestedIndexable() &&
+ curr_property_config.document_indexing_config()
+ .index_nested_properties();
+ levels_.push_back(LevelInfo(nested_type_config,
+ std::move(curr_property_path),
+ all_nested_properties_indexable));
+ parent_type_config_names_.insert(nested_type_config.schema_type());
+ }
+
+  // Before returning, move all remaining, not-yet-iterated property paths from
+  // sorted_top_level_indexable_nested_properties_ into
+  // unknown_indexable_nested_property_paths_.
+ std::move(sorted_top_level_indexable_nested_properties_.begin() +
+ current_top_level_indexable_nested_properties_idx_,
+ sorted_top_level_indexable_nested_properties_.end(),
+ std::back_inserter(unknown_indexable_nested_property_paths_));
+
+ return absl_ports::OutOfRangeError("End of iterator");
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/schema-property-iterator.h b/icing/schema/schema-property-iterator.h
new file mode 100644
index 0000000..66b8f32
--- /dev/null
+++ b/icing/schema/schema-property-iterator.h
@@ -0,0 +1,222 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCHEMA_SCHEMA_PROPERTY_ITERATOR_H_
+#define ICING_SCHEMA_SCHEMA_PROPERTY_ITERATOR_H_
+
+#include <algorithm>
+#include <numeric>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema/property-util.h"
+#include "icing/schema/schema-util.h"
+
+namespace icing {
+namespace lib {
+
+// SchemaPropertyIterator: a class for iterating through all properties of a
+// given SchemaTypeConfigProto in lexicographical order. Only leaf
+// (non-document-type) properties will be returned, and for document type
+// properties, the iterator will traverse down to the next nested level of
+// schema.
+//
+// REQUIRED: The schema in which this SchemaTypeConfigProto is defined must have
+// already passed the validation step during SetSchema.
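+//
+// A minimal usage sketch (hypothetical caller code): advance until
+// OUT_OF_RANGE, treating any other non-OK status as an error.
+//
+//   SchemaPropertyIterator iterator(schema_type_config, type_config_map);
+//   libtextclassifier3::Status status = iterator.Advance();
+//   while (status.ok()) {
+//     const PropertyConfigProto& property =
+//         iterator.GetCurrentPropertyConfig();
+//     std::string property_path = iterator.GetCurrentPropertyPath();
+//     bool indexable = iterator.GetCurrentPropertyIndexable();
+//     // ... use property, property_path, and indexable ...
+//     status = iterator.Advance();
+//   }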
+class SchemaPropertyIterator {
+ public:
+ explicit SchemaPropertyIterator(
+ const SchemaTypeConfigProto& base_schema_type_config,
+ const SchemaUtil::TypeConfigMap& type_config_map)
+ : type_config_map_(type_config_map) {
+ levels_.push_back(LevelInfo(base_schema_type_config,
+ /*base_property_path=*/"",
+ /*all_nested_properties_indexable=*/true));
+ parent_type_config_names_.insert(base_schema_type_config.schema_type());
+ }
+
+ // Gets the current property config.
+ //
+ // REQUIRES: The preceding call for Advance() is OK.
+ const PropertyConfigProto& GetCurrentPropertyConfig() const {
+ return levels_.back().GetCurrentPropertyConfig();
+ }
+
+ // Gets the current property path.
+ //
+ // REQUIRES: The preceding call for Advance() is OK.
+ std::string GetCurrentPropertyPath() const {
+ return levels_.back().GetCurrentPropertyPath();
+ }
+
+ // Returns whether the current property is indexable. This would be true if
+ // either the current level is nested indexable, or if the current property is
+ // declared indexable in the indexable_nested_properties_list of the top-level
+ // schema type.
+ //
+ // REQUIRES: The preceding call for Advance() is OK.
+ bool GetCurrentPropertyIndexable() const {
+ return levels_.back().GetCurrentPropertyIndexable();
+ }
+
+ // Returns whether the current schema level is nested indexable. If this is
+ // true, all properties in the level are indexed.
+ //
+ // REQUIRES: The preceding call for Advance() is OK.
+ bool GetLevelNestedIndexable() const {
+ return levels_.back().GetLevelNestedIndexable();
+ }
+
+ // The set of indexable nested properties that are defined in the
+ // indexable_nested_properties_list but are not found in the schema
+ // definition. These properties still consume sectionIds, but will not be
+ // indexed.
+ const std::vector<std::string>& unknown_indexable_nested_property_paths()
+ const {
+ return unknown_indexable_nested_property_paths_;
+ }
+
+ // Advances to the next leaf property.
+ //
+ // Returns:
+ // - OK on success
+ // - OUT_OF_RANGE_ERROR if there is no more leaf property
+  //   - NOT_FOUND_ERROR if any nested schema name is not found in
+  //     type_config_map
+  //
+  // Note: cycles in the nested schema are skipped rather than reported, since
+  // the schema must have already passed validation during SetSchema.
+ libtextclassifier3::Status Advance();
+
+ private:
+  // An inner class for maintaining the iteration state of a (nested) level.
+  // Nested SchemaTypeConfigs form a tree structure, so we have to traverse it
+  // recursively to reach all leaf properties.
+ class LevelInfo {
+ public:
+ explicit LevelInfo(const SchemaTypeConfigProto& schema_type_config,
+ std::string base_property_path,
+ bool all_nested_properties_indexable)
+ : schema_type_config_(schema_type_config),
+ base_property_path_(std::move(base_property_path)),
+ sorted_property_indices_(schema_type_config.properties_size()),
+ current_vec_idx_(-1),
+ sorted_property_indexable_(schema_type_config.properties_size()),
+ all_nested_properties_indexable_(all_nested_properties_indexable) {
+      // Index-sort properties by the lexicographical order of their names.
+ std::iota(sorted_property_indices_.begin(),
+ sorted_property_indices_.end(),
+ /*value=*/0);
+ std::sort(
+ sorted_property_indices_.begin(), sorted_property_indices_.end(),
+ [&schema_type_config](int lhs_idx, int rhs_idx) -> bool {
+ return schema_type_config.properties(lhs_idx).property_name() <
+ schema_type_config.properties(rhs_idx).property_name();
+ });
+ }
+
+ bool Advance() {
+ return ++current_vec_idx_ < sorted_property_indices_.size();
+ }
+
+ const PropertyConfigProto& GetCurrentPropertyConfig() const {
+ return schema_type_config_.properties(
+ sorted_property_indices_[current_vec_idx_]);
+ }
+
+ std::string GetCurrentPropertyPath() const {
+ return property_util::ConcatenatePropertyPathExpr(
+ base_property_path_, GetCurrentPropertyConfig().property_name());
+ }
+
+ bool GetLevelNestedIndexable() const {
+ return all_nested_properties_indexable_;
+ }
+
+ bool GetCurrentPropertyIndexable() const {
+ return sorted_property_indexable_[current_vec_idx_];
+ }
+
+ void SetCurrentPropertyIndexable(bool indexable) {
+ sorted_property_indexable_[current_vec_idx_] = indexable;
+ }
+
+ std::string_view GetSchemaTypeName() const {
+ return schema_type_config_.schema_type();
+ }
+
+ private:
+ const SchemaTypeConfigProto& schema_type_config_; // Does not own
+
+ // Concatenated property path of all parent levels.
+ std::string base_property_path_;
+
+    // We perform an index sort (comparing property names) in order to iterate
+    // all leaf properties in lexicographical order. This vector stores those
+    // sorted indices.
+ std::vector<int> sorted_property_indices_;
+ int current_vec_idx_;
+
+ // Vector indicating whether each property in the current level is
+ // indexable. We can declare different indexable settings for properties in
+ // the same level using indexable_nested_properties_list.
+ //
+ // Element indices in this vector correspond to property indices in the
+ // sorted order.
+ std::vector<bool> sorted_property_indexable_;
+
+    // Indicates whether all properties in the current level are nested
+    // indexable. This is true for a level if its parent document type property
+    // declares index_nested_properties=true. If any parent document type
+    // property sets its flag to false, then this is false for all of its child
+    // properties.
+ bool all_nested_properties_indexable_;
+ };
+
+ const SchemaUtil::TypeConfigMap& type_config_map_; // Does not own
+
+  // Maintains the stack for the recursive traversal of nested schema types. We
+  // use std::vector instead of std::stack to avoid allocating and freeing
+  // memory too frequently.
+ std::vector<LevelInfo> levels_;
+
+  // Maintains all traversed parent schema type config names in the current
+  // stack (levels_). It is used to detect cycle dependencies in nested
+  // schemas.
+ std::unordered_multiset<std::string_view> parent_type_config_names_;
+
+ // Sorted list of indexable nested properties for the top-level schema.
+ std::vector<std::string> sorted_top_level_indexable_nested_properties_;
+
+  // Current iteration index into the
+  // sorted_top_level_indexable_nested_properties_ list.
+ int current_top_level_indexable_nested_properties_idx_ = 0;
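+
+  // Since both the visited property paths and the list above are iterated in
+  // sorted order, membership checks can presumably be done as a single-pass
+  // two-pointer merge. A sketch of that matching step (illustrative only; the
+  // actual logic lives in Advance(), and current_property_path is a
+  // hypothetical local):
+  //
+  //   const auto& list = sorted_top_level_indexable_nested_properties_;
+  //   int& idx = current_top_level_indexable_nested_properties_idx_;
+  //   while (idx < list.size() && list[idx] < current_property_path) ++idx;
+  //   bool in_list = idx < list.size() && list[idx] == current_property_path;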
+
+ // Vector of indexable nested properties defined in the
+ // indexable_nested_properties_list, but not found in the schema definition.
+ // These properties still consume sectionIds, but will not be indexed.
+ // Properties are inserted into this vector in sorted order.
+ //
+ // TODO(b/289152024): Implement support for indexing these properties if they
+ // are in the child types of polymorphic nested properties.
+ std::vector<std::string> unknown_indexable_nested_property_paths_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCHEMA_SCHEMA_PROPERTY_ITERATOR_H_
diff --git a/icing/schema/schema-property-iterator_test.cc b/icing/schema/schema-property-iterator_test.cc
new file mode 100644
index 0000000..2b0226d
--- /dev/null
+++ b/icing/schema/schema-property-iterator_test.cc
@@ -0,0 +1,3905 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/schema-property-iterator.h"
+
+#include <initializer_list>
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-util.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using portable_equals_proto::EqualsProto;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+
+TEST(SchemaPropertyIteratorTest,
+ SingleLevelSchemaTypeConfigShouldIterateInCorrectOrder) {
+ std::string schema_type_name = "Schema";
+
+ SchemaTypeConfigProto schema_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name)
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Google").SetDataTypeString(
+ TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(PropertyConfigBuilder().SetName("Youtube").SetDataType(
+ TYPE_BYTES))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Alphabet")
+ .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN))
+ .Build();
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name, schema_type_config}};
+
+ SchemaPropertyIterator iterator(schema_type_config, type_config_map);
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Alphabet"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config.properties(2)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Google"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config.properties(0)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Youtube"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config.properties(1)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(iterator.unknown_indexable_nested_property_paths(), IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest,
+ NestedSchemaTypeConfigShouldIterateInCorrectOrder) {
+ std::string schema_type_name1 = "SchemaOne";
+ std::string schema_type_name2 = "SchemaTwo";
+ std::string schema_type_name3 = "SchemaThree";
+
+ SchemaTypeConfigProto schema_type_config1 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name1)
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Google").SetDataTypeString(
+ TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(PropertyConfigBuilder().SetName("Youtube").SetDataType(
+ TYPE_BYTES))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Alphabet")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE))
+ .Build();
+ SchemaTypeConfigProto schema_type_config2 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name2)
+ .AddProperty(PropertyConfigBuilder().SetName("Foo").SetDataTypeString(
+ TERM_MATCH_UNKNOWN, TOKENIZER_NONE))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Bar").SetDataTypeDocument(
+ schema_type_name1, /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto schema_type_config3 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name3)
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Hello").SetDataTypeString(
+ TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("World").SetDataTypeDocument(
+ schema_type_name1, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Icing").SetDataTypeDocument(
+ schema_type_name2, /*index_nested_properties=*/true))
+ .Build();
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name1, schema_type_config1},
+ {schema_type_name2, schema_type_config2},
+ {schema_type_name3, schema_type_config3}};
+
+ // SchemaThree: {
+ // "Hello": TYPE_STRING,
+ // "World": TYPE_DOCUMENT SchemaOne {
+ // "Google": TYPE_STRING,
+ // "Youtube": TYPE_BYTES,
+ // "Alphabet": TYPE_INT64,
+ // },
+ // "Icing": TYPE_DOCUMENT SchemaTwo {
+ // "Foo": TYPE_STRING,
+ // "Bar": TYPE_DOCUMENT SchemaOne {
+ // "Google": TYPE_STRING,
+ // "Youtube": TYPE_BYTES,
+ // "Alphabet": TYPE_INT64,
+ // },
+ // },
+ // }
+ SchemaPropertyIterator iterator(schema_type_config3, type_config_map);
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Hello"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config3.properties(0)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Icing.Bar.Alphabet"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Icing.Bar.Google"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Icing.Bar.Youtube"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Icing.Foo"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(0)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("World.Alphabet"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("World.Google"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("World.Youtube"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(iterator.unknown_indexable_nested_property_paths(), IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest,
+ NonExistingNestedSchemaTypeConfigShouldGetNotFoundError) {
+ std::string schema_type_name1 = "SchemaOne";
+ std::string schema_type_name2 = "SchemaTwo";
+
+ SchemaTypeConfigProto schema_type_config1 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name1)
+ .AddProperty(PropertyConfigBuilder().SetName("Google").SetDataType(
+ TYPE_STRING))
+ .AddProperty(PropertyConfigBuilder().SetName("Youtube").SetDataType(
+ TYPE_BYTES))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Alphabet")
+ .SetDataType(TYPE_INT64))
+ .Build();
+ SchemaTypeConfigProto schema_type_config2 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name2)
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Foo").SetDataTypeDocument(
+ schema_type_name1, /*index_nested_properties=*/true))
+ .Build();
+  // Omit the nested type (schema_type_config1) from type_config_map.
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name2, schema_type_config2}};
+
+ SchemaPropertyIterator iterator(schema_type_config2, type_config_map);
+ // Since Foo is a document type property with schema type = "SchemaOne" and
+ // "SchemaOne" is not in type_config_map, Advance() should return NOT_FOUND
+ // error.
+ EXPECT_THAT(iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST(SchemaPropertyIteratorTest,
+ SchemaTypeConfigWithEmptyPropertyShouldGetOutOfRangeErrorAtFirstAdvance) {
+ std::string schema_type_name = "Schema";
+
+ SchemaTypeConfigProto schema_type_config =
+ SchemaTypeConfigBuilder().SetType(schema_type_name).Build();
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name, schema_type_config}};
+
+ SchemaPropertyIterator iterator(schema_type_config, type_config_map);
+ EXPECT_THAT(iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+ EXPECT_THAT(iterator.unknown_indexable_nested_property_paths(), IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest, NestedIndexable) {
+ std::string schema_type_name1 = "SchemaOne";
+ std::string schema_type_name2 = "SchemaTwo";
+ std::string schema_type_name3 = "SchemaThree";
+ std::string schema_type_name4 = "SchemaFour";
+
+ SchemaTypeConfigProto schema_type_config1 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name1)
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Google").SetDataTypeString(
+ TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config2 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name2)
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Bar").SetDataTypeDocument(
+ schema_type_name1, /*index_nested_properties=*/true))
+ .AddProperty(PropertyConfigBuilder().SetName("Foo").SetDataTypeString(
+ TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config3 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name3)
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Bar").SetDataTypeDocument(
+ schema_type_name1,
+ /*index_nested_properties=*/false))
+ .AddProperty(PropertyConfigBuilder().SetName("Foo").SetDataTypeString(
+ TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config4 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name4)
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Baz1").SetDataTypeDocument(
+ schema_type_name2, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Baz2").SetDataTypeDocument(
+ schema_type_name2, /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Baz3").SetDataTypeDocument(
+ schema_type_name3, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Baz4").SetDataTypeDocument(
+ schema_type_name3, /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Hello1").SetDataTypeDocument(
+ schema_type_name1, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("Hello2").SetDataTypeDocument(
+ schema_type_name1, /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("World").SetDataTypeString(
+ TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name1, schema_type_config1},
+ {schema_type_name2, schema_type_config2},
+ {schema_type_name3, schema_type_config3},
+ {schema_type_name4, schema_type_config4}};
+
+ // SchemaFour: {
+ // "Baz1": TYPE_DOCUMENT INDEX_NESTED_PROPERTIES=true SchemaTwo {
+ // "Bar": TYPE_DOCUMENT INDEX_NESTED_PROPERTIES=true SchemaOne {
+ // "Google": TYPE_STRING INDEXABLE,
+ // },
+ // "Foo": TYPE_STRING INDEXABLE,
+ // },
+ // "Baz2": TYPE_DOCUMENT INDEX_NESTED_PROPERTIES=false SchemaTwo {
+ // "Bar": TYPE_DOCUMENT INDEX_NESTED_PROPERTIES=true SchemaOne {
+ // "Google": TYPE_STRING INDEXABLE,
+ // },
+ // "Foo": TYPE_STRING INDEXABLE,
+ // },
+ // "Baz3": TYPE_DOCUMENT INDEX_NESTED_PROPERTIES=true SchemaThree {
+ // "Bar": TYPE_DOCUMENT INDEX_NESTED_PROPERTIES=false SchemaOne {
+ // "Google": TYPE_STRING INDEXABLE,
+ // },
+ // "Foo": TYPE_STRING INDEXABLE,
+ // },
+ // "Baz4": TYPE_DOCUMENT INDEX_NESTED_PROPERTIES=false SchemaThree {
+ // "Bar": TYPE_DOCUMENT INDEX_NESTED_PROPERTIES=false SchemaOne {
+ // "Google": TYPE_STRING INDEXABLE,
+ // },
+ // "Foo": TYPE_STRING INDEXABLE,
+ // },
+ // "Hello": TYPE_DOCUMENT INDEX_NESTED_PROPERTIES=false SchemaOne {
+ // "Google": TYPE_STRING INDEXABLE,
+ // },
+ // "World": TYPE_STRING INDEXABLE,
+ // }
+ SchemaPropertyIterator iterator(schema_type_config4, type_config_map);
+
+  // Baz1 to Baz4: 2 levels of nested document type properties.
+ // For Baz1, all levels set index_nested_properties = true, so all leaf
+ // properties should be nested indexable.
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz1.Bar.Google"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz1.Foo"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(1)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ // For Baz2, the parent level sets index_nested_properties = false, so all
+ // leaf properties in child levels should be nested unindexable even if
+ // they've set their index_nested_properties = true.
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz2.Bar.Google"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz2.Foo"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(1)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ // For Baz3, the parent level sets index_nested_properties = true, but the
+ // child level sets index_nested_properties = false.
+ // - Leaf properties in the parent level should be nested indexable.
+ // - Leaf properties in the child level should be nested unindexable.
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz3.Bar.Google"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz3.Foo"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(1)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ // For Baz4, all levels set index_nested_properties = false, so all leaf
+ // properties should be nested unindexable.
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz4.Bar.Google"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Baz4.Foo"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(1)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+  // Verify properties with 1 and 0 levels of document type nesting.
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Hello1.Google"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("Hello2.Google"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(iterator.Advance(), IsOk());
+ EXPECT_THAT(iterator.GetCurrentPropertyPath(), Eq("World"));
+ EXPECT_THAT(iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config4.properties(6)));
+ EXPECT_THAT(iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(iterator.unknown_indexable_nested_property_paths(), IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest,
+ IndexableNestedPropertiesList_singleNestedLevel) {
+ std::string schema_type_name1 = "SchemaOne";
+ std::string schema_type_name2 = "SchemaTwo";
+
+ SchemaTypeConfigProto schema_type_config1 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name1)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop1")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop3")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schema1prop4")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schema1prop5")
+ .SetDataType(TYPE_BOOLEAN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config2 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name2)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop1")
+ .SetDataTypeDocument(
+ schema_type_name1,
+ /*indexable_nested_properties_list=*/{"schema1prop2",
+ "schema1prop3",
+ "schema1prop5"}))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schema2prop3")
+ .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN))
+ .Build();
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name1, schema_type_config1},
+ {schema_type_name2, schema_type_config2}};
+
+ // Order of iteration for Schema2:
+ // {"schema2prop1.schema1prop1", "schema2prop1.schema1prop2",
+ // "schema2prop1.schema1prop3", "schema2prop1.schema1prop4",
+ // "schema2prop1.schema1prop5", "schema2prop2", "schema2prop3"}
+ //
+ // Indexable properties:
+ // {"schema2prop1.schema1prop2", "schema2prop1.schema1prop3",
+ // "schema2prop1.schema1prop5", "schema2prop2"}.
+ //
+ // "schema2prop1.schema1prop4" is indexable by its indexing-config, but is not
+ // considered indexable for Schema2 because Schema2 sets its
+ // index_nested_properties config to false, and "schema1prop4" is not
+ // in the indexable_nested_properties_list for schema2prop1.
+ //
+ // "schema2prop1.schema1prop1", "schema2prop1.schema1prop3" and
+ // "schema2prop1.schema1prop5" are non-indexable by its indexing-config.
+ // However "schema2prop1.schema1prop3" and "schema2prop1.schema1prop5" are
+ // indexed as it appears in the indexable_list.
+ SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map);
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop3"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop4"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(3)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop5"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(4)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("schema2prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("schema2prop3"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(2)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema2_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema2_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+  // Iterate through schema1 properties. Schema1 only has non-document-type leaf
+  // properties, so each property is marked indexable or not according to its
+  // own indexing config.
+ SchemaPropertyIterator schema1_iterator(schema_type_config1, type_config_map);
+
+ EXPECT_THAT(schema1_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyPath(), Eq("schema1prop1"));
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema1_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyPath(), Eq("schema1prop2"));
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema1_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyPath(), Eq("schema1prop3"));
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema1_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyPath(), Eq("schema1prop4"));
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(3)));
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema1_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyPath(), Eq("schema1prop5"));
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(4)));
+ EXPECT_THAT(schema1_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema1_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema1_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest,
+ IndexableNestedPropertiesList_indexBooleanTrueDoesNotAffectOtherLevels) {
+ std::string schema_type_name1 = "SchemaOne";
+ std::string schema_type_name2 = "SchemaTwo";
+ std::string schema_type_name3 = "SchemaThree";
+
+ SchemaTypeConfigProto schema_type_config1 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name1)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop1")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop3")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config2 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name2)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop1")
+ .SetDataTypeDocument(schema_type_name1,
+ /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop3")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config3 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name3)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema3prop3")
+ .SetDataTypeDocument(
+ schema_type_name1,
+ /*indexable_nested_properties_list=*/{"schema1prop1",
+ "schema1prop3"}))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schema3prop1")
+ .SetDataTypeDocument(
+ schema_type_name2,
+ /*indexable_nested_properties_list=*/
+ {"schema2prop2", "schema2prop1.schema1prop1",
+ "schema2prop1.schema1prop3"}))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema3prop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name1, schema_type_config1},
+ {schema_type_name2, schema_type_config2},
+ {schema_type_name3, schema_type_config3}};
+
+ // Order of iteration for Schema3:
+ // {"schema3prop1.schema2prop1.schema1prop1",
+ // "schema3prop1.schema2prop1.schema1prop2",
+ // "schema3prop1.schema2prop1.schema1prop3",
+ // "schema3prop1.schema2prop2", "schema3prop1.schema2prop3", "schema3prop2",
+ // "schema3prop3.schema1prop1", "schema3prop3.schema1prop2",
+ // "schema3prop3.schema1prop3"}.
+ //
+ // Indexable properties:
+ // {"schema3prop1.schema2prop1.schema1prop1",
+ // "schema3prop1.schema2prop1.schema1prop3",
+ // "schema3prop1.schema2prop2", "schema3prop2", "schema3prop3.schema1prop1",
+ // "schema3prop3.schema1prop3"}
+ //
+ // Schema2 setting index_nested_properties=true does not affect nested
+ // properties indexing for Schema3.
+ SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map);
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop3"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop3"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("schema3prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config3.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop3.schema1prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop3.schema1prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop3.schema1prop3"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration for Schema2:
+ // {"schema2prop1.schema1prop1", "schema2prop1.schema1prop2",
+ // "schema2prop1.schema1prop3", "schema2prop2", "schema2prop3"}
+ //
+ // Indexable properties:
+ // {"schema2prop1.schema1prop1", "schema2prop1.schema1prop2",
+ // "schema2prop1.schema1prop3", "schema2prop2", "schema2prop3"}
+ //
+ // All properties are indexed because index_nested_properties=true for
+ // Schema2.schema2prop1. Schema3's indexable_nested_properties setting does
+ // not affect this.
+ SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map);
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop3"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("schema2prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("schema2prop3"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(2)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema2_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest,
+ IndexableNestedPropertiesList_indexBooleanFalseDoesNotAffectOtherLevels) {
+ std::string schema_type_name1 = "SchemaOne";
+ std::string schema_type_name2 = "SchemaTwo";
+ std::string schema_type_name3 = "SchemaThree";
+
+ SchemaTypeConfigProto schema_type_config1 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name1)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop1")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config2 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name2)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop1")
+ .SetDataTypeDocument(schema_type_name1,
+ /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto schema_type_config3 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name3)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema3prop1")
+ .SetDataTypeDocument(schema_type_name2,
+ /*indexable_nested_properties_list=*/
+ std::initializer_list<std::string>{
+ "schema2prop1.schema1prop2"}))
+ .Build();
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name1, schema_type_config1},
+ {schema_type_name2, schema_type_config2},
+ {schema_type_name3, schema_type_config3}};
+
+ // Order of iteration for Schema3:
+ // {"schema3prop1.schema2prop1.schema1prop1",
+ // "schema3prop1.schema2prop1.schema1prop2"}.
+ //
+ // Indexable properties: {"schema3prop1.schema2prop1.schema1prop2"}
+ //
+  // Schema2 setting index_nested_properties=false does not affect Schema3's
+  // indexable list.
+ SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map);
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration for Schema2:
+ // {"schema2prop1.schema1prop1", "schema2prop1.schema1prop2"}
+ //
+ // Indexable properties: None
+ //
+ // The indexable list for Schema3 does not propagate to Schema2.
+ SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map);
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema2_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema2_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest,
+ IndexableNestedPropertiesList_indexableSetDoesNotAffectOtherLevels) {
+ std::string schema_type_name1 = "SchemaOne";
+ std::string schema_type_name2 = "SchemaTwo";
+ std::string schema_type_name3 = "SchemaThree";
+
+ SchemaTypeConfigProto schema_type_config1 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name1)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop1")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop3")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config2 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name2)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop1")
+ .SetDataTypeDocument(
+ schema_type_name1,
+ /*indexable_nested_properties_list=*/
+ std::initializer_list<std::string>{"schema1prop2"}))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop3")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config3 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name3)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema3prop3")
+ .SetDataTypeDocument(
+ schema_type_name1,
+ /*indexable_nested_properties_list=*/{"schema1prop1",
+ "schema1prop3"}))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schema3prop1")
+ .SetDataTypeDocument(
+ schema_type_name2,
+ /*indexable_nested_properties_list=*/
+ {"schema2prop2", "schema2prop1.schema1prop1",
+ "schema2prop1.schema1prop3"}))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema3prop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name1, schema_type_config1},
+ {schema_type_name2, schema_type_config2},
+ {schema_type_name3, schema_type_config3}};
+
+ // Order of iteration for Schema3:
+ // {"schema3prop1.schema2prop1.schema1prop1",
+ // "schema3prop1.schema2prop1.schema1prop2",
+ // "schema3prop1.schema2prop1.schema1prop3",
+ // "schema3prop1.schema2prop2", "schema3prop1.schema2prop3", "schema3prop2",
+ // "schema3prop3.schema1prop1", "schema3prop3.schema1prop2",
+ // "schema3prop3.schema1prop3"}.
+ //
+ // Indexable properties:
+ // {"schema3prop1.schema2prop1.schema1prop1",
+ // "schema3prop1.schema2prop1.schema1prop3",
+ // "schema3prop1.schema2prop2", "schema3prop2", "schema3prop3.schema1prop1",
+ // "schema3prop3.schema1prop3"}
+ //
+ // Schema2 setting indexable_nested_properties_list={schema1prop2} does not
+ // affect nested properties indexing for Schema3.
+ SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map);
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop3"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop3"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("schema3prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config3.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop3.schema1prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop3.schema1prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop3.schema1prop3"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration for Schema2:
+ // {"schema2prop1.schema1prop1", "schema2prop1.schema1prop2",
+ // "schema2prop1.schema1prop3", "schema2prop2", "schema2prop3"}
+ //
+ // Indexable properties:
+ // {"schema2prop1.schema1prop2", "schema2prop2", "schema2prop3"}
+ //
+  // The indexable_nested_properties_list set for Schema3.schema3prop1 does not
+  // propagate to Schema2.
+ SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map);
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop3"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("schema2prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("schema2prop3"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(2)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema2_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(
+ SchemaPropertyIteratorTest,
+ IndexableNestedPropertiesList_upperLevelIndexTrueIndexesListOfNestedLevel) {
+ std::string schema_type_name1 = "SchemaOne";
+ std::string schema_type_name2 = "SchemaTwo";
+ std::string schema_type_name3 = "SchemaThree";
+ std::string schema_type_name4 = "SchemaFour";
+
+ SchemaTypeConfigProto schema_type_config1 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name1)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop1")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config2 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name2)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop1")
+ .SetDataTypeDocument(
+ schema_type_name1,
+ /*indexable_nested_properties_list=*/
+ std::initializer_list<std::string>{"schema1prop2"}))
+ .Build();
+ SchemaTypeConfigProto schema_type_config3 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name3)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema3prop1")
+ .SetDataTypeDocument(schema_type_name2,
+ /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto schema_type_config4 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name4)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema4prop1")
+ .SetDataTypeDocument(schema_type_name3,
+ /*index_nested_properties=*/true))
+ .Build();
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name1, schema_type_config1},
+ {schema_type_name2, schema_type_config2},
+ {schema_type_name3, schema_type_config3},
+ {schema_type_name4, schema_type_config4}};
+
+ // Order of iteration for Schema4:
+ // {"schema4prop1.schema3prop1.schema2prop1.schema1prop1",
+ // "schema4prop1.schema3prop1.schema2prop1.schema1prop2"}.
+ //
+ // Indexable properties: {schema4prop1.schema3prop1.schema2prop1.schema1prop2}
+ //
+  // Both Schema4 and Schema3 set index_nested_properties=true, so they both
+  // follow the indexing behavior of their subtype.
+  // Schema2 is the first subtype to define its own indexing config, so we index
+  // its list for both Schema3 and Schema4 even though it sets
+  // index_nested_properties=false.
+ SchemaPropertyIterator schema4_iterator(schema_type_config4, type_config_map);
+
+ EXPECT_THAT(schema4_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(),
+ Eq("schema4prop1.schema3prop1.schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema4_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(),
+ Eq("schema4prop1.schema3prop1.schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema4_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema4_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration for Schema3:
+ // {"schema3prop1.schema2prop1.schema1prop1",
+ // "schema3prop1.schema2prop1.schema1prop2"}.
+ //
+ // Indexable properties: {schema3prop1.schema2prop1.schema1prop2}
+ SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map);
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration for Schema2:
+ // {"schema2prop1.schema1prop1", "schema2prop1.schema1prop2"}
+ //
+ // Indexable properties:
+ // {"schema2prop1.schema1prop2"}
+ //
+ // Schema3 setting index_nested_properties=true does not propagate to Schema2.
+ SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map);
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema2_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest,
+ IndexableNestedPropertiesList_unknownPropPaths) {
+ std::string schema_type_name1 = "SchemaOne";
+ std::string schema_type_name2 = "SchemaTwo";
+ std::string schema_type_name3 = "SchemaThree";
+ std::string schema_type_name4 = "SchemaFour";
+
+ SchemaTypeConfigProto schema_type_config1 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name1)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop1")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config2 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name2)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop1")
+ .SetDataTypeDocument(schema_type_name1,
+ /*indexable_nested_properties_list=*/
+ {"schema1prop2", "schema1prop2.foo",
+ "foo.bar", "zzz", "aaa.zzz"}))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop2")
+ .SetDataTypeDocument(
+ schema_type_name1,
+ /*indexable_nested_properties_list=*/
+ {"schema1prop1", "schema1prop2", "unknown.path"}))
+ .Build();
+ SchemaTypeConfigProto schema_type_config3 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name3)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema3prop1")
+ .SetDataTypeDocument(
+ schema_type_name2,
+ /*indexable_nested_properties_list=*/
+ {"schema3prop1", "schema2prop1", "schema1prop2",
+ "schema2prop1.schema1prop2", "schema2prop1.zzz", "zzz"}))
+ .Build();
+ SchemaTypeConfigProto schema_type_config4 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name4)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema4prop1")
+ .SetDataTypeDocument(schema_type_name3,
+ /*index_nested_properties=*/true))
+ .Build();
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name1, schema_type_config1},
+ {schema_type_name2, schema_type_config2},
+ {schema_type_name3, schema_type_config3},
+ {schema_type_name4, schema_type_config4}};
+
+ // Order of iteration for Schema4:
+ // "schema4prop1.schema3prop1.schema2prop1.schema1prop1",
+ // "schema4prop1.schema3prop1.schema2prop1.schema1prop2" (indexable),
+ // "schema4prop1.schema3prop1.schema2prop2.schema1prop1",
+ // "schema4prop1.schema3prop1.schema2prop2.schema1prop2"
+ //
+ // Unknown property paths from schema3 will also be included for schema4,
+ // since schema4 sets index_nested_properties=true.
+ // This includes everything in schema3prop1's list except
+ // "schema2prop1.schema1prop2".
+ SchemaPropertyIterator schema4_iterator(schema_type_config4, type_config_map);
+
+ EXPECT_THAT(schema4_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(),
+ Eq("schema4prop1.schema3prop1.schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema4_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(),
+ Eq("schema4prop1.schema3prop1.schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema4_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(),
+ Eq("schema4prop1.schema3prop1.schema2prop2.schema1prop1"));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema4_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(),
+ Eq("schema4prop1.schema3prop1.schema2prop2.schema1prop2"));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema4_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema4_iterator.unknown_indexable_nested_property_paths(),
+ testing::ElementsAre("schema4prop1.schema3prop1.schema1prop2",
+ "schema4prop1.schema3prop1.schema2prop1",
+ "schema4prop1.schema3prop1.schema2prop1.zzz",
+ "schema4prop1.schema3prop1.schema3prop1",
+ "schema4prop1.schema3prop1.zzz"));
+
+ // Order of iteration for Schema3:
+ // "schema3prop1.schema2prop1.schema1prop1",
+ // "schema3prop1.schema2prop1.schema1prop2" (indexable),
+ // "schema3prop1.schema2prop2.schema1prop1",
+ // "schema3prop1.schema2prop2.schema1prop2"
+ //
+ // Unknown properties (in order):
+ // "schema3prop1.schema1prop2", "schema3prop1.schema2prop1" (not a leaf prop),
+ // "schema3prop1.schema2prop1.zzz", "schema3prop1.schema3prop1",
+ // "schema3prop1.zzz"
+ SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map);
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop2.schema1prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop2.schema1prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(),
+ testing::ElementsAre(
+ "schema3prop1.schema1prop2", "schema3prop1.schema2prop1",
+ "schema3prop1.schema2prop1.zzz", "schema3prop1.schema3prop1",
+ "schema3prop1.zzz"));
+
+ // Order of iteration for Schema2:
+ // "schema2prop1.schema1prop1",
+ // "schema2prop1.schema1prop2" (indexable),
+ // "schema2prop2.schema1prop1" (indexable),
+ // "schema2prop2.schema1prop2" (indexable)
+ //
+ // Unknown properties (in order):
+ // "schema2prop1.aaa.zzz", "schema2prop1.foo.bar",
+ // "schema2prop1.schema1prop2.foo", "schema2prop1.zzz",
+ // "schema2prop2.unknown.path"
+ SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map);
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop2.schema1prop1"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop2.schema1prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(
+ schema2_iterator.unknown_indexable_nested_property_paths(),
+ testing::ElementsAre("schema2prop1.aaa.zzz", "schema2prop1.foo.bar",
+ "schema2prop1.schema1prop2.foo", "schema2prop1.zzz",
+ "schema2prop2.unknown.path"));
+}
+
+TEST(SchemaPropertyIteratorTest,
+ IndexableNestedPropertiesListDuplicateElements) {
+ std::string schema_type_name1 = "SchemaOne";
+ std::string schema_type_name2 = "SchemaTwo";
+ std::string schema_type_name3 = "SchemaThree";
+ std::string schema_type_name4 = "SchemaFour";
+
+ SchemaTypeConfigProto schema_type_config1 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name1)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop1")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema1prop2")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config2 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name2)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema2prop1")
+ .SetDataTypeDocument(
+ schema_type_name1,
+ /*indexable_nested_properties_list=*/
+ {"schema1prop2", "schema1prop2", "schema1prop2.foo",
+ "schema1prop2.foo", "foo.bar", "foo.bar", "foo.bar",
+ "zzz", "zzz", "aaa.zzz", "schema1prop2"}))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schema2prop2")
+ .SetDataTypeDocument(
+ schema_type_name1,
+ /*indexable_nested_properties_list=*/
+ {"schema1prop1", "schema1prop2", "unknown.path",
+ "unknown.path", "unknown.path", "unknown.path",
+ "schema1prop1"}))
+ .Build();
+ SchemaTypeConfigProto schema_type_config3 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name3)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema3prop1")
+ .SetDataTypeDocument(
+ schema_type_name2,
+ /*indexable_nested_properties_list=*/
+ {"schema3prop1", "schema3prop1", "schema2prop1",
+ "schema2prop1", "schema1prop2", "schema1prop2",
+ "schema2prop1.schema1prop2", "schema2prop1.schema1prop2",
+ "schema2prop1.zzz", "zzz", "zzz"}))
+ .Build();
+ SchemaTypeConfigProto schema_type_config4 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name4)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schema4prop1")
+ .SetDataTypeDocument(schema_type_name3,
+ /*index_nested_properties=*/true))
+ .Build();
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name1, schema_type_config1},
+ {schema_type_name2, schema_type_config2},
+ {schema_type_name3, schema_type_config3},
+ {schema_type_name4, schema_type_config4}};
+
+ // The results of this test case are the same as in the previous test case.
+ // This tests that the indexable_nested_properties_list is deduped correctly.
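+ // For example, schema2prop1's list declares "schema1prop2" three times and
+ // "foo.bar" three times; after deduplication the iterator behaves exactly as
+ // if each path had been listed once, so the indexable and unknown paths
+ // below match the previous test case one-for-one.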
+
+ // Order of iteration for Schema4:
+ // "schema4prop1.schema3prop1.schema2prop1.schema1prop1",
+ // "schema4prop1.schema3prop1.schema2prop1.schema1prop2" (indexable),
+ // "schema4prop1.schema3prop1.schema2prop2.schema1prop1",
+ // "schema4prop1.schema3prop1.schema2prop2.schema1prop2"
+ //
+ // Unknown property paths from schema3 will also be included for schema4,
+ // since schema4 sets index_nested_properties=true.
+ // This includes everything in schema3prop1's list except
+ // "schema2prop1.schema1prop2".
+ SchemaPropertyIterator schema4_iterator(schema_type_config4, type_config_map);
+
+ EXPECT_THAT(schema4_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(),
+ Eq("schema4prop1.schema3prop1.schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema4_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(),
+ Eq("schema4prop1.schema3prop1.schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema4_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(),
+ Eq("schema4prop1.schema3prop1.schema2prop2.schema1prop1"));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema4_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyPath(),
+ Eq("schema4prop1.schema3prop1.schema2prop2.schema1prop2"));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema4_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema4_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema4_iterator.unknown_indexable_nested_property_paths(),
+ testing::ElementsAre("schema4prop1.schema3prop1.schema1prop2",
+ "schema4prop1.schema3prop1.schema2prop1",
+ "schema4prop1.schema3prop1.schema2prop1.zzz",
+ "schema4prop1.schema3prop1.schema3prop1",
+ "schema4prop1.schema3prop1.zzz"));
+
+ // Order of iteration for Schema3:
+ // "schema3prop1.schema2prop1.schema1prop1",
+ // "schema3prop1.schema2prop1.schema1prop2" (indexable),
+ // "schema3prop1.schema2prop2.schema1prop1",
+ // "schema3prop1.schema2prop2.schema1prop2"
+ //
+ // Unknown properties (in order):
+ // "schema2prop1.aaa.zzz", "schema2prop1.foo.bar",
+ // "schema2prop1.schema1prop2.foo", "schema2prop1.zzz",
+ // "schema2prop2.unknown.path"
+ SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map);
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop2.schema1prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("schema3prop1.schema2prop2.schema1prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(),
+ testing::ElementsAre(
+ "schema3prop1.schema1prop2", "schema3prop1.schema2prop1",
+ "schema3prop1.schema2prop1.zzz", "schema3prop1.schema3prop1",
+ "schema3prop1.zzz"));
+
+ // Order of iteration for Schema2:
+ // "schema2prop1.schema1prop1",
+ // "schema2prop1.schema1prop2" (indexable),
+ // "schema2prop2.schema1prop1" (indexable),
+ // "schema2prop2.schema1prop2" (indexable)
+ //
+ // Unknown properties (in order):
+ // "schema2prop1.aaa.zzz", "schema2prop1.foo.bar",
+ // "schema2prop1.schema1prop2.foo", "schema2prop1.zzz",
+ // "schema2prop2.unknown.path"
+ SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map);
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop1"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop1.schema1prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop2.schema1prop1"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(),
+ Eq("schema2prop2.schema1prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(
+ schema2_iterator.unknown_indexable_nested_property_paths(),
+ testing::ElementsAre("schema2prop1.aaa.zzz", "schema2prop1.foo.bar",
+ "schema2prop1.schema1prop2.foo", "schema2prop1.zzz",
+ "schema2prop2.unknown.path"));
+}
+
+TEST(SchemaPropertyIteratorTest,
+ IndexableNestedProperties_duplicatePropertyNamesInDifferentProperties) {
+ std::string schema_type_name1 = "SchemaOne";
+ std::string schema_type_name2 = "SchemaTwo";
+ std::string schema_type_name3 = "SchemaThree";
+
+ SchemaTypeConfigProto schema_type_config1 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name1)
+ .AddProperty(
+ PropertyConfigBuilder().SetName("prop1").SetDataTypeString(
+ TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("prop2").SetDataTypeString(
+ TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("prop3").SetDataTypeString(
+ TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config2 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name2)
+ .AddProperty(
+ PropertyConfigBuilder().SetName("prop1").SetDataTypeDocument(
+ schema_type_name1,
+ /*indexable_nested_properties_list=*/
+ std::initializer_list<std::string>{"prop2"}))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("prop2").SetDataTypeString(
+ TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("prop3").SetDataTypeString(
+ TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config3 =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_type_name3)
+ .AddProperty(
+ PropertyConfigBuilder().SetName("prop3").SetDataTypeDocument(
+ schema_type_name1,
+ /*indexable_nested_properties_list=*/
+ {"prop1", "prop3"}))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("prop1").SetDataTypeDocument(
+ schema_type_name2,
+ /*indexable_nested_properties_list=*/
+ {"prop2", "prop1.prop1", "prop1.prop3"}))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("prop2").SetDataTypeString(
+ TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder().SetName("prop4").SetDataTypeDocument(
+ schema_type_name1,
+ /*indexable_nested_properties_list=*/
+ {"prop2", "prop3"}))
+ .Build();
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_type_name1, schema_type_config1},
+ {schema_type_name2, schema_type_config2},
+ {schema_type_name3, schema_type_config3}};
+
+ // Order of iteration for Schema3:
+ // {"prop1.prop1.prop1", "prop1.prop1.prop2", "prop1.prop1.prop3",
+ // "prop1.prop2", "prop1.prop3", "prop2",
+ // "prop3.prop1", "prop3.prop2", "prop3.prop3",
+ // "prop4.prop1", "prop4.prop2", "prop4.prop3"}.
+ //
+ // Indexable properties:
+ // {"prop1.prop1.prop1", "prop1.prop1.prop3", "prop1.prop2", "prop2",
+ // "prop3.prop1", "prop3.prop3", "prop4.prop2", "prop4.prop3"}
+ //
+ // Index settings on one property do not affect properties that share the
+ // same name but are nested under different parent properties.
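+ // For example, Schema3.prop3's list {"prop1", "prop3"} applies only within
+ // prop3's subtree, making "prop3.prop1" and "prop3.prop3" indexable, while
+ // prop4's separate list {"prop2", "prop3"} independently makes "prop4.prop2"
+ // and "prop4.prop3" indexable.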
+ SchemaPropertyIterator schema3_iterator(schema_type_config3, type_config_map);
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("prop1.prop1.prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("prop1.prop1.prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(),
+ Eq("prop1.prop1.prop3"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop1.prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop1.prop3"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config3.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop3.prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop3.prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop3.prop3"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop4.prop1"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop4.prop2"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyPath(), Eq("prop4.prop3"));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(schema3_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema3_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema3_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration for Schema2:
+ // {"prop1.prop1", "prop1.prop2",
+ // "prop1.prop3", "prop2", "prop3"}
+ //
+ // Indexable properties:
+ // {"prop1.prop2", "prop1.prop3", "prop2", "prop3"}
+ //
+ // Indexable_nested_properties set for Schema3.prop1 does not propagate
+ // to Schema2.
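+ // For example, "prop1.prop1" is indexable when reached through Schema3.prop1
+ // (whose list names "prop1.prop1"), but is non-indexable in Schema2's own
+ // iteration below.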
+ SchemaPropertyIterator schema2_iterator(schema_type_config2, type_config_map);
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("prop1.prop1"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(0)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("prop1.prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("prop1.prop3"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config1.properties(2)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("prop2"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(1)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyPath(), Eq("prop3"));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config2.properties(2)));
+ EXPECT_THAT(schema2_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema2_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema2_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest, SingleLevelCycle) {
+ std::string schema_a = "A";
+ std::string schema_b = "B";
+
+ // Create schema with A -> B -> B -> B...
+ SchemaTypeConfigProto schema_type_config_a =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_a)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaAprop1")
+ .SetDataTypeDocument(
+ schema_b, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_b =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_b)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaBprop1")
+ .SetDataTypeDocument(
+ schema_b, /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaBprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_a, schema_type_config_a}, {schema_b, schema_type_config_b}};
+
+ // Order of iteration for schema A:
+ // {"schemaAprop1.schemaBprop2", "schemaAprop2"}, both indexable
+ SchemaPropertyIterator schema_a_iterator(schema_type_config_a,
+ type_config_map);
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration for schema B:
+ // {"schemaBprop2"}, indexable.
+ SchemaPropertyIterator schema_b_iterator(schema_type_config_b,
+ type_config_map);
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest, MultipleLevelCycle) {
+ std::string schema_a = "A";
+ std::string schema_b = "B";
+ std::string schema_c = "C";
+
+ // Create schema with A -> B -> C -> A -> B -> C...
+ SchemaTypeConfigProto schema_type_config_a =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_a)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaAprop1")
+ .SetDataTypeDocument(
+ schema_b, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_b =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_b)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaBprop1")
+ .SetDataTypeDocument(
+ schema_c, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaBprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_c =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_c)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaCprop1")
+ .SetDataTypeDocument(
+ schema_a, /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaCprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_a, schema_type_config_a},
+ {schema_b, schema_type_config_b},
+ {schema_c, schema_type_config_c}};
+
+ // Order of iteration for schema A:
+ // {"schemaAprop1.schemaBprop1.schemaCprop2", "schemaAprop1.schemaBprop2",
+ // "schemaAprop2"}, all indexable
+ SchemaPropertyIterator schema_a_iterator(schema_type_config_a,
+ type_config_map);
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration for schema B:
+ // {"schemaBprop1.schemaCprop1.schemaAprop2", "schemaBprop1.schemaCprop2",
+ // "schemaBprop2"}
+ //
+ // Indexable properties: {"schemaBprop1.schemaCprop2", "schemaBprop2"}
+ SchemaPropertyIterator schema_b_iterator(schema_type_config_b,
+ type_config_map);
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration for schema C:
+ // {"schemaCprop1.schemaAprop1.schemaBprop2", "schemaCprop1.schemaAprop2",
+ // "schemaCprop2"}
+ //
+ // Indexable properties: {"schemaCprop2"}
+ SchemaPropertyIterator schema_c_iterator(schema_type_config_c,
+ type_config_map);
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_c_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest, SingleLevelCycleWithIndexableList) {
+ std::string schema_a = "A";
+ std::string schema_b = "B";
+
+ // Create schema with A -> B -> B -> B...
+ SchemaTypeConfigProto schema_type_config_a =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_a)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaAprop1")
+ .SetDataTypeDocument(
+ schema_b, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_b =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_b)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaBprop1")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaBprop2")
+ .SetDataTypeDocument(
+ schema_b, /*indexable_nested_properties_list=*/
+ {"schemaBprop1", "schemaBprop2.schemaBprop1",
+ "schemaBprop2.schemaBprop3",
+ "schemaBprop2.schemaBprop2.schemaBprop3"}))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaBprop3")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_a, schema_type_config_a}, {schema_b, schema_type_config_b}};
+
+ // Order of iteration and whether each property is indexable for schema A:
+ // {"schemaAprop1.schemaBprop1" (true),
+ // "schemaAprop1.schemaBprop2.schemaBprop1" (true),
+ // "schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop1" (true),
+ // "schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop1" (false),
+ // "schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop3" (true),
+ // "schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop3" (true),
+ // "schemaAprop1.schemaBprop2.schemaBprop3" (false),
+ // "schemaAprop1.schemaBprop3" (true),
+ // "schemaAprop2" (true)}
+ SchemaPropertyIterator schema_a_iterator(schema_type_config_a,
+ type_config_map);
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(0)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2.schemaBprop1"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(0)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop1"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(0)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop1"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(0)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop3"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(2)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2.schemaBprop2.schemaBprop3"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(2)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2.schemaBprop3"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(2)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop3"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(2)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration and whether each property is indexable for schema B:
+ // {"schemaBprop1" (true),
+ // "schemaBprop2.schemaBprop1" (true),
+ // "schemaBprop2.schemaBprop2.schemaBprop1" (true),
+ // "schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop1" (false),
+ // "schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop3" (true),
+ // "schemaBprop2.schemaBprop2.schemaBprop3" (true),
+ // "schemaBprop2.schemaBprop3" (false),
+ // "schemaBprop3" (true)}
+ SchemaPropertyIterator schema_b_iterator(schema_type_config_b,
+ type_config_map);
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop1"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(0)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop2.schemaBprop1"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(0)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop2.schemaBprop2.schemaBprop1"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(0)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop1"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(0)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop2.schemaBprop2.schemaBprop2.schemaBprop3"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(2)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop2.schemaBprop2.schemaBprop3"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(2)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop2.schemaBprop3"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(2)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop3"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(2)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest, MultipleCycles) {
+ std::string schema_a = "A";
+ std::string schema_b = "B";
+ std::string schema_c = "C";
+ std::string schema_d = "D";
+
+ // Create the following schema:
+ // D <--> A <--- C
+ // \ ^
+ // v /
+ // B
+ // Schema type A has two cycles: A-B-C-A and A-D-A
+ SchemaTypeConfigProto schema_type_config_a =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_a)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaAprop1")
+ .SetDataTypeDocument(
+ schema_b, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaAprop3")
+ .SetDataTypeDocument(
+ schema_d, /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_b =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_b)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaBprop1")
+ .SetDataTypeDocument(
+ schema_c, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaBprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_c =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_c)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaCprop1")
+ .SetDataTypeDocument(
+ schema_a, /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaCprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_d =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_d)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaDprop1")
+ .SetDataTypeDocument(
+ schema_a, /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaDprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_a, schema_type_config_a},
+ {schema_b, schema_type_config_b},
+ {schema_c, schema_type_config_c},
+ {schema_d, schema_type_config_d}};
+
+ // Order of iteration for schema A:
+ // {"schemaAprop1.schemaBprop1.schemaCprop2", "schemaAprop1.schemaBprop2",
+ // "schemaAprop2", "schemaAprop3.schemaDprop2"}, all indexable
+ SchemaPropertyIterator schema_a_iterator(schema_type_config_a,
+ type_config_map);
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration for schema B:
+ // {"schemaBprop1.schemaCprop1.schemaAprop2",
+ // "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2",
+ // "schemaBprop1.schemaCprop2", "schemaBprop2"}
+ //
+ // Indexable properties: {"schemaBprop1.schemaCprop2", "schemaBprop2"}
+ SchemaPropertyIterator schema_b_iterator(schema_type_config_b,
+ type_config_map);
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration for schema C:
+ // {"schemaCprop1.schemaAprop1.schemaBprop2", "schemaCprop1.schemaAprop2",
+ // "schemaCprop1.schemaAprop3.schemaDprop2", "schemaCprop2"}
+ //
+ // Indexable properties: {"schemaCprop2"}
+ SchemaPropertyIterator schema_c_iterator(schema_type_config_c,
+ type_config_map);
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_c_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration for schema D:
+ // {"schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2",
+ // "schemaDprop1.schemaAprop1.schemaBprop2", "schemaDprop1.schemaAprop2",
+ // "schemaDprop2"}
+ //
+ // Indexable properties: {"schemaDprop2"}
+ SchemaPropertyIterator schema_d_iterator(schema_type_config_d,
+ type_config_map);
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), Eq("schemaDprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_d_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest, MultipleCyclesWithIndexableList) {
+ std::string schema_a = "A";
+ std::string schema_b = "B";
+ std::string schema_c = "C";
+ std::string schema_d = "D";
+
+ // Create the following schema:
+ // D <--> A <--- C
+ // \ ^
+ // v /
+ // B
+ // Schema type A has two cycles: A-B-C-A and A-D-A
+ SchemaTypeConfigProto schema_type_config_a =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_a)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop1")
+ .SetDataTypeDocument(
+ schema_b, /*indexable_nested_properties_list=*/
+ {"schemaBprop2", "schemaBprop1.schemaCprop1.schemaAprop2",
+ "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2",
+ "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2",
+ "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1."
+ "schemaAprop2"}))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop3")
+ .SetDataTypeDocument(
+ schema_d, /*indexable_nested_properties_list=*/
+ {"schemaDprop2", "schemaDprop1.schemaAprop2",
+ "schemaDprop1.schemaAprop1.schemaBprop2",
+ "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2",
+ "schemaDprop1.schemaAprop3.schemaDprop2"}))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_b =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_b)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaBprop1")
+ .SetDataTypeDocument(
+ schema_c, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaBprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_c =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_c)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaCprop1")
+ .SetDataTypeDocument(
+ schema_a, /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaCprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_d =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_d)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaDprop1")
+ .SetDataTypeDocument(
+ schema_a, /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaDprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_a, schema_type_config_a},
+ {schema_b, schema_type_config_b},
+ {schema_c, schema_type_config_c},
+ {schema_d, schema_type_config_d}};
+
+ // Order of iteration and whether each property is indexable for schema A:
+ // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2" (true),
+ // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2" (true),
+ // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.
+ //     schemaAprop2" (true),
+ // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" (true),
+ // "schemaAprop1.schemaBprop1.schemaCprop2" (false),
+ // "schemaAprop1.schemaBprop2" (true),
+ // "schemaAprop2" (true),
+ // "schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" (true),
+ // "schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2" (true),
+ // "schemaAprop3.schemaDprop1.schemaAprop2" (true),
+ // "schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2" (true),
+ // "schemaAprop3.schemaDprop2" (true)
+ SchemaPropertyIterator schema_a_iterator(schema_type_config_a,
+ type_config_map);
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3."
+ "schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration and whether each property is indexable for schema B:
+ // "schemaBprop1.schemaCprop1.schemaAprop2" (false),
+ // "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" (false),
+ // "schemaBprop1.schemaCprop2" (true),
+ // "schemaBprop2" (true)
+ SchemaPropertyIterator schema_b_iterator(schema_type_config_b,
+ type_config_map);
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+  // Order of iteration and whether each property is indexable for schema C:
+ // "schemaCprop1.schemaAprop1.schemaBprop2" (false),
+ // "schemaCprop1.schemaAprop2" (false),
+ // "schemaCprop1.schemaAprop3.schemaDprop2" (false),
+ // "schemaCprop2" (true)
+ SchemaPropertyIterator schema_c_iterator(schema_type_config_c,
+ type_config_map);
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_c_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+  // Order of iteration and whether each property is indexable for schema D:
+ // "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" (false),
+ // "schemaDprop1.schemaAprop1.schemaBprop2" (false),
+ // "schemaDprop1.schemaAprop2" (false),
+ // "schemaDprop2" (true)
+ SchemaPropertyIterator schema_d_iterator(schema_type_config_d,
+ type_config_map);
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), Eq("schemaDprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_d_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest, MultipleCyclesWithIndexableList_allIndexTrue) {
+ std::string schema_a = "A";
+ std::string schema_b = "B";
+ std::string schema_c = "C";
+ std::string schema_d = "D";
+
+ // Create the following schema:
+ // D <--> A <--- C
+ // \ ^
+ // v /
+ // B
+ // Schema type A has two cycles: A-B-C-A and A-D-A
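+  //
+  // A rough sketch of the semantics these tests exercise (the expectations
+  // below are authoritative): a document property that supplies an
+  // indexable_nested_properties_list makes a nested property indexable iff
+  // the property path relative to that document property appears in the
+  // list, e.g.:
+  //
+  //   PropertyConfigBuilder()
+  //       .SetName("schemaAprop1")
+  //       .SetDataTypeDocument(schema_b, /*indexable_nested_properties_list=*/
+  //                            {"schemaBprop2"})  // only schemaBprop2
+  //
+  // whereas index_nested_properties=true/false applies a single flag to every
+  // nested property under that document property.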
+ SchemaTypeConfigProto schema_type_config_a =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_a)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop1")
+ .SetDataTypeDocument(
+ schema_b, /*indexable_nested_properties_list=*/
+ {"schemaBprop2", "schemaBprop1.schemaCprop1.schemaAprop2",
+ "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2",
+ "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2",
+ "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1."
+ "schemaAprop2"}))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop3")
+ .SetDataTypeDocument(
+ schema_d, /*indexable_nested_properties_list=*/
+ {"schemaDprop2", "schemaDprop1.schemaAprop2",
+ "schemaDprop1.schemaAprop1.schemaBprop2",
+ "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2",
+ "schemaDprop1.schemaAprop3.schemaDprop2"}))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_b =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_b)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaBprop1")
+ .SetDataTypeDocument(
+ schema_c, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaBprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_c =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_c)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaCprop1")
+ .SetDataTypeDocument(
+ schema_a, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaCprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_d =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_d)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaDprop1")
+ .SetDataTypeDocument(
+ schema_a, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaDprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_a, schema_type_config_a},
+ {schema_b, schema_type_config_b},
+ {schema_c, schema_type_config_c},
+ {schema_d, schema_type_config_d}};
+
+ // Order of iteration and whether each property is indexable for schema A:
+ // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2" (true),
+ // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2" (true),
+ // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2"
+ // (true), "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2"
+ // (true), "schemaAprop1.schemaBprop1.schemaCprop2" (false),
+ // "schemaAprop1.schemaBprop2" (true),
+ // "schemaAprop2" (true),
+ // "schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" (true),
+ // "schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2" (true),
+ // "schemaAprop3.schemaDprop1.schemaAprop2" (true),
+ // "schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2" (true),
+ // "schemaAprop3.schemaDprop2" (true)
+ SchemaPropertyIterator schema_a_iterator(schema_type_config_a,
+ type_config_map);
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3."
+ "schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration and whether each property is indexable for schema B:
+ // "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2"
+ // (true),
+ // "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2"
+ // (true),
+ // "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2"
+ // (true),
+ // "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2"
+ // (true), "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop2"
+ // (false), "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2" (true),
+ // "schemaBprop1.schemaCprop1.schemaAprop2" (true),
+ // "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2"
+ // (true),
+ // "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2"
+ // (true), "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2"
+ // (true),
+ // "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2"
+  // (true), "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" (true),
+  // "schemaBprop1.schemaCprop2" (true),
+  // "schemaBprop2" (true)
+
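+  // The only non-indexable path for schema B is
+  // "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop2":
+  // relative to schemaAprop1's indexable_nested_properties_list, the suffix
+  // "schemaBprop1.schemaCprop2" is not listed, so that leaf is not indexed.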
+ SchemaPropertyIterator schema_b_iterator(schema_type_config_b,
+ type_config_map);
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1."
+ "schemaCprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1."
+ "schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1."
+ "schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1."
+ "schemaCprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1."
+ "schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1."
+ "schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1."
+ "schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration and whether each property is indexable for schema C:
+ // "schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2"
+ // (true), "schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2"
+ // (true),
+ // "schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2"
+ // (true),
+ // "schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2"
+ // (true),
+ // "schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop2" (false),
+ // "schemaCprop1.schemaAprop1.schemaBprop2" (true),
+ // "schemaCprop1.schemaAprop2" (true),
+ // "schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2"
+ // (true),
+ // "schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2" (true),
+ // "schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2" (true),
+ // "schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2" (true),
+  // "schemaCprop1.schemaAprop3.schemaDprop2" (true),
+ // "schemaCprop2" (true)
+ SchemaPropertyIterator schema_c_iterator(schema_type_config_c,
+ type_config_map);
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1."
+ "schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1."
+ "schemaAprop3.schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop1."
+ "schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop1."
+ "schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_c_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+ // Order of iteration and whether each property is indexable for schema D:
+ // "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2"
+ // (true), "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2"
+ // (true),
+ // "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2"
+ // (true),
+ // "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2"
+ // (true), "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" (false),
+ // "schemaDprop1.schemaAprop1.schemaBprop2" (true),
+ // "schemaDprop1.schemaAprop2" (true),
+ // "schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2"
+ // (true), "schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2"
+ // (true), "schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop2" (true),
+ // "schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2" (true),
+ // "schemaDprop1.schemaAprop3.schemaDprop2" (true),
+ // "schemaDprop2" (true)
+ SchemaPropertyIterator schema_d_iterator(schema_type_config_d,
+ type_config_map);
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1."
+ "schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1."
+ "schemaAprop3.schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop1."
+ "schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop1."
+ "schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), Eq("schemaDprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_d_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest,
+ MultipleCyclesWithIndexableList_unknownPropPaths) {
+ std::string schema_a = "A";
+ std::string schema_b = "B";
+ std::string schema_c = "C";
+ std::string schema_d = "D";
+
+ // Create the following schema:
+ // D <--> A <--- C
+ // \ ^
+ // v /
+ // B
+ // Schema type A has two cycles: A-B-C-A and A-D-A
+ SchemaTypeConfigProto schema_type_config_a =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_a)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop1")
+ .SetDataTypeDocument(
+ schema_b, /*indexable_nested_properties_list=*/
+ {"schemaBprop2", "schemaBprop1.schemaCprop1.schemaAprop2",
+ "schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2",
+ "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2",
+ "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1."
+ "schemaAprop2",
+ "schemaBprop1.schemaCprop1",
+ "schemaBprop1.schemaCprop1.schemaAprop3", "schemaAprop2",
+ "schemaBprop2.schemaCprop2", "schemaBprop1.foo.bar",
+ "foo", "foo", "bar"}))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop3")
+ .SetDataTypeDocument(
+ schema_d, /*indexable_nested_properties_list=*/
+ {"schemaDprop2", "schemaDprop1.schemaAprop2",
+ "schemaDprop1.schemaAprop1.schemaBprop2",
+ "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2",
+ "schemaDprop1.schemaAprop3.schemaDprop2", "schemaBprop2",
+ "bar", "schemaDprop2.foo", "schemaDprop1",
+ "schemaAprop3.schemaDprop2"}))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_b =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_b)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaBprop1")
+ .SetDataTypeDocument(
+ schema_c, /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaBprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_c =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_c)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaCprop1")
+ .SetDataTypeDocument(
+ schema_a, /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaCprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_d =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_d)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaDprop1")
+ .SetDataTypeDocument(
+ schema_a, /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaDprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_a, schema_type_config_a},
+ {schema_b, schema_type_config_b},
+ {schema_c, schema_type_config_c},
+ {schema_d, schema_type_config_d}};
+
+ // Order of iteration and whether each property is indexable for schema A:
+ // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2" (true),
+ // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2" (true),
+ // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop1.schemaAprop2"
+ // (true), "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2"
+ // (true), "schemaAprop1.schemaBprop1.schemaCprop2" (false),
+ // "schemaAprop1.schemaBprop2" (true),
+ // "schemaAprop2" (true),
+ // "schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" (true),
+ // "schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2" (true),
+ // "schemaAprop3.schemaDprop1.schemaAprop2" (true),
+ // "schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2" (true),
+ // "schemaAprop3.schemaDprop2" (true)
+ //
+  // The following property paths listed in the
+  // indexable_nested_properties_list either are not defined in the schema or
+  // do not resolve to indexable leaf properties, so they are never seen
+  // during iteration. Instead, they should appear in the set returned by
+  // unknown_indexable_nested_property_paths().
+ // "schemaAprop1.bar",
+ // "schemaAprop1.foo",
+ // "schemaAprop1.schemaAprop2",
+ // "schemaAprop1.schemaBprop1.foo.bar",
+ // "schemaAprop1.schemaBprop1.schemaCprop1",
+ // "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3",
+ // "schemaAprop1.schemaBprop2.schemaCprop2",
+ // "schemaAprop3.bar",
+ // "schemaAprop3.schemaAprop3.schemaDprop2",
+ // "schemaAprop3.schemaBprop2",
+ // "schemaAprop3.schemaDprop1",
+ // "schemaAprop3.schemaDprop2.foo"
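+  //
+  // Note that the set below is deduplicated ("foo" is declared twice in
+  // schemaAprop1's list) and, as the ElementsAre expectation suggests, each
+  // unknown path is reported prefixed with the document property that
+  // declared it (e.g. "foo" -> "schemaAprop1.foo"), in sorted order.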
+ SchemaPropertyIterator schema_a_iterator(schema_type_config_a,
+ type_config_map);
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3."
+ "schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(
+ schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(
+ schema_a_iterator.unknown_indexable_nested_property_paths(),
+ ElementsAre(
+ "schemaAprop1.bar", "schemaAprop1.foo", "schemaAprop1.schemaAprop2",
+ "schemaAprop1.schemaBprop1.foo.bar",
+ "schemaAprop1.schemaBprop1.schemaCprop1",
+ "schemaAprop1.schemaBprop1.schemaCprop1.schemaAprop3",
+ "schemaAprop1.schemaBprop2.schemaCprop2", "schemaAprop3.bar",
+ "schemaAprop3.schemaAprop3.schemaDprop2", "schemaAprop3.schemaBprop2",
+ "schemaAprop3.schemaDprop1", "schemaAprop3.schemaDprop2.foo"));
+
+ // Order of iteration and whether each property is indexable for schema B:
+ // "schemaBprop1.schemaCprop1.schemaAprop2" (false),
+ // "schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2" (false),
+ // "schemaBprop1.schemaCprop2" (true),
+ // "schemaBprop2" (true)
+ SchemaPropertyIterator schema_b_iterator(schema_type_config_b,
+ type_config_map);
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(),
+ Eq("schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyPath(), Eq("schemaBprop2"));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_b_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_b_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_b_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+  // Order of iteration and whether each property is indexable for schema C:
+ // "schemaCprop1.schemaAprop1.schemaBprop2" (false),
+ // "schemaCprop1.schemaAprop2" (false),
+ // "schemaCprop1.schemaAprop3.schemaDprop2" (false),
+ // "schemaCprop2" (true)
+ SchemaPropertyIterator schema_c_iterator(schema_type_config_c,
+ type_config_map);
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(),
+ Eq("schemaCprop1.schemaAprop3.schemaDprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_c_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyPath(), Eq("schemaCprop2"));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_c_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_c_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_c_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+
+  // Order of iteration and whether each property is indexable for schema D:
+ // "schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2" (false),
+ // "schemaDprop1.schemaAprop1.schemaBprop2" (false),
+ // "schemaDprop1.schemaAprop2" (false),
+ // "schemaDprop2" (true)
+ SchemaPropertyIterator schema_d_iterator(schema_type_config_d,
+ type_config_map);
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop1.schemaBprop1.schemaCprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_c.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(),
+ Eq("schemaDprop1.schemaAprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_d_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyPath(), Eq("schemaDprop2"));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_d.properties(1)));
+ EXPECT_THAT(schema_d_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_d_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_d_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+TEST(SchemaPropertyIteratorTest, TopLevelCycleWithMultipleIndexableLists) {
+  std::string schema_a = "A";
+  std::string schema_b = "B";
+
+  // Create the following schema:
+  // A -> A (self-loop), A -> B
+  // Schema type A has a top-level document property that refers back to A
+  // itself, plus a document property of type B.
+ SchemaTypeConfigProto schema_type_config_a =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_a)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaAprop1")
+ .SetDataTypeDocument(
+ schema_b, /*indexable_nested_properties_list=*/
+ {"schemaBprop1", "schemaBprop2"}))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("schemaAprop2")
+ .SetDataTypeDocument(
+ schema_a, /*indexable_nested_properties_list=*/
+ {"schemaAprop1.schemaBprop2",
+ "schemaAprop1.schemaBprop3"}))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaAprop3")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto schema_type_config_b =
+ SchemaTypeConfigBuilder()
+ .SetType(schema_b)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaBprop1")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaBprop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("schemaBprop3")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+
+ SchemaUtil::TypeConfigMap type_config_map = {
+ {schema_a, schema_type_config_a}, {schema_b, schema_type_config_b}};
+
+  // Order of iteration and whether each property is indexable for schema A:
+ // "schemaAprop1.schemaBprop1" (true)
+ // "schemaAprop1.schemaBprop2" (true)
+ // "schemaAprop1.schemaBprop3" (false)
+ // "schemaAprop2.schemaAprop1.schemaBprop1" (false)
+ // "schemaAprop2.schemaAprop1.schemaBprop2" (true)
+ // "schemaAprop2.schemaAprop1.schemaBprop3" (true)
+ // "schemaAprop2.schemaAprop3" (false)
+ // "schemaAprop3" (true)
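+  //
+  // Because B is reached through two different indexable lists, the same leaf
+  // property can be indexable on one path and not on another:
+  // schemaAprop1's list contains "schemaBprop1" and "schemaBprop2", so
+  // "schemaAprop1.schemaBprop3" is not indexed, while schemaAprop2's list
+  // contains "schemaAprop1.schemaBprop2" and "schemaAprop1.schemaBprop3", so
+  // under that subtree schemaBprop3 is indexed and schemaBprop1 is not.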
+ SchemaPropertyIterator schema_a_iterator(schema_type_config_a,
+ type_config_map);
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop1"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(0)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop1.schemaBprop3"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(2)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop2.schemaAprop1.schemaBprop1"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(0)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop2.schemaAprop1.schemaBprop2"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(1)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop2.schemaAprop1.schemaBprop3"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_b.properties(2)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(),
+ Eq("schemaAprop2.schemaAprop3"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(2)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsFalse());
+
+ EXPECT_THAT(schema_a_iterator.Advance(), IsOk());
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyPath(), Eq("schemaAprop3"));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyConfig(),
+ EqualsProto(schema_type_config_a.properties(2)));
+ EXPECT_THAT(schema_a_iterator.GetCurrentPropertyIndexable(), IsTrue());
+
+ EXPECT_THAT(schema_a_iterator.Advance(),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+
+ EXPECT_THAT(schema_a_iterator.unknown_indexable_nested_property_paths(),
+ IsEmpty());
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/schema-store.cc b/icing/schema/schema-store.cc
index 34ccf22..e17e388 100644
--- a/icing/schema/schema-store.cc
+++ b/icing/schema/schema-store.cc
@@ -15,11 +15,14 @@
#include "icing/schema/schema-store.h"
#include <algorithm>
+#include <cinttypes>
#include <cstdint>
+#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
+#include <unordered_set>
#include <utility>
#include <vector>
@@ -27,15 +30,24 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/file/destructible-directory.h"
#include "icing/file/file-backed-proto.h"
#include "icing/file/filesystem.h"
+#include "icing/file/version-util.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
+#include "icing/proto/logging.pb.h"
#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/schema/backup-schema-producer.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/schema/property-util.h"
+#include "icing/schema/schema-type-manager.h"
#include "icing/schema/schema-util.h"
-#include "icing/schema/section-manager.h"
#include "icing/schema/section.h"
#include "icing/store/document-filter-data.h"
-#include "icing/store/key-mapper.h"
+#include "icing/store/dynamic-trie-key-mapper.h"
#include "icing/util/crc32.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
@@ -47,21 +59,27 @@ namespace {
constexpr char kSchemaStoreHeaderFilename[] = "schema_store_header";
constexpr char kSchemaFilename[] = "schema.pb";
+constexpr char kOverlaySchemaFilename[] = "overlay_schema.pb";
constexpr char kSchemaTypeMapperFilename[] = "schema_type_mapper";
-// A KeyMapper stores its data across 3 arrays internally. Giving each array
-// 128KiB for storage means the entire KeyMapper requires 384KiB.
+// A DynamicTrieKeyMapper stores its data across 3 arrays internally. Giving
+// each array 128KiB for storage means the entire DynamicTrieKeyMapper requires
+// 384KiB.
constexpr int32_t kSchemaTypeMapperMaxSize = 3 * 128 * 1024; // 384 KiB
-const std::string MakeHeaderFilename(const std::string& base_dir) {
+std::string MakeHeaderFilename(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kSchemaStoreHeaderFilename);
}
-const std::string MakeSchemaFilename(const std::string& base_dir) {
+std::string MakeSchemaFilename(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kSchemaFilename);
}
-const std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
+std::string MakeOverlaySchemaFilename(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", kOverlaySchemaFilename);
+}
+
+std::string MakeSchemaTypeMapperFilename(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kSchemaTypeMapperFilename);
}
@@ -102,30 +120,226 @@ std::unordered_set<SchemaTypeId> SchemaTypeIdsChanged(
} // namespace
+/* static */ libtextclassifier3::StatusOr<SchemaStore::Header>
+SchemaStore::Header::Read(const Filesystem* filesystem,
+ const std::string& path) {
+ Header header;
+ ScopedFd sfd(filesystem->OpenForRead(path.c_str()));
+ if (!sfd.is_valid()) {
+ return absl_ports::NotFoundError("SchemaStore header doesn't exist");
+ }
+
+ // If the file size equals sizeof(LegacyHeader), it must be a LegacyHeader.
+ int64_t file_size = filesystem->GetFileSize(sfd.get());
+ if (file_size == sizeof(LegacyHeader)) {
+ LegacyHeader legacy_header;
+ if (!filesystem->Read(path.c_str(), &legacy_header,
+ sizeof(legacy_header))) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Couldn't read: ", path));
+ }
+ if (legacy_header.magic != Header::kMagic) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Invalid header kMagic for file: ", path));
+ }
+ header.set_checksum(legacy_header.checksum);
+ } else if (file_size == sizeof(Header)) {
+ if (!filesystem->Read(path.c_str(), &header, sizeof(header))) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Couldn't read: ", path));
+ }
+ if (header.magic() != Header::kMagic) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Invalid header kMagic for file: ", path));
+ }
+ } else {
+ int legacy_header_size = sizeof(LegacyHeader);
+ int header_size = sizeof(Header);
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Unexpected header size %" PRId64 ". Expected %d or %d", file_size,
+ legacy_header_size, header_size));
+ }
+ return header;
+}
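An illustrative sketch (not part of the patch) of the size-based format detection in Header::Read above, using plain <cstdio> I/O and hypothetical stand-in structs instead of the icing Filesystem API:

  #include <cstdint>
  #include <cstdio>

  struct LegacySketch { int32_t magic; uint32_t checksum; };  // 8 bytes
  struct HeaderSketch {                                       // 1024 bytes
    int32_t magic;
    uint32_t checksum;
    uint8_t reserved[1016];
  };

  // Reads either format, normalizing a legacy file into the current struct.
  bool ReadHeaderSketch(const char* path, HeaderSketch* out) {
    std::FILE* f = std::fopen(path, "rb");
    if (f == nullptr) return false;
    std::fseek(f, 0, SEEK_END);
    long size = std::ftell(f);
    std::rewind(f);
    bool ok = false;
    if (size == static_cast<long>(sizeof(LegacySketch))) {
      LegacySketch legacy;
      ok = std::fread(&legacy, sizeof(legacy), 1, f) == 1;
      if (ok) *out = HeaderSketch{legacy.magic, legacy.checksum, {}};
    } else if (size == static_cast<long>(sizeof(HeaderSketch))) {
      ok = std::fread(out, sizeof(*out), 1, f) == 1;
    }  // Any other size is treated as corruption.
    std::fclose(f);
    return ok;
  }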
+
+libtextclassifier3::Status SchemaStore::Header::Write(
+ const Filesystem* filesystem, const std::string& path) {
+ ScopedFd scoped_fd(filesystem->OpenForWrite(path.c_str()));
+ // This should overwrite the header.
+ if (!scoped_fd.is_valid() ||
+ !filesystem->Write(scoped_fd.get(), this, sizeof(*this)) ||
+ !filesystem->DataSync(scoped_fd.get())) {
+ return absl_ports::InternalError(
+ absl_ports::StrCat("Failed to write SchemaStore header: ", path));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
- const Filesystem* filesystem, const std::string& base_dir) {
+ const Filesystem* filesystem, const std::string& base_dir,
+ const Clock* clock, InitializeStatsProto* initialize_stats) {
ICING_RETURN_ERROR_IF_NULL(filesystem);
+ ICING_RETURN_ERROR_IF_NULL(clock);
- std::unique_ptr<SchemaStore> schema_store =
- std::unique_ptr<SchemaStore>(new SchemaStore(filesystem, base_dir));
- ICING_RETURN_IF_ERROR(schema_store->Initialize());
+ if (!filesystem->DirectoryExists(base_dir.c_str())) {
+ return absl_ports::FailedPreconditionError(
+ "Schema store base directory does not exist!");
+ }
+ std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
+ new SchemaStore(filesystem, base_dir, clock));
+ ICING_RETURN_IF_ERROR(schema_store->Initialize(initialize_stats));
return schema_store;
}
-SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir)
- : filesystem_(*filesystem),
+libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> SchemaStore::Create(
+ const Filesystem* filesystem, const std::string& base_dir,
+ const Clock* clock, SchemaProto schema) {
+ ICING_RETURN_ERROR_IF_NULL(filesystem);
+ ICING_RETURN_ERROR_IF_NULL(clock);
+
+ if (!filesystem->DirectoryExists(base_dir.c_str())) {
+ return absl_ports::FailedPreconditionError(
+ "Schema store base directory does not exist!");
+ }
+ std::unique_ptr<SchemaStore> schema_store = std::unique_ptr<SchemaStore>(
+ new SchemaStore(filesystem, base_dir, clock));
+ ICING_RETURN_IF_ERROR(schema_store->Initialize(std::move(schema)));
+ return schema_store;
+}
+
+/* static */ libtextclassifier3::Status SchemaStore::DiscardOverlaySchema(
+ const Filesystem* filesystem, const std::string& base_dir, Header& header) {
+ std::string header_filename = MakeHeaderFilename(base_dir);
+ if (header.overlay_created()) {
+ header.SetOverlayInfo(
+ /*overlay_created=*/false,
+ /*min_overlay_version_compatibility=*/std::numeric_limits<
+ int32_t>::max());
+ ICING_RETURN_IF_ERROR(header.Write(filesystem, header_filename));
+ }
+ std::string schema_overlay_filename = MakeOverlaySchemaFilename(base_dir);
+ if (!filesystem->DeleteFile(schema_overlay_filename.c_str())) {
+ return absl_ports::InternalError(
+ "Unable to delete stale schema overlay file.");
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+/* static */ libtextclassifier3::Status SchemaStore::MigrateSchema(
+ const Filesystem* filesystem, const std::string& base_dir,
+ version_util::StateChange version_state_change, int32_t new_version) {
+ if (!filesystem->DirectoryExists(base_dir.c_str())) {
+ // Situations in which the schema store directory doesn't exist:
+ // - Initializing a new Icing instance: nothing to do now. The
+ // directory will be created later.
+ // - The schema store was lost: there is nothing we can do now. Recovery
+ // will be handled later during initialization.
+ //
+ // Either way, simply return OK here.
+ return libtextclassifier3::Status::OK;
+ }
+
+ std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir);
+ if (!filesystem->FileExists(overlay_schema_filename.c_str())) {
+ // The overlay doesn't exist, so there is nothing to migrate or clean up.
+ return libtextclassifier3::Status::OK;
+ }
+
+ std::string header_filename = MakeHeaderFilename(base_dir);
+ libtextclassifier3::StatusOr<Header> header_or;
+ switch (version_state_change) {
+ // No action is necessary for normal upgrades or when the version is
+ // unchanged. The data produced by the previous version is fully compatible
+ // with this version and there's no stale data for us to clean up.
+ // The same is true for a normal rollforward. A normal rollforward implies
+ // that the previous version was one that understood the concept of the
+ // overlay schema and would have already discarded it if it was unusable.
+ case version_util::StateChange::kVersionZeroUpgrade:
+ // fallthrough
+ case version_util::StateChange::kUpgrade:
+ // fallthrough
+ case version_util::StateChange::kRollForward:
+ // fallthrough
+ case version_util::StateChange::kCompatible:
+ return libtextclassifier3::Status::OK;
+ case version_util::StateChange::kVersionZeroRollForward:
+ // We've rolled forward. The schema overlay file, if it exists, is
+ // possibly stale. We must throw it out.
+ header_or = Header::Read(filesystem, header_filename);
+ if (!header_or.ok()) {
+ return header_or.status();
+ }
+ return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
+ header_or.ValueOrDie());
+ case version_util::StateChange::kRollBack:
+ header_or = Header::Read(filesystem, header_filename);
+ if (!header_or.ok()) {
+ return header_or.status();
+ }
+ if (header_or.ValueOrDie().min_overlay_version_compatibility() <=
+ new_version) {
+ // We've been rolled back, but the overlay schema claims that it
+ // supports this version. So we can safely return.
+ return libtextclassifier3::Status::OK;
+ }
+ // We've been rolled back to a version that the overlay schema doesn't
+ // support. We must throw it out.
+ return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
+ header_or.ValueOrDie());
+ case version_util::StateChange::kUndetermined:
+ // It's not clear what version we're on, but the base schema should always
+ // be safe to use. Throw out the overlay.
+ header_or = Header::Read(filesystem, header_filename);
+ if (!header_or.ok()) {
+ return header_or.status();
+ }
+ return SchemaStore::DiscardOverlaySchema(filesystem, base_dir,
+ header_or.ValueOrDie());
+ }
+ return libtextclassifier3::Status::OK;
+}
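A hypothetical call site for the migration hook above; fs, dir, and the version values here are assumptions standing in for the caller's context:

  ICING_RETURN_IF_ERROR(SchemaStore::MigrateSchema(
      &fs, dir, version_util::StateChange::kRollBack, /*new_version=*/1));

On kRollBack this reads the header and discards the overlay unless min_overlay_version_compatibility() shows the overlay still supports new_version.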
+
+/* static */ libtextclassifier3::Status SchemaStore::DiscardDerivedFiles(
+ const Filesystem* filesystem, const std::string& base_dir) {
+ // Schema type mapper
+ return DynamicTrieKeyMapper<SchemaTypeId>::Delete(
+ *filesystem, MakeSchemaTypeMapperFilename(base_dir));
+}
+
+SchemaStore::SchemaStore(const Filesystem* filesystem, std::string base_dir,
+ const Clock* clock)
+ : filesystem_(filesystem),
base_dir_(std::move(base_dir)),
- schema_file_(*filesystem, MakeSchemaFilename(base_dir_)) {}
+ clock_(clock),
+ schema_file_(std::make_unique<FileBackedProto<SchemaProto>>(
+ *filesystem, MakeSchemaFilename(base_dir_))) {}
SchemaStore::~SchemaStore() {
- if (initialized_) {
+ if (has_schema_successfully_set_ && schema_file_ != nullptr &&
+ schema_type_mapper_ != nullptr && schema_type_manager_ != nullptr) {
if (!PersistToDisk().ok()) {
ICING_LOG(ERROR) << "Error persisting to disk in SchemaStore destructor";
}
}
}
-libtextclassifier3::Status SchemaStore::Initialize() {
+libtextclassifier3::Status SchemaStore::Initialize(SchemaProto new_schema) {
+ ICING_RETURN_IF_ERROR(LoadSchema());
+ if (!absl_ports::IsNotFound(GetSchema().status())) {
+ return absl_ports::FailedPreconditionError(
+ "Incorrectly tried to initialize schema store with a new schema, when "
+ "one is already set!");
+ }
+ ICING_RETURN_IF_ERROR(schema_file_->Write(
+ std::make_unique<SchemaProto>(std::move(new_schema))));
+ return InitializeInternal(/*create_overlay_if_necessary=*/true,
+ /*initialize_stats=*/nullptr);
+}
+
+libtextclassifier3::Status SchemaStore::Initialize(
+ InitializeStatsProto* initialize_stats) {
+ ICING_RETURN_IF_ERROR(LoadSchema());
auto schema_proto_or = GetSchema();
if (absl_ports::IsNotFound(schema_proto_or.status())) {
// Don't have an existing schema proto, that's fine
@@ -134,124 +348,215 @@ libtextclassifier3::Status SchemaStore::Initialize() {
// Real error when trying to read the existing schema
return schema_proto_or.status();
}
+ return InitializeInternal(/*create_overlay_if_necessary=*/false,
+ initialize_stats);
+}
+
+libtextclassifier3::Status SchemaStore::LoadSchema() {
+ libtextclassifier3::StatusOr<Header> header_or =
+ Header::Read(filesystem_, MakeHeaderFilename(base_dir_));
+ bool header_exists = false;
+ if (!header_or.ok() && !absl_ports::IsNotFound(header_or.status())) {
+ return header_or.status();
+ } else if (!header_or.ok()) {
+ header_ = std::make_unique<Header>();
+ } else {
+ header_exists = true;
+ header_ = std::make_unique<Header>(std::move(header_or).ValueOrDie());
+ }
+
+ std::string overlay_schema_filename = MakeOverlaySchemaFilename(base_dir_);
+ bool overlay_schema_file_exists =
+ filesystem_->FileExists(overlay_schema_filename.c_str());
+
+ libtextclassifier3::Status base_schema_state = schema_file_->Read().status();
+ if (!base_schema_state.ok() && !absl_ports::IsNotFound(base_schema_state)) {
+ return base_schema_state;
+ }
+
+ // There are three valid cases:
+ // 1. Everything is missing. This is an empty schema store.
+ if (!base_schema_state.ok() && !overlay_schema_file_exists &&
+ !header_exists) {
+ return libtextclassifier3::Status::OK;
+ }
+
+ // 2. There never was an overlay schema. The header exists, the base schema
+ // exists, and the header says the overlay schema shouldn't exist.
+ if (base_schema_state.ok() && !overlay_schema_file_exists && header_exists &&
+ !header_->overlay_created()) {
+ // Nothing else to do. Just return safely.
+ return libtextclassifier3::Status::OK;
+ }
+
+ // 3. There is an overlay schema and a base schema and a header. The header
+ // says that the overlay schema should exist.
+ if (base_schema_state.ok() && overlay_schema_file_exists && header_exists &&
+ header_->overlay_created()) {
+ overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
+ *filesystem_, MakeOverlaySchemaFilename(base_dir_));
+ return libtextclassifier3::Status::OK;
+ }
+
+ // Something has gone wrong. We've lost part of the schema ground truth.
+ // Return an error.
+ bool overlay_created = header_->overlay_created();
+ bool base_schema_exists = base_schema_state.ok();
+ return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+ "Unable to properly load schema. Header {exists:%d, overlay_created:%d}, "
+ "base schema exists: %d, overlay_schema_exists: %d",
+ header_exists, overlay_created, base_schema_exists,
+ overlay_schema_file_exists));
+}
+
+libtextclassifier3::Status SchemaStore::InitializeInternal(
+ bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats) {
if (!InitializeDerivedFiles().ok()) {
ICING_VLOG(3)
<< "Couldn't find derived files or failed to initialize them, "
"regenerating derived files for SchemaStore.";
- ICING_RETURN_IF_ERROR(RegenerateDerivedFiles());
+ std::unique_ptr<Timer> regenerate_timer = clock_->GetNewTimer();
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_schema_store_recovery_cause(
+ InitializeStatsProto::IO_ERROR);
+ }
+ ICING_RETURN_IF_ERROR(RegenerateDerivedFiles(create_overlay_if_necessary));
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_schema_store_recovery_latency_ms(
+ regenerate_timer->GetElapsedMilliseconds());
+ }
}
- initialized_ = true;
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_num_schema_types(type_config_map_.size());
+ }
+ has_schema_successfully_set_ = true;
return libtextclassifier3::Status::OK;
}
libtextclassifier3::Status SchemaStore::InitializeDerivedFiles() {
- if (!HeaderExists()) {
- // Without a header, we don't know if things are consistent between each
- // other so the caller should just regenerate everything from ground truth.
- return absl_ports::InternalError("SchemaStore header doesn't exist");
- }
-
- SchemaStore::Header header;
- if (!filesystem_.Read(MakeHeaderFilename(base_dir_).c_str(), &header,
- sizeof(header))) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
- }
-
- if (header.magic != SchemaStore::Header::kMagic) {
- return absl_ports::InternalError(absl_ports::StrCat(
- "Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
- }
-
ICING_ASSIGN_OR_RETURN(
schema_type_mapper_,
- KeyMapper<SchemaTypeId>::Create(filesystem_,
- MakeSchemaTypeMapperFilename(base_dir_),
- kSchemaTypeMapperMaxSize));
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
+ kSchemaTypeMapperMaxSize));
ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
- if (checksum.Get() != header.checksum) {
+ if (checksum.Get() != header_->checksum()) {
return absl_ports::InternalError(
"Combined checksum of SchemaStore was inconsistent");
}
- // Update our in-memory data structures
- type_config_map_.clear();
- ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
- for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
- // Update our type_config_map_
- type_config_map_.emplace(type_config.schema_type(), type_config);
- }
- ICING_ASSIGN_OR_RETURN(
- section_manager_,
- SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
-
+ ICING_RETURN_IF_ERROR(BuildInMemoryCache());
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles() {
+libtextclassifier3::Status SchemaStore::RegenerateDerivedFiles(
+ bool create_overlay_if_necessary) {
ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
ICING_RETURN_IF_ERROR(ResetSchemaTypeMapper());
- type_config_map_.clear();
for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
- // Update our type_config_map_
- type_config_map_.emplace(type_config.schema_type(), type_config);
-
// Assign a SchemaTypeId to the type
ICING_RETURN_IF_ERROR(schema_type_mapper_->Put(
type_config.schema_type(), schema_type_mapper_->num_keys()));
}
-
- ICING_ASSIGN_OR_RETURN(
- section_manager_,
- SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
+ ICING_RETURN_IF_ERROR(BuildInMemoryCache());
+
+ if (create_overlay_if_necessary) {
+ ICING_ASSIGN_OR_RETURN(
+ BackupSchemaProducer producer,
+ BackupSchemaProducer::Create(*schema_proto,
+ schema_type_manager_->section_manager()));
+
+ if (producer.is_backup_necessary()) {
+ SchemaProto base_schema = std::move(producer).Produce();
+
+ // The overlay schema should be written to the overlay file location.
+ overlay_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
+ *filesystem_, MakeOverlaySchemaFilename(base_dir_));
+ auto schema_ptr = std::make_unique<SchemaProto>(std::move(*schema_proto));
+ ICING_RETURN_IF_ERROR(overlay_schema_file_->Write(std::move(schema_ptr)));
+
+ // The base schema should be written to the original file
+ auto base_schema_ptr =
+ std::make_unique<SchemaProto>(std::move(base_schema));
+ ICING_RETURN_IF_ERROR(schema_file_->Write(std::move(base_schema_ptr)));
+
+ // LINT.IfChange(min_overlay_version_compatibility)
+ // Although the current version is 3, the schema is compatible with
+ // version 1, so min_overlay_version_compatibility should be 1.
+ int32_t min_overlay_version_compatibility = version_util::kVersionOne;
+ // LINT.ThenChange(//depot/google3/icing/file/version-util.h:kVersion)
+ header_->SetOverlayInfo(
+ /*overlay_created=*/true, min_overlay_version_compatibility);
+ // Rebuild in memory data - references to the old schema will be invalid
+ // now.
+ ICING_RETURN_IF_ERROR(BuildInMemoryCache());
+ }
+ }
// Write the header
ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
- ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
-
- return libtextclassifier3::Status::OK;
-}
-
-bool SchemaStore::HeaderExists() {
- if (!filesystem_.FileExists(MakeHeaderFilename(base_dir_).c_str())) {
- return false;
- }
-
- int64_t file_size =
- filesystem_.GetFileSize(MakeHeaderFilename(base_dir_).c_str());
-
- // If it's been truncated to size 0 before, we consider it to be a new file
- return file_size != 0 && file_size != Filesystem::kBadFileSize;
+ header_->set_checksum(checksum.Get());
+ return header_->Write(filesystem_, MakeHeaderFilename(base_dir_));
}
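For reference, the on-disk layout this produces when a backup schema is necessary (paths follow from the filename constants above; contents illustrative):

  <base_dir>/schema.pb            - backup ("base") schema readable by older code
  <base_dir>/overlay_schema.pb    - full schema for the current version
  <base_dir>/schema_store_header  - overlay_created=true,
                                    min_overlay_version_compatibility=1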
-libtextclassifier3::Status SchemaStore::UpdateHeader(const Crc32& checksum) {
- // Write the header
- SchemaStore::Header header;
- header.magic = SchemaStore::Header::kMagic;
- header.checksum = checksum.Get();
+libtextclassifier3::Status SchemaStore::BuildInMemoryCache() {
+ ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, GetSchema());
+ ICING_ASSIGN_OR_RETURN(
+ SchemaUtil::InheritanceMap inheritance_map,
+ SchemaUtil::BuildTransitiveInheritanceGraph(*schema_proto));
- // This should overwrite the header.
- if (!filesystem_.Write(MakeHeaderFilename(base_dir_).c_str(), &header,
- sizeof(header))) {
- return absl_ports::InternalError(absl_ports::StrCat(
- "Failed to write SchemaStore header: ", MakeHeaderFilename(base_dir_)));
+ reverse_schema_type_mapper_.clear();
+ type_config_map_.clear();
+ schema_subtype_id_map_.clear();
+ for (const SchemaTypeConfigProto& type_config : schema_proto->types()) {
+ std::string_view type_name = type_config.schema_type();
+ ICING_ASSIGN_OR_RETURN(SchemaTypeId type_id,
+ schema_type_mapper_->Get(type_name));
+
+ // Build reverse_schema_type_mapper_
+ reverse_schema_type_mapper_.insert({type_id, std::string(type_name)});
+
+ // Build type_config_map_
+ type_config_map_.insert({std::string(type_name), type_config});
+
+ // Build schema_subtype_id_map_
+ std::unordered_set<SchemaTypeId>& subtype_id_set =
+ schema_subtype_id_map_[type_id];
+ // Find all child types
+ auto child_types_names = inheritance_map.find(type_name);
+ if (child_types_names != inheritance_map.end()) {
+ subtype_id_set.reserve(child_types_names->second.size() + 1);
+ for (const auto& [child_type_name, is_direct_child] :
+ child_types_names->second) {
+ ICING_ASSIGN_OR_RETURN(SchemaTypeId child_type_id,
+ schema_type_mapper_->Get(child_type_name));
+ subtype_id_set.insert(child_type_id);
+ }
+ }
+ // Every type is a subtype of itself.
+ subtype_id_set.insert(type_id);
}
+
+ // Build schema_type_manager_
+ ICING_ASSIGN_OR_RETURN(
+ schema_type_manager_,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
return libtextclassifier3::Status::OK;
}
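A concrete trace of the subtype bookkeeping above, assuming a hypothetical schema in which Artist extends Person and ids were assigned Person=0, Artist=1:

  inheritance_map["Person"] contains {"Artist", is_direct_child=true}
  schema_subtype_id_map_[0] == {0, 1}  // Person: itself plus Artist
  schema_subtype_id_map_[1] == {1}     // Artist: itself only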
libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
schema_type_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
- libtextclassifier3::Status status = KeyMapper<SchemaTypeId>::Delete(
- filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
+ libtextclassifier3::Status status =
+ DynamicTrieKeyMapper<SchemaTypeId>::Delete(
+ *filesystem_, MakeSchemaTypeMapperFilename(base_dir_));
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
<< "Failed to delete old schema_type mapper";
@@ -259,33 +564,40 @@ libtextclassifier3::Status SchemaStore::ResetSchemaTypeMapper() {
}
ICING_ASSIGN_OR_RETURN(
schema_type_mapper_,
- KeyMapper<SchemaTypeId>::Create(filesystem_,
- MakeSchemaTypeMapperFilename(base_dir_),
- kSchemaTypeMapperMaxSize));
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ *filesystem_, MakeSchemaTypeMapperFilename(base_dir_),
+ kSchemaTypeMapperMaxSize));
return libtextclassifier3::Status::OK;
}
libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const {
- Crc32 total_checksum;
-
- auto schema_proto_or = GetSchema();
+ // Base schema checksum
+ auto schema_proto_or = schema_file_->Read();
if (absl_ports::IsNotFound(schema_proto_or.status())) {
- // Nothing to checksum
- return total_checksum;
- } else if (!schema_proto_or.ok()) {
- // Some real error. Pass it up
- return schema_proto_or.status();
+ return Crc32();
}
-
- // Guaranteed to have a schema proto now
- const SchemaProto* schema_proto = schema_proto_or.ValueOrDie();
+ ICING_ASSIGN_OR_RETURN(const SchemaProto* schema_proto, schema_proto_or);
Crc32 schema_checksum;
schema_checksum.Append(schema_proto->SerializeAsString());
- Crc32 schema_type_mapper_checksum = schema_type_mapper_->ComputeChecksum();
+ Crc32 overlay_schema_checksum;
+ if (overlay_schema_file_ != nullptr) {
+ // Read the overlay (not the base) schema for the overlay checksum.
+ auto schema_proto_or = overlay_schema_file_->Read();
+ if (schema_proto_or.ok()) {
+ ICING_ASSIGN_OR_RETURN(schema_proto, schema_proto_or);
+ overlay_schema_checksum.Append(schema_proto->SerializeAsString());
+ }
+ }
+
+ ICING_ASSIGN_OR_RETURN(Crc32 schema_type_mapper_checksum,
+ schema_type_mapper_->ComputeChecksum());
+ Crc32 total_checksum;
total_checksum.Append(std::to_string(schema_checksum.Get()));
+ if (overlay_schema_file_ != nullptr) {
+ total_checksum.Append(std::to_string(overlay_schema_checksum.Get()));
+ }
total_checksum.Append(std::to_string(schema_type_mapper_checksum.Get()));
return total_checksum;
@@ -293,7 +605,10 @@ libtextclassifier3::StatusOr<Crc32> SchemaStore::ComputeChecksum() const {
libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
const {
- return schema_file_.Read();
+ if (overlay_schema_file_ != nullptr) {
+ return overlay_schema_file_->Read();
+ }
+ return schema_file_->Read();
}
// TODO(cassiewang): Consider removing this definition of SetSchema if it's not
@@ -302,19 +617,29 @@ libtextclassifier3::StatusOr<const SchemaProto*> SchemaStore::GetSchema()
// SetSchema(SchemaProto&& new_schema)
libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SchemaStore::SetSchema(const SchemaProto& new_schema,
- bool ignore_errors_and_delete_documents) {
- return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents);
+ bool ignore_errors_and_delete_documents,
+ bool allow_circular_schema_definitions) {
+ return SetSchema(SchemaProto(new_schema), ignore_errors_and_delete_documents,
+ allow_circular_schema_definitions);
}
libtextclassifier3::StatusOr<const SchemaStore::SetSchemaResult>
SchemaStore::SetSchema(SchemaProto&& new_schema,
- bool ignore_errors_and_delete_documents) {
+ bool ignore_errors_and_delete_documents,
+ bool allow_circular_schema_definitions) {
+ ICING_ASSIGN_OR_RETURN(
+ SchemaUtil::DependentMap new_dependent_map,
+ SchemaUtil::Validate(new_schema, allow_circular_schema_definitions));
+
SetSchemaResult result;
auto schema_proto_or = GetSchema();
if (absl_ports::IsNotFound(schema_proto_or.status())) {
// We don't have a pre-existing schema, so anything is valid.
result.success = true;
+ for (const SchemaTypeConfigProto& type_config : new_schema.types()) {
+ result.schema_types_new_by_name.insert(type_config.schema_type());
+ }
} else if (!schema_proto_or.ok()) {
// Real error
return schema_proto_or.status();
@@ -332,10 +657,16 @@ SchemaStore::SetSchema(SchemaProto&& new_schema,
// Different schema, track the differences and see if we can still write it
SchemaUtil::SchemaDelta schema_delta =
- SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema);
+ SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ new_dependent_map);
- // An incompatible index is fine, we can just reindex
- result.index_incompatible = schema_delta.index_incompatible;
+ result.schema_types_new_by_name = std::move(schema_delta.schema_types_new);
+ result.schema_types_changed_fully_compatible_by_name =
+ std::move(schema_delta.schema_types_changed_fully_compatible);
+ result.schema_types_index_incompatible_by_name =
+ std::move(schema_delta.schema_types_index_incompatible);
+ result.schema_types_join_incompatible_by_name =
+ std::move(schema_delta.schema_types_join_incompatible);
for (const auto& schema_type : schema_delta.schema_types_deleted) {
// We currently don't support deletions, so mark this as not possible.
@@ -370,26 +701,70 @@ SchemaStore::SetSchema(SchemaProto&& new_schema,
result.success = result.success || ignore_errors_and_delete_documents;
if (result.success) {
- // Write the schema (and potentially overwrite a previous schema)
- ICING_RETURN_IF_ERROR(
- schema_file_.Write(std::make_unique<SchemaProto>(new_schema)));
-
- ICING_RETURN_IF_ERROR(RegenerateDerivedFiles());
+ ICING_RETURN_IF_ERROR(ApplySchemaChange(std::move(new_schema)));
+ has_schema_successfully_set_ = true;
}
return result;
}
-libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
-SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
- auto schema_proto_or = GetSchema();
- if (absl_ports::IsNotFound(schema_proto_or.status())) {
- return absl_ports::FailedPreconditionError("Schema not set yet.");
- } else if (!schema_proto_or.ok()) {
- // Some other real error, pass it up
- return schema_proto_or.status();
+libtextclassifier3::Status SchemaStore::ApplySchemaChange(
+ SchemaProto new_schema) {
+ // We need to ensure that we either 1) successfully set the schema and
+ // update all derived data structures or 2) fail and leave the schema store
+ // unchanged.
+ // So, first, we create an empty temporary directory to build a new schema
+ // store in.
+ std::string temp_schema_store_dir_path = base_dir_ + "_temp";
+ if (!filesystem_->DeleteDirectoryRecursively(
+ temp_schema_store_dir_path.c_str())) {
+ ICING_LOG(ERROR) << "Recursively deleting "
+ << temp_schema_store_dir_path.c_str();
+ return absl_ports::InternalError(
+ "Unable to delete temp directory to prepare to build new schema "
+ "store.");
+ }
+
+ DestructibleDirectory temp_schema_store_dir(
+ filesystem_, std::move(temp_schema_store_dir_path));
+ if (!temp_schema_store_dir.is_valid()) {
+ return absl_ports::InternalError(
+ "Unable to create temp directory to build new schema store.");
+ }
+
+ // Then we create our new schema store with the new schema.
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<SchemaStore> new_schema_store,
+ SchemaStore::Create(filesystem_, temp_schema_store_dir.dir(), clock_,
+ std::move(new_schema)));
+
+ // Then we swap the new schema file + new derived files with the old files.
+ if (!filesystem_->SwapFiles(base_dir_.c_str(),
+ temp_schema_store_dir.dir().c_str())) {
+ return absl_ports::InternalError(
+ "Unable to apply new schema due to failed swap!");
+ }
+
+ std::string old_base_dir = std::move(base_dir_);
+ *this = std::move(*new_schema_store);
+
+ // After the std::move, the filepaths saved in this instance and in the
+ // schema_file_ instance will still be the ones from temp_schema_store_dir
+ // even though they now point to files that are within old_base_dir.
+ // Manually set them to the correct paths.
+ base_dir_ = std::move(old_base_dir);
+ schema_file_->SetSwappedFilepath(MakeSchemaFilename(base_dir_));
+ if (overlay_schema_file_ != nullptr) {
+ overlay_schema_file_->SetSwappedFilepath(
+ MakeOverlaySchemaFilename(base_dir_));
}
+ return libtextclassifier3::Status::OK;
+}
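The pattern above, reduced to its skeleton (names illustrative; error handling elided). The key property is that nothing observable changes unless SwapFiles succeeds:

  std::string temp_dir = base_dir_ + "_temp";
  filesystem_->DeleteDirectoryRecursively(temp_dir.c_str());  // start clean
  DestructibleDirectory scoped_temp(filesystem_, temp_dir);   // auto-cleanup
  // ... build the complete new store under temp_dir ...
  filesystem_->SwapFiles(base_dir_.c_str(), temp_dir.c_str());
  // base_dir_ now holds the new state; the old state sits in temp_dir and is
  // deleted when scoped_temp goes out of scope.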
+
+libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
+SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
+ ICING_RETURN_IF_ERROR(CheckSchemaSet());
const auto& type_config_iter =
type_config_map_.find(std::string(schema_type));
if (type_config_iter == type_config_map_.end()) {
@@ -401,44 +776,211 @@ SchemaStore::GetSchemaTypeConfig(std::string_view schema_type) const {
libtextclassifier3::StatusOr<SchemaTypeId> SchemaStore::GetSchemaTypeId(
std::string_view schema_type) const {
+ ICING_RETURN_IF_ERROR(CheckSchemaSet());
return schema_type_mapper_->Get(schema_type);
}
-libtextclassifier3::StatusOr<std::vector<std::string>>
-SchemaStore::GetSectionContent(const DocumentProto& document,
- std::string_view section_path) const {
- return section_manager_->GetSectionContent(document, section_path);
+libtextclassifier3::StatusOr<const std::string*> SchemaStore::GetSchemaType(
+ SchemaTypeId schema_type_id) const {
+ ICING_RETURN_IF_ERROR(CheckSchemaSet());
+ if (const auto it = reverse_schema_type_mapper_.find(schema_type_id);
+ it == reverse_schema_type_mapper_.end()) {
+ return absl_ports::InvalidArgumentError("Invalid schema type id");
+ } else {
+ return &it->second;
+ }
}
-libtextclassifier3::StatusOr<std::vector<std::string>>
-SchemaStore::GetSectionContent(const DocumentProto& document,
- SectionId section_id) const {
- return section_manager_->GetSectionContent(document, section_id);
+libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
+SchemaStore::GetSchemaTypeIdsWithChildren(std::string_view schema_type) const {
+ ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
+ GetSchemaTypeId(schema_type));
+ auto iter = schema_subtype_id_map_.find(schema_type_id);
+ if (iter == schema_subtype_id_map_.end()) {
+ // This should never happen, unless there is an inconsistency or IO error.
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Schema type '", schema_type, "' is not found in the subtype map."));
+ }
+ return &iter->second;
}
libtextclassifier3::StatusOr<const SectionMetadata*>
SchemaStore::GetSectionMetadata(SchemaTypeId schema_type_id,
SectionId section_id) const {
- return section_manager_->GetSectionMetadata(schema_type_id, section_id);
+ ICING_RETURN_IF_ERROR(CheckSchemaSet());
+ return schema_type_manager_->section_manager().GetSectionMetadata(
+ schema_type_id, section_id);
}
-libtextclassifier3::StatusOr<std::vector<Section>> SchemaStore::ExtractSections(
+libtextclassifier3::StatusOr<SectionGroup> SchemaStore::ExtractSections(
const DocumentProto& document) const {
- return section_manager_->ExtractSections(document);
+ ICING_RETURN_IF_ERROR(CheckSchemaSet());
+ return schema_type_manager_->section_manager().ExtractSections(document);
+}
+
+libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
+SchemaStore::GetJoinablePropertyMetadata(
+ SchemaTypeId schema_type_id, const std::string& property_path) const {
+ ICING_RETURN_IF_ERROR(CheckSchemaSet());
+ return schema_type_manager_->joinable_property_manager()
+ .GetJoinablePropertyMetadata(schema_type_id, property_path);
+}
+
+libtextclassifier3::StatusOr<JoinablePropertyGroup>
+SchemaStore::ExtractJoinableProperties(const DocumentProto& document) const {
+ ICING_RETURN_IF_ERROR(CheckSchemaSet());
+ return schema_type_manager_->joinable_property_manager()
+ .ExtractJoinableProperties(document);
}
libtextclassifier3::Status SchemaStore::PersistToDisk() {
- if (schema_type_mapper_ != nullptr) {
- // It's possible we haven't had a schema set yet, so SchemaTypeMapper hasn't
- // been initialized and is still a nullptr
- ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
+ if (!has_schema_successfully_set_) {
+ return libtextclassifier3::Status::OK;
}
-
+ ICING_RETURN_IF_ERROR(schema_type_mapper_->PersistToDisk());
// Write the header
ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
- ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
+ header_->set_checksum(checksum.Get());
+ return header_->Write(filesystem_, MakeHeaderFilename(base_dir_));
+}
- return libtextclassifier3::Status::OK;
+SchemaStoreStorageInfoProto SchemaStore::GetStorageInfo() const {
+ SchemaStoreStorageInfoProto storage_info;
+ int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
+ storage_info.set_schema_store_size(
+ Filesystem::SanitizeFileSize(directory_size));
+ ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema(), storage_info);
+ storage_info.set_num_schema_types(schema->types_size());
+ int total_sections = 0;
+ int num_types_sections_exhausted = 0;
+ for (const SchemaTypeConfigProto& type : schema->types()) {
+ auto sections_list_or =
+ schema_type_manager_->section_manager().GetMetadataList(
+ type.schema_type());
+ if (!sections_list_or.ok()) {
+ continue;
+ }
+ total_sections += sections_list_or.ValueOrDie()->size();
+ if (sections_list_or.ValueOrDie()->size() == kTotalNumSections) {
+ ++num_types_sections_exhausted;
+ }
+ }
+
+ storage_info.set_num_total_sections(total_sections);
+ storage_info.set_num_schema_types_sections_exhausted(
+ num_types_sections_exhausted);
+ return storage_info;
+}
+
+libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
+SchemaStore::GetSectionMetadata(const std::string& schema_type) const {
+ return schema_type_manager_->section_manager().GetMetadataList(schema_type);
+}
+
+bool SchemaStore::IsPropertyDefinedInSchema(
+ SchemaTypeId schema_type_id, const std::string& property_path) const {
+ auto schema_name_itr = reverse_schema_type_mapper_.find(schema_type_id);
+ if (schema_name_itr == reverse_schema_type_mapper_.end()) {
+ return false;
+ }
+ const std::string* current_type_name = &schema_name_itr->second;
+
+ std::vector<std::string_view> property_path_parts =
+ property_util::SplitPropertyPathExpr(property_path);
+ for (int i = 0; i < property_path_parts.size(); ++i) {
+ auto type_config_itr = type_config_map_.find(*current_type_name);
+ if (type_config_itr == type_config_map_.end()) {
+ return false;
+ }
+ std::string_view property_name = property_path_parts.at(i);
+ const PropertyConfigProto* selected_property = nullptr;
+ for (const PropertyConfigProto& property :
+ type_config_itr->second.properties()) {
+ if (property.property_name() == property_name) {
+ selected_property = &property;
+ break;
+ }
+ }
+ if (selected_property == nullptr) {
+ return false;
+ }
+ if (i == property_path_parts.size() - 1) {
+ // We've found a property at the final part of the path.
+ return true;
+ }
+ if (selected_property->data_type() !=
+ PropertyConfigProto::DataType::DOCUMENT) {
+ // This isn't the final part of the path, but the property isn't a
+ // document, so we know that this path doesn't exist.
+ return false;
+ }
+ current_type_name = &selected_property->schema_type();
+ }
+
+ // We should never reach this point.
+ return false;
+}
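Hypothetical usage of the path walk above, assuming a type "Email" (id 0) whose "sender" property is a DOCUMENT of type "Person", and "Person" has a string property "name":

  store->IsPropertyDefinedInSchema(0, "sender.name");    // true
  store->IsPropertyDefinedInSchema(0, "sender.phone");   // false: no such leaf
  store->IsPropertyDefinedInSchema(0, "sender.name.x");  // false: "name" is
                                                         // not a document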
+
+libtextclassifier3::StatusOr<SchemaDebugInfoProto> SchemaStore::GetDebugInfo()
+ const {
+ SchemaDebugInfoProto debug_info;
+ if (has_schema_successfully_set_) {
+ ICING_ASSIGN_OR_RETURN(const SchemaProto* schema, GetSchema());
+ *debug_info.mutable_schema() = *schema;
+ }
+ ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
+ debug_info.set_crc(crc.Get());
+ return debug_info;
+}
+
+std::vector<SchemaStore::ExpandedTypePropertyMask>
+SchemaStore::ExpandTypePropertyMasks(
+ const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
+ const {
+ std::unordered_map<SchemaTypeId, ExpandedTypePropertyMask> result_map;
+ for (const TypePropertyMask& type_field_mask : type_property_masks) {
+ if (type_field_mask.schema_type() == kSchemaTypeWildcard) {
+ ExpandedTypePropertyMask entry{type_field_mask.schema_type(),
+ /*paths=*/{}};
+ entry.paths.insert(type_field_mask.paths().begin(),
+ type_field_mask.paths().end());
+ result_map.insert({kInvalidSchemaTypeId, std::move(entry)});
+ } else {
+ auto schema_type_ids_or =
+ GetSchemaTypeIdsWithChildren(type_field_mask.schema_type());
+ // If we can't find the SchemaTypeIds, just throw it away
+ if (!schema_type_ids_or.ok()) {
+ continue;
+ }
+ const std::unordered_set<SchemaTypeId>* schema_type_ids =
+ schema_type_ids_or.ValueOrDie();
+ for (SchemaTypeId schema_type_id : *schema_type_ids) {
+ auto schema_type_name_iter =
+ reverse_schema_type_mapper_.find(schema_type_id);
+ if (schema_type_name_iter == reverse_schema_type_mapper_.end()) {
+ // This should never happen, unless there is an inconsistency or IO
+ // error.
+ ICING_LOG(ERROR) << "Got unknown schema type id: " << schema_type_id;
+ continue;
+ }
+
+ auto iter = result_map.find(schema_type_id);
+ if (iter == result_map.end()) {
+ ExpandedTypePropertyMask entry{schema_type_name_iter->second,
+ /*paths=*/{}};
+ iter = result_map.insert({schema_type_id, std::move(entry)}).first;
+ }
+ iter->second.paths.insert(type_field_mask.paths().begin(),
+ type_field_mask.paths().end());
+ }
+ }
+ }
+ std::vector<ExpandedTypePropertyMask> result;
+ result.reserve(result_map.size());
+ for (auto& entry : result_map) {
+ result.push_back(std::move(entry.second));
+ }
+ return result;
}
} // namespace lib
diff --git a/icing/schema/schema-store.h b/icing/schema/schema-store.h
index f5c6588..88968b1 100644
--- a/icing/schema/schema-store.h
+++ b/icing/schema/schema-store.h
@@ -16,23 +16,34 @@
#define ICING_SCHEMA_SCHEMA_STORE_H_
#include <cstdint>
+#include <cstring>
+#include <limits>
#include <memory>
#include <string>
#include <string_view>
+#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
#include "icing/file/file-backed-proto.h"
#include "icing/file/filesystem.h"
+#include "icing/file/version-util.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
+#include "icing/proto/logging.pb.h"
#include "icing/proto/schema.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/schema/schema-type-manager.h"
#include "icing/schema/schema-util.h"
-#include "icing/schema/section-manager.h"
#include "icing/schema/section.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/key-mapper.h"
+#include "icing/util/clock.h"
#include "icing/util/crc32.h"
namespace icing {
@@ -45,9 +56,7 @@ namespace lib {
// should always call Get* from the SchemaStore.
class SchemaStore {
public:
- struct Header {
- static constexpr int32_t kMagic = 0x72650d0a;
-
+ struct LegacyHeader {
// Holds the magic as a quick sanity check against file corruption.
int32_t magic;
@@ -55,6 +64,69 @@ class SchemaStore {
uint32_t checksum;
};
+ class Header {
+ public:
+ static constexpr int32_t kMagic = 0x72650d0a;
+
+ explicit Header()
+ : magic_(kMagic),
+ checksum_(0),
+ overlay_created_(false),
+ min_overlay_version_compatibility_(
+ std::numeric_limits<int32_t>::max()) {
+ memset(overlay_created_padding_, 0, kOverlayCreatedPaddingSize);
+ memset(padding_, 0, kPaddingSize);
+ }
+
+ // RETURNS:
+ // - On success, a valid Header instance
+ // - NOT_FOUND if header file doesn't exist
+ // - INTERNAL if unable to read header
+ static libtextclassifier3::StatusOr<Header> Read(
+ const Filesystem* filesystem, const std::string& path);
+
+ libtextclassifier3::Status Write(const Filesystem* filesystem,
+ const std::string& path);
+
+ int32_t magic() const { return magic_; }
+
+ uint32_t checksum() const { return checksum_; }
+ void set_checksum(uint32_t checksum) { checksum_ = checksum; }
+
+ bool overlay_created() const { return overlay_created_; }
+
+ int32_t min_overlay_version_compatibility() const {
+ return min_overlay_version_compatibility_;
+ }
+
+ void SetOverlayInfo(bool overlay_created,
+ int32_t min_overlay_version_compatibility) {
+ overlay_created_ = overlay_created;
+ min_overlay_version_compatibility_ = min_overlay_version_compatibility;
+ }
+
+ private:
+ // Holds the magic as a quick sanity check against file corruption.
+ int32_t magic_;
+
+ // Checksum of the SchemaStore's sub-component's checksums.
+ uint32_t checksum_;
+
+ bool overlay_created_;
+ // Three bytes of padding: min_overlay_version_compatibility_ has
+ // alignof() == 4 and overlay_created_padding_ begins at offset 9.
+ static constexpr int kOverlayCreatedPaddingSize = 3;
+ uint8_t overlay_created_padding_[kOverlayCreatedPaddingSize];
+
+ int32_t min_overlay_version_compatibility_;
+
+ static constexpr int kPaddingSize = 1008;
+ // Padding exists just to reserve space for additional values.
+ uint8_t padding_[kPaddingSize];
+ };
+ static_assert(sizeof(Header) == 1024);
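The arithmetic behind the static_assert, spelled out from the member order above (verifiable with offsetof): magic_ at offset 0 (4 bytes), checksum_ at 4 (4), overlay_created_ at 8 (1), overlay_created_padding_ at 9-11 (3), min_overlay_version_compatibility_ at 12 (4), padding_ at 16 (1008); 16 + 1008 == 1024.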
+
// Holds information on what may have been affected by the new schema. This is
// generally data that other classes may depend on from the SchemaStore,
// so that we can know if we should go update those classes as well.
@@ -65,9 +137,6 @@ class SchemaStore {
// to file.
bool success = false;
- // Whether the new schema changes invalidate the index.
- bool index_incompatible = false;
-
// SchemaTypeIds of schema types can be reassigned new SchemaTypeIds if:
// 1. Schema types are added in the middle of the SchemaProto
// 2. Schema types are removed from the middle of the SchemaProto
@@ -97,30 +166,78 @@ class SchemaStore {
// SchemaUtil::ComputeCompatibilityDelta. Represented by the SchemaTypeId
// assigned to this SchemaTypeConfigProto in the *old* schema.
std::unordered_set<SchemaTypeId> schema_types_incompatible_by_id;
+
+ // Schema types that were added in the new schema. Represented by the
+ // `schema_type` field in the SchemaTypeConfigProto.
+ std::unordered_set<std::string> schema_types_new_by_name;
+
+ // Schema types that were changed in a way that was backwards compatible and
+ // didn't invalidate the index. Represented by the `schema_type` field in
+ // the SchemaTypeConfigProto.
+ std::unordered_set<std::string>
+ schema_types_changed_fully_compatible_by_name;
+
+ // Schema types that were changed in a way that was backwards compatible,
+ // but invalidated the index. Represented by the `schema_type` field in the
+ // SchemaTypeConfigProto.
+ std::unordered_set<std::string> schema_types_index_incompatible_by_name;
+
+ // Schema types that were changed in a way that was backwards compatible,
+ // but invalidated the joinable cache. Represented by the `schema_type`
+ // field in the SchemaTypeConfigProto.
+ std::unordered_set<std::string> schema_types_join_incompatible_by_name;
};
+ struct ExpandedTypePropertyMask {
+ std::string schema_type;
+ std::unordered_set<std::string> paths;
+ };
+
+ static constexpr std::string_view kSchemaTypeWildcard = "*";
+
// Factory function to create a SchemaStore which does not take ownership
// of any input components, and all pointers must refer to valid objects that
// outlive the created SchemaStore instance. The base_dir must already exist.
// There does not need to be an existing schema already.
//
+ // If initialize_stats is present, the fields related to SchemaStore will be
+ // populated.
+ //
// Returns:
// A SchemaStore on success
// FAILED_PRECONDITION on any null pointer input
// INTERNAL_ERROR on any IO errors
static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
+ const Filesystem* filesystem, const std::string& base_dir,
+ const Clock* clock, InitializeStatsProto* initialize_stats = nullptr);
+
+ // Migrates schema files (backup vs. new schema) according to the version
+ // state change.
+ //
+ // Returns:
+ // OK on success or nothing to migrate
+ static libtextclassifier3::Status MigrateSchema(
+ const Filesystem* filesystem, const std::string& base_dir,
+ version_util::StateChange version_state_change, int32_t new_version);
+
+ // Discards all derived data in the schema store.
+ //
+ // Returns:
+ // OK on success or nothing to discard
+ // INTERNAL_ERROR on any I/O errors
+ static libtextclassifier3::Status DiscardDerivedFiles(
const Filesystem* filesystem, const std::string& base_dir);
- // Not copyable
+ SchemaStore(SchemaStore&&) = default;
+ SchemaStore& operator=(SchemaStore&&) = default;
+
SchemaStore(const SchemaStore&) = delete;
SchemaStore& operator=(const SchemaStore&) = delete;
// Persists and updates checksum of subcomponents.
~SchemaStore();
- // Retrieve the current schema if it exists. Caller does not get ownership of
- // the schema proto and modifying the returned pointer does not affect the
- // underlying schema proto.
+ // Retrieve the current schema if it exists.
//
// Returns:
// SchemaProto* if exists
@@ -142,10 +259,12 @@ class SchemaStore {
// INTERNAL_ERROR on any IO errors
libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema(
const SchemaProto& new_schema,
- bool ignore_errors_and_delete_documents = false);
+ bool ignore_errors_and_delete_documents,
+ bool allow_circular_schema_definitions);
libtextclassifier3::StatusOr<const SetSchemaResult> SetSchema(
SchemaProto&& new_schema,
- bool ignore_errors_and_delete_documents = false);
+ bool ignore_errors_and_delete_documents,
+ bool allow_circular_schema_definitions);
// Get the SchemaTypeConfigProto of schema_type name.
//
@@ -157,52 +276,94 @@ class SchemaStore {
libtextclassifier3::StatusOr<const SchemaTypeConfigProto*>
GetSchemaTypeConfig(std::string_view schema_type) const;
+ // Returns the schema type of the passed in SchemaTypeId
+ //
+ // Returns:
+ // schema type on success
+ // FAILED_PRECONDITION if schema hasn't been set yet
+ // INVALID_ARGUMENT if schema type id is invalid
+ libtextclassifier3::StatusOr<const std::string*> GetSchemaType(
+ SchemaTypeId schema_type_id) const;
+
// Returns the SchemaTypeId of the passed in schema type
//
// Returns:
// SchemaTypeId on success
+ // FAILED_PRECONDITION if schema hasn't been set yet
// NOT_FOUND_ERROR if we don't know about the schema type
// INTERNAL_ERROR on IO error
libtextclassifier3::StatusOr<SchemaTypeId> GetSchemaTypeId(
std::string_view schema_type) const;
- // Finds content of a section by section path (e.g. property1.property2)
+ // Similar to GetSchemaTypeId but will return a set of SchemaTypeId to also
+ // include child types.
//
// Returns:
- // A string of content on success
- // NOT_FOUND if:
- // 1. Property is optional and not found in the document
- // 2. section_path is invalid
- // 3. Content is empty
- libtextclassifier3::StatusOr<std::vector<std::string>> GetSectionContent(
- const DocumentProto& document, std::string_view section_path) const;
-
- // Finds content of a section by id
- //
- // Returns:
- // A string of content on success
- // INVALID_ARGUMENT if section id is invalid
- // NOT_FOUND if type config name of document not found
- libtextclassifier3::StatusOr<std::vector<std::string>> GetSectionContent(
- const DocumentProto& document, SectionId section_id) const;
+ // A set of SchemaTypeId on success
+ // FAILED_PRECONDITION if schema hasn't been set yet
+ // NOT_FOUND_ERROR if we don't know about the schema type
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<const std::unordered_set<SchemaTypeId>*>
+ GetSchemaTypeIdsWithChildren(std::string_view schema_type) const;
// Returns the SectionMetadata associated with the SectionId that's in the
// SchemaTypeId.
//
// Returns:
- // pointer to SectionMetadata on success
- // INVALID_ARGUMENT if schema type id or section is invalid
+ // Valid pointer to SectionMetadata on success
+ // FAILED_PRECONDITION if schema hasn't been set yet
+ // INVALID_ARGUMENT if schema type id or section id is invalid
libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata(
SchemaTypeId schema_type_id, SectionId section_id) const;
- // Extracts all sections from the given document, sections are sorted by
- // section id in increasing order. Section ids start from 0. Sections with
- // empty content won't be returned.
+ // Returns true if a property is defined in the given schema type,
+ // regardless of whether it is indexed.
+ bool IsPropertyDefinedInSchema(SchemaTypeId schema_type_id,
+ const std::string& property) const;
+
+ // Extracts all sections of different types from the given document and
+ // groups them by type.
+ // - Each Section vector is sorted by section Id in ascending order. The
+ // sorted section ids may not be continuous, since not all sections are
+ // present in the document.
+ // - Sections with empty content won't be returned.
+ // - For example, we may extract:
+ // string_sections: [2, 7, 10]
+ // integer_sections: [3, 5, 8]
//
// Returns:
- // A list of sections on success
+ // A SectionGroup instance on success
+ // FAILED_PRECONDITION if schema hasn't been set yet
// NOT_FOUND if type config name of document not found
- libtextclassifier3::StatusOr<std::vector<Section>> ExtractSections(
+ libtextclassifier3::StatusOr<SectionGroup> ExtractSections(
+ const DocumentProto& document) const;
+
+ // Returns the JoinablePropertyMetadata associated with property_path that's
+ // in the SchemaTypeId.
+ //
+ // Returns:
+ // Valid pointer to JoinablePropertyMetadata on success
+ // nullptr if property_path doesn't exist (or is not joinable) in the
+ // joinable metadata list of the schema
+ // FAILED_PRECONDITION if schema hasn't been set yet
+ // INVALID_ARGUMENT if schema type id is invalid
+ libtextclassifier3::StatusOr<const JoinablePropertyMetadata*>
+ GetJoinablePropertyMetadata(SchemaTypeId schema_type_id,
+ const std::string& property_path) const;
+
+ // Extracts all joinable property contents of different types from the given
+ // document and groups them by joinable value type.
+ // - Joinable properties are sorted by joinable property id in ascending
+ // order. The sorted joinable property ids may not be continuous, since not
+ // all joinable properties are present in the document.
+ // - Joinable property ids start from 0.
+ // - Joinable properties with empty content won't be returned.
+ //
+ // Returns:
+ // A JoinablePropertyGroup instance on success
+ // FAILED_PRECONDITION if schema hasn't been set yet
+ // NOT_FOUND if the document's type config name is not found
+ libtextclassifier3::StatusOr<JoinablePropertyGroup> ExtractJoinableProperties(
const DocumentProto& document) const;
// Syncs all the data changes to disk.
@@ -220,16 +381,95 @@ class SchemaStore {
// INTERNAL_ERROR on compute error
libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
+ // Returns:
+ // - On success, the section metadata list for the specified schema type
+ // - NOT_FOUND if the schema type is not present in the schema
+ libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
+ GetSectionMetadata(const std::string& schema_type) const;
+
+ // Calculates the StorageInfo for the Schema Store.
+ //
+ // If an IO error occurs while trying to calculate the value for a field, then
+ // that field will be set to -1.
+ SchemaStoreStorageInfoProto GetStorageInfo() const;
+
+ // Get debug information for the schema store.
+ //
+ // Returns:
+ // SchemaDebugInfoProto on success
+ // INTERNAL_ERROR on IO errors, crc compute error
+ libtextclassifier3::StatusOr<SchemaDebugInfoProto> GetDebugInfo() const;
+
+ // Expands the provided type_property_masks into a vector of
+ // ExpandedTypePropertyMasks to account for polymorphism. If both a parent
+ // type and one of its child types appear in the masks, the parent type's
+ // paths will be merged into the child's.
+ //
+ // For example, assume that we have two schema types A and B, and we have
+ // - A is the parent type of B
+ // - Paths of A: {P1, P2}
+ // - Paths of B: {P3}
+ //
+ // Then, we will have the following in the result.
+ // - Expanded paths of A: {P1, P2}
+ // - Expanded paths of B: {P1, P2, P3}
+ std::vector<ExpandedTypePropertyMask> ExpandTypePropertyMasks(
+ const google::protobuf::RepeatedPtrField<TypePropertyMask>& type_property_masks)
+ const;
+
private:
+ // Factory function to create a SchemaStore and set its schema. The created
+ // instance does not take ownership of any input components and all pointers
+ // must refer to valid objects that outlive the created SchemaStore instance.
+ // The base_dir must already exist. No schema may have been set in base_dir prior
+ // to this.
+ //
+ // Returns:
+ // A SchemaStore on success
+ // FAILED_PRECONDITION on any null pointer input or if there has already
+ // been a schema set for this path.
+ // INTERNAL_ERROR on any IO errors
+ static libtextclassifier3::StatusOr<std::unique_ptr<SchemaStore>> Create(
+ const Filesystem* filesystem, const std::string& base_dir,
+ const Clock* clock, SchemaProto schema);
+
// Use SchemaStore::Create instead.
- explicit SchemaStore(const Filesystem* filesystem, std::string base_dir);
+ explicit SchemaStore(const Filesystem* filesystem, std::string base_dir,
+ const Clock* clock);
+
+ // Deletes the overlay schema and ensures that the Header is correctly set.
+ //
+ // RETURNS:
+ // OK on success
+ // INTERNAL_ERROR on any IO errors
+ static libtextclassifier3::Status DiscardOverlaySchema(
+ const Filesystem* filesystem, const std::string& base_dir,
+ Header& header);
+
+ // Verifies that there is no error retrieving a previously set schema. Then
+ // initializes like normal.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::Status Initialize(InitializeStatsProto* initialize_stats);
+
+ // First, blindly writes new_schema to the schema_file. Then initializes like
+ // normal.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on IO error
+ // FAILED_PRECONDITION if there is already a schema set for the schema_file.
+ libtextclassifier3::Status Initialize(SchemaProto new_schema);
// Handles initializing the SchemaStore and regenerating any data if needed.
//
// Returns:
// OK on success
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status Initialize();
+ libtextclassifier3::Status InitializeInternal(
+ bool create_overlay_if_necessary, InitializeStatsProto* initialize_stats);
// Creates sub-components and verifies the integrity of each sub-component.
//
@@ -244,11 +484,16 @@ class SchemaStore {
// OK on success
// NOT_FOUND_ERROR if a schema proto has not been set
// INTERNAL_ERROR on any IO errors
- libtextclassifier3::Status RegenerateDerivedFiles();
+ libtextclassifier3::Status RegenerateDerivedFiles(
+ bool create_overlay_if_necessary);
- // Checks if the header exists already. This does not create the header file
- // if it doesn't exist.
- bool HeaderExists();
+ // Build type_config_map_, schema_subtype_id_map_, and schema_type_manager_.
+ //
+ // Returns:
+ // OK on success
+ // NOT_FOUND_ERROR if a schema proto has not been set
+ // INTERNAL_ERROR on any IO errors
+ libtextclassifier3::Status BuildInMemoryCache();
// Update and replace the header file. Creates the header file if it doesn't
// exist.
@@ -265,29 +510,74 @@ class SchemaStore {
// Returns any IO errors.
libtextclassifier3::Status ResetSchemaTypeMapper();
- const Filesystem& filesystem_;
- const std::string base_dir_;
-
- // Used internally to indicate whether the class has been initialized. This is
- // to guard against cases where the object has been created, but Initialize
- // fails in the constructor. If we have successfully exited the constructor,
- // then this field can be ignored. Clients of SchemaStore should not need to
- // worry about this field.
- bool initialized_ = false;
+  // Creates a new schema store with new_schema and then swaps that new schema
+  // store with the existing one. This function guarantees that either this
+  // instance will be fully updated to the new schema or no changes will take
+  // effect.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL on I/O error.
+ libtextclassifier3::Status ApplySchemaChange(SchemaProto new_schema);
+
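+  // An illustrative sketch (editor's note, not part of this change) of the
+  // create-then-swap idiom that the guarantee above describes. The scratch
+  // directory name and the exact error handling are assumptions:
+  //
+  //   libtextclassifier3::Status ApplySchemaChange(SchemaProto new_schema) {
+  //     // Build a complete, fully-initialized store in a scratch location.
+  //     ICING_ASSIGN_OR_RETURN(
+  //         std::unique_ptr<SchemaStore> new_store,
+  //         Create(filesystem_, scratch_dir, clock_, std::move(new_schema)));
+  //     // Nothing observable changes until this single move-assignment.
+  //     *this = std::move(*new_store);
+  //     return libtextclassifier3::Status::OK;
+  //   }
+  //
+  // Any failure before the final move leaves this instance untouched, which
+  // is what makes the change all-or-nothing.
+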
+ libtextclassifier3::Status CheckSchemaSet() const {
+ return has_schema_successfully_set_
+ ? libtextclassifier3::Status::OK
+ : absl_ports::FailedPreconditionError("Schema not set yet.");
+ }
+
+  // Correctly loads the Header, schema_file_ and (if present) the
+  // overlay_schema_file_.
+  //
+  // Returns:
+  //   - OK on success
+  //   - INTERNAL if an IO error is encountered when reading the Header or
+  //     schemas, or if an invalid schema configuration is present.
+ libtextclassifier3::Status LoadSchema();
+
+ const Filesystem* filesystem_;
+ std::string base_dir_;
+ const Clock* clock_;
+
+ // Used internally to indicate whether the class has been successfully
+ // initialized with a valid schema. Will be false if Initialize failed or no
+ // schema has ever been set.
+ bool has_schema_successfully_set_ = false;
// Cached schema
- FileBackedProto<SchemaProto> schema_file_;
+ std::unique_ptr<FileBackedProto<SchemaProto>> schema_file_;
+
+ // This schema holds the definition of any schema types that are not
+ // compatible with older versions of Icing code.
+ std::unique_ptr<FileBackedProto<SchemaProto>> overlay_schema_file_;
+
+ // Maps schema types to a densely-assigned unique id.
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_;
+
+  // Maps schema type ids to the corresponding schema type. This is the inverse
+  // map of schema_type_mapper_.
+ std::unordered_map<SchemaTypeId, std::string> reverse_schema_type_mapper_;
// A hash map of (type config name -> type config), allows faster lookup of
// type config in schema. The O(1) type config access makes schema-related and
// section-related operations faster.
SchemaUtil::TypeConfigMap type_config_map_;
- // Maps schema types to a densely-assigned unique id.
- std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_;
-
- // Manager of indexed section related metadata.
- std::unique_ptr<const SectionManager> section_manager_;
+ // Maps from each type id to all of its subtype ids.
+ // T2 is a subtype of T1, if and only if one of the following conditions is
+ // met:
+ // - T2 is T1
+ // - T2 extends T1
+ // - There exists a type U, such that T2 is a subtype of U, and U is a subtype
+ // of T1
+ std::unordered_map<SchemaTypeId, std::unordered_set<SchemaTypeId>>
+ schema_subtype_id_map_;
+
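+  // An illustrative worked example (editor's note; the type names and ids are
+  // hypothetical): if Message extends Artifact and EmailMessage extends
+  // Message, the closure rules above yield
+  //
+  //   schema_subtype_id_map_[artifact_id]      == {artifact_id, message_id,
+  //                                                email_message_id}
+  //   schema_subtype_id_map_[message_id]       == {message_id,
+  //                                                email_message_id}
+  //   schema_subtype_id_map_[email_message_id] == {email_message_id}
+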
+  // Manager of metadata related to sections (indexable properties) and
+  // joinable properties for all Schemas.
+ std::unique_ptr<const SchemaTypeManager> schema_type_manager_;
+
+ std::unique_ptr<Header> header_;
};
} // namespace lib
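
Before the test-file changes below, a minimal end-to-end sketch of the updated
API that those tests exercise may help orient the reader. This is an editor's
illustration, not part of the patch; the helper name, the error handling, and
the assumption that ICING_ASSIGN_OR_RETURN and the schema-builder constants
are usable outside test code are all hypothetical:

  #include <memory>
  #include <string>

  #include "icing/text_classifier/lib3/utils/base/status.h"
  #include "icing/absl_ports/canonical_errors.h"
  #include "icing/file/filesystem.h"
  #include "icing/schema-builder.h"
  #include "icing/schema/schema-store.h"
  #include "icing/util/clock.h"
  #include "icing/util/status-macros.h"

  namespace icing {
  namespace lib {

  // Creates a store, sets a one-type schema, and surfaces an incompatible
  // result as an error. Mirrors the sequence the tests below perform.
  libtextclassifier3::Status SetUpEmailSchema(const Filesystem& filesystem,
                                              const std::string& schema_dir,
                                              const Clock* clock) {
    // The factory now takes a Clock alongside the filesystem and directory;
    // an InitializeStatsProto out-param is optional.
    ICING_ASSIGN_OR_RETURN(
        std::unique_ptr<SchemaStore> store,
        SchemaStore::Create(&filesystem, schema_dir, clock));

    SchemaProto schema =
        SchemaBuilder()
            .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
                PropertyConfigBuilder()
                    .SetName("subject")
                    .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
                    .SetCardinality(CARDINALITY_OPTIONAL)))
            .Build();

    // SetSchema now takes both policy flags explicitly.
    ICING_ASSIGN_OR_RETURN(
        SchemaStore::SetSchemaResult result,
        store->SetSchema(schema,
                         /*ignore_errors_and_delete_documents=*/false,
                         /*allow_circular_schema_definitions=*/false));
    return result.success ? libtextclassifier3::Status::OK
                          : absl_ports::FailedPreconditionError(
                                "schema change was incompatible");
  }

  }  // namespace lib
  }  // namespace icing
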
diff --git a/icing/schema/schema-store_test.cc b/icing/schema/schema-store_test.cc
index 957fd89..8cc7008 100644
--- a/icing/schema/schema-store_test.cc
+++ b/icing/schema/schema-store_test.cc
@@ -18,19 +18,30 @@
#include <string>
#include <vector>
+#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/file/mock-filesystem.h"
+#include "icing/file/version-util.h"
#include "icing/portable/equals-proto.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
+#include "icing/proto/logging.pb.h"
#include "icing/proto/schema.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
#include "icing/schema/schema-util.h"
#include "icing/schema/section-manager.h"
#include "icing/schema/section.h"
#include "icing/store/document-filter-data.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/util/crc32.h"
namespace icing {
namespace lib {
@@ -40,51 +51,166 @@ namespace {
using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::ElementsAre;
using ::testing::Eq;
+using ::testing::Ge;
+using ::testing::Gt;
+using ::testing::HasSubstr;
using ::testing::Not;
using ::testing::Pointee;
+using ::testing::Return;
+using ::testing::SizeIs;
+using ::testing::UnorderedElementsAre;
+
+constexpr int64_t kDefaultTimestamp = 12345678;
class SchemaStoreTest : public ::testing::Test {
protected:
- SchemaStoreTest() : test_dir_(GetTestTempDir() + "/icing") {
- filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
-
- auto type = schema_.add_types();
- type->set_schema_type("email");
-
- // Add an indexed property so we generate section metadata on it
- auto property = type->add_properties();
- property->set_property_name("subject");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/icing";
+ schema_store_dir_ = test_dir_ + "/schema_store";
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+ schema_ = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(
+ // Add an indexed property so we generate
+ // section metadata on it
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
}
void TearDown() override {
- filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+    // Check that the schema store directory is the *only* directory in
+    // test_dir_. In other words, ensure that all temporary directories have
+    // been properly cleaned up.
+ std::vector<std::string> sub_dirs;
+ ASSERT_TRUE(filesystem_.ListDirectory(test_dir_.c_str(), &sub_dirs));
+ ASSERT_THAT(sub_dirs, ElementsAre("schema_store"));
+
+ // Finally, clean everything up.
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
}
- const Filesystem filesystem_;
- const std::string test_dir_;
+ Filesystem filesystem_;
+ std::string test_dir_;
+ std::string schema_store_dir_;
SchemaProto schema_;
+ FakeClock fake_clock_;
};
TEST_F(SchemaStoreTest, CreationWithNullPointerShouldFail) {
- EXPECT_THAT(SchemaStore::Create(/*filesystem=*/nullptr, test_dir_),
+ EXPECT_THAT(SchemaStore::Create(/*filesystem=*/nullptr, schema_store_dir_,
+ &fake_clock_),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
+TEST_F(SchemaStoreTest, SchemaStoreMoveConstructible) {
+ // Create an instance of SchemaStore.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type_a").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 expected_checksum,
+ schema_store->ComputeChecksum());
+
+ // Move construct an instance of SchemaStore
+ SchemaStore move_constructed_schema_store(std::move(*schema_store));
+ EXPECT_THAT(move_constructed_schema_store.GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ EXPECT_THAT(move_constructed_schema_store.ComputeChecksum(),
+ IsOkAndHolds(Eq(expected_checksum)));
+ SectionMetadata expected_metadata(/*id_in=*/0, TYPE_STRING, TOKENIZER_PLAIN,
+ TERM_MATCH_EXACT, NUMERIC_MATCH_UNKNOWN,
+ "prop1");
+ EXPECT_THAT(move_constructed_schema_store.GetSectionMetadata("type_a"),
+ IsOkAndHolds(Pointee(ElementsAre(expected_metadata))));
+}
+
+TEST_F(SchemaStoreTest, SchemaStoreMoveAssignment) {
+ // Create an instance of SchemaStore.
+ SchemaProto schema1 =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type_a").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema1, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 expected_checksum,
+ schema_store->ComputeChecksum());
+
+ // Construct another instance of SchemaStore
+ SchemaProto schema2 =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("type_b").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> move_assigned_schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema2, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Move assign the first instance into the second one.
+ *move_assigned_schema_store = std::move(*schema_store);
+ EXPECT_THAT(move_assigned_schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema1))));
+ EXPECT_THAT(move_assigned_schema_store->ComputeChecksum(),
+ IsOkAndHolds(Eq(expected_checksum)));
+ SectionMetadata expected_metadata(/*id_in=*/0, TYPE_STRING, TOKENIZER_PLAIN,
+ TERM_MATCH_EXACT, NUMERIC_MATCH_UNKNOWN,
+ "prop1");
+ EXPECT_THAT(move_assigned_schema_store->GetSectionMetadata("type_a"),
+ IsOkAndHolds(Pointee(ElementsAre(expected_metadata))));
+}
+
TEST_F(SchemaStoreTest, CorruptSchemaError) {
{
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema_),
+ result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
+ EXPECT_THAT(schema_store->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
schema_store->GetSchema());
@@ -94,30 +220,37 @@ TEST_F(SchemaStoreTest, CorruptSchemaError) {
// "Corrupt" the ground truth schema by adding new data to it. This will mess
// up the checksum of the schema store
- SchemaProto corrupt_schema;
- auto type = corrupt_schema.add_types();
- type->set_schema_type("corrupted");
+ SchemaProto corrupt_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("corrupted"))
+ .Build();
- const std::string schema_file = absl_ports::StrCat(test_dir_, "/schema.pb");
+ const std::string schema_file =
+ absl_ports::StrCat(schema_store_dir_, "/schema.pb");
const std::string serialized_schema = corrupt_schema.SerializeAsString();
filesystem_.Write(schema_file.c_str(), serialized_schema.data(),
serialized_schema.size());
// If ground truth was corrupted, we won't know what to do
- EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ EXPECT_THAT(
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
}
TEST_F(SchemaStoreTest, RecoverCorruptDerivedFileOk) {
{
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema_),
+ result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
+ EXPECT_THAT(schema_store->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
schema_store->GetSchema());
@@ -131,11 +264,59 @@ TEST_F(SchemaStoreTest, RecoverCorruptDerivedFileOk) {
// regenerated from ground truth
const std::string schema_type_mapper_dir =
- absl_ports::StrCat(test_dir_, "/schema_type_mapper");
+ absl_ports::StrCat(schema_store_dir_, "/schema_type_mapper");
filesystem_.DeleteDirectoryRecursively(schema_type_mapper_dir.c_str());
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ InitializeStatsProto initialize_stats;
+ fake_clock_.SetTimerElapsedMilliseconds(123);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_,
+ &initialize_stats));
+ EXPECT_THAT(initialize_stats.schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+ EXPECT_THAT(initialize_stats.schema_store_recovery_latency_ms(), Eq(123));
+
+ // Everything looks fine, ground truth and derived data
+ ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+ schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(schema_));
+ EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
+}
+
+TEST_F(SchemaStoreTest, RecoverDiscardDerivedFilesOk) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ // Set it for the first time
+ SchemaStore::SetSchemaResult result;
+ result.success = true;
+ result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
+ EXPECT_THAT(schema_store->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(result)));
+ ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+ schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(schema_));
+
+ EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
+ }
+
+ ICING_ASSERT_OK(
+ SchemaStore::DiscardDerivedFiles(&filesystem_, schema_store_dir_));
+
+ InitializeStatsProto initialize_stats;
+ fake_clock_.SetTimerElapsedMilliseconds(123);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_,
+ &initialize_stats));
+ EXPECT_THAT(initialize_stats.schema_store_recovery_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+ EXPECT_THAT(initialize_stats.schema_store_recovery_latency_ms(), Eq(123));
// Everything looks fine, ground truth and derived data
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -146,13 +327,17 @@ TEST_F(SchemaStoreTest, RecoverCorruptDerivedFileOk) {
TEST_F(SchemaStoreTest, RecoverBadChecksumOk) {
{
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema_),
+ result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
+ EXPECT_THAT(schema_store->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
schema_store->GetSchema());
@@ -165,15 +350,16 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) {
// the recalculated checksum on initialization. This will force a regeneration
// of derived files from ground truth.
const std::string header_file =
- absl_ports::StrCat(test_dir_, "/schema_store_header");
- SchemaStore::Header header;
+ absl_ports::StrCat(schema_store_dir_, "/schema_store_header");
+ SchemaStore::LegacyHeader header;
header.magic = SchemaStore::Header::kMagic;
header.checksum = 10; // Arbitrary garbage checksum
filesystem_.DeleteFile(header_file.c_str());
filesystem_.Write(header_file.c_str(), &header, sizeof(header));
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
// Everything looks fine, ground truth and derived data
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
@@ -183,70 +369,125 @@ TEST_F(SchemaStoreTest, RecoverBadChecksumOk) {
}
TEST_F(SchemaStoreTest, CreateNoPreviousSchemaOk) {
- EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_), IsOk());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+  // The APIs to retrieve information about the schema should fail gracefully.
+ EXPECT_THAT(store->GetSchema(),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(store->GetSchemaTypeConfig("foo"),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(store->GetSchemaTypeId("foo"),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(store->GetSectionMetadata(/*schema_type_id=*/0, /*section_id=*/0),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(store->GetJoinablePropertyMetadata(/*schema_type_id=*/0,
+ /*property_path=*/"A"),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+
+  // The APIs to extract content from a document should fail gracefully.
+ DocumentProto doc;
+ PropertyProto* prop = doc.add_properties();
+ prop->set_name("name");
+ prop->add_string_values("foo bar baz");
+
+ EXPECT_THAT(store->ExtractSections(doc),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(store->ExtractJoinableProperties(doc),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+
+  // The APIs to persist and checksum data should succeed.
+ EXPECT_THAT(store->ComputeChecksum(), IsOkAndHolds(Crc32()));
+ EXPECT_THAT(store->PersistToDisk(), IsOk());
}
TEST_F(SchemaStoreTest, CreateWithPreviousSchemaOk) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema_),
+ result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
+ EXPECT_THAT(schema_store->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
schema_store.reset();
- EXPECT_THAT(SchemaStore::Create(&filesystem_, test_dir_), IsOk());
+ EXPECT_THAT(
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_),
+ IsOk());
}
TEST_F(SchemaStoreTest, MultipleCreateOk) {
DocumentProto document;
document.set_schema("email");
- auto properties = document.add_properties();
- properties->set_name("subject");
- properties->add_string_values("subject_content");
+ auto subject_property = document.add_properties();
+ subject_property->set_name("subject");
+ subject_property->add_string_values("subject_content");
+ auto timestamp_property = document.add_properties();
+ timestamp_property->set_name("timestamp");
+ timestamp_property->add_int64_values(kDefaultTimestamp);
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema_),
+ result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
+ EXPECT_THAT(schema_store->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
// Verify that our in-memory structures are ok
EXPECT_THAT(schema_store->GetSchemaTypeConfig("email"),
IsOkAndHolds(Pointee(EqualsProto(schema_.types(0)))));
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<Section> sections,
+ ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group,
schema_store->ExtractSections(document));
- EXPECT_THAT(sections[0].content, ElementsAre("subject_content"));
+ EXPECT_THAT(section_group.string_sections[0].content,
+ ElementsAre("subject_content"));
+ EXPECT_THAT(section_group.integer_sections[0].content,
+ ElementsAre(kDefaultTimestamp));
// Verify that our persisted data is ok
EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
schema_store.reset();
- ICING_ASSERT_OK_AND_ASSIGN(schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
// Verify that our in-memory structures are ok
EXPECT_THAT(schema_store->GetSchemaTypeConfig("email"),
IsOkAndHolds(Pointee(EqualsProto(schema_.types(0)))));
- ICING_ASSERT_OK_AND_ASSIGN(sections, schema_store->ExtractSections(document));
- EXPECT_THAT(sections[0].content, ElementsAre("subject_content"));
+ ICING_ASSERT_OK_AND_ASSIGN(section_group,
+ schema_store->ExtractSections(document));
+ EXPECT_THAT(section_group.string_sections[0].content,
+ ElementsAre("subject_content"));
+ EXPECT_THAT(section_group.integer_sections[0].content,
+ ElementsAre(kDefaultTimestamp));
// Verify that our persisted data is ok
EXPECT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
}
TEST_F(SchemaStoreTest, SetNewSchemaOk) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema_),
+ result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
+ EXPECT_THAT(schema_store->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
schema_store->GetSchema());
@@ -254,33 +495,45 @@ TEST_F(SchemaStoreTest, SetNewSchemaOk) {
}
TEST_F(SchemaStoreTest, SetSameSchemaOk) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema_),
+ result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
+ EXPECT_THAT(schema_store->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
schema_store->GetSchema());
EXPECT_THAT(*actual_schema, EqualsProto(schema_));
// And one more for fun
- EXPECT_THAT(schema_store->SetSchema(schema_),
+ result = SchemaStore::SetSchemaResult();
+ result.success = true;
+ EXPECT_THAT(schema_store->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
EXPECT_THAT(*actual_schema, EqualsProto(schema_));
}
TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema_),
+ result.schema_types_new_by_name.insert(schema_.types(0).schema_type());
+ EXPECT_THAT(schema_store->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
schema_store->GetSchema());
@@ -290,55 +543,73 @@ TEST_F(SchemaStoreTest, SetIncompatibleSchemaOk) {
schema_.clear_types();
// Set the incompatible schema
+ result = SchemaStore::SetSchemaResult();
result.success = false;
result.schema_types_deleted_by_name.emplace("email");
result.schema_types_deleted_by_id.emplace(0);
- EXPECT_THAT(schema_store->SetSchema(schema_),
+ EXPECT_THAT(schema_store->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
}
TEST_F(SchemaStoreTest, SetSchemaWithAddedTypeOk) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("email");
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema),
+ result.schema_types_new_by_name.insert("email");
+ EXPECT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
schema_store->GetSchema());
EXPECT_THAT(*actual_schema, EqualsProto(schema));
// Add a type, shouldn't affect the index or cached SchemaTypeIds
- type = schema.add_types();
- type->set_schema_type("new_type");
+ schema = SchemaBuilder(schema)
+ .AddType(SchemaTypeConfigBuilder().SetType("new_type"))
+ .Build();
// Set the compatible schema
- EXPECT_THAT(schema_store->SetSchema(schema),
+ result = SchemaStore::SetSchemaResult();
+ result.success = true;
+ result.schema_types_new_by_name.insert("new_type");
+ EXPECT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
EXPECT_THAT(*actual_schema, EqualsProto(schema));
}
TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("email");
- type = schema.add_types();
- type->set_schema_type("message");
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema),
+ result.schema_types_new_by_name.insert("email");
+ result.schema_types_new_by_name.insert("message");
+ EXPECT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
schema_store->GetSchema());
@@ -350,9 +621,9 @@ TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) {
schema_store->GetSchemaTypeId("message"));
// Remove "email" type, this also changes previous SchemaTypeIds
- schema.Clear();
- type = schema.add_types();
- type->set_schema_type("message");
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
SchemaStore::SetSchemaResult incompatible_result;
incompatible_result.success = false;
@@ -363,7 +634,9 @@ TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) {
old_email_schema_type_id);
// Can't set the incompatible schema
- EXPECT_THAT(schema_store->SetSchema(schema),
+ EXPECT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(incompatible_result)));
SchemaStore::SetSchemaResult force_result;
@@ -374,109 +647,208 @@ TEST_F(SchemaStoreTest, SetSchemaWithDeletedTypeOk) {
// Force set the incompatible schema
EXPECT_THAT(schema_store->SetSchema(
- schema, /*ignore_errors_and_delete_documents=*/true),
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(force_result)));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
EXPECT_THAT(*actual_schema, EqualsProto(schema));
}
TEST_F(SchemaStoreTest, SetSchemaWithReorderedTypesOk) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("email");
- type = schema.add_types();
- type->set_schema_type("message");
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema),
+ result.schema_types_new_by_name.insert("email");
+ result.schema_types_new_by_name.insert("message");
+ EXPECT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
schema_store->GetSchema());
EXPECT_THAT(*actual_schema, EqualsProto(schema));
// Reorder the types
- schema.clear_types();
- type = schema.add_types();
- type->set_schema_type("message");
- type = schema.add_types();
- type->set_schema_type("email");
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
// Since we assign SchemaTypeIds based on order in the SchemaProto, this will
// cause SchemaTypeIds to change
+ result = SchemaStore::SetSchemaResult();
+ result.success = true;
result.old_schema_type_ids_changed.emplace(0); // Old SchemaTypeId of "email"
result.old_schema_type_ids_changed.emplace(
1); // Old SchemaTypeId of "message"
// Set the compatible schema
- EXPECT_THAT(schema_store->SetSchema(schema),
+ EXPECT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
EXPECT_THAT(*actual_schema, EqualsProto(schema));
}
-TEST_F(SchemaStoreTest, SetSchemaThatRequiresReindexingOk) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
-
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("email");
-
- // Add an unindexed property
- auto property = type->add_properties();
- property->set_property_name("subject");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+TEST_F(SchemaStoreTest, IndexedPropertyChangeRequiresReindexingOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ // Add an unindexed property
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema),
+ result.schema_types_new_by_name.insert("email");
+ EXPECT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
schema_store->GetSchema());
EXPECT_THAT(*actual_schema, EqualsProto(schema));
// Make a previously unindexed property indexed
- property = schema.mutable_types(0)->mutable_properties(0);
- property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- property->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- // With a new indexed property, we'll need to reindex
- result.index_incompatible = true;
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
// Set the compatible schema
- EXPECT_THAT(schema_store->SetSchema(schema),
+ result = SchemaStore::SetSchemaResult();
+ result.success = true;
+ result.schema_types_index_incompatible_by_name.insert("email");
+ EXPECT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
EXPECT_THAT(*actual_schema, EqualsProto(schema));
}
-TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+TEST_F(SchemaStoreTest, IndexNestedDocumentsChangeRequiresReindexingOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+  // Make two schemas: one that sets index_nested_properties to false and one
+  // that sets it to true.
+ SchemaTypeConfigProto email_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto no_nested_index_schema =
+ SchemaBuilder()
+ .AddType(email_type_config)
+ .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument("email",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ SchemaProto nested_index_schema =
+ SchemaBuilder()
+ .AddType(email_type_config)
+ .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument("email",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ // Set schema with index_nested_properties=false to start.
+ SchemaStore::SetSchemaResult result;
+ result.success = true;
+ result.schema_types_new_by_name.insert("email");
+ result.schema_types_new_by_name.insert("person");
+ EXPECT_THAT(
+ schema_store->SetSchema(no_nested_index_schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(result)));
+ ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+ schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(no_nested_index_schema));
- SchemaProto schema;
- auto type = schema.add_types();
- type->set_schema_type("email");
+ // Set schema with index_nested_properties=true and confirm that the change to
+ // 'person' is index incompatible.
+ result = SchemaStore::SetSchemaResult();
+ result.success = true;
+ result.schema_types_index_incompatible_by_name.insert("person");
+ EXPECT_THAT(
+ schema_store->SetSchema(nested_index_schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(result)));
+ ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(nested_index_schema));
- // Add a STRING property
- auto property = type->add_properties();
- property->set_property_name("subject");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ // Set schema with index_nested_properties=false and confirm that the change
+ // to 'person' is index incompatible.
+ result = SchemaStore::SetSchemaResult();
+ result.success = true;
+ result.schema_types_index_incompatible_by_name.insert("person");
+ EXPECT_THAT(
+ schema_store->SetSchema(no_nested_index_schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(result)));
+ ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(no_nested_index_schema));
+}
+
+TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ // Add a STRING property
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema),
+ result.schema_types_new_by_name.insert("email");
+ EXPECT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
schema_store->GetSchema());
@@ -486,8 +858,14 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) {
schema_store->GetSchemaTypeId("email"));
// Make a previously STRING property into DOUBLE
- property = schema.mutable_types(0)->mutable_properties(0);
- property->set_data_type(PropertyConfigProto::DataType::DOUBLE);
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+                   // Add a DOUBLE property
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataType(TYPE_DOUBLE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
SchemaStore::SetSchemaResult incompatible_result;
incompatible_result.success = false;
@@ -496,7 +874,9 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) {
old_email_schema_type_id);
// Can't set the incompatible schema
- EXPECT_THAT(schema_store->SetSchema(schema),
+ EXPECT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(incompatible_result)));
SchemaStore::SetSchemaResult force_result;
@@ -507,15 +887,338 @@ TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleTypesOk) {
// Force set the incompatible schema
EXPECT_THAT(schema_store->SetSchema(
- schema, /*ignore_errors_and_delete_documents=*/true),
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(force_result)));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
EXPECT_THAT(*actual_schema, EqualsProto(schema));
}
+TEST_F(SchemaStoreTest, SetSchemaWithIncompatibleNestedTypesOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ // 1. Create a ContactPoint type with a repeated property and set that schema
+ SchemaTypeConfigBuilder contact_point_repeated_label =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaProto old_schema =
+ SchemaBuilder().AddType(contact_point_repeated_label).Build();
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ old_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_contact_point_type_id,
+ schema_store->GetSchemaTypeId("ContactPoint"));
+
+  // 2. Create a type that references the ContactPoint type and make a
+  // backwards-incompatible change to ContactPoint
+ SchemaTypeConfigBuilder contact_point_optional_label =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL));
+ SchemaTypeConfigBuilder person =
+ SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("contactPoints")
+ .SetDataTypeDocument("ContactPoint",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaProto new_schema = SchemaBuilder()
+ .AddType(contact_point_optional_label)
+ .AddType(person)
+ .Build();
+
+ // 3. SetSchema should fail with ignore_errors_and_delete_documents=false and
+ // the old schema should remain
+ SchemaStore::SetSchemaResult expected_result;
+ expected_result.success = false;
+ expected_result.schema_types_incompatible_by_name.insert("ContactPoint");
+ expected_result.schema_types_incompatible_by_id.insert(
+ old_contact_point_type_id);
+ expected_result.schema_types_new_by_name.insert("Person");
+ EXPECT_THAT(
+ schema_store->SetSchema(new_schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
+ ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+ schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(old_schema));
+
+ // 4. SetSchema should succeed with ignore_errors_and_delete_documents=true
+ // and the new schema should be set
+ expected_result.success = true;
+ EXPECT_THAT(
+ schema_store->SetSchema(new_schema,
+ /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
+ ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(new_schema));
+}
+
+TEST_F(SchemaStoreTest, SetSchemaWithIndexIncompatibleNestedTypesOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+  // 1. Create a ContactPoint type whose label property matches on prefix, and
+  // set that schema
+ SchemaTypeConfigBuilder contact_point_prefix_label =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaProto old_schema =
+ SchemaBuilder().AddType(contact_point_prefix_label).Build();
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ old_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+  // 2. Create a type that references the ContactPoint type and make an index
+  // backwards-incompatible change to ContactPoint
+ SchemaTypeConfigBuilder contact_point_exact_label =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaTypeConfigBuilder person =
+ SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("contactPoints")
+ .SetDataTypeDocument("ContactPoint",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaProto new_schema = SchemaBuilder()
+ .AddType(contact_point_exact_label)
+ .AddType(person)
+ .Build();
+
+ // SetSchema should succeed, and only ContactPoint should be in
+ // schema_types_index_incompatible_by_name.
+ SchemaStore::SetSchemaResult expected_result;
+ expected_result.success = true;
+ expected_result.schema_types_index_incompatible_by_name.insert(
+ "ContactPoint");
+ expected_result.schema_types_new_by_name.insert("Person");
+ EXPECT_THAT(
+ schema_store->SetSchema(new_schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
+ ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+ schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(new_schema));
+}
+
+TEST_F(SchemaStoreTest, SetSchemaWithCompatibleNestedTypesOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+  // 1. Create a ContactPoint type with an optional property and set that schema
+ SchemaTypeConfigBuilder contact_point_optional_label =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL));
+ SchemaProto old_schema =
+ SchemaBuilder().AddType(contact_point_optional_label).Build();
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ old_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+  // 2. Create a type that references the ContactPoint type and make a
+  // backwards-compatible change to ContactPoint
+ SchemaTypeConfigBuilder contact_point_repeated_label =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaTypeConfigBuilder person =
+ SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("contactPoints")
+ .SetDataTypeDocument("ContactPoint",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaProto new_schema = SchemaBuilder()
+ .AddType(contact_point_repeated_label)
+ .AddType(person)
+ .Build();
+
+ // 3. SetSchema should succeed, and only ContactPoint should be in
+ // schema_types_changed_fully_compatible_by_name.
+ SchemaStore::SetSchemaResult expected_result;
+ expected_result.success = true;
+ expected_result.schema_types_changed_fully_compatible_by_name.insert(
+ "ContactPoint");
+ expected_result.schema_types_new_by_name.insert("Person");
+ EXPECT_THAT(schema_store->SetSchema(
+ new_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
+ ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+ schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(new_schema));
+}
+
+TEST_F(SchemaStoreTest, SetSchemaWithAddedIndexableNestedTypeOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+  // 1. Create a ContactPoint type with a repeated property, and a type that
+  // references the ContactPoint type.
+ SchemaTypeConfigBuilder contact_point =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaTypeConfigBuilder person =
+ SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("contactPoints")
+ .SetDataTypeDocument("ContactPoint",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaProto old_schema =
+ SchemaBuilder().AddType(contact_point).AddType(person).Build();
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ old_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // 2. Add another nested document property to "Person" that has type
+ // "ContactPoint"
+ SchemaTypeConfigBuilder new_person =
+ SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("contactPoints")
+ .SetDataTypeDocument("ContactPoint",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("anotherContactPoint")
+ .SetDataTypeDocument("ContactPoint",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED));
+ SchemaProto new_schema =
+ SchemaBuilder().AddType(contact_point).AddType(new_person).Build();
+
+ // 3. Set to new schema. "Person" should be index-incompatible since we need
+ // to index an additional property: 'anotherContactPoint.label'.
+ // - "Person" is also considered join-incompatible since the added nested
+ // document property could also contain a joinable property.
+ SchemaStore::SetSchemaResult expected_result;
+ expected_result.success = true;
+ expected_result.schema_types_index_incompatible_by_name.insert("Person");
+ expected_result.schema_types_join_incompatible_by_name.insert("Person");
+
+ EXPECT_THAT(schema_store->SetSchema(
+ new_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
+ ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+ schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(new_schema));
+}
+
+TEST_F(SchemaStoreTest, SetSchemaWithAddedJoinableNestedTypeOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+  // 1. Create a ContactPoint type with a required, joinable property, and a
+  // type that references the ContactPoint type.
+ SchemaTypeConfigBuilder contact_point =
+ SchemaTypeConfigBuilder()
+ .SetType("ContactPoint")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("label")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_REQUIRED));
+ SchemaTypeConfigBuilder person =
+ SchemaTypeConfigBuilder().SetType("Person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("contactPoints")
+ .SetDataTypeDocument("ContactPoint",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL));
+ SchemaProto old_schema =
+ SchemaBuilder().AddType(contact_point).AddType(person).Build();
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ old_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // 2. Add another nested document property to "Person" that has type
+ // "ContactPoint", but make it non-indexable
+ SchemaTypeConfigBuilder new_person =
+ SchemaTypeConfigBuilder()
+ .SetType("Person")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("contactPoints")
+ .SetDataTypeDocument("ContactPoint",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("anotherContactPoint")
+ .SetDataTypeDocument("ContactPoint",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL));
+ SchemaProto new_schema =
+ SchemaBuilder().AddType(contact_point).AddType(new_person).Build();
+
+ // 3. Set to new schema. "Person" should be join-incompatible but
+ // index-compatible.
+ SchemaStore::SetSchemaResult expected_result;
+ expected_result.success = true;
+ expected_result.schema_types_join_incompatible_by_name.insert("Person");
+
+ EXPECT_THAT(schema_store->SetSchema(
+ new_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(expected_result)));
+ ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* actual_schema,
+ schema_store->GetSchema());
+ EXPECT_THAT(*actual_schema, EqualsProto(new_schema));
+}
+
TEST_F(SchemaStoreTest, GetSchemaTypeId) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
schema_.clear_types();
@@ -531,7 +1234,11 @@ TEST_F(SchemaStoreTest, GetSchemaTypeId) {
// Set it for the first time
SchemaStore::SetSchemaResult result;
result.success = true;
- EXPECT_THAT(schema_store->SetSchema(schema_),
+ result.schema_types_new_by_name.insert(first_type);
+ result.schema_types_new_by_name.insert(second_type);
+ EXPECT_THAT(schema_store->SetSchema(
+ schema_, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
IsOkAndHolds(EqualsSetSchemaResult(result)));
EXPECT_THAT(schema_store->GetSchemaTypeId(first_type), IsOkAndHolds(0));
@@ -539,22 +1246,25 @@ TEST_F(SchemaStoreTest, GetSchemaTypeId) {
}
TEST_F(SchemaStoreTest, ComputeChecksumDefaultOnEmptySchemaStore) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
Crc32 default_checksum;
EXPECT_THAT(schema_store->ComputeChecksum(), IsOkAndHolds(default_checksum));
}
TEST_F(SchemaStoreTest, ComputeChecksumSameBetweenCalls) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
- SchemaProto foo_schema;
- auto type_config = foo_schema.add_types();
- type_config->set_schema_type("foo");
+ SchemaProto foo_schema =
+ SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build();
- ICING_EXPECT_OK(schema_store->SetSchema(foo_schema));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ foo_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, schema_store->ComputeChecksum());
@@ -563,66 +1273,76 @@ TEST_F(SchemaStoreTest, ComputeChecksumSameBetweenCalls) {
}
TEST_F(SchemaStoreTest, ComputeChecksumSameAcrossInstances) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
- SchemaProto foo_schema;
- auto type_config = foo_schema.add_types();
- type_config->set_schema_type("foo");
+ SchemaProto foo_schema =
+ SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build();
- ICING_EXPECT_OK(schema_store->SetSchema(foo_schema));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ foo_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, schema_store->ComputeChecksum());
// Destroy the previous instance and recreate SchemaStore
schema_store.reset();
- ICING_ASSERT_OK_AND_ASSIGN(schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
EXPECT_THAT(schema_store->ComputeChecksum(), IsOkAndHolds(checksum));
}
TEST_F(SchemaStoreTest, ComputeChecksumChangesOnModification) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
- SchemaProto foo_schema;
- auto type_config = foo_schema.add_types();
- type_config->set_schema_type("foo");
+ SchemaProto foo_schema =
+ SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build();
- ICING_EXPECT_OK(schema_store->SetSchema(foo_schema));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ foo_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, schema_store->ComputeChecksum());
// Modifying the SchemaStore changes the checksum
- SchemaProto foo_bar_schema;
- type_config = foo_bar_schema.add_types();
- type_config->set_schema_type("foo");
- type_config = foo_bar_schema.add_types();
- type_config->set_schema_type("bar");
+ SchemaProto foo_bar_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("foo"))
+ .AddType(SchemaTypeConfigBuilder().SetType("bar"))
+ .Build();
- ICING_EXPECT_OK(schema_store->SetSchema(foo_bar_schema));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ foo_bar_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
EXPECT_THAT(schema_store->ComputeChecksum(), IsOkAndHolds(Not(Eq(checksum))));
}
TEST_F(SchemaStoreTest, PersistToDiskFineForEmptySchemaStore) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
// Persisting is fine and shouldn't affect anything
ICING_EXPECT_OK(schema_store->PersistToDisk());
}
TEST_F(SchemaStoreTest, PersistToDiskPreservesAcrossInstances) {
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("foo");
+ SchemaProto schema =
+ SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("foo")).Build();
- ICING_EXPECT_OK(schema_store->SetSchema(schema));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
// Persisting shouldn't change anything
ICING_EXPECT_OK(schema_store->PersistToDisk());
@@ -632,20 +1352,1843 @@ TEST_F(SchemaStoreTest, PersistToDiskPreservesAcrossInstances) {
EXPECT_THAT(*actual_schema, EqualsProto(schema));
// Modify the schema so that something different is persisted next time
- type_config = schema.add_types();
- type_config->set_schema_type("bar");
- ICING_EXPECT_OK(schema_store->SetSchema(schema));
+ schema = SchemaBuilder(schema)
+ .AddType(SchemaTypeConfigBuilder().SetType("bar"))
+ .Build();
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
// Should also persist on destruction
schema_store.reset();
// And we get the same schema back on reinitialization
- ICING_ASSERT_OK_AND_ASSIGN(schema_store,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
ICING_ASSERT_OK_AND_ASSIGN(actual_schema, schema_store->GetSchema());
EXPECT_THAT(*actual_schema, EqualsProto(schema));
}
+TEST_F(SchemaStoreTest, SchemaStoreStorageInfoProto) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ // Create a schema with two types: one simple type and one type that uses all
+ // 64 sections.
+ PropertyConfigProto prop =
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+ SchemaTypeConfigBuilder full_sections_type_builder =
+ SchemaTypeConfigBuilder().SetType("fullSectionsType");
+ for (int i = 0; i < 64; ++i) {
+ full_sections_type_builder.AddProperty(
+ PropertyConfigBuilder(prop).SetName("prop" + std::to_string(i)));
+ }
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ PropertyConfigBuilder(prop)))
+ .AddType(full_sections_type_builder)
+ .Build();
+
+ SchemaStore::SetSchemaResult result;
+ result.success = true;
+ result.schema_types_new_by_name.insert("email");
+ result.schema_types_new_by_name.insert("fullSectionsType");
+ EXPECT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(result)));
+
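+  // "email" defines one indexed property and "fullSectionsType" defines 64,
+  // so we expect 65 total sections, with only "fullSectionsType" hitting the
+  // per-type section limit.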
+ SchemaStoreStorageInfoProto storage_info = schema_store->GetStorageInfo();
+ EXPECT_THAT(storage_info.schema_store_size(), Ge(0));
+ EXPECT_THAT(storage_info.num_schema_types(), Eq(2));
+ EXPECT_THAT(storage_info.num_total_sections(), Eq(65));
+ EXPECT_THAT(storage_info.num_schema_types_sections_exhausted(), Eq(1));
+}
+
+TEST_F(SchemaStoreTest, GetDebugInfo) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ // Set schema
+ ASSERT_THAT(
+ schema_store->SetSchema(schema_,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(SchemaStore::SetSchemaResult{
+ .success = true,
+ .schema_types_new_by_name = {schema_.types(0).schema_type()}})));
+
+ // Check debug info
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaDebugInfoProto out,
+ schema_store->GetDebugInfo());
+ EXPECT_THAT(out.schema(), EqualsProto(schema_));
+ EXPECT_THAT(out.crc(), Gt(0));
+}
+
+TEST_F(SchemaStoreTest, GetDebugInfoForEmptySchemaStore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ // Check debug info before setting a schema
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaDebugInfoProto out,
+ schema_store->GetDebugInfo());
+ SchemaDebugInfoProto expected_out;
+ expected_out.set_crc(0);
+ EXPECT_THAT(out, EqualsProto(expected_out));
+}
+
+TEST_F(SchemaStoreTest, InitializeRegenerateDerivedFilesFailure) {
+  // This test covers the first point at which RegenerateDerivedFiles could
+  // fail. This should simply result in SchemaStore::Create returning an
+  // INTERNAL error.
+
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("Type"))
+ .Build();
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ std::move(schema), /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+ }
+
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ ON_CALL(*mock_filesystem,
+ CreateDirectoryRecursively(HasSubstr("key_mapper_dir")))
+ .WillByDefault(Return(false));
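+  // Failing to create the key mapper directory makes derived-file
+  // regeneration fail during SchemaStore::Create, which should surface as an
+  // INTERNAL error.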
+ {
+ EXPECT_THAT(SchemaStore::Create(mock_filesystem.get(), schema_store_dir_,
+ &fake_clock_),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ }
+}
+
+TEST_F(SchemaStoreTest, SetSchemaRegenerateDerivedFilesFailure) {
+  // This test covers the second point at which RegenerateDerivedFiles could fail.
+ // If handled correctly, the schema store and section manager should still be
+ // in the original, valid state.
+ SchemaTypeConfigProto type =
+ SchemaTypeConfigBuilder()
+ .SetType("Type")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("intProp1")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("stringProp1")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ SchemaProto schema = SchemaBuilder().AddType(type).Build();
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ std::move(schema), /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+ }
+
+ {
+ auto mock_filesystem = std::make_unique<MockFilesystem>();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(mock_filesystem.get(), schema_store_dir_,
+ &fake_clock_));
+
+ ON_CALL(*mock_filesystem,
+ CreateDirectoryRecursively(HasSubstr("key_mapper_dir")))
+ .WillByDefault(Return(false));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(type)
+ .AddType(SchemaTypeConfigBuilder().SetType("Type2"))
+ .Build();
+ EXPECT_THAT(
+ schema_store->SetSchema(std::move(schema),
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
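+    // Even though SetSchema failed, the store should remain in the original,
+    // valid state: extracting sections from a "Type" document still works.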
+ DocumentProto document =
+ DocumentBuilder()
+ .SetSchema("Type")
+ .AddInt64Property("intProp1", 1, 2, 3)
+ .AddStringProperty("stringProp1", "foo bar baz")
+ .Build();
+ SectionMetadata expected_int_prop1_metadata(
+ /*id_in=*/0, TYPE_INT64, TOKENIZER_NONE, TERM_MATCH_UNKNOWN,
+ NUMERIC_MATCH_RANGE, "intProp1");
+ SectionMetadata expected_string_prop1_metadata(
+ /*id_in=*/1, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_EXACT,
+ NUMERIC_MATCH_UNKNOWN, "stringProp1");
+ ICING_ASSERT_OK_AND_ASSIGN(SectionGroup section_group,
+ schema_store->ExtractSections(document));
+ ASSERT_THAT(section_group.string_sections, SizeIs(1));
+ EXPECT_THAT(section_group.string_sections.at(0).metadata,
+ Eq(expected_string_prop1_metadata));
+ EXPECT_THAT(section_group.string_sections.at(0).content,
+ ElementsAre("foo bar baz"));
+ ASSERT_THAT(section_group.integer_sections, SizeIs(1));
+ EXPECT_THAT(section_group.integer_sections.at(0).metadata,
+ Eq(expected_int_prop1_metadata));
+ EXPECT_THAT(section_group.integer_sections.at(0).content,
+ ElementsAre(1, 2, 3));
+ }
+}
+
+TEST_F(SchemaStoreTest, CanCheckForPropertiesDefinedInSchema) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ // Set it for the first time
+ SchemaStore::SetSchemaResult result;
+ result.success = true;
+  result.schema_types_new_by_name.insert("email");
+
+ // Don't use schema_ defined in the test suite, as we want to make sure that
+ // the test is written correctly without referring to what the suite has
+ // defined.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(
+ // Add an indexed property so we generate
+ // section metadata on it
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ EXPECT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOkAndHolds(EqualsSetSchemaResult(result)));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId schema_id,
+ schema_store->GetSchemaTypeId("email"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(schema_id, "subject"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(schema_id, "timestamp"));
+ EXPECT_FALSE(schema_store->IsPropertyDefinedInSchema(schema_id, "foobar"));
+}
+
+TEST_F(SchemaStoreTest, GetSchemaTypeIdsWithChildren) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ // Create a schema with the following inheritance relation:
+  //       A
+  //      / \
+  //     B   E
+  //    / \
+  //   C   D
+  //       |
+  //       F
+ SchemaTypeConfigProto type_a = SchemaTypeConfigBuilder().SetType("A").Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder().SetType("B").AddParentType("A").Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder().SetType("C").AddParentType("B").Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder().SetType("D").AddParentType("B").Build();
+ SchemaTypeConfigProto type_e =
+ SchemaTypeConfigBuilder().SetType("E").AddParentType("A").Build();
+ SchemaTypeConfigProto type_f =
+ SchemaTypeConfigBuilder().SetType("F").AddParentType("D").Build();
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_c)
+ .AddType(type_d)
+ .AddType(type_e)
+ .AddType(type_f)
+ .Build();
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Get schema type id for each type.
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId type_a_id,
+ schema_store->GetSchemaTypeId("A"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId type_b_id,
+ schema_store->GetSchemaTypeId("B"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId type_c_id,
+ schema_store->GetSchemaTypeId("C"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId type_d_id,
+ schema_store->GetSchemaTypeId("D"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId type_e_id,
+ schema_store->GetSchemaTypeId("E"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId type_f_id,
+ schema_store->GetSchemaTypeId("F"));
+
+ // Check the results from GetSchemaTypeIdsWithChildren
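+  // Each lookup should return the type's own id plus the ids of all of its
+  // transitive children.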
+ EXPECT_THAT(
+ schema_store->GetSchemaTypeIdsWithChildren("A"),
+ IsOkAndHolds(Pointee(UnorderedElementsAre(
+ type_a_id, type_b_id, type_c_id, type_d_id, type_e_id, type_f_id))));
+ EXPECT_THAT(schema_store->GetSchemaTypeIdsWithChildren("B"),
+ IsOkAndHolds(Pointee(UnorderedElementsAre(
+ type_b_id, type_c_id, type_d_id, type_f_id))));
+ EXPECT_THAT(schema_store->GetSchemaTypeIdsWithChildren("C"),
+ IsOkAndHolds(Pointee(UnorderedElementsAre(type_c_id))));
+ EXPECT_THAT(
+ schema_store->GetSchemaTypeIdsWithChildren("D"),
+ IsOkAndHolds(Pointee(UnorderedElementsAre(type_d_id, type_f_id))));
+ EXPECT_THAT(schema_store->GetSchemaTypeIdsWithChildren("E"),
+ IsOkAndHolds(Pointee(UnorderedElementsAre(type_e_id))));
+ EXPECT_THAT(schema_store->GetSchemaTypeIdsWithChildren("F"),
+ IsOkAndHolds(Pointee(UnorderedElementsAre(type_f_id))));
+}
+
+TEST_F(SchemaStoreTest, DiamondGetSchemaTypeIdsWithChildren) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ // Create a schema with the following inheritance relation:
+  //       A
+  //      / \
+  //     B   E
+  //    / \ /
+  //   C   D
+  //    \ /
+  //     F
+ SchemaTypeConfigProto type_a = SchemaTypeConfigBuilder().SetType("A").Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder().SetType("B").AddParentType("A").Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder().SetType("C").AddParentType("B").Build();
+ SchemaTypeConfigProto type_d = SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddParentType("B")
+ .AddParentType("E")
+ .Build();
+ SchemaTypeConfigProto type_e =
+ SchemaTypeConfigBuilder().SetType("E").AddParentType("A").Build();
+ SchemaTypeConfigProto type_f = SchemaTypeConfigBuilder()
+ .SetType("F")
+ .AddParentType("C")
+ .AddParentType("D")
+ .Build();
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_c)
+ .AddType(type_d)
+ .AddType(type_e)
+ .AddType(type_f)
+ .Build();
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ // Get schema type id for each type.
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId type_a_id,
+ schema_store->GetSchemaTypeId("A"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId type_b_id,
+ schema_store->GetSchemaTypeId("B"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId type_c_id,
+ schema_store->GetSchemaTypeId("C"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId type_d_id,
+ schema_store->GetSchemaTypeId("D"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId type_e_id,
+ schema_store->GetSchemaTypeId("E"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId type_f_id,
+ schema_store->GetSchemaTypeId("F"));
+
+ // Check the results from GetSchemaTypeIdsWithChildren
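+  // With multiple inheritance, a child reachable through several parents
+  // (e.g. F via both C and D) should still appear exactly once per lookup.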
+ EXPECT_THAT(
+ schema_store->GetSchemaTypeIdsWithChildren("A"),
+ IsOkAndHolds(Pointee(UnorderedElementsAre(
+ type_a_id, type_b_id, type_c_id, type_d_id, type_e_id, type_f_id))));
+ EXPECT_THAT(schema_store->GetSchemaTypeIdsWithChildren("B"),
+ IsOkAndHolds(Pointee(UnorderedElementsAre(
+ type_b_id, type_c_id, type_d_id, type_f_id))));
+ EXPECT_THAT(
+ schema_store->GetSchemaTypeIdsWithChildren("C"),
+ IsOkAndHolds(Pointee(UnorderedElementsAre(type_c_id, type_f_id))));
+ EXPECT_THAT(
+ schema_store->GetSchemaTypeIdsWithChildren("D"),
+ IsOkAndHolds(Pointee(UnorderedElementsAre(type_d_id, type_f_id))));
+ EXPECT_THAT(schema_store->GetSchemaTypeIdsWithChildren("E"),
+ IsOkAndHolds(Pointee(
+ UnorderedElementsAre(type_e_id, type_d_id, type_f_id))));
+ EXPECT_THAT(schema_store->GetSchemaTypeIdsWithChildren("F"),
+ IsOkAndHolds(Pointee(UnorderedElementsAre(type_f_id))));
+}
+
+TEST_F(SchemaStoreTest, IndexableFieldsAreDefined) {
+ SchemaTypeConfigProto email_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("recipients")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("recipientIds")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(email_type).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+ constexpr SchemaTypeId kTypeEmailSchemaId = 0;
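+  // "Email" is the only type in the schema, so it is assigned SchemaTypeId 0.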
+
+ // Indexables.
+ EXPECT_TRUE(
+ schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId, "subject"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId,
+ "senderQualifiedId"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId,
+ "recipients"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId,
+ "recipientIds"));
+ EXPECT_TRUE(
+ schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId, "timestamp"));
+}
+
+TEST_F(SchemaStoreTest, JoinableFieldsAreDefined) {
+ SchemaTypeConfigProto email_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("tagQualifiedId")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(email_type).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+ constexpr SchemaTypeId kTypeEmailSchemaId = 0;
+
+ // Joinables.
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId,
+ "tagQualifiedId"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId,
+ "senderQualifiedId"));
+}
+
+TEST_F(SchemaStoreTest, NonIndexableFieldsAreDefined) {
+ SchemaTypeConfigProto email_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("attachment")
+ .SetDataType(TYPE_BYTES)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("nonindexableInteger")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(email_type).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+ constexpr SchemaTypeId kTypeEmailSchemaId = 0;
+
+ // Non-indexables.
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId,
+ "attachment"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId,
+ "nonindexableInteger"));
+ EXPECT_TRUE(
+ schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId, "text"));
+}
+
+TEST_F(SchemaStoreTest, NonExistentFieldsAreUndefined) {
+ SchemaTypeConfigProto email_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("senderQualifiedId")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("nonindexableInteger")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(email_type).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+ constexpr SchemaTypeId kTypeEmailSchemaId = 0;
+
+ // Non-existents.
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId, "foobar"));
+ EXPECT_FALSE(schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId,
+ "timestamp.foo"));
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeEmailSchemaId, "time"));
+}
+
+TEST_F(SchemaStoreTest, NestedIndexableFieldsAreDefined) {
+ SchemaTypeConfigProto email_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("tagQualifiedId")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+
+ SchemaTypeConfigProto conversation_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Conversation")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ "Email", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("nestedNonIndexable")
+ .SetDataTypeDocument("Email",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema =
+ SchemaBuilder().AddType(email_type).AddType(conversation_type).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+ constexpr SchemaTypeId kTypeConversationSchemaId = 1;
+
+ // Indexables.
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeConversationSchemaId,
+ "emails.subject"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeConversationSchemaId,
+ "emails.timestamp"));
+}
+
+TEST_F(SchemaStoreTest, NestedJoinableFieldsAreDefined) {
+ SchemaTypeConfigProto email_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("tagQualifiedId")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+
+ SchemaTypeConfigProto conversation_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Conversation")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ "Email", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("nestedNonIndexable")
+ .SetDataTypeDocument("Email",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema =
+ SchemaBuilder().AddType(email_type).AddType(conversation_type).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+ constexpr SchemaTypeId kTypeConversationSchemaId = 1;
+
+ // Joinables.
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeConversationSchemaId,
+ "emails.tagQualifiedId"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(
+ kTypeConversationSchemaId, "nestedNonIndexable.tagQualifiedId"));
+}
+
+TEST_F(SchemaStoreTest, NestedNonIndexableFieldsAreDefined) {
+ SchemaTypeConfigProto email_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("tagQualifiedId")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+
+ SchemaTypeConfigProto conversation_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Conversation")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ "Email", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("nestedNonIndexable")
+ .SetDataTypeDocument("Email",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema =
+ SchemaBuilder().AddType(email_type).AddType(conversation_type).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+ constexpr SchemaTypeId kTypeConversationSchemaId = 1;
+
+ // Non-indexables.
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeConversationSchemaId,
+ "emails.text"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(
+ kTypeConversationSchemaId, "nestedNonIndexable.subject"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(
+ kTypeConversationSchemaId, "nestedNonIndexable.text"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(
+ kTypeConversationSchemaId, "nestedNonIndexable.timestamp"));
+}
+
+TEST_F(SchemaStoreTest, NestedNonExistentFieldsAreUndefined) {
+ SchemaTypeConfigProto email_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("tagQualifiedId")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+
+ SchemaTypeConfigProto conversation_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Conversation")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ "Email", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("nestedNonIndexable")
+ .SetDataTypeDocument("Email",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema =
+ SchemaBuilder().AddType(email_type).AddType(conversation_type).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+ constexpr SchemaTypeId kTypeConversationSchemaId = 1;
+
+ // Non-existents.
+ EXPECT_FALSE(schema_store->IsPropertyDefinedInSchema(
+ kTypeConversationSchemaId, "emails.foobar"));
+ EXPECT_FALSE(schema_store->IsPropertyDefinedInSchema(
+ kTypeConversationSchemaId, "nestedNonIndexable.foobar"));
+ EXPECT_FALSE(schema_store->IsPropertyDefinedInSchema(
+ kTypeConversationSchemaId, "emails.timestamp.foo"));
+ EXPECT_FALSE(schema_store->IsPropertyDefinedInSchema(
+ kTypeConversationSchemaId, "emails.time"));
+}
+
+TEST_F(SchemaStoreTest, IntermediateDocumentPropertiesAreDefined) {
+ SchemaTypeConfigProto email_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("tagQualifiedId")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("timestamp")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+
+ SchemaTypeConfigProto conversation_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Conversation")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ "Email", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("nestedNonIndexable")
+ .SetDataTypeDocument("Email",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema =
+ SchemaBuilder().AddType(email_type).AddType(conversation_type).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+ constexpr SchemaTypeId kTypeConversationSchemaId = 1;
+
+ // Intermediate documents props.
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeConversationSchemaId,
+ "emails"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeConversationSchemaId,
+ "nestedNonIndexable"));
+}
+
+TEST_F(SchemaStoreTest, CyclePathsAreDefined) {
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A", /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+ constexpr SchemaTypeId kTypeASchemaId = 0;
+ constexpr SchemaTypeId kTypeBSchemaId = 1;
+
+ // A's top-level properties
+ EXPECT_TRUE(
+ schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "subject"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "b"));
+
+ // A's nested properties in B
+ EXPECT_TRUE(
+ schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "b.body"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "b.a"));
+
+ // A's nested properties in B's nested property in A
+ EXPECT_TRUE(
+ schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "b.a.subject"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "b.a.b"));
+
+ // B's top-level properties
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "body"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "a"));
+
+ // B's nested properties in A
+ EXPECT_TRUE(
+ schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "a.subject"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "a.b"));
+
+ // B's nested properties in A's nested property in B
+ EXPECT_TRUE(
+ schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "a.b.body"));
+ EXPECT_TRUE(schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "a.b.a"));
+}
+
+TEST_F(SchemaStoreTest, WrongTypeCyclePathsAreUndefined) {
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A", /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+ constexpr SchemaTypeId kTypeASchemaId = 0;
+ constexpr SchemaTypeId kTypeBSchemaId = 1;
+
+ // The same paths as above, but we check the wrong types instead.
+ // A's top-level properties
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "subject"));
+ EXPECT_FALSE(schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "b"));
+
+ // A's nested properties in B
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "b.body"));
+ EXPECT_FALSE(schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "b.a"));
+
+ // A's nested properties in B's nested property in A
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "b.a.subject"));
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "b.a.b"));
+
+ // B's top-level properties
+ EXPECT_FALSE(schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "body"));
+ EXPECT_FALSE(schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "a"));
+
+ // B's nested properties in A
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "a.subject"));
+ EXPECT_FALSE(schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "a.b"));
+
+ // B's nested properties in A's nested property in B
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "a.b.body"));
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "a.b.a"));
+}
+
+TEST_F(SchemaStoreTest, CyclePathsNonexistentPropertiesAreUndefined) {
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A", /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/true));
+ constexpr SchemaTypeId kTypeASchemaId = 0;
+ constexpr SchemaTypeId kTypeBSchemaId = 1;
+
+ // Undefined paths in A
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "b.subject"));
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "b.a.body"));
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "b.a.a"));
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeASchemaId, "b.a.subject.b"));
+
+ // Undefined paths in B
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "a.body"));
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "a.b.subject"));
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "a.b.b"));
+ EXPECT_FALSE(
+ schema_store->IsPropertyDefinedInSchema(kTypeBSchemaId, "a.b.body.a"));
+}
+
+TEST_F(SchemaStoreTest, LoadsOverlaySchemaOnInit) {
+ // Create a schema that is rollback incompatible and will trigger us to create
+ // an overlay schema.
+ PropertyConfigBuilder indexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN);
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("type_b")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ {
+    // Create a new instance of the schema store and check that the same
+    // schema is present.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+
+ // The overlay should exist
+ std::string overlay_schema_path = schema_store_dir_ + "/overlay_schema.pb";
+ ASSERT_TRUE(filesystem_.FileExists(overlay_schema_path.c_str()));
+
+ // The base schema should hold a compatible schema
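+    // i.e. the same schema with the rollback-incompatible RFC822 tokenizer
+    // downgraded to a plain unindexed string property.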
+ SchemaTypeConfigProto modified_type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_STRING))
+ .Build();
+ SchemaProto expected_base_schema =
+ SchemaBuilder().AddType(modified_type_a).AddType(type_b).Build();
+ std::string base_schema_path = schema_store_dir_ + "/schema.pb";
+ auto base_schema_file_ = std::make_unique<FileBackedProto<SchemaProto>>(
+ filesystem_, base_schema_path);
+ ICING_ASSERT_OK_AND_ASSIGN(const SchemaProto* base_schema,
+ base_schema_file_->Read());
+ EXPECT_THAT(*base_schema, EqualsProto(expected_base_schema));
+ }
+}
+
+TEST_F(SchemaStoreTest, LoadsBaseSchemaWithNoOverlayOnInit) {
+ // Create a normal schema that won't require an overlay.
+ PropertyConfigBuilder indexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN);
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("type_b")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ {
+ // Create a new instance of the schema store and check that the same schema
+ // is present.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+
+ // Additionally, the overlay should not exist
+ std::string overlay_schema_path = schema_store_dir_ + "/overlay_schema.pb";
+ ASSERT_FALSE(filesystem_.FileExists(overlay_schema_path.c_str()));
+ }
+}
+
+TEST_F(SchemaStoreTest, LoadSchemaBackupSchemaMissing) {
+ // Create a schema that is rollback incompatible and will trigger us to create
+ // a backup schema.
+ PropertyConfigBuilder indexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN);
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("type_b")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ // Delete the backup schema.
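+  // (When an overlay schema is in use, the base schema.pb serves as the
+  // rollback-compatible backup.)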
+ std::string backup_schema_path = schema_store_dir_ + "/schema.pb";
+ ASSERT_TRUE(filesystem_.DeleteFile(backup_schema_path.c_str()));
+
+ {
+ // Create a new instance of the schema store and check that it fails because
+ // the backup schema is not available.
+ EXPECT_THAT(
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ }
+}
+
+TEST_F(SchemaStoreTest, LoadSchemaOverlaySchemaMissing) {
+ // Create a schema that is rollback incompatible and will trigger us to create
+ // a backup schema.
+ PropertyConfigBuilder indexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN);
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("type_b")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ // Delete the overlay schema.
+ std::string overlay_schema_path = schema_store_dir_ + "/overlay_schema.pb";
+ ASSERT_TRUE(filesystem_.DeleteFile(overlay_schema_path.c_str()));
+
+ {
+ // Create a new instance of the schema store and check that it fails because
+ // the overlay schema is not available when we expected it to be.
+ EXPECT_THAT(
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ }
+}
+
+TEST_F(SchemaStoreTest, LoadSchemaHeaderMissing) {
+ // Create a schema that is rollback incompatible and will trigger us to create
+ // a backup schema.
+ PropertyConfigBuilder indexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN);
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("type_b")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+  // Delete the schema header.
+ std::string schema_header_path = schema_store_dir_ + "/schema_store_header";
+ ASSERT_TRUE(filesystem_.DeleteFile(schema_header_path.c_str()));
+
+ {
+    // Create a new instance of the schema store and check that it fails
+    // because the schema header is not available.
+ EXPECT_THAT(
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ }
+}
+
+TEST_F(SchemaStoreTest, LoadSchemaNoOverlayHeaderMissing) {
+ // Create a normal schema that won't require a backup.
+ PropertyConfigBuilder indexed_string_property_builder =
+ PropertyConfigBuilder()
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN);
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("type_b")
+ .AddProperty(indexed_string_property_builder.SetName("prop0"))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ // Delete the schema header.
+ std::string schema_header_path = schema_store_dir_ + "/schema_store_header";
+ ASSERT_TRUE(filesystem_.DeleteFile(schema_header_path.c_str()));
+
+ {
+ // Create a new instance of the schema store and check that it fails because
+ // the schema header (which is now a part of the ground truth) is not
+ // available.
+ EXPECT_THAT(
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ }
+}
+
+TEST_F(SchemaStoreTest, MigrateSchemaCompatibleNoChange) {
+ // Create a schema that is rollback incompatible and will trigger us to create
+ // a backup schema.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).Build();
+
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
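+  // A compatible version change should leave the persisted schema untouched.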
+ ICING_EXPECT_OK(SchemaStore::MigrateSchema(
+ &filesystem_, schema_store_dir_, version_util::StateChange::kCompatible,
+ version_util::kVersion));
+
+ {
+    // Create a new instance of the schema store and check that the same
+    // schema is present.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+}
+
+TEST_F(SchemaStoreTest, MigrateSchemaUpgradeNoChange) {
+ // Create a schema that is rollback incompatible and will trigger us to create
+ // a backup schema.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).Build();
+
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
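+  // An upgrade to a newer version should likewise leave the schema untouched.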
+ ICING_EXPECT_OK(SchemaStore::MigrateSchema(
+ &filesystem_, schema_store_dir_, version_util::StateChange::kUpgrade,
+ version_util::kVersion + 1));
+
+ {
+    // Create a new instance of the schema store and check that the same
+    // schema is present.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+}
+
+TEST_F(SchemaStoreTest, MigrateSchemaVersionZeroUpgradeNoChange) {
+ // Because we are upgrading from version zero, the schema must be compatible
+ // with version zero.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).Build();
+
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ ICING_EXPECT_OK(
+ SchemaStore::MigrateSchema(&filesystem_, schema_store_dir_,
+ version_util::StateChange::kVersionZeroUpgrade,
+ version_util::kVersion + 1));
+
+ {
+ // Create a new instance of the schema store and check that the same
+ // schema is present.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+}
+
+TEST_F(SchemaStoreTest,
+ MigrateSchemaRollbackDiscardsIncompatibleOverlaySchema) {
+ // Create a schema that is rollback incompatible and will trigger us to
+ // create a backup schema.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).Build();
+
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ // Rollback to a version before kVersionOne. The schema header will declare
+ // that the overlay is compatible with any version starting with kVersionOne.
+ // So kVersionOne - 1 is incompatible and the overlay schema will be
+ // discarded.
+ ICING_EXPECT_OK(SchemaStore::MigrateSchema(
+ &filesystem_, schema_store_dir_, version_util::StateChange::kRollBack,
+ version_util::kVersionOne - 1));
+
+ {
+ // Create a new instance of the schema store and check that we fell back
+ // to the base schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ SchemaTypeConfigProto other_type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_STRING))
+ .Build();
+ SchemaProto base_schema = SchemaBuilder().AddType(other_type_a).Build();
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(base_schema))));
+ }
+}
+
+TEST_F(SchemaStoreTest, MigrateSchemaRollbackKeepsCompatibleOverlaySchema) {
+ // Create a schema that is rollback incompatible and will trigger us to
+ // create a backup schema.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).Build();
+
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ // Rollback to kVersion. The schema header will declare that the overlay is
+ // compatible with any version starting with kVersion. So we will be
+ // compatible and retain the overlay schema.
+ ICING_EXPECT_OK(SchemaStore::MigrateSchema(
+ &filesystem_, schema_store_dir_, version_util::StateChange::kRollBack,
+ version_util::kVersion));
+
+ {
+ // Create a new instance of the schema store and check that the same
+ // schema is present.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+}
+
+TEST_F(SchemaStoreTest, MigrateSchemaRollforwardRetainsBaseSchema) {
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).Build();
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ // Rollback to a version before kVersionOne. The schema header will declare
+ // that the overlay is compatible with any version starting with kVersionOne.
+ // So kVersionOne - 1 is incompatible and the overlay schema will be
+ // discarded.
+ ICING_EXPECT_OK(SchemaStore::MigrateSchema(
+ &filesystem_, schema_store_dir_, version_util::StateChange::kRollBack,
+ version_util::kVersionOne - 1));
+
+ SchemaTypeConfigProto other_type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_STRING))
+ .Build();
+ SchemaProto base_schema = SchemaBuilder().AddType(other_type_a).Build();
+
+ {
+ // Create a new instance of the schema store and check that we fell back
+ // to the base schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(base_schema))));
+ }
+
+ // Now rollforward to a new version. This should accept whatever schema is
+ // present (currently the base schema).
+ ICING_EXPECT_OK(SchemaStore::MigrateSchema(
+ &filesystem_, schema_store_dir_, version_util::StateChange::kRollForward,
+ version_util::kVersion));
+ {
+ // Create a new instance of the schema store and check that we fell back
+ // to the base schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(base_schema))));
+ }
+}
+
+TEST_F(SchemaStoreTest, MigrateSchemaRollforwardRetainsOverlaySchema) {
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).Build();
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ // Rollback to kVersion. The schema header will declare that the overlay is
+ // compatible with any version starting with kVersion. So we will be
+ // compatible and retain the overlay schema.
+ ICING_EXPECT_OK(SchemaStore::MigrateSchema(
+ &filesystem_, schema_store_dir_, version_util::StateChange::kRollBack,
+ version_util::kVersion));
+
+ {
+ // Create a new instance of the schema store and check that the same
+ // schema is present.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ // Now rollforward to a new version. This should accept whatever schema is
+ // present (currently the overlay schema).
+ ICING_EXPECT_OK(SchemaStore::MigrateSchema(
+ &filesystem_, schema_store_dir_, version_util::StateChange::kRollForward,
+ version_util::kVersion));
+ {
+ // Create a new instance of the schema store and check that the same
+ // schema is present.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+}
+
+TEST_F(SchemaStoreTest,
+ MigrateSchemaVersionZeroRollforwardDiscardsOverlaySchema) {
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).Build();
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ // A VersionZeroRollforward will always discard the overlay schema because it
+ // could be stale.
+ ICING_EXPECT_OK(SchemaStore::MigrateSchema(
+ &filesystem_, schema_store_dir_,
+ version_util::StateChange::kVersionZeroRollForward,
+ version_util::kVersion));
+
+ SchemaTypeConfigProto other_type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_STRING))
+ .Build();
+ SchemaProto base_schema = SchemaBuilder().AddType(other_type_a).Build();
+
+ {
+ // Create a new instance of the schema store and check that we fell back
+ // to the base schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(base_schema))));
+ }
+}
+
+TEST_F(SchemaStoreTest, MigrateSchemaVersionUndeterminedDiscardsOverlaySchema) {
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(type_a).Build();
+ {
+ // Create an instance of the schema store and set the schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(schema))));
+ }
+
+ // An Undetermined state change will always discard the overlay schema
+ // because it doesn't know which state we're in, so it falls back to the
+ // base schema, which should always be valid.
+ ICING_EXPECT_OK(SchemaStore::MigrateSchema(
+ &filesystem_, schema_store_dir_, version_util::StateChange::kUndetermined,
+ version_util::kVersion));
+
+ SchemaTypeConfigProto other_type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("type_a")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("propRfc")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_STRING))
+ .Build();
+ SchemaProto base_schema = SchemaBuilder().AddType(other_type_a).Build();
+
+ {
+ // Create a new instance of the schema store and check that we fell back
+ // to the base schema.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ EXPECT_THAT(schema_store->GetSchema(),
+ IsOkAndHolds(Pointee(EqualsProto(base_schema))));
+ }
+}
+
} // namespace
} // namespace lib
diff --git a/icing/schema/schema-type-manager.cc b/icing/schema/schema-type-manager.cc
new file mode 100644
index 0000000..4a6b7f2
--- /dev/null
+++ b/icing/schema/schema-type-manager.cc
@@ -0,0 +1,108 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/schema-type-manager.h"
+
+#include <memory>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/schema/joinable-property-manager.h"
+#include "icing/schema/property-util.h"
+#include "icing/schema/schema-property-iterator.h"
+#include "icing/schema/schema-util.h"
+#include "icing/schema/section-manager.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/key-mapper.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<SchemaTypeManager>>
+SchemaTypeManager::Create(const SchemaUtil::TypeConfigMap& type_config_map,
+ const KeyMapper<SchemaTypeId>* schema_type_mapper) {
+ ICING_RETURN_ERROR_IF_NULL(schema_type_mapper);
+
+ SectionManager::Builder section_manager_builder(*schema_type_mapper);
+ JoinablePropertyManager::Builder joinable_property_manager_builder(
+ *schema_type_mapper);
+
+ for (const auto& [type_config_name, type_config] : type_config_map) {
+ ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
+ schema_type_mapper->Get(type_config_name));
+
+ // Use the iterator to traverse all leaf properties of the schema type.
+ SchemaPropertyIterator iterator(type_config, type_config_map);
+ while (true) {
+ libtextclassifier3::Status status = iterator.Advance();
+ if (!status.ok()) {
+ if (absl_ports::IsOutOfRange(status)) {
+ break;
+ }
+ return status;
+ }
+
+ // Process section (indexable property)
+ if (iterator.GetCurrentPropertyIndexable()) {
+ ICING_RETURN_IF_ERROR(
+ section_manager_builder.ProcessSchemaTypePropertyConfig(
+ schema_type_id, iterator.GetCurrentPropertyConfig(),
+ iterator.GetCurrentPropertyPath()));
+ }
+
+ // Process joinable property
+ ICING_RETURN_IF_ERROR(
+ joinable_property_manager_builder.ProcessSchemaTypePropertyConfig(
+ schema_type_id, iterator.GetCurrentPropertyConfig(),
+ iterator.GetCurrentPropertyPath()));
+ }
+
+ // Process unknown property paths in the indexable_nested_properties_list.
+ // These property paths should consume sectionIds but are currently
+ // not indexed.
+ //
+ // SectionId assignment order:
+ // - We assign section ids to known (existing) properties first, in
+ // alphabetical order.
+ // - After handling all known properties, we assign section ids to all
+ // unknown (non-existent) properties that are specified in the
+ // indexable_nested_properties_list.
+ // - As a result, assignment of the entire section set is not done
+ // alphabetically, but assignment is still deterministic and alphabetical
+ // order is preserved inside the known properties and unknown properties
+ // sets individually.
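+ //
+ // Illustrative example (hypothetical property names): with known
+ // properties "a.foo" and "b.bar" plus unknown properties "z.qux" and
+ // "y.qux" in the list, section ids are assigned to "a.foo", "b.bar",
+ // "y.qux", "z.qux" in that order.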
+ for (const auto& property_path :
+ iterator.unknown_indexable_nested_property_paths()) {
+ PropertyConfigProto unknown_property_config;
+ unknown_property_config.set_property_name(std::string(
+ property_util::SplitPropertyPathExpr(property_path).back()));
+ unknown_property_config.set_data_type(
+ PropertyConfigProto::DataType::UNKNOWN);
+
+ ICING_RETURN_IF_ERROR(
+ section_manager_builder.ProcessSchemaTypePropertyConfig(
+ schema_type_id, unknown_property_config,
+ std::string(property_path)));
+ }
+ }
+
+ return std::unique_ptr<SchemaTypeManager>(new SchemaTypeManager(
+ std::move(section_manager_builder).Build(),
+ std::move(joinable_property_manager_builder).Build()));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/schema-type-manager.h b/icing/schema/schema-type-manager.h
new file mode 100644
index 0000000..f2adbd9
--- /dev/null
+++ b/icing/schema/schema-type-manager.h
@@ -0,0 +1,79 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCHEMA_SCHEMA_TYPE_MANAGER_H_
+#define ICING_SCHEMA_SCHEMA_TYPE_MANAGER_H_
+
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/schema/joinable-property-manager.h"
+#include "icing/schema/schema-util.h"
+#include "icing/schema/section-manager.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/key-mapper.h"
+
+namespace icing {
+namespace lib {
+
+// This class is a wrapper of SectionManager and JoinablePropertyManager.
+class SchemaTypeManager {
+ public:
+ // Schema type ids are contiguous, so we use a vector instead of an
+ // unordered map for the mappings.
+ using SchemaTypeIdToPropertiesVector =
+ std::vector<std::unordered_set<std::string>>;
+ // Factory function to create a SchemaTypeManager which does not take
+ // ownership of any input components, and all pointers must refer to valid
+ // objects that outlive the created SchemaTypeManager instance.
+ //
+ // Returns:
+ // - A SchemaTypeManager on success
+ // - FAILED_PRECONDITION_ERROR on any null pointer input
+ // - OUT_OF_RANGE_ERROR if # of indexable properties in a single Schema
+ // exceeds the threshold (kTotalNumSections, kTotalNumJoinableProperties)
+ // - INVALID_ARGUMENT_ERROR if type_config_map contains incorrect
+ // information that causes errors (e.g. invalid schema type id, cycle
+ // dependency in nested schema)
+ // - NOT_FOUND_ERROR if any nested schema name is not found in
+ // type_config_map
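+ //
+ // Example usage (a sketch; assumes `type_config_map` and
+ // `schema_type_mapper` have already been populated, as in
+ // schema-type-manager_test.cc):
+ //   ICING_ASSIGN_OR_RETURN(
+ //       std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ //       SchemaTypeManager::Create(type_config_map,
+ //                                 schema_type_mapper.get()));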
+ static libtextclassifier3::StatusOr<std::unique_ptr<SchemaTypeManager>>
+ Create(const SchemaUtil::TypeConfigMap& type_config_map,
+ const KeyMapper<SchemaTypeId>* schema_type_mapper);
+
+ const SectionManager& section_manager() const { return *section_manager_; }
+
+ const JoinablePropertyManager& joinable_property_manager() const {
+ return *joinable_property_manager_;
+ }
+
+ private:
+ explicit SchemaTypeManager(
+ std::unique_ptr<SectionManager> section_manager,
+ std::unique_ptr<JoinablePropertyManager> joinable_property_manager)
+ : section_manager_(std::move(section_manager)),
+ joinable_property_manager_(std::move(joinable_property_manager)) {}
+
+ std::unique_ptr<SectionManager> section_manager_;
+
+ std::unique_ptr<JoinablePropertyManager> joinable_property_manager_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCHEMA_SCHEMA_TYPE_MANAGER_H_
diff --git a/icing/schema/schema-type-manager_test.cc b/icing/schema/schema-type-manager_test.cc
new file mode 100644
index 0000000..eafc612
--- /dev/null
+++ b/icing/schema/schema-type-manager_test.cc
@@ -0,0 +1,356 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/schema/schema-type-manager.h"
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-util.h"
+#include "icing/schema/section.h"
+#include "icing/store/dynamic-trie-key-mapper.h"
+#include "icing/store/key-mapper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Pointee;
+
+// type and property names of EmailMessage
+static constexpr char kTypeEmail[] = "EmailMessage";
+static constexpr SchemaTypeId kTypeEmailSchemaId = 0;
+// indexable (in lexicographical order)
+static constexpr char kPropertyRecipientIds[] = "recipientIds";
+static constexpr char kPropertyRecipients[] = "recipients";
+static constexpr char kPropertySenderQualifiedId[] =
+ "senderQualifiedId"; // QUALIFIED_ID joinable
+static constexpr char kPropertySubject[] = "subject";
+static constexpr char kPropertyTimestamp[] = "timestamp";
+// non-indexable
+static constexpr char kPropertyAttachment[] = "attachment";
+static constexpr char kPropertyNonIndexableInteger[] = "nonIndexableInteger";
+static constexpr char kPropertyTagQualifiedId[] =
+ "tagQualifiedId"; // QUALIFIED_ID joinable
+static constexpr char kPropertyText[] = "text";
+
+// type and property names of Conversation
+static constexpr char kTypeConversation[] = "Conversation";
+static constexpr SchemaTypeId kTypeConversationSchemaId = 1;
+// indexable (in lexicographical order)
+static constexpr char kPropertyEmails[] = "emails";
+static constexpr char kPropertyGroupQualifiedId[] =
+ "groupQualifiedId"; // QUALIFIED_ID joinable
+static constexpr char kPropertyName[] = "name";
+// non-indexable
+static constexpr char kPropertyNestedNonIndexable[] = "nestedNonIndexable";
+static constexpr char kPropertySuperTagQualifiedId[] =
+ "superTagQualifiedId"; // QUALIFIED_ID joinable
+
+PropertyConfigProto CreateRecipientIdsPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyRecipientIds)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REPEATED)
+ .Build();
+}
+
+PropertyConfigProto CreateRecipientsPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyRecipients)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED)
+ .Build();
+}
+
+PropertyConfigProto CreateSenderQualifiedIdPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertySenderQualifiedId)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+}
+
+PropertyConfigProto CreateSubjectPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertySubject)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+}
+
+PropertyConfigProto CreateTimestampPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyTimestamp)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+}
+
+PropertyConfigProto CreateTagQualifiedIdPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyTagQualifiedId)
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+}
+
+PropertyConfigProto CreateGroupQualifiedIdPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyGroupQualifiedId)
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+}
+
+PropertyConfigProto CreateSuperTagQualifiedIdPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertySuperTagQualifiedId)
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID, /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+}
+
+PropertyConfigProto CreateNamePropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyName)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+}
+
+SchemaTypeConfigProto CreateEmailTypeConfig() {
+ return SchemaTypeConfigBuilder()
+ .SetType(kTypeEmail)
+ .AddProperty(CreateTagQualifiedIdPropertyConfig())
+ .AddProperty(CreateSubjectPropertyConfig())
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyText)
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyAttachment)
+ .SetDataType(TYPE_BYTES)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(CreateSenderQualifiedIdPropertyConfig())
+ .AddProperty(CreateRecipientsPropertyConfig())
+ .AddProperty(CreateRecipientIdsPropertyConfig())
+ .AddProperty(CreateTimestampPropertyConfig())
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyNonIndexableInteger)
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+}
+
+SchemaTypeConfigProto CreateConversationTypeConfig() {
+ return SchemaTypeConfigBuilder()
+ .SetType(kTypeConversation)
+ .AddProperty(CreateSuperTagQualifiedIdPropertyConfig())
+ .AddProperty(CreateNamePropertyConfig())
+ .AddProperty(CreateGroupQualifiedIdPropertyConfig())
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyEmails)
+ .SetDataTypeDocument(kTypeEmail, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyNestedNonIndexable)
+ .SetDataTypeDocument(kTypeEmail,
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .Build();
+}
+
+class SchemaTypeManagerTest : public ::testing::Test {
+ protected:
+ void SetUp() override { test_dir_ = GetTestTempDir() + "/icing"; }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ std::string test_dir_;
+};
+
+TEST_F(SchemaTypeManagerTest, Create) {
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace(kTypeEmail, CreateEmailTypeConfig());
+ type_config_map.emplace(kTypeConversation, CreateConversationTypeConfig());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put(kTypeEmail, kTypeEmailSchemaId));
+ ICING_ASSERT_OK(
+ schema_type_mapper->Put(kTypeConversation, kTypeConversationSchemaId));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, schema_type_mapper.get()));
+
+ // Check SectionManager
+ // In the Email type, "recipientIds", "recipients", "senderQualifiedId",
+ // "subject" and "timestamp" are indexable properties. "attachment",
+ // "nonIndexableInteger", "tagQualifiedId" and "text" are non-indexable.
+ EXPECT_THAT(
+ schema_type_manager->section_manager().GetMetadataList(kTypeEmail),
+ IsOkAndHolds(Pointee(ElementsAre(
+ EqualsSectionMetadata(/*expected_id=*/0,
+ /*expected_property_path=*/"recipientIds",
+ CreateRecipientIdsPropertyConfig()),
+ EqualsSectionMetadata(/*expected_id=*/1,
+ /*expected_property_path=*/"recipients",
+ CreateRecipientsPropertyConfig()),
+ EqualsSectionMetadata(/*expected_id=*/2,
+ /*expected_property_path=*/"senderQualifiedId",
+ CreateSenderQualifiedIdPropertyConfig()),
+ EqualsSectionMetadata(/*expected_id=*/3,
+ /*expected_property_path=*/"subject",
+ CreateSubjectPropertyConfig()),
+ EqualsSectionMetadata(/*expected_id=*/4,
+ /*expected_property_path=*/"timestamp",
+ CreateTimestampPropertyConfig())))));
+
+ // In the Conversation type, "groupQualifiedId" and "name" are indexable
+ // properties, as are the indexable properties of the email in the "emails"
+ // property. None of the properties of the email in the "nestedNonIndexable"
+ // property are indexable.
+ EXPECT_THAT(
+ schema_type_manager->section_manager().GetMetadataList(kTypeConversation),
+ IsOkAndHolds(Pointee(ElementsAre(
+ EqualsSectionMetadata(
+ /*expected_id=*/0,
+ /*expected_property_path=*/"emails.recipientIds",
+ CreateRecipientIdsPropertyConfig()),
+ EqualsSectionMetadata(/*expected_id=*/1,
+ /*expected_property_path=*/"emails.recipients",
+ CreateRecipientsPropertyConfig()),
+ EqualsSectionMetadata(
+ /*expected_id=*/2,
+ /*expected_property_path=*/"emails.senderQualifiedId",
+ CreateSenderQualifiedIdPropertyConfig()),
+ EqualsSectionMetadata(/*expected_id=*/3,
+ /*expected_property_path=*/"emails.subject",
+ CreateSubjectPropertyConfig()),
+ EqualsSectionMetadata(/*expected_id=*/4,
+ /*expected_property_path=*/"emails.timestamp",
+ CreateTimestampPropertyConfig()),
+ EqualsSectionMetadata(/*expected_id=*/5,
+ /*expected_property_path=*/"groupQualifiedId",
+ CreateGroupQualifiedIdPropertyConfig()),
+ EqualsSectionMetadata(/*expected_id=*/6,
+ /*expected_property_path=*/"name",
+ CreateNamePropertyConfig())))));
+
+ // Check JoinablePropertyManager
+ // In the Email type, "senderQualifiedId" and "tagQualifiedId" are joinable
+ // properties.
+ EXPECT_THAT(
+ schema_type_manager->joinable_property_manager().GetMetadataList(
+ kTypeEmail),
+ IsOkAndHolds(Pointee(ElementsAre(
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/0, /*expected_property_path=*/"senderQualifiedId",
+ CreateSenderQualifiedIdPropertyConfig()),
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/1, /*expected_property_path=*/"tagQualifiedId",
+ CreateTagQualifiedIdPropertyConfig())))));
+ // In the Conversation type, "groupQualifiedId" and "superTagQualifiedId" are
+ // joinable properties, as are the joinable properties of the email in the
+ // "emails" and "nestedNonIndexable" properties.
+ EXPECT_THAT(
+ schema_type_manager->joinable_property_manager().GetMetadataList(
+ kTypeConversation),
+ IsOkAndHolds(Pointee(ElementsAre(
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/0,
+ /*expected_property_path=*/"emails.senderQualifiedId",
+ CreateSenderQualifiedIdPropertyConfig()),
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/1,
+ /*expected_property_path=*/"emails.tagQualifiedId",
+ CreateTagQualifiedIdPropertyConfig()),
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/2, /*expected_property_path=*/"groupQualifiedId",
+ CreateGroupQualifiedIdPropertyConfig()),
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/3,
+ /*expected_property_path=*/"nestedNonIndexable.senderQualifiedId",
+ CreateSenderQualifiedIdPropertyConfig()),
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/4,
+ /*expected_property_path=*/"nestedNonIndexable.tagQualifiedId",
+ CreateTagQualifiedIdPropertyConfig()),
+ EqualsJoinablePropertyMetadata(
+ /*expected_id=*/5,
+ /*expected_property_path=*/"superTagQualifiedId",
+ CreateSuperTagQualifiedIdPropertyConfig())))));
+}
+
+TEST_F(SchemaTypeManagerTest, CreateWithNullPointerShouldFail) {
+ SchemaUtil::TypeConfigMap type_config_map;
+ EXPECT_THAT(SchemaTypeManager::Create(type_config_map,
+ /*schema_type_mapper=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_F(SchemaTypeManagerTest, CreateWithSchemaNotInSchemaTypeMapperShouldFail) {
+ SchemaTypeConfigProto type_config;
+ type_config.set_schema_type("type");
+
+ auto property = type_config.add_properties();
+ property->set_property_name("property");
+ property->set_data_type(TYPE_STRING);
+ property->set_cardinality(CARDINALITY_REQUIRED);
+ property->mutable_string_indexing_config()->set_term_match_type(
+ TERM_MATCH_EXACT);
+
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace("type", type_config);
+
+ // Create an empty schema type mapper
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+
+ EXPECT_THAT(
+ SchemaTypeManager::Create(type_config_map, schema_type_mapper.get()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/schema/schema-util.cc b/icing/schema/schema-util.cc
index 7413d73..72287a8 100644
--- a/icing/schema/schema-util.cc
+++ b/icing/schema/schema-util.cc
@@ -14,19 +14,21 @@
#include "icing/schema/schema-util.h"
+#include <algorithm>
#include <cstdint>
+#include <queue>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>
#include <utility>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/absl_ports/annotate.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/absl_ports/str_cat.h"
#include "icing/absl_ports/str_join.h"
-#include "icing/legacy/core/icing-string-util.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/term.pb.h"
#include "icing/util/logging.h"
@@ -37,6 +39,20 @@ namespace lib {
namespace {
+bool ArePropertiesEqual(const PropertyConfigProto& old_property,
+ const PropertyConfigProto& new_property) {
+ return old_property.property_name() == new_property.property_name() &&
+ old_property.data_type() == new_property.data_type() &&
+ old_property.schema_type() == new_property.schema_type() &&
+ old_property.cardinality() == new_property.cardinality() &&
+ old_property.string_indexing_config().term_match_type() ==
+ new_property.string_indexing_config().term_match_type() &&
+ old_property.string_indexing_config().tokenizer_type() ==
+ new_property.string_indexing_config().tokenizer_type() &&
+ old_property.document_indexing_config().index_nested_properties() ==
+ new_property.document_indexing_config().index_nested_properties();
+}
+
bool IsCardinalityCompatible(const PropertyConfigProto& old_property,
const PropertyConfigProto& new_property) {
if (old_property.cardinality() < new_property.cardinality()) {
@@ -87,39 +103,481 @@ bool IsPropertyCompatible(const PropertyConfigProto& old_property,
IsCardinalityCompatible(old_property, new_property);
}
-bool IsTermMatchTypeCompatible(const IndexingConfig& old_indexed,
- const IndexingConfig& new_indexed) {
+bool IsTermMatchTypeCompatible(const StringIndexingConfig& old_indexed,
+ const StringIndexingConfig& new_indexed) {
return old_indexed.term_match_type() == new_indexed.term_match_type() &&
old_indexed.tokenizer_type() == new_indexed.tokenizer_type();
}
+bool IsIntegerNumericMatchTypeCompatible(
+ const IntegerIndexingConfig& old_indexed,
+ const IntegerIndexingConfig& new_indexed) {
+ return old_indexed.numeric_match_type() == new_indexed.numeric_match_type();
+}
+
+bool IsDocumentIndexingCompatible(const DocumentIndexingConfig& old_indexed,
+ const DocumentIndexingConfig& new_indexed) {
+ // TODO(b/265304217): This could mark the new schema as incompatible and
+ // generate some unnecessary index rebuilds if the two schemas have an
+ // equivalent set of indexed properties but declare them differently.
+ if (old_indexed.index_nested_properties() !=
+ new_indexed.index_nested_properties()) {
+ return false;
+ }
+
+ if (old_indexed.indexable_nested_properties_list().size() !=
+ new_indexed.indexable_nested_properties_list().size()) {
+ return false;
+ }
+
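+ // Since the sizes match, the two lists cover the same set of nested
+ // properties iff every entry of the new list appears in the old set
+ // (assuming the lists contain no duplicate entries).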
+ std::unordered_set<std::string_view> old_indexable_nested_properties_set(
+ old_indexed.indexable_nested_properties_list().begin(),
+ old_indexed.indexable_nested_properties_list().end());
+ for (const auto& property : new_indexed.indexable_nested_properties_list()) {
+ if (old_indexable_nested_properties_set.find(property) ==
+ old_indexable_nested_properties_set.end()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+void AddIncompatibleChangeToDelta(
+ std::unordered_set<std::string>& incompatible_delta,
+ const SchemaTypeConfigProto& old_type_config,
+ const SchemaUtil::DependentMap& new_schema_dependent_map,
+ const SchemaUtil::TypeConfigMap& old_type_config_map,
+ const SchemaUtil::TypeConfigMap& new_type_config_map) {
+ // If this type is incompatible, then every type that depends on it might
+ // also be incompatible. Use the dependent map to mark those ones as
+ // incompatible too.
+ incompatible_delta.insert(old_type_config.schema_type());
+ auto dependent_types_itr =
+ new_schema_dependent_map.find(old_type_config.schema_type());
+ if (dependent_types_itr != new_schema_dependent_map.end()) {
+ for (const auto& [dependent_type, _] : dependent_types_itr->second) {
+ // The types from new_schema that depend on the current
+ // old_type_config may not be present in old_schema.
+ // Those types will be listed in schema_delta.schema_types_new
+ // instead.
+ std::string dependent_type_str(dependent_type);
+ if (old_type_config_map.find(dependent_type_str) !=
+ old_type_config_map.end()) {
+ incompatible_delta.insert(std::move(dependent_type_str));
+ }
+ }
+ }
+}
+
+// Returns whether C1 <= C2 based on the following rule, where C1 and C2 are
+// cardinalities that can be one of REPEATED, OPTIONAL, or REQUIRED.
+//
+// Rule: REQUIRED < OPTIONAL < REPEATED
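+//
+// e.g. CardinalityLessThanEq(REQUIRED, REPEATED) is true, while
+// CardinalityLessThanEq(REPEATED, OPTIONAL) is false.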
+bool CardinalityLessThanEq(PropertyConfigProto::Cardinality::Code C1,
+ PropertyConfigProto::Cardinality::Code C2) {
+ if (C1 == C2) {
+ return true;
+ }
+ if (C1 == PropertyConfigProto::Cardinality::REQUIRED) {
+ return C2 == PropertyConfigProto::Cardinality::OPTIONAL ||
+ C2 == PropertyConfigProto::Cardinality::REPEATED;
+ }
+ if (C1 == PropertyConfigProto::Cardinality::OPTIONAL) {
+ return C2 == PropertyConfigProto::Cardinality::REPEATED;
+ }
+ return false;
+}
+
+// Check if set1 is a subset of set2.
+template <typename T>
+bool IsSubset(const std::unordered_set<T>& set1,
+ const std::unordered_set<T>& set2) {
+ for (const auto& item : set1) {
+ if (set2.find(item) == set2.end()) {
+ return false;
+ }
+ }
+ return true;
+}
+
} // namespace
-libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) {
- // Tracks SchemaTypeConfigs that we've validated already.
- std::unordered_set<std::string_view> known_schema_types;
+libtextclassifier3::Status CalculateTransitiveNestedTypeRelations(
+ const SchemaUtil::DependentMap& direct_nested_types_map,
+ const std::unordered_set<std::string_view>& joinable_types,
+ std::string_view type, bool path_contains_joinable_property,
+ SchemaUtil::DependentMap* expanded_nested_types_map,
+ std::unordered_map<std::string_view, bool>&&
+ pending_expansion_paths_indexable,
+ std::unordered_set<std::string_view>* sink_types) {
+ // TODO(b/280698121): Implement optimizations to this code to avoid reentering
+ // a node after it's already been expanded.
- // Tracks SchemaTypeConfigs that have been mentioned (by other
- // SchemaTypeConfigs), but we haven't validated yet.
- std::unordered_set<std::string_view> unknown_schema_types;
+ auto itr = direct_nested_types_map.find(type);
+ if (itr == direct_nested_types_map.end()) {
+ // It's a sink node. Just return.
+ sink_types->insert(type);
+ return libtextclassifier3::Status::OK;
+ }
+ std::unordered_map<std::string_view, std::vector<const PropertyConfigProto*>>
+ expanded_relations;
+
+ // Add all of the adjacent outgoing relations.
+ expanded_relations.reserve(itr->second.size());
+ expanded_relations.insert(itr->second.begin(), itr->second.end());
+
+ // Iterate through each adjacent outgoing relation and add their indirect
+ // outgoing relations.
+ for (const auto& [adjacent_type, adjacent_property_protos] : itr->second) {
+ // Make a copy of pending_expansion_paths_indexable for every iteration.
+ std::unordered_map<std::string_view, bool> pending_expansion_paths_copy(
+ pending_expansion_paths_indexable);
+
+ // 1. Check the nested indexable config of the edge (type -> adjacent_type),
+ // and the joinable config of the current path up to adjacent_type.
+ //
+ // The nested indexable config is true if any of the PropertyConfigProtos
+ // representing the connecting edge has index_nested_properties=true.
+ bool is_edge_nested_indexable = std::any_of(
+ adjacent_property_protos.begin(), adjacent_property_protos.end(),
+ [](const PropertyConfigProto* property_config) {
+ return property_config->document_indexing_config()
+ .index_nested_properties();
+ });
+ // TODO(b/265304217): change this once we add joinable_properties_list.
+ // Check if addition of the new edge (type->adjacent_type) makes the path
+ // joinable.
+ bool new_path_contains_joinable_property =
+ joinable_types.count(type) > 0 || path_contains_joinable_property;
+ // Set the is_nested_indexable field for the current edge.
+ pending_expansion_paths_copy[type] = is_edge_nested_indexable;
+
+ // If is_edge_nested_indexable=false, then all paths to adjacent_type
+ // currently in the pending_expansions map are also not nested indexable.
+ if (!is_edge_nested_indexable) {
+ for (auto& pending_expansion : pending_expansion_paths_copy) {
+ pending_expansion.second = false;
+ }
+ }
+
+ // 2. Check if we're in the middle of expanding this type - IOW
+ // there's a cycle!
+ //
+ // This cycle is not allowed if either:
+ // 1. The cycle starting at adjacent_type is nested indexable, OR
+ // 2. The current path contains a joinable property.
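+ //
+ // For example (illustrative): a type that references itself through a
+ // document property with index_nested_properties=true is rejected, while
+ // the same cycle with index_nested_properties=false and no joinable
+ // property on the path is allowed.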
+ auto adjacent_itr = pending_expansion_paths_copy.find(adjacent_type);
+ if (adjacent_itr != pending_expansion_paths_copy.end()) {
+ if (adjacent_itr->second || new_path_contains_joinable_property) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Invalid cycle detected in type configs. '", type,
+ "' references itself and is nested-indexable or nested-joinable."));
+ }
+ // The cycle is allowed and there's no need to keep iterating the loop.
+ // Move on to the next adjacent value.
+ continue;
+ }
+
+ // 3. Expand this type as needed.
+ ICING_RETURN_IF_ERROR(CalculateTransitiveNestedTypeRelations(
+ direct_nested_types_map, joinable_types, adjacent_type,
+ new_path_contains_joinable_property, expanded_nested_types_map,
+ std::move(pending_expansion_paths_copy), sink_types));
+ if (sink_types->count(adjacent_type) > 0) {
+ // "adjacent" is a sink node. Just skip to the next.
+ continue;
+ }
+
+ // 4. "adjacent" has been fully expanded. Add all of its transitive
+ // outgoing relations to this type's transitive outgoing relations.
+ auto adjacent_expanded_itr = expanded_nested_types_map->find(adjacent_type);
+ for (const auto& [transitive_reachable, _] :
+ adjacent_expanded_itr->second) {
+ // Insert a transitively reachable node `transitive_reachable` for
+ // `type` if it wasn't previously reachable.
+ // Since there is no direct edge between `type` and `transitive_reachable`
+ // we insert an empty vector into the dependent map.
+ expanded_relations.insert({transitive_reachable, {}});
+ }
+ }
+ for (const auto& kvp : expanded_relations) {
+ expanded_nested_types_map->operator[](type).insert(kvp);
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename T>
+libtextclassifier3::Status CalculateAcyclicTransitiveRelations(
+ const SchemaUtil::TypeRelationMap<T>& direct_relation_map,
+ std::string_view type,
+ SchemaUtil::TypeRelationMap<T>* expanded_relation_map,
+ std::unordered_set<std::string_view>* pending_expansions,
+ std::unordered_set<std::string_view>* sink_types) {
+ auto expanded_itr = expanded_relation_map->find(type);
+ if (expanded_itr != expanded_relation_map->end()) {
+ // We've already expanded this type. Just return.
+ return libtextclassifier3::Status::OK;
+ }
+ auto itr = direct_relation_map.find(type);
+ if (itr == direct_relation_map.end()) {
+ // It's a sink node. Just return.
+ sink_types->insert(type);
+ return libtextclassifier3::Status::OK;
+ }
+ pending_expansions->insert(type);
+ std::unordered_map<std::string_view, T> expanded_relations;
+
+ // Add all of the adjacent outgoing relations.
+ expanded_relations.reserve(itr->second.size());
+ expanded_relations.insert(itr->second.begin(), itr->second.end());
+
+ // Iterate through each adjacent outgoing relation and add their indirect
+ // outgoing relations.
+ for (const auto& [adjacent, _] : itr->second) {
+ // 1. Check if we're in the middle of expanding this type - IOW there's a
+ // cycle!
+ if (pending_expansions->count(adjacent) > 0) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("Invalid cycle detected in type configs. '", type,
+ "' references or inherits from itself."));
+ }
+
+ // 2. Expand this type as needed.
+ ICING_RETURN_IF_ERROR(CalculateAcyclicTransitiveRelations(
+ direct_relation_map, adjacent, expanded_relation_map,
+ pending_expansions, sink_types));
+ if (sink_types->count(adjacent) > 0) {
+ // "adjacent" is a sink node. Just skip to the next.
+ continue;
+ }
+
+ // 3. "adjacent" has been fully expanded. Add all of its transitive outgoing
+ // relations to this type's transitive outgoing relations.
+ auto adjacent_expanded_itr = expanded_relation_map->find(adjacent);
+ for (const auto& [transitive_reachable, _] :
+ adjacent_expanded_itr->second) {
+ // Insert a transitively reachable node `transitive_reachable` for `type`.
+ // Also since there is no direct edge between `type` and
+ // `transitive_reachable`, the direct edge is initialized by default.
+ expanded_relations.insert({transitive_reachable, T()});
+ }
+ }
+ expanded_relation_map->insert({type, std::move(expanded_relations)});
+ pending_expansions->erase(type);
+ return libtextclassifier3::Status::OK;
+}
+
+// Calculate and return the expanded nested-type map from
+// direct_nested_type_map. This expands the direct_nested_type_map to also
+// include indirect nested-type relations.
+//
+// Ex. Suppose we have the following relations in direct_nested_type_map.
+//
+// C -> B (Schema type B has a document property of type C)
+// B -> A (Schema type A has a document property of type B)
+//
+// Then, this function would expand the map by adding C -> A to the map.
+libtextclassifier3::StatusOr<SchemaUtil::DependentMap>
+CalculateTransitiveNestedTypeRelations(
+ const SchemaUtil::DependentMap& direct_nested_type_map,
+ const std::unordered_set<std::string_view>& joinable_types,
+ bool allow_circular_schema_definitions) {
+ SchemaUtil::DependentMap expanded_nested_type_map;
+ // Types that have no outgoing relations.
+ std::unordered_set<std::string_view> sink_types;
+
+ if (allow_circular_schema_definitions) {
+ // Map of nodes that are pending expansion -> whether the path from each key
+ // node to the 'current' node is nested_indexable.
+ // A copy of this map is made for each new node that we expand.
+ std::unordered_map<std::string_view, bool>
+ pending_expansion_paths_indexable;
+ for (const auto& kvp : direct_nested_type_map) {
+ ICING_RETURN_IF_ERROR(CalculateTransitiveNestedTypeRelations(
+ direct_nested_type_map, joinable_types, kvp.first,
+ /*path_contains_joinable_property=*/false, &expanded_nested_type_map,
+ std::unordered_map<std::string_view, bool>(
+ pending_expansion_paths_indexable),
+ &sink_types));
+ }
+ } else {
+ // If allow_circular_schema_definitions is false, then fallback to the old
+ // way of detecting cycles.
+ // Types that we are expanding.
+ std::unordered_set<std::string_view> pending_expansions;
+ for (const auto& kvp : direct_nested_type_map) {
+ ICING_RETURN_IF_ERROR(CalculateAcyclicTransitiveRelations(
+ direct_nested_type_map, kvp.first, &expanded_nested_type_map,
+ &pending_expansions, &sink_types));
+ }
+ }
+ return expanded_nested_type_map;
+}
+
+// Calculate and return the expanded inheritance map from
+// direct_inheritance_map. This expands the direct_inheritance_map to also
+// include indirect inheritance relations.
+//
+// Ex. Suppose we have the following relations in direct_inheritance_map.
+//
+// C -> B (Schema type C is B's parent_type)
+// B -> A (Schema type B is A's parent_type)
+//
+// Then, this function would expand the map by adding C -> A to the map.
+libtextclassifier3::StatusOr<SchemaUtil::InheritanceMap>
+CalculateTransitiveInheritanceRelations(
+ const SchemaUtil::InheritanceMap& direct_inheritance_map) {
+ SchemaUtil::InheritanceMap expanded_inheritance_map;
+
+ // Types that we are expanding.
+ std::unordered_set<std::string_view> pending_expansions;
+
+ // Types that have no outgoing relation.
+ std::unordered_set<std::string_view> sink_types;
+ for (const auto& kvp : direct_inheritance_map) {
+ ICING_RETURN_IF_ERROR(CalculateAcyclicTransitiveRelations(
+ direct_inheritance_map, kvp.first, &expanded_inheritance_map,
+ &pending_expansions, &sink_types));
+ }
+ return expanded_inheritance_map;
+}
+
+// Builds a transitive dependent map. Types with no dependents will not be
+// present in the map as keys.
+//
+// Ex. Suppose we have a schema with four types A, B, C, D. A has a property of
+// type B and B has a property of type C. C and D only have non-document
+// properties.
+//
+// The transitive dependent map for this schema would be:
+// C -> A, B (both A and B depend on C)
+// B -> A (A depends on B)
+//
+// A and D will not be present in the map as keys because no type depends on
+// them.
+//
+// RETURNS:
+// On success, a transitive dependent map of all types in the schema.
+// INVALID_ARGUMENT if the schema contains a cycle or an undefined type.
+// ALREADY_EXISTS if a schema type is specified more than once in the schema
+libtextclassifier3::StatusOr<SchemaUtil::DependentMap>
+BuildTransitiveDependentGraph(const SchemaProto& schema,
+ bool allow_circular_schema_definitions) {
+ // We expand the nested-type dependent map and inheritance map differently
+ // when calculating transitive relations. These two kinds of relations
+ // should also not be combined transitively, so we keep them as separate
+ // maps.
+ //
+ // e.g. For schema type A, B and C, B depends on A through inheritance, and
+ // C depends on B by having a property with type B, we will have the two
+ // relations {A, B} and {B, C} in the dependent map, but will not have {A, C}
+ // in the map.
+ SchemaUtil::DependentMap direct_nested_type_map;
+ SchemaUtil::InheritanceMap direct_inheritance_map;
+
+ // Set of schema types that have at least one joinable property.
+ std::unordered_set<std::string_view> joinable_types;
+
+ // Add all first-order dependents.
+ std::unordered_set<std::string_view> known_types;
+ std::unordered_set<std::string_view> unknown_types;
+ for (const auto& type_config : schema.types()) {
+ std::string_view schema_type(type_config.schema_type());
+ if (known_types.count(schema_type) > 0) {
+ return absl_ports::AlreadyExistsError(absl_ports::StrCat(
+ "Field 'schema_type' '", schema_type, "' is already defined"));
+ }
+ known_types.insert(schema_type);
+ unknown_types.erase(schema_type);
+ // Insert inheritance relations into the inheritance map.
+ for (std::string_view parent_schema_type : type_config.parent_types()) {
+ if (known_types.count(parent_schema_type) == 0) {
+ unknown_types.insert(parent_schema_type);
+ }
+ direct_inheritance_map[parent_schema_type][schema_type] = true;
+ }
+ for (const auto& property_config : type_config.properties()) {
+ if (property_config.joinable_config().value_type() !=
+ JoinableConfig::ValueType::NONE) {
+ joinable_types.insert(schema_type);
+ }
+ // Insert nested-type relations into the nested-type map.
+ if (property_config.data_type() ==
+ PropertyConfigProto::DataType::DOCUMENT) {
+ // Need to know which schema_type these DOCUMENT properties should be
+ // validated against.
+ std::string_view property_schema_type(property_config.schema_type());
+ if (known_types.count(property_schema_type) == 0) {
+ unknown_types.insert(property_schema_type);
+ }
+ direct_nested_type_map[property_schema_type][schema_type].push_back(
+ &property_config);
+ }
+ }
+ }
+ if (!unknown_types.empty()) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Undefined 'schema_type's: ", absl_ports::StrJoin(unknown_types, ",")));
+ }
+
+ // Merge the two expanded maps into a single dependent_map, without making
+ // relations transitive across the inheritance and nested-type maps.
+ ICING_ASSIGN_OR_RETURN(SchemaUtil::DependentMap merged_dependent_map,
+ CalculateTransitiveNestedTypeRelations(
+ direct_nested_type_map, joinable_types,
+ allow_circular_schema_definitions));
+ ICING_ASSIGN_OR_RETURN(
+ SchemaUtil::InheritanceMap expanded_inheritance_map,
+ CalculateTransitiveInheritanceRelations(direct_inheritance_map));
+ for (const auto& [parent_type, inheritance_relation] :
+ expanded_inheritance_map) {
+ // Insert the parent_type into the dependent map if it is not present
+ // already.
+ merged_dependent_map.insert({parent_type, {}});
+ for (const auto& [child_type, _] : inheritance_relation) {
+ // Insert the child_type into parent_type's dependent map if it's not
+ // present already, in which case the value will be an empty vector.
+ merged_dependent_map[parent_type].insert({child_type, {}});
+ }
+ }
+ return merged_dependent_map;
+}
+
+libtextclassifier3::StatusOr<SchemaUtil::InheritanceMap>
+SchemaUtil::BuildTransitiveInheritanceGraph(const SchemaProto& schema) {
+ SchemaUtil::InheritanceMap direct_inheritance_map;
+ for (const auto& type_config : schema.types()) {
+ for (std::string_view parent_schema_type : type_config.parent_types()) {
+ direct_inheritance_map[parent_schema_type][type_config.schema_type()] =
+ true;
+ }
+ }
+ return CalculateTransitiveInheritanceRelations(direct_inheritance_map);
+}
+
+libtextclassifier3::StatusOr<SchemaUtil::DependentMap> SchemaUtil::Validate(
+ const SchemaProto& schema, bool allow_circular_schema_definitions) {
+ // 1. Build the dependent map. This will detect any cycles, non-existent or
+ // duplicate types in the schema.
+ ICING_ASSIGN_OR_RETURN(
+ SchemaUtil::DependentMap dependent_map,
+ BuildTransitiveDependentGraph(schema, allow_circular_schema_definitions));
// Tracks PropertyConfigs within a SchemaTypeConfig that we've validated
// already.
std::unordered_set<std::string_view> known_property_names;
+ // Tracks schema types that contain at least one joinable property.
+ std::unordered_set<std::string_view> schema_types_with_joinable_property;
+
+ // 2. Validate the properties of each type.
for (const auto& type_config : schema.types()) {
std::string_view schema_type(type_config.schema_type());
ICING_RETURN_IF_ERROR(ValidateSchemaType(schema_type));
- // We can't have duplicate schema_types
- if (!known_schema_types.insert(schema_type).second) {
- return absl_ports::AlreadyExistsError(absl_ports::StrCat(
- "Field 'schema_type' '", schema_type, "' is already defined"));
- }
- unknown_schema_types.erase(schema_type);
-
// We only care about properties being unique within one type_config
known_property_names.clear();
+
for (const auto& property_config : type_config.properties()) {
std::string_view property_name(property_config.property_name());
ICING_RETURN_IF_ERROR(ValidatePropertyName(property_name, schema_type));
@@ -146,32 +604,78 @@ libtextclassifier3::Status SchemaUtil::Validate(const SchemaProto& schema) {
validated_status,
absl_ports::StrCat("Field 'schema_type' is required for DOCUMENT "
"data_types in schema property '",
- schema_type, " ", property_name, "'"));
+ schema_type, ".", property_name, "'"));
}
- // Need to make sure we eventually see/validate this schema_type
- if (known_schema_types.count(property_schema_type) == 0) {
- unknown_schema_types.insert(property_schema_type);
- }
+ ICING_RETURN_IF_ERROR(ValidateDocumentIndexingConfig(
+ property_config.document_indexing_config(), schema_type,
+ property_name));
}
ICING_RETURN_IF_ERROR(ValidateCardinality(property_config.cardinality(),
schema_type, property_name));
- ICING_RETURN_IF_ERROR(
- ValidateIndexingConfig(property_config.indexing_config(), data_type));
+ if (data_type == PropertyConfigProto::DataType::STRING) {
+ ICING_RETURN_IF_ERROR(ValidateStringIndexingConfig(
+ property_config.string_indexing_config(), data_type, schema_type,
+ property_name));
+ }
+
+ ICING_RETURN_IF_ERROR(ValidateJoinableConfig(
+ property_config.joinable_config(), data_type,
+ property_config.cardinality(), schema_type, property_name));
+ if (property_config.joinable_config().value_type() !=
+ JoinableConfig::ValueType::NONE) {
+ schema_types_with_joinable_property.insert(schema_type);
+ }
}
}
- // An Document property claimed to be of a schema_type that we never
- // saw/validated
- if (!unknown_schema_types.empty()) {
- return absl_ports::UnknownError(
- absl_ports::StrCat("Undefined 'schema_type's: ",
- absl_ports::StrJoin(unknown_schema_types, ",")));
+  // BFS traverse the dependent graph to make sure that no nested document
+  // properties (properties with DOCUMENT data type) have REPEATED cardinality
+  // while depending on schema types with a joinable property.
+ std::queue<std::string_view> frontier;
+ for (const auto& schema_type : schema_types_with_joinable_property) {
+ frontier.push(schema_type);
}
+ std::unordered_set<std::string_view> traversed =
+ std::move(schema_types_with_joinable_property);
+ while (!frontier.empty()) {
+ std::string_view schema_type = frontier.front();
+ frontier.pop();
- return libtextclassifier3::Status::OK;
+ const auto it = dependent_map.find(schema_type);
+ if (it == dependent_map.end()) {
+ continue;
+ }
+
+ // Check every type that has a property of type schema_type.
+ for (const auto& [next_schema_type, property_configs] : it->second) {
+ // Check all properties in "next_schema_type" that are of type
+ // "schema_type".
+ for (const PropertyConfigProto* property_config : property_configs) {
+ if (property_config != nullptr &&
+ property_config->cardinality() ==
+ PropertyConfigProto::Cardinality::REPEATED) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Schema type '", next_schema_type,
+ "' cannot have REPEATED nested document property '",
+ property_config->property_name(),
+ "' while connecting to some joinable properties"));
+ }
+ }
+
+ if (traversed.count(next_schema_type) == 0) {
+ traversed.insert(next_schema_type);
+ frontier.push(next_schema_type);
+ }
+ }
+ }
+
+  // Verify that every child type's property set includes all compatible
+  // properties from its parent types.
+ ICING_RETURN_IF_ERROR(ValidateInheritedProperties(schema));
+ return dependent_map;
}
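+
+// Editorial sketch (illustrative, not part of this change): the BFS above
+// rejects, for example, a schema in which "Person" declares a joinable
+// property and another type holds "Person" documents with REPEATED
+// cardinality:
+//
+//   PropertyConfigProto prop;
+//   prop.set_property_name("sender");
+//   prop.set_data_type(PropertyConfigProto::DataType::DOCUMENT);
+//   prop.set_schema_type("Person");
+//   prop.set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+//
+// Validate() would return INVALID_ARGUMENT for such a schema.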
libtextclassifier3::Status SchemaUtil::ValidateSchemaType(
@@ -214,7 +718,7 @@ libtextclassifier3::Status SchemaUtil::ValidateDataType(
if (data_type == PropertyConfigProto::DataType::UNKNOWN) {
return absl_ports::InvalidArgumentError(absl_ports::StrCat(
"Field 'data_type' cannot be UNKNOWN for schema property '",
- schema_type, " ", property_name, "'"));
+ schema_type, ".", property_name, "'"));
}
return libtextclassifier3::Status::OK;
@@ -228,22 +732,196 @@ libtextclassifier3::Status SchemaUtil::ValidateCardinality(
if (cardinality == PropertyConfigProto::Cardinality::UNKNOWN) {
return absl_ports::InvalidArgumentError(absl_ports::StrCat(
"Field 'cardinality' cannot be UNKNOWN for schema property '",
- schema_type, " ", property_name, "'"));
+ schema_type, ".", property_name, "'"));
}
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status SchemaUtil::ValidateIndexingConfig(
- const IndexingConfig& config,
- PropertyConfigProto::DataType::Code data_type) {
- if (data_type == PropertyConfigProto::DataType::DOCUMENT) {
- return libtextclassifier3::Status::OK;
+libtextclassifier3::Status SchemaUtil::ValidateStringIndexingConfig(
+ const StringIndexingConfig& config,
+ PropertyConfigProto::DataType::Code data_type, std::string_view schema_type,
+ std::string_view property_name) {
+ if (config.term_match_type() == TermMatchType::UNKNOWN &&
+ config.tokenizer_type() != StringIndexingConfig::TokenizerType::NONE) {
+ // They set a tokenizer type, but no term match type.
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Indexed string property '", schema_type, ".", property_name,
+ "' cannot have a term match type UNKNOWN"));
}
+
if (config.term_match_type() != TermMatchType::UNKNOWN &&
- config.tokenizer_type() == IndexingConfig::TokenizerType::NONE) {
+ config.tokenizer_type() == StringIndexingConfig::TokenizerType::NONE) {
+    // They set a term match type, but no tokenizer type.
return absl_ports::InvalidArgumentError(
- "TermMatchType properties cannot have a tokenizer type of NONE");
+ absl_ports::StrCat("Indexed string property '", property_name,
+ "' cannot have a tokenizer type of NONE"));
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status SchemaUtil::ValidateJoinableConfig(
+ const JoinableConfig& config, PropertyConfigProto::DataType::Code data_type,
+ PropertyConfigProto::Cardinality::Code cardinality,
+ std::string_view schema_type, std::string_view property_name) {
+ if (config.value_type() == JoinableConfig::ValueType::QUALIFIED_ID) {
+ if (data_type != PropertyConfigProto::DataType::STRING) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("Qualified id joinable property '", property_name,
+ "' is required to have STRING data type"));
+ }
+
+ if (cardinality == PropertyConfigProto::Cardinality::REPEATED) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("Qualified id joinable property '", property_name,
+ "' cannot have REPEATED cardinality"));
+ }
+ }
+
+ if (config.propagate_delete() &&
+ config.value_type() != JoinableConfig::ValueType::QUALIFIED_ID) {
+ return absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("Field 'property_name' '", property_name,
+ "' is required to have QUALIFIED_ID joinable "
+ "value type with delete propagation enabled"));
+ }
+
+ return libtextclassifier3::Status::OK;
+}
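+
+// Editorial sketch (illustrative, not part of this change): a joinable
+// property that passes all of the checks above would look like:
+//
+//   PropertyConfigProto prop;
+//   prop.set_property_name("qualifiedId");
+//   prop.set_data_type(PropertyConfigProto::DataType::STRING);
+//   prop.set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+//   prop.mutable_joinable_config()->set_value_type(
+//       JoinableConfig::ValueType::QUALIFIED_ID);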
+
+libtextclassifier3::Status SchemaUtil::ValidateDocumentIndexingConfig(
+ const DocumentIndexingConfig& config, std::string_view schema_type,
+ std::string_view property_name) {
+ if (!config.indexable_nested_properties_list().empty() &&
+ config.index_nested_properties()) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "DocumentIndexingConfig.index_nested_properties is required to be "
+ "false when providing a non-empty indexable_nested_properties_list "
+ "for property '",
+ schema_type, ".", property_name, "'"));
+ }
+ return libtextclassifier3::Status::OK;
+}
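+
+// Editorial sketch (illustrative, not part of this change): a valid
+// configuration selects nested properties explicitly, e.g.
+//
+//   DocumentIndexingConfig config;
+//   config.set_index_nested_properties(false);
+//   config.add_indexable_nested_properties_list("name");
+//
+// whereas index_nested_properties=true combined with a non-empty list is
+// rejected above.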
+
+/* static */ bool SchemaUtil::IsIndexedProperty(
+ const PropertyConfigProto& property_config) {
+ switch (property_config.data_type()) {
+ case PropertyConfigProto::DataType::STRING:
+ return property_config.string_indexing_config().term_match_type() !=
+ TermMatchType::UNKNOWN &&
+ property_config.string_indexing_config().tokenizer_type() !=
+ StringIndexingConfig::TokenizerType::NONE;
+ case PropertyConfigProto::DataType::INT64:
+ return property_config.integer_indexing_config().numeric_match_type() !=
+ IntegerIndexingConfig::NumericMatchType::UNKNOWN;
+ case PropertyConfigProto::DataType::DOCUMENT:
+ // A document property is considered indexed if it has
+ // index_nested_properties=true, or a non-empty
+ // indexable_nested_properties_list.
+ return property_config.document_indexing_config()
+ .index_nested_properties() ||
+ !property_config.document_indexing_config()
+ .indexable_nested_properties_list()
+ .empty();
+ case PropertyConfigProto::DataType::UNKNOWN:
+ case PropertyConfigProto::DataType::DOUBLE:
+ case PropertyConfigProto::DataType::BOOLEAN:
+ case PropertyConfigProto::DataType::BYTES:
+ return false;
+ }
+}
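+
+// Editorial sketch (illustrative, not part of this change): per the switch
+// above, a STRING property counts as indexed only when both a term match type
+// and a tokenizer type are set:
+//
+//   PropertyConfigProto prop;
+//   prop.set_data_type(PropertyConfigProto::DataType::STRING);
+//   prop.mutable_string_indexing_config()->set_term_match_type(
+//       TermMatchType::EXACT_ONLY);
+//   prop.mutable_string_indexing_config()->set_tokenizer_type(
+//       StringIndexingConfig::TokenizerType::PLAIN);
+//   bool indexed = SchemaUtil::IsIndexedProperty(prop);  // true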
+
+bool SchemaUtil::IsParent(const SchemaUtil::InheritanceMap& inheritance_map,
+ std::string_view parent_type,
+ std::string_view child_type) {
+ auto iter = inheritance_map.find(parent_type);
+ if (iter == inheritance_map.end()) {
+ return false;
+ }
+ return iter->second.count(child_type) > 0;
+}
+
+bool SchemaUtil::IsInheritedPropertyCompatible(
+ const SchemaUtil::InheritanceMap& inheritance_map,
+ const PropertyConfigProto& child_property_config,
+ const PropertyConfigProto& parent_property_config) {
+ // Check if child_property_config->cardinality() <=
+ // parent_property_config->cardinality().
+ // Subtype may require a stricter cardinality, but cannot loosen cardinality
+ // requirements.
+ if (!CardinalityLessThanEq(child_property_config.cardinality(),
+ parent_property_config.cardinality())) {
+ return false;
+ }
+
+  // Now we can assume the cardinality check passes.
+ if (child_property_config.data_type() !=
+ PropertyConfigProto::DataType::DOCUMENT ||
+ parent_property_config.data_type() !=
+ PropertyConfigProto::DataType::DOCUMENT) {
+ return child_property_config.data_type() ==
+ parent_property_config.data_type();
+ }
+
+  // Now we can assume both properties have DOCUMENT data type.
+ return child_property_config.schema_type() ==
+ parent_property_config.schema_type() ||
+ IsParent(inheritance_map, parent_property_config.schema_type(),
+ child_property_config.schema_type());
+}
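+
+// Editorial example (illustrative, not part of this change): given a
+// hypothetical schema where "Artist" extends "Person", a child property
+// (REQUIRED, DOCUMENT, "Artist") is compatible with a parent property
+// (OPTIONAL, DOCUMENT, "Person"): REQUIRED <: OPTIONAL satisfies the
+// cardinality check, and IsParent(inheritance_map, "Person", "Artist")
+// satisfies the schema_type check. Swapping child and parent would fail both.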
+
+libtextclassifier3::Status SchemaUtil::ValidateInheritedProperties(
+ const SchemaProto& schema) {
+  // Create an inheritance map.
+ ICING_ASSIGN_OR_RETURN(SchemaUtil::InheritanceMap inheritance_map,
+ BuildTransitiveInheritanceGraph(schema));
+
+  // Create a map that maps each type name to a map from property name to
+  // PropertyConfigProto.
+ std::unordered_map<
+ std::string, std::unordered_map<std::string, const PropertyConfigProto*>>
+ property_map;
+ for (const SchemaTypeConfigProto& type_config : schema.types()) {
+    // Skip building entries for types without any child or parent, since
+    // such entries will never be used.
+ if (type_config.parent_types().empty() &&
+ inheritance_map.count(type_config.schema_type()) == 0) {
+ continue;
+ }
+ auto& curr_property_map = property_map[type_config.schema_type()];
+ for (const PropertyConfigProto& property_config :
+ type_config.properties()) {
+ curr_property_map[property_config.property_name()] = &property_config;
+ }
+ }
+
+ // Validate child properties.
+ for (const SchemaTypeConfigProto& type_config : schema.types()) {
+ const std::string& child_type_name = type_config.schema_type();
+ auto& child_property_map = property_map[child_type_name];
+
+ for (const std::string& parent_type_name : type_config.parent_types()) {
+ auto& parent_property_map = property_map[parent_type_name];
+
+ for (const auto& [property_name, parent_property_config] :
+ parent_property_map) {
+ auto child_property_iter = child_property_map.find(property_name);
+ if (child_property_iter == child_property_map.end()) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Property ", property_name, " is not present in child type ",
+ child_type_name, ", but it is defined in the parent type ",
+ parent_type_name, "."));
+ }
+ if (!IsInheritedPropertyCompatible(inheritance_map,
+ *child_property_iter->second,
+ *parent_property_config)) {
+ return absl_ports::InvalidArgumentError(absl_ports::StrCat(
+ "Property ", property_name, " from child type ", child_type_name,
+ " is not compatible to the parent type ", parent_type_name, "."));
+ }
+ }
+ }
}
return libtextclassifier3::Status::OK;
}
@@ -260,21 +938,35 @@ SchemaUtil::ParsedPropertyConfigs SchemaUtil::ParsePropertyConfigs(
const SchemaTypeConfigProto& type_config) {
ParsedPropertyConfigs parsed_property_configs;
- // TODO(samzheng): consider caching property_config_map for some properties,
+ // TODO(cassiewang): consider caching property_config_map for some properties,
// e.g. using LRU cache. Or changing schema.proto to use go/protomap.
for (const PropertyConfigProto& property_config : type_config.properties()) {
- parsed_property_configs.property_config_map.emplace(
- property_config.property_name(), &property_config);
+ std::string_view property_name = property_config.property_name();
+ parsed_property_configs.property_config_map.emplace(property_name,
+ &property_config);
if (property_config.cardinality() ==
PropertyConfigProto::Cardinality::REQUIRED) {
- parsed_property_configs.num_required_properties++;
+ parsed_property_configs.required_properties.insert(property_name);
}
// A non-default term_match_type indicates that this property is meant to be
// indexed.
- if (property_config.indexing_config().term_match_type() !=
- TermMatchType::UNKNOWN) {
- parsed_property_configs.num_indexed_properties++;
+ if (IsIndexedProperty(property_config)) {
+ parsed_property_configs.indexed_properties.insert(property_name);
+ }
+
+ // A non-default value_type indicates that this property is meant to be
+ // joinable.
+ if (property_config.joinable_config().value_type() !=
+ JoinableConfig::ValueType::NONE) {
+ parsed_property_configs.joinable_properties.insert(property_name);
+ }
+
+    // Also keep track of which nested document properties there are. Adding
+    // new nested document properties will result in a join-index rebuild.
+ if (property_config.data_type() ==
+ PropertyConfigProto::DataType::DOCUMENT) {
+ parsed_property_configs.nested_document_properties.insert(property_name);
}
}
@@ -282,11 +974,12 @@ SchemaUtil::ParsedPropertyConfigs SchemaUtil::ParsePropertyConfigs(
}
const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
- const SchemaProto& old_schema, const SchemaProto& new_schema) {
+ const SchemaProto& old_schema, const SchemaProto& new_schema,
+ const DependentMap& new_schema_dependent_map) {
SchemaDelta schema_delta;
- schema_delta.index_incompatible = false;
- TypeConfigMap new_type_config_map;
+ TypeConfigMap old_type_config_map, new_type_config_map;
+ BuildTypeConfigMap(old_schema, &old_type_config_map);
BuildTypeConfigMap(new_schema, &new_type_config_map);
// Iterate through and check each field of the old schema
@@ -297,9 +990,9 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
if (new_schema_type_and_config == new_type_config_map.end()) {
// Didn't find the old schema type in the new schema, all the old
// documents of this schema type are invalid without the schema
- ICING_VLOG(1) << absl_ports::StrCat("Previously defined schema type ",
+ ICING_VLOG(1) << absl_ports::StrCat("Previously defined schema type '",
old_type_config.schema_type(),
- " was not defined in new schema");
+ "' was not defined in new schema");
schema_delta.schema_types_deleted.insert(old_type_config.schema_type());
continue;
}
@@ -310,9 +1003,48 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
// We only need to check the old, existing properties to see if they're
// compatible since we'll have old data that may be invalidated or need to
// be reindexed.
- int32_t old_required_properties = 0;
- int32_t old_indexed_properties = 0;
+ std::unordered_set<std::string_view> old_required_properties;
+ std::unordered_set<std::string_view> old_indexed_properties;
+ std::unordered_set<std::string_view> old_joinable_properties;
+ std::unordered_set<std::string_view> old_nested_document_properties;
+
+ // If there is a different number of properties, then there must have been a
+ // change.
+ bool has_property_changed =
+ old_type_config.properties_size() !=
+ new_schema_type_and_config->second.properties_size();
+ bool is_incompatible = false;
+ bool is_index_incompatible = false;
+ bool is_join_incompatible = false;
for (const auto& old_property_config : old_type_config.properties()) {
+ std::string_view property_name = old_property_config.property_name();
+ if (old_property_config.cardinality() ==
+ PropertyConfigProto::Cardinality::REQUIRED) {
+ old_required_properties.insert(property_name);
+ }
+
+      // A non-default indexing config indicates that this property is meant
+      // to be indexed.
+ bool is_indexed_property = IsIndexedProperty(old_property_config);
+ if (is_indexed_property) {
+ old_indexed_properties.insert(property_name);
+ }
+
+ bool is_joinable_property =
+ old_property_config.joinable_config().value_type() !=
+ JoinableConfig::ValueType::NONE;
+ if (is_joinable_property) {
+ old_joinable_properties.insert(property_name);
+ }
+
+ // A nested-document property is a property of DataType::DOCUMENT.
+ bool is_nested_document_property =
+ old_property_config.data_type() ==
+ PropertyConfigProto::DataType::DOCUMENT;
+ if (is_nested_document_property) {
+ old_nested_document_properties.insert(property_name);
+ }
+
auto new_property_name_and_config =
new_parsed_property_configs.property_config_map.find(
old_property_config.property_name());
@@ -320,42 +1052,48 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
if (new_property_name_and_config ==
new_parsed_property_configs.property_config_map.end()) {
// Didn't find the old property
- ICING_VLOG(1) << absl_ports::StrCat("Previously defined property type ",
- old_type_config.schema_type(), ".",
- old_property_config.property_name(),
- " was not defined in new schema");
- schema_delta.schema_types_incompatible.insert(
- old_type_config.schema_type());
+ ICING_VLOG(1) << absl_ports::StrCat(
+ "Previously defined property type '", old_type_config.schema_type(),
+ ".", old_property_config.property_name(),
+ "' was not defined in new schema");
+ is_incompatible = true;
+ is_index_incompatible |= is_indexed_property;
+ is_join_incompatible |=
+ is_joinable_property || is_nested_document_property;
continue;
}
const PropertyConfigProto* new_property_config =
new_property_name_and_config->second;
+ if (!has_property_changed &&
+ !ArePropertiesEqual(old_property_config, *new_property_config)) {
+ // Finally found a property that changed.
+ has_property_changed = true;
+ }
if (!IsPropertyCompatible(old_property_config, *new_property_config)) {
ICING_VLOG(1) << absl_ports::StrCat(
- "Property ", old_type_config.schema_type(), ".",
- old_property_config.property_name(), " is incompatible.");
- schema_delta.schema_types_incompatible.insert(
- old_type_config.schema_type());
+ "Property '", old_type_config.schema_type(), ".",
+ old_property_config.property_name(), "' is incompatible.");
+ is_incompatible = true;
}
- if (old_property_config.cardinality() ==
- PropertyConfigProto::Cardinality::REQUIRED) {
- ++old_required_properties;
- }
-
- // A non-default term_match_type indicates that this property is meant to
- // be indexed.
- if (old_property_config.indexing_config().term_match_type() !=
- TermMatchType::UNKNOWN) {
- ++old_indexed_properties;
+      // Any change in an indexed property's config requires reindexing.
+ if (!IsTermMatchTypeCompatible(
+ old_property_config.string_indexing_config(),
+ new_property_config->string_indexing_config()) ||
+ !IsIntegerNumericMatchTypeCompatible(
+ old_property_config.integer_indexing_config(),
+ new_property_config->integer_indexing_config()) ||
+ !IsDocumentIndexingCompatible(
+ old_property_config.document_indexing_config(),
+ new_property_config->document_indexing_config())) {
+ is_index_incompatible = true;
}
- // Any change in the indexed property requires a reindexing
- if (!IsTermMatchTypeCompatible(old_property_config.indexing_config(),
- new_property_config->indexing_config())) {
- schema_delta.index_incompatible = true;
+ if (old_property_config.joinable_config().value_type() !=
+ new_property_config->joinable_config().value_type()) {
+ is_join_incompatible = true;
}
}
@@ -364,27 +1102,79 @@ const SchemaUtil::SchemaDelta SchemaUtil::ComputeCompatibilityDelta(
// guaranteed from our previous checks that all the old properties are also
// present in the new property config, so we can do a simple int comparison
// here to detect new required properties.
- if (new_parsed_property_configs.num_required_properties >
- old_required_properties) {
+ if (!IsSubset(new_parsed_property_configs.required_properties,
+ old_required_properties)) {
ICING_VLOG(1) << absl_ports::StrCat(
- "New schema ", old_type_config.schema_type(),
- " has REQUIRED properties that are not "
+ "New schema '", old_type_config.schema_type(),
+ "' has REQUIRED properties that are not "
"present in the previously defined schema");
- schema_delta.schema_types_incompatible.insert(
- old_type_config.schema_type());
+ is_incompatible = true;
}
- // If we've gained any new indexed properties, then the section ids may
- // change. Since the section ids are stored in the index, we'll need to
+ // If we've gained any new indexed properties (this includes gaining new
+ // indexed nested document properties), then the section ids may change.
+ // Since the section ids are stored in the index, we'll need to
// reindex everything.
- if (new_parsed_property_configs.num_indexed_properties >
- old_indexed_properties) {
- ICING_VLOG(1) << absl_ports::StrCat(
- "Set of indexed properties in schema type '",
- old_type_config.schema_type(),
- "' has changed, required reindexing.");
- schema_delta.index_incompatible = true;
+ if (!IsSubset(new_parsed_property_configs.indexed_properties,
+ old_indexed_properties)) {
+ ICING_VLOG(1) << "Set of indexed properties in schema type '"
+ << old_type_config.schema_type()
+ << "' has changed, required reindexing.";
+ is_index_incompatible = true;
+ }
+
+ // If we've gained any new joinable properties, then the joinable property
+ // ids may change. Since the joinable property ids are stored in the cache,
+ // we'll need to reconstruct join index.
+ // If we've gained any new nested document properties, we also rebuild the
+ // join index. This is because we index all nested joinable properties, so
+    // adding a nested document property will likely result in more joinable
+    // properties.
+ if (!IsSubset(new_parsed_property_configs.joinable_properties,
+ old_joinable_properties) ||
+ !IsSubset(new_parsed_property_configs.nested_document_properties,
+ old_nested_document_properties)) {
+ ICING_VLOG(1) << "Set of joinable properties in schema type '"
+ << old_type_config.schema_type()
+ << "' has changed, required reconstructing joinable cache.";
+ is_join_incompatible = true;
+ }
+
+ if (is_incompatible) {
+ AddIncompatibleChangeToDelta(schema_delta.schema_types_incompatible,
+ old_type_config, new_schema_dependent_map,
+ old_type_config_map, new_type_config_map);
+ }
+
+ if (is_index_incompatible) {
+ AddIncompatibleChangeToDelta(schema_delta.schema_types_index_incompatible,
+ old_type_config, new_schema_dependent_map,
+ old_type_config_map, new_type_config_map);
}
+
+ if (is_join_incompatible) {
+ AddIncompatibleChangeToDelta(schema_delta.schema_types_join_incompatible,
+ old_type_config, new_schema_dependent_map,
+ old_type_config_map, new_type_config_map);
+ }
+
+ if (!is_incompatible && !is_index_incompatible && !is_join_incompatible &&
+ has_property_changed) {
+ schema_delta.schema_types_changed_fully_compatible.insert(
+ old_type_config.schema_type());
+ }
+
+    // Lastly, remove this type from the map. We know that this type can't
+    // come up in future iterations through the old schema types because
+    // schema types are unique within the old schema.
+ new_type_config_map.erase(old_type_config.schema_type());
+ }
+
+ // Any types that are still present in the new_type_config_map are newly added
+ // types.
+ schema_delta.schema_types_new.reserve(new_type_config_map.size());
+ for (auto& kvp : new_type_config_map) {
+ schema_delta.schema_types_new.insert(std::move(kvp.first));
}
return schema_delta;
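+
+// Editorial sketch (illustrative, not part of this change): a typical call
+// sequence for the two entry points above, assuming a new schema is being
+// applied over an existing one:
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       SchemaUtil::DependentMap dependent_map,
+//       SchemaUtil::Validate(new_schema,
+//                            /*allow_circular_schema_definitions=*/true));
+//   SchemaUtil::SchemaDelta delta = SchemaUtil::ComputeCompatibilityDelta(
+//       old_schema, new_schema, dependent_map);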
diff --git a/icing/schema/schema-util.h b/icing/schema/schema-util.h
index d65dd10..4f09915 100644
--- a/icing/schema/schema-util.h
+++ b/icing/schema/schema-util.h
@@ -22,6 +22,7 @@
#include <unordered_set>
#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/proto/schema.pb.h"
namespace icing {
@@ -32,13 +33,41 @@ class SchemaUtil {
using TypeConfigMap =
std::unordered_map<std::string, const SchemaTypeConfigProto>;
- struct SchemaDelta {
- // Whether an indexing config has changed, requiring the index to be
- // regenerated. We don't list out all the types that make the index
- // incompatible because our index isn't optimized for that. It's much easier
- // to reset the entire index and reindex every document.
- bool index_incompatible = false;
+  // A data structure that stores the relationships between schema types. The
+  // keys in TypeRelationMap are schema types, and each value maps the schema
+  // types that are directly or indirectly related to the key to a relation
+  // value of type T.
+ template <typename T>
+ using TypeRelationMap =
+ std::unordered_map<std::string_view,
+ std::unordered_map<std::string_view, T>>;
+
+ // If A -> B is indicated in the map, then type A must be built before
+ // building type B, which implies one of the following situations.
+ //
+ // 1. B has a property of type A.
+ // 2. A is a parent type of B via polymorphism.
+ //
+  // For the first case, this map will also include all PropertyConfigProto
+  // (with DOCUMENT data_type) pointers that *directly* connect types A and B.
+  // In other words, this vector of PropertyConfigProto* holds the "direct
+  // edges" connecting A and B. The vector will be empty if A and B are not
+  // "directly" connected but are instead connected via an intermediate schema
+  // type. For example, if the actual dependency is A -> C -> B, then the map
+  // contains A -> C and C -> B, each with valid PropertyConfigProto*, but we
+  // also expand transitive dependents: A -> B is added to the dependent map
+  // with an empty vector of "edges".
+ using DependentMap = TypeRelationMap<std::vector<const PropertyConfigProto*>>;
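+
+  // Editorial sketch (illustrative, not part of this change): the DependentMap
+  // for the A -> C -> B example above would contain:
+  //
+  //   A -> { C: [&a_to_c_prop], B: [] }
+  //   C -> { B: [&c_to_b_prop] }
+  //
+  // where a_to_c_prop and c_to_b_prop are the hypothetical DOCUMENT properties
+  // connecting the types.
+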
+ // If A -> B is indicated in the map, then type A is a parent type of B,
+ // directly or indirectly. If directly, the bool value in the map will be
+ // true, otherwise false.
+ //
+ // Note that all relationships contained in this map are also entries in the
+ // DependentMap, i.e. if B inherits from A, then there will be a mapping from
+ // A to B in both this map and the DependentMap.
+ using InheritanceMap = TypeRelationMap<bool>;
+
+ struct SchemaDelta {
// Which schema types were present in the old schema, but were deleted from
// the new schema.
std::unordered_set<std::string> schema_types_deleted;
@@ -47,10 +76,35 @@ class SchemaUtil {
// could invalidate existing Documents of that schema type.
std::unordered_set<std::string> schema_types_incompatible;
+ // Schema types that were added in the new schema. Represented by the
+ // `schema_type` field in the SchemaTypeConfigProto.
+ std::unordered_set<std::string> schema_types_new;
+
+ // Schema types that were changed in a way that was backwards compatible and
+ // didn't invalidate the index. Represented by the `schema_type` field in
+ // the SchemaTypeConfigProto.
+ std::unordered_set<std::string> schema_types_changed_fully_compatible;
+
+ // Schema types that were changed in a way that was backwards compatible,
+ // but invalidated the index. Represented by the `schema_type` field in the
+ // SchemaTypeConfigProto.
+ std::unordered_set<std::string> schema_types_index_incompatible;
+
+ // Schema types that were changed in a way that was backwards compatible,
+ // but invalidated the joinable cache. Represented by the `schema_type`
+ // field in the SchemaTypeConfigProto.
+ std::unordered_set<std::string> schema_types_join_incompatible;
+
bool operator==(const SchemaDelta& other) const {
- return index_incompatible == other.index_incompatible &&
- schema_types_deleted == other.schema_types_deleted &&
- schema_types_incompatible == other.schema_types_incompatible;
+ return schema_types_deleted == other.schema_types_deleted &&
+ schema_types_incompatible == other.schema_types_incompatible &&
+ schema_types_new == other.schema_types_new &&
+ schema_types_changed_fully_compatible ==
+ other.schema_types_changed_fully_compatible &&
+ schema_types_index_incompatible ==
+ other.schema_types_index_incompatible &&
+ schema_types_join_incompatible ==
+ other.schema_types_join_incompatible;
}
};
@@ -59,11 +113,17 @@ class SchemaUtil {
std::unordered_map<std::string_view, const PropertyConfigProto*>
property_config_map;
- // Total number of properties that have an indexing config
- int32_t num_indexed_properties = 0;
+ // Properties that have an indexing config
+ std::unordered_set<std::string_view> indexed_properties;
+
+ // Properties that were REQUIRED
+ std::unordered_set<std::string_view> required_properties;
- // Total number of properties that were REQUIRED
- int32_t num_required_properties = 0;
+ // Properties that have joinable config
+ std::unordered_set<std::string_view> joinable_properties;
+
+ // Properties that have DataType::DOCUMENT
+ std::unordered_set<std::string_view> nested_document_properties;
};
// This function validates:
@@ -80,12 +140,58 @@ class SchemaUtil {
// 9. PropertyConfigProtos.schema_type's must correspond to a
// SchemaTypeConfigProto.schema_type
// 10. Property names can only be alphanumeric.
+ // 11. Any STRING data types have a valid string_indexing_config
+ // 12. PropertyConfigProtos.joinable_config must be valid. See
+ // ValidateJoinableConfig for more details.
+ // 13. Any PropertyConfigProtos with nested DOCUMENT data type must not have
+  //     REPEATED cardinality if they reference a schema type containing a
+  //     joinable property.
+ // 14. The schema definition cannot have invalid cycles. A cycle is invalid
+ // if:
+ // a. SchemaTypeConfigProto.parent_type definitions form an inheritance
+ // cycle.
+ // b. The schema's property definitions have schema_types that form a
+ // cycle, and all properties on the cycle declare
+ // DocumentIndexingConfig.index_nested_properties=true.
+ // c. The schema's property definitions have schema_types that form a
+ // cycle, and the cycle leads to an invalid joinable property config.
+ // This is the case if:
+  //             i. Any type node in the cycle itself has a joinable property
+ // (property whose joinable config is not NONE), OR
+ // ii. Any type node in the cycle has a nested-type (direct or
+ // indirect) with a joinable property.
+ // 15. For DOCUMENT data types, if
+ // DocumentIndexingConfig.indexable_nested_properties_list is non-empty,
+ // DocumentIndexingConfig.index_nested_properties must be false.
//
// Returns:
+  //   On success, a dependent map from each type to the types that depend on
+  //   it directly or indirectly.
// ALREADY_EXISTS for case 1 and 2
- // INVALID_ARGUMENT for 3-10
- // OK otherwise
- static libtextclassifier3::Status Validate(const SchemaProto& schema);
+ // INVALID_ARGUMENT for 3-15
+ static libtextclassifier3::StatusOr<DependentMap> Validate(
+ const SchemaProto& schema, bool allow_circular_schema_definitions);
+
+ // Builds a transitive inheritance map.
+ //
+ // Ex. Suppose we have a schema with four types A, B, C and D, and we have the
+ // following direct inheritance relation.
+ //
+ // A -> B (A is the parent type of B)
+ // B -> C (B is the parent type of C)
+ // C -> D (C is the parent type of D)
+ //
+ // Then, the transitive inheritance map for this schema would be:
+ //
+ // A -> B, C, D
+ // B -> C, D
+ // C -> D
+ //
+ // RETURNS:
+ // On success, a transitive inheritance map of all types in the schema.
+ // INVALID_ARGUMENT if the inheritance graph contains a cycle.
+ static libtextclassifier3::StatusOr<SchemaUtil::InheritanceMap>
+ BuildTransitiveInheritanceGraph(const SchemaProto& schema);
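+
+  // Editorial sketch (illustrative, not part of this change): for the
+  // A/B/C/D example above, callers could check relations like so:
+  //
+  //   ICING_ASSIGN_OR_RETURN(
+  //       SchemaUtil::InheritanceMap inheritance_map,
+  //       SchemaUtil::BuildTransitiveInheritanceGraph(schema));
+  //   inheritance_map["A"]["B"];  // true:  A is a direct parent of B
+  //   inheritance_map["A"]["D"];  // false: A is only an indirect parent of D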
// Creates a mapping of schema type -> schema type config proto. The
// type_config_map is cleared, and then each schema-type_config_proto pair is
@@ -107,6 +213,8 @@ class SchemaUtil {
// `SchemaDelta.schema_types_deleted`
// 3. A schema type's new definition would mean any existing data of the old
// definition is now incompatible.
+  // 4. The derived join index would be incompatible. This is held in
+  //    `SchemaDelta.schema_types_join_incompatible`.
//
// For case 1, the two schemas would result in an incompatible index if:
// 1.1. The new SchemaProto has a different set of indexed properties than
@@ -129,30 +237,146 @@ class SchemaUtil {
// scale defined as:
// LEAST <REPEATED - OPTIONAL - REQUIRED> MOST
//
+ // For case 4, the two schemas would result in an incompatible join if:
+  //   4.1. A SchemaTypeConfig exists in the new SchemaProto that has a
+ // different set of joinable properties than it did in the old
+ // SchemaProto.
+ //
// A property is defined by the combination of the
// SchemaTypeConfig.schema_type and the PropertyConfigProto.property_name.
//
// Returns a SchemaDelta that captures the aforementioned differences.
static const SchemaDelta ComputeCompatibilityDelta(
- const SchemaProto& old_schema, const SchemaProto& new_schema);
+ const SchemaProto& old_schema, const SchemaProto& new_schema,
+ const DependentMap& new_schema_dependent_map);
+
+ // Validates the 'property_name' field.
+ // 1. Can't be an empty string
+ // 2. Can only contain alphanumeric characters
+ //
+ // NOTE: schema_type is only used for logging. It is not necessary to populate
+ // it.
+ //
+ // RETURNS:
+ // - OK if property_name is valid
+  //   - INVALID_ARGUMENT if property name is empty or contains a
+  //     non-alphanumeric character.
+ static libtextclassifier3::Status ValidatePropertyName(
+ std::string_view property_name, std::string_view schema_type = "");
+
+ static bool IsIndexedProperty(const PropertyConfigProto& property_config);
private:
+ // Validates the 'schema_type' field
+ //
+ // Returns:
+ // INVALID_ARGUMENT if 'schema_type' is an empty string.
+ // OK on success
static libtextclassifier3::Status ValidateSchemaType(
std::string_view schema_type);
- static libtextclassifier3::Status ValidatePropertyName(
- std::string_view property_name, std::string_view schema_type);
+
+ // Validates the 'data_type' field.
+ //
+ // Returns:
+ // INVALID_ARGUMENT if it's UNKNOWN
+ // OK on success
static libtextclassifier3::Status ValidateDataType(
PropertyConfigProto::DataType::Code data_type,
std::string_view schema_type, std::string_view property_name);
- static libtextclassifier3::Status ValidatePropertySchemaType(
- std::string_view property_schema_type, std::string_view schema_type,
- std::string_view property_name);
+
+ // Validates the 'cardinality' field.
+ //
+ // Returns:
+ // INVALID_ARGUMENT if it's UNKNOWN
+ // OK on success
static libtextclassifier3::Status ValidateCardinality(
PropertyConfigProto::Cardinality::Code cardinality,
std::string_view schema_type, std::string_view property_name);
- static libtextclassifier3::Status ValidateIndexingConfig(
- const IndexingConfig& config,
- PropertyConfigProto::DataType::Code data_type);
+
+ // Checks that the 'string_indexing_config' satisfies the following rules:
+ // 1. Only STRING data types can be indexed
+ // 2. An indexed property must have a valid tokenizer type
+ //
+ // Returns:
+ // INVALID_ARGUMENT if any of the rules are not followed
+ // OK on success
+ static libtextclassifier3::Status ValidateStringIndexingConfig(
+ const StringIndexingConfig& config,
+ PropertyConfigProto::DataType::Code data_type,
+ std::string_view schema_type, std::string_view property_name);
+
+ // Checks that the 'joinable_config' satisfies the following rules:
+  //   1. The data type matches the joinable value type:
+  //     a. Only STRING data types can use the QUALIFIED_ID joinable value type
+ // 2. Only QUALIFIED_ID joinable value type can have delete propagation
+ // enabled
+ // 3. Any joinable property should have non-REPEATED cardinality
+ //
+ // Returns:
+ // INVALID_ARGUMENT if any of the rules are not followed
+ // OK on success
+ static libtextclassifier3::Status ValidateJoinableConfig(
+ const JoinableConfig& config,
+ PropertyConfigProto::DataType::Code data_type,
+ PropertyConfigProto::Cardinality::Code cardinality,
+ std::string_view schema_type, std::string_view property_name);
+
+ // Checks that the 'document_indexing_config' satisfies the following rule:
+ // 1. If indexable_nested_properties is non-empty, index_nested_properties
+ // must be set to false.
+ //
+ // Returns:
+ // INVALID_ARGUMENT if any of the rules are not followed
+ // OK on success
+ static libtextclassifier3::Status ValidateDocumentIndexingConfig(
+ const DocumentIndexingConfig& config, std::string_view schema_type,
+ std::string_view property_name);
+
+  // Returns whether 'parent_type' is a direct or indirect parent of
+  // 'child_type'.
+ static bool IsParent(const SchemaUtil::InheritanceMap& inheritance_map,
+ std::string_view parent_type,
+ std::string_view child_type);
+
+  // Returns whether 'child_property_config' in a child type can override
+ // 'parent_property_config' in the parent type.
+ //
+  // Assign 'child_property_config' a type T1 and 'parent_property_config' a
+  // type T2, each capturing the property's data_type, schema_type and
+  // cardinality. Then 'child_property_config' can override
+  // 'parent_property_config' if and only if T1 <: T2, i.e. T1 is a subtype of
+  // T2.
+ //
+ // Below are the rules for inferring subtype relations.
+ // - T <: T for every type T.
+ // - If U extends T, then U <: T.
+ // - For every type T1, T2 and T3, if T1 <: T2 and T2 <: T3, then T1 <: T3.
+ // - Optional<T> <: Repeated<T> for every type T.
+ // - Required<T> <: Optional<T> for every type T.
+ // - If T1 <: T2, then
+ // - Required<T1> <: Required<T2>
+ // - Optional<T1> <: Optional<T2>
+ // - Repeated<T1> <: Repeated<T2>
+ //
+  // We adopt the Closed World Assumption (CWA): if T1 <: T2 cannot be
+  // deduced from the above rules, then T1 is not a subtype of T2.
+ static bool IsInheritedPropertyCompatible(
+ const SchemaUtil::InheritanceMap& inheritance_map,
+ const PropertyConfigProto& child_property_config,
+ const PropertyConfigProto& parent_property_config);
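+
+  // Editorial example (illustrative, not part of this change): if Artist
+  // extends Person, then Required<Artist> <: Optional<Person> follows from
+  // Required<Artist> <: Optional<Artist> (Required<T> <: Optional<T>),
+  // Optional<Artist> <: Optional<Person> (covariance, since Artist <: Person),
+  // and transitivity.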
+
+  // Verifies that every child type's property set includes all compatible
+  // properties from its parent types, based on the following rule:
+ //
+ // - If a property "prop" of type T is in the parent, then the child type must
+ // also have "prop" that is of type U, such that U <: T, i.e. U is a subtype
+ // of T.
+ //
+ // RETURNS:
+  //   OK on validation success
+  //   INVALID_ARGUMENT if a property that violates the above validation rule
+  //     is found.
+ static libtextclassifier3::Status ValidateInheritedProperties(
+ const SchemaProto& schema);
};
} // namespace lib
diff --git a/icing/schema/schema-util_test.cc b/icing/schema/schema-util_test.cc
index a3ab96f..82683ba 100644
--- a/icing/schema/schema-util_test.cc
+++ b/icing/schema/schema-util_test.cc
@@ -14,606 +14,5286 @@
#include "icing/schema/schema-util.h"
-#include <cstdint>
+#include <initializer_list>
#include <string>
#include <string_view>
+#include <unordered_set>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/proto/schema.pb.h"
-#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
#include "icing/testing/common-matchers.h"
namespace icing {
namespace lib {
namespace {
+using portable_equals_proto::EqualsProto;
using ::testing::Eq;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+using ::testing::Pair;
+using ::testing::Pointee;
+using ::testing::SizeIs;
+using ::testing::UnorderedElementsAre;
// Properties/fields in a schema type
constexpr char kEmailType[] = "EmailMessage";
+constexpr char kMessageType[] = "Text";
constexpr char kPersonType[] = "Person";
-class SchemaUtilTest : public ::testing::Test {
- protected:
- SchemaProto schema_proto_;
-
- static SchemaTypeConfigProto CreateSchemaTypeConfig(
- const std::string_view schema_type,
- const std::string_view nested_schema_type = "") {
- SchemaTypeConfigProto type;
- type.set_schema_type(std::string(schema_type));
-
- auto string_property = type.add_properties();
- string_property->set_property_name("string");
- string_property->set_data_type(PropertyConfigProto::DataType::STRING);
- string_property->set_cardinality(
- PropertyConfigProto::Cardinality::REQUIRED);
-
- auto int_property = type.add_properties();
- int_property->set_property_name("int");
- int_property->set_data_type(PropertyConfigProto::DataType::INT64);
- int_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
-
- auto double_property = type.add_properties();
- double_property->set_property_name("double");
- double_property->set_data_type(PropertyConfigProto::DataType::DOUBLE);
- double_property->set_cardinality(
- PropertyConfigProto::Cardinality::REPEATED);
-
- auto bool_property = type.add_properties();
- bool_property->set_property_name("boolean");
- bool_property->set_data_type(PropertyConfigProto::DataType::BOOLEAN);
- bool_property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
-
- auto bytes_property = type.add_properties();
- bytes_property->set_property_name("bytes");
- bytes_property->set_data_type(PropertyConfigProto::DataType::BYTES);
- bytes_property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
-
- if (!nested_schema_type.empty()) {
- auto document_property = type.add_properties();
- document_property->set_property_name("document");
- document_property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- document_property->set_cardinality(
- PropertyConfigProto::Cardinality::REPEATED);
- document_property->set_schema_type(std::string(nested_schema_type));
- }
-
- return type;
+class SchemaUtilTest : public ::testing::TestWithParam<bool> {};
+
+TEST_P(SchemaUtilTest, DependentGraphAlphabeticalOrder) {
+ // Create a schema with the following dependent relation:
+ // C
+ // / \
+ // A - B E - F
+ // \ /
+ // D
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("e")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("E", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("e")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("E", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_e =
+ SchemaTypeConfigBuilder()
+ .SetType("E")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("f")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("F", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_f =
+ SchemaTypeConfigBuilder()
+ .SetType("F")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+
+ // Provide these in alphabetical order: A, B, C, D, E, F
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_c)
+ .AddType(type_d)
+ .AddType(type_e)
+ .AddType(type_f)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, testing::SizeIs(5));
+ EXPECT_THAT(
+ d_map["F"],
+ UnorderedElementsAre(Pair("A", IsEmpty()), Pair("B", IsEmpty()),
+ Pair("C", IsEmpty()), Pair("D", IsEmpty()),
+ Pair("E", UnorderedElementsAre(Pointee(
+ EqualsProto(type_e.properties(0)))))));
+ EXPECT_THAT(d_map["E"],
+ UnorderedElementsAre(
+ Pair("A", IsEmpty()), Pair("B", IsEmpty()),
+ Pair("C", UnorderedElementsAre(
+ Pointee(EqualsProto(type_c.properties(0))))),
+ Pair("D", UnorderedElementsAre(
+ Pointee(EqualsProto(type_d.properties(0)))))));
+ EXPECT_THAT(
+ d_map["D"],
+ UnorderedElementsAre(Pair("A", IsEmpty()),
+ Pair("B", UnorderedElementsAre(Pointee(
+ EqualsProto(type_b.properties(1)))))));
+ EXPECT_THAT(
+ d_map["C"],
+ UnorderedElementsAre(Pair("A", IsEmpty()),
+ Pair("B", UnorderedElementsAre(Pointee(
+ EqualsProto(type_b.properties(0)))))));
+ EXPECT_THAT(d_map["B"], UnorderedElementsAre(Pair(
+ "A", UnorderedElementsAre(Pointee(
+ EqualsProto(type_a.properties(0)))))));
+}
+
+TEST_P(SchemaUtilTest, DependentGraphReverseAlphabeticalOrder) {
+ // Create a schema with the following dependent relation:
+ // C
+ // / \
+ // A - B E - F
+ // \ /
+ // D
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("e")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("E", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("e")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("E", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_e =
+ SchemaTypeConfigBuilder()
+ .SetType("E")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("f")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("F", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_f =
+ SchemaTypeConfigBuilder()
+ .SetType("F")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+
+ // Provide these in reverse alphabetical order:
+ // F, E, D, C, B, A
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_f)
+ .AddType(type_e)
+ .AddType(type_d)
+ .AddType(type_c)
+ .AddType(type_b)
+ .AddType(type_a)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, testing::SizeIs(5));
+ EXPECT_THAT(
+ d_map["F"],
+ UnorderedElementsAre(Pair("A", IsEmpty()), Pair("B", IsEmpty()),
+ Pair("C", IsEmpty()), Pair("D", IsEmpty()),
+ Pair("E", UnorderedElementsAre(Pointee(
+ EqualsProto(type_e.properties(0)))))));
+ EXPECT_THAT(d_map["E"],
+ UnorderedElementsAre(
+ Pair("A", IsEmpty()), Pair("B", IsEmpty()),
+ Pair("C", UnorderedElementsAre(
+ Pointee(EqualsProto(type_c.properties(0))))),
+ Pair("D", UnorderedElementsAre(
+ Pointee(EqualsProto(type_d.properties(0)))))));
+ EXPECT_THAT(
+ d_map["D"],
+ UnorderedElementsAre(Pair("A", IsEmpty()),
+ Pair("B", UnorderedElementsAre(Pointee(
+ EqualsProto(type_b.properties(1)))))));
+ EXPECT_THAT(
+ d_map["C"],
+ UnorderedElementsAre(Pair("A", IsEmpty()),
+ Pair("B", UnorderedElementsAre(Pointee(
+ EqualsProto(type_b.properties(0)))))));
+ EXPECT_THAT(d_map["B"], UnorderedElementsAre(Pair(
+ "A", UnorderedElementsAre(Pointee(
+ EqualsProto(type_a.properties(0)))))));
+}
+
+TEST_P(SchemaUtilTest, DependentGraphMixedOrder) {
+ // Create a schema with the following dependent relation:
+ // C
+ // / \
+ // A - B E - F
+ // \ /
+ // D
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("e")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("E", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("e")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("E", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_e =
+ SchemaTypeConfigBuilder()
+ .SetType("E")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("f")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("F", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_f =
+ SchemaTypeConfigBuilder()
+ .SetType("F")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+
+ // Provide these in a random order: C, E, F, A, B, D
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_c)
+ .AddType(type_e)
+ .AddType(type_f)
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_d)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, testing::SizeIs(5));
+ EXPECT_THAT(
+ d_map["F"],
+ UnorderedElementsAre(Pair("A", IsEmpty()), Pair("B", IsEmpty()),
+ Pair("C", IsEmpty()), Pair("D", IsEmpty()),
+ Pair("E", UnorderedElementsAre(Pointee(
+ EqualsProto(type_e.properties(0)))))));
+ EXPECT_THAT(d_map["E"],
+ UnorderedElementsAre(
+ Pair("A", IsEmpty()), Pair("B", IsEmpty()),
+ Pair("C", UnorderedElementsAre(
+ Pointee(EqualsProto(type_c.properties(0))))),
+ Pair("D", UnorderedElementsAre(
+ Pointee(EqualsProto(type_d.properties(0)))))));
+ EXPECT_THAT(
+ d_map["D"],
+ UnorderedElementsAre(Pair("A", IsEmpty()),
+ Pair("B", UnorderedElementsAre(Pointee(
+ EqualsProto(type_b.properties(1)))))));
+ EXPECT_THAT(
+ d_map["C"],
+ UnorderedElementsAre(Pair("A", IsEmpty()),
+ Pair("B", UnorderedElementsAre(Pointee(
+ EqualsProto(type_b.properties(0)))))));
+ EXPECT_THAT(d_map["B"], UnorderedElementsAre(Pair(
+ "A", UnorderedElementsAre(Pointee(
+ EqualsProto(type_a.properties(0)))))));
+}
+
+TEST_P(SchemaUtilTest, TopLevelCycleIndexableTrueInvalid) {
+ // Create a schema with the following nested-type relation:
+ // A - B - B - B - B.... where all edges declare index_nested_properties=true
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest, TopLevelCycleIndexableFalseNotJoinableOK) {
+ if (GetParam() != true) {
+ GTEST_SKIP() << "This is an invalid cycle if circular schema definitions "
+ "are not allowed.";
}
-};
-TEST_F(SchemaUtilTest, Valid_Empty) {
- ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_));
+ // Create a schema with the following nested-type relation and
+ // index_nested_properties definition:
+ // A -(true)-> B -(false)-> B -(false)-> B....
+ // Edge B -(false)-> B breaks the invalid cycle, so this is allowed.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+ // Assert Validate status is OK and check dependent map
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(1));
+ EXPECT_THAT(d_map["B"],
+ UnorderedElementsAre(
+ Pair("A", UnorderedElementsAre(
+ Pointee(EqualsProto(type_a.properties(0))))),
+ Pair("B", UnorderedElementsAre(
+ Pointee(EqualsProto(type_b.properties(0)))))));
+}
+
+TEST_P(SchemaUtilTest, MultiLevelCycleIndexableTrueInvalid) {
+ // Create a schema with the following dependent relation:
+ // A - B - C - A - B - C - A ...
+ // where all edges declare index_nested_properties=true
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/true))
+ .Build();
+
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+              StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
}
-TEST_F(SchemaUtilTest, Valid_Nested) {
- auto email_type = schema_proto_.add_types();
- *email_type = CreateSchemaTypeConfig(kEmailType, kPersonType);
+TEST_P(SchemaUtilTest, MultiLevelCycleIndexableFalseNotJoinableOK) {
+ if (GetParam() != true) {
+ GTEST_SKIP() << "This is an invalid cycle if circular schema definitions "
+ "are not allowed.";
+ }
- auto person_type = schema_proto_.add_types();
- *person_type = CreateSchemaTypeConfig(kPersonType);
+ // Create a schema with the following nested-type relation:
+ // A -(true)-> B -(false)-> C -(true)-> A -(true)-> B -(false)-> C ...
+  // B -(false)-> C breaks the infinite cycle.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/true))
+ .Build();
- ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_));
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::OK));
}
-TEST_F(SchemaUtilTest, Valid_ClearedPropertyConfigs) {
- // No property fields is technically ok, but probably not realistic.
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
- type->clear_properties();
+TEST_P(SchemaUtilTest, MultiLevelCycleDependentMapOk) {
+ if (GetParam() != true) {
+ GTEST_SKIP() << "This is an invalid cycle if circular schema definitions "
+ "are not allowed.";
+ }
+
+ // Create a schema with the following nested-type dependent relation:
+ // A -(false)-> B -(false)-> C -(false)-> A --> B --> C ...
+ // i.e. A is a property of B
+ // B is a property of C
+ // C is a property of A
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .Build();
- ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_));
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ // Assert Validate status is OK and check dependent map
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(3));
+ EXPECT_THAT(
+ d_map["A"],
+ UnorderedElementsAre(Pair("A", IsEmpty()),
+ Pair("B", UnorderedElementsAre(Pointee(
+ EqualsProto(type_b.properties(0))))),
+ Pair("C", IsEmpty())));
+ EXPECT_THAT(
+ d_map["B"],
+ UnorderedElementsAre(Pair("A", IsEmpty()), Pair("B", IsEmpty()),
+ Pair("C", UnorderedElementsAre(Pointee(
+ EqualsProto(type_c.properties(0)))))));
+ EXPECT_THAT(
+ d_map["C"],
+ UnorderedElementsAre(Pair("A", UnorderedElementsAre(Pointee(
+ EqualsProto(type_a.properties(0))))),
+ Pair("B", IsEmpty()), Pair("C", IsEmpty())));
}
-TEST_F(SchemaUtilTest, Invalid_ClearedSchemaType) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
- type->clear_schema_type();
+TEST_P(SchemaUtilTest, NestedCycleIndexableTrueInvalid) {
+ // Create a schema with the following dependent relation:
+ // A -(false)-> B <-(true)-> C -(false)-> D.
+ // B <-(true)-> C creates an invalid cycle.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE))
+ .Build();
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_c)
+ .AddType(type_d)
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest, NestedCycleIndexableFalseNotJoinableOK) {
+ if (GetParam() != true) {
+ GTEST_SKIP() << "This is an invalid cycle if circular schema definitions "
+ "are not allowed.";
+ }
+
+ // Create a schema with the following nested-type relation:
+ // A -(true)-> B -(true)-> C -(false)-> B -(true)-> D.
+ // C -(false)-> B breaks the invalid cycle in B - C - B.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_c)
+ .AddType(type_d)
+ .Build();
+ // Assert Validate status is OK and check dependent map
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(3));
+ EXPECT_THAT(d_map["B"],
+ UnorderedElementsAre(
+ Pair("A", UnorderedElementsAre(
+ Pointee(EqualsProto(type_a.properties(0))))),
+ Pair("B", IsEmpty()),
+ Pair("C", UnorderedElementsAre(
+ Pointee(EqualsProto(type_c.properties(0)))))));
+ EXPECT_THAT(
+ d_map["C"],
+ UnorderedElementsAre(Pair("A", IsEmpty()),
+ Pair("B", UnorderedElementsAre(Pointee(
+ EqualsProto(type_b.properties(0))))),
+ Pair("C", IsEmpty())));
+ EXPECT_THAT(d_map["D"],
+ UnorderedElementsAre(
+ Pair("A", IsEmpty()),
+ Pair("B", UnorderedElementsAre(
+ Pointee(EqualsProto(type_b.properties(1))))),
+ Pair("C", UnorderedElementsAre(
+ Pointee(EqualsProto(type_c.properties(1)))))));
}
-TEST_F(SchemaUtilTest, Invalid_EmptySchemaType) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
- type->set_schema_type("");
+TEST_P(SchemaUtilTest, MultiplePathsAnyPathContainsCycleIsInvalid) {
+ // Create a schema with the following nested-type relation:
+ // C -(false)-> B -(true)-> A
+ // ^ /
+ // (true)\ /(true)
+ // \ v
+ // D
+ // There is a cycle in B-A-D-B... so this is not allowed
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_d)
+ .AddType(type_c)
+ .AddType(type_b)
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest, MultipleCycles_anyCycleIndexableTrueInvalid) {
+ // Create a schema with the following nested-type dependent relation:
+ // Note that the arrows in this graph shows the direction of the dependent
+ // relation, rather than nested-type relations.
+ // A -(F)-> B
+ // ^ \ |
+ // (T)| (T)\ |(T)
+ // | v v
+ // D <-(T)- C
+  // There are two cycles: A-B-C-D and A-C-D. The first cycle is allowed because
+  // A-B has nested-indexable=false, but A-C-D is invalid because every edge in
+  // it has nested-indexable=true.
+  //
+ // Schema nested-type property relation graph:
+ // A <-- B
+ // | ^ ^
+ // v \ |
+ // D --> C
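+  // (The dependent graph above is the property graph with its edges reversed:
+  // an edge X -> Y means that Y declares a document property of type X.)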
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/true))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_d)
+ .AddType(type_c)
+ .AddType(type_b)
+ .AddType(type_a)
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SchemaUtilTest, AnySchemaTypeOk) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
- type->set_schema_type("abc123!@#$%^&*()_-+=[{]}|\\;:'\",<.>?你好");
+TEST_P(SchemaUtilTest, CycleWithSameTypedProps_allPropsIndexableFalseIsOK) {
+ if (GetParam() != true) {
+ GTEST_SKIP() << "This is an invalid cycle if circular schema definitions "
+ "are not allowed.";
+ }
- ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_));
+ // Create a schema with the following nested-type relation and
+ // index_nested_properties definition:
+ // A <-(true)- B <-(false)- A -(false)-> B -(true)-> A
+ // A has 2 properties with type B. A - B breaks the invalid cycle only when
+ // both properties declare index_nested_properties=false.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b1")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b2")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("A")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/true))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+ // Assert Validate status is OK and check dependent map
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(2));
+ EXPECT_THAT(
+ d_map["A"],
+ UnorderedElementsAre(Pair("A", IsEmpty()),
+ Pair("B", UnorderedElementsAre(Pointee(
+ EqualsProto(type_b.properties(0)))))));
+ EXPECT_THAT(d_map["B"],
+ UnorderedElementsAre(
+ Pair("A", UnorderedElementsAre(
+ Pointee(EqualsProto(type_a.properties(0))),
+ Pointee(EqualsProto(type_a.properties(1))))),
+ Pair("B", IsEmpty())));
+}
+
+TEST_P(SchemaUtilTest, CycleWithSameTypedProps_anyPropIndexableTrueIsInvalid) {
+ // Create a schema with the following nested-type relation and
+ // index_nested_properties definition:
+ // A <-(true)- B <-(true)- A -(false)-> B -(true)-> A
+  // A has 2 properties with type B. Prop 'b1' declares
+  // index_nested_properties=true, so there is an invalid cycle.
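+  // (A single nested-indexable property between the same pair of types keeps
+  // the cycle alive, even if a sibling property opts out.)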
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b1")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b2")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("A")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/true))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
}
-TEST_F(SchemaUtilTest, Invalid_ClearedPropertyName) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+TEST_P(SchemaUtilTest, CycleWithJoinablePropertyNotAllowed) {
+ // Create a schema with the following dependent relation:
+ // A
+ // / ^
+ // v \
+ // (joinable) B ---> C
+ // B also has a string property that is joinable on QUALIFIED_ID
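+  // Even though every edge declares index_nested_properties=false, the
+  // joinable property inside the cycle makes it invalid.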
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("joinableProp")
+ .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/false))
+ .Build();
- auto property = type->add_properties();
- property->clear_property_name();
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+TEST_P(SchemaUtilTest, NonNestedJoinablePropOutsideCycleOK) {
+ if (GetParam() != true) {
+ GTEST_SKIP() << "This is an invalid cycle if circular schema definitions "
+ "are not allowed.";
+ }
+
+ // Create a schema with the following dependent relation:
+ // A -(false)-> B <-(false)-> C...
+ // A has a string property that is joinable on QUALIFIED_ID, but the cycle is
+  // B-C-B, and neither B nor C depends on A, so this is fine.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("joinableProp")
+ .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .Build();
+
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ // Assert Validate status is OK and check dependent map
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(2));
+ EXPECT_THAT(d_map["B"],
+ UnorderedElementsAre(
+ Pair("A", UnorderedElementsAre(
+ Pointee(EqualsProto(type_a.properties(0))))),
+ Pair("B", IsEmpty()),
+ Pair("C", UnorderedElementsAre(
+ Pointee(EqualsProto(type_c.properties(0)))))));
+ EXPECT_THAT(
+ d_map["C"],
+ UnorderedElementsAre(Pair("A", IsEmpty()),
+ Pair("B", UnorderedElementsAre(Pointee(
+ EqualsProto(type_b.properties(0))))),
+ Pair("C", IsEmpty())));
+}
+
+TEST_P(SchemaUtilTest, DirectNestedJoinablePropOutsideCycleNotAllowed) {
+ // Create a schema with the following dependent relation:
+ // A
+ // / ^
+ // v \
+ // B ---> C ---> D(joinable)
+ // All edges have index_nested_properties=false and only D has a joinable
+ // property. The cycle A-B-C... is not allowed since there is a type in the
+ // cycle (C) which has a direct nested-type (D) with a joinable property.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("joinableProp")
+ .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_c)
+ .AddType(type_d)
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest, TransitiveNestedJoinablePropOutsideCycleNotAllowed) {
+ // Create a schema with the following dependent relation:
+ // A
+ // / ^
+ // v \
+ // B ---> C ---> D ---> E (joinable)
+  // All edges have index_nested_properties=false and only E has a joinable
+  // property. The cycle A-B-C... is not allowed since there is a type in the
+  // cycle (C) which has a transitive nested-type (E) with a joinable property.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("e")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("E", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_e =
+ SchemaTypeConfigBuilder()
+ .SetType("E")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("joinableProp")
+ .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_c)
+ .AddType(type_d)
+ .AddType(type_e)
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest,
+ NestedJoinablePropOutsideCycleNotAllowed_reverseIterationOrder) {
+ // Create a schema with the following dependent relation:
+ // E
+ // / ^
+ // v \
+ // D ---> C ---> B ---> A (joinable)
+  // All edges have index_nested_properties=false and only A has a joinable
+  // property. The cycle E-D-C... is not allowed since there is a type in the
+  // cycle (C) which has a transitive nested-type (A) with a joinable property.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("joinableProp")
+ .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("e")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("E", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_e =
+ SchemaTypeConfigBuilder()
+ .SetType("E")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/false))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_c)
+ .AddType(type_d)
+ .AddType(type_e)
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest, ComplexCycleWithJoinablePropertyNotAllowed) {
+ // Create a schema with the following dependent relation:
+ // A
+ // / ^
+ // v \
+ // B ---> E
+ // / \ ^
+ // v v \
+ // C D --> F
+ //
+ // Cycles: A-B-E-A, A-B-D-F-E-A.
+ // All edges have index_nested_properties=false, but D has a joinable property
+ // so the second cycle is not allowed.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("e")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("E", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("joinableProp")
+ .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("f")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("F", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("joinableProp")
+ .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaTypeConfigProto type_e =
+ SchemaTypeConfigBuilder()
+ .SetType("E")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_f =
+ SchemaTypeConfigBuilder()
+ .SetType("F")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("e")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("E", /*index_nested_properties=*/false))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_c)
+ .AddType(type_d)
+ .AddType(type_e)
+ .AddType(type_f)
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest, ComplexCycleWithIndexableTrueNotAllowed) {
+ // Create a schema with the following dependent relation:
+ // A
+ // / ^
+ // v \
+ // B ---> E
+ // / \ ^
+ // v v \
+ // C D --> F
+ //
+ // Cycles: A-B-E-A, A-B-D-F-E-A.
+ // B->E has index_nested_properties=false, so the first cycle is allowed.
+ // All edges on the second cycle are nested_indexable, so the second cycle is
+  // not allowed.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("e")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("E", /*index_nested_properties=*/false))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("joinableProp")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID))
+ .Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("f")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("F", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_e =
+ SchemaTypeConfigBuilder()
+ .SetType("E")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_f =
+ SchemaTypeConfigBuilder()
+ .SetType("F")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("e")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("E", /*index_nested_properties=*/true))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_c)
+ .AddType(type_d)
+ .AddType(type_e)
+ .AddType(type_f)
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest, InheritanceAndNestedTypeRelations_noCycle) {
+ if (GetParam() != true) {
+ GTEST_SKIP() << "This is an invalid cycle if circular schema definitions "
+ "are not allowed.";
+ }
+
+  // Create a schema with the following relations and index_nested_properties
+  // definitions:
+ // 1. Nested-type relations:
+ // A -(true)-> B -(true)-> C
+ // (false)| (false)/ \(false)
+ // B B C
+ // The properties in the second row are required for B and C to be
+ // compatible with their parents. index_nested_properties must be false in
+  //    these properties so that no invalid cycle can be formed by these
+  //    self-references.
+ //
+ // 2. Inheritance relations:
+ // C -> B -> A (A is a parent of B, which is a parent of C)
+ //
+ // These two relations are separate and do not affect each other. In this
+ // case there is no cycle.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddParentType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddParentType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE))
+ .Build();
+
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(3));
+ // Both A-B and A-C are inheritance relations.
+ EXPECT_THAT(d_map["A"],
+ UnorderedElementsAre(Pair("B", IsEmpty()), Pair("C", IsEmpty())));
+ // B-A and B-B are nested-type relations, B-C is both a nested-type and an
+ // inheritance relation.
+ EXPECT_THAT(d_map["B"],
+ UnorderedElementsAre(
+ Pair("A", UnorderedElementsAre(
+ Pointee(EqualsProto(type_a.properties(0))))),
+ Pair("B", UnorderedElementsAre(
+ Pointee(EqualsProto(type_b.properties(0))))),
+ Pair("C", UnorderedElementsAre(
+ Pointee(EqualsProto(type_c.properties(0)))))));
+ // C-C, C-B and C-A are all nested-type relations.
+ EXPECT_THAT(d_map["C"],
+ UnorderedElementsAre(
+ Pair("B", UnorderedElementsAre(
+ Pointee(EqualsProto(type_b.properties(1))))),
+ Pair("C", UnorderedElementsAre(
+ Pointee(EqualsProto(type_c.properties(1))))),
+ Pair("A", IsEmpty())));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SchemaUtil::InheritanceMap i_map,
+ SchemaUtil::BuildTransitiveInheritanceGraph(schema));
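+  // (InheritanceMap maps each parent type to all of its transitive children;
+  // the boolean records whether the child is a direct child.)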
+ EXPECT_THAT(i_map, SizeIs(2));
+ EXPECT_THAT(i_map["A"],
+ UnorderedElementsAre(Pair("B", IsTrue()), Pair("C", IsFalse())));
+ EXPECT_THAT(i_map["B"], UnorderedElementsAre(Pair("C", IsTrue())));
+}
+
+TEST_P(SchemaUtilTest, InheritanceAndNestedTypeRelations_nestedTypeCycle) {
+  // Create a schema with the following relations and index_nested_properties
+  // definitions:
+ // 1. Nested-type relations:
+ // A -(true)-> B -(true)-> C
+ // (true)| (false)/ \(false)
+ // B B C
+ //
+ // 2. Inheritance relations:
+ // C -> B -> A (A is a parent of B, which is a parent of C)
+ //
+ // These two relations are separate and do not affect each other, but there is
+ // a cycle in nested-type relations: B - B
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddParentType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddParentType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE))
+ .Build();
+
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest, InheritanceAndNestedTypeRelations_inheritanceCycle) {
+  // Create a schema with the following relations and index_nested_properties
+  // definitions:
+ // 1. Nested-type relations:
+ // A -(true)-> B -(true)-> C
+ // (false)| (false)/ \(false)
+ // B B C
+ //
+ // 2. Inheritance relations:
+ // C -> B -> A -> B (A is a parent of B, which is a parent of C and A)
+ //
+ // These two relations are separate and do not affect each other, but there is
+  // a cycle in the inheritance relation: B - A - B
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddParentType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddParentType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddParentType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/false))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("prop")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE))
+ .Build();
+
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("inherits from itself")));
+}
+
+TEST_P(SchemaUtilTest, NonExistentType) {
+ // Create a schema with the following dependent relation:
+ // A - B - C - X (does not exist)
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("c")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("C", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("x")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("X", /*index_nested_properties=*/true))
+ .Build();
+
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SchemaUtilTest, Invalid_EmptyPropertyName) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+TEST_P(SchemaUtilTest, SingleTypeIsBothDirectAndIndirectDependent) {
+  // Create a schema with the following dependent relations, all of which are
+  // via nested documents. In this case, C is both a direct dependent and an
+  // indirect dependent of A.
+ // A
+ // | \
+ // | B
+ // | /
+ // C
+ SchemaTypeConfigProto type_a = SchemaTypeConfigBuilder().SetType("A").Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/true))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(2));
+ EXPECT_THAT(d_map["A"],
+ UnorderedElementsAre(
+ Pair("B", UnorderedElementsAre(
+ Pointee(EqualsProto(type_b.properties(0))))),
+ Pair("C", UnorderedElementsAre(
+ Pointee(EqualsProto(type_c.properties(0)))))));
+ EXPECT_THAT(d_map["B"], UnorderedElementsAre(Pair(
+ "C", UnorderedElementsAre(Pointee(
+ EqualsProto(type_c.properties(1)))))));
+
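+  // No type declares a parent, so the inheritance map is empty.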
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SchemaUtil::InheritanceMap i_map,
+ SchemaUtil::BuildTransitiveInheritanceGraph(schema));
+ EXPECT_THAT(i_map, IsEmpty());
+}
+
+TEST_P(SchemaUtilTest, SimpleInheritance) {
+ // Create a schema with the following inheritance relation:
+ // A <- B
+ SchemaTypeConfigProto type_a = SchemaTypeConfigBuilder().SetType("A").Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder().SetType("B").AddParentType("A").Build();
- auto property = type->add_properties();
- property->set_property_name("");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(1));
+ EXPECT_THAT(d_map["A"], UnorderedElementsAre(Pair("B", IsEmpty())));
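+  // (Inheritance edges contribute no document properties, which is why B's
+  // property set is empty.)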
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SchemaUtil::InheritanceMap i_map,
+ SchemaUtil::BuildTransitiveInheritanceGraph(schema));
+ EXPECT_THAT(i_map, SizeIs(1));
+ EXPECT_THAT(i_map["A"], UnorderedElementsAre(Pair("B", IsTrue())));
+}
+
+TEST_P(SchemaUtilTest, SingleInheritanceTypeIsBothDirectAndIndirectChild) {
+ // Create a schema with the following inheritance relation. In this case, C is
+ // both a direct and an indirect child of A.
+ // A
+ // | \
+ // | B
+ // | /
+ // C
+ SchemaTypeConfigProto type_a = SchemaTypeConfigBuilder().SetType("A").Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder().SetType("B").AddParentType("A").Build();
+ SchemaTypeConfigProto type_c = SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddParentType("A")
+ .AddParentType("B")
+ .Build();
+
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(2));
+ EXPECT_THAT(d_map["A"],
+ UnorderedElementsAre(Pair("B", IsEmpty()), Pair("C", IsEmpty())));
+ EXPECT_THAT(d_map["B"], UnorderedElementsAre(Pair("C", IsEmpty())));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SchemaUtil::InheritanceMap i_map,
+ SchemaUtil::BuildTransitiveInheritanceGraph(schema));
+ EXPECT_THAT(i_map, SizeIs(2));
+ EXPECT_THAT(i_map["A"],
+ UnorderedElementsAre(Pair("B", IsTrue()), Pair("C", IsTrue())));
+ EXPECT_THAT(i_map["B"], UnorderedElementsAre(Pair("C", IsTrue())));
+}
+
+TEST_P(SchemaUtilTest, ComplexInheritance) {
+ // Create a schema with the following inheritance relation:
+ // A
+ // / \
+ // B E
+ // / \
+ // C D
+ // |
+ // F
+ SchemaTypeConfigProto type_a = SchemaTypeConfigBuilder().SetType("A").Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder().SetType("B").AddParentType("A").Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder().SetType("C").AddParentType("B").Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder().SetType("D").AddParentType("B").Build();
+ SchemaTypeConfigProto type_e =
+ SchemaTypeConfigBuilder().SetType("E").AddParentType("A").Build();
+ SchemaTypeConfigProto type_f =
+ SchemaTypeConfigBuilder().SetType("F").AddParentType("D").Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_c)
+ .AddType(type_d)
+ .AddType(type_e)
+ .AddType(type_f)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(3));
+ EXPECT_THAT(d_map["A"],
+ UnorderedElementsAre(Pair("B", IsEmpty()), Pair("C", IsEmpty()),
+ Pair("D", IsEmpty()), Pair("E", IsEmpty()),
+ Pair("F", IsEmpty())));
+ EXPECT_THAT(d_map["B"],
+ UnorderedElementsAre(Pair("C", IsEmpty()), Pair("D", IsEmpty()),
+ Pair("F", IsEmpty())));
+ EXPECT_THAT(d_map["D"], UnorderedElementsAre(Pair("F", IsEmpty())));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SchemaUtil::InheritanceMap i_map,
+ SchemaUtil::BuildTransitiveInheritanceGraph(schema));
+ EXPECT_THAT(i_map, SizeIs(3));
+ EXPECT_THAT(i_map["A"],
+ UnorderedElementsAre(Pair("B", IsTrue()), Pair("C", IsFalse()),
+ Pair("D", IsFalse()), Pair("E", IsTrue()),
+ Pair("F", IsFalse())));
+ EXPECT_THAT(i_map["B"],
+ UnorderedElementsAre(Pair("C", IsTrue()), Pair("D", IsTrue()),
+ Pair("F", IsFalse())));
+ EXPECT_THAT(i_map["D"], UnorderedElementsAre(Pair("F", IsTrue())));
+}
+
+TEST_P(SchemaUtilTest, InheritanceCycle) {
+ // Create a schema with the following inheritance relation:
+ // C <- A <- B <- C
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder().SetType("A").AddParentType("C").Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder().SetType("B").AddParentType("A").Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder().SetType("C").AddParentType("B").Build();
+
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SchemaUtilTest, NonAlphanumericPropertyNameIsInvalid) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+TEST_P(SchemaUtilTest, SelfInheritance) {
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder().SetType("A").AddParentType("A").Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(type_a).Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
- auto property = type->add_properties();
- property->set_property_name("_");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+TEST_P(SchemaUtilTest, NonExistentParentType) {
+ // Create a schema with the following inheritance relation:
+ // (does not exist) X <- A <- B <- C
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder().SetType("A").AddParentType("X").Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder().SetType("B").AddParentType("A").Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder().SetType("C").AddParentType("B").Build();
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SchemaUtilTest, AlphanumericPropertyNameOk) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+TEST_P(SchemaUtilTest, SimpleInheritanceWithNestedType) {
+ // Create a schema with the following dependent relation:
+ // A - B (via inheritance)
+ // B - C (via nested document)
+ SchemaTypeConfigProto type_a = SchemaTypeConfigBuilder().SetType("A").Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder().SetType("B").AddParentType("A").Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
- auto property = type->add_properties();
- property->set_property_name("abc123");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ SchemaProto schema =
+ SchemaBuilder().AddType(type_a).AddType(type_b).AddType(type_c).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(2));
+  // Nested-type and inheritance dependencies are not transitive across each
+  // other.
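+  // For example, A - C is absent from the map even though A - B (inheritance)
+  // and B - C (nested type) both hold.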
+ EXPECT_THAT(d_map["A"], UnorderedElementsAre(Pair("B", IsEmpty())));
+ EXPECT_THAT(d_map["B"], UnorderedElementsAre(Pair(
+ "C", UnorderedElementsAre(Pointee(
+ EqualsProto(type_c.properties(0)))))));
- ICING_ASSERT_OK(SchemaUtil::Validate(schema_proto_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SchemaUtil::InheritanceMap i_map,
+ SchemaUtil::BuildTransitiveInheritanceGraph(schema));
+ EXPECT_THAT(i_map, SizeIs(1));
+ EXPECT_THAT(i_map["A"], UnorderedElementsAre(Pair("B", IsTrue())));
}
-TEST_F(SchemaUtilTest, Invalid_DuplicatePropertyName) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+TEST_P(SchemaUtilTest, ComplexInheritanceWithNestedType) {
+ // Create a schema with the following dependent relation:
+ // A
+ // / \
+ // B E
+ // / \
+ // C D
+ // |
+ // F
+ // Approach:
+ // B extends A
+ // C extends B
+ // D has a nested document of type B
+ // E has a nested document of type A
+ // F has a nested document of type D
+ SchemaTypeConfigProto type_a = SchemaTypeConfigBuilder().SetType("A").Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder().SetType("B").AddParentType("A").Build();
+ SchemaTypeConfigProto type_c =
+ SchemaTypeConfigBuilder().SetType("C").AddParentType("B").Build();
+ SchemaTypeConfigProto type_d =
+ SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_e =
+ SchemaTypeConfigBuilder()
+ .SetType("E")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("A", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_f =
+ SchemaTypeConfigBuilder()
+ .SetType("F")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("d")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("D", /*index_nested_properties=*/true))
+ .Build();
- auto first_property = type->add_properties();
- first_property->set_property_name("DuplicatedProperty");
- first_property->set_data_type(PropertyConfigProto::DataType::STRING);
- first_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ SchemaProto schema = SchemaBuilder()
+ .AddType(type_a)
+ .AddType(type_b)
+ .AddType(type_c)
+ .AddType(type_d)
+ .AddType(type_e)
+ .AddType(type_f)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(3));
+ EXPECT_THAT(
+ d_map["A"],
+ UnorderedElementsAre(Pair("B", IsEmpty()), Pair("C", IsEmpty()),
+ Pair("E", UnorderedElementsAre(Pointee(
+ EqualsProto(type_e.properties(0)))))));
+ EXPECT_THAT(
+ d_map["B"],
+ UnorderedElementsAre(Pair("C", IsEmpty()),
+ Pair("D", UnorderedElementsAre(Pointee(
+ EqualsProto(type_d.properties(0))))),
+ Pair("F", IsEmpty())));
+ EXPECT_THAT(d_map["D"], UnorderedElementsAre(Pair(
+ "F", UnorderedElementsAre(Pointee(
+ EqualsProto(type_f.properties(0)))))));
- auto second_property = type->add_properties();
- second_property->set_property_name("DuplicatedProperty");
- second_property->set_data_type(PropertyConfigProto::DataType::STRING);
- second_property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SchemaUtil::InheritanceMap i_map,
+ SchemaUtil::BuildTransitiveInheritanceGraph(schema));
+ EXPECT_THAT(i_map, SizeIs(2));
+ EXPECT_THAT(i_map["A"],
+ UnorderedElementsAre(Pair("B", IsTrue()), Pair("C", IsFalse())));
+ EXPECT_THAT(i_map["B"], UnorderedElementsAre(Pair("C", IsTrue())));
+}
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
- StatusIs(libtextclassifier3::StatusCode::ALREADY_EXISTS));
+TEST_P(SchemaUtilTest, InheritanceWithNestedTypeCycle) {
+  // Create a schema in which A and B depend on each other, in the sense that B
+ // extends A but A has a nested document of type B.
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument("B", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder().SetType("B").AddParentType("A").Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SchemaUtilTest, Invalid_ClearedDataType) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+TEST_P(SchemaUtilTest, EmptySchemaProtoIsValid) {
+ SchemaProto schema;
+ ICING_ASSERT_OK(SchemaUtil::Validate(schema, GetParam()));
+}
- auto property = type->add_properties();
- property->set_property_name("NewProperty");
- property->clear_data_type();
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+TEST_P(SchemaUtilTest, Valid_Nested) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument(
+ kPersonType,
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ ICING_ASSERT_OK(SchemaUtil::Validate(schema, GetParam()));
}
-TEST_F(SchemaUtilTest, Invalid_UnknownDataType) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+TEST_P(SchemaUtilTest, ClearedPropertyConfigsIsValid) {
+  // Having no property fields is technically ok, but probably not realistic.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType(kEmailType))
+ .Build();
+ ICING_ASSERT_OK(SchemaUtil::Validate(schema, GetParam()));
+}
- auto property = type->add_properties();
- property->set_property_name("NewProperty");
- property->set_data_type(PropertyConfigProto::DataType::UNKNOWN);
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+TEST_P(SchemaUtilTest, ClearedSchemaTypeIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder().AddType(SchemaTypeConfigBuilder()).Build();
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+TEST_P(SchemaUtilTest, EmptySchemaTypeIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder().AddType(SchemaTypeConfigBuilder().SetType("")).Build();
+
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SchemaUtilTest, Invalid_ClearedCardinality) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+TEST_P(SchemaUtilTest, AnySchemaTypeOk) {
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType(
+ "abc123!@#$%^&*()_-+=[{]}|\\;:'\",<.>?你好"))
+ .Build();
- auto property = type->add_properties();
- property->set_property_name("NewProperty");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->clear_cardinality();
+ ICING_ASSERT_OK(SchemaUtil::Validate(schema, GetParam()));
+}
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+TEST_P(SchemaUtilTest, ClearedPropertyNameIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("foo")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ schema.mutable_types(0)->mutable_properties(0)->clear_property_name();
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SchemaUtilTest, Invalid_UnknownCardinality) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+TEST_P(SchemaUtilTest, EmptyPropertyNameIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
- auto property = type->add_properties();
- property->set_property_name("NewProperty");
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::UNKNOWN);
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+TEST_P(SchemaUtilTest, NonAlphanumericPropertyNameIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("a_b")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SchemaUtilTest, Invalid_ClearedPropertySchemaType) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+TEST_P(SchemaUtilTest, AlphanumericPropertyNameOk) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("abc123")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
- auto property = type->add_properties();
- property->set_property_name("NewProperty");
- property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
- property->clear_schema_type();
+ ICING_ASSERT_OK(SchemaUtil::Validate(schema, GetParam()));
+}
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+TEST_P(SchemaUtilTest, DuplicatePropertyNameIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("DuplicatedProperty")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("DuplicatedProperty")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::ALREADY_EXISTS));
+}
+
+TEST_P(SchemaUtilTest, ClearedDataTypeIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewProperty")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ schema.mutable_types(0)->mutable_properties(0)->clear_data_type();
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SchemaUtilTest, Invalid_EmptyPropertySchemaType) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+TEST_P(SchemaUtilTest, UnknownDataTypeIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("NewProperty")
+ .SetDataType(PropertyConfigProto::DataType::UNKNOWN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
- auto property = type->add_properties();
- property->set_property_name("NewProperty");
- property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
- property->set_schema_type("");
+TEST_P(SchemaUtilTest, ClearedCardinalityIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewProperty")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ schema.mutable_types(0)->mutable_properties(0)->clear_cardinality();
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
+TEST_P(SchemaUtilTest, UnknownCardinalityIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewProperty")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_UNKNOWN)))
+ .Build();
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SchemaUtilTest, Invalid_NoMatchingSchemaType) {
- auto type = schema_proto_.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+TEST_P(SchemaUtilTest, ClearedPropertySchemaTypeIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewProperty")
+ .SetDataType(TYPE_DOCUMENT)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(SchemaUtilTest, EmptyPropertySchemaTypeIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewProperty")
+ .SetDataTypeDocument(
+ /*schema_type=*/"",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
- auto property = type->add_properties();
- property->set_property_name("NewProperty");
- property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
- property->set_schema_type("NewSchemaType");
+TEST_P(SchemaUtilTest, NoMatchingSchemaTypeIsInvalid) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewProperty")
+ .SetDataTypeDocument(
+ /*schema_type=*/"NewSchemaType",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
- ASSERT_THAT(SchemaUtil::Validate(schema_proto_),
- StatusIs(libtextclassifier3::StatusCode::UNKNOWN));
+ ASSERT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Undefined 'schema_type'")));
}
-TEST_F(SchemaUtilTest, NewOptionalPropertyIsCompatible) {
+TEST_P(SchemaUtilTest, NewOptionalPropertyIsCompatible) {
// Configure old schema
- SchemaProto old_schema;
- auto type = old_schema.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
// Configure new schema with an optional field, not considered incompatible
// since it's fine if old data doesn't have this optional field
- SchemaProto new_schema_with_optional;
- type = new_schema_with_optional.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
-
- auto property = type->add_properties();
- property->set_property_name("NewOptional");
- property->set_data_type(PropertyConfigProto::DataType::DOUBLE);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ SchemaProto new_schema_with_optional =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewOptional")
+ .SetDataType(TYPE_DOUBLE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
SchemaUtil::SchemaDelta schema_delta;
- EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema,
- new_schema_with_optional),
+ schema_delta.schema_types_changed_fully_compatible.insert(kEmailType);
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
+ old_schema, new_schema_with_optional, no_dependents_map),
Eq(schema_delta));
}
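+
+// A minimal sketch of the SchemaDelta fields exercised by the tests in this
+// file; the semantics below are inferred from the assertions here, not from
+// the struct's own documentation:
+//
+//   SchemaUtil::SchemaDelta delta;
+//   delta.schema_types_new;           // types that only exist in new_schema
+//   delta.schema_types_deleted;       // types dropped by new_schema
+//   delta.schema_types_incompatible;  // existing documents may no longer
+//                                     // validate against the type
+//   delta.schema_types_index_incompatible;  // the type's indexed sections
+//                                           // changed
+//   delta.schema_types_join_incompatible;   // the type's joinable config
+//                                           // changed
+//   delta.schema_types_changed_fully_compatible;  // changed, nothing breaks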
-TEST_F(SchemaUtilTest, NewRequiredPropertyIsIncompatible) {
+TEST_P(SchemaUtilTest, NewRequiredPropertyIsIncompatible) {
// Configure old schema
- SchemaProto old_schema;
- auto type = old_schema.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
// Configure new schema with a required field, considered incompatible since
// old data won't have this required field
- SchemaProto new_schema_with_required;
- type = new_schema_with_required.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
-
- auto property = type->add_properties();
- property->set_property_name("NewRequired");
- property->set_data_type(PropertyConfigProto::DataType::DOUBLE);
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
+ SchemaProto new_schema_with_required =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewRequired")
+ .SetDataType(TYPE_DOUBLE)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
SchemaUtil::SchemaDelta schema_delta;
schema_delta.schema_types_incompatible.emplace(kEmailType);
- EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema,
- new_schema_with_required),
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
+ old_schema, new_schema_with_required, no_dependents_map),
Eq(schema_delta));
}
-TEST_F(SchemaUtilTest, NewSchemaMissingPropertyIsIncompatible) {
+TEST_P(SchemaUtilTest, NewSchemaMissingPropertyIsIncompatible) {
// Configure old schema
- SchemaProto old_schema;
- auto type = old_schema.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
-
- auto property = type->add_properties();
- property->set_property_name("OldOptional");
- property->set_data_type(PropertyConfigProto::DataType::INT64);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("OldOptional")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
// Configure new schema; the new schema needs to at least have all the
// previously defined properties
- SchemaProto new_schema;
- type = new_schema.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
SchemaUtil::SchemaDelta schema_delta;
schema_delta.schema_types_incompatible.emplace(kEmailType);
- EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ no_dependents_map),
Eq(schema_delta));
}
-TEST_F(SchemaUtilTest, CompatibilityOfDifferentCardinalityOk) {
+TEST_P(SchemaUtilTest, CompatibilityOfDifferentCardinalityOk) {
// Configure less restrictive schema based on cardinality
- SchemaProto less_restrictive_schema;
- auto type = less_restrictive_schema.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
-
- auto property = type->add_properties();
- property->set_property_name("Property");
- property->set_data_type(PropertyConfigProto::DataType::INT64);
- property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+ SchemaProto less_restrictive_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
// Configure more restrictive schema based on cardinality
- SchemaProto more_restrictive_schema;
- type = more_restrictive_schema.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+ SchemaProto more_restrictive_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
- property = type->add_properties();
- property->set_property_name("Property");
- property->set_data_type(PropertyConfigProto::DataType::INT64);
- property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
-
- // We can't have a new schema be less restrictive, REQUIRED->OPTIONAL
+ // We can't have a new schema be more restrictive, REPEATED->OPTIONAL
SchemaUtil::SchemaDelta incompatible_schema_delta;
incompatible_schema_delta.schema_types_incompatible.emplace(kEmailType);
+ SchemaUtil::DependentMap no_dependents_map;
EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
/*old_schema=*/less_restrictive_schema,
- /*new_schema=*/more_restrictive_schema),
+ /*new_schema=*/more_restrictive_schema, no_dependents_map),
Eq(incompatible_schema_delta));
- // We can have the new schema be more restrictive, OPTIONAL->REPEATED;
+  // We can have the new schema be less restrictive, OPTIONAL->REPEATED.
SchemaUtil::SchemaDelta compatible_schema_delta;
+ compatible_schema_delta.schema_types_changed_fully_compatible.insert(
+ kEmailType);
EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
/*old_schema=*/more_restrictive_schema,
- /*new_schema=*/less_restrictive_schema),
+ /*new_schema=*/less_restrictive_schema, no_dependents_map),
Eq(compatible_schema_delta));
}
-TEST_F(SchemaUtilTest, DifferentDataTypeIsIncompatible) {
+TEST_P(SchemaUtilTest, DifferentDataTypeIsIncompatible) {
// Configure old schema, with an int64_t property
- SchemaProto old_schema;
- auto type = old_schema.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
-
- auto property = type->add_properties();
- property->set_property_name("Property");
- property->set_data_type(PropertyConfigProto::DataType::INT64);
- property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
// Configure new schema, with a double property
- SchemaProto new_schema;
- type = new_schema.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
-
- property = type->add_properties();
- property->set_property_name("Property");
- property->set_data_type(PropertyConfigProto::DataType::DOUBLE);
- property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataType(TYPE_DOUBLE)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
SchemaUtil::SchemaDelta schema_delta;
schema_delta.schema_types_incompatible.emplace(kEmailType);
- EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ no_dependents_map),
Eq(schema_delta));
}
-TEST_F(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) {
+TEST_P(SchemaUtilTest, DifferentSchemaTypeIsIncompatible) {
// Configure old schema, where Property is supposed to be a Person type
- SchemaProto old_schema;
- auto type = old_schema.add_types();
- *type = CreateSchemaTypeConfig(kPersonType);
-
- *type = CreateSchemaTypeConfig(kEmailType);
- auto property = type->add_properties();
- property->set_property_name("Property");
- property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
- property->set_schema_type(kPersonType);
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kMessageType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeDocument(
+ kPersonType,
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
// Configure new schema, where Property is supposed to be an Email type
- SchemaProto new_schema;
- type = new_schema.add_types();
- *type = CreateSchemaTypeConfig(kPersonType);
-
- *type = CreateSchemaTypeConfig(kEmailType);
- property = type->add_properties();
- property->set_property_name("Property");
- property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- property->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
- property->set_schema_type(kEmailType);
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kMessageType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeDocument(
+ kMessageType,
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
SchemaUtil::SchemaDelta schema_delta;
schema_delta.schema_types_incompatible.emplace(kEmailType);
- EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+ // kEmailType depends on kMessageType
+ SchemaUtil::DependentMap dependents_map = {
+ {kMessageType, {{kEmailType, {}}}}};
+ SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta(
+ old_schema, new_schema, dependents_map);
+ EXPECT_THAT(actual, Eq(schema_delta));
+ EXPECT_THAT(actual.schema_types_incompatible,
+ testing::ElementsAre(kEmailType));
+ EXPECT_THAT(actual.schema_types_deleted, testing::IsEmpty());
+}
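+
+// A minimal sketch of the DependentMap shape used throughout this file: each
+// key is a schema type, and each value maps the types that embed it as a
+// document property to a payload that these tests always leave empty ({}),
+// so its contents are not exercised here.
+//
+//   // kEmailType is embedded by kPersonType, so changes to kEmailType can
+//   // ripple into kPersonType:
+//   SchemaUtil::DependentMap dependents_map = {
+//       {kEmailType, {{kPersonType, {}}}}};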
+
+TEST_P(SchemaUtilTest, SameNumberOfRequiredFieldsCanBeIncompatible) {
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property1")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Property1")
+ .SetDataType(TYPE_STRING)
+ // Changing required to optional should be fine
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Property2")
+ .SetDataType(TYPE_STRING)
+ // Adding a new required property is incompatible
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ SchemaUtil::SchemaDelta delta = SchemaUtil::ComputeCompatibilityDelta(
+ old_schema, new_schema, /*new_schema_dependent_map=*/{});
+ EXPECT_THAT(delta.schema_types_incompatible,
+ testing::ElementsAre(kEmailType));
+ EXPECT_THAT(delta.schema_types_index_incompatible, testing::IsEmpty());
+ EXPECT_THAT(delta.schema_types_deleted, testing::IsEmpty());
+}
+
+TEST_P(SchemaUtilTest, SameNumberOfIndexedPropertiesCanMakeIndexIncompatible) {
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property1")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property1")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property2")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta delta = SchemaUtil::ComputeCompatibilityDelta(
+ old_schema, new_schema, /*new_schema_dependent_map=*/{});
+ EXPECT_THAT(delta.schema_types_incompatible, testing::IsEmpty());
+ EXPECT_THAT(delta.schema_types_index_incompatible,
+ testing::ElementsAre(kEmailType));
+ EXPECT_THAT(delta.schema_types_deleted, testing::IsEmpty());
+}
+
+TEST_P(SchemaUtilTest, SameNumberOfJoinablePropertiesCanMakeJoinIncompatible) {
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property1")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property1")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property2")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta delta = SchemaUtil::ComputeCompatibilityDelta(
+ old_schema, new_schema, /*new_schema_dependent_map=*/{});
+ EXPECT_THAT(delta.schema_types_incompatible, testing::IsEmpty());
+ EXPECT_THAT(delta.schema_types_index_incompatible, testing::IsEmpty());
+ EXPECT_THAT(delta.schema_types_deleted, testing::IsEmpty());
+ EXPECT_THAT(delta.schema_types_join_incompatible,
+ testing::ElementsAre(kEmailType));
+}
+
+TEST_P(SchemaUtilTest, ChangingIndexedStringPropertiesMakesIndexIncompatible) {
+ // Configure old schema
+ SchemaProto schema_with_indexed_property =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema
+ SchemaProto schema_with_unindexed_property =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN,
+ TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_index_incompatible.insert(kPersonType);
+
+ // New schema gained a new indexed string property.
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
+ schema_with_unindexed_property, schema_with_indexed_property,
+ no_dependents_map),
+ Eq(schema_delta));
+
+ // New schema lost an indexed string property.
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
+ schema_with_indexed_property, schema_with_unindexed_property,
+ no_dependents_map),
Eq(schema_delta));
}
-TEST_F(SchemaUtilTest, ChangingIndexedPropertiesMakesIndexIncompatible) {
+TEST_P(SchemaUtilTest, AddingNewIndexedStringPropertyMakesIndexIncompatible) {
// Configure old schema
- SchemaProto old_schema;
- auto old_type = old_schema.add_types();
- *old_type = CreateSchemaTypeConfig(kEmailType, kPersonType);
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
- auto old_property = old_type->add_properties();
- old_property->set_property_name("Property");
- old_property->set_data_type(PropertyConfigProto::DataType::STRING);
- old_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ // Configure new schema
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewIndexedProperty")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_index_incompatible.insert(kPersonType);
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ no_dependents_map),
+ Eq(schema_delta));
+}
+
+TEST_P(SchemaUtilTest,
+ AddingNewNonIndexedStringPropertyShouldRemainIndexCompatible) {
+ // Configure old schema
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
// Configure new schema
- SchemaProto new_schema;
- auto new_type = new_schema.add_types();
- *new_type = CreateSchemaTypeConfig(kEmailType, kPersonType);
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewProperty")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN,
+ TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
- auto new_property = new_type->add_properties();
- new_property->set_property_name("Property");
- new_property->set_data_type(PropertyConfigProto::DataType::STRING);
- new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ no_dependents_map)
+ .schema_types_index_incompatible,
+ IsEmpty());
+}
+
+TEST_P(SchemaUtilTest, ChangingIndexedIntegerPropertiesMakesIndexIncompatible) {
+ // Configure old schema
+ SchemaProto schema_with_indexed_property =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema
+ SchemaProto schema_with_unindexed_property =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
SchemaUtil::SchemaDelta schema_delta;
- schema_delta.index_incompatible = true;
-
- // New schema gained a new indexed property.
- old_property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::UNKNOWN);
- new_property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+ schema_delta.schema_types_index_incompatible.insert(kPersonType);
+
+ // New schema gained a new indexed integer property.
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
+ schema_with_unindexed_property, schema_with_indexed_property,
+ no_dependents_map),
Eq(schema_delta));
- // New schema lost an indexed property.
- old_property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- new_property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::UNKNOWN);
- EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+ // New schema lost an indexed integer property.
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
+ schema_with_indexed_property, schema_with_unindexed_property,
+ no_dependents_map),
Eq(schema_delta));
}
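+
+// The PropertyConfigBuilder shorthands used in this file, side by side. This
+// is a sketch of the call shapes that appear above, not the builder's full
+// API:
+//
+//   .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)   // indexed string
+//   .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)  // unindexed
+//   .SetDataTypeInt64(NUMERIC_MATCH_RANGE)                  // indexed int64
+//   .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN)                // unindexed int64
+//   .SetDataTypeJoinableString(JOINABLE_VALUE_TYPE_QUALIFIED_ID)  // joinable
+//   .SetDataTypeDocument(kEmailType, /*index_nested_properties=*/true)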
-TEST_F(SchemaUtilTest, AddingNewIndexedPropertyMakesIndexIncompatible) {
+TEST_P(SchemaUtilTest, AddingNewIndexedIntegerPropertyMakesIndexIncompatible) {
// Configure old schema
- SchemaProto old_schema;
- auto old_type = old_schema.add_types();
- *old_type = CreateSchemaTypeConfig(kEmailType, kPersonType);
-
- auto old_property = old_type->add_properties();
- old_property->set_property_name("Property");
- old_property->set_data_type(PropertyConfigProto::DataType::STRING);
- old_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
// Configure new schema
- SchemaProto new_schema;
- auto new_type = new_schema.add_types();
- *new_type = CreateSchemaTypeConfig(kEmailType, kPersonType);
-
- auto new_property = new_type->add_properties();
- new_property->set_property_name("Property");
- new_property->set_data_type(PropertyConfigProto::DataType::STRING);
- new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
-
- new_property = new_type->add_properties();
- new_property->set_property_name("NewIndexedProperty");
- new_property->set_data_type(PropertyConfigProto::DataType::STRING);
- new_property->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- new_property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewIndexedProperty")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
SchemaUtil::SchemaDelta schema_delta;
- schema_delta.index_incompatible = true;
- EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+ schema_delta.schema_types_index_incompatible.insert(kPersonType);
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ no_dependents_map),
Eq(schema_delta));
}
-TEST_F(SchemaUtilTest, AddingTypeIsCompatible) {
+TEST_P(SchemaUtilTest,
+ AddingNewNonIndexedIntegerPropertyShouldRemainIndexCompatible) {
+ // Configure old schema
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewProperty")
+ .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ no_dependents_map)
+ .schema_types_index_incompatible,
+ IsEmpty());
+}
+
+TEST_P(SchemaUtilTest,
+ AddingNewIndexedDocumentPropertyMakesIndexAndJoinIncompatible) {
+ SchemaTypeConfigProto nested_schema =
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+ // Configure old schema
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("NewEmailProperty")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_index_incompatible.insert(kPersonType);
+ schema_delta.schema_types_join_incompatible.insert(kPersonType);
+
+ SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}};
+ SchemaUtil::SchemaDelta result_schema_delta =
+ SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ dependents_map);
+ EXPECT_THAT(result_schema_delta, Eq(schema_delta));
+}
+
+TEST_P(
+ SchemaUtilTest,
+ AddingNewIndexedDocumentPropertyWithIndexableListMakesIndexAndJoinIncompatible) {
+ SchemaTypeConfigProto nested_schema =
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+ // Configure old schema
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema. The added nested document property is indexed, so
+ // this is both index and join incompatible
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("NewEmailProperty")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*indexable_nested_properties_list=*/
+ std::initializer_list<std::string>{"subject"})
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_index_incompatible.insert(kPersonType);
+ schema_delta.schema_types_join_incompatible.insert(kPersonType);
+
+ SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}};
+ SchemaUtil::SchemaDelta result_schema_delta =
+ SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ dependents_map);
+ EXPECT_THAT(result_schema_delta, Eq(schema_delta));
+}
+
+TEST_P(SchemaUtilTest,
+ AddingNewNonIndexedDocumentPropertyMakesJoinIncompatible) {
+ SchemaTypeConfigProto nested_schema =
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+ // Configure old schema
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema. The added nested document property is not indexed, so
+ // this is index compatible, but join incompatible
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewEmailProperty")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_join_incompatible.insert(kPersonType);
+
+ SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}};
+ SchemaUtil::SchemaDelta result_schema_delta =
+ SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ dependents_map);
+ EXPECT_THAT(result_schema_delta, Eq(schema_delta));
+}
+
+TEST_P(SchemaUtilTest, DeletingIndexedDocumentPropertyIsIncompatible) {
+ SchemaTypeConfigProto nested_schema =
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+  // Configure old schema with two nested document properties of the same type
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("EmailProperty")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("AnotherEmailProperty")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema and drop one of the nested document properties
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("EmailProperty")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_incompatible.insert(kPersonType);
+ schema_delta.schema_types_index_incompatible.insert(kPersonType);
+ schema_delta.schema_types_join_incompatible.insert(kPersonType);
+
+ SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}};
+ SchemaUtil::SchemaDelta result_schema_delta =
+ SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ dependents_map);
+ EXPECT_THAT(result_schema_delta, Eq(schema_delta));
+}
+
+TEST_P(SchemaUtilTest, DeletingNonIndexedDocumentPropertyIsIncompatible) {
+ SchemaTypeConfigProto nested_schema =
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+  // Configure old schema with two nested document properties of the same type
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("EmailProperty")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("AnotherEmailProperty")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema and drop the non-indexed nested document property
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("EmailProperty")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_incompatible.insert(kPersonType);
+ schema_delta.schema_types_join_incompatible.insert(kPersonType);
+
+ SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}};
+ SchemaUtil::SchemaDelta result_schema_delta =
+ SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ dependents_map);
+ EXPECT_THAT(result_schema_delta, Eq(schema_delta));
+}
+
+TEST_P(SchemaUtilTest, ChangingIndexedDocumentPropertyIsIncompatible) {
+ SchemaTypeConfigProto nested_schema =
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+  // Configure old schema with two nested document properties of the same type
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("EmailProperty")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("AnotherEmailProperty")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema and change one of the nested document properties
+ // to a different name (this is the same as deleting a property and adding
+ // another)
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("EmailProperty")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("DifferentEmailProperty")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_incompatible.insert(kPersonType);
+ schema_delta.schema_types_index_incompatible.insert(kPersonType);
+ schema_delta.schema_types_join_incompatible.insert(kPersonType);
+
+ SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}};
+ SchemaUtil::SchemaDelta result_schema_delta =
+ SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ dependents_map);
+ EXPECT_THAT(result_schema_delta, Eq(schema_delta));
+}
+
+TEST_P(SchemaUtilTest, ChangingNonIndexedDocumentPropertyIsIncompatible) {
+ SchemaTypeConfigProto nested_schema =
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+  // Configure old schema with two nested document properties of the same type
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("EmailProperty")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("AnotherEmailProperty")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema and change the non-indexed nested document property to
+ // a different name (this is the same as deleting a property and adding
+ // another)
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(nested_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("EmailProperty")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("DifferentEmailProperty")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_incompatible.insert(kPersonType);
+ schema_delta.schema_types_join_incompatible.insert(kPersonType);
+
+ SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}};
+ SchemaUtil::SchemaDelta result_schema_delta =
+ SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ dependents_map);
+ EXPECT_THAT(result_schema_delta, Eq(schema_delta));
+}
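+
+// Summary of the document-property cases covered above, as asserted by the
+// expected deltas (kPersonType is the affected type in every case):
+//
+//   change to the document property   | incompatible | index | join
+//   ----------------------------------+--------------+-------+------
+//   added, nested indexing on         |              |   x   |  x
+//   added, nested indexing off        |              |       |  x
+//   deleted or renamed, indexing on   |      x       |   x   |  x
+//   deleted or renamed, indexing off  |      x       |       |  x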
+
+TEST_P(SchemaUtilTest, ChangingJoinablePropertiesMakesJoinIncompatible) {
+ // Configure old schema
+ SchemaProto schema_with_joinable_property =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema
+ SchemaProto schema_with_non_joinable_property =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta expected_schema_delta;
+ expected_schema_delta.schema_types_join_incompatible.insert(kPersonType);
+
+ // New schema gained a new joinable property.
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
+ schema_with_non_joinable_property,
+ schema_with_joinable_property, no_dependents_map),
+ Eq(expected_schema_delta));
+
+ // New schema lost a joinable property.
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(
+ schema_with_joinable_property,
+ schema_with_non_joinable_property, no_dependents_map),
+ Eq(expected_schema_delta));
+}
+
+TEST_P(SchemaUtilTest, AddingNewJoinablePropertyMakesJoinIncompatible) {
+ // Configure old schema
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewJoinableProperty")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta expected_schema_delta;
+ expected_schema_delta.schema_types_join_incompatible.insert(kPersonType);
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ no_dependents_map),
+ Eq(expected_schema_delta));
+}
+
+TEST_P(SchemaUtilTest, AddingNewNonJoinablePropertyShouldRemainJoinCompatible) {
+ // Configure old schema
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("JoinableProperty")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ // Configure new schema
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("JoinableProperty")
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NewProperty")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ no_dependents_map)
+ .schema_types_join_incompatible,
+ IsEmpty());
+}
+
+TEST_P(SchemaUtilTest, AddingTypeIsCompatible) {
// Can add a new type; existing data isn't incompatible, since none of it
// is of this new schema type
- SchemaProto old_schema;
- auto type = old_schema.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
- SchemaProto new_schema;
- type = new_schema.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
- type = new_schema.add_types();
- *type = CreateSchemaTypeConfig(kPersonType);
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
SchemaUtil::SchemaDelta schema_delta;
- EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+ schema_delta.schema_types_new.insert(kEmailType);
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ no_dependents_map),
Eq(schema_delta));
}
-TEST_F(SchemaUtilTest, DeletingTypeIsNoted) {
+TEST_P(SchemaUtilTest, DeletingTypeIsNoted) {
// Can't remove an old type; the new schema needs to at least have all the
// previously defined schema types, otherwise the documents of the missing
// schema type are invalid
- SchemaProto old_schema;
- auto type = old_schema.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
- type = old_schema.add_types();
- *type = CreateSchemaTypeConfig(kPersonType);
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
- SchemaProto new_schema;
- type = new_schema.add_types();
- *type = CreateSchemaTypeConfig(kEmailType);
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
SchemaUtil::SchemaDelta schema_delta;
schema_delta.schema_types_deleted.emplace(kPersonType);
- EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema),
+ SchemaUtil::DependentMap no_dependents_map;
+ EXPECT_THAT(SchemaUtil::ComputeCompatibilityDelta(old_schema, new_schema,
+ no_dependents_map),
Eq(schema_delta));
}
-TEST_F(SchemaUtilTest, ValidateNoTokenizer) {
- SchemaProto schema;
- auto* type = schema.add_types();
- type->set_schema_type("MyType");
-
- auto* prop = type->add_properties();
- prop->set_property_name("Foo");
- prop->set_data_type(PropertyConfigProto::DataType::STRING);
- prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- prop->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- EXPECT_THAT(SchemaUtil::Validate(schema),
+TEST_P(SchemaUtilTest, DeletingPropertyAndChangingProperty) {
+ SchemaProto old_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property1")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property2")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ // Remove Property2 and make Property1 indexed now. Removing Property2 should
+ // be incompatible.
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("Property1")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_incompatible.emplace(kEmailType);
+ schema_delta.schema_types_index_incompatible.emplace(kEmailType);
+ SchemaUtil::DependentMap no_dependents_map;
+ SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta(
+ old_schema, new_schema, no_dependents_map);
+ EXPECT_THAT(actual, Eq(schema_delta));
+}
+
+TEST_P(SchemaUtilTest, IndexNestedDocumentsIndexIncompatible) {
+ // Make two schemas. One that sets index_nested_properties to false and one
+ // that sets it to true.
+ SchemaTypeConfigProto email_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto no_nested_index_schema =
+ SchemaBuilder()
+ .AddType(email_type_config)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ SchemaProto nested_index_schema =
+ SchemaBuilder()
+ .AddType(email_type_config)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ kEmailType, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ // Going from index_nested_properties=false to index_nested_properties=true
+ // should make kPersonType index_incompatible. kEmailType should be
+ // unaffected.
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_index_incompatible.emplace(kPersonType);
+ SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}};
+ SchemaUtil::SchemaDelta actual = SchemaUtil::ComputeCompatibilityDelta(
+ no_nested_index_schema, nested_index_schema, dependents_map);
+ EXPECT_THAT(actual, Eq(schema_delta));
+
+ // Going from index_nested_properties=true to index_nested_properties=false
+ // should also make kPersonType index_incompatible. kEmailType should be
+ // unaffected.
+ actual = SchemaUtil::ComputeCompatibilityDelta(
+ nested_index_schema, no_nested_index_schema, dependents_map);
+ EXPECT_THAT(actual, Eq(schema_delta));
+}
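+
+// SetDataTypeDocument appears in two forms in this file (a sketch of the
+// call shapes only): a boolean that toggles indexing of all nested
+// properties, and an explicit list of nested property paths to index.
+//
+//   .SetDataTypeDocument(kEmailType, /*index_nested_properties=*/true)
+//   .SetDataTypeDocument(kEmailType,
+//                        /*indexable_nested_properties_list=*/
+//                        {"recipient", "subject"})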
+
+TEST_P(SchemaUtilTest, AddOrDropIndexableNestedProperties_IndexIncompatible) {
+ SchemaTypeConfigProto email_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema_1 =
+ SchemaBuilder()
+ .AddType(email_type_config)
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*indexable_nested_properties_list=*/
+ {"recipient", "subject", "body"})
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ SchemaProto schema_2 =
+ SchemaBuilder()
+ .AddType(email_type_config)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ kEmailType,
+                                /*indexable_nested_properties_list=*/
+ {"recipient", "subject"})
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ // Dropping some indexable_nested_properties should make kPersonType
+ // index_incompatible. kEmailType should be unaffected.
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_index_incompatible.emplace(kPersonType);
+ SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}};
+ SchemaUtil::SchemaDelta actual =
+ SchemaUtil::ComputeCompatibilityDelta(schema_1, schema_2, dependents_map);
+ EXPECT_THAT(actual, Eq(schema_delta));
+
+ // Adding some indexable_nested_properties should also make kPersonType
+ // index_incompatible. kEmailType should be unaffected.
+ actual =
+ SchemaUtil::ComputeCompatibilityDelta(schema_2, schema_1, dependents_map);
+ EXPECT_THAT(actual, Eq(schema_delta));
+}
+
+TEST_P(SchemaUtilTest, ChangingIndexableNestedProperties_IndexIncompatible) {
+ SchemaTypeConfigProto email_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema_1 =
+ SchemaBuilder()
+ .AddType(email_type_config)
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*indexable_nested_properties_list=*/
+ {"recipient", "subject"})
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ SchemaProto schema_2 =
+ SchemaBuilder()
+ .AddType(email_type_config)
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*indexable_nested_properties_list=*/
+ {"recipient", "body"})
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ // Changing 'subject' to 'body' for indexable_nested_properties_list should
+ // make kPersonType index_incompatible. kEmailType should be unaffected.
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_index_incompatible.emplace(kPersonType);
+ SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}};
+ SchemaUtil::SchemaDelta actual =
+ SchemaUtil::ComputeCompatibilityDelta(schema_1, schema_2, dependents_map);
+ EXPECT_THAT(actual, Eq(schema_delta));
+}
+
+TEST_P(SchemaUtilTest, IndexableNestedPropertiesFullSet_IndexIncompatible) {
+ SchemaTypeConfigProto email_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema_1 =
+ SchemaBuilder()
+ .AddType(email_type_config)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ SchemaProto schema_2 =
+ SchemaBuilder()
+ .AddType(email_type_config)
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*indexable_nested_properties_list=*/
+ {"recipient", "body", "subject"})
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+  // This scenario currently also invalidates kPersonType and triggers an
+  // index rebuild, even though the effective set of indexable nested
+  // properties is identical in schema_1 and schema_2.
+ SchemaUtil::SchemaDelta schema_delta;
+ schema_delta.schema_types_index_incompatible.emplace(kPersonType);
+ SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}};
+ SchemaUtil::SchemaDelta actual =
+ SchemaUtil::ComputeCompatibilityDelta(schema_1, schema_2, dependents_map);
+ EXPECT_THAT(actual, Eq(schema_delta));
+}
+
+TEST_P(SchemaUtilTest,
+ ChangingIndexableNestedPropertiesOrder_IndexIsCompatible) {
+ SchemaTypeConfigProto email_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType(kEmailType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema_1 =
+ SchemaBuilder()
+ .AddType(email_type_config)
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*indexable_nested_properties_list=*/
+ {"recipient", "subject", "body"})
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ SchemaProto schema_2 =
+ SchemaBuilder()
+ .AddType(email_type_config)
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kPersonType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("emails")
+ .SetDataTypeDocument(
+ kEmailType,
+ /*indexable_nested_properties_list=*/
+ {"subject", "body", "recipient"})
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ // Changing order of elements in indexable_nested_properties_list should have
+ // no effect on schema compatibility.
+ SchemaUtil::SchemaDelta schema_delta;
+ SchemaUtil::DependentMap dependents_map = {{kEmailType, {{kPersonType, {}}}}};
+ SchemaUtil::SchemaDelta actual =
+ SchemaUtil::ComputeCompatibilityDelta(schema_1, schema_2, dependents_map);
+ EXPECT_THAT(actual, Eq(schema_delta));
+ EXPECT_THAT(actual.schema_types_index_incompatible, IsEmpty());
+}
+
+TEST_P(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTermMatchType) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ // Error if we don't set a term match type
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
- EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
+ // Passes once we set a term match type
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
}
-TEST_F(SchemaUtilTest, ValidateDocumentNoTokenizer) {
- SchemaProto schema;
- auto* type = schema.add_types();
- type->set_schema_type("OtherType");
+
+TEST_P(SchemaUtilTest, ValidateStringIndexingConfigShouldHaveTokenizer) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ // Error if we don't set a tokenizer type
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // Passes once we set a tokenizer type
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+}
+
+TEST_P(SchemaUtilTest,
+ ValidateJoinablePropertyTypeQualifiedIdShouldHaveStringDataType) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_INT64)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ // Error if data type is not STRING for qualified id joinable value type.
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // Passes once we set STRING as the data type.
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+}
- type = schema.add_types();
- type->set_schema_type("MyType");
+
+TEST_P(SchemaUtilTest,
+ ValidateJoinablePropertyShouldNotHaveRepeatedCardinality) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
- auto* prop = type->add_properties();
- prop->set_property_name("SubType");
- prop->set_schema_type("OtherType");
- prop->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- prop->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- prop->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- prop->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::NONE);
+
+  // Error if using REPEATED cardinality for a joinable property.
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+  // Passes once we use OPTIONAL cardinality with a joinable property.
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+
+  // Passes once we use REQUIRED cardinality with a joinable property.
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+
+  // Passes once we use REPEATED cardinality with a non-joinable property.
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_NONE,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+}
+
+TEST_P(SchemaUtilTest,
+ ValidateJoinablePropertyWithDeletePropagationShouldHaveTypeQualifiedId) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_NONE,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+  // Error if delete propagation is enabled with a non-qualified-id joinable
+  // value type.
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+  // Passes once we set the qualified id joinable value type with delete
+  // propagation enabled.
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/true)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+
+ // Passes once we disable delete propagation.
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("MyType").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_NONE,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+}
+
+TEST_P(SchemaUtilTest,
+ ValidateNestedJoinablePropertyShouldNotHaveNestedRepeatedCardinality) {
+ // Dependency and nested document property cardinality:
+ // "C" --(REPEATED)--> "B" --(OPTIONAL)--> "A"
+ // where "A" contains joinable property. This should not be allowed.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeDocument("B",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // Passes once we use non-REPEATED cardinality for "C.b", i.e. the dependency
+ // and nested document property cardinality becomes:
+ // "C" --(OPTIONAL)--> "B" --(OPTIONAL)--> "A"
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeDocument("B",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+}
+
+TEST_P(
+ SchemaUtilTest,
+ ValidateNestedJoinablePropertyShouldAllowRepeatedCardinalityIfNoJoinableProperty) {
+ // Dependency and nested document property cardinality:
+ // "C" --(OPTIONAL)--> "B" --(REPEATED)--> "A"
+ // where only "B" contains joinable property. This should be allowed.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_NONE,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument(
+ "A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Bar")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeDocument("B",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+  // Passes since the nested schema type reachable through the REPEATED edge
+  // ("A") doesn't have a joinable property.
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+}
+
+TEST_P(SchemaUtilTest,
+ ValidateNestedJoinablePropertyMultiplePropertiesWithSameSchema) {
+ // Dependency and nested document property cardinality:
+ // --(a1: OPTIONAL)--
+ // / \
+ // B -- --> A
+ // \ /
+ // --(a2: REPEATED)--
+ // where "A" contains joinable property. This should not be allowed.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("a1")
+ .SetDataTypeDocument(
+ "A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("a2")
+ .SetDataTypeDocument(
+ "A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // Passes once we use non-REPEATED cardinality for "B.a2", i.e. the dependency
+ // and nested document property cardinality becomes:
+ // --(a1: OPTIONAL)--
+ // / \
+ // B -- --> A
+ // \ /
+ // --(a2: OPTIONAL)--
+ schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("B")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("a1")
+ .SetDataTypeDocument(
+ "A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("a2")
+ .SetDataTypeDocument(
+ "A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+}
+
+TEST_P(SchemaUtilTest, ValidateNestedJoinablePropertyDiamondRelationship) {
+ // Dependency and nested document property cardinality:
+ // B
+ // / \
+ // (OPTIONAL) (OPTIONAL)
+ // / \
+ // D --- --> A
+ // \ /
+ // (OPTIONAL) (OPTIONAL)
+ // \ /
+ // C
+ // where "A" contains joinable property. This should be allowed.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeDocument(
+ "B",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("c")
+ .SetDataTypeDocument(
+ "C",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+
+  // Fails once we change any edge to REPEATED cardinality.
+ // B
+ // / \
+ // (REPEATED) (OPTIONAL)
+ // / \
+ // D --- --> A
+ // \ /
+ // (OPTIONAL) (OPTIONAL)
+ // \ /
+ // C
+ schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeDocument(
+ "B",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("c")
+ .SetDataTypeDocument(
+ "C",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // B
+ // / \
+ // (OPTIONAL) (REPEATED)
+ // / \
+ // D --- --> A
+ // \ /
+ // (OPTIONAL) (OPTIONAL)
+ // \ /
+ // C
+ schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeDocument(
+ "B",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("c")
+ .SetDataTypeDocument(
+ "C",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // B
+ // / \
+ // (OPTIONAL) (OPTIONAL)
+ // / \
+ // D --- --> A
+ // \ /
+ // (REPEATED) (OPTIONAL)
+ // \ /
+ // C
+ schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeDocument(
+ "B",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("c")
+ .SetDataTypeDocument(
+ "C",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // B
+ // / \
+ // (OPTIONAL) (OPTIONAL)
+ // / \
+ // D --- --> A
+ // \ /
+ // (OPTIONAL) (REPEATED)
+ // \ /
+ // C
+ schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("A").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("Foo")
+ .SetDataType(TYPE_STRING)
+ .SetJoinable(JOINABLE_VALUE_TYPE_QUALIFIED_ID,
+ /*propagate_delete=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("B").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("C").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("a")
+ .SetDataTypeDocument("A",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("D")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("b")
+ .SetDataTypeDocument(
+ "B",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("c")
+ .SetDataTypeDocument(
+ "C",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
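+
+// Taken together, the nested joinable-property tests above exercise a single
+// rule: a type that defines a joinable property must not be reachable through
+// any chain of nested document properties that contains a REPEATED edge, no
+// matter which edge of the chain is the repeated one.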
+
+TEST_P(SchemaUtilTest,
+ ValidDocumentIndexingConfigFields_emptyIndexableListBooleanTrue) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("InnerSchema")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN,
+ TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("OuterSchema")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("InnerProperty")
+ .SetDataTypeDocument(
+ "InnerSchema",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ SchemaTypeConfigProto* outerSchemaType = schema.mutable_types(1);
+ outerSchemaType->mutable_properties(0)
+ ->mutable_document_indexing_config()
+ ->clear_indexable_nested_properties_list();
+
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+}
+
+TEST_P(SchemaUtilTest,
+ ValidDocumentIndexingConfigFields_emptyIndexableListBooleanFalse) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("InnerSchema")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop2")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN,
+ TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("OuterSchema")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("InnerProperty")
+ .SetDataTypeDocument(
+ "InnerSchema",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ SchemaTypeConfigProto* outerSchemaType = schema.mutable_types(1);
+ outerSchemaType->mutable_properties(0)
+ ->mutable_document_indexing_config()
+ ->clear_indexable_nested_properties_list();
+
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+}
+
+TEST_P(SchemaUtilTest,
+ ValidDocumentIndexingConfigFields_nonEmptyIndexableList) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("InnerSchema")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("OuterSchema")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("InnerProperty")
+ .SetDataTypeDocument(
+ "InnerSchema",
+ /*indexable_nested_properties_list=*/
+ std::initializer_list<std::string>{"prop1"})
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ SchemaTypeConfigProto* outerSchemaType = schema.mutable_types(1);
+ outerSchemaType->mutable_properties(0)
+ ->mutable_document_indexing_config()
+ ->set_index_nested_properties(false);
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
+}
+
+TEST_P(SchemaUtilTest, InvalidDocumentIndexingConfigFields) {
+  // If indexable_nested_properties_list is non-empty, index_nested_properties
+  // is required to be false.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("InnerSchema")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("prop1")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("OuterSchema")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("InnerProperty")
+ .SetDataTypeDocument(
+ "InnerSchema",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ // Setting a non-empty indexable_nested_properties_list while
+ // index_nested_properties=true is invalid.
+ SchemaTypeConfigProto* outerSchemaType = schema.mutable_types(1);
+ outerSchemaType->mutable_properties(0)
+ ->mutable_document_indexing_config()
+ ->add_indexable_nested_properties_list("prop");
+
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_P(SchemaUtilTest, MultipleReferencesToSameNestedSchemaOk) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("InnerSchema"))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("OuterSchema")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("InnerProperty1")
+ .SetDataTypeDocument(
+ "InnerSchema",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("InnerProperty2")
+ .SetDataTypeDocument(
+ "InnerSchema",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
- EXPECT_THAT(SchemaUtil::Validate(schema), IsOk());
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()), IsOk());
}
+
+TEST_P(SchemaUtilTest, InvalidSelfReference) {
+ // Create a schema with a self-reference cycle in it: OwnSchema -> OwnSchema
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("OwnSchema")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NestedDocument")
+ .SetDataTypeDocument(
+ "OwnSchema",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest, InvalidSelfReferenceEvenWithOtherProperties) {
+ // Create a schema with a self-reference cycle in it: OwnSchema -> OwnSchema
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("OwnSchema")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NestedDocument")
+ .SetDataTypeDocument(
+ "OwnSchema",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("SomeString")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest, InvalidInfiniteLoopTwoDegrees) {
+  // Create the outer schema type
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ // Reference schema B, so far so good
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NestedDocument")
+ .SetDataTypeDocument(
+ "B", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ // Create the inner schema
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ // Reference the schema A, causing an invalid cycle of
+ // references.
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NestedDocument")
+ .SetDataTypeDocument(
+ "A", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ // Two degrees of referencing: A -> B -> A
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest, InvalidInfiniteLoopThreeDegrees) {
+ SchemaProto schema =
+ SchemaBuilder()
+          // Create the outer schema type
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ // Reference schema B, so far so good
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NestedDocument")
+ .SetDataTypeDocument(
+ "B", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ // Create the inner schema
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("B")
+ // Reference schema C, so far so good
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NestedDocument")
+ .SetDataTypeDocument(
+ "C", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+          // Create another inner schema
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("C")
+                  // Reference schema A, causing an invalid cycle of
+                  // references.
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("NestedDocument")
+ .SetDataTypeDocument(
+ "A", /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ // Three degrees of referencing: A -> B -> C -> A
+ EXPECT_THAT(SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Invalid cycle")));
+}
+
+TEST_P(SchemaUtilTest, ChildMissingOptionalAndRepeatedPropertiesNotOk) {
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder().SetType("B").AddParentType("A").Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+ EXPECT_THAT(
+ SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Property text is not present in child type")));
+}
+
+TEST_P(SchemaUtilTest, ChildMissingRequiredPropertyNotOk) {
+ SchemaTypeConfigProto type_a =
+ SchemaTypeConfigBuilder()
+ .SetType("A")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .Build();
+ SchemaTypeConfigProto type_b =
+ SchemaTypeConfigBuilder().SetType("B").AddParentType("A").Build();
+
+ SchemaProto schema = SchemaBuilder().AddType(type_a).AddType(type_b).Build();
+ EXPECT_THAT(
+ SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Property text is not present in child type")));
+}
+
+TEST_P(SchemaUtilTest, ChildCompatiblePropertyOk) {
+ SchemaTypeConfigProto message_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("person")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument(
+ "Person", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto artist_message_type =
+ SchemaTypeConfigBuilder()
+ .SetType("ArtistMessage")
+ .AddParentType("Message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ // OPTIONAL is compatible with REPEATED.
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+              // An extra text property is compatible.
+ PropertyConfigBuilder()
+ .SetName("extraText")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+              // An extra double property is compatible.
+ PropertyConfigBuilder()
+ .SetName("extraDouble")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataType(TYPE_DOUBLE))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("person")
+ // REQUIRED is compatible with OPTIONAL.
+ .SetCardinality(CARDINALITY_REQUIRED)
+ // Artist is compatible with Person.
+ .SetDataTypeDocument(
+ "Artist", /*index_nested_properties=*/true))
+ .Build();
+
+ SchemaTypeConfigProto person_type =
+ SchemaTypeConfigBuilder().SetType("Person").Build();
+ SchemaTypeConfigProto artist_type = SchemaTypeConfigBuilder()
+ .SetType("Artist")
+ .AddParentType("Person")
+ .Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(message_type)
+ .AddType(artist_message_type)
+ .AddType(person_type)
+ .AddType(artist_type)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(3));
+ EXPECT_THAT(d_map["Message"],
+ UnorderedElementsAre(Pair("ArtistMessage", IsEmpty())));
+ EXPECT_THAT(d_map["Person"],
+ UnorderedElementsAre(
+ Pair("Message", UnorderedElementsAre(Pointee(EqualsProto(
+ message_type.properties(1))))),
+ Pair("Artist", IsEmpty())));
+ EXPECT_THAT(d_map["Artist"],
+ UnorderedElementsAre(Pair(
+ "ArtistMessage", UnorderedElementsAre(Pointee(EqualsProto(
+ artist_message_type.properties(3)))))));
+}
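+
+// Note how the two kinds of dependency edges show up above: parent-child
+// edges (Message -> ArtistMessage, Person -> Artist) carry an empty property
+// set, while document-property edges (Person -> Message, Artist ->
+// ArtistMessage) carry pointers to the referencing PropertyConfigProto.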
+
+TEST_P(SchemaUtilTest, ChildIncompatibleCardinalityPropertyNotOk) {
+ SchemaTypeConfigProto message_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("person")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument(
+ "Person", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto artist_message_type =
+ SchemaTypeConfigBuilder()
+ .SetType("ArtistMessage")
+ .AddParentType("Message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("extraText")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("person")
+                           // Overwriting OPTIONAL with REPEATED is not ok.
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeDocument(
+ "Artist", /*index_nested_properties=*/true))
+ .Build();
+
+ SchemaTypeConfigProto person_type =
+ SchemaTypeConfigBuilder().SetType("Person").Build();
+ SchemaTypeConfigProto artist_type = SchemaTypeConfigBuilder()
+ .SetType("Artist")
+ .AddParentType("Person")
+ .Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(message_type)
+ .AddType(artist_message_type)
+ .AddType(person_type)
+ .AddType(artist_type)
+ .Build();
+ EXPECT_THAT(
+ SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Property person from child type ArtistMessage is not "
+ "compatible to the parent type Message.")));
+}
+
+TEST_P(SchemaUtilTest, ChildIncompatibleDataTypePropertyNotOk) {
+ SchemaTypeConfigProto message_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("person")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument(
+ "Person", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto artist_message_type =
+ SchemaTypeConfigBuilder()
+ .SetType("ArtistMessage")
+ .AddParentType("Message")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+                       // Double is not compatible with string.
+ .SetDataType(TYPE_DOUBLE))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("extraText")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("person")
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .SetDataTypeDocument(
+ "Artist", /*index_nested_properties=*/true))
+ .Build();
+
+ SchemaTypeConfigProto person_type =
+ SchemaTypeConfigBuilder().SetType("Person").Build();
+ SchemaTypeConfigProto artist_type = SchemaTypeConfigBuilder()
+ .SetType("Artist")
+ .AddParentType("Person")
+ .Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(message_type)
+ .AddType(artist_message_type)
+ .AddType(person_type)
+ .AddType(artist_type)
+ .Build();
+ EXPECT_THAT(
+ SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Property text from child type ArtistMessage is not "
+ "compatible to the parent type Message.")));
+}
+
+TEST_P(SchemaUtilTest, ChildIncompatibleDocumentTypePropertyNotOk) {
+ SchemaTypeConfigProto message_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("person")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeDocument(
+ "Person", /*index_nested_properties=*/true))
+ .Build();
+ SchemaTypeConfigProto artist_message_type =
+ SchemaTypeConfigBuilder()
+ .SetType("ArtistMessage")
+ .AddParentType("Message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("text")
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("extraText")
+ .SetCardinality(CARDINALITY_REPEATED)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("person")
+ .SetCardinality(CARDINALITY_REQUIRED)
+                  // Artist is not a subtype of Person, thus incompatible.
+ .SetDataTypeDocument("Artist",
+ /*index_nested_properties=*/true))
+ .Build();
+
+ SchemaTypeConfigProto person_type =
+ SchemaTypeConfigBuilder().SetType("Person").Build();
+ // In this test, Artist is not a subtype of Person.
+ SchemaTypeConfigProto artist_type =
+ SchemaTypeConfigBuilder().SetType("Artist").Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(message_type)
+ .AddType(artist_message_type)
+ .AddType(person_type)
+ .AddType(artist_type)
+ .Build();
+ EXPECT_THAT(
+ SchemaUtil::Validate(schema, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("Property person from child type ArtistMessage is not "
+ "compatible to the parent type Message.")));
+}
+
+TEST_P(SchemaUtilTest, ChildCompatibleMultipleParentPropertyOk) {
+ SchemaTypeConfigProto email_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaTypeConfigProto message_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("content")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaTypeConfigProto email_message_type =
+ SchemaTypeConfigBuilder()
+ .SetType("EmailMessage")
+ .AddParentType("Email")
+ .AddParentType("Message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("content")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(email_type)
+ .AddType(message_type)
+ .AddType(email_message_type)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaUtil::DependentMap d_map,
+ SchemaUtil::Validate(schema, GetParam()));
+ EXPECT_THAT(d_map, SizeIs(2));
+ EXPECT_THAT(d_map["Email"],
+ UnorderedElementsAre(Pair("EmailMessage", IsEmpty())));
+ EXPECT_THAT(d_map["Message"],
+ UnorderedElementsAre(Pair("EmailMessage", IsEmpty())));
+}
+
+TEST_P(SchemaUtilTest, ChildIncompatibleMultipleParentPropertyNotOk) {
+ SchemaTypeConfigProto email_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaTypeConfigProto message_type =
+ SchemaTypeConfigBuilder()
+ .SetType("Message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("content")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+
+ // Missing the "sender" field from parent "Email", thus incompatible.
+ SchemaTypeConfigProto email_message_type1 =
+ SchemaTypeConfigBuilder()
+ .SetType("EmailMessage")
+ .AddParentType("Email")
+ .AddParentType("Message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("content")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema1 = SchemaBuilder()
+ .AddType(email_type)
+ .AddType(message_type)
+ .AddType(email_message_type1)
+ .Build();
+ EXPECT_THAT(
+ SchemaUtil::Validate(schema1, GetParam()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr(
+ "Property sender is not present in child type EmailMessage, "
+ "but it is defined in the parent type Email.")));
+
+ // Missing the "content" field from parent "Message", thus incompatible.
+ SchemaTypeConfigProto email_message_type2 =
+ SchemaTypeConfigBuilder()
+ .SetType("EmailMessage")
+ .AddParentType("Email")
+ .AddParentType("Message")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("recipient")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema2 = SchemaBuilder()
+ .AddType(email_type)
+ .AddType(message_type)
+ .AddType(email_message_type2)
+ .Build();
+ EXPECT_THAT(
+ SchemaUtil::Validate(schema2, GetParam()),
+ StatusIs(
+ libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr(
+ "Property content is not present in child type EmailMessage, "
+ "but it is defined in the parent type Message.")));
+}
+
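+// Every SchemaUtilTest case above runs twice: GetParam() supplies
+// allow_circular_schema_definitions=true on one run and false on the other,
+// and the tests that call SchemaUtil::Validate() forward it as that argument.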
+INSTANTIATE_TEST_SUITE_P(
+ SchemaUtilTest, SchemaUtilTest,
+ testing::Values(/*allow_circular_schema_definitions=*/true, false));
+
+struct IsIndexedPropertyTestParam {
+ PropertyConfigProto property_config;
+ bool expected_result;
+
+ explicit IsIndexedPropertyTestParam(PropertyConfigProto property_config_in,
+ bool expected_result_in)
+ : property_config(std::move(property_config_in)),
+ expected_result(expected_result_in) {}
+};
+
+class SchemaUtilIsIndexedPropertyTest
+ : public ::testing::TestWithParam<IsIndexedPropertyTestParam> {};
+
+TEST_P(SchemaUtilIsIndexedPropertyTest, IsIndexedProperty) {
+ const IsIndexedPropertyTestParam& test_param = GetParam();
+ EXPECT_THAT(SchemaUtil::IsIndexedProperty(test_param.property_config),
+ Eq(test_param.expected_result));
+}
+
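+// The parameter table below encodes the expected rule: a string property is
+// indexed only when it has both a term match type other than UNKNOWN and a
+// tokenizer other than NONE; an int64 property is indexed only with
+// NUMERIC_MATCH_RANGE; and double, boolean, bytes, and document properties
+// are never indexed directly.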
+INSTANTIATE_TEST_SUITE_P(
+ SchemaUtilIsIndexedPropertyTest, SchemaUtilIsIndexedPropertyTest,
+ testing::Values(
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN,
+ TOKENIZER_NONE)
+ .Build(),
+ false),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN,
+ TOKENIZER_PLAIN)
+ .Build(),
+ false),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN,
+ TOKENIZER_VERBATIM)
+ .Build(),
+ false),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN,
+ TOKENIZER_RFC822)
+ .Build(),
+ false),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN,
+ TOKENIZER_URL)
+ .Build(),
+ false),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_NONE)
+ .Build(),
+ false),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .Build(),
+ true),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_VERBATIM)
+ .Build(),
+ true),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_RFC822)
+ .Build(),
+ true),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_URL)
+ .Build(),
+ true),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_NONE)
+ .Build(),
+ false),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .Build(),
+ true),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_VERBATIM)
+ .Build(),
+ true),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_RFC822)
+ .Build(),
+ true),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_URL)
+ .Build(),
+ true),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN)
+ .Build(),
+ false),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .Build(),
+ true),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_DOUBLE)
+ .Build(),
+ false),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_BOOLEAN)
+ .Build(),
+ false),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_BYTES)
+ .Build(),
+ false),
+ IsIndexedPropertyTestParam(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_DOCUMENT)
+ .Build(),
+ false)));
+
} // namespace
} // namespace lib
diff --git a/icing/schema/section-manager-builder_test.cc b/icing/schema/section-manager-builder_test.cc
new file mode 100644
index 0000000..1d452d5
--- /dev/null
+++ b/icing/schema/section-manager-builder_test.cc
@@ -0,0 +1,345 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/section-manager.h"
+#include "icing/store/dynamic-trie-key-mapper.h"
+#include "icing/store/key-mapper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::HasSubstr;
+using ::testing::IsEmpty;
+using ::testing::Pointee;
+
+class SectionManagerBuilderTest : public ::testing::Test {
+ protected:
+ void SetUp() override { test_dir_ = GetTestTempDir() + "/icing"; }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ Filesystem filesystem_;
+ std::string test_dir_;
+};
+
+TEST_F(SectionManagerBuilderTest, Build) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put("typeOne", 0));
+ ICING_ASSERT_OK(schema_type_mapper->Put("typeTwo", 1));
+
+ PropertyConfigProto prop_foo =
+ PropertyConfigBuilder()
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+ PropertyConfigProto prop_bar =
+ PropertyConfigBuilder()
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+ PropertyConfigProto prop_baz =
+ PropertyConfigBuilder()
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+
+ SectionManager::Builder builder(*schema_type_mapper);
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, prop_foo, /*property_path=*/"foo"));
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, prop_bar, /*property_path=*/"bar"));
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/1, prop_baz, /*property_path=*/"baz"));
+
+ std::unique_ptr<SectionManager> section_manager = std::move(builder).Build();
+ // Check "typeOne"
+ EXPECT_THAT(
+ section_manager->GetMetadataList("typeOne"),
+ IsOkAndHolds(Pointee(ElementsAre(
+ EqualsSectionMetadata(/*expected_id=*/0,
+ /*expected_property_path=*/"foo", prop_foo),
+ EqualsSectionMetadata(/*expected_id=*/1,
+ /*expected_property_path=*/"bar", prop_bar)))));
+ // Check "typeTwo"
+ EXPECT_THAT(section_manager->GetMetadataList("typeTwo"),
+ IsOkAndHolds(Pointee(ElementsAre(EqualsSectionMetadata(
+ /*expected_id=*/0,
+ /*expected_property_path=*/"baz", prop_baz)))));
+}
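+
+// As the expectations above show, section ids are assigned within each schema
+// type in the order its properties are processed, starting again from 0 for
+// each type.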
+
+TEST_F(SectionManagerBuilderTest, TooManyPropertiesShouldFail) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put("type", 0));
+
+ SectionManager::Builder builder(*schema_type_mapper);
+ // Add kTotalNumSections indexable properties
+ for (int i = 0; i < kTotalNumSections; i++) {
+ PropertyConfigProto property_config =
+ PropertyConfigBuilder()
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, property_config,
+ /*property_path=*/"property" + std::to_string(i)));
+ }
+
+ // Add another indexable property. This should fail.
+ PropertyConfigProto property_config =
+ PropertyConfigBuilder()
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+ EXPECT_THAT(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, property_config,
+ /*property_path=*/"propertyExceed"),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE,
+ HasSubstr("Too many properties")));
+}
+
+TEST_F(SectionManagerBuilderTest, InvalidSchemaTypeIdShouldFail) {
+ // Create a valid schema type mapper; the invalid (negative) schema type id
+ // is passed to the builder below.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put("type", 0));
+
+ PropertyConfigProto property_config =
+ PropertyConfigBuilder()
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+
+ SectionManager::Builder builder(*schema_type_mapper);
+ EXPECT_THAT(
+ builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/-1, property_config, /*property_path=*/"property"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SectionManagerBuilderTest,
+ SchemaTypeIdInconsistentWithSchemaTypeMapperSizeShouldFail) {
+ // Create a schema type mapper that contains schema type id = 2, even though
+ // the mapper's size is only 2. Since SectionManager::Builder expects the 2
+ // schema type ids to be [0, 1], processing a config with schema type id = 2
+ // should fail even though id = 2 is present in the schema type mapper.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put("typeOne", 0));
+ ICING_ASSERT_OK(schema_type_mapper->Put("typeTwo", 2));
+
+ PropertyConfigProto property_config =
+ PropertyConfigBuilder()
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+
+ SectionManager::Builder builder(*schema_type_mapper);
+ EXPECT_THAT(
+ builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/2, property_config, /*property_path=*/"property"),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+class IndexableSectionManagerBuilderTest
+ : public SectionManagerBuilderTest,
+ public ::testing::WithParamInterface<PropertyConfigProto> {};
+
+TEST_P(IndexableSectionManagerBuilderTest, Build) {
+ static constexpr std::string_view kSchemaType = "type";
+ static constexpr std::string_view kPropertyPath = "foo.bar";
+ const PropertyConfigProto& property_config = GetParam();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put(kSchemaType, 0));
+
+ SectionManager::Builder builder(*schema_type_mapper);
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, property_config, std::string(kPropertyPath)));
+
+ std::unique_ptr<SectionManager> section_manager = std::move(builder).Build();
+ EXPECT_THAT(section_manager->GetMetadataList(std::string(kSchemaType)),
+ IsOkAndHolds(Pointee(ElementsAre(EqualsSectionMetadata(
+ /*expected_id=*/0, kPropertyPath, property_config)))));
+}
+
+// The following types are considered indexable:
+// - String with valid TermMatchType and TokenizerType
+// - Int64 with valid NumericMatchType
+INSTANTIATE_TEST_SUITE_P(
+ IndexableSectionManagerBuilderTest, IndexableSectionManagerBuilderTest,
+ testing::Values(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_RFC822)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_VERBATIM)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_RFC822)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build()));
+
+class NonIndexableSectionManagerBuilderTest
+ : public SectionManagerBuilderTest,
+ public ::testing::WithParamInterface<PropertyConfigProto> {};
+
+TEST_P(NonIndexableSectionManagerBuilderTest, Build) {
+ static constexpr std::string_view kSchemaType = "type";
+ static constexpr std::string_view kPropertyPath = "foo.bar";
+ const PropertyConfigProto& property_config = GetParam();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_ + "/schema_type_mapper",
+ /*maximum_size_bytes=*/3 * 128 * 1024));
+ ICING_ASSERT_OK(schema_type_mapper->Put(kSchemaType, 0));
+
+ SectionManager::Builder builder(*schema_type_mapper);
+ ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
+ /*schema_type_id=*/0, property_config, std::string(kPropertyPath)));
+
+ // Non-indexable sections still consume a sectionId.
+ std::unique_ptr<SectionManager> section_manager = std::move(builder).Build();
+ EXPECT_THAT(section_manager->GetMetadataList(std::string(kSchemaType)),
+ IsOkAndHolds(Pointee(ElementsAre(EqualsSectionMetadata(
+ /*expected_id=*/0, kPropertyPath, property_config)))));
+}
+
+// The following types are considered non-indexable:
+// - String with TERM_MATCH_UNKNOWN or TOKENIZER_NONE
+// - Int64 with NUMERIC_MATCH_UNKNOWN
+// - Double
+// - Boolean
+// - Bytes
+// - Document
+INSTANTIATE_TEST_SUITE_P(
+ NonIndexableSectionManagerBuilderTest,
+ NonIndexableSectionManagerBuilderTest,
+ testing::Values(PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeInt64(NUMERIC_MATCH_UNKNOWN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_DOUBLE)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_BOOLEAN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataType(TYPE_BYTES)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeDocument("anotherSchema",
+ /*index_nested_properties=*/false)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build(),
+ PropertyConfigBuilder()
+ .SetName("property")
+ .SetDataTypeDocument("anotherSchema",
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build()));
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
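
A condensed sketch of the happy path these tests exercise (illustrative only,
not part of the patch; names mirror SectionManagerBuilderTest.Build above, and
error handling is elided):

    // Assumes schema_type_mapper already contains {"typeOne" -> 0} and that
    // prop_foo was built with PropertyConfigBuilder as in the test above.
    SectionManager::Builder builder(*schema_type_mapper);
    ICING_ASSERT_OK(builder.ProcessSchemaTypePropertyConfig(
        /*schema_type_id=*/0, prop_foo, /*property_path=*/"foo"));
    std::unique_ptr<SectionManager> section_manager =
        std::move(builder).Build();
    // "foo" received SectionId 0 (ids are assigned in processing order), so
    // it is retrievable via section_manager->GetMetadataList("typeOne").
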
diff --git a/icing/schema/section-manager.cc b/icing/schema/section-manager.cc
index 6a10c9a..3d540d6 100644
--- a/icing/schema/section-manager.cc
+++ b/icing/schema/section-manager.cc
@@ -15,27 +15,20 @@
#include "icing/schema/section-manager.h"
#include <algorithm>
-#include <cinttypes>
-#include <cstddef>
#include <cstdint>
-#include <iterator>
-#include <memory>
#include <string>
#include <string_view>
-#include <unordered_map>
-#include <unordered_set>
#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
-#include "icing/absl_ports/str_cat.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/term.pb.h"
-#include "icing/schema/schema-util.h"
+#include "icing/schema/property-util.h"
#include "icing/schema/section.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/key-mapper.h"
@@ -43,300 +36,85 @@
namespace icing {
namespace lib {
-namespace {
-
-using TypeSectionMap =
- std::unordered_map<std::string, const std::vector<SectionMetadata>>;
-
-// This state helps detect infinite loops (e.g. two type configs referencing
-// each other) when assigning sections. The combination of 'number of section
-// assigned' and 'current schema name' represents a unique state in the
-// section-assign process. If the same state is seen the second time, that means
-// an infinite loop.
-struct SectionAssigningState {
- size_t num_sections_assigned;
- std::string current_schema_name;
-
- SectionAssigningState(size_t num_sections_assigned_in,
- std::string&& current_schema_name_in)
- : num_sections_assigned(num_sections_assigned_in),
- current_schema_name(std::move(current_schema_name_in)) {}
-};
-
-// Provides a hash value of this struct so that it can be stored in a hash
-// set.
-struct SectionAssigningStateHasher {
- size_t operator()(const SectionAssigningState& state) const {
- size_t str_hash = std::hash<std::string>()(state.current_schema_name);
- size_t int_hash = std::hash<size_t>()(state.num_sections_assigned);
- // Combine the two hashes by taking the upper 16-bits of the string hash and
- // the lower 16-bits of the int hash.
- return (str_hash & 0xFFFF0000) | (int_hash & 0x0000FFFF);
- }
-};
-
-bool operator==(const SectionAssigningState& lhs,
- const SectionAssigningState& rhs) {
- return lhs.num_sections_assigned == rhs.num_sections_assigned &&
- lhs.current_schema_name == rhs.current_schema_name;
-}
-// Helper function to concatenate a path and a property name
-std::string ConcatenatePath(const std::string& path,
- const std::string& next_property_name) {
- if (path.empty()) {
- return next_property_name;
- }
- return absl_ports::StrCat(path, kPropertySeparator, next_property_name);
-}
+namespace {
-// Helper function to recursively identify sections from a type config and add
-// them to a section metadata list
-libtextclassifier3::Status AssignSections(
- const SchemaTypeConfigProto& type_config,
- const std::string& current_section_path,
- const SchemaUtil::TypeConfigMap& type_config_map,
- std::unordered_set<SectionAssigningState, SectionAssigningStateHasher>*
- visited_states,
- std::vector<SectionMetadata>* metadata_list) {
- if (!visited_states
- ->emplace(metadata_list->size(),
- std::string(type_config.schema_type()))
- .second) {
- // Failed to insert, the same state has been seen before, there's an
- // infinite loop in type configs
- return absl_ports::InvalidArgumentError(
- "Infinite loop detected in type configs");
+// Helper function to append a new SectionMetadata to the metadata list
+libtextclassifier3::Status AppendNewSectionMetadata(
+ std::vector<SectionMetadata>* metadata_list,
+ std::string&& concatenated_path,
+ const PropertyConfigProto& property_config) {
+ // Validates the next section id and makes sure that it is the same as the
+ // list index, so that any section metadata can be found by id in O(1) later.
+ SectionId new_section_id = static_cast<SectionId>(metadata_list->size());
+ if (!IsSectionIdValid(new_section_id)) {
+ // Max number of sections reached
+ return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
+ "Too many properties to be indexed, max number of properties "
+ "allowed: %d",
+ kMaxSectionId - kMinSectionId + 1));
}
- // Sorts properties by name's alphabetical order so that order doesn't affect
- // section assigning.
- auto sorted_properties = type_config.properties();
- std::sort(sorted_properties.pointer_begin(), sorted_properties.pointer_end(),
- [](const PropertyConfigProto* p1, const PropertyConfigProto* p2) {
- return p1->property_name() < p2->property_name();
- });
- for (const auto& property_config : sorted_properties) {
- if (property_config.indexing_config().term_match_type() ==
- TermMatchType::UNKNOWN) {
- // No need to create section for current property
- continue;
- }
-
- // Creates section metadata according to data type
- if (property_config.data_type() == PropertyConfigProto::DataType::STRING ||
- property_config.data_type() == PropertyConfigProto::DataType::INT64 ||
- property_config.data_type() == PropertyConfigProto::DataType::DOUBLE) {
- // Validates next section id, makes sure that section id is the same as
- // the list index so that we could find any section metadata by id in O(1)
- // later.
- auto new_section_id = static_cast<SectionId>(metadata_list->size());
- if (!IsSectionIdValid(new_section_id)) {
- // Max number of sections reached
- return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
- "Too many properties to be indexed, max number of properties "
- "allowed: %d",
- kMaxSectionId - kMinSectionId + 1));
- }
- // Creates section metadata from property config
- metadata_list->emplace_back(
- new_section_id, property_config.indexing_config().term_match_type(),
- property_config.indexing_config().tokenizer_type(),
- ConcatenatePath(current_section_path,
- property_config.property_name()));
- } else if (property_config.data_type() ==
- PropertyConfigProto::DataType::DOCUMENT) {
- // Tries to find sections recursively
- auto nested_type_config_iter =
- type_config_map.find(property_config.schema_type());
- if (nested_type_config_iter == type_config_map.end()) {
- return absl_ports::NotFoundError(absl_ports::StrCat(
- "type config not found: ", property_config.schema_type()));
- }
- const SchemaTypeConfigProto& nested_type_config =
- nested_type_config_iter->second;
- ICING_RETURN_IF_ERROR(
- AssignSections(nested_type_config,
- ConcatenatePath(current_section_path,
- property_config.property_name()),
- type_config_map, visited_states, metadata_list));
- }
- // NOTE: we don't create sections for BOOLEAN and BYTES data types.
- }
+ // Creates section metadata
+ metadata_list->push_back(SectionMetadata(
+ new_section_id, property_config.data_type(),
+ property_config.string_indexing_config().tokenizer_type(),
+ property_config.string_indexing_config().term_match_type(),
+ property_config.integer_indexing_config().numeric_match_type(),
+ std::move(concatenated_path)));
return libtextclassifier3::Status::OK;
}
-// Builds a vector of vectors that holds SectionMetadatas for all the schema
-// types. The outer vector's index corresponds with a type's SchemaTypeId. The
-// inner vector's index corresponds to the section's SectionId.
-libtextclassifier3::StatusOr<std::vector<std::vector<SectionMetadata>>>
-BuildSectionMetadataCache(const SchemaUtil::TypeConfigMap& type_config_map,
- const KeyMapper<SchemaTypeId>& schema_type_mapper) {
- // Create our vector and reserve the number of schema types we have
- std::vector<std::vector<SectionMetadata>> section_metadata_cache(
- schema_type_mapper.num_keys());
-
- std::unordered_set<SectionAssigningState, SectionAssigningStateHasher>
- visited_states;
- for (const auto& name_and_type : type_config_map) {
- // Assigns sections for each type config
- visited_states.clear();
- const std::string& type_config_name = name_and_type.first;
- const SchemaTypeConfigProto& type_config = name_and_type.second;
- std::vector<SectionMetadata> metadata_list;
- ICING_RETURN_IF_ERROR(
- AssignSections(type_config, /*current_section_path*/ "",
- type_config_map, &visited_states, &metadata_list));
-
- // Insert the section metadata list at the index of the type's SchemaTypeId
- ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
- schema_type_mapper.Get(type_config_name));
- section_metadata_cache[schema_type_id] = std::move(metadata_list);
+template <typename T>
+void AppendSection(
+ SectionMetadata section_metadata,
+ libtextclassifier3::StatusOr<std::vector<T>>&& section_content_or,
+ std::vector<Section<T>>& sections_out) {
+ if (!section_content_or.ok()) {
+ return;
}
- return section_metadata_cache;
-}
-// Helper function to get string content from a property. Repeated values are
-// joined into one string. We only care about STRING, INT64, and DOUBLE data
-// types.
-std::vector<std::string> GetPropertyContent(const PropertyProto& property) {
- std::vector<std::string> values;
- if (!property.string_values().empty()) {
- std::copy(property.string_values().begin(), property.string_values().end(),
- std::back_inserter(values));
- } else if (!property.int64_values().empty()) {
- std::transform(
- property.int64_values().begin(), property.int64_values().end(),
- std::back_inserter(values),
- [](int64_t i) { return IcingStringUtil::StringPrintf("%" PRId64, i); });
- } else {
- std::transform(
- property.double_values().begin(), property.double_values().end(),
- std::back_inserter(values),
- [](double d) { return IcingStringUtil::StringPrintf("%f", d); });
+ std::vector<T> section_content = std::move(section_content_or).ValueOrDie();
+ if (!section_content.empty()) {
+ // Adds to result vector if section is found in document
+ sections_out.emplace_back(std::move(section_metadata),
+ std::move(section_content));
}
- return values;
-}
-
-// Helper function to get metadata list of a type config
-libtextclassifier3::StatusOr<std::vector<SectionMetadata>> GetMetadataList(
- const KeyMapper<SchemaTypeId>& schema_type_mapper,
- const std::vector<std::vector<SectionMetadata>>& section_metadata_cache,
- const std::string& type_config_name) {
- ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
- schema_type_mapper.Get(type_config_name));
- return section_metadata_cache.at(schema_type_id);
}
} // namespace
-SectionManager::SectionManager(
- const KeyMapper<SchemaTypeId>* schema_type_mapper,
- std::vector<std::vector<SectionMetadata>>&& section_metadata_cache)
- : schema_type_mapper_(*schema_type_mapper),
- section_metadata_cache_(std::move(section_metadata_cache)) {}
-
-libtextclassifier3::StatusOr<std::unique_ptr<SectionManager>>
-SectionManager::Create(const SchemaUtil::TypeConfigMap& type_config_map,
- const KeyMapper<SchemaTypeId>* schema_type_mapper) {
- ICING_RETURN_ERROR_IF_NULL(schema_type_mapper);
-
- ICING_ASSIGN_OR_RETURN(
- std::vector<std::vector<SectionMetadata>> section_metadata_cache,
- BuildSectionMetadataCache(type_config_map, *schema_type_mapper));
- return std::unique_ptr<SectionManager>(new SectionManager(
- schema_type_mapper, std::move(section_metadata_cache)));
-}
-
-libtextclassifier3::StatusOr<std::vector<std::string>>
-SectionManager::GetSectionContent(const DocumentProto& document,
- std::string_view section_path) const {
- // Finds the first property name in section_path
- size_t separator_position = section_path.find(kPropertySeparator);
- std::string_view current_property_name =
- (separator_position == std::string::npos)
- ? section_path
- : section_path.substr(0, separator_position);
-
- // Tries to match the property name with the ones in document
- auto property_iterator =
- std::find_if(document.properties().begin(), document.properties().end(),
- [current_property_name](const PropertyProto& property) {
- return property.name() == current_property_name;
- });
-
- if (property_iterator == document.properties().end()) {
- // Property name not found, it could be one of the following 2 cases:
- // 1. The property is optional and it's not in the document
- // 2. The property name is invalid
- return absl_ports::NotFoundError(
- absl_ports::StrCat("Section path ", section_path,
- " not found in type config ", document.schema()));
- }
-
- if (separator_position == std::string::npos) {
- // Current property name is the last one in section path
- std::vector<std::string> content = GetPropertyContent(*property_iterator);
- if (content.empty()) {
- // The content of property is explicitly set to empty, we'll treat it as
- // NOT_FOUND because the index doesn't care about empty strings.
- return absl_ports::NotFoundError(
- absl_ports::StrCat("Section path ", section_path,
- " not found in type config ", document.schema()));
- }
- return content;
- }
-
- // Gets section content recursively
- std::string_view sub_section_path =
- section_path.substr(separator_position + 1);
- std::vector<std::string> nested_document_content;
- for (const auto& nested_document : property_iterator->document_values()) {
- auto content_or = GetSectionContent(nested_document, sub_section_path);
- if (content_or.ok()) {
- std::vector<std::string> content = std::move(content_or).ValueOrDie();
- std::move(content.begin(), content.end(),
- std::back_inserter(nested_document_content));
- }
+libtextclassifier3::Status
+SectionManager::Builder::ProcessSchemaTypePropertyConfig(
+ SchemaTypeId schema_type_id, const PropertyConfigProto& property_config,
+ std::string&& property_path) {
+ if (schema_type_id < 0 || schema_type_id >= section_metadata_cache_.size()) {
+ return absl_ports::InvalidArgumentError("Invalid schema type id");
}
- if (nested_document_content.empty()) {
- return absl_ports::NotFoundError(
- absl_ports::StrCat("Section path ", section_path,
- " not found in type config ", document.schema()));
- }
- return nested_document_content;
-}
-libtextclassifier3::StatusOr<std::vector<std::string>>
-SectionManager::GetSectionContent(const DocumentProto& document,
- SectionId section_id) const {
- if (!IsSectionIdValid(section_id)) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Section id %d is greater than the max value %d", section_id,
- kMaxSectionId));
- }
- ICING_ASSIGN_OR_RETURN(
- const std::vector<SectionMetadata>& metadata_list,
- GetMetadataList(schema_type_mapper_, section_metadata_cache_,
- document.schema()));
- if (section_id >= metadata_list.size()) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Section with id %d doesn't exist in type config %s", section_id,
- document.schema().c_str()));
- }
- // The index of metadata list is the same as the section id, so we can use
- // section id as the index.
- return GetSectionContent(document, metadata_list[section_id].path);
+ // We don't need to check whether the property is indexable. This method is
+ // only called for properties that should consume sectionIds, even if the
+ // property's indexing configuration itself is not indexable.
+ // This would be the case for unknown and non-indexable property paths that
+ // are defined in the indexable_nested_properties_list.
+ ICING_RETURN_IF_ERROR(
+ AppendNewSectionMetadata(&section_metadata_cache_[schema_type_id],
+ std::move(property_path), property_config));
+ return libtextclassifier3::Status::OK;
}
libtextclassifier3::StatusOr<const SectionMetadata*>
SectionManager::GetSectionMetadata(SchemaTypeId schema_type_id,
SectionId section_id) const {
+ if (schema_type_id < 0 || schema_type_id >= section_metadata_cache_.size()) {
+ return absl_ports::InvalidArgumentError("Invalid schema type id");
+ }
if (!IsSectionIdValid(section_id)) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Section id %d is greater than the max value %d", section_id,
kMaxSectionId));
}
+
const std::vector<SectionMetadata>& section_metadatas =
section_metadata_cache_[schema_type_id];
if (section_id >= section_metadatas.size()) {
@@ -350,23 +128,54 @@ SectionManager::GetSectionMetadata(SchemaTypeId schema_type_id,
return &section_metadatas[section_id];
}
-libtextclassifier3::StatusOr<std::vector<Section>>
-SectionManager::ExtractSections(const DocumentProto& document) const {
- ICING_ASSIGN_OR_RETURN(
- const std::vector<SectionMetadata>& metadata_list,
- GetMetadataList(schema_type_mapper_, section_metadata_cache_,
- document.schema()));
- std::vector<Section> sections;
- for (const auto& section_metadata : metadata_list) {
- auto section_content_or =
- GetSectionContent(document, section_metadata.path);
- // Adds to result vector if section is found in document
- if (section_content_or.ok()) {
- sections.emplace_back(SectionMetadata(section_metadata),
- std::move(section_content_or).ValueOrDie());
+libtextclassifier3::StatusOr<SectionGroup> SectionManager::ExtractSections(
+ const DocumentProto& document) const {
+ ICING_ASSIGN_OR_RETURN(const std::vector<SectionMetadata>* metadata_list,
+ GetMetadataList(document.schema()));
+ SectionGroup section_group;
+ for (const SectionMetadata& section_metadata : *metadata_list) {
+ switch (section_metadata.data_type) {
+ case PropertyConfigProto::DataType::STRING: {
+ if (section_metadata.term_match_type == TermMatchType::UNKNOWN ||
+ section_metadata.tokenizer ==
+ StringIndexingConfig::TokenizerType::NONE) {
+ // Skip if the term match type is UNKNOWN, or if the tokenizer type is
+ // NONE.
+ break;
+ }
+ AppendSection(
+ section_metadata,
+ property_util::ExtractPropertyValuesFromDocument<std::string_view>(
+ document, section_metadata.path),
+ section_group.string_sections);
+ break;
+ }
+ case PropertyConfigProto::DataType::INT64: {
+ if (section_metadata.numeric_match_type ==
+ IntegerIndexingConfig::NumericMatchType::UNKNOWN) {
+ // Skip if the numeric match type is UNKNOWN.
+ break;
+ }
+ AppendSection(section_metadata,
+ property_util::ExtractPropertyValuesFromDocument<int64_t>(
+ document, section_metadata.path),
+ section_group.integer_sections);
+ break;
+ }
+ default: {
+ // Skip other data types.
+ break;
+ }
}
}
- return sections;
+ return section_group;
+}
+
+libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
+SectionManager::GetMetadataList(const std::string& type_config_name) const {
+ ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
+ schema_type_mapper_.Get(type_config_name));
+ return &section_metadata_cache_.at(schema_type_id);
}
} // namespace lib
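
With ExtractSections now returning a SectionGroup rather than a flat vector,
callers iterate the per-type vectors separately. A minimal consumption sketch,
assuming the Section<T> shape used above (a SectionMetadata plus a content
vector); 'section_manager' and 'document' are assumed to be in scope and error
handling is elided:

    ICING_ASSIGN_OR_RETURN(SectionGroup section_group,
                           section_manager->ExtractSections(document));
    for (const Section<std::string_view>& section :
         section_group.string_sections) {
      // section.metadata.id and section.metadata.path identify the section;
      // section.content holds the extracted string values.
    }
    for (const Section<int64_t>& section : section_group.integer_sections) {
      // section.content holds the extracted int64 values, e.g. timestamps.
    }
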
diff --git a/icing/schema/section-manager.h b/icing/schema/section-manager.h
index 475fa6a..6241dc0 100644
--- a/icing/schema/section-manager.h
+++ b/icing/schema/section-manager.h
@@ -22,7 +22,6 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/proto/document.pb.h"
-#include "icing/schema/schema-util.h"
#include "icing/schema/section.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/key-mapper.h"
@@ -30,49 +29,55 @@
namespace icing {
namespace lib {
-inline constexpr char kPropertySeparator[] = ".";
-
// This class provides section-related operations. It assigns sections
// according to type configs and extracts sections from documents.
+// The actual instance is created together with JoinablePropertyManager, and
+// both of them are wrapped in SchemaTypeManager.
+//
+// Note: SectionManager assumes schema type ids are consecutive integers
+// starting from 0, so it maintains a vector with size
+// schema_type_mapper_->num_keys() that maps schema type id to a list (2nd level
+// vector) of SectionMetadatas. Therefore, all schema type ids stored in
+// schema_type_mapper_ must be in range [0, schema_type_mapper_->num_keys() - 1]
+// and unique.
class SectionManager {
public:
+ // Builder class to create a SectionManager. The SectionManager does not take
+ // ownership of any input components; all referenced objects must remain
+ // valid for the lifetime of the created SectionManager instance.
+ class Builder {
+ public:
+ explicit Builder(const KeyMapper<SchemaTypeId>& schema_type_mapper)
+ : schema_type_mapper_(schema_type_mapper),
+ section_metadata_cache_(schema_type_mapper.num_keys()) {}
+
+ // Checks the schema type id and appends a new SectionMetadata for the given
+ // property config. The property config does not have to be indexable; the
+ // caller decides which properties consume sectionIds.
+ //
+ // Returns:
+ // - OK on success
+ // - INVALID_ARGUMENT_ERROR if schema type id is invalid (not in range [0,
+ // schema_type_mapper_.num_keys() - 1])
+ // - OUT_OF_RANGE_ERROR if # of indexable properties in a single Schema
+ // exceeds the threshold (kTotalNumSections)
+ libtextclassifier3::Status ProcessSchemaTypePropertyConfig(
+ SchemaTypeId schema_type_id, const PropertyConfigProto& property_config,
+ std::string&& property_path);
+
+ // Builds and returns a SectionManager instance.
+ std::unique_ptr<SectionManager> Build() && {
+ return std::unique_ptr<SectionManager>(new SectionManager(
+ schema_type_mapper_, std::move(section_metadata_cache_)));
+ }
+
+ private:
+ const KeyMapper<SchemaTypeId>& schema_type_mapper_; // Does not own.
+ std::vector<std::vector<SectionMetadata>> section_metadata_cache_;
+ };
+
SectionManager(const SectionManager&) = delete;
SectionManager& operator=(const SectionManager&) = delete;
- // Factory function to create a SectionManager which does not take ownership
- // of any input components, and all pointers must refer to valid objects that
- // outlive the created SectionManager instance.
- //
- // Returns:
- // A SectionManager on success
- // FAILED_PRECONDITION on any null pointer input
- // INVALID_ARGUMENT if infinite loop detected in the type configs
- // OUT_OF_RANGE if number of properties need indexing exceeds the max number
- // NOT_FOUND if any type config name not found in the map
- static libtextclassifier3::StatusOr<std::unique_ptr<SectionManager>> Create(
- const SchemaUtil::TypeConfigMap& type_config_map,
- const KeyMapper<SchemaTypeId>* schema_type_mapper);
-
- // Finds content of a section by section path (e.g. property1.property2)
- //
- // Returns:
- // A string of content on success
- // NOT_FOUND if:
- // 1. Property is optional and not found in the document
- // 2. section_path is invalid
- // 3. Content is empty
- libtextclassifier3::StatusOr<std::vector<std::string>> GetSectionContent(
- const DocumentProto& document, std::string_view section_path) const;
-
- // Finds content of a section by id
- //
- // Returns:
- // A string of content on success
- // INVALID_ARGUMENT if section id is invalid
- // NOT_FOUND if type config name of document not found
- libtextclassifier3::StatusOr<std::vector<std::string>> GetSectionContent(
- const DocumentProto& document, SectionId section_id) const;
-
// Returns the SectionMetadata associated with the SectionId that's in the
// SchemaTypeId.
//
@@ -82,24 +87,34 @@ class SectionManager {
libtextclassifier3::StatusOr<const SectionMetadata*> GetSectionMetadata(
SchemaTypeId schema_type_id, SectionId section_id) const;
- // Extracts all sections from the given document, sections are sorted by
- // section id in increasing order. Section ids start from 0. Sections with
- // empty content won't be returned.
+ // Extracts all sections of different types from the given document and
+ // groups them by type.
+ // - Sections are sorted by section id in ascending order.
+ // - Section ids start from 0.
+ // - Sections with empty content won't be returned.
//
// Returns:
- // A list of sections on success
- // NOT_FOUND if type config name of document not found
- libtextclassifier3::StatusOr<std::vector<Section>> ExtractSections(
+ // A SectionGroup instance on success
+ // NOT_FOUND if the type config name of the document is not present in
+ // schema_type_mapper_
+ libtextclassifier3::StatusOr<SectionGroup> ExtractSections(
const DocumentProto& document) const;
+ // Returns:
+ // - On success, the section metadatas for the specified type
+ // - NOT_FOUND if the type config name is not present in schema_type_mapper_
+ libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
+ GetMetadataList(const std::string& type_config_name) const;
+
private:
- // Use SectionManager::Create() to instantiate
explicit SectionManager(
- const KeyMapper<SchemaTypeId>* schema_type_mapper,
- std::vector<std::vector<SectionMetadata>>&& section_metadata_cache);
+ const KeyMapper<SchemaTypeId>& schema_type_mapper,
+ std::vector<std::vector<SectionMetadata>>&& section_metadata_cache)
+ : schema_type_mapper_(schema_type_mapper),
+ section_metadata_cache_(std::move(section_metadata_cache)) {}
// Maps schema types to a densely-assigned unique id.
- const KeyMapper<SchemaTypeId>& schema_type_mapper_;
+ const KeyMapper<SchemaTypeId>& schema_type_mapper_; // Does not own
// The index of section_metadata_cache_ corresponds to a schema type's
// SchemaTypeId. At that SchemaTypeId index, we store an inner vector. The
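
To make the consecutive-id invariant above concrete, a hypothetical lookup
helper (illustrative only; it mirrors what GetSectionMetadata/GetMetadataList
do, assuming dense schema type ids in [0, num_keys() - 1] and a section_id
already validated against the inner vector's size):

    libtextclassifier3::StatusOr<const SectionMetadata*> LookupMetadata(
        const KeyMapper<SchemaTypeId>& schema_type_mapper,
        const std::vector<std::vector<SectionMetadata>>& cache,
        const std::string& type_config_name, SectionId section_id) {
      // Schema type ids double as indices into the outer vector, and section
      // ids double as indices into the inner vector, giving O(1) lookups.
      ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
                             schema_type_mapper.Get(type_config_name));
      return &cache[schema_type_id][section_id];
    }
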
diff --git a/icing/schema/section-manager_test.cc b/icing/schema/section-manager_test.cc
index 9e73465..eee78e9 100644
--- a/icing/schema/section-manager_test.cc
+++ b/icing/schema/section-manager_test.cc
@@ -14,432 +14,1012 @@
#include "icing/schema/section-manager.h"
-#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
-#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-type-manager.h"
#include "icing/schema/schema-util.h"
+#include "icing/store/dynamic-trie-key-mapper.h"
#include "icing/store/key-mapper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
namespace icing {
namespace lib {
+
+namespace {
+
using ::testing::ElementsAre;
-using ::testing::Eq;
-using ::testing::HasSubstr;
-
-// type and property names of EmailMessage
-constexpr char kTypeEmail[] = "EmailMessage";
-constexpr char kPropertySubject[] = "subject";
-constexpr char kPropertyText[] = "text";
-constexpr char kPropertyTimestamp[] = "timestamp";
-constexpr char kPropertyAttachment[] = "attachment";
-constexpr char kPropertyRecipients[] = "recipients";
+using ::testing::IsEmpty;
+using ::testing::Pointee;
+using ::testing::SizeIs;
+
+// type and property names of Email
+static constexpr std::string_view kTypeEmail = "Email";
+// indexable
+static constexpr std::string_view kPropertyRecipientIds = "recipientIds";
+static constexpr std::string_view kPropertyRecipients = "recipients";
+static constexpr std::string_view kPropertySubject = "subject";
+static constexpr std::string_view kPropertyTimestamp = "timestamp";
+// non-indexable
+static constexpr std::string_view kPropertyAttachment = "attachment";
+static constexpr std::string_view kPropertyNonIndexableInteger =
+ "nonIndexableInteger";
+static constexpr std::string_view kPropertyText = "text";
+
// type and property names of Conversation
-constexpr char kTypeConversation[] = "Conversation";
-constexpr char kPropertyName[] = "name";
-constexpr char kPropertyEmails[] = "emails";
+static constexpr std::string_view kTypeConversation = "Conversation";
+// indexable
+static constexpr std::string_view kPropertyEmails = "emails";
+static constexpr std::string_view kPropertyName = "name";
+
+// type and property names of Group
+static constexpr std::string_view kTypeGroup = "Group";
+// indexable
+static constexpr std::string_view kPropertyConversation = "conversation";
+static constexpr std::string_view kPropertyGroupName = "groupName";
+// nested indexable
+static constexpr std::string_view kPropertyNestedConversationName = "name";
+static constexpr std::string_view kPropertyNestedConversationEmailRecipientIds =
+ "emails.recipientIds";
+static constexpr std::string_view kPropertyNestedConversationEmailRecipient =
+ "emails.recipients";
+static constexpr std::string_view kPropertyNestedConversationEmailSubject =
+ "emails.subject";
+// nested non-indexable
+static constexpr std::string_view kPropertyNestedConversationEmailAttachment =
+ "emails.attachment";
+// non-existent property path
+static constexpr std::string_view kPropertyNestedNonExistent =
+ "emails.nonExistentNestedProperty";
+static constexpr std::string_view kPropertyNestedNonExistent2 =
+ "emails.nonExistentNestedProperty2";
+
+constexpr int64_t kDefaultTimestamp = 1663274901;
+
+PropertyConfigProto CreateRecipientIdsPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyRecipientIds)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REPEATED)
+ .Build();
+}
+
+PropertyConfigProto CreateRecipientsPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyRecipients)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED)
+ .Build();
+}
+
+PropertyConfigProto CreateSubjectPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertySubject)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+}
+
+PropertyConfigProto CreateTimestampPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyTimestamp)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REQUIRED)
+ .Build();
+}
+
+PropertyConfigProto CreateNamePropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyName)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+}
+
+PropertyConfigProto CreateAttachmentPropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyAttachment)
+ .SetDataType(TYPE_BYTES)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+}
+
+PropertyConfigProto CreateGroupNamePropertyConfig() {
+ return PropertyConfigBuilder()
+ .SetName(kPropertyGroupName)
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)
+ .Build();
+}
+
+SchemaTypeConfigProto CreateEmailTypeConfig() {
+ return SchemaTypeConfigBuilder()
+ .SetType(kTypeEmail)
+ .AddProperty(CreateSubjectPropertyConfig())
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyText)
+ .SetDataTypeString(TERM_MATCH_UNKNOWN, TOKENIZER_NONE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyAttachment)
+ .SetDataType(TYPE_BYTES)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(CreateRecipientsPropertyConfig())
+ .AddProperty(CreateRecipientIdsPropertyConfig())
+ .AddProperty(CreateTimestampPropertyConfig())
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyNonIndexableInteger)
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .Build();
+}
+
+SchemaTypeConfigProto CreateConversationTypeConfig() {
+ return SchemaTypeConfigBuilder()
+ .SetType(kTypeConversation)
+ .AddProperty(CreateNamePropertyConfig())
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyEmails)
+ .SetDataTypeDocument(kTypeEmail,
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .Build();
+}
+
+SchemaTypeConfigProto CreateGroupTypeConfig() {
+ return SchemaTypeConfigBuilder()
+ .SetType(kTypeGroup)
+ .AddProperty(CreateGroupNamePropertyConfig())
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyConversation)
+ .SetDataTypeDocument(
+ kTypeConversation,
+ /*indexable_nested_properties_list=*/
+ {std::string(kPropertyNestedConversationName),
+ std::string(kPropertyNestedConversationEmailRecipientIds),
+ std::string(kPropertyNestedConversationEmailSubject),
+ std::string(kPropertyNestedConversationEmailRecipient),
+ std::string(kPropertyNestedConversationEmailAttachment),
+ std::string(kPropertyNestedNonExistent2),
+ std::string(kPropertyNestedNonExistent),
+ std::string(kPropertyNestedNonExistent)})
+ .SetCardinality(CARDINALITY_REPEATED))
+ .Build();
+}
class SectionManagerTest : public ::testing::Test {
protected:
- SectionManagerTest() : test_dir_(GetTestTempDir() + "/icing") {
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/icing";
+
auto email_type = CreateEmailTypeConfig();
auto conversation_type = CreateConversationTypeConfig();
+ auto group_type = CreateGroupTypeConfig();
type_config_map_.emplace(email_type.schema_type(), email_type);
type_config_map_.emplace(conversation_type.schema_type(),
conversation_type);
+ type_config_map_.emplace(group_type.schema_type(), group_type);
+
+ // DynamicTrieKeyMapper uses 3 internal arrays for bookkeeping. Give each
+ // one 128KiB, so the total DynamicTrieKeyMapper should get 384KiB.
+ int key_mapper_size = 3 * 128 * 1024;
+ ICING_ASSERT_OK_AND_ASSIGN(schema_type_mapper_,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(
+ filesystem_, test_dir_, key_mapper_size));
+ ICING_ASSERT_OK(schema_type_mapper_->Put(kTypeEmail, 0));
+ ICING_ASSERT_OK(schema_type_mapper_->Put(kTypeConversation, 1));
+ ICING_ASSERT_OK(schema_type_mapper_->Put(kTypeGroup, 2));
email_document_ =
DocumentBuilder()
.SetKey("icing", "email/1")
- .SetSchema(kTypeEmail)
- .AddStringProperty(kPropertySubject, "the subject")
- .AddStringProperty(kPropertyText, "the text")
- .AddInt64Property(kPropertyTimestamp, 1234567890)
- .AddBytesProperty(kPropertyAttachment, "attachment bytes")
- .AddStringProperty(kPropertyRecipients, "recipient1", "recipient2",
- "recipient3")
+ .SetSchema(std::string(kTypeEmail))
+ .AddStringProperty(std::string(kPropertySubject), "the subject")
+ .AddStringProperty(std::string(kPropertyText), "the text")
+ .AddBytesProperty(std::string(kPropertyAttachment),
+ "attachment bytes")
+ .AddStringProperty(std::string(kPropertyRecipients), "recipient1",
+ "recipient2", "recipient3")
+ .AddInt64Property(std::string(kPropertyRecipientIds), 1, 2, 3)
+ .AddInt64Property(std::string(kPropertyTimestamp),
+ kDefaultTimestamp)
+ .AddInt64Property(std::string(kPropertyNonIndexableInteger), 100)
.Build();
conversation_document_ =
DocumentBuilder()
.SetKey("icing", "conversation/1")
- .SetSchema(kTypeConversation)
- .AddDocumentProperty(kPropertyEmails,
+ .SetSchema(std::string(kTypeConversation))
+ .AddDocumentProperty(std::string(kPropertyEmails),
DocumentProto(email_document_),
DocumentProto(email_document_))
.Build();
- }
- void SetUp() override {
- // KeyMapper uses 3 internal arrays for bookkeeping. Give each one 128KiB so
- // the total KeyMapper should get 384KiB
- int key_mapper_size = 3 * 128 * 1024;
- ICING_ASSERT_OK_AND_ASSIGN(schema_type_mapper_,
- KeyMapper<SchemaTypeId>::Create(
- filesystem_, test_dir_, key_mapper_size));
- ICING_ASSERT_OK(schema_type_mapper_->Put(kTypeEmail, 0));
- ICING_ASSERT_OK(schema_type_mapper_->Put(kTypeConversation, 1));
- }
-
- static SchemaTypeConfigProto CreateEmailTypeConfig() {
- SchemaTypeConfigProto type;
- type.set_schema_type(kTypeEmail);
-
- auto subject = type.add_properties();
- subject->set_property_name(kPropertySubject);
- subject->set_data_type(PropertyConfigProto::DataType::STRING);
- subject->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- subject->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- subject->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- auto text = type.add_properties();
- text->set_property_name(kPropertyText);
- text->set_data_type(PropertyConfigProto::DataType::STRING);
- text->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- text->mutable_indexing_config()->set_term_match_type(
- TermMatchType::UNKNOWN);
-
- auto timestamp = type.add_properties();
- timestamp->set_property_name(kPropertyTimestamp);
- timestamp->set_data_type(PropertyConfigProto::DataType::INT64);
- timestamp->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- timestamp->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- timestamp->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- auto attachment = type.add_properties();
- attachment->set_property_name(kPropertyAttachment);
- attachment->set_data_type(PropertyConfigProto::DataType::BYTES);
- attachment->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- attachment->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- attachment->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- auto recipients = type.add_properties();
- recipients->set_property_name(kPropertyRecipients);
- recipients->set_data_type(PropertyConfigProto::DataType::STRING);
- recipients->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
- recipients->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- recipients->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- return type;
+ group_document_ =
+ DocumentBuilder()
+ .SetKey("icing", "group/1")
+ .SetSchema(std::string(kTypeGroup))
+ .AddDocumentProperty(std::string(kPropertyConversation),
+ DocumentProto(conversation_document_))
+ .AddStringProperty(std::string(kPropertyGroupName), "group_name_1")
+ .Build();
}
- static SchemaTypeConfigProto CreateConversationTypeConfig() {
- SchemaTypeConfigProto type;
- type.set_schema_type(kTypeConversation);
-
- auto name = type.add_properties();
- name->set_property_name(kPropertyName);
- name->set_data_type(PropertyConfigProto::DataType::STRING);
- name->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- name->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
-
- auto emails = type.add_properties();
- emails->set_property_name(kPropertyEmails);
- emails->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- emails->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
- emails->set_schema_type(kTypeEmail);
- emails->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
-
- return type;
+ void TearDown() override {
+ schema_type_mapper_.reset();
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
}
Filesystem filesystem_;
- const std::string test_dir_;
+ std::string test_dir_;
SchemaUtil::TypeConfigMap type_config_map_;
std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper_;
DocumentProto email_document_;
DocumentProto conversation_document_;
+ DocumentProto group_document_;
};
-TEST_F(SectionManagerTest, CreationWithNullPointerShouldFail) {
- EXPECT_THAT(
- SectionManager::Create(type_config_map_, /*schema_type_mapper=*/nullptr),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
-}
+TEST_F(SectionManagerTest, ExtractSections) {
+ // Use SchemaTypeManager factory method to instantiate SectionManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+ // Extracts all sections from 'Email' document
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SectionGroup section_group,
+ schema_type_manager->section_manager().ExtractSections(email_document_));
-TEST_F(SectionManagerTest, CreationWithSchemaInfiniteLoopShouldFail) {
- // Creates 2 type configs that reference each other
- SchemaTypeConfigProto type_config1;
- type_config1.set_schema_type("type1");
- auto property1 = type_config1.add_properties();
- property1->set_property_name("property1");
- property1->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- property1->set_schema_type("type2"); // Here we reference type2
- property1->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property1->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
-
- SchemaTypeConfigProto type_config2;
- type_config2.set_schema_type("type2");
- auto property2 = type_config2.add_properties();
- property2->set_property_name("property2");
- property2->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- // Here we reference type1, which references type2 causing the infinite loop
- property2->set_schema_type("type1");
- property2->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property2->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ // String sections
+ EXPECT_THAT(section_group.string_sections, SizeIs(2));
- SchemaUtil::TypeConfigMap type_config_map;
- type_config_map.emplace("type1", type_config1);
- type_config_map.emplace("type2", type_config2);
+ EXPECT_THAT(section_group.string_sections[0].metadata,
+ EqualsSectionMetadata(/*expected_id=*/1,
+ /*expected_property_path=*/"recipients",
+ CreateRecipientsPropertyConfig()));
+ EXPECT_THAT(section_group.string_sections[0].content,
+ ElementsAre("recipient1", "recipient2", "recipient3"));
- EXPECT_THAT(
- SectionManager::Create(type_config_map, schema_type_mapper_.get()),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
- HasSubstr("Infinite loop detected")));
-}
+ EXPECT_THAT(section_group.string_sections[1].metadata,
+ EqualsSectionMetadata(/*expected_id=*/2,
+ /*expected_property_path=*/"subject",
+ CreateSubjectPropertyConfig()));
+ EXPECT_THAT(section_group.string_sections[1].content,
+ ElementsAre("the subject"));
-TEST_F(SectionManagerTest, CreationWithSchemaSelfReferenceShouldFail) {
- // Creates a type config that has a section and references to self.
- SchemaTypeConfigProto type_config;
- type_config.set_schema_type("type");
- auto property1 = type_config.add_properties();
- property1->set_property_name("property1");
- property1->set_data_type(PropertyConfigProto::DataType::STRING);
- property1->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property1->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- auto property2 = type_config.add_properties();
- property2->set_property_name("property2");
- property2->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- // Here we're referencing our own type, causing an infinite loop
- property2->set_schema_type("type");
- property2->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property2->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ // Integer sections
+ EXPECT_THAT(section_group.integer_sections, SizeIs(2));
- SchemaUtil::TypeConfigMap type_config_map;
- type_config_map.emplace("type", type_config);
+ EXPECT_THAT(section_group.integer_sections[0].metadata,
+ EqualsSectionMetadata(/*expected_id=*/0,
+ /*expected_property_path=*/"recipientIds",
+ CreateRecipientIdsPropertyConfig()));
+ EXPECT_THAT(section_group.integer_sections[0].content, ElementsAre(1, 2, 3));
- EXPECT_THAT(
- SectionManager::Create(type_config_map, schema_type_mapper_.get()),
- StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE,
- HasSubstr("Too many properties")));
+ EXPECT_THAT(section_group.integer_sections[1].metadata,
+ EqualsSectionMetadata(/*expected_id=*/3,
+ /*expected_property_path=*/"timestamp",
+ CreateTimestampPropertyConfig()));
+ EXPECT_THAT(section_group.integer_sections[1].content,
+ ElementsAre(kDefaultTimestamp));
}
-TEST_F(SectionManagerTest, CreationWithTooManyPropertiesShouldFail) {
- SchemaTypeConfigProto type_config;
- type_config.set_schema_type("type");
- // Adds more properties than allowed
- int max_num_sections_allowed = kMaxSectionId - kMinSectionId + 1;
- for (int i = 0; i < max_num_sections_allowed + 1; i++) {
- auto property = type_config.add_properties();
- property->set_property_name("property" + std::to_string(i));
- property->set_data_type(PropertyConfigProto::DataType::STRING);
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- }
+TEST_F(SectionManagerTest, ExtractSectionsNested) {
+ // Use SchemaTypeManager factory method to instantiate SectionManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
- SchemaUtil::TypeConfigMap type_config_map;
- type_config_map.emplace("type", type_config);
+ // Extracts all sections from 'Conversation' document
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SectionGroup section_group,
+ schema_type_manager->section_manager().ExtractSections(
+ conversation_document_));
+
+ // String sections
+ EXPECT_THAT(section_group.string_sections, SizeIs(2));
EXPECT_THAT(
- SectionManager::Create(type_config_map, schema_type_mapper_.get()),
- StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE,
- HasSubstr("Too many properties")));
-}
+ section_group.string_sections[0].metadata,
+ EqualsSectionMetadata(/*expected_id=*/1,
+ /*expected_property_path=*/"emails.recipients",
+ CreateRecipientsPropertyConfig()));
+ EXPECT_THAT(section_group.string_sections[0].content,
+ ElementsAre("recipient1", "recipient2", "recipient3",
+ "recipient1", "recipient2", "recipient3"));
-TEST_F(SectionManagerTest, CreationWithUnknownSchemaTypeNameShouldFail) {
- SchemaTypeConfigProto type_config;
- type_config.set_schema_type("type");
- auto property = type_config.add_properties();
- property->set_property_name("property");
- property->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- property->set_schema_type("unknown_name");
- property->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
- property->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
+ EXPECT_THAT(section_group.string_sections[1].metadata,
+ EqualsSectionMetadata(/*expected_id=*/2,
+ /*expected_property_path=*/"emails.subject",
+ CreateSubjectPropertyConfig()));
+ EXPECT_THAT(section_group.string_sections[1].content,
+ ElementsAre("the subject", "the subject"));
- SchemaUtil::TypeConfigMap type_config_map;
- type_config_map.emplace("type", type_config);
+ // Integer sections
+ EXPECT_THAT(section_group.integer_sections, SizeIs(2));
EXPECT_THAT(
- SectionManager::Create(type_config_map, schema_type_mapper_.get()),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
- HasSubstr("type config not found")));
+ section_group.integer_sections[0].metadata,
+ EqualsSectionMetadata(/*expected_id=*/0,
+ /*expected_property_path=*/"emails.recipientIds",
+ CreateRecipientIdsPropertyConfig()));
+ EXPECT_THAT(section_group.integer_sections[0].content,
+ ElementsAre(1, 2, 3, 1, 2, 3));
+
+ EXPECT_THAT(
+ section_group.integer_sections[1].metadata,
+ EqualsSectionMetadata(/*expected_id=*/3,
+ /*expected_property_path=*/"emails.timestamp",
+ CreateTimestampPropertyConfig()));
+ EXPECT_THAT(section_group.integer_sections[1].content,
+ ElementsAre(kDefaultTimestamp, kDefaultTimestamp));
}
-TEST_F(SectionManagerTest, GetSectionContent) {
+TEST_F(SectionManagerTest, ExtractSectionsIndexableNestedPropertiesList) {
+ // Use SchemaTypeManager factory method to instantiate SectionManager.
ICING_ASSERT_OK_AND_ASSIGN(
- auto section_manager,
- SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
-
- // Test simple section paths
- EXPECT_THAT(section_manager->GetSectionContent(email_document_,
- /*section_path*/ "subject"),
- IsOkAndHolds(ElementsAre("the subject")));
- EXPECT_THAT(section_manager->GetSectionContent(email_document_,
- /*section_path*/ "text"),
- IsOkAndHolds(ElementsAre("the text")));
-
- // Test repeated values, they are joined into one string
- ICING_ASSERT_OK_AND_ASSIGN(auto content, section_manager->GetSectionContent(
- email_document_,
- /*section_path*/ "recipients"));
- EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3"));
-
- // Test concatenated section paths: "property1.property2"
- ICING_ASSERT_OK_AND_ASSIGN(content, section_manager->GetSectionContent(
- conversation_document_,
- /*section_path*/ "emails.subject"));
- EXPECT_THAT(content, ElementsAre("the subject", "the subject"));
-
- ICING_ASSERT_OK_AND_ASSIGN(content, section_manager->GetSectionContent(
- conversation_document_,
- /*section_path*/ "emails.text"));
- EXPECT_THAT(content, ElementsAre("the text", "the text"));
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+ // Extracts all sections from 'Group' document
ICING_ASSERT_OK_AND_ASSIGN(
- content,
- section_manager->GetSectionContent(conversation_document_,
- /*section_path*/ "emails.recipients"));
- EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3",
- "recipient1", "recipient2", "recipient3"));
-
- // Test non-existing paths
- EXPECT_THAT(section_manager->GetSectionContent(email_document_,
- /*section_path*/ "name"),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- EXPECT_THAT(section_manager->GetSectionContent(email_document_,
- /*section_path*/ "invalid"),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- EXPECT_THAT(
- section_manager->GetSectionContent(conversation_document_,
- /*section_path*/ "emails.invalid"),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- // Test other data types
- // INT64
- EXPECT_THAT(section_manager->GetSectionContent(email_document_,
- /*section_path*/ "timestamp"),
- IsOkAndHolds(ElementsAre("1234567890")));
- // BYTES type can't be indexed, so content won't be returned
- EXPECT_THAT(section_manager->GetSectionContent(email_document_,
- /*section_path*/ "attachment"),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
-
- // The following tests are similar to the ones above but use section ids
- // instead of section paths
-
- // EmailMessage (section id -> section path):
- SectionId recipients_section_id = 0;
- SectionId subject_section_id = 1;
- SectionId timestamp_section_id = 2;
- SectionId invalid_email_section_id = 3;
+ SectionGroup section_group,
+ schema_type_manager->section_manager().ExtractSections(group_document_));
+
+ // SectionId assignments:
+ // 0 -> conversation.emails.attachment (bytes, non-indexable)
+ // 1 -> conversation.emails.recipientIds (int64)
+ // 2 -> conversation.emails.recipients (string)
+ // 3 -> conversation.emails.subject (string)
+ // 4 -> conversation.name
+ // (string, but no entry for this in conversation_document_)
+ // 5 -> groupName (string)
+ // 6 -> conversation.emails.nonExistentNestedProperty
+ // (unknown, non-indexable)
+ // 7 -> conversation.emails.nonExistentNestedProperty2
+ // (unknown, non-indexable)
+ //
+ // SectionId assignment order:
+ // - We assign section ids to known (existing) properties first, in
+ //   alphabetical order.
+ // - After handling all known properties, we assign section ids to all
+ //   unknown (non-existent) properties that are specified in the
+ //   indexable_nested_properties_list.
+ // - As a result, assignment across the entire section set is not
+ //   alphabetical, but it is still deterministic, and alphabetical order is
+ //   preserved within the known-property and unknown-property sets
+ //   individually. (A short sketch of this scheme follows this test.)
+ //
+ // 'conversation.emails.attachment',
+ // 'conversation.emails.nonExistentNestedProperty' and
+ // 'conversation.emails.nonExistentNestedProperty2' are assigned sectionIds
+ // even though they are non-indexable, because they appear in the 'Group'
+ // schema type's indexable_nested_properties_list.
+ // However, 'conversation.emails.attachment' does not appear in section_group
+ // (even though the property exists and has a sectionId assignment), because
+ // SectionManager::ExtractSections only extracts indexable string and integer
+ // section data from a document.
+
+ // String sections
+ EXPECT_THAT(section_group.string_sections, SizeIs(3));
+
+ EXPECT_THAT(section_group.string_sections[0].metadata,
+ EqualsSectionMetadata(
+ /*expected_id=*/2,
+ /*expected_property_path=*/"conversation.emails.recipients",
+ CreateRecipientsPropertyConfig()));
+ EXPECT_THAT(section_group.string_sections[0].content,
+ ElementsAre("recipient1", "recipient2", "recipient3",
+ "recipient1", "recipient2", "recipient3"));
+
+ EXPECT_THAT(section_group.string_sections[1].metadata,
+ EqualsSectionMetadata(
+ /*expected_id=*/3,
+ /*expected_property_path=*/"conversation.emails.subject",
+ CreateSubjectPropertyConfig()));
+ EXPECT_THAT(section_group.string_sections[1].content,
+ ElementsAre("the subject", "the subject"));
+
+ EXPECT_THAT(section_group.string_sections[2].metadata,
+ EqualsSectionMetadata(
+ /*expected_id=*/5,
+ /*expected_property_path=*/"groupName",
+ CreateGroupNamePropertyConfig()));
+ EXPECT_THAT(section_group.string_sections[2].content,
+ ElementsAre("group_name_1"));
+
+ // Integer sections
+ EXPECT_THAT(section_group.integer_sections, SizeIs(1));
+
+ EXPECT_THAT(section_group.integer_sections[0].metadata,
+ EqualsSectionMetadata(
+ /*expected_id=*/1,
+ /*expected_property_path=*/"conversation.emails.recipientIds",
+ CreateRecipientIdsPropertyConfig()));
+ EXPECT_THAT(section_group.integer_sections[0].content,
+ ElementsAre(1, 2, 3, 1, 2, 3));
+}
+
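The assignment order described above amounts to a two-pass sort. Below is a minimal illustrative sketch, not code from this change: 'known' and 'unknown' are hypothetical inputs holding property paths, and the index into the returned vector plays the role of the assigned SectionId.

#include <algorithm>
#include <string>
#include <vector>

// Sketch only: known (existing) properties take the lowest section ids, in
// alphabetical order; unknown properties from the
// indexable_nested_properties_list follow, also sorted among themselves.
std::vector<std::string> AssignSectionPaths(std::vector<std::string> known,
                                            std::vector<std::string> unknown) {
  std::sort(known.begin(), known.end());
  std::sort(unknown.begin(), unknown.end());
  known.insert(known.end(), unknown.begin(), unknown.end());
  return known;  // The index in this vector corresponds to the SectionId.
}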
+TEST_F(SectionManagerTest, GetSectionMetadata) {
+ // Use SchemaTypeManager factory method to instantiate SectionManager.
ICING_ASSERT_OK_AND_ASSIGN(
- content, section_manager->GetSectionContent(email_document_,
- recipients_section_id));
- EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3"));
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+
+ // Email (section id -> section property path):
+ // 0 -> recipientIds
+ // 1 -> recipients
+ // 2 -> subject
+ // 3 -> timestamp
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/0, /*section_id=*/0),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/0, /*expected_property_path=*/"recipientIds",
+ CreateRecipientIdsPropertyConfig()))));
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/0, /*section_id=*/1),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/1, /*expected_property_path=*/"recipients",
+ CreateRecipientsPropertyConfig()))));
+ // Conversation (section id -> section property path):
+ // 0 -> emails.recipientIds
+ // 1 -> emails.recipients
+ // 2 -> emails.subject
+ // 3 -> emails.timestamp
+ // 4 -> name
+ EXPECT_THAT(
+ schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/1, /*section_id=*/0),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/0, /*expected_property_path=*/"emails.recipientIds",
+ CreateRecipientIdsPropertyConfig()))));
+ EXPECT_THAT(
+ schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/1, /*section_id=*/1),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/1, /*expected_property_path=*/"emails.recipients",
+ CreateRecipientsPropertyConfig()))));
EXPECT_THAT(
- section_manager->GetSectionContent(email_document_, subject_section_id),
- IsOkAndHolds(ElementsAre("the subject")));
+ schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/1, /*section_id=*/2),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/2, /*expected_property_path=*/"emails.subject",
+ CreateSubjectPropertyConfig()))));
EXPECT_THAT(
- section_manager->GetSectionContent(email_document_, timestamp_section_id),
- IsOkAndHolds(ElementsAre("1234567890")));
+ schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/1, /*section_id=*/3),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/3, /*expected_property_path=*/"emails.timestamp",
+ CreateTimestampPropertyConfig()))));
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/1, /*section_id=*/4),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/4, /*expected_property_path=*/"name",
+ CreateNamePropertyConfig()))));
- EXPECT_THAT(section_manager->GetSectionContent(email_document_,
- invalid_email_section_id),
+ // Group (section id -> section property path):
+ // 0 -> conversation.emails.attachment (non-indexable)
+ // 1 -> conversation.emails.recipientIds
+ // 2 -> conversation.emails.recipients
+ // 3 -> conversation.emails.subject
+ // 4 -> conversation.name
+ // 5 -> groupName
+ // 6 -> conversation.emails.nonExistentNestedProperty (non-indexable)
+ // 7 -> conversation.emails.nonExistentNestedProperty2 (non-indexable)
+ //
+ // SectionId assignment order:
+ // - We assign section ids to known (existing) properties first, in
+ //   alphabetical order.
+ // - After handling all known properties, we assign section ids to all
+ //   unknown (non-existent) properties that are specified in the
+ //   indexable_nested_properties_list.
+ // - As a result, assignment across the entire section set is not
+ //   alphabetical, but it is still deterministic, and alphabetical order is
+ //   preserved within the known-property and unknown-property sets
+ //   individually.
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/2, /*section_id=*/0),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/0,
+ /*expected_property_path=*/"conversation.emails.attachment",
+ CreateAttachmentPropertyConfig()))));
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/2, /*section_id=*/1),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/1,
+ /*expected_property_path=*/"conversation.emails.recipientIds",
+ CreateRecipientIdsPropertyConfig()))));
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/2, /*section_id=*/2),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/2,
+ /*expected_property_path=*/"conversation.emails.recipients",
+ CreateRecipientsPropertyConfig()))));
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/2, /*section_id=*/3),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/3,
+ /*expected_property_path=*/"conversation.emails.subject",
+ CreateSubjectPropertyConfig()))));
+ EXPECT_THAT(
+ schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/2, /*section_id=*/4),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/4, /*expected_property_path=*/"conversation.name",
+ CreateNamePropertyConfig()))));
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/2, /*section_id=*/5),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/5, /*expected_property_path=*/"groupName",
+ CreateGroupNamePropertyConfig()))));
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/2, /*section_id=*/6),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/6,
+ /*expected_property_path=*/
+ "conversation.emails.nonExistentNestedProperty",
+ PropertyConfigBuilder()
+ .SetName("nonExistentNestedProperty")
+ .SetDataType(TYPE_UNKNOWN)
+ .Build()))));
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/2, /*section_id=*/7),
+ IsOkAndHolds(Pointee(EqualsSectionMetadata(
+ /*expected_id=*/7,
+ /*expected_property_path=*/
+ "conversation.emails.nonExistentNestedProperty2",
+ PropertyConfigBuilder()
+ .SetName("nonExistentNestedProperty2")
+ .SetDataType(TYPE_UNKNOWN)
+ .Build()))));
+ // Check that no more properties are indexed
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/2, /*section_id=*/8),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
- // Conversation (section id -> section path):
- // 0 -> emails.recipients
- // 1 -> emails.subject
- // 2 -> emails.timestamp
- // 3 -> name
- SectionId emails_recipients_section_id = 0;
- SectionId emails_subject_section_id = 1;
- SectionId emails_timestamp_section_id = 2;
- SectionId name_section_id = 3;
- SectionId invalid_conversation_section_id = 4;
+TEST_F(SectionManagerTest, GetSectionMetadataInvalidSchemaTypeId) {
+ // Use SchemaTypeManager factory method to instantiate SectionManager.
ICING_ASSERT_OK_AND_ASSIGN(
- content, section_manager->GetSectionContent(
- conversation_document_, emails_recipients_section_id));
- EXPECT_THAT(content, ElementsAre("recipient1", "recipient2", "recipient3",
- "recipient1", "recipient2", "recipient3"));
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
+ ASSERT_THAT(type_config_map_, SizeIs(3));
- ICING_ASSERT_OK_AND_ASSIGN(
- content, section_manager->GetSectionContent(conversation_document_,
- emails_subject_section_id));
- EXPECT_THAT(content, ElementsAre("the subject", "the subject"));
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/-1, /*section_id=*/0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/3, /*section_id=*/0),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+TEST_F(SectionManagerTest, GetSectionMetadataInvalidSectionId) {
+ // Use SchemaTypeManager factory method to instantiate SectionManager.
ICING_ASSERT_OK_AND_ASSIGN(
- content, section_manager->GetSectionContent(conversation_document_,
- emails_timestamp_section_id));
- EXPECT_THAT(content, ElementsAre("1234567890", "1234567890"));
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map_, schema_type_mapper_.get()));
- EXPECT_THAT(section_manager->GetSectionContent(conversation_document_,
- name_section_id),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ // Email (section id -> section property path):
+ // 0 -> recipientIds
+ // 1 -> recipients
+ // 2 -> subject
+ // 3 -> timestamp
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/0, /*section_id=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/0, /*section_id=*/4),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(section_manager->GetSectionContent(
- conversation_document_, invalid_conversation_section_id),
+ // Conversation (section id -> section property path):
+ // 0 -> emails.recipientIds
+ // 1 -> emails.recipients
+ // 2 -> emails.subject
+ // 3 -> emails.timestamp
+ // 4 -> name
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/1, /*section_id=*/-1),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(schema_type_manager->section_manager().GetSectionMetadata(
+ /*schema_type_id=*/1, /*section_id=*/5),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(SectionManagerTest, ExtractSections) {
+TEST_F(SectionManagerTest,
+ NonStringFieldsWithStringIndexingConfigDontCreateSections) {
+ // Create a schema for an empty document.
+ SchemaTypeConfigProto empty_type;
+ empty_type.set_schema_type("EmptySchema");
+
+ // Create a schema with all the non-string fields
+ SchemaTypeConfigProto type_with_non_string_properties;
+ type_with_non_string_properties.set_schema_type("Schema");
+
+ // Create an int property with a string_indexing_config
+ auto int_property = type_with_non_string_properties.add_properties();
+ int_property->set_property_name("int");
+ int_property->set_data_type(TYPE_INT64);
+ int_property->set_cardinality(CARDINALITY_REQUIRED);
+ int_property->mutable_string_indexing_config()->set_term_match_type(
+ TERM_MATCH_EXACT);
+ int_property->mutable_string_indexing_config()->set_tokenizer_type(
+ TOKENIZER_PLAIN);
+
+ // Create a double property with a string_indexing_config
+ auto double_property = type_with_non_string_properties.add_properties();
+ double_property->set_property_name("double");
+ double_property->set_data_type(TYPE_DOUBLE);
+ double_property->set_cardinality(CARDINALITY_REQUIRED);
+ double_property->mutable_string_indexing_config()->set_term_match_type(
+ TERM_MATCH_EXACT);
+ double_property->mutable_string_indexing_config()->set_tokenizer_type(
+ TOKENIZER_PLAIN);
+
+ // Create a boolean property with a string_indexing_config
+ auto boolean_property = type_with_non_string_properties.add_properties();
+ boolean_property->set_property_name("boolean");
+ boolean_property->set_data_type(TYPE_BOOLEAN);
+ boolean_property->set_cardinality(CARDINALITY_REQUIRED);
+ boolean_property->mutable_string_indexing_config()->set_term_match_type(
+ TERM_MATCH_EXACT);
+ boolean_property->mutable_string_indexing_config()->set_tokenizer_type(
+ TOKENIZER_PLAIN);
+
+ // Create a bytes property with a string_indexing_config
+ auto bytes_property = type_with_non_string_properties.add_properties();
+ bytes_property->set_property_name("bytes");
+ bytes_property->set_data_type(TYPE_BYTES);
+ bytes_property->set_cardinality(CARDINALITY_REQUIRED);
+ bytes_property->mutable_string_indexing_config()->set_term_match_type(
+ TERM_MATCH_EXACT);
+ bytes_property->mutable_string_indexing_config()->set_tokenizer_type(
+ TOKENIZER_PLAIN);
+
+ // Create a document property with a string_indexing_config
+ auto document_property = type_with_non_string_properties.add_properties();
+ document_property->set_property_name("document");
+ document_property->set_data_type(TYPE_DOCUMENT);
+ document_property->set_schema_type(empty_type.schema_type());
+ document_property->set_cardinality(CARDINALITY_REQUIRED);
+ document_property->mutable_string_indexing_config()->set_term_match_type(
+ TERM_MATCH_EXACT);
+ document_property->mutable_string_indexing_config()->set_tokenizer_type(
+ TOKENIZER_PLAIN);
+
+ // Set up classes to create the section manager
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace(type_with_non_string_properties.schema_type(),
+ type_with_non_string_properties);
+ type_config_map.emplace(empty_type.schema_type(), empty_type);
+
+ // DynamicTrieKeyMapper uses 3 internal arrays for bookkeeping. Give each one
+ // 128KiB, so the DynamicTrieKeyMapper gets 384KiB in total.
+ int key_mapper_size = 3 * 128 * 1024;
+ std::string dir = GetTestTempDir() + "/non_string_fields";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, dir,
+ key_mapper_size));
+ ICING_ASSERT_OK(schema_type_mapper->Put(
+ type_with_non_string_properties.schema_type(), /*schema_type_id=*/0));
+ ICING_ASSERT_OK(schema_type_mapper->Put(empty_type.schema_type(),
+ /*schema_type_id=*/1));
+
+ // Use SchemaTypeManager factory method to instantiate SectionManager.
ICING_ASSERT_OK_AND_ASSIGN(
- auto section_manager,
- SectionManager::Create(type_config_map_, schema_type_mapper_.get()));
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, schema_type_mapper.get()));
- // Extracts all sections from 'EmailMessage' document
- ICING_ASSERT_OK_AND_ASSIGN(auto sections,
- section_manager->ExtractSections(email_document_));
- EXPECT_THAT(sections.size(), Eq(3));
+ // Create an empty document to be nested
+ DocumentProto empty_document = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema(empty_type.schema_type())
+ .Build();
- EXPECT_THAT(sections[0].metadata.id, Eq(0));
- EXPECT_THAT(sections[0].metadata.path, Eq("recipients"));
- EXPECT_THAT(sections[0].content,
- ElementsAre("recipient1", "recipient2", "recipient3"));
+ // Create a document that follows "Schema"
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema(type_with_non_string_properties.schema_type())
+ .AddInt64Property("int", 1)
+ .AddDoubleProperty("double", 0.2)
+ .AddBooleanProperty("boolean", true)
+ .AddBytesProperty("bytes", "attachment bytes")
+ .AddDocumentProperty("document", empty_document)
+ .Build();
- EXPECT_THAT(sections[1].metadata.id, Eq(1));
- EXPECT_THAT(sections[1].metadata.path, Eq("subject"));
- EXPECT_THAT(sections[1].content, ElementsAre("the subject"));
+ // Extracts sections from 'Schema' document
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SectionGroup section_group,
+ schema_type_manager->section_manager().ExtractSections(document));
+ EXPECT_THAT(section_group.string_sections, IsEmpty());
+ EXPECT_THAT(section_group.integer_sections, IsEmpty());
+}
- EXPECT_THAT(sections[2].metadata.id, Eq(2));
- EXPECT_THAT(sections[2].metadata.path, Eq("timestamp"));
- EXPECT_THAT(sections[2].content, ElementsAre("1234567890"));
+TEST_F(SectionManagerTest,
+ NonIntegerFieldsWithIntegerIndexingConfigDontCreateSections) {
+ // Create a schema for an empty document.
+ SchemaTypeConfigProto empty_type;
+ empty_type.set_schema_type("EmptySchema");
- // Extracts all sections from 'Conversation' document
+ // Create a schema with all the non-integer fields
+ SchemaTypeConfigProto type_with_non_integer_properties;
+ type_with_non_integer_properties.set_schema_type("Schema");
+
+ // Create a string property with an integer_indexing_config
+ auto string_property = type_with_non_integer_properties.add_properties();
+ string_property->set_property_name("string");
+ string_property->set_data_type(TYPE_STRING);
+ string_property->set_cardinality(CARDINALITY_REQUIRED);
+ string_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
+
+ // Create a double property with an integer_indexing_config
+ auto double_property = type_with_non_integer_properties.add_properties();
+ double_property->set_property_name("double");
+ double_property->set_data_type(TYPE_DOUBLE);
+ double_property->set_cardinality(CARDINALITY_REQUIRED);
+ double_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
+
+ // Create a boolean property with an integer_indexing_config
+ auto boolean_property = type_with_non_integer_properties.add_properties();
+ boolean_property->set_property_name("boolean");
+ boolean_property->set_data_type(TYPE_BOOLEAN);
+ boolean_property->set_cardinality(CARDINALITY_REQUIRED);
+ boolean_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
+
+ // Create a bytes property with an integer_indexing_config
+ auto bytes_property = type_with_non_integer_properties.add_properties();
+ bytes_property->set_property_name("bytes");
+ bytes_property->set_data_type(TYPE_BYTES);
+ bytes_property->set_cardinality(CARDINALITY_REQUIRED);
+ bytes_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
+
+ // Create a document property with an integer_indexing_config
+ auto document_property = type_with_non_integer_properties.add_properties();
+ document_property->set_property_name("document");
+ document_property->set_data_type(TYPE_DOCUMENT);
+ document_property->set_schema_type(empty_type.schema_type());
+ document_property->set_cardinality(CARDINALITY_REQUIRED);
+ document_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
+
+ // Set up classes to create the section manager
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace(type_with_non_integer_properties.schema_type(),
+ type_with_non_integer_properties);
+ type_config_map.emplace(empty_type.schema_type(), empty_type);
+
+ // DynamicTrieKeyMapper uses 3 internal arrays for bookkeeping. Give each one
+ // 128KiB, so the DynamicTrieKeyMapper gets 384KiB in total.
+ int key_mapper_size = 3 * 128 * 1024;
+ std::string dir = GetTestTempDir() + "/non_integer_fields";
ICING_ASSERT_OK_AND_ASSIGN(
- sections, section_manager->ExtractSections(conversation_document_));
- EXPECT_THAT(sections.size(), Eq(3));
-
- // Section id 3 (name) not found in document, so the first section id found
- // is 1 below.
- EXPECT_THAT(sections[0].metadata.id, Eq(0));
- EXPECT_THAT(sections[0].metadata.path, Eq("emails.recipients"));
- EXPECT_THAT(sections[0].content,
- ElementsAre("recipient1", "recipient2", "recipient3",
- "recipient1", "recipient2", "recipient3"));
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, dir,
+ key_mapper_size));
+ ICING_ASSERT_OK(schema_type_mapper->Put(
+ type_with_non_integer_properties.schema_type(), /*schema_type_id=*/0));
+ ICING_ASSERT_OK(schema_type_mapper->Put(empty_type.schema_type(),
+ /*schema_type_id=*/1));
- EXPECT_THAT(sections[1].metadata.id, Eq(1));
- EXPECT_THAT(sections[1].metadata.path, Eq("emails.subject"));
- EXPECT_THAT(sections[1].content, ElementsAre("the subject", "the subject"));
+ // Use SchemaTypeManager factory method to instantiate SectionManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, schema_type_mapper.get()));
+
+ // Create an empty document to be nested
+ DocumentProto empty_document = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema(empty_type.schema_type())
+ .Build();
+
+ // Create a document that follows "Schema"
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema(type_with_non_integer_properties.schema_type())
+ .AddStringProperty("string", "abc")
+ .AddDoubleProperty("double", 0.2)
+ .AddBooleanProperty("boolean", true)
+ .AddBytesProperty("bytes", "attachment bytes")
+ .AddDocumentProperty("document", empty_document)
+ .Build();
- EXPECT_THAT(sections[2].metadata.id, Eq(2));
- EXPECT_THAT(sections[2].metadata.path, Eq("emails.timestamp"));
- EXPECT_THAT(sections[2].content, ElementsAre("1234567890", "1234567890"));
+ // Extracts sections from 'Schema' document
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SectionGroup section_group,
+ schema_type_manager->section_manager().ExtractSections(document));
+ EXPECT_THAT(section_group.string_sections, IsEmpty());
+ EXPECT_THAT(section_group.integer_sections, IsEmpty());
}
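Taken together, this test and the previous one pin down one rule: an indexing config only yields a section when it matches the property's data type. A hypothetical distillation of that rule follows; the real check lives inside SectionManager and is not shown in this diff.

// Sketch only: whether a property config can produce a section at all.
bool CanCreateSection(const PropertyConfigProto& property) {
  switch (property.data_type()) {
    case PropertyConfigProto::DataType::STRING:
      // String sections require a usable string_indexing_config.
      return property.string_indexing_config().term_match_type() !=
             TermMatchType::UNKNOWN;
    case PropertyConfigProto::DataType::INT64:
      // Integer sections require a usable integer_indexing_config.
      return property.integer_indexing_config().numeric_match_type() !=
             IntegerIndexingConfig::NumericMatchType::UNKNOWN;
    default:
      // double, boolean, bytes and document values never become string or
      // integer sections, no matter which indexing config they carry.
      return false;
  }
}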
+TEST_F(SectionManagerTest, AssignSectionsRecursivelyForDocumentFields) {
+ // Create the inner schema type that the document property refers to.
+ SchemaTypeConfigProto document_type;
+ document_type.set_schema_type("DocumentSchema");
+
+ auto string_property = document_type.add_properties();
+ string_property->set_property_name("string");
+ string_property->set_data_type(TYPE_STRING);
+ string_property->set_cardinality(CARDINALITY_REQUIRED);
+ string_property->mutable_string_indexing_config()->set_term_match_type(
+ TERM_MATCH_EXACT);
+ string_property->mutable_string_indexing_config()->set_tokenizer_type(
+ TOKENIZER_PLAIN);
+
+ auto integer_property = document_type.add_properties();
+ integer_property->set_property_name("integer");
+ integer_property->set_data_type(TYPE_INT64);
+ integer_property->set_cardinality(CARDINALITY_REQUIRED);
+ integer_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
+
+ // Create the outer schema which has the document property.
+ SchemaTypeConfigProto type;
+ type.set_schema_type("Schema");
+
+ auto document_property = type.add_properties();
+ document_property->set_property_name("document");
+ document_property->set_data_type(TYPE_DOCUMENT);
+ document_property->set_schema_type(document_type.schema_type());
+ document_property->set_cardinality(CARDINALITY_REQUIRED);
+
+ // Opt into recursing into the document fields.
+ document_property->mutable_document_indexing_config()
+ ->set_index_nested_properties(true);
+
+ // Create the inner document.
+ DocumentProto inner_document = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema(document_type.schema_type())
+ .AddStringProperty("string", "foo")
+ .AddInt64Property("integer", 123)
+ .Build();
+
+ // Create the outer document that holds the inner document
+ DocumentProto outer_document =
+ DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema(type.schema_type())
+ .AddDocumentProperty("document", inner_document)
+ .Build();
+
+ // Set up classes to create the section manager
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace(type.schema_type(), type);
+ type_config_map.emplace(document_type.schema_type(), document_type);
+
+ // DynamicTrieKeyMapper uses 3 internal arrays for bookkeeping. Give each one
+ // 128KiB, so the DynamicTrieKeyMapper gets 384KiB in total.
+ int key_mapper_size = 3 * 128 * 1024;
+ std::string dir = GetTestTempDir() + "/recurse_into_document";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, dir,
+ key_mapper_size));
+ int type_schema_type_id = 0;
+ int document_type_schema_type_id = 1;
+ ICING_ASSERT_OK(
+ schema_type_mapper->Put(type.schema_type(), type_schema_type_id));
+ ICING_ASSERT_OK(schema_type_mapper->Put(document_type.schema_type(),
+ document_type_schema_type_id));
+
+ // Use SchemaTypeManager factory method to instantiate SectionManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, schema_type_mapper.get()));
+
+ // Extracts sections from 'Schema' document; there should be 1 string
+ // section and 1 integer section from the nested document.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SectionGroup section_group,
+ schema_type_manager->section_manager().ExtractSections(outer_document));
+ EXPECT_THAT(section_group.string_sections, SizeIs(1));
+ EXPECT_THAT(section_group.integer_sections, SizeIs(1));
+}
+
+TEST_F(SectionManagerTest, DontAssignSectionsRecursivelyForDocumentFields) {
+ // Create the inner schema type that the document property refers to.
+ SchemaTypeConfigProto document_type;
+ document_type.set_schema_type("DocumentSchema");
+
+ auto string_property = document_type.add_properties();
+ string_property->set_property_name("string");
+ string_property->set_data_type(TYPE_STRING);
+ string_property->set_cardinality(CARDINALITY_REQUIRED);
+ string_property->mutable_string_indexing_config()->set_term_match_type(
+ TERM_MATCH_EXACT);
+ string_property->mutable_string_indexing_config()->set_tokenizer_type(
+ TOKENIZER_PLAIN);
+
+ auto integer_property = document_type.add_properties();
+ integer_property->set_property_name("integer");
+ integer_property->set_data_type(TYPE_INT64);
+ integer_property->set_cardinality(CARDINALITY_REQUIRED);
+ integer_property->mutable_integer_indexing_config()->set_numeric_match_type(
+ NUMERIC_MATCH_RANGE);
+
+ // Create the outer schema which has the document property.
+ SchemaTypeConfigProto type;
+ type.set_schema_type("Schema");
+
+ auto document_property = type.add_properties();
+ document_property->set_property_name("document");
+ document_property->set_data_type(TYPE_DOCUMENT);
+ document_property->set_schema_type(document_type.schema_type());
+ document_property->set_cardinality(CARDINALITY_REQUIRED);
+
+ // Opt out of recursing into the document fields.
+ document_property->mutable_document_indexing_config()
+ ->set_index_nested_properties(false);
+
+ // Create the inner document.
+ DocumentProto inner_document = DocumentBuilder()
+ .SetKey("icing", "uri1")
+ .SetSchema(document_type.schema_type())
+ .AddStringProperty("string", "foo")
+ .AddInt64Property("integer", 123)
+ .Build();
+
+ // Create the outer document that holds the inner document
+ DocumentProto outer_document =
+ DocumentBuilder()
+ .SetKey("icing", "uri2")
+ .SetSchema(type.schema_type())
+ .AddDocumentProperty("document", inner_document)
+ .Build();
+
+ // Set up classes to create the section manager
+ SchemaUtil::TypeConfigMap type_config_map;
+ type_config_map.emplace(type.schema_type(), type);
+ type_config_map.emplace(document_type.schema_type(), document_type);
+
+ // DynamicTrieKeyMapper uses 3 internal arrays for bookkeeping. Give each one
+ // 128KiB, so the DynamicTrieKeyMapper gets 384KiB in total.
+ int key_mapper_size = 3 * 128 * 1024;
+ std::string dir = GetTestTempDir() + "/recurse_into_document";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<SchemaTypeId>> schema_type_mapper,
+ DynamicTrieKeyMapper<SchemaTypeId>::Create(filesystem_, dir,
+ key_mapper_size));
+ int type_schema_type_id = 0;
+ int document_type_schema_type_id = 1;
+ ICING_ASSERT_OK(
+ schema_type_mapper->Put(type.schema_type(), type_schema_type_id));
+ ICING_ASSERT_OK(schema_type_mapper->Put(document_type.schema_type(),
+ document_type_schema_type_id));
+
+ // Use SchemaTypeManager factory method to instantiate SectionManager.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaTypeManager> schema_type_manager,
+ SchemaTypeManager::Create(type_config_map, schema_type_mapper.get()));
+
+ // Extracts sections from 'Schema' document; there won't be any, since we
+ // didn't recurse into the nested document to see its inner properties.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SectionGroup section_group,
+ schema_type_manager->section_manager().ExtractSections(outer_document));
+ EXPECT_THAT(section_group.string_sections, IsEmpty());
+ EXPECT_THAT(section_group.integer_sections, IsEmpty());
+}
+
+} // namespace
+
} // namespace lib
} // namespace icing
diff --git a/icing/schema/section.h b/icing/schema/section.h
index daf4fd0..3685a29 100644
--- a/icing/schema/section.h
+++ b/icing/schema/section.h
@@ -17,6 +17,7 @@
#include <cstdint>
#include <string>
+#include <string_view>
#include <utility>
#include <vector>
@@ -27,25 +28,30 @@ namespace icing {
namespace lib {
using SectionId = int8_t;
-// 4 bits for 16 values. NOTE: Increasing this value means that SectionIdMask
-// must increase from an int16_t to an int32_t
-inline constexpr int kSectionIdBits = 4;
-inline constexpr SectionId kInvalidSectionId = (1 << kSectionIdBits);
-inline constexpr SectionId kMaxSectionId = kInvalidSectionId - 1;
+// 6 bits for 64 values.
+inline constexpr int kSectionIdBits = 6;
+inline constexpr SectionId kTotalNumSections = (1 << kSectionIdBits);
+inline constexpr SectionId kInvalidSectionId = kTotalNumSections;
+inline constexpr SectionId kMaxSectionId = kTotalNumSections - 1;
+// Prior versions of Icing only supported 16 indexed properties.
+inline constexpr SectionId kOldTotalNumSections = 16;
inline constexpr SectionId kMinSectionId = 0;
constexpr bool IsSectionIdValid(SectionId section_id) {
return section_id >= kMinSectionId && section_id <= kMaxSectionId;
}
-using SectionIdMask = int16_t;
+using SectionIdMask = int64_t;
inline constexpr SectionIdMask kSectionIdMaskAll = ~SectionIdMask{0};
inline constexpr SectionIdMask kSectionIdMaskNone = SectionIdMask{0};
+static_assert(kSectionIdBits < 8 * sizeof(SectionId),
+ "Cannot exhaust all bits of SectionId since it is a signed "
+ "integer and the most significant bit should be preserved.");
+
static_assert(
kMaxSectionId < 8 * sizeof(SectionIdMask),
"SectionIdMask is not large enough to represent all section values!");
-// TODO(samzheng): add more metadata when needed, e.g. tokenizer type,
struct SectionMetadata {
// Dot-joined property names, representing the location of the section inside
// a document. E.g. "property1.property2"
@@ -54,11 +60,14 @@ struct SectionMetadata {
// A unique id of the property within a type config
SectionId id;
- // How content in this section should be tokenized. It is invalid for a
- // section to have tokenizer == 'NONE'.
- IndexingConfig::TokenizerType::Code tokenizer;
+ // Indexable data type of this section. E.g. STRING, INT64.
+ PropertyConfigProto::DataType::Code data_type;
+
+ // How strings should be tokenized. It is invalid for a string section
+ // (data_type == 'STRING') to have tokenizer == 'NONE'.
+ StringIndexingConfig::TokenizerType::Code tokenizer;
- // How tokens in this section should be matched.
+ // How tokens in a string section should be matched.
//
// TermMatchType::UNKNOWN:
// Terms will not match anything
@@ -70,24 +79,71 @@ struct SectionMetadata {
// Terms will be only stored as an exact match, "fool" only matches "fool"
TermMatchType::Code term_match_type = TermMatchType::UNKNOWN;
- SectionMetadata(SectionId id_in, TermMatchType::Code term_match_type_in,
- IndexingConfig::TokenizerType::Code tokenizer,
- std::string&& path_in)
+ // How tokens in a numeric section should be matched.
+ //
+ // NumericMatchType::UNKNOWN:
+ // Contents will not match anything. It is invalid for a numeric section
+ // (data_type == 'INT64') to have numeric_match_type == 'UNKNOWN'.
+ //
+ // NumericMatchType::RANGE:
+ // Contents will be matched by a range query.
+ IntegerIndexingConfig::NumericMatchType::Code numeric_match_type;
+
+ explicit SectionMetadata(
+ SectionId id_in, PropertyConfigProto::DataType::Code data_type_in,
+ StringIndexingConfig::TokenizerType::Code tokenizer,
+ TermMatchType::Code term_match_type_in,
+ IntegerIndexingConfig::NumericMatchType::Code numeric_match_type_in,
+ std::string&& path_in)
: path(std::move(path_in)),
id(id_in),
+ data_type(data_type_in),
tokenizer(tokenizer),
- term_match_type(term_match_type_in) {}
+ term_match_type(term_match_type_in),
+ numeric_match_type(numeric_match_type_in) {}
+
+ SectionMetadata(const SectionMetadata& other) = default;
+ SectionMetadata& operator=(const SectionMetadata& other) = default;
+
+ SectionMetadata(SectionMetadata&& other) = default;
+ SectionMetadata& operator=(SectionMetadata&& other) = default;
+
+ bool operator==(const SectionMetadata& rhs) const {
+ return path == rhs.path && id == rhs.id && data_type == rhs.data_type &&
+ tokenizer == rhs.tokenizer &&
+ term_match_type == rhs.term_match_type &&
+ numeric_match_type == rhs.numeric_match_type;
+ }
};
// Section is an icing internal concept similar to document property but with
// extra metadata. The content can be a value or the combination of repeated
-// values of a property.
+// values of a property, and the type of the content is specified by the
+// template parameter.
+//
+// Currently supported types:
+// - std::string_view (PropertyConfigProto::DataType::STRING)
+// - int64_t (PropertyConfigProto::DataType::INT64)
+template <typename T>
struct Section {
SectionMetadata metadata;
- std::vector<std::string> content;
+ std::vector<T> content;
- Section(SectionMetadata&& metadata_in, std::vector<std::string>&& content_in)
+ explicit Section(SectionMetadata&& metadata_in, std::vector<T>&& content_in)
: metadata(std::move(metadata_in)), content(std::move(content_in)) {}
+
+ PropertyConfigProto::DataType::Code data_type() const {
+ return metadata.data_type;
+ }
+};
+
+// Groups sections by content type, so that callers can access only the
+// section types they want and skip the others.
+//
+// REQUIRES: the lifecycle of the underlying property must be longer than this
+// object, since we use std::string_view to extract its string_values.
+struct SectionGroup {
+ std::vector<Section<std::string_view>> string_sections;
+ std::vector<Section<int64_t>> integer_sections;
};
} // namespace lib
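To make the new limits concrete: with kSectionIdBits = 6 a section id can reach 63, and one bit per section no longer fits in the old int16_t mask (which capped Icing at kOldTotalNumSections = 16 indexed properties), hence the move to an int64_t SectionIdMask. The following is an illustrative sketch that assumes only the declarations above; it is not part of the patch.

#include <cstdint>
#include <vector>

// Sketch only: build a SectionIdMask from a set of section ids and query it.
SectionIdMask BuildMask(const std::vector<SectionId>& ids) {
  SectionIdMask mask = kSectionIdMaskNone;
  for (SectionId id : ids) {
    // Shift in unsigned space so that id == kMaxSectionId (63) stays
    // well-defined, then store the result in the signed mask type.
    mask |= static_cast<SectionIdMask>(uint64_t{1} << id);
  }
  return mask;
}

bool MaskContains(SectionIdMask mask, SectionId id) {
  return (mask & static_cast<SectionIdMask>(uint64_t{1} << id)) != 0;
}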
diff --git a/icing/scoring/advanced_scoring/advanced-scorer.cc b/icing/scoring/advanced_scoring/advanced-scorer.cc
new file mode 100644
index 0000000..83c1519
--- /dev/null
+++ b/icing/scoring/advanced_scoring/advanced-scorer.cc
@@ -0,0 +1,68 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/advanced_scoring/advanced-scorer.h"
+
+#include <memory>
+
+#include "icing/query/advanced_query_parser/lexer.h"
+#include "icing/query/advanced_query_parser/parser.h"
+#include "icing/scoring/advanced_scoring/score-expression.h"
+#include "icing/scoring/advanced_scoring/scoring-visitor.h"
+#include "icing/scoring/bm25f-calculator.h"
+#include "icing/scoring/section-weights.h"
+
+namespace icing {
+namespace lib {
+
+libtextclassifier3::StatusOr<std::unique_ptr<AdvancedScorer>>
+AdvancedScorer::Create(const ScoringSpecProto& scoring_spec,
+ double default_score,
+ const DocumentStore* document_store,
+ const SchemaStore* schema_store, int64_t current_time_ms,
+ const JoinChildrenFetcher* join_children_fetcher) {
+ ICING_RETURN_ERROR_IF_NULL(document_store);
+ ICING_RETURN_ERROR_IF_NULL(schema_store);
+
+ Lexer lexer(scoring_spec.advanced_scoring_expression(),
+ Lexer::Language::SCORING);
+ ICING_ASSIGN_OR_RETURN(std::vector<Lexer::LexerToken> lexer_tokens,
+ lexer.ExtractTokens());
+ Parser parser = Parser::Create(std::move(lexer_tokens));
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> tree_root,
+ parser.ConsumeScoring());
+
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store, scoring_spec));
+ std::unique_ptr<Bm25fCalculator> bm25f_calculator =
+ std::make_unique<Bm25fCalculator>(document_store, section_weights.get(),
+ current_time_ms);
+ ScoringVisitor visitor(default_score, document_store, schema_store,
+ section_weights.get(), bm25f_calculator.get(),
+ join_children_fetcher, current_time_ms);
+ tree_root->Accept(&visitor);
+
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<ScoreExpression> expression,
+ std::move(visitor).Expression());
+ if (expression->type() != ScoreExpressionType::kDouble) {
+ return absl_ports::InvalidArgumentError(
+ "The root scoring expression is not of double type.");
+ }
+ return std::unique_ptr<AdvancedScorer>(
+ new AdvancedScorer(std::move(expression), std::move(section_weights),
+ std::move(bm25f_calculator), default_score));
+}
+
+} // namespace lib
+} // namespace icing
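From the caller's side, the pipeline above (lex, parse, visit, then per-document evaluation) reduces to a Create-then-GetScore flow. This is a hedged sketch: the stores and the clock value are placeholders assumed to be set up elsewhere, and 'this.documentScore() + 1' is just a placeholder expression; full examples appear in advanced-scorer_test.cc further below.

// Sketch only; document_store, schema_store and current_time_ms are
// placeholders for state created elsewhere.
ScoringSpecProto scoring_spec;
scoring_spec.set_rank_by(
    ScoringSpecProto::RankingStrategy::ADVANCED_SCORING_EXPRESSION);
scoring_spec.set_advanced_scoring_expression("this.documentScore() + 1");

libtextclassifier3::StatusOr<std::unique_ptr<AdvancedScorer>> scorer_or =
    AdvancedScorer::Create(scoring_spec, /*default_score=*/0, document_store,
                           schema_store, current_time_ms);
if (scorer_or.ok()) {
  DocHitInfo hit_info(/*document_id_in=*/0);
  // Evaluation errors are logged and fall back to the default score.
  double score = scorer_or.ValueOrDie()->GetScore(hit_info,
                                                  /*query_it=*/nullptr);
}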
diff --git a/icing/scoring/advanced_scoring/advanced-scorer.h b/icing/scoring/advanced_scoring/advanced-scorer.h
new file mode 100644
index 0000000..d69abad
--- /dev/null
+++ b/icing/scoring/advanced_scoring/advanced-scorer.h
@@ -0,0 +1,92 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_ADVANCED_SCORING_ADVANCED_SCORER_H_
+#define ICING_SCORING_ADVANCED_SCORING_ADVANCED_SCORER_H_
+
+#include <memory>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/join/join-children-fetcher.h"
+#include "icing/schema/schema-store.h"
+#include "icing/scoring/advanced_scoring/score-expression.h"
+#include "icing/scoring/bm25f-calculator.h"
+#include "icing/scoring/scorer.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+class AdvancedScorer : public Scorer {
+ public:
+ // Returns:
+ // An AdvancedScorer instance on success
+ // FAILED_PRECONDITION on any null pointer input
+ // INVALID_ARGUMENT if an instance cannot be created
+ static libtextclassifier3::StatusOr<std::unique_ptr<AdvancedScorer>> Create(
+ const ScoringSpecProto& scoring_spec, double default_score,
+ const DocumentStore* document_store, const SchemaStore* schema_store,
+ int64_t current_time_ms,
+ const JoinChildrenFetcher* join_children_fetcher = nullptr);
+
+ double GetScore(const DocHitInfo& hit_info,
+ const DocHitInfoIterator* query_it) override {
+ libtextclassifier3::StatusOr<double> result =
+ score_expression_->eval(hit_info, query_it);
+ if (!result.ok()) {
+ ICING_LOG(ERROR) << "Got an error when scoring a document:\n"
+ << result.status().error_message();
+ return default_score_;
+ }
+ return std::move(result).ValueOrDie();
+ }
+
+ void PrepareToScore(
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>*
+ query_term_iterators) override {
+ if (query_term_iterators == nullptr || query_term_iterators->empty()) {
+ return;
+ }
+ bm25f_calculator_->PrepareToScore(query_term_iterators);
+ }
+
+ bool is_constant() const { return score_expression_->is_constant_double(); }
+
+ private:
+ explicit AdvancedScorer(std::unique_ptr<ScoreExpression> score_expression,
+ std::unique_ptr<SectionWeights> section_weights,
+ std::unique_ptr<Bm25fCalculator> bm25f_calculator,
+ double default_score)
+ : score_expression_(std::move(score_expression)),
+ section_weights_(std::move(section_weights)),
+ bm25f_calculator_(std::move(bm25f_calculator)),
+ default_score_(default_score) {
+ if (is_constant()) {
+ ICING_LOG(WARNING)
+ << "The advanced scoring expression will evaluate to a constant.";
+ }
+ }
+
+ std::unique_ptr<ScoreExpression> score_expression_;
+ std::unique_ptr<SectionWeights> section_weights_;
+ std::unique_ptr<Bm25fCalculator> bm25f_calculator_;
+ double default_score_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_ADVANCED_SCORING_ADVANCED_SCORER_H_
diff --git a/icing/scoring/advanced_scoring/advanced-scorer_fuzz_test.cc b/icing/scoring/advanced_scoring/advanced-scorer_fuzz_test.cc
new file mode 100644
index 0000000..3612359
--- /dev/null
+++ b/icing/scoring/advanced_scoring/advanced-scorer_fuzz_test.cc
@@ -0,0 +1,70 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cstdint>
+#include <memory>
+#include <string_view>
+
+#include "icing/scoring/advanced_scoring/advanced-scorer.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
+ FakeClock fake_clock;
+ Filesystem filesystem;
+ const std::string test_dir = GetTestTempDir() + "/icing";
+ const std::string doc_store_dir = test_dir + "/doc_store";
+ const std::string schema_store_dir = test_dir + "/schema_store";
+ filesystem.DeleteDirectoryRecursively(test_dir.c_str());
+ filesystem.CreateDirectoryRecursively(doc_store_dir.c_str());
+ filesystem.CreateDirectoryRecursively(schema_store_dir.c_str());
+
+ std::unique_ptr<SchemaStore> schema_store =
+ SchemaStore::Create(&filesystem, schema_store_dir, &fake_clock)
+ .ValueOrDie();
+ std::unique_ptr<DocumentStore> document_store =
+ DocumentStore::Create(
+ &filesystem, doc_store_dir, &fake_clock, schema_store.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr)
+ .ValueOrDie()
+ .document_store;
+
+ std::string_view text(reinterpret_cast<const char*>(data), size);
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::ADVANCED_SCORING_EXPRESSION);
+ scoring_spec.set_advanced_scoring_expression(text);
+
+ AdvancedScorer::Create(scoring_spec,
+ /*default_score=*/10, document_store.get(),
+ schema_store.get(),
+ fake_clock.GetSystemTimeMilliseconds());
+
+ // We are not able to test the GetScore method of AdvancedScorer, since it is
+ // only available after AdvancedScorer is successfully created, and the text
+ // provided by the fuzz test is essentially random: in most cases it will
+ // contain syntax or type errors that cause AdvancedScorer::Create to fail.
+ return 0;
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/advanced_scoring/advanced-scorer_test.cc b/icing/scoring/advanced_scoring/advanced-scorer_test.cc
new file mode 100644
index 0000000..cc1d413
--- /dev/null
+++ b/icing/scoring/advanced_scoring/advanced-scorer_test.cc
@@ -0,0 +1,1039 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/advanced_scoring/advanced-scorer.h"
+
+#include <cmath>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/join/join-children-fetcher.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/scoring/scorer-factory.h"
+#include "icing/scoring/scorer.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::DoubleNear;
+using ::testing::Eq;
+
+class AdvancedScorerTest : public testing::Test {
+ protected:
+ AdvancedScorerTest()
+ : test_dir_(GetTestTempDir() + "/icing"),
+ doc_store_dir_(test_dir_ + "/doc_store"),
+ schema_store_dir_(test_dir_ + "/schema_store") {}
+
+ void SetUp() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(create_result.document_store);
+
+ // Creates a simple email schema
+ SchemaProto test_email_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(
+ TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("person")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("emailAddress")
+ .SetDataTypeString(
+ TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(
+ TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("phoneNumber")
+ .SetDataTypeString(
+ TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ test_email_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+ }
+
+ void TearDown() override {
+ document_store_.reset();
+ schema_store_.reset();
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ const std::string test_dir_;
+ const std::string doc_store_dir_;
+ const std::string schema_store_dir_;
+ Filesystem filesystem_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentStore> document_store_;
+ FakeClock fake_clock_;
+};
+
+constexpr double kEps = 0.0000000001;
+constexpr int kDefaultScore = 0;
+constexpr int64_t kDefaultCreationTimestampMs = 1571100001111;
+
+DocumentProto CreateDocument(
+ const std::string& name_space, const std::string& uri,
+ int score = kDefaultScore,
+ int64_t creation_timestamp_ms = kDefaultCreationTimestampMs) {
+ return DocumentBuilder()
+ .SetKey(name_space, uri)
+ .SetSchema("email")
+ .SetScore(score)
+ .SetCreationTimestampMs(creation_timestamp_ms)
+ .Build();
+}
+
+UsageReport CreateUsageReport(std::string name_space, std::string uri,
+ int64_t timestamp_ms,
+ UsageReport::UsageType usage_type) {
+ UsageReport usage_report;
+ usage_report.set_document_namespace(name_space);
+ usage_report.set_document_uri(uri);
+ usage_report.set_usage_timestamp_ms(timestamp_ms);
+ usage_report.set_usage_type(usage_type);
+ return usage_report;
+}
+
+ScoringSpecProto CreateAdvancedScoringSpec(
+ const std::string& advanced_scoring_expression) {
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::ADVANCED_SCORING_EXPRESSION);
+ scoring_spec.set_advanced_scoring_expression(advanced_scoring_expression);
+ return scoring_spec;
+}
+
+PropertyWeight CreatePropertyWeight(std::string path, double weight) {
+ PropertyWeight property_weight;
+ property_weight.set_path(std::move(path));
+ property_weight.set_weight(weight);
+ return property_weight;
+}
+
+TypePropertyWeights CreateTypePropertyWeights(
+ std::string schema_type, std::vector<PropertyWeight>&& property_weights) {
+ TypePropertyWeights type_property_weights;
+ type_property_weights.set_schema_type(std::move(schema_type));
+ type_property_weights.mutable_property_weights()->Reserve(
+ property_weights.size());
+
+ for (PropertyWeight& property_weight : property_weights) {
+ *type_property_weights.add_property_weights() = std::move(property_weight);
+ }
+
+ return type_property_weights;
+}
+
+TEST_F(AdvancedScorerTest, InvalidAdvancedScoringSpec) {
+ // Empty scoring expression for advanced scoring
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::ADVANCED_SCORING_EXPRESSION);
+ EXPECT_THAT(scorer_factory::Create(scoring_spec, /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // Non-empty scoring expression for normal scoring
+ scoring_spec = ScoringSpecProto::default_instance();
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ scoring_spec.set_advanced_scoring_expression("1");
+ EXPECT_THAT(scorer_factory::Create(scoring_spec, /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(AdvancedScorerTest, SimpleExpression) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(CreateDocument("namespace", "uri")));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer,
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("123"),
+ /*default_score=*/10, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(123));
+}
+
+TEST_F(AdvancedScorerTest, BasicPureArithmeticExpression) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(CreateDocument("namespace", "uri")));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer,
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("1 + 2"),
+ /*default_score=*/10, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(3));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(CreateAdvancedScoringSpec("-1 + 2"),
+ /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(1));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(CreateAdvancedScoringSpec("1 + -2"),
+ /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(-1));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(CreateAdvancedScoringSpec("1 - 2"),
+ /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(-1));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(CreateAdvancedScoringSpec("1 * 2"),
+ /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(2));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(CreateAdvancedScoringSpec("1 / 2"),
+ /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(0.5));
+}
+
+TEST_F(AdvancedScorerTest, BasicMathFunctionExpression) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(CreateDocument("namespace", "uri")));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer,
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("log(10, 1000)"),
+ /*default_score=*/10, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), DoubleNear(3, kEps));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("log(2.718281828459045)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), DoubleNear(1, kEps));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(CreateAdvancedScoringSpec("pow(2, 10)"),
+ /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(1024));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("max(10, 11, 12, 13, 14)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(14));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("min(10, 11, 12, 13, 14)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("len(10, 11, 12, 13, 14)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(5));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("sum(10, 11, 12, 13, 14)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10 + 11 + 12 + 13 + 14));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("avg(10, 11, 12, 13, 14)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq((10 + 11 + 12 + 13 + 14) / 5.));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(CreateAdvancedScoringSpec("sqrt(2)"),
+ /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), DoubleNear(sqrt(2), kEps));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("abs(-2) + abs(2)"),
+ /*default_score=*/10, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(4));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("sin(3.141592653589793)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), DoubleNear(0, kEps));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("cos(3.141592653589793)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), DoubleNear(-1, kEps));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("tan(3.141592653589793 / 4)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), DoubleNear(1, kEps));
+}
+
+TEST_F(AdvancedScorerTest, DocumentScoreCreationTimestampFunctionExpression) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(CreateDocument(
+ "namespace", "uri", /*score=*/123,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs)));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer,
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("this.documentScore()"),
+ /*default_score=*/10, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(123));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("this.creationTimestamp()"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(kDefaultCreationTimestampMs));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec(
+ "this.documentScore() + this.creationTimestamp()"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo),
+ Eq(123 + kDefaultCreationTimestampMs));
+}
+
+TEST_F(AdvancedScorerTest, DocumentUsageFunctionExpression) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(CreateDocument("namespace", "uri")));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("this.usageCount(1) + this.usageCount(2) "
+ "+ this.usageLastUsedTimestamp(3)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(0));
+ ICING_ASSERT_OK(document_store_->ReportUsage(
+ CreateUsageReport("namespace", "uri", 100000, UsageReport::USAGE_TYPE1)));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(1));
+ ICING_ASSERT_OK(document_store_->ReportUsage(
+ CreateUsageReport("namespace", "uri", 200000, UsageReport::USAGE_TYPE2)));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(2));
+ ICING_ASSERT_OK(document_store_->ReportUsage(
+ CreateUsageReport("namespace", "uri", 300000, UsageReport::USAGE_TYPE3)));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(300002));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("this.usageLastUsedTimestamp(1)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(100000));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("this.usageLastUsedTimestamp(2)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(200000));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("this.usageLastUsedTimestamp(3)"),
+ /*default_score=*/10, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(300000));
+}
+
+TEST_F(AdvancedScorerTest, DocumentUsageFunctionOutOfRange) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(CreateDocument("namespace", "uri")));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+ const double default_score = 123;
+
+  // The default score should be returned for the following expressions, which
+  // all cause "runtime" errors.
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer,
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("this.usageCount(4)"),
+ default_score, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(default_score));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("this.usageCount(0)"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(default_score));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("this.usageCount(1.5)"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(default_score));
+}
+
+// scoring-processor_test.cc provides more thorough test coverage for the
+// relevance score.
+TEST_F(AdvancedScorerTest, RelevanceScoreFunctionScoreExpression) {
+ DocumentProto test_document =
+ DocumentBuilder()
+ .SetScore(5)
+ .SetKey("namespace", "uri")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetCreationTimestampMs(kDefaultCreationTimestampMs)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store_->Put(test_document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<AdvancedScorer> scorer,
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("this.relevanceScore()"),
+ /*default_score=*/10, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ scorer->PrepareToScore(/*query_term_iterators=*/{});
+
+ // Should get the default score.
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+ EXPECT_THAT(scorer->GetScore(docHitInfo, /*query_it=*/nullptr), Eq(10));
+}
+
+TEST_F(AdvancedScorerTest, ChildrenScoresFunctionScoreExpression) {
+ const double default_score = 123;
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id_1,
+ document_store_->Put(CreateDocument("namespace", "uri1")));
+ DocHitInfo docHitInfo1 = DocHitInfo(document_id_1);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id_2,
+ document_store_->Put(CreateDocument("namespace", "uri2")));
+ DocHitInfo docHitInfo2 = DocHitInfo(document_id_2);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id_3,
+ document_store_->Put(CreateDocument("namespace", "uri3")));
+ DocHitInfo docHitInfo3 = DocHitInfo(document_id_3);
+
+ // Create a JoinChildrenFetcher that matches:
+ // document_id_1 to fake_child1 with score 1 and fake_child2 with score 2.
+ // document_id_2 to fake_child3 with score 4.
+  // document_id_3 has no children.
+ JoinSpecProto join_spec;
+ join_spec.set_parent_property_expression("this.qualifiedId()");
+ join_spec.set_child_property_expression("sender");
+ std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>>
+ map_joinable_qualified_id;
+ ScoredDocumentHit fake_child1(/*document_id=*/10, kSectionIdMaskNone,
+ /*score=*/1.0);
+ ScoredDocumentHit fake_child2(/*document_id=*/11, kSectionIdMaskNone,
+ /*score=*/2.0);
+ ScoredDocumentHit fake_child3(/*document_id=*/12, kSectionIdMaskNone,
+ /*score=*/4.0);
+ map_joinable_qualified_id[document_id_1].push_back(fake_child1);
+ map_joinable_qualified_id[document_id_1].push_back(fake_child2);
+ map_joinable_qualified_id[document_id_2].push_back(fake_child3);
+ JoinChildrenFetcher fetcher(join_spec, std::move(map_joinable_qualified_id));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<AdvancedScorer> scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("len(this.childrenRankingSignals())"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds(), &fetcher));
+ // document_id_1 has two children.
+ EXPECT_THAT(scorer->GetScore(docHitInfo1, /*query_it=*/nullptr), Eq(2));
+ // document_id_2 has one child.
+ EXPECT_THAT(scorer->GetScore(docHitInfo2, /*query_it=*/nullptr), Eq(1));
+  // document_id_3 has no children.
+ EXPECT_THAT(scorer->GetScore(docHitInfo3, /*query_it=*/nullptr), Eq(0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("sum(this.childrenRankingSignals())"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds(), &fetcher));
+ // document_id_1 has two children with scores 1 and 2.
+ EXPECT_THAT(scorer->GetScore(docHitInfo1, /*query_it=*/nullptr), Eq(3));
+ // document_id_2 has one child with score 4.
+ EXPECT_THAT(scorer->GetScore(docHitInfo2, /*query_it=*/nullptr), Eq(4));
+  // document_id_3 has no children.
+ EXPECT_THAT(scorer->GetScore(docHitInfo3, /*query_it=*/nullptr), Eq(0));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("avg(this.childrenRankingSignals())"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds(), &fetcher));
+ // document_id_1 has two children with scores 1 and 2.
+ EXPECT_THAT(scorer->GetScore(docHitInfo1, /*query_it=*/nullptr), Eq(3 / 2.));
+ // document_id_2 has one child with score 4.
+ EXPECT_THAT(scorer->GetScore(docHitInfo2, /*query_it=*/nullptr), Eq(4 / 1.));
+  // document_id_3 has no children.
+ // This is an evaluation error, so default_score will be returned.
+ EXPECT_THAT(scorer->GetScore(docHitInfo3, /*query_it=*/nullptr),
+ Eq(default_score));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(
+ CreateAdvancedScoringSpec(
+ // Equivalent to "avg(this.childrenRankingSignals())"
+ "sum(this.childrenRankingSignals()) / "
+ "len(this.childrenRankingSignals())"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds(), &fetcher));
+ // document_id_1 has two children with scores 1 and 2.
+ EXPECT_THAT(scorer->GetScore(docHitInfo1, /*query_it=*/nullptr), Eq(3 / 2.));
+ // document_id_2 has one child with score 4.
+ EXPECT_THAT(scorer->GetScore(docHitInfo2, /*query_it=*/nullptr), Eq(4 / 1.));
+  // document_id_3 has no children.
+ // This is an evaluation error, so default_score will be returned.
+ EXPECT_THAT(scorer->GetScore(docHitInfo3, /*query_it=*/nullptr),
+ Eq(default_score));
+}
+
+TEST_F(AdvancedScorerTest, PropertyWeightsFunctionScoreExpression) {
+ DocumentProto test_document_1 =
+ DocumentBuilder().SetKey("namespace", "uri1").SetSchema("email").Build();
+ DocumentProto test_document_2 =
+ DocumentBuilder().SetKey("namespace", "uri2").SetSchema("person").Build();
+ DocumentProto test_document_3 =
+ DocumentBuilder().SetKey("namespace", "uri3").SetSchema("person").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id_1,
+ document_store_->Put(test_document_1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id_2,
+ document_store_->Put(test_document_2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id_3,
+ document_store_->Put(test_document_3));
+
+ ScoringSpecProto spec_proto = CreateAdvancedScoringSpec("");
+
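+  // Weights are normalized against the maximum weight within each type (see
+  // the UnspecifiedWeights test below). The maximum here is 1.0 for both
+  // types, so the weights above are already the normalized values.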
+ *spec_proto.add_type_property_weights() = CreateTypePropertyWeights(
+ /*schema_type=*/"email",
+ {CreatePropertyWeight(/*path=*/"subject", /*weight=*/1.0)});
+ *spec_proto.add_type_property_weights() = CreateTypePropertyWeights(
+ /*schema_type=*/"person",
+ {CreatePropertyWeight(/*path=*/"emailAddress", /*weight=*/0.5),
+ CreatePropertyWeight(/*path=*/"name", /*weight=*/0.8),
+ CreatePropertyWeight(/*path=*/"phoneNumber", /*weight=*/1.0)});
+
+ // Let the hit for test_document_1 match property "subject".
+ // So this.propertyWeights() for test_document_1 will return [1].
+ DocHitInfo doc_hit_info_1 = DocHitInfo(document_id_1);
+ doc_hit_info_1.UpdateSection(0);
+
+ // Let the hit for test_document_2 match properties "emailAddress" and "name".
+ // So this.propertyWeights() for test_document_2 will return [0.5, 0.8].
+ DocHitInfo doc_hit_info_2 = DocHitInfo(document_id_2);
+ doc_hit_info_2.UpdateSection(0);
+ doc_hit_info_2.UpdateSection(1);
+
+ // Let the hit for test_document_3 match properties "emailAddress", "name" and
+ // "phoneNumber". So this.propertyWeights() for test_document_3 will return
+ // [0.5, 0.8, 1].
+ DocHitInfo doc_hit_info_3 = DocHitInfo(document_id_3);
+ doc_hit_info_3.UpdateSection(0);
+ doc_hit_info_3.UpdateSection(1);
+ doc_hit_info_3.UpdateSection(2);
+
+ spec_proto.set_advanced_scoring_expression("min(this.propertyWeights())");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<AdvancedScorer> scorer,
+ AdvancedScorer::Create(spec_proto,
+ /*default_score=*/10, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ // min([1]) = 1
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_1, /*query_it=*/nullptr), Eq(1));
+ // min([0.5, 0.8]) = 0.5
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_2, /*query_it=*/nullptr), Eq(0.5));
+ // min([0.5, 0.8, 1.0]) = 0.5
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_3, /*query_it=*/nullptr), Eq(0.5));
+
+ spec_proto.set_advanced_scoring_expression("max(this.propertyWeights())");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(spec_proto,
+ /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ // max([1]) = 1
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_1, /*query_it=*/nullptr), Eq(1));
+ // max([0.5, 0.8]) = 0.8
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_2, /*query_it=*/nullptr), Eq(0.8));
+ // max([0.5, 0.8, 1.0]) = 1
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_3, /*query_it=*/nullptr), Eq(1));
+
+ spec_proto.set_advanced_scoring_expression("sum(this.propertyWeights())");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(spec_proto,
+ /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ // sum([1]) = 1
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_1, /*query_it=*/nullptr), Eq(1));
+ // sum([0.5, 0.8]) = 1.3
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_2, /*query_it=*/nullptr), Eq(1.3));
+ // sum([0.5, 0.8, 1.0]) = 2.3
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_3, /*query_it=*/nullptr), Eq(2.3));
+}
+
+TEST_F(AdvancedScorerTest,
+ PropertyWeightsFunctionScoreExpressionUnspecifiedWeights) {
+ DocumentProto test_document_1 =
+ DocumentBuilder().SetKey("namespace", "uri1").SetSchema("email").Build();
+ DocumentProto test_document_2 =
+ DocumentBuilder().SetKey("namespace", "uri2").SetSchema("person").Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id_1,
+ document_store_->Put(test_document_1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id_2,
+ document_store_->Put(test_document_2));
+
+ ScoringSpecProto spec_proto = CreateAdvancedScoringSpec("");
+
+  // The entry for type "email" is missing, so every property in "email"
+  // should get weight 1.0.
+  // The weight of "phoneNumber" in the "person" type is unspecified, so it
+  // defaults to 1.0 and is normalized against the maximum weight in the type
+  // (2), giving 1/2 = 0.5.
+ *spec_proto.add_type_property_weights() = CreateTypePropertyWeights(
+ /*schema_type=*/"person",
+ {CreatePropertyWeight(/*path=*/"emailAddress", /*weight=*/1.0),
+ CreatePropertyWeight(/*path=*/"name", /*weight=*/2)});
+
+ // Let the hit for test_document_1 match property "subject".
+ // So this.propertyWeights() for test_document_1 will return [1].
+ DocHitInfo doc_hit_info_1 = DocHitInfo(document_id_1);
+ doc_hit_info_1.UpdateSection(0);
+
+  // Let the hit for test_document_2 match properties "emailAddress", "name"
+  // and "phoneNumber". So this.propertyWeights() for test_document_2 will
+  // return [0.5, 1, 0.5].
+ DocHitInfo doc_hit_info_2 = DocHitInfo(document_id_2);
+ doc_hit_info_2.UpdateSection(0);
+ doc_hit_info_2.UpdateSection(1);
+ doc_hit_info_2.UpdateSection(2);
+
+ spec_proto.set_advanced_scoring_expression("min(this.propertyWeights())");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<AdvancedScorer> scorer,
+ AdvancedScorer::Create(spec_proto,
+ /*default_score=*/10, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ // min([1]) = 1
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_1, /*query_it=*/nullptr), Eq(1));
+ // min([0.5, 1, 0.5]) = 0.5
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_2, /*query_it=*/nullptr), Eq(0.5));
+
+ spec_proto.set_advanced_scoring_expression("max(this.propertyWeights())");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(spec_proto,
+ /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ // max([1]) = 1
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_1, /*query_it=*/nullptr), Eq(1));
+ // max([0.5, 1, 0.5]) = 1
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_2, /*query_it=*/nullptr), Eq(1));
+
+ spec_proto.set_advanced_scoring_expression("sum(this.propertyWeights())");
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(spec_proto,
+ /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ // sum([1]) = 1
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_1, /*query_it=*/nullptr), Eq(1));
+ // sum([0.5, 1, 0.5]) = 2
+ EXPECT_THAT(scorer->GetScore(doc_hit_info_2, /*query_it=*/nullptr), Eq(2));
+}
+
+TEST_F(AdvancedScorerTest, InvalidChildrenScoresFunctionScoreExpression) {
+ const double default_score = 123;
+
+ // Without join_children_fetcher provided,
+ // "len(this.childrenRankingSignals())" cannot be created.
+ EXPECT_THAT(
+ AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("len(this.childrenRankingSignals())"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds(),
+ /*join_children_fetcher=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // The root expression can only be of double type, but here it is of list
+ // type.
+ JoinChildrenFetcher fake_fetcher(JoinSpecProto::default_instance(),
+ /*map_joinable_qualified_id=*/{});
+ EXPECT_THAT(AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("this.childrenRankingSignals()"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds(), &fake_fetcher),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(AdvancedScorerTest, ComplexExpression) {
+ const int64_t creation_timestamp_ms = 123;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(CreateDocument("namespace", "uri", /*score=*/123,
+ creation_timestamp_ms)));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<AdvancedScorer> scorer,
+ AdvancedScorer::Create(CreateAdvancedScoringSpec(
+ "pow(sin(2), 2)"
+          // Equivalent to this.usageCount(1), since documentScore() is 123.
+ "+ this.usageCount(this.documentScore() - 122)"
+ "/ 12.34"
+ "* (10 * pow(2 * 1, sin(2))"
+ "+ 10 * (2 + 10 + this.creationTimestamp()))"
+ // This should evaluate to default score.
+ "+ this.relevanceScore()"),
+ /*default_score=*/10, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_FALSE(scorer->is_constant());
+ scorer->PrepareToScore(/*query_term_iterators=*/{});
+
+ ICING_ASSERT_OK(document_store_->ReportUsage(
+ CreateUsageReport("namespace", "uri", 0, UsageReport::USAGE_TYPE1)));
+ ICING_ASSERT_OK(document_store_->ReportUsage(
+ CreateUsageReport("namespace", "uri", 0, UsageReport::USAGE_TYPE1)));
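+  // After the two USAGE_TYPE1 reports above, this.usageCount(1) evaluates to
+  // 2, which is the "2" in the expected score below.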
+ EXPECT_THAT(scorer->GetScore(docHitInfo, /*query_it=*/nullptr),
+ DoubleNear(pow(sin(2), 2) +
+ 2 / 12.34 *
+ (10 * pow(2 * 1, sin(2)) +
+ 10 * (2 + 10 + creation_timestamp_ms)) +
+ 10,
+ kEps));
+}
+
+TEST_F(AdvancedScorerTest, ConstantExpression) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<AdvancedScorer> scorer,
+ AdvancedScorer::Create(CreateAdvancedScoringSpec(
+ "pow(sin(2), 2)"
+ "+ log(2, 122) / 12.34"
+ "* (10 * pow(2 * 1, sin(2)) + 10 * (2 + 10))"),
+ /*default_score=*/10, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_TRUE(scorer->is_constant());
+}
+
+// An empty expression should be a parsing error.
+TEST_F(AdvancedScorerTest, EmptyExpression) {
+ EXPECT_THAT(AdvancedScorer::Create(CreateAdvancedScoringSpec(""),
+ /*default_score=*/10,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(AdvancedScorerTest, EvaluationErrorShouldReturnDefaultScore) {
+ const double default_score = 123;
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store_->Put(CreateDocument("namespace", "uri")));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer,
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("log(0)"), default_score,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), DoubleNear(default_score, kEps));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer,
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("1 / 0"), default_score,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), DoubleNear(default_score, kEps));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(CreateAdvancedScoringSpec("sqrt(-1)"),
+ default_score, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), DoubleNear(default_score, kEps));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ scorer, AdvancedScorer::Create(CreateAdvancedScoringSpec("pow(-1, 0.5)"),
+ default_score, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), DoubleNear(default_score, kEps));
+}
+
+// The following tests should trigger a type error while the visitor tries to
+// build a ScoreExpression object.
+TEST_F(AdvancedScorerTest, MathTypeError) {
+ const double default_score = 0;
+
+ EXPECT_THAT(
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("test"), default_score,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("log()"), default_score,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(AdvancedScorer::Create(CreateAdvancedScoringSpec("log(1, 2, 3)"),
+ default_score, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(AdvancedScorer::Create(CreateAdvancedScoringSpec("log(1, this)"),
+ default_score, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("pow(1)"), default_score,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(AdvancedScorer::Create(CreateAdvancedScoringSpec("sqrt(1, 2)"),
+ default_score, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(AdvancedScorer::Create(CreateAdvancedScoringSpec("abs(1, 2)"),
+ default_score, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(AdvancedScorer::Create(CreateAdvancedScoringSpec("sin(1, 2)"),
+ default_score, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(AdvancedScorer::Create(CreateAdvancedScoringSpec("cos(1, 2)"),
+ default_score, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(AdvancedScorer::Create(CreateAdvancedScoringSpec("tan(1, 2)"),
+ default_score, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("this"), default_score,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(
+ AdvancedScorer::Create(CreateAdvancedScoringSpec("-this"), default_score,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(AdvancedScorer::Create(CreateAdvancedScoringSpec("1 + this"),
+ default_score, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(AdvancedScorerTest, DocumentFunctionTypeError) {
+ const double default_score = 0;
+
+ EXPECT_THAT(AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("documentScore(1)"), default_score,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("this.creationTimestamp(1)"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("this.usageCount()"), default_score,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("usageLastUsedTimestamp(1, 1)"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("relevanceScore(1)"), default_score,
+ document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("documentScore(this)"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("that.documentScore()"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(AdvancedScorer::Create(
+ CreateAdvancedScoringSpec("this.this.creationTimestamp()"),
+ default_score, document_store_.get(), schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(AdvancedScorer::Create(CreateAdvancedScoringSpec("this.log(2)"),
+ default_score, document_store_.get(),
+ schema_store_.get(),
+ fake_clock_.GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/advanced_scoring/score-expression.cc b/icing/scoring/advanced_scoring/score-expression.cc
new file mode 100644
index 0000000..e8a2a89
--- /dev/null
+++ b/icing/scoring/advanced_scoring/score-expression.cc
@@ -0,0 +1,521 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/advanced_scoring/score-expression.h"
+
+#include <numeric>
+#include <vector>
+
+#include "icing/absl_ports/canonical_errors.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+libtextclassifier3::Status CheckChildrenNotNull(
+ const std::vector<std::unique_ptr<ScoreExpression>>& children) {
+ for (const auto& child : children) {
+ ICING_RETURN_ERROR_IF_NULL(child);
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace
+
+libtextclassifier3::StatusOr<std::unique_ptr<ScoreExpression>>
+OperatorScoreExpression::Create(
+ OperatorType op, std::vector<std::unique_ptr<ScoreExpression>> children) {
+ if (children.empty()) {
+ return absl_ports::InvalidArgumentError(
+ "OperatorScoreExpression must have at least one argument.");
+ }
+ ICING_RETURN_IF_ERROR(CheckChildrenNotNull(children));
+
+ bool children_all_constant_double = true;
+ for (const auto& child : children) {
+ if (child->type() != ScoreExpressionType::kDouble) {
+ return absl_ports::InvalidArgumentError(
+ "Operators are only supported for double type.");
+ }
+ if (!child->is_constant_double()) {
+ children_all_constant_double = false;
+ }
+ }
+ if (op == OperatorType::kNegative) {
+ if (children.size() != 1) {
+ return absl_ports::InvalidArgumentError(
+ "Negative operator must have only 1 argument.");
+ }
+ }
+ std::unique_ptr<ScoreExpression> expression =
+ std::unique_ptr<OperatorScoreExpression>(
+ new OperatorScoreExpression(op, std::move(children)));
+ if (children_all_constant_double) {
+    // Because all of the children are constants, this expression does not
+    // depend on the DocHitInfo or query_it that are passed into it.
+ return ConstantScoreExpression::Create(
+ expression->eval(DocHitInfo(), /*query_it=*/nullptr));
+ }
+ return expression;
+}
+
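+// Evaluates the children left to right, folding their results with the
+// operator. A division by zero yields an infinity, which fails the
+// std::isfinite check below and surfaces as an error; the scorer then falls
+// back to the default score.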
+libtextclassifier3::StatusOr<double> OperatorScoreExpression::eval(
+ const DocHitInfo& hit_info, const DocHitInfoIterator* query_it) const {
+ // The Create factory guarantees that an operator will have at least one
+ // child.
+ ICING_ASSIGN_OR_RETURN(double res, children_.at(0)->eval(hit_info, query_it));
+
+ if (op_ == OperatorType::kNegative) {
+ return -res;
+ }
+
+ for (int i = 1; i < children_.size(); ++i) {
+ ICING_ASSIGN_OR_RETURN(double v, children_.at(i)->eval(hit_info, query_it));
+ switch (op_) {
+ case OperatorType::kPlus:
+ res += v;
+ break;
+ case OperatorType::kMinus:
+ res -= v;
+ break;
+ case OperatorType::kTimes:
+ res *= v;
+ break;
+ case OperatorType::kDiv:
+ res /= v;
+ break;
+ case OperatorType::kNegative:
+ return absl_ports::InternalError("Should never reach here.");
+ }
+ if (!std::isfinite(res)) {
+ return absl_ports::InvalidArgumentError(
+ "Got a non-finite value while evaluating operator score expression.");
+ }
+ }
+ return res;
+}
+
+const std::unordered_map<std::string, MathFunctionScoreExpression::FunctionType>
+ MathFunctionScoreExpression::kFunctionNames = {
+ {"log", FunctionType::kLog}, {"pow", FunctionType::kPow},
+ {"max", FunctionType::kMax}, {"min", FunctionType::kMin},
+ {"len", FunctionType::kLen}, {"sum", FunctionType::kSum},
+ {"avg", FunctionType::kAvg}, {"sqrt", FunctionType::kSqrt},
+ {"abs", FunctionType::kAbs}, {"sin", FunctionType::kSin},
+ {"cos", FunctionType::kCos}, {"tan", FunctionType::kTan}};
+
+const std::unordered_set<MathFunctionScoreExpression::FunctionType>
+ MathFunctionScoreExpression::kVariableArgumentsFunctions = {
+ FunctionType::kMax, FunctionType::kMin, FunctionType::kLen,
+ FunctionType::kSum, FunctionType::kAvg};
+
+libtextclassifier3::StatusOr<std::unique_ptr<ScoreExpression>>
+MathFunctionScoreExpression::Create(
+ FunctionType function_type,
+ std::vector<std::unique_ptr<ScoreExpression>> args) {
+ if (args.empty()) {
+ return absl_ports::InvalidArgumentError(
+ "Math functions must have at least one argument.");
+ }
+ ICING_RETURN_IF_ERROR(CheckChildrenNotNull(args));
+
+ // Received a list type in the function argument.
+ if (args.size() == 1 && args[0]->type() == ScoreExpressionType::kDoubleList) {
+ // Only certain functions support list type.
+ if (kVariableArgumentsFunctions.count(function_type) > 0) {
+ return std::unique_ptr<MathFunctionScoreExpression>(
+ new MathFunctionScoreExpression(function_type, std::move(args)));
+ }
+    return absl_ports::InvalidArgumentError(
+        "Received an unsupported list type argument in the math function.");
+ }
+
+ bool args_all_constant_double = true;
+ for (const auto& child : args) {
+ if (child->type() != ScoreExpressionType::kDouble) {
+ return absl_ports::InvalidArgumentError(
+ "Got an invalid type for the math function. Should expect a double "
+ "type argument.");
+ }
+ if (!child->is_constant_double()) {
+ args_all_constant_double = false;
+ }
+ }
+ switch (function_type) {
+ case FunctionType::kLog:
+ if (args.size() != 1 && args.size() != 2) {
+ return absl_ports::InvalidArgumentError(
+ "log must have 1 or 2 arguments.");
+ }
+ break;
+ case FunctionType::kPow:
+ if (args.size() != 2) {
+ return absl_ports::InvalidArgumentError("pow must have 2 arguments.");
+ }
+ break;
+ case FunctionType::kSqrt:
+ if (args.size() != 1) {
+ return absl_ports::InvalidArgumentError("sqrt must have 1 argument.");
+ }
+ break;
+ case FunctionType::kAbs:
+ if (args.size() != 1) {
+ return absl_ports::InvalidArgumentError("abs must have 1 argument.");
+ }
+ break;
+ case FunctionType::kSin:
+ if (args.size() != 1) {
+ return absl_ports::InvalidArgumentError("sin must have 1 argument.");
+ }
+ break;
+ case FunctionType::kCos:
+ if (args.size() != 1) {
+ return absl_ports::InvalidArgumentError("cos must have 1 argument.");
+ }
+ break;
+ case FunctionType::kTan:
+ if (args.size() != 1) {
+ return absl_ports::InvalidArgumentError("tan must have 1 argument.");
+ }
+ break;
+ // Functions that support variable length arguments
+ case FunctionType::kMax:
+ [[fallthrough]];
+ case FunctionType::kMin:
+ [[fallthrough]];
+ case FunctionType::kLen:
+ [[fallthrough]];
+ case FunctionType::kSum:
+ [[fallthrough]];
+ case FunctionType::kAvg:
+ break;
+ }
+ std::unique_ptr<ScoreExpression> expression =
+ std::unique_ptr<MathFunctionScoreExpression>(
+ new MathFunctionScoreExpression(function_type, std::move(args)));
+ if (args_all_constant_double) {
+    // Because all of the arguments are constants, this expression does not
+    // depend on the DocHitInfo or query_it that are passed into it.
+ return ConstantScoreExpression::Create(
+ expression->eval(DocHitInfo(), /*query_it=*/nullptr));
+ }
+ return expression;
+}
+
+libtextclassifier3::StatusOr<double> MathFunctionScoreExpression::eval(
+ const DocHitInfo& hit_info, const DocHitInfoIterator* query_it) const {
+ std::vector<double> values;
+ if (args_.at(0)->type() == ScoreExpressionType::kDoubleList) {
+ ICING_ASSIGN_OR_RETURN(values, args_.at(0)->eval_list(hit_info, query_it));
+ } else {
+ for (const auto& child : args_) {
+ ICING_ASSIGN_OR_RETURN(double v, child->eval(hit_info, query_it));
+ values.push_back(v);
+ }
+ }
+
+ double res = 0;
+ switch (function_type_) {
+ case FunctionType::kLog:
+ if (values.size() == 1) {
+ res = log(values[0]);
+ } else {
+ // argument 0 is log base
+ // argument 1 is the value
+ res = log(values[1]) / log(values[0]);
+ }
+ break;
+ case FunctionType::kPow:
+ res = pow(values[0], values[1]);
+ break;
+ case FunctionType::kMax:
+ if (values.empty()) {
+ return absl_ports::InvalidArgumentError(
+ "Got an empty parameter set in max function");
+ }
+ res = *std::max_element(values.begin(), values.end());
+ break;
+ case FunctionType::kMin:
+ if (values.empty()) {
+ return absl_ports::InvalidArgumentError(
+ "Got an empty parameter set in min function");
+ }
+ res = *std::min_element(values.begin(), values.end());
+ break;
+ case FunctionType::kLen:
+ res = values.size();
+ break;
+ case FunctionType::kSum:
+ res = std::reduce(values.begin(), values.end());
+ break;
+ case FunctionType::kAvg:
+ if (values.empty()) {
+ return absl_ports::InvalidArgumentError(
+ "Got an empty parameter set in avg function.");
+ }
+ res = std::reduce(values.begin(), values.end()) / values.size();
+ break;
+ case FunctionType::kSqrt:
+ res = sqrt(values[0]);
+ break;
+ case FunctionType::kAbs:
+      res = std::abs(values[0]);
+ break;
+ case FunctionType::kSin:
+ res = sin(values[0]);
+ break;
+ case FunctionType::kCos:
+ res = cos(values[0]);
+ break;
+ case FunctionType::kTan:
+ res = tan(values[0]);
+ break;
+ }
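+  // Domain errors (e.g. sqrt(-1) -> NaN, log(0) -> -inf) are caught here so
+  // that the scorer can fall back to the default score.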
+ if (!std::isfinite(res)) {
+ return absl_ports::InvalidArgumentError(
+ "Got a non-finite value while evaluating math function score "
+ "expression.");
+ }
+ return res;
+}
+
+const std::unordered_map<std::string,
+ DocumentFunctionScoreExpression::FunctionType>
+ DocumentFunctionScoreExpression::kFunctionNames = {
+ {"documentScore", FunctionType::kDocumentScore},
+ {"creationTimestamp", FunctionType::kCreationTimestamp},
+ {"usageCount", FunctionType::kUsageCount},
+ {"usageLastUsedTimestamp", FunctionType::kUsageLastUsedTimestamp}};
+
+libtextclassifier3::StatusOr<std::unique_ptr<DocumentFunctionScoreExpression>>
+DocumentFunctionScoreExpression::Create(
+ FunctionType function_type,
+ std::vector<std::unique_ptr<ScoreExpression>> args,
+ const DocumentStore* document_store, double default_score,
+ int64_t current_time_ms) {
+ if (args.empty()) {
+ return absl_ports::InvalidArgumentError(
+ "Document-based functions must have at least one argument.");
+ }
+ ICING_RETURN_IF_ERROR(CheckChildrenNotNull(args));
+
+ if (args[0]->type() != ScoreExpressionType::kDocument) {
+ return absl_ports::InvalidArgumentError(
+ "The first parameter of document-based functions must be \"this\".");
+ }
+ switch (function_type) {
+ case FunctionType::kDocumentScore:
+ [[fallthrough]];
+ case FunctionType::kCreationTimestamp:
+ if (args.size() != 1) {
+ return absl_ports::InvalidArgumentError(
+ "DocumentScore/CreationTimestamp must have 1 argument.");
+ }
+ break;
+ case FunctionType::kUsageCount:
+ [[fallthrough]];
+ case FunctionType::kUsageLastUsedTimestamp:
+ if (args.size() != 2 || args[1]->type() != ScoreExpressionType::kDouble) {
+ return absl_ports::InvalidArgumentError(
+ "UsageCount/UsageLastUsedTimestamp must have 2 arguments. The "
+ "first argument should be \"this\", and the second argument "
+ "should be the usage type.");
+ }
+ break;
+ }
+ return std::unique_ptr<DocumentFunctionScoreExpression>(
+ new DocumentFunctionScoreExpression(function_type, std::move(args),
+ document_store, default_score,
+ current_time_ms));
+}
+
+libtextclassifier3::StatusOr<double> DocumentFunctionScoreExpression::eval(
+ const DocHitInfo& hit_info, const DocHitInfoIterator* query_it) const {
+ switch (function_type_) {
+ case FunctionType::kDocumentScore:
+ [[fallthrough]];
+ case FunctionType::kCreationTimestamp: {
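+      // Passing default_score_ as the extra argument makes
+      // ICING_ASSIGN_OR_RETURN return default_score_, rather than the error,
+      // if the score data lookup fails.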
+ ICING_ASSIGN_OR_RETURN(DocumentAssociatedScoreData score_data,
+ document_store_.GetDocumentAssociatedScoreData(
+ hit_info.document_id()),
+ default_score_);
+ if (function_type_ == FunctionType::kDocumentScore) {
+ return static_cast<double>(score_data.document_score());
+ }
+ return static_cast<double>(score_data.creation_timestamp_ms());
+ }
+ case FunctionType::kUsageCount:
+ [[fallthrough]];
+ case FunctionType::kUsageLastUsedTimestamp: {
+ ICING_ASSIGN_OR_RETURN(double raw_usage_type,
+ args_[1]->eval(hit_info, query_it));
+      int usage_type = static_cast<int>(raw_usage_type);
+      if (usage_type < 1 || usage_type > 3 || raw_usage_type != usage_type) {
+        return absl_ports::InvalidArgumentError(
+            "Usage type must be an integer from 1 to 3.");
+      }
+ std::optional<UsageStore::UsageScores> usage_scores =
+ document_store_.GetUsageScores(hit_info.document_id(),
+ current_time_ms_);
+ if (!usage_scores) {
+ // If there's no UsageScores entry present for this doc, then just
+ // treat it as a default instance.
+ usage_scores = UsageStore::UsageScores();
+ }
+ if (function_type_ == FunctionType::kUsageCount) {
+ if (usage_type == 1) {
+ return usage_scores->usage_type1_count;
+ } else if (usage_type == 2) {
+ return usage_scores->usage_type2_count;
+ } else {
+ return usage_scores->usage_type3_count;
+ }
+ }
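+      // Last-used timestamps are stored in seconds; convert to milliseconds.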
+ if (usage_type == 1) {
+ return usage_scores->usage_type1_last_used_timestamp_s * 1000.0;
+ } else if (usage_type == 2) {
+ return usage_scores->usage_type2_last_used_timestamp_s * 1000.0;
+ } else {
+ return usage_scores->usage_type3_last_used_timestamp_s * 1000.0;
+ }
+ }
+ }
+}
+
+libtextclassifier3::StatusOr<
+ std::unique_ptr<RelevanceScoreFunctionScoreExpression>>
+RelevanceScoreFunctionScoreExpression::Create(
+ std::vector<std::unique_ptr<ScoreExpression>> args,
+ Bm25fCalculator* bm25f_calculator, double default_score) {
+ if (args.size() != 1) {
+ return absl_ports::InvalidArgumentError(
+ "relevanceScore must have 1 argument.");
+ }
+ ICING_RETURN_IF_ERROR(CheckChildrenNotNull(args));
+
+ if (args[0]->type() != ScoreExpressionType::kDocument) {
+ return absl_ports::InvalidArgumentError(
+ "relevanceScore must take \"this\" as its argument.");
+ }
+ return std::unique_ptr<RelevanceScoreFunctionScoreExpression>(
+ new RelevanceScoreFunctionScoreExpression(bm25f_calculator,
+ default_score));
+}
+
+libtextclassifier3::StatusOr<double>
+RelevanceScoreFunctionScoreExpression::eval(
+ const DocHitInfo& hit_info, const DocHitInfoIterator* query_it) const {
+ if (query_it == nullptr) {
+ return default_score_;
+ }
+ return static_cast<double>(
+ bm25f_calculator_.ComputeScore(query_it, hit_info, default_score_));
+}
+
+libtextclassifier3::StatusOr<
+ std::unique_ptr<ChildrenRankingSignalsFunctionScoreExpression>>
+ChildrenRankingSignalsFunctionScoreExpression::Create(
+ std::vector<std::unique_ptr<ScoreExpression>> args,
+ const JoinChildrenFetcher* join_children_fetcher) {
+ if (args.size() != 1) {
+ return absl_ports::InvalidArgumentError(
+ "childrenRankingSignals must have 1 argument.");
+ }
+ ICING_RETURN_IF_ERROR(CheckChildrenNotNull(args));
+
+ if (args[0]->type() != ScoreExpressionType::kDocument) {
+ return absl_ports::InvalidArgumentError(
+ "childrenRankingSignals must take \"this\" as its argument.");
+ }
+ if (join_children_fetcher == nullptr) {
+ return absl_ports::InvalidArgumentError(
+ "childrenRankingSignals must only be used with join, but "
+ "JoinChildrenFetcher "
+ "is not provided.");
+ }
+ return std::unique_ptr<ChildrenRankingSignalsFunctionScoreExpression>(
+ new ChildrenRankingSignalsFunctionScoreExpression(
+ *join_children_fetcher));
+}
+
+libtextclassifier3::StatusOr<std::vector<double>>
+ChildrenRankingSignalsFunctionScoreExpression::eval_list(
+ const DocHitInfo& hit_info, const DocHitInfoIterator* query_it) const {
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<ScoredDocumentHit> children_hits,
+ join_children_fetcher_.GetChildren(hit_info.document_id()));
+ std::vector<double> children_scores;
+ children_scores.reserve(children_hits.size());
+ for (const ScoredDocumentHit& child_hit : children_hits) {
+ children_scores.push_back(child_hit.score());
+ }
+ return std::move(children_scores);
+}
+
+libtextclassifier3::StatusOr<
+ std::unique_ptr<PropertyWeightsFunctionScoreExpression>>
+PropertyWeightsFunctionScoreExpression::Create(
+ std::vector<std::unique_ptr<ScoreExpression>> args,
+ const DocumentStore* document_store, const SectionWeights* section_weights,
+ int64_t current_time_ms) {
+ if (args.size() != 1) {
+ return absl_ports::InvalidArgumentError(
+ "propertyWeights must have 1 argument.");
+ }
+ ICING_RETURN_IF_ERROR(CheckChildrenNotNull(args));
+
+ if (args[0]->type() != ScoreExpressionType::kDocument) {
+ return absl_ports::InvalidArgumentError(
+ "propertyWeights must take \"this\" as its argument.");
+ }
+ return std::unique_ptr<PropertyWeightsFunctionScoreExpression>(
+ new PropertyWeightsFunctionScoreExpression(
+ document_store, section_weights, current_time_ms));
+}
+
+libtextclassifier3::StatusOr<std::vector<double>>
+PropertyWeightsFunctionScoreExpression::eval_list(
+ const DocHitInfo& hit_info, const DocHitInfoIterator*) const {
+ std::vector<double> weights;
+ SectionIdMask sections = hit_info.hit_section_ids_mask();
+ SchemaTypeId schema_type_id = GetSchemaTypeId(hit_info.document_id());
+
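+  // Iterate over the set bits of the section mask: __builtin_ctzll yields the
+  // lowest matching section id, which is then cleared from the mask.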
+ while (sections != 0) {
+ SectionId section_id = __builtin_ctzll(sections);
+ sections &= ~(UINT64_C(1) << section_id);
+ weights.push_back(section_weights_.GetNormalizedSectionWeight(
+ schema_type_id, section_id));
+ }
+ return weights;
+}
+
+SchemaTypeId PropertyWeightsFunctionScoreExpression::GetSchemaTypeId(
+ DocumentId document_id) const {
+ auto filter_data_optional =
+ document_store_.GetAliveDocumentFilterData(document_id, current_time_ms_);
+ if (!filter_data_optional) {
+ // This should never happen. The only failure case for
+ // GetAliveDocumentFilterData is if the document_id is outside of the range
+ // of allocated document_ids, which shouldn't be possible since we're
+ // getting this document_id from the posting lists.
+ ICING_LOG(WARNING) << "No document filter data for document ["
+ << document_id << "]";
+ return kInvalidSchemaTypeId;
+ }
+ return filter_data_optional.value().schema_type_id();
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/advanced_scoring/score-expression.h b/icing/scoring/advanced_scoring/score-expression.h
new file mode 100644
index 0000000..08d7997
--- /dev/null
+++ b/icing/scoring/advanced_scoring/score-expression.h
@@ -0,0 +1,348 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_ADVANCED_SCORING_SCORE_EXPRESSION_H_
+#define ICING_SCORING_ADVANCED_SCORING_SCORE_EXPRESSION_H_
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/join/join-children-fetcher.h"
+#include "icing/scoring/bm25f-calculator.h"
+#include "icing/store/document-store.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+enum class ScoreExpressionType {
+ kDouble,
+ kDoubleList,
+  kDocument  // Only "this" evaluates to the document type.
+};
+
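+// Base class of the expression tree built from an advanced scoring
+// expression. Every node declares its static type via type(). The default
+// eval() and eval_list() implementations act as a safety net: a node must
+// only ever be evaluated as the type it declares.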
+class ScoreExpression {
+ public:
+ virtual ~ScoreExpression() = default;
+
+  // Evaluates the score expression to a double for the current document.
+ //
+ // RETURNS:
+ // - The evaluated result as a double on success.
+ // - INVALID_ARGUMENT if a non-finite value is reached while evaluating the
+ // expression.
+ // - INTERNAL if there are inconsistencies.
+ virtual libtextclassifier3::StatusOr<double> eval(
+ const DocHitInfo& hit_info, const DocHitInfoIterator* query_it) const {
+ if (type() == ScoreExpressionType::kDouble) {
+ return absl_ports::UnimplementedError(
+ "All ScoreExpressions of type Double must provide their own "
+ "implementation of eval!");
+ }
+ return absl_ports::InternalError(
+ "Runtime type error: the expression should never be evaluated to a "
+ "double. There must be inconsistencies in the static type checking.");
+ }
+
+ virtual libtextclassifier3::StatusOr<std::vector<double>> eval_list(
+ const DocHitInfo& hit_info, const DocHitInfoIterator* query_it) const {
+ if (type() == ScoreExpressionType::kDoubleList) {
+ return absl_ports::UnimplementedError(
+ "All ScoreExpressions of type Double List must provide their own "
+ "implementation of eval_list!");
+ }
+ return absl_ports::InternalError(
+ "Runtime type error: the expression should never be evaluated to a "
+ "double list. There must be inconsistencies in the static type "
+ "checking.");
+ }
+
+  // Indicates the type to which the current expression evaluates.
+ virtual ScoreExpressionType type() const = 0;
+
+  // Indicates whether the current expression is a constant double.
+ // Returns true if and only if the object is of ConstantScoreExpression type.
+ virtual bool is_constant_double() const { return false; }
+};
+
+class ThisExpression : public ScoreExpression {
+ public:
+ static std::unique_ptr<ThisExpression> Create() {
+ return std::unique_ptr<ThisExpression>(new ThisExpression());
+ }
+
+ ScoreExpressionType type() const override {
+ return ScoreExpressionType::kDocument;
+ }
+
+ private:
+ ThisExpression() = default;
+};
+
+class ConstantScoreExpression : public ScoreExpression {
+ public:
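+  // Accepts a StatusOr so that constant folding (see the Create factories of
+  // OperatorScoreExpression and MathFunctionScoreExpression) can store an
+  // evaluation error here and surface it when eval() is called.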
+ static std::unique_ptr<ConstantScoreExpression> Create(
+ libtextclassifier3::StatusOr<double> c) {
+ return std::unique_ptr<ConstantScoreExpression>(
+ new ConstantScoreExpression(c));
+ }
+
+ libtextclassifier3::StatusOr<double> eval(
+ const DocHitInfo&, const DocHitInfoIterator*) const override {
+ return c_;
+ }
+
+ ScoreExpressionType type() const override {
+ return ScoreExpressionType::kDouble;
+ }
+
+ bool is_constant_double() const override { return true; }
+
+ private:
+ explicit ConstantScoreExpression(libtextclassifier3::StatusOr<double> c)
+ : c_(c) {}
+
+ libtextclassifier3::StatusOr<double> c_;
+};
+
+class OperatorScoreExpression : public ScoreExpression {
+ public:
+ enum class OperatorType { kPlus, kMinus, kNegative, kTimes, kDiv };
+
+ // RETURNS:
+ // - An OperatorScoreExpression instance on success if not simplifiable.
+ // - A ConstantScoreExpression instance on success if simplifiable.
+ // - FAILED_PRECONDITION on any null pointer in children.
+ // - INVALID_ARGUMENT on type errors.
+ static libtextclassifier3::StatusOr<std::unique_ptr<ScoreExpression>> Create(
+ OperatorType op, std::vector<std::unique_ptr<ScoreExpression>> children);
+
+ libtextclassifier3::StatusOr<double> eval(
+ const DocHitInfo& hit_info,
+ const DocHitInfoIterator* query_it) const override;
+
+ ScoreExpressionType type() const override {
+ return ScoreExpressionType::kDouble;
+ }
+
+ private:
+ explicit OperatorScoreExpression(
+ OperatorType op, std::vector<std::unique_ptr<ScoreExpression>> children)
+ : op_(op), children_(std::move(children)) {}
+
+ OperatorType op_;
+ std::vector<std::unique_ptr<ScoreExpression>> children_;
+};
+
+class MathFunctionScoreExpression : public ScoreExpression {
+ public:
+ enum class FunctionType {
+ kLog,
+ kPow,
+ kMax,
+ kMin,
+ kLen,
+ kSum,
+ kAvg,
+ kSqrt,
+ kAbs,
+ kSin,
+ kCos,
+ kTan
+ };
+
+ static const std::unordered_map<std::string, FunctionType> kFunctionNames;
+
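+  // These functions accept either a variable number of double arguments, or a
+  // single double-list argument, e.g. max(this.childrenRankingSignals()).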
+ static const std::unordered_set<FunctionType> kVariableArgumentsFunctions;
+
+ // RETURNS:
+ // - A MathFunctionScoreExpression instance on success if not simplifiable.
+ // - A ConstantScoreExpression instance on success if simplifiable.
+ // - FAILED_PRECONDITION on any null pointer in args.
+ // - INVALID_ARGUMENT on type errors.
+ static libtextclassifier3::StatusOr<std::unique_ptr<ScoreExpression>> Create(
+ FunctionType function_type,
+ std::vector<std::unique_ptr<ScoreExpression>> args);
+
+ libtextclassifier3::StatusOr<double> eval(
+ const DocHitInfo& hit_info,
+ const DocHitInfoIterator* query_it) const override;
+
+ ScoreExpressionType type() const override {
+ return ScoreExpressionType::kDouble;
+ }
+
+ private:
+ explicit MathFunctionScoreExpression(
+ FunctionType function_type,
+ std::vector<std::unique_ptr<ScoreExpression>> args)
+ : function_type_(function_type), args_(std::move(args)) {}
+
+ FunctionType function_type_;
+ std::vector<std::unique_ptr<ScoreExpression>> args_;
+};
+
+class DocumentFunctionScoreExpression : public ScoreExpression {
+ public:
+ enum class FunctionType {
+ kDocumentScore,
+ kCreationTimestamp,
+ kUsageCount,
+ kUsageLastUsedTimestamp,
+ };
+
+ static const std::unordered_map<std::string, FunctionType> kFunctionNames;
+
+ // RETURNS:
+ // - A DocumentFunctionScoreExpression instance on success.
+ // - FAILED_PRECONDITION on any null pointer in args.
+ // - INVALID_ARGUMENT on type errors.
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<DocumentFunctionScoreExpression>>
+ Create(FunctionType function_type,
+ std::vector<std::unique_ptr<ScoreExpression>> args,
+ const DocumentStore* document_store, double default_score,
+ int64_t current_time_ms);
+
+ libtextclassifier3::StatusOr<double> eval(
+ const DocHitInfo& hit_info,
+ const DocHitInfoIterator* query_it) const override;
+
+ ScoreExpressionType type() const override {
+ return ScoreExpressionType::kDouble;
+ }
+
+ private:
+ explicit DocumentFunctionScoreExpression(
+ FunctionType function_type,
+ std::vector<std::unique_ptr<ScoreExpression>> args,
+ const DocumentStore* document_store, double default_score,
+ int64_t current_time_ms)
+ : args_(std::move(args)),
+ document_store_(*document_store),
+ default_score_(default_score),
+ function_type_(function_type),
+ current_time_ms_(current_time_ms) {}
+
+ std::vector<std::unique_ptr<ScoreExpression>> args_;
+ const DocumentStore& document_store_;
+ double default_score_;
+ FunctionType function_type_;
+ int64_t current_time_ms_;
+};
+
+class RelevanceScoreFunctionScoreExpression : public ScoreExpression {
+ public:
+ static constexpr std::string_view kFunctionName = "relevanceScore";
+
+ // RETURNS:
+ // - A RelevanceScoreFunctionScoreExpression instance on success.
+ // - FAILED_PRECONDITION on any null pointer in args.
+ // - INVALID_ARGUMENT on type errors.
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<RelevanceScoreFunctionScoreExpression>>
+ Create(std::vector<std::unique_ptr<ScoreExpression>> args,
+ Bm25fCalculator* bm25f_calculator, double default_score);
+
+ libtextclassifier3::StatusOr<double> eval(
+ const DocHitInfo& hit_info,
+ const DocHitInfoIterator* query_it) const override;
+
+ ScoreExpressionType type() const override {
+ return ScoreExpressionType::kDouble;
+ }
+
+ private:
+ explicit RelevanceScoreFunctionScoreExpression(
+ Bm25fCalculator* bm25f_calculator, double default_score)
+ : bm25f_calculator_(*bm25f_calculator), default_score_(default_score) {}
+
+ Bm25fCalculator& bm25f_calculator_;
+ double default_score_;
+};
+
+class ChildrenRankingSignalsFunctionScoreExpression : public ScoreExpression {
+ public:
+ static constexpr std::string_view kFunctionName = "childrenRankingSignals";
+
+ // RETURNS:
+ // - A ChildrenRankingSignalsFunctionScoreExpression instance on success.
+ // - FAILED_PRECONDITION on any null pointer in children.
+ // - INVALID_ARGUMENT on type errors.
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<ChildrenRankingSignalsFunctionScoreExpression>>
+ Create(std::vector<std::unique_ptr<ScoreExpression>> args,
+ const JoinChildrenFetcher* join_children_fetcher);
+
+ libtextclassifier3::StatusOr<std::vector<double>> eval_list(
+ const DocHitInfo& hit_info,
+ const DocHitInfoIterator* query_it) const override;
+
+ ScoreExpressionType type() const override {
+ return ScoreExpressionType::kDoubleList;
+ }
+
+ private:
+ explicit ChildrenRankingSignalsFunctionScoreExpression(
+ const JoinChildrenFetcher& join_children_fetcher)
+ : join_children_fetcher_(join_children_fetcher) {}
+ const JoinChildrenFetcher& join_children_fetcher_;
+};
+
+class PropertyWeightsFunctionScoreExpression : public ScoreExpression {
+ public:
+ static constexpr std::string_view kFunctionName = "propertyWeights";
+
+ // RETURNS:
+ // - A PropertyWeightsFunctionScoreExpression instance on success.
+ // - FAILED_PRECONDITION on any null pointer in children.
+ // - INVALID_ARGUMENT on type errors.
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<PropertyWeightsFunctionScoreExpression>>
+ Create(std::vector<std::unique_ptr<ScoreExpression>> args,
+ const DocumentStore* document_store,
+ const SectionWeights* section_weights, int64_t current_time_ms);
+
+ libtextclassifier3::StatusOr<std::vector<double>> eval_list(
+ const DocHitInfo& hit_info, const DocHitInfoIterator*) const override;
+
+ ScoreExpressionType type() const override {
+ return ScoreExpressionType::kDoubleList;
+ }
+
+ SchemaTypeId GetSchemaTypeId(DocumentId document_id) const;
+
+ private:
+ explicit PropertyWeightsFunctionScoreExpression(
+ const DocumentStore* document_store,
+ const SectionWeights* section_weights, int64_t current_time_ms)
+ : document_store_(*document_store),
+ section_weights_(*section_weights),
+ current_time_ms_(current_time_ms) {}
+ const DocumentStore& document_store_;
+ const SectionWeights& section_weights_;
+ int64_t current_time_ms_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_ADVANCED_SCORING_SCORE_EXPRESSION_H_
diff --git a/icing/scoring/advanced_scoring/score-expression_test.cc b/icing/scoring/advanced_scoring/score-expression_test.cc
new file mode 100644
index 0000000..588090d
--- /dev/null
+++ b/icing/scoring/advanced_scoring/score-expression_test.cc
@@ -0,0 +1,353 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/advanced_scoring/score-expression.h"
+
+#include <cmath>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+
+class NonConstantScoreExpression : public ScoreExpression {
+ public:
+ static std::unique_ptr<NonConstantScoreExpression> Create() {
+ return std::make_unique<NonConstantScoreExpression>();
+ }
+
+ libtextclassifier3::StatusOr<double> eval(
+ const DocHitInfo &, const DocHitInfoIterator *) const override {
+ return 0;
+ }
+
+ ScoreExpressionType type() const override {
+ return ScoreExpressionType::kDouble;
+ }
+
+ bool is_constant_double() const override { return false; }
+};
+
+class ListScoreExpression : public ScoreExpression {
+ public:
+ static std::unique_ptr<ListScoreExpression> Create(
+ const std::vector<double> &values) {
+ std::unique_ptr<ListScoreExpression> res =
+ std::make_unique<ListScoreExpression>();
+ res->values = values;
+ return res;
+ }
+
+ libtextclassifier3::StatusOr<std::vector<double>> eval_list(
+ const DocHitInfo &, const DocHitInfoIterator *) const override {
+ return values;
+ }
+
+ ScoreExpressionType type() const override {
+ return ScoreExpressionType::kDoubleList;
+ }
+
+ std::vector<double> values;
+};
+
+template <typename... Args>
+std::vector<std::unique_ptr<ScoreExpression>> MakeChildren(Args... args) {
+ std::vector<std::unique_ptr<ScoreExpression>> children;
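+  // C++17 fold expression over the comma operator: pushes each argument into
+  // the vector in order.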
+ (children.push_back(std::move(args)), ...);
+ return children;
+}
+
+TEST(ScoreExpressionTest, OperatorSimplification) {
+ // 1 + 1 = 2
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoreExpression> expression,
+ OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kPlus,
+ MakeChildren(ConstantScoreExpression::Create(1),
+ ConstantScoreExpression::Create(1))));
+ ASSERT_TRUE(expression->is_constant_double());
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(2)));
+
+ // 1 - 2 - 3 = -4
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kMinus,
+ MakeChildren(ConstantScoreExpression::Create(1),
+ ConstantScoreExpression::Create(2),
+ ConstantScoreExpression::Create(3))));
+ ASSERT_TRUE(expression->is_constant_double());
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(-4)));
+
+ // 1 * 2 * 3 * 4 = 24
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kTimes,
+ MakeChildren(ConstantScoreExpression::Create(1),
+ ConstantScoreExpression::Create(2),
+ ConstantScoreExpression::Create(3),
+ ConstantScoreExpression::Create(4))));
+ ASSERT_TRUE(expression->is_constant_double());
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(24)));
+
+ // 1 / 2 / 4 = 0.125
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kDiv,
+ MakeChildren(ConstantScoreExpression::Create(1),
+ ConstantScoreExpression::Create(2),
+ ConstantScoreExpression::Create(4))));
+ ASSERT_TRUE(expression->is_constant_double());
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(0.125)));
+
+ // -(2) = -2
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kNegative,
+ MakeChildren(ConstantScoreExpression::Create(2))));
+ ASSERT_TRUE(expression->is_constant_double());
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(-2)));
+}
+
+TEST(ScoreExpressionTest, MathFunctionSimplification) {
+ // pow(2, 2) = 4
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoreExpression> expression,
+ MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kPow,
+ MakeChildren(ConstantScoreExpression::Create(2),
+ ConstantScoreExpression::Create(2))));
+ ASSERT_TRUE(expression->is_constant_double());
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(4)));
+
+ // abs(-2) = 2
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kAbs,
+ MakeChildren(ConstantScoreExpression::Create(-2))));
+ ASSERT_TRUE(expression->is_constant_double());
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(2)));
+
+ // log(e) = 1
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kLog,
+ MakeChildren(ConstantScoreExpression::Create(M_E))));
+ ASSERT_TRUE(expression->is_constant_double());
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(1)));
+}
+
+TEST(ScoreExpressionTest, CannotSimplifyNonConstant) {
+ // 1 + non_constant = non_constant
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoreExpression> expression,
+ OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kPlus,
+ MakeChildren(ConstantScoreExpression::Create(1),
+ NonConstantScoreExpression::Create())));
+ ASSERT_FALSE(expression->is_constant_double());
+
+ // non_constant * non_constant = non_constant
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kTimes,
+ MakeChildren(NonConstantScoreExpression::Create(),
+ NonConstantScoreExpression::Create())));
+ ASSERT_FALSE(expression->is_constant_double());
+
+ // -(non_constant) = non_constant
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kNegative,
+ MakeChildren(NonConstantScoreExpression::Create())));
+ ASSERT_FALSE(expression->is_constant_double());
+
+ // pow(non_constant, 2) = non_constant
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kPow,
+ MakeChildren(NonConstantScoreExpression::Create(),
+ ConstantScoreExpression::Create(2))));
+ ASSERT_FALSE(expression->is_constant_double());
+
+ // abs(non_constant) = non_constant
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kAbs,
+ MakeChildren(NonConstantScoreExpression::Create())));
+ ASSERT_FALSE(expression->is_constant_double());
+
+ // log(non_constant) = non_constant
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kLog,
+ MakeChildren(NonConstantScoreExpression::Create())));
+ ASSERT_FALSE(expression->is_constant_double());
+}
+
+TEST(ScoreExpressionTest, MathFunctionsWithListTypeArgument) {
+ // max({1, 2, 3}) = 3
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoreExpression> expression,
+ MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kMax,
+ MakeChildren(ListScoreExpression::Create({1, 2, 3}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(3)));
+
+ // min({1, 2, 3}) = 1
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kMin,
+ MakeChildren(ListScoreExpression::Create({1, 2, 3}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(1)));
+
+ // len({1, 2, 3}) = 3
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kLen,
+ MakeChildren(ListScoreExpression::Create({1, 2, 3}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(3)));
+
+ // sum({1, 2, 3}) = 6
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kSum,
+ MakeChildren(ListScoreExpression::Create({1, 2, 3}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(6)));
+
+ // avg({1, 2, 3}) = 2
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kAvg,
+ MakeChildren(ListScoreExpression::Create({1, 2, 3}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(2)));
+
+ // max({4}) = 4
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kMax,
+ MakeChildren(ListScoreExpression::Create({4}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(4)));
+
+ // min({5}) = 5
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kMin,
+ MakeChildren(ListScoreExpression::Create({5}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(5)));
+
+ // len({6}) = 1
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kLen,
+ MakeChildren(ListScoreExpression::Create({6}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(1)));
+
+ // sum({7}) = 7
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kSum,
+ MakeChildren(ListScoreExpression::Create({7}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(7)));
+
+ // avg({7}) = 7
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kAvg,
+ MakeChildren(ListScoreExpression::Create({7}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(7)));
+
+ // len({}) = 0
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kLen,
+ MakeChildren(ListScoreExpression::Create({}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(0)));
+
+ // sum({}) = 0
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kSum,
+ MakeChildren(ListScoreExpression::Create({}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr), IsOkAndHolds(Eq(0)));
+}
+
+TEST(ScoreExpressionTest, MathFunctionsWithListTypeArgumentError) {
+ // max({}) = evaluation error, since max on empty list does not produce a
+ // valid result.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoreExpression> expression,
+ MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kMax,
+ MakeChildren(ListScoreExpression::Create({}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // avg({}) = evaluation error, since avg on empty list does not produce a
+ // valid result.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ expression, MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kAvg,
+ MakeChildren(ListScoreExpression::Create({}))));
+ EXPECT_THAT(expression->eval(DocHitInfo(), nullptr),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // max(1, {2}) = type error, since max must take either n > 0 parameters of
+ // type double, or a single parameter of type list.
+ EXPECT_THAT(MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kMax,
+ MakeChildren(ConstantScoreExpression::Create(1),
+ ListScoreExpression::Create({2}))),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // min({1}, {2}) = type error, since min must take either n > 0 parameters of
+ // type double, or a single parameter of type list.
+ EXPECT_THAT(MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kMin,
+ MakeChildren(ListScoreExpression::Create({1}),
+ ListScoreExpression::Create({2}))),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ // sin({1}) = type error, since sin does not support list type parameters.
+ EXPECT_THAT(MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kSin,
+ MakeChildren(ListScoreExpression::Create({1}))),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST(ScoreExpressionTest, ChildrenCannotBeNull) {
+ EXPECT_THAT(OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kPlus,
+ MakeChildren(ConstantScoreExpression::Create(1), nullptr)),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::FunctionType::kPow,
+ MakeChildren(ConstantScoreExpression::Create(2), nullptr)),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/advanced_scoring/scoring-visitor.cc b/icing/scoring/advanced_scoring/scoring-visitor.cc
new file mode 100644
index 0000000..e2b24a2
--- /dev/null
+++ b/icing/scoring/advanced_scoring/scoring-visitor.cc
@@ -0,0 +1,191 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/advanced_scoring/scoring-visitor.h"
+
+#include "icing/absl_ports/str_cat.h"
+
+namespace icing {
+namespace lib {
+
+void ScoringVisitor::VisitFunctionName(const FunctionNameNode* node) {
+ pending_error_ = absl_ports::InternalError(
+ "FunctionNameNode should be handled in VisitFunction!");
+}
+
+void ScoringVisitor::VisitString(const StringNode* node) {
+ pending_error_ =
+ absl_ports::InvalidArgumentError("Scoring does not support String!");
+}
+
+void ScoringVisitor::VisitText(const TextNode* node) {
+ pending_error_ =
+ absl_ports::InternalError("TextNode should be handled in VisitMember!");
+}
+
+void ScoringVisitor::VisitMember(const MemberNode* node) {
+ bool is_member_function = node->function() != nullptr;
+ if (is_member_function) {
+ // If the member node represents a member function, it must have only one
+ // child for "this".
+ if (node->children().size() != 1 ||
+ node->children()[0]->value() != "this") {
+ pending_error_ = absl_ports::InvalidArgumentError(
+ "Member functions can only be called via \"this\".");
+ return;
+ }
+ return VisitFunctionHelper(node->function(), is_member_function);
+ }
+ std::string value;
+ if (node->children().size() == 1) {
+    // If a member has only one child, then it represents an integer literal.
+ value = node->children()[0]->value();
+ } else if (node->children().size() == 2) {
+    // If a member has two children, then it can only represent a
+    // floating-point number, so we join them with "." to build the numeric
+    // literal.
+ value = absl_ports::StrCat(node->children()[0]->value(), ".",
+ node->children()[1]->value());
+ } else {
+ pending_error_ = absl_ports::InvalidArgumentError(
+ "MemberNode must have 1 or 2 children.");
+ return;
+ }
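+  // For example (illustrative): a MemberNode with children ["3", "14"] joins
+  // to value = "3.14", which std::strtod parses into the double 3.14 below.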
+ char* end;
+ double number = std::strtod(value.c_str(), &end);
+ if (end != value.c_str() + value.length()) {
+ // While it would be doable to support property references in the scoring
+ // grammar, we currently don't have an efficient way to support such a
+ // lookup (we'd have to read each document). As such, it's simpler to just
+ // restrict the scoring language to not include properties.
+ pending_error_ = absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("Expect a numeric literal, but got ", value));
+ return;
+ }
+ stack_.push_back(ConstantScoreExpression::Create(number));
+}
+
+void ScoringVisitor::VisitFunctionHelper(const FunctionNode* node,
+ bool is_member_function) {
+ std::vector<std::unique_ptr<ScoreExpression>> args;
+ if (is_member_function) {
+ args.push_back(ThisExpression::Create());
+ }
+ for (const auto& arg : node->args()) {
+ arg->Accept(this);
+ if (has_pending_error()) {
+ return;
+ }
+ args.push_back(pop_stack());
+ }
+ const std::string& function_name = node->function_name()->value();
+ libtextclassifier3::StatusOr<std::unique_ptr<ScoreExpression>> expression =
+ absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("Unknown function: ", function_name));
+
+ if (DocumentFunctionScoreExpression::kFunctionNames.find(function_name) !=
+ DocumentFunctionScoreExpression::kFunctionNames.end()) {
+ // Document-based function
+ expression = DocumentFunctionScoreExpression::Create(
+ DocumentFunctionScoreExpression::kFunctionNames.at(function_name),
+ std::move(args), &document_store_, default_score_, current_time_ms_);
+ } else if (function_name ==
+ RelevanceScoreFunctionScoreExpression::kFunctionName) {
+ // relevanceScore function
+ expression = RelevanceScoreFunctionScoreExpression::Create(
+ std::move(args), &bm25f_calculator_, default_score_);
+ } else if (function_name ==
+ ChildrenRankingSignalsFunctionScoreExpression::kFunctionName) {
+ // childrenRankingSignals function
+ expression = ChildrenRankingSignalsFunctionScoreExpression::Create(
+ std::move(args), join_children_fetcher_);
+ } else if (function_name ==
+ PropertyWeightsFunctionScoreExpression::kFunctionName) {
+ // propertyWeights function
+ expression = PropertyWeightsFunctionScoreExpression::Create(
+ std::move(args), &document_store_, &section_weights_, current_time_ms_);
+ } else if (MathFunctionScoreExpression::kFunctionNames.find(function_name) !=
+ MathFunctionScoreExpression::kFunctionNames.end()) {
+ // Math functions
+ expression = MathFunctionScoreExpression::Create(
+ MathFunctionScoreExpression::kFunctionNames.at(function_name),
+ std::move(args));
+ }
+
+ if (!expression.ok()) {
+ pending_error_ = expression.status();
+ return;
+ }
+ stack_.push_back(std::move(expression).ValueOrDie());
+}
+
+void ScoringVisitor::VisitUnaryOperator(const UnaryOperatorNode* node) {
+ if (node->operator_text() != "MINUS") {
+ pending_error_ = absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("Unknown unary operator: ", node->operator_text()));
+ return;
+ }
+ node->child()->Accept(this);
+ if (has_pending_error()) {
+ return;
+ }
+ std::vector<std::unique_ptr<ScoreExpression>> children;
+ children.push_back(pop_stack());
+
+ libtextclassifier3::StatusOr<std::unique_ptr<ScoreExpression>> expression =
+ OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kNegative,
+ std::move(children));
+ if (!expression.ok()) {
+ pending_error_ = expression.status();
+ return;
+ }
+ stack_.push_back(std::move(expression).ValueOrDie());
+}
+
+void ScoringVisitor::VisitNaryOperator(const NaryOperatorNode* node) {
+ std::vector<std::unique_ptr<ScoreExpression>> children;
+ for (const auto& arg : node->children()) {
+ arg->Accept(this);
+ if (has_pending_error()) {
+ return;
+ }
+ children.push_back(pop_stack());
+ }
+
+ libtextclassifier3::StatusOr<std::unique_ptr<ScoreExpression>> expression =
+ absl_ports::InvalidArgumentError(
+ absl_ports::StrCat("Unknown Nary operator: ", node->operator_text()));
+
+ if (node->operator_text() == "PLUS") {
+ expression = OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kPlus, std::move(children));
+ } else if (node->operator_text() == "MINUS") {
+ expression = OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kMinus, std::move(children));
+ } else if (node->operator_text() == "TIMES") {
+ expression = OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kTimes, std::move(children));
+ } else if (node->operator_text() == "DIV") {
+ expression = OperatorScoreExpression::Create(
+ OperatorScoreExpression::OperatorType::kDiv, std::move(children));
+ }
+ if (!expression.ok()) {
+ pending_error_ = expression.status();
+ return;
+ }
+ stack_.push_back(std::move(expression).ValueOrDie());
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/advanced_scoring/scoring-visitor.h b/icing/scoring/advanced_scoring/scoring-visitor.h
new file mode 100644
index 0000000..cfee25b
--- /dev/null
+++ b/icing/scoring/advanced_scoring/scoring-visitor.h
@@ -0,0 +1,108 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_ADVANCED_SCORING_SCORING_VISITOR_H_
+#define ICING_SCORING_ADVANCED_SCORING_SCORING_VISITOR_H_
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/join/join-children-fetcher.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/query/advanced_query_parser/abstract-syntax-tree.h"
+#include "icing/scoring/advanced_scoring/score-expression.h"
+#include "icing/scoring/bm25f-calculator.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
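+// Usage sketch (illustrative; assumes `root` is the root node of an AST
+// produced by the advanced query parser):
+//
+//   ScoringVisitor visitor(default_score, &document_store, &schema_store,
+//                          &section_weights, &bm25f_calculator,
+//                          /*join_children_fetcher=*/nullptr, current_time_ms);
+//   root->Accept(&visitor);
+//   libtextclassifier3::StatusOr<std::unique_ptr<ScoreExpression>>
+//       expression = std::move(visitor).Expression();
+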
+class ScoringVisitor : public AbstractSyntaxTreeVisitor {
+ public:
+ explicit ScoringVisitor(double default_score,
+ const DocumentStore* document_store,
+ const SchemaStore* schema_store,
+ SectionWeights* section_weights,
+ Bm25fCalculator* bm25f_calculator,
+ const JoinChildrenFetcher* join_children_fetcher,
+ int64_t current_time_ms)
+ : default_score_(default_score),
+ document_store_(*document_store),
+ schema_store_(*schema_store),
+ section_weights_(*section_weights),
+ bm25f_calculator_(*bm25f_calculator),
+ join_children_fetcher_(join_children_fetcher),
+ current_time_ms_(current_time_ms) {}
+
+ void VisitFunctionName(const FunctionNameNode* node) override;
+ void VisitString(const StringNode* node) override;
+ void VisitText(const TextNode* node) override;
+ void VisitMember(const MemberNode* node) override;
+
+ void VisitFunction(const FunctionNode* node) override {
+ return VisitFunctionHelper(node, /*is_member_function=*/false);
+ }
+
+ void VisitUnaryOperator(const UnaryOperatorNode* node) override;
+ void VisitNaryOperator(const NaryOperatorNode* node) override;
+
+ // RETURNS:
+  //  - A ScoreExpression instance able to evaluate the expression on success.
+ // - INVALID_ARGUMENT if the AST does not conform to supported expressions,
+ // such as type errors.
+ // - INTERNAL if there are inconsistencies.
+ libtextclassifier3::StatusOr<std::unique_ptr<ScoreExpression>>
+ Expression() && {
+ if (has_pending_error()) {
+ return pending_error_;
+ }
+ if (stack_.size() != 1) {
+      return absl_ports::InternalError(IcingStringUtil::StringPrintf(
+          "Expected exactly one result from "
+          "ScoringVisitor, but got %zu. There must be an inconsistency.",
+          stack_.size()));
+ }
+ return std::move(stack_[0]);
+ }
+
+ private:
+ // Visit function node. If is_member_function is true, a ThisExpression will
+ // be added as the first function argument.
+ void VisitFunctionHelper(const FunctionNode* node, bool is_member_function);
+
+ bool has_pending_error() const { return !pending_error_.ok(); }
+
+ std::unique_ptr<ScoreExpression> pop_stack() {
+ std::unique_ptr<ScoreExpression> result = std::move(stack_.back());
+ stack_.pop_back();
+ return result;
+ }
+
+ double default_score_;
+ const DocumentStore& document_store_;
+ const SchemaStore& schema_store_;
+ SectionWeights& section_weights_;
+ Bm25fCalculator& bm25f_calculator_;
+ // A non-null join_children_fetcher_ indicates scoring in a join.
+ const JoinChildrenFetcher* join_children_fetcher_; // Does not own.
+
+ libtextclassifier3::Status pending_error_;
+ std::vector<std::unique_ptr<ScoreExpression>> stack_;
+ int64_t current_time_ms_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_ADVANCED_SCORING_SCORING_VISITOR_H_
diff --git a/icing/scoring/bm25f-calculator.cc b/icing/scoring/bm25f-calculator.cc
new file mode 100644
index 0000000..a80ef34
--- /dev/null
+++ b/icing/scoring/bm25f-calculator.cc
@@ -0,0 +1,248 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/bm25f-calculator.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <vector>
+
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/store/corpus-associated-scoring-data.h"
+#include "icing/store/corpus-id.h"
+#include "icing/store/document-associated-score-data.h"
+#include "icing/store/document-filter-data.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// Smoothing parameter, determines the relevance of higher term frequency
+// documents. The higher k1, the higher their relevance. 1.2 is the default
+// value in the BM25F literature and works well in most corpora.
+constexpr float k1_ = 1.2f;
+// Smoothing parameter, determines the weight of the document length on the
+// final score. The higher b, the higher the influence of the document length.
+// 0.7 is the default value in the BM25F literature and works well in most
+// corpora.
+constexpr float b_ = 0.7f;
+
+// TODO(b/158603900): add tests for Bm25fCalculator
+Bm25fCalculator::Bm25fCalculator(const DocumentStore* document_store,
+ SectionWeights* section_weights,
+ int64_t current_time_ms)
+ : document_store_(document_store),
+ section_weights_(*section_weights),
+ current_time_ms_(current_time_ms) {}
+
+// During initialization, Bm25fCalculator iterates through
+// hit-iterators for each query term to pre-compute n(q_i) for each corpus under
+// consideration.
+void Bm25fCalculator::PrepareToScore(
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>*
+ query_term_iterators) {
+ Clear();
+ TermId term_id = 0;
+ for (auto& iter : *query_term_iterators) {
+ const std::string& term = iter.first;
+ if (term_id_map_.find(term) != term_id_map_.end()) {
+ continue;
+ }
+ term_id_map_[term] = ++term_id;
+ DocHitInfoIterator* term_it = iter.second.get();
+
+ while (term_it->Advance().ok()) {
+ auto status_or = document_store_->GetDocumentAssociatedScoreData(
+ term_it->doc_hit_info().document_id());
+ if (!status_or.ok()) {
+ ICING_LOG(ERROR) << "No document score data";
+ continue;
+ }
+ DocumentAssociatedScoreData data = status_or.ValueOrDie();
+ CorpusId corpus_id = data.corpus_id();
+ CorpusTermInfo corpus_term_info(corpus_id, term_id);
+ corpus_nqi_map_[corpus_term_info.value]++;
+ }
+ }
+}
+
+void Bm25fCalculator::Clear() {
+ term_id_map_.clear();
+ corpus_avgdl_map_.clear();
+ corpus_nqi_map_.clear();
+ corpus_idf_map_.clear();
+}
+
+// Computes BM25F relevance score for query terms matched in document D.
+//
+// BM25F = \sum_i IDF(q_i) * tf(q_i, D)
+//
+// where IDF(q_i) is the Inverse Document Frequency (IDF) weight of the query
+// term q_i in the corpus with document D, and tf(q_i, D) is the weighted and
+// normalized term frequency of query term q_i in the document D.
+float Bm25fCalculator::ComputeScore(const DocHitInfoIterator* query_it,
+ const DocHitInfo& hit_info,
+ double default_score) {
+ auto status_or =
+ document_store_->GetDocumentAssociatedScoreData(hit_info.document_id());
+ if (!status_or.ok()) {
+ ICING_LOG(ERROR) << "No document score data";
+ return default_score;
+ }
+ DocumentAssociatedScoreData data = status_or.ValueOrDie();
+ std::vector<TermMatchInfo> matched_terms_stats;
+ query_it->PopulateMatchedTermsStats(&matched_terms_stats);
+
+ float score = 0;
+ for (const TermMatchInfo& term_match_info : matched_terms_stats) {
+ float idf_weight =
+ GetCorpusIdfWeightForTerm(term_match_info.term, data.corpus_id());
+ float normalized_tf =
+ ComputedNormalizedTermFrequency(term_match_info, hit_info, data);
+ score += idf_weight * normalized_tf;
+ }
+
+ ICING_VLOG(1) << "BM25F: corpus_id:" << data.corpus_id()
+ << " docid:" << hit_info.document_id() << " score:" << score;
+ return score;
+}
+
+// Compute inverse document frequency (IDF) weight for query term in the given
+// corpus, and cache it in the map.
+//
+// N - n(q_i) + 0.5
+// IDF(q_i) = ln(1 + ------------------)
+// n(q_i) + 0.5
+//
+// where N is the number of documents in the corpus, and n(q_i) is the number
+// of documents in the corpus containing the query term q_i.
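+//
+// For example (illustrative numbers): with N = 100 documents and n(q_i) = 9,
+// IDF(q_i) = ln(1 + (100 - 9 + 0.5) / (9 + 0.5)) = ln(1 + 91.5 / 9.5) ~ 2.36.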
+float Bm25fCalculator::GetCorpusIdfWeightForTerm(std::string_view term,
+ CorpusId corpus_id) {
+ TermId term_id = term_id_map_[term];
+
+ CorpusTermInfo corpus_term_info(corpus_id, term_id);
+ auto iter = corpus_idf_map_.find(corpus_term_info.value);
+ if (iter != corpus_idf_map_.end()) {
+ return iter->second;
+ }
+
+ // First, figure out corpus scoring data.
+ auto status_or = document_store_->GetCorpusAssociatedScoreData(corpus_id);
+ if (!status_or.ok()) {
+ ICING_LOG(ERROR) << "No scoring data for corpus [" << corpus_id << "]";
+ return 0;
+ }
+ CorpusAssociatedScoreData csdata = status_or.ValueOrDie();
+
+ uint32_t num_docs = csdata.num_docs();
+ uint32_t nqi = corpus_nqi_map_[corpus_term_info.value];
+ float idf =
+ nqi != 0 ? log(1.0f + (num_docs - nqi + 0.5f) / (nqi + 0.5f)) : 0.0f;
+ corpus_idf_map_.insert({corpus_term_info.value, idf});
+ ICING_VLOG(1) << "corpus_id:" << corpus_id << " term:" << term
+                << " N:" << num_docs << " nqi:" << nqi << " idf:" << idf;
+ return idf;
+}
+
+// Get per corpus average document length and cache the result in the map.
+// The average doc length is calculated as:
+//
+// total_tokens_in_corpus
+// Avg Doc Length = -------------------------
+// num_docs_in_corpus + 1
+float Bm25fCalculator::GetCorpusAvgDocLength(CorpusId corpus_id) {
+ auto iter = corpus_avgdl_map_.find(corpus_id);
+ if (iter != corpus_avgdl_map_.end()) {
+ return iter->second;
+ }
+
+ // First, figure out corpus scoring data.
+ auto status_or = document_store_->GetCorpusAssociatedScoreData(corpus_id);
+ if (!status_or.ok()) {
+ ICING_LOG(ERROR) << "No scoring data for corpus [" << corpus_id << "]";
+ return 0;
+ }
+ CorpusAssociatedScoreData csdata = status_or.ValueOrDie();
+
+ corpus_avgdl_map_[corpus_id] = csdata.average_doc_length_in_tokens();
+ return csdata.average_doc_length_in_tokens();
+}
+
+// Computes normalized term frequency for query term q_i in document D.
+//
+// f(q_i, D) * (k1 + 1)
+// Normalized TF = --------------------------------------------
+// f(q_i, D) + k1 * (1 - b + b * |D| / avgdl)
+//
+// where f(q_i, D) is the frequency of query term q_i in document D,
+// |D| is the #tokens in D, avgdl is the average document length in the corpus,
+// k1 and b are smoothing parameters.
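+//
+// For example (illustrative numbers): with f(q_i, D) = 3, k1 = 1.2, b = 0.7,
+// and |D| = avgdl, the denominator reduces to f + k1 = 4.2, so
+// Normalized TF = (3 * 2.2) / 4.2 ~ 1.57.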
+float Bm25fCalculator::ComputedNormalizedTermFrequency(
+ const TermMatchInfo& term_match_info, const DocHitInfo& hit_info,
+ const DocumentAssociatedScoreData& data) {
+ uint32_t dl = data.length_in_tokens();
+ float avgdl = GetCorpusAvgDocLength(data.corpus_id());
+ float f_q = ComputeTermFrequencyForMatchedSections(
+ data.corpus_id(), term_match_info, hit_info.document_id());
+ float normalized_tf =
+ f_q * (k1_ + 1) / (f_q + k1_ * (1 - b_ + b_ * dl / avgdl));
+
+ ICING_VLOG(1) << "corpus_id:" << data.corpus_id()
+ << " docid:" << hit_info.document_id() << " dl:" << dl
+ << " avgdl:" << avgdl << " f_q:" << f_q
+ << " norm_tf:" << normalized_tf;
+ return normalized_tf;
+}
+
+float Bm25fCalculator::ComputeTermFrequencyForMatchedSections(
+ CorpusId corpus_id, const TermMatchInfo& term_match_info,
+ DocumentId document_id) const {
+ float sum = 0.0f;
+ SectionIdMask sections = term_match_info.section_ids_mask;
+ SchemaTypeId schema_type_id = GetSchemaTypeId(document_id);
+
+ while (sections != 0) {
+ SectionId section_id = __builtin_ctzll(sections);
+ sections &= ~(UINT64_C(1) << section_id);
+
+    Hit::TermFrequency tf = term_match_info.term_frequencies[section_id];
+    if (tf != Hit::kNoTermFrequency) {
+      // Scale the term frequency by the normalized weight of the section in
+      // which the hit occurred.
+      double weighted_tf = tf * section_weights_.GetNormalizedSectionWeight(
+                                    schema_type_id, section_id);
+      sum += weighted_tf;
+    }
+ }
+ return sum;
+}
+
+SchemaTypeId Bm25fCalculator::GetSchemaTypeId(DocumentId document_id) const {
+ auto filter_data_optional = document_store_->GetAliveDocumentFilterData(
+ document_id, current_time_ms_);
+ if (!filter_data_optional) {
+ // This should never happen. The only failure case for
+ // GetAliveDocumentFilterData is if the document_id is outside of the range
+ // of allocated document_ids, which shouldn't be possible since we're
+ // getting this document_id from the posting lists.
+ ICING_LOG(WARNING) << "No document filter data for document ["
+ << document_id << "]";
+ return kInvalidSchemaTypeId;
+ }
+ return filter_data_optional.value().schema_type_id();
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/bm25f-calculator.h b/icing/scoring/bm25f-calculator.h
new file mode 100644
index 0000000..36f9c68
--- /dev/null
+++ b/icing/scoring/bm25f-calculator.h
@@ -0,0 +1,177 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_BM25F_CALCULATOR_H_
+#define ICING_SCORING_BM25F_CALCULATOR_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <vector>
+
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/legacy/index/icing-bit-util.h"
+#include "icing/scoring/section-weights.h"
+#include "icing/store/corpus-id.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+// Bm25fCalculator encapsulates the logic to compute BM25F term-weight based
+// ranking function.
+//
+// The formula to compute BM25F is as follows:
+//
+// BM25F = \sum_i IDF(q_i) * tf(q_i, D)
+//
+// where IDF(q_i) is the Inverse Document Frequency (IDF) weight of the query
+// term q_i in the corpus with document D, and tf(q_i, D) is the weighted and
+// normalized term frequency of query term q_i in the document D.
+//
+// IDF(q_i) is computed as follows:
+//
+// N - n(q_i) + 0.5
+// IDF(q_i) = log(1 + ------------------)
+// n(q_i) + 0.5
+//
+// where N is the number of documents in the corpus, and n(q_i) is the number
+// of documents in the corpus containing the query term q_i.
+//
+// Lastly, tf(q_i, D) is computed as follows:
+//
+// f(q_i, D) * (k1 + 1)
+// Normalized TF = --------------------------------------------
+// f(q_i, D) + k1 * (1 - b + b * |D| / avgdl)
+//
+// where f(q_i, D) is the frequency of query term q_i in document D,
+// |D| is the #tokens in D, avgdl is the average document length in the corpus,
+// k1 and b are smoothing parameters.
+//
+// see: go/icing-bm25f
+// see: glossary/bm25
+class Bm25fCalculator {
+ public:
+ explicit Bm25fCalculator(const DocumentStore *document_store,
+ SectionWeights *section_weights,
+ int64_t current_time_ms);
+
+ // Precompute and cache statistics relevant to BM25F.
+ // Populates term_id_map_ and corpus_nqi_map_ for use while scoring other
+ // results.
+ // The query_term_iterators map is used to build the
+  // std::unordered_map<std::string_view, TermId> term_id_map_. It must
+  // outlive the Bm25fCalculator; otherwise the string_view keys in
+  // term_id_map_, used later to compute document scores, would dangle.
+ void PrepareToScore(
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
+ *query_term_iterators);
+
+ // Compute the BM25F relevance score for the given hit, represented by
+ // DocHitInfo.
+ // The default score will be returned only when the scorer fails to find or
+ // calculate a score for the document.
+ float ComputeScore(const DocHitInfoIterator *query_it,
+ const DocHitInfo &hit_info, double default_score);
+
+ private:
+ // Compact ID for each query term.
+ using TermId = uint16_t;
+
+ // Compact representation of <CorpusId, TermId> for use as a key in a
+ // hash_map.
+ struct CorpusTermInfo {
+ // Layout bits: 16 bit CorpusId + 16 bit TermId
+ using Value = uint32_t;
+
+ Value value;
+
+    // Bit width of each packed field. Note that sizeof() yields bytes, so
+    // multiply by 8 to get the 16/16-bit layout described above.
+    static constexpr int kTermIdBits = sizeof(TermId) * 8;                 // 16
+    static constexpr int kCorpusIdBits = sizeof(Value) * 8 - kTermIdBits;  // 16
+
+ explicit CorpusTermInfo(CorpusId corpus_id, TermId term_id) : value(0) {
+ BITFIELD_OR(value, kTermIdBits, kCorpusIdBits,
+ static_cast<uint64_t>(corpus_id));
+ BITFIELD_OR(value, 0, kTermIdBits, term_id);
+ }
+
+ bool operator==(const CorpusTermInfo &other) const {
+ return value == other.value;
+ }
+ };
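+
+  // For example (illustrative, using the 16/16-bit layout above): corpus_id=3
+  // and term_id=7 pack into value 0x00030007.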
+
+ // Returns idf weight for the term and provided corpus.
+ float GetCorpusIdfWeightForTerm(std::string_view term, CorpusId corpus_id);
+
+ // Returns the average document length for the corpus. The average is
+ // calculated as the sum of tokens in the corpus' documents over the total
+ // number of documents plus one.
+ float GetCorpusAvgDocLength(CorpusId corpus_id);
+
+ // Returns the normalized term frequency for the term match and document hit.
+ // This normalizes the term frequency by applying smoothing parameters and
+ // factoring document length.
+ float ComputedNormalizedTermFrequency(
+ const TermMatchInfo &term_match_info, const DocHitInfo &hit_info,
+ const DocumentAssociatedScoreData &data);
+
+ // Returns the weighted term frequency for the term match and document. For
+ // each section the term is present, we scale the term frequency by its
+ // section weight. We return the sum of the weighted term frequencies over all
+ // sections.
+ float ComputeTermFrequencyForMatchedSections(
+ CorpusId corpus_id, const TermMatchInfo &term_match_info,
+ DocumentId document_id) const;
+
+ // Returns the schema type id for the document by retrieving it from the
+ // DocumentFilterData.
+ SchemaTypeId GetSchemaTypeId(DocumentId document_id) const;
+
+ // Clears cached scoring data and prepares the calculator for a new scoring
+ // run.
+ void Clear();
+
+ const DocumentStore *document_store_; // Does not own.
+
+ // Used for accessing normalized section weights when computing the weighted
+ // term frequency.
+ SectionWeights &section_weights_;
+
+ // Map from query term to compact term ID.
+ // Necessary as a key to the other maps.
+ // The use of the string_view as key here means that the query_term_iterators
+  // map must outlive the Bm25fCalculator.
+ std::unordered_map<std::string_view, TermId> term_id_map_;
+
+ // Map from corpus ID to average document length (avgdl).
+ // Necessary to calculate the normalized term frequency.
+  // This information is cached in the DocumentStore::CorpusScoreCache.
+ std::unordered_map<CorpusId, float> corpus_avgdl_map_;
+ // Map from <corpus ID, term ID> to number of documents containing term q_i,
+ // called n(q_i).
+ // Necessary to calculate IDF(q_i) (inverse document frequency).
+ // This information must be calculated by iterating through the hits for these
+ // terms.
+ std::unordered_map<CorpusTermInfo::Value, uint32_t> corpus_nqi_map_;
+
+ // Map from <corpus ID, term ID> to IDF(q_i) (inverse document frequency).
+ std::unordered_map<CorpusTermInfo::Value, float> corpus_idf_map_;
+
+ int64_t current_time_ms_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_BM25F_CALCULATOR_H_
diff --git a/icing/scoring/priority-queue-scored-document-hits-ranker.h b/icing/scoring/priority-queue-scored-document-hits-ranker.h
new file mode 100644
index 0000000..0798d7d
--- /dev/null
+++ b/icing/scoring/priority-queue-scored-document-hits-ranker.h
@@ -0,0 +1,128 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_PRIORITY_QUEUE_SCORED_DOCUMENT_HITS_RANKER_H_
+#define ICING_SCORING_PRIORITY_QUEUE_SCORED_DOCUMENT_HITS_RANKER_H_
+
+#include <queue>
+#include <utility>
+#include <vector>
+
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/scoring/scored-document-hits-ranker.h"
+
+namespace icing {
+namespace lib {
+
+// ScoredDocumentHitsRanker interface implementation, based on
+// std::priority_queue. We can get the next top hit in O(lg N) time.
+template <typename ScoredDataType,
+ typename Converter = typename ScoredDataType::Converter>
+class PriorityQueueScoredDocumentHitsRanker : public ScoredDocumentHitsRanker {
+ public:
+ explicit PriorityQueueScoredDocumentHitsRanker(
+ std::vector<ScoredDataType>&& scored_data_vec, bool is_descending = true);
+
+ ~PriorityQueueScoredDocumentHitsRanker() override = default;
+
+  // Note: the ranker may store ScoredDocumentHit or JoinedScoredDocumentHit,
+  // so scored_data_pq_ is templated on the stored data type.
+  // - JoinedScoredDocumentHit is a superset of ScoredDocumentHit, so PopNext
+  //   is unified on the superset type JoinedScoredDocumentHit, and rankers
+  //   storing ScoredDocumentHit convert before returning. This keeps callers
+  //   simple, especially ResultRetriever, which only needs to deal with a
+  //   single return format.
+  // - JoinedScoredDocumentHit is ~2x the size of ScoredDocumentHit. Since the
+  //   ranker (which contains a priority queue of data) is cached in
+  //   ResultState, storing the scored hits directly in JoinedScoredDocumentHit
+  //   format would double the memory usage. Therefore, we keep the flexibility
+  //   to store ScoredDocumentHit or any other type of data, but require
+  //   PopNext to convert it to JoinedScoredDocumentHit.
+ JoinedScoredDocumentHit PopNext() override;
+
+ void TruncateHitsTo(int new_size) override;
+
+ int size() const override { return scored_data_pq_.size(); }
+
+ bool empty() const override { return scored_data_pq_.empty(); }
+
+ private:
+  // Comparator for std::priority_queue. Since std::priority_queue is a max
+  // heap (descending order), reverse the comparison if ascending order is
+  // wanted.
+ class Comparator {
+ public:
+ explicit Comparator(bool is_ascending) : is_ascending_(is_ascending) {}
+
+ bool operator()(const ScoredDataType& lhs,
+ const ScoredDataType& rhs) const {
+      // STL comparator requirement: comparing two equal elements MUST return
+      // false. If this were written as `return is_ascending_ == !(lhs < rhs)`:
+      // - When lhs == rhs, !(lhs < rhs) is true.
+      // - So if is_ascending_ is true, we would return true for equal
+      //   elements, violating the requirement.
+ if (is_ascending_) {
+ return rhs < lhs;
+ }
+ return lhs < rhs;
+ }
+
+ private:
+ bool is_ascending_;
+ };
+
+ Comparator comparator_;
+
+  // Use a priority queue to get the top K hits in O(K lg N) time.
+ std::priority_queue<ScoredDataType, std::vector<ScoredDataType>, Comparator>
+ scored_data_pq_;
+
+ Converter converter_;
+};
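+
+// Usage sketch (illustrative):
+//
+//   std::vector<ScoredDocumentHit> hits = /* scored hits */;
+//   PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit> ranker(
+//       std::move(hits), /*is_descending=*/true);
+//   while (!ranker.empty()) {
+//     JoinedScoredDocumentHit next = ranker.PopNext();
+//     // Consume `next` in rank order.
+//   }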
+
+template <typename ScoredDataType, typename Converter>
+PriorityQueueScoredDocumentHitsRanker<ScoredDataType, Converter>::
+ PriorityQueueScoredDocumentHitsRanker(
+ std::vector<ScoredDataType>&& scored_data_vec, bool is_descending)
+ : comparator_(/*is_ascending=*/!is_descending),
+ scored_data_pq_(comparator_, std::move(scored_data_vec)) {}
+
+template <typename ScoredDataType, typename Converter>
+JoinedScoredDocumentHit
+PriorityQueueScoredDocumentHitsRanker<ScoredDataType, Converter>::PopNext() {
+ ScoredDataType next_scored_data = scored_data_pq_.top();
+ scored_data_pq_.pop();
+ return converter_(std::move(next_scored_data));
+}
+
+template <typename ScoredDataType, typename Converter>
+void PriorityQueueScoredDocumentHitsRanker<
+ ScoredDataType, Converter>::TruncateHitsTo(int new_size) {
+  if (new_size < 0 ||
+      scored_data_pq_.size() <= static_cast<size_t>(new_size)) {
+ return;
+ }
+
+ // Copying the best new_size results.
+ std::priority_queue<ScoredDataType, std::vector<ScoredDataType>, Comparator>
+ new_pq(comparator_);
+ for (int i = 0; i < new_size; ++i) {
+ new_pq.push(scored_data_pq_.top());
+ scored_data_pq_.pop();
+ }
+ scored_data_pq_ = std::move(new_pq);
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_PRIORITY_QUEUE_SCORED_DOCUMENT_HITS_RANKER_H_
diff --git a/icing/scoring/priority-queue-scored-document-hits-ranker_test.cc b/icing/scoring/priority-queue-scored-document-hits-ranker_test.cc
new file mode 100644
index 0000000..ace2350
--- /dev/null
+++ b/icing/scoring/priority-queue-scored-document-hits-ranker_test.cc
@@ -0,0 +1,255 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/priority-queue-scored-document-hits-ranker.h"
+
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/scoring/scored-document-hit.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+
+class Converter {
+ public:
+ JoinedScoredDocumentHit operator()(ScoredDocumentHit hit) const {
+ return converter_(std::move(hit));
+ }
+
+ private:
+ ScoredDocumentHit::Converter converter_;
+} converter;
+
+std::vector<JoinedScoredDocumentHit> PopAll(
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit>& ranker) {
+ std::vector<JoinedScoredDocumentHit> hits;
+ while (!ranker.empty()) {
+ hits.push_back(ranker.PopNext());
+ }
+ return hits;
+}
+
+TEST(PriorityQueueScoredDocumentHitsRankerTest, ShouldGetCorrectSizeAndEmpty) {
+ ScoredDocumentHit scored_hit_0(/*document_id=*/0, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_1(/*document_id=*/1, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_2(/*document_id=*/2, kSectionIdMaskNone,
+ /*score=*/1);
+
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit> ranker(
+ {scored_hit_1, scored_hit_0, scored_hit_2},
+ /*is_descending=*/true);
+ EXPECT_THAT(ranker.size(), Eq(3));
+ EXPECT_FALSE(ranker.empty());
+
+ ranker.PopNext();
+ EXPECT_THAT(ranker.size(), Eq(2));
+ EXPECT_FALSE(ranker.empty());
+
+ ranker.PopNext();
+ EXPECT_THAT(ranker.size(), Eq(1));
+ EXPECT_FALSE(ranker.empty());
+
+ ranker.PopNext();
+ EXPECT_THAT(ranker.size(), Eq(0));
+ EXPECT_TRUE(ranker.empty());
+}
+
+TEST(PriorityQueueScoredDocumentHitsRankerTest, ShouldRankInDescendingOrder) {
+ ScoredDocumentHit scored_hit_0(/*document_id=*/0, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_1(/*document_id=*/1, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_2(/*document_id=*/2, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_3(/*document_id=*/3, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_4(/*document_id=*/4, kSectionIdMaskNone,
+ /*score=*/1);
+
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit> ranker(
+ {scored_hit_1, scored_hit_0, scored_hit_2, scored_hit_4, scored_hit_3},
+ /*is_descending=*/true);
+
+ EXPECT_THAT(ranker, SizeIs(5));
+ std::vector<JoinedScoredDocumentHit> scored_document_hits = PopAll(ranker);
+ EXPECT_THAT(
+ scored_document_hits,
+ ElementsAre(EqualsJoinedScoredDocumentHit(converter(scored_hit_4)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_3)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_2)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_1)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_0))));
+}
+
+TEST(PriorityQueueScoredDocumentHitsRankerTest, ShouldRankInAscendingOrder) {
+ ScoredDocumentHit scored_hit_0(/*document_id=*/0, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_1(/*document_id=*/1, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_2(/*document_id=*/2, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_3(/*document_id=*/3, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_4(/*document_id=*/4, kSectionIdMaskNone,
+ /*score=*/1);
+
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit> ranker(
+ {scored_hit_1, scored_hit_0, scored_hit_2, scored_hit_4, scored_hit_3},
+ /*is_descending=*/false);
+
+ EXPECT_THAT(ranker, SizeIs(5));
+ std::vector<JoinedScoredDocumentHit> scored_document_hits = PopAll(ranker);
+ EXPECT_THAT(
+ scored_document_hits,
+ ElementsAre(EqualsJoinedScoredDocumentHit(converter(scored_hit_0)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_1)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_2)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_3)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_4))));
+}
+
+TEST(PriorityQueueScoredDocumentHitsRankerTest,
+ ShouldRankDuplicateScoredDocumentHits) {
+ ScoredDocumentHit scored_hit_0(/*document_id=*/0, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_1(/*document_id=*/1, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_2(/*document_id=*/2, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_3(/*document_id=*/3, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_4(/*document_id=*/4, kSectionIdMaskNone,
+ /*score=*/1);
+
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit> ranker(
+ {scored_hit_2, scored_hit_4, scored_hit_1, scored_hit_0, scored_hit_2,
+ scored_hit_2, scored_hit_4, scored_hit_3},
+ /*is_descending=*/true);
+
+ EXPECT_THAT(ranker, SizeIs(8));
+ std::vector<JoinedScoredDocumentHit> scored_document_hits = PopAll(ranker);
+ EXPECT_THAT(
+ scored_document_hits,
+ ElementsAre(EqualsJoinedScoredDocumentHit(converter(scored_hit_4)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_4)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_3)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_2)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_2)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_2)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_1)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_0))));
+}
+
+TEST(PriorityQueueScoredDocumentHitsRankerTest,
+ ShouldRankEmptyScoredDocumentHits) {
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit> ranker(
+ /*scored_document_hits=*/{},
+ /*is_descending=*/true);
+ EXPECT_THAT(ranker, IsEmpty());
+}
+
+TEST(PriorityQueueScoredDocumentHitsRankerTest, ShouldTruncateToNewSize) {
+ ScoredDocumentHit scored_hit_0(/*document_id=*/0, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_1(/*document_id=*/1, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_2(/*document_id=*/2, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_3(/*document_id=*/3, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_4(/*document_id=*/4, kSectionIdMaskNone,
+ /*score=*/1);
+
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit> ranker(
+ {scored_hit_1, scored_hit_0, scored_hit_2, scored_hit_4, scored_hit_3},
+ /*is_descending=*/true);
+ ASSERT_THAT(ranker, SizeIs(5));
+
+ ranker.TruncateHitsTo(/*new_size=*/3);
+ EXPECT_THAT(ranker, SizeIs(3));
+ std::vector<JoinedScoredDocumentHit> scored_document_hits = PopAll(ranker);
+ EXPECT_THAT(
+ scored_document_hits,
+ ElementsAre(EqualsJoinedScoredDocumentHit(converter(scored_hit_4)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_3)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_2))));
+}
+
+TEST(PriorityQueueScoredDocumentHitsRankerTest, ShouldTruncateToZero) {
+ ScoredDocumentHit scored_hit_0(/*document_id=*/0, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_1(/*document_id=*/1, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_2(/*document_id=*/2, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_3(/*document_id=*/3, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_4(/*document_id=*/4, kSectionIdMaskNone,
+ /*score=*/1);
+
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit> ranker(
+ {scored_hit_1, scored_hit_0, scored_hit_2, scored_hit_4, scored_hit_3},
+ /*is_descending=*/true);
+ ASSERT_THAT(ranker, SizeIs(5));
+
+ ranker.TruncateHitsTo(/*new_size=*/0);
+ EXPECT_THAT(ranker, IsEmpty());
+}
+
+TEST(PriorityQueueScoredDocumentHitsRankerTest, ShouldNotTruncateToNegative) {
+ ScoredDocumentHit scored_hit_0(/*document_id=*/0, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_1(/*document_id=*/1, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_2(/*document_id=*/2, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_3(/*document_id=*/3, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_hit_4(/*document_id=*/4, kSectionIdMaskNone,
+ /*score=*/1);
+
+ PriorityQueueScoredDocumentHitsRanker<ScoredDocumentHit> ranker(
+ {scored_hit_1, scored_hit_0, scored_hit_2, scored_hit_4, scored_hit_3},
+ /*is_descending=*/true);
+ ASSERT_THAT(ranker, SizeIs(Eq(5)));
+
+ ranker.TruncateHitsTo(/*new_size=*/-1);
+ EXPECT_THAT(ranker, SizeIs(Eq(5)));
+ // Contents are not affected.
+ std::vector<JoinedScoredDocumentHit> scored_document_hits = PopAll(ranker);
+ EXPECT_THAT(
+ scored_document_hits,
+ ElementsAre(EqualsJoinedScoredDocumentHit(converter(scored_hit_4)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_3)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_2)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_1)),
+ EqualsJoinedScoredDocumentHit(converter(scored_hit_0))));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/ranker.cc b/icing/scoring/ranker.cc
index fecee82..d59c98f 100644
--- a/icing/scoring/ranker.cc
+++ b/icing/scoring/ranker.cc
@@ -32,6 +32,7 @@ namespace {
// Helper function to wrap the heapify algorithm, it heapifies the target
// subtree node in place.
+// TODO(b/152934343): refactor the heapify function and make it into a class.
void Heapify(
std::vector<ScoredDocumentHit>* scored_document_hits,
int target_subtree_root_index,
@@ -71,32 +72,76 @@ void Heapify(
}
}
-// Helper function to extract the root from the heap. The heap structure will be
-// maintained.
-//
-// Returns:
-// The current root element on success
-// RESOURCE_EXHAUSTED_ERROR if heap is empty
-libtextclassifier3::StatusOr<ScoredDocumentHit> PopRoot(
- std::vector<ScoredDocumentHit>* scored_document_hits_heap,
- const ScoredDocumentHitComparator& scored_document_hit_comparator) {
- if (scored_document_hits_heap->empty()) {
- // An invalid ScoredDocumentHit
- return absl_ports::ResourceExhaustedError("Heap is empty");
+// Heapifies the given term vector from top to bottom. Call it after adding or
+// replacing an element at the front of the vector.
+void HeapifyTermDown(std::vector<TermMetadata>& scored_terms,
+ int target_subtree_root_index) {
+ int heap_size = scored_terms.size();
+ if (target_subtree_root_index >= heap_size) {
+ return;
+ }
+
+ // Initializes subtree root as the current minimum node.
+ int min = target_subtree_root_index;
+ // If we represent a heap in an array/vector, the indices of the left and
+ // right children can be calculated as follows.
+ const int left = target_subtree_root_index * 2 + 1;
+ const int right = target_subtree_root_index * 2 + 2;
+
+ // If left child is smaller than current minimum.
+ if (left < heap_size &&
+ scored_terms.at(left).score < scored_terms.at(min).score) {
+ min = left;
+ }
+
+ // If right child is smaller than current minimum.
+ if (right < heap_size &&
+ scored_terms.at(right).score < scored_terms.at(min).score) {
+ min = right;
+ }
+
+ // If the minimum is not the subtree root, swap and continue heapifying the
+ // lower-level subtree.
+ if (min != target_subtree_root_index) {
+ std::swap(scored_terms.at(min), scored_terms.at(target_subtree_root_index));
+ HeapifyTermDown(scored_terms, min);
+ }
+}
+
+// Heapifies the given term vector from bottom to top. Call it after appending
+// an element at the end of the vector.
+void HeapifyTermUp(std::vector<TermMetadata>& scored_terms,
+ int target_subtree_child_index) {
+ // If we represent a heap in an array/vector, the index of the parent (the
+ // subtree root) can be calculated as follows.
+ const int root = (target_subtree_child_index + 1) / 2 - 1;
+
+ // If the current child is smaller than the root, swap and continue heapifying
+ // the upper-level subtree.
+ if (root >= 0 && scored_terms.at(target_subtree_child_index).score <
+ scored_terms.at(root).score) {
+ std::swap(scored_terms.at(root),
+ scored_terms.at(target_subtree_child_index));
+ HeapifyTermUp(scored_terms, root);
+ }
+}
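+
+// Worked example of the index arithmetic above (illustrative): in an
+// array-backed heap, node 1 has children 1 * 2 + 1 = 3 and 1 * 2 + 2 = 4;
+// conversely, both children map back to the same parent, since
+// (3 + 1) / 2 - 1 == (4 + 1) / 2 - 1 == 1 under integer division.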
+
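+// Removes and returns the root (i.e. the minimum-score term) of the given
+// min-heap while maintaining the heap structure. Returns an invalid
+// TermMetadata as a sentinel if the heap is empty.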
+TermMetadata PopRootTerm(std::vector<TermMetadata>& scored_terms) {
+ if (scored_terms.empty()) {
+ // Return an invalid TermMetadata as a sentinel value.
+ return TermMetadata(/*content_in=*/"", /*hit_count_in=*/-1);
}
// Steps to extract root from heap:
// 1. copy out root
- ScoredDocumentHit root = scored_document_hits_heap->at(0);
- const size_t last_node_index = scored_document_hits_heap->size() - 1;
+ TermMetadata root = scored_terms.at(0);
+ const size_t last_node_index = scored_terms.size() - 1;
// 2. swap root and the last node
- std::swap(scored_document_hits_heap->at(0),
- scored_document_hits_heap->at(last_node_index));
+ std::swap(scored_terms.at(0), scored_terms.at(last_node_index));
// 3. remove last node
- scored_document_hits_heap->pop_back();
+ scored_terms.pop_back();
// 4. heapify root
- Heapify(scored_document_hits_heap, /*target_subtree_root_index=*/0,
- scored_document_hit_comparator);
+ HeapifyTermDown(scored_terms, /*target_subtree_root_index=*/0);
return root;
}
@@ -115,6 +160,42 @@ void BuildHeapInPlace(
}
}
+void PushToTermHeap(TermMetadata term, int number_to_return,
+ std::vector<TermMetadata>& scored_terms_heap) {
+ if (scored_terms_heap.size() < number_to_return) {
+ scored_terms_heap.push_back(std::move(term));
+ // We insert at end, so we should heapify bottom up.
+ HeapifyTermUp(scored_terms_heap, scored_terms_heap.size() - 1);
+ } else if (scored_terms_heap.at(0).score < term.score) {
+ scored_terms_heap.at(0) = std::move(term);
+ // We insert at root, so we should heapify top down.
+ HeapifyTermDown(scored_terms_heap, /*target_subtree_root_index=*/0);
+ }
+}
+
+libtextclassifier3::StatusOr<ScoredDocumentHit> PopNextTopResultFromHeap(
+ std::vector<ScoredDocumentHit>* scored_document_hits_heap,
+ const ScoredDocumentHitComparator& scored_document_hit_comparator) {
+ if (scored_document_hits_heap->empty()) {
+ // An invalid ScoredDocumentHit
+ return absl_ports::ResourceExhaustedError("Heap is empty");
+ }
+
+ // Steps to extract root from heap:
+ // 1. copy out root
+ ScoredDocumentHit root = scored_document_hits_heap->at(0);
+ const size_t last_node_index = scored_document_hits_heap->size() - 1;
+ // 2. swap root and the last node
+ std::swap(scored_document_hits_heap->at(0),
+ scored_document_hits_heap->at(last_node_index));
+ // 3. remove last node
+ scored_document_hits_heap->pop_back();
+ // 4. heapify root
+ Heapify(scored_document_hits_heap, /*target_subtree_root_index=*/0,
+ scored_document_hit_comparator);
+ return root;
+}
+
std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
std::vector<ScoredDocumentHit>* scored_document_hits_heap, int num_results,
const ScoredDocumentHitComparator& scored_document_hit_comparator) {
@@ -123,7 +204,8 @@ std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
num_results, static_cast<int>(scored_document_hits_heap->size()));
while (result_size-- > 0) {
libtextclassifier3::StatusOr<ScoredDocumentHit> next_best_document_hit_or =
- PopRoot(scored_document_hits_heap, scored_document_hit_comparator);
+ PopNextTopResultFromHeap(scored_document_hits_heap,
+ scored_document_hit_comparator);
if (next_best_document_hit_or.ok()) {
scored_document_hit_result.push_back(
std::move(next_best_document_hit_or).ValueOrDie());
@@ -134,5 +216,15 @@ std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
return scored_document_hit_result;
}
+std::vector<TermMetadata> PopAllTermsFromHeap(
+ std::vector<TermMetadata>& scored_terms_heap) {
+ std::vector<TermMetadata> top_term_result;
+ top_term_result.reserve(scored_terms_heap.size());
+ while (!scored_terms_heap.empty()) {
+ top_term_result.push_back(PopRootTerm(scored_terms_heap));
+ }
+ return top_term_result;
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/scoring/ranker.h b/icing/scoring/ranker.h
index 785c133..bfe1077 100644
--- a/icing/scoring/ranker.h
+++ b/icing/scoring/ranker.h
@@ -17,6 +17,8 @@
#include <vector>
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/term-metadata.h"
#include "icing/scoring/scored-document-hit.h"
// Provides functionality to get the top N results from an unsorted vector.
@@ -31,6 +33,17 @@ void BuildHeapInPlace(
std::vector<ScoredDocumentHit>* scored_document_hits,
const ScoredDocumentHitComparator& scored_document_hit_comparator);
+// Returns the single next top result (i.e. the current root element) from the
+// given heap and removes it from the heap. The heap structure will be
+// maintained.
+//
+// Returns:
+// The next top result element on success
+// RESOURCE_EXHAUSTED_ERROR if heap is empty
+libtextclassifier3::StatusOr<ScoredDocumentHit> PopNextTopResultFromHeap(
+ std::vector<ScoredDocumentHit>* scored_document_hits_heap,
+ const ScoredDocumentHitComparator& scored_document_hit_comparator);
+
// Returns the top num_results results from the given heap and remove those
// results from the heap. An empty vector will be returned if heap is empty.
//
@@ -39,6 +52,18 @@ std::vector<ScoredDocumentHit> PopTopResultsFromHeap(
std::vector<ScoredDocumentHit>* scored_document_hits_heap, int num_results,
const ScoredDocumentHitComparator& scored_document_hit_comparator);
+// Pushes a term into the given heap, keeping at most number_to_return
+// elements. The heap is a min-heap so that we can skip some pushes by
+// comparing the new term against the root and only pushing if the new term is
+// greater. The time complexity of a single push is O(lg K), where K is
+// number_to_return.
+void PushToTermHeap(TermMetadata term, int number_to_return,
+ std::vector<TermMetadata>& scored_terms_heap);
+
+// Returns all terms from the given term heap, emptying the heap. Since the
+// heap is a min-heap, the output vector will be in increasing score order.
+std::vector<TermMetadata> PopAllTermsFromHeap(
+ std::vector<TermMetadata>& scored_terms_heap);
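+
+// A minimal usage sketch of the two functions above (illustrative only: the
+// terms and scores are made up, and the TermMetadata constructor is assumed to
+// follow the (content_in, hit_count_in) form used in ranker.cc):
+//
+//   std::vector<TermMetadata> heap;
+//   PushToTermHeap(TermMetadata("foo", 3), /*number_to_return=*/2, heap);
+//   PushToTermHeap(TermMetadata("bar", 7), /*number_to_return=*/2, heap);
+//   PushToTermHeap(TermMetadata("baz", 5), /*number_to_return=*/2, heap);
+//   // Only the top 2 terms remain; "foo" (score 3) was replaced at the third
+//   // push because it was the root of the min-heap.
+//   std::vector<TermMetadata> terms = PopAllTermsFromHeap(heap);
+//   // terms now holds {"baz", "bar"} in increasing score order.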
} // namespace lib
} // namespace icing
diff --git a/icing/scoring/ranker_benchmark.cc b/icing/scoring/ranker_benchmark.cc
index 8983dd9..c2f13de 100644
--- a/icing/scoring/ranker_benchmark.cc
+++ b/icing/scoring/ranker_benchmark.cc
@@ -27,7 +27,7 @@ namespace {
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
// //icing/scoring:ranker_benchmark
//
-// $ blaze-bin/icing/scoring/ranker_benchmark --benchmarks=all
+// $ blaze-bin/icing/scoring/ranker_benchmark --benchmark_filter=all
// --benchmark_memory_usage
//
// Run on an Android device:
@@ -38,7 +38,7 @@ namespace {
// $ adb push blaze-bin/icing/scoring/ranker_benchmark
// /data/local/tmp/
//
-// $ adb shell /data/local/tmp/ranker_benchmark --benchmarks=all
+// $ adb shell /data/local/tmp/ranker_benchmark --benchmark_filter=all
void BM_GetTopN(benchmark::State& state) {
int num_to_score = state.range(0);
diff --git a/icing/scoring/score-and-rank_benchmark.cc b/icing/scoring/score-and-rank_benchmark.cc
index c3ed40a..7cb5a95 100644
--- a/icing/scoring/score-and-rank_benchmark.cc
+++ b/icing/scoring/score-and-rank_benchmark.cc
@@ -49,7 +49,7 @@
// //icing/scoring:score-and-rank_benchmark
//
// $ blaze-bin/icing/scoring/score-and-rank_benchmark
-// --benchmarks=all --benchmark_memory_usage
+// --benchmark_filter=all --benchmark_memory_usage
//
// Run on an Android device:
// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
@@ -59,7 +59,8 @@
// $ adb push blaze-bin/icing/scoring/score-and-rank_benchmark
// /data/local/tmp/
//
-// $ adb shell /data/local/tmp/score-and-rank_benchmark --benchmarks=all
+// $ adb shell /data/local/tmp/score-and-rank_benchmark
+// --benchmark_filter=all
namespace icing {
namespace lib {
@@ -88,6 +89,18 @@ DocumentProto CreateEmailDocument(int id, int document_score,
.Build();
}
+libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore(
+ const Filesystem* filesystem, const std::string& base_dir,
+ const Clock* clock, const SchemaStore* schema_store) {
+ return DocumentStore::Create(
+ filesystem, base_dir, clock, schema_store,
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr);
+}
+
void BM_ScoreAndRankDocumentHitsByDocumentScore(benchmark::State& state) {
const std::string base_dir = GetTestTempDir() + "/score_and_rank_benchmark";
const std::string document_store_dir = base_dir + "/document_store";
@@ -96,26 +109,33 @@ void BM_ScoreAndRankDocumentHitsByDocumentScore(benchmark::State& state) {
// Creates file directories
Filesystem filesystem;
filesystem.DeleteDirectoryRecursively(base_dir.c_str());
- filesystem.CreateDirectoryRecursively(document_store_dir.c_str());
- filesystem.CreateDirectoryRecursively(schema_store_dir.c_str());
-
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem, base_dir));
+ ASSERT_TRUE(
+ filesystem.CreateDirectoryRecursively(document_store_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(schema_store_dir.c_str()));
Clock clock;
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem, document_store_dir, &clock,
- schema_store.get()));
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_store_dir, &clock));
- ICING_ASSERT_OK(schema_store->SetSchema(CreateSchemaWithEmailType()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ CreateSchemaWithEmailType(), /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ScoringSpecProto scoring_spec;
scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(scoring_spec, document_store.get()));
-
+ ScoringProcessor::Create(scoring_spec, document_store.get(),
+ schema_store.get(),
+ clock.GetSystemTimeMilliseconds()));
int num_to_score = state.range(0);
int num_of_documents = state.range(1);
@@ -155,7 +175,6 @@ void BM_ScoreAndRankDocumentHitsByDocumentScore(benchmark::State& state) {
PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/20,
scored_document_hit_comparator);
}
-
// Clean up
document_store.reset();
schema_store.reset();
@@ -195,26 +214,34 @@ void BM_ScoreAndRankDocumentHitsByCreationTime(benchmark::State& state) {
// Creates file directories
Filesystem filesystem;
filesystem.DeleteDirectoryRecursively(base_dir.c_str());
- filesystem.CreateDirectoryRecursively(document_store_dir.c_str());
- filesystem.CreateDirectoryRecursively(schema_store_dir.c_str());
-
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem, base_dir));
+ ASSERT_TRUE(
+ filesystem.CreateDirectoryRecursively(document_store_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(schema_store_dir.c_str()));
Clock clock;
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem, document_store_dir, &clock,
- schema_store.get()));
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_store_dir, &clock));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
- ICING_ASSERT_OK(schema_store->SetSchema(CreateSchemaWithEmailType()));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ CreateSchemaWithEmailType(), /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ScoringSpecProto scoring_spec;
scoring_spec.set_rank_by(
ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(scoring_spec, document_store.get()));
+ ScoringProcessor::Create(scoring_spec, document_store.get(),
+ schema_store.get(),
+ clock.GetSystemTimeMilliseconds()));
int num_to_score = state.range(0);
int num_of_documents = state.range(1);
@@ -295,25 +322,33 @@ void BM_ScoreAndRankDocumentHitsNoScoring(benchmark::State& state) {
// Creates file directories
Filesystem filesystem;
filesystem.DeleteDirectoryRecursively(base_dir.c_str());
- filesystem.CreateDirectoryRecursively(document_store_dir.c_str());
- filesystem.CreateDirectoryRecursively(schema_store_dir.c_str());
-
- ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem, base_dir));
+ ASSERT_TRUE(
+ filesystem.CreateDirectoryRecursively(document_store_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(schema_store_dir.c_str()));
Clock clock;
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem, document_store_dir, &clock,
- schema_store.get()));
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_store_dir, &clock));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
- ICING_ASSERT_OK(schema_store->SetSchema(CreateSchemaWithEmailType()));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ CreateSchemaWithEmailType(), /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ScoringSpecProto scoring_spec;
scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::NONE);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(scoring_spec, document_store.get()));
+ ScoringProcessor::Create(scoring_spec, document_store.get(),
+ schema_store.get(),
+ clock.GetSystemTimeMilliseconds()));
int num_to_score = state.range(0);
int num_of_documents = state.range(1);
@@ -381,6 +416,127 @@ BENCHMARK(BM_ScoreAndRankDocumentHitsNoScoring)
->ArgPair(10000, 18000)
->ArgPair(10000, 20000);
+void BM_ScoreAndRankDocumentHitsByRelevanceScoring(benchmark::State& state) {
+ const std::string base_dir = GetTestTempDir() + "/score_and_rank_benchmark";
+ const std::string document_store_dir = base_dir + "/document_store";
+ const std::string schema_store_dir = base_dir + "/schema_store";
+
+ // Creates file directories
+ Filesystem filesystem;
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+ ASSERT_TRUE(
+ filesystem.CreateDirectoryRecursively(document_store_dir.c_str()));
+ ASSERT_TRUE(filesystem.CreateDirectoryRecursively(schema_store_dir.c_str()));
+
+ Clock clock;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem, schema_store_dir, &clock));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ CreateSchemaWithEmailType(), /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ScoringSpecProto scoring_spec;
+ scoring_spec.set_rank_by(ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(scoring_spec, document_store.get(),
+ schema_store.get(),
+ clock.GetSystemTimeMilliseconds()));
+
+ int num_to_score = state.range(0);
+ int num_of_documents = state.range(1);
+
+ std::mt19937 random_generator;
+ std::uniform_int_distribution<int> distribution(
+ 1, std::numeric_limits<int>::max());
+
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = 1U << section_id;
+
+ // Puts documents into document store
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos;
+ for (int i = 0; i < num_of_documents; i++) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store->Put(CreateEmailDocument(
+ /*id=*/i, /*document_score=*/1,
+ /*creation_timestamp_ms=*/1),
+ /*num_tokens=*/10));
+ DocHitInfoTermFrequencyPair doc_hit =
+ DocHitInfo(document_id, section_id_mask);
+ // Set five matches for term "foo" for each document hit.
+ doc_hit.UpdateSection(section_id, /*hit_term_frequency=*/5);
+ doc_hit_infos.push_back(doc_hit);
+ }
+
+ ScoredDocumentHitComparator scored_document_hit_comparator(
+ /*is_descending=*/true);
+
+ for (auto _ : state) {
+ // Creates a dummy DocHitInfoIterator with results. We pause the timer here
+ // so that the cost of copying test data is not included.
+ state.PauseTiming();
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+ // Create a query term iterator that assigns the document hits to term
+ // "foo".
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
+ query_term_iterators;
+ query_term_iterators["foo"] =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+ state.ResumeTiming();
+
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ scoring_processor->Score(std::move(doc_hit_info_iterator), num_to_score,
+ &query_term_iterators);
+
+ BuildHeapInPlace(&scored_document_hits, scored_document_hit_comparator);
+ // Ranks and gets the first page; 20 is a common page size.
+ std::vector<ScoredDocumentHit> results =
+ PopTopResultsFromHeap(&scored_document_hits, /*num_results=*/20,
+ scored_document_hit_comparator);
+ }
+
+ // Clean up
+ document_store.reset();
+ schema_store.reset();
+ filesystem.DeleteDirectoryRecursively(base_dir.c_str());
+}
+BENCHMARK(BM_ScoreAndRankDocumentHitsByRelevanceScoring)
+ // num_to_score, num_of_documents in document store
+ ->ArgPair(1000, 30000)
+ ->ArgPair(3000, 30000)
+ ->ArgPair(5000, 30000)
+ ->ArgPair(7000, 30000)
+ ->ArgPair(9000, 30000)
+ ->ArgPair(11000, 30000)
+ ->ArgPair(13000, 30000)
+ ->ArgPair(15000, 30000)
+ ->ArgPair(17000, 30000)
+ ->ArgPair(19000, 30000)
+ ->ArgPair(21000, 30000)
+ ->ArgPair(23000, 30000)
+ ->ArgPair(25000, 30000)
+ ->ArgPair(27000, 30000)
+ ->ArgPair(29000, 30000)
+ // Starting from this line, we're trying to see if num_of_documents affects
+ // performance.
+ ->ArgPair(10000, 10000)
+ ->ArgPair(10000, 12000)
+ ->ArgPair(10000, 14000)
+ ->ArgPair(10000, 16000)
+ ->ArgPair(10000, 18000)
+ ->ArgPair(10000, 20000);
+
} // namespace
} // namespace lib
diff --git a/icing/scoring/scored-document-hit.cc b/icing/scoring/scored-document-hit.cc
new file mode 100644
index 0000000..f519a16
--- /dev/null
+++ b/icing/scoring/scored-document-hit.cc
@@ -0,0 +1,30 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/scored-document-hit.h"
+
+namespace icing {
+namespace lib {
+
+JoinedScoredDocumentHit ScoredDocumentHit::Converter::operator()(
+ ScoredDocumentHit&& scored_doc_hit) const {
+ double final_score = scored_doc_hit.score();
+ return JoinedScoredDocumentHit(
+ final_score,
+ /*parent_scored_document_hit=*/std::move(scored_doc_hit),
+ /*child_scored_document_hits=*/{});
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/scored-document-hit.h b/icing/scoring/scored-document-hit.h
index c2e51b8..5fc2f3a 100644
--- a/icing/scoring/scored-document-hit.h
+++ b/icing/scoring/scored-document-hit.h
@@ -24,11 +24,19 @@
namespace icing {
namespace lib {
+class JoinedScoredDocumentHit;
+
// A data class containing information about the document, hit sections, and a
// score. The score is calculated against both the document and the hit
// sections.
class ScoredDocumentHit {
public:
+ class Converter {
+ public:
+ JoinedScoredDocumentHit operator()(
+ ScoredDocumentHit&& scored_doc_hit) const;
+ };
+
ScoredDocumentHit(DocumentId document_id, SectionIdMask hit_section_id_mask,
double score)
: document_id_(document_id),
@@ -53,8 +61,8 @@ class ScoredDocumentHit {
double score_;
} __attribute__((packed));
-static_assert(sizeof(ScoredDocumentHit) == 14,
- "Size of ScoredDocHit should be 14");
+static_assert(sizeof(ScoredDocumentHit) == 20,
+ "Size of ScoredDocHit should be 20");
static_assert(icing_is_packed_pod<ScoredDocumentHit>::value, "go/icing-ubsan");
// A custom comparator for ScoredDocumentHit that determines which
@@ -71,13 +79,79 @@ class ScoredDocumentHitComparator {
bool operator()(const ScoredDocumentHit& lhs,
const ScoredDocumentHit& rhs) const {
- return is_descending_ == !(lhs < rhs);
+ // STL comparator requirement: comparing equal elements MUST return false.
+ // If we wrote `return is_descending_ == !(lhs < rhs)`:
+ // - When lhs == rhs, !(lhs < rhs) is true.
+ // - If is_descending_ is true, then we would return true for the equal case,
+ // violating the strict weak ordering (irreflexivity) that STL sorting and
+ // heap algorithms rely on.
+ if (is_descending_) {
+ return rhs < lhs;
+ }
+ return lhs < rhs;
}
private:
bool is_descending_;
};
+// A data class containing information about a composite document after
+// joining, including the final score, the parent ScoredDocumentHit, and a
+// vector of all child ScoredDocumentHits. The final score is calculated by the
+// strategy specified in the join spec/rank strategy. It could be an aggregated
+// score, the raw parent doc score, or anything else.
+//
+// ScoredDocumentHitsRanker may store ScoredDocumentHit or
+// JoinedScoredDocumentHit.
+// - We could've created a virtual class for them and had
+// ScoredDocumentHitsRanker use the abstract type.
+// - However, Icing lib caches ScoredDocumentHitsRanker (which contains a list
+// of (Joined)ScoredDocumentHits) in ResultState. Inheriting from the virtual
+// class would add 8 bytes for a vtable pointer to each class, increasing
+// memory usage by roughly 40% and 15% respectively (8 bytes on top of the
+// 20-byte packed ScoredDocumentHit, and on top of the ~52-byte packed
+// JoinedScoredDocumentHit, assuming a 24-byte std::vector).
+// - Also, since JoinedScoredDocumentHit is a super-set of ScoredDocumentHit,
+// let's avoid the common virtual class and instead implement a conversion
+// function (original type -> JoinedScoredDocumentHit) for each class, so
+// ScoredDocumentHitsRanker::PopNext can return a common type (i.e.
+// JoinedScoredDocumentHit).
+class JoinedScoredDocumentHit {
+ public:
+ class Converter {
+ public:
+ JoinedScoredDocumentHit operator()(
+ JoinedScoredDocumentHit&& scored_doc_hit) const {
+ return scored_doc_hit;
+ }
+ };
+
+ explicit JoinedScoredDocumentHit(
+ double final_score, ScoredDocumentHit parent_scored_document_hit,
+ std::vector<ScoredDocumentHit> child_scored_document_hits)
+ : final_score_(final_score),
+ parent_scored_document_hit_(std::move(parent_scored_document_hit)),
+ child_scored_document_hits_(std::move(child_scored_document_hits)) {}
+
+ bool operator<(const JoinedScoredDocumentHit& other) const {
+ if (final_score_ != other.final_score_) {
+ return final_score_ < other.final_score_;
+ }
+ return parent_scored_document_hit_ < other.parent_scored_document_hit_;
+ }
+
+ double final_score() const { return final_score_; }
+
+ const ScoredDocumentHit& parent_scored_document_hit() const {
+ return parent_scored_document_hit_;
+ }
+
+ const std::vector<ScoredDocumentHit>& child_scored_document_hits() const {
+ return child_scored_document_hits_;
+ }
+
+ private:
+ double final_score_;
+ ScoredDocumentHit parent_scored_document_hit_;
+ std::vector<ScoredDocumentHit> child_scored_document_hits_;
+} __attribute__((packed));
+
} // namespace lib
} // namespace icing
diff --git a/icing/scoring/scored-document-hit_test.cc b/icing/scoring/scored-document-hit_test.cc
new file mode 100644
index 0000000..cb9703b
--- /dev/null
+++ b/icing/scoring/scored-document-hit_test.cc
@@ -0,0 +1,77 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/scored-document-hit.h"
+
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::DoubleEq;
+using ::testing::IsEmpty;
+
+TEST(ScoredDocumentHitTest, ScoredDocumentHitConvertToJoinedScoredDocumentHit) {
+ ScoredDocumentHit::Converter converter;
+
+ double score = 2.0;
+ ScoredDocumentHit scored_document_hit(/*document_id=*/5,
+ /*section_id_mask=*/49, score);
+
+ JoinedScoredDocumentHit joined_scored_document_hit =
+ converter(ScoredDocumentHit(scored_document_hit));
+ EXPECT_THAT(joined_scored_document_hit.final_score(), DoubleEq(score));
+ EXPECT_THAT(joined_scored_document_hit.parent_scored_document_hit(),
+ EqualsScoredDocumentHit(scored_document_hit));
+ EXPECT_THAT(joined_scored_document_hit.child_scored_document_hits(),
+ IsEmpty());
+}
+
+TEST(ScoredDocumentHitTest,
+ JoinedScoredDocumentHitConvertToJoinedScoredDocumentHit) {
+ JoinedScoredDocumentHit::Converter converter;
+
+ ScoredDocumentHit parent_scored_document_hit(/*document_id=*/5,
+ /*section_id_mask=*/49,
+ /*score=*/1.0);
+ std::vector<ScoredDocumentHit> child_scored_document_hits{
+ ScoredDocumentHit(/*document_id=*/1,
+ /*section_id_mask=*/1,
+ /*score=*/2.0),
+ ScoredDocumentHit(/*document_id=*/2,
+ /*section_id_mask=*/2,
+ /*score=*/3.0),
+ ScoredDocumentHit(/*document_id=*/3,
+ /*section_id_mask=*/3,
+ /*score=*/4.0)};
+
+ JoinedScoredDocumentHit joined_scored_document_hit(
+ /*final_score=*/12345.6789, std::move(parent_scored_document_hit),
+ std::move(child_scored_document_hits));
+ EXPECT_THAT(converter(JoinedScoredDocumentHit(joined_scored_document_hit)),
+ EqualsJoinedScoredDocumentHit(joined_scored_document_hit));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/scored-document-hits-ranker.h b/icing/scoring/scored-document-hits-ranker.h
new file mode 100644
index 0000000..9b76ce7
--- /dev/null
+++ b/icing/scoring/scored-document-hits-ranker.h
@@ -0,0 +1,62 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_SCORED_DOCUMENT_HITS_RANKER_H_
+#define ICING_SCORING_SCORED_DOCUMENT_HITS_RANKER_H_
+
+#include "icing/scoring/scored-document-hit.h"
+
+namespace icing {
+namespace lib {
+
+// TODO(sungyc): re-evaluate other similar implementations (e.g. std::sort +
+// std::queue/std::vector). Also revisit the capacity shrinking
+// issue for PopNext().
+
+// ScoredDocumentHitsRanker is an interface class for ranking
+// ScoredDocumentHits.
+class ScoredDocumentHitsRanker {
+ public:
+ virtual ~ScoredDocumentHitsRanker() = default;
+
+ // Pops the next top JoinedScoredDocumentHit and returns it. Calling PopNext
+ // on an empty ranker is undefined, so the caller should check that the
+ // ranker is not empty before calling.
+ //
+ // Note: a ranker may store ScoredDocumentHit or JoinedScoredDocumentHit. We
+ // could make this interface a template, but since JoinedScoredDocumentHit is
+ // a superset of ScoredDocumentHit, we unify the return type of PopNext to
+ // the superset type JoinedScoredDocumentHit in order to keep it simple:
+ // rankers storing ScoredDocumentHit should convert to
+ // JoinedScoredDocumentHit before returning. This makes the implementation
+ // simpler, especially for ResultRetriever, which now only needs to deal with
+ // a single return format.
+ virtual JoinedScoredDocumentHit PopNext() = 0;
+
+ // Truncates the remaining ScoredDocumentHits to the given size. The best
+ // ScoredDocumentHits (according to the ranking policy) should be kept.
+ // If new_size is invalid (< 0), or greater than or equal to the number of
+ // remaining ScoredDocumentHits, then no action will be taken. Otherwise
+ // truncates the remaining ScoredDocumentHits to the given size.
+ virtual void TruncateHitsTo(int new_size) = 0;
+
+ virtual int size() const = 0;
+
+ virtual bool empty() const = 0;
+};
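+
+// A minimal usage sketch (illustrative; `ranker` is any concrete
+// implementation of this interface):
+//
+//   while (!ranker.empty()) {
+//     JoinedScoredDocumentHit hit = ranker.PopNext();
+//     // ... consume hit ...
+//   }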
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_SCORED_DOCUMENT_HITS_RANKER_H_
diff --git a/icing/scoring/scorer-factory.cc b/icing/scoring/scorer-factory.cc
new file mode 100644
index 0000000..e56f10c
--- /dev/null
+++ b/icing/scoring/scorer-factory.cc
@@ -0,0 +1,242 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/scorer-factory.h"
+
+#include <memory>
+#include <unordered_map>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/scoring/advanced_scoring/advanced-scorer.h"
+#include "icing/scoring/bm25f-calculator.h"
+#include "icing/scoring/scorer.h"
+#include "icing/scoring/section-weights.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+class DocumentScoreScorer : public Scorer {
+ public:
+ explicit DocumentScoreScorer(const DocumentStore* document_store,
+ double default_score)
+ : document_store_(*document_store), default_score_(default_score) {}
+
+ double GetScore(const DocHitInfo& hit_info,
+ const DocHitInfoIterator*) override {
+ ICING_ASSIGN_OR_RETURN(
+ DocumentAssociatedScoreData score_data,
+ document_store_.GetDocumentAssociatedScoreData(hit_info.document_id()),
+ default_score_);
+
+ return static_cast<double>(score_data.document_score());
+ }
+
+ private:
+ const DocumentStore& document_store_;
+ double default_score_;
+};
+
+class DocumentCreationTimestampScorer : public Scorer {
+ public:
+ explicit DocumentCreationTimestampScorer(const DocumentStore* document_store,
+ double default_score)
+ : document_store_(*document_store), default_score_(default_score) {}
+
+ double GetScore(const DocHitInfo& hit_info,
+ const DocHitInfoIterator*) override {
+ ICING_ASSIGN_OR_RETURN(
+ DocumentAssociatedScoreData score_data,
+ document_store_.GetDocumentAssociatedScoreData(hit_info.document_id()),
+ default_score_);
+
+ return static_cast<double>(score_data.creation_timestamp_ms());
+ }
+
+ private:
+ const DocumentStore& document_store_;
+ double default_score_;
+};
+
+class RelevanceScoreScorer : public Scorer {
+ public:
+ explicit RelevanceScoreScorer(
+ std::unique_ptr<SectionWeights> section_weights,
+ std::unique_ptr<Bm25fCalculator> bm25f_calculator, double default_score)
+ : section_weights_(std::move(section_weights)),
+ bm25f_calculator_(std::move(bm25f_calculator)),
+ default_score_(default_score) {}
+
+ void PrepareToScore(
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>*
+ query_term_iterators) override {
+ bm25f_calculator_->PrepareToScore(query_term_iterators);
+ }
+
+ double GetScore(const DocHitInfo& hit_info,
+ const DocHitInfoIterator* query_it) override {
+ if (!query_it) {
+ return default_score_;
+ }
+
+ return static_cast<double>(
+ bm25f_calculator_->ComputeScore(query_it, hit_info, default_score_));
+ }
+
+ private:
+ std::unique_ptr<SectionWeights> section_weights_;
+ std::unique_ptr<Bm25fCalculator> bm25f_calculator_;
+ double default_score_;
+};
+
+// A scorer which assigns scores to documents based on usage reports.
+class UsageScorer : public Scorer {
+ public:
+ UsageScorer(const DocumentStore* document_store,
+ ScoringSpecProto::RankingStrategy::Code ranking_strategy,
+ double default_score, int64_t current_time_ms)
+ : document_store_(*document_store),
+ ranking_strategy_(ranking_strategy),
+ default_score_(default_score),
+ current_time_ms_(current_time_ms) {}
+
+ double GetScore(const DocHitInfo& hit_info,
+ const DocHitInfoIterator*) override {
+ std::optional<UsageStore::UsageScores> usage_scores =
+ document_store_.GetUsageScores(hit_info.document_id(),
+ current_time_ms_);
+ if (!usage_scores) {
+ // If there's no UsageScores entry present for this doc, then just
+ // treat it as a default instance.
+ usage_scores = UsageStore::UsageScores();
+ }
+
+ switch (ranking_strategy_) {
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT:
+ return usage_scores->usage_type1_count;
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT:
+ return usage_scores->usage_type2_count;
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT:
+ return usage_scores->usage_type3_count;
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP:
+ return usage_scores->usage_type1_last_used_timestamp_s * 1000.0;
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP:
+ return usage_scores->usage_type2_last_used_timestamp_s * 1000.0;
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP:
+ return usage_scores->usage_type3_last_used_timestamp_s * 1000.0;
+ default:
+ // This shouldn't happen if this scorer is used correctly.
+ return default_score_;
+ }
+ }
+
+ private:
+ const DocumentStore& document_store_;
+ ScoringSpecProto::RankingStrategy::Code ranking_strategy_;
+ double default_score_;
+ int64_t current_time_ms_;
+};
+
+// A special scorer which does nothing but assign the default score to each
+// document. This is used mainly when no scoring is required in a query.
+class NoScorer : public Scorer {
+ public:
+ explicit NoScorer(double default_score) : default_score_(default_score) {}
+
+ double GetScore(const DocHitInfo& hit_info,
+ const DocHitInfoIterator*) override {
+ return default_score_;
+ }
+
+ private:
+ double default_score_;
+};
+
+namespace scorer_factory {
+
+libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Create(
+ const ScoringSpecProto& scoring_spec, double default_score,
+ const DocumentStore* document_store, const SchemaStore* schema_store,
+ int64_t current_time_ms, const JoinChildrenFetcher* join_children_fetcher) {
+ ICING_RETURN_ERROR_IF_NULL(document_store);
+ ICING_RETURN_ERROR_IF_NULL(schema_store);
+
+ if (!scoring_spec.advanced_scoring_expression().empty() &&
+ scoring_spec.rank_by() !=
+ ScoringSpecProto::RankingStrategy::ADVANCED_SCORING_EXPRESSION) {
+ return absl_ports::InvalidArgumentError(
+ "Advanced scoring is not enabled, but the advanced scoring expression "
+ "is not empty!");
+ }
+
+ switch (scoring_spec.rank_by()) {
+ case ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE:
+ return std::make_unique<DocumentScoreScorer>(document_store,
+ default_score);
+ case ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP:
+ return std::make_unique<DocumentCreationTimestampScorer>(document_store,
+ default_score);
+ case ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE: {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store, scoring_spec));
+
+ auto bm25f_calculator = std::make_unique<Bm25fCalculator>(
+ document_store, section_weights.get(), current_time_ms);
+ return std::make_unique<RelevanceScoreScorer>(std::move(section_weights),
+ std::move(bm25f_calculator),
+ default_score);
+ }
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT:
+ [[fallthrough]];
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT:
+ [[fallthrough]];
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT:
+ [[fallthrough]];
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP:
+ [[fallthrough]];
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP:
+ [[fallthrough]];
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP:
+ return std::make_unique<UsageScorer>(document_store,
+ scoring_spec.rank_by(),
+ default_score, current_time_ms);
+ case ScoringSpecProto::RankingStrategy::ADVANCED_SCORING_EXPRESSION:
+ if (scoring_spec.advanced_scoring_expression().empty()) {
+ return absl_ports::InvalidArgumentError(
+ "Advanced scoring is enabled, but the expression is empty!");
+ }
+ return AdvancedScorer::Create(scoring_spec, default_score, document_store,
+ schema_store, current_time_ms,
+ join_children_fetcher);
+ case ScoringSpecProto::RankingStrategy::JOIN_AGGREGATE_SCORE:
+ // Use join aggregate score to rank. Since the aggregation score is
+ // calculated by child documents after joining (in JoinProcessor), we can
+ // simply use NoScorer for parent documents.
+ [[fallthrough]];
+ case ScoringSpecProto::RankingStrategy::NONE:
+ return std::make_unique<NoScorer>(default_score);
+ }
+}
+
+} // namespace scorer_factory
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/scorer-factory.h b/icing/scoring/scorer-factory.h
new file mode 100644
index 0000000..659bebd
--- /dev/null
+++ b/icing/scoring/scorer-factory.h
@@ -0,0 +1,49 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_SCORER_FACTORY_H_
+#define ICING_SCORING_SCORER_FACTORY_H_
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/join/join-children-fetcher.h"
+#include "icing/scoring/scorer.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+namespace scorer_factory {
+
+// Factory function to create a Scorer. It does not take ownership of any
+// input components (e.g. DocumentStore, SchemaStore), and all pointers must
+// refer to valid
+// objects that outlive the created Scorer instance. The default score will be
+// returned only when the scorer fails to find or calculate a score for the
+// document.
+//
+// Returns:
+// A Scorer on success
+// FAILED_PRECONDITION on any null pointer input
+// INVALID_ARGUMENT if fails to create an instance
+libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Create(
+ const ScoringSpecProto& scoring_spec, double default_score,
+ const DocumentStore* document_store, const SchemaStore* schema_store,
+ int64_t current_time_ms,
+ const JoinChildrenFetcher* join_children_fetcher = nullptr);
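+
+// A minimal usage sketch (illustrative; error handling is elided and the
+// surrounding stores/clock are assumed to be set up as in scorer_test.cc):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<Scorer> scorer,
+//       scorer_factory::Create(scoring_spec, /*default_score=*/0.0,
+//                              document_store.get(), schema_store.get(),
+//                              clock.GetSystemTimeMilliseconds()));
+//   double score = scorer->GetScore(doc_hit_info);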
+
+} // namespace scorer_factory
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_SCORER_FACTORY_H_
diff --git a/icing/scoring/scorer-test-utils.h b/icing/scoring/scorer-test-utils.h
new file mode 100644
index 0000000..e8ca853
--- /dev/null
+++ b/icing/scoring/scorer-test-utils.h
@@ -0,0 +1,77 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_SCORER_TEST_UTILS_H_
+#define ICING_SCORING_SCORER_TEST_UTILS_H_
+
+#include "icing/proto/scoring.pb.h"
+
+namespace icing {
+namespace lib {
+
+enum class ScorerTestingMode { kNormal, kAdvanced };
+
+inline ScoringSpecProto CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::Code ranking_strategy,
+ ScorerTestingMode testing_mode) {
+ ScoringSpecProto scoring_spec;
+ if (testing_mode != ScorerTestingMode::kAdvanced) {
+ scoring_spec.set_rank_by(ranking_strategy);
+ return scoring_spec;
+ }
+ scoring_spec.set_rank_by(
+ ScoringSpecProto::RankingStrategy::ADVANCED_SCORING_EXPRESSION);
+ switch (ranking_strategy) {
+ case ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE:
+ scoring_spec.set_advanced_scoring_expression("this.documentScore()");
+ return scoring_spec;
+ case ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP:
+ scoring_spec.set_advanced_scoring_expression("this.creationTimestamp()");
+ return scoring_spec;
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT:
+ scoring_spec.set_advanced_scoring_expression("this.usageCount(1)");
+ return scoring_spec;
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT:
+ scoring_spec.set_advanced_scoring_expression("this.usageCount(2)");
+ return scoring_spec;
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT:
+ scoring_spec.set_advanced_scoring_expression("this.usageCount(3)");
+ return scoring_spec;
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP:
+ scoring_spec.set_advanced_scoring_expression(
+ "this.usageLastUsedTimestamp(1)");
+ return scoring_spec;
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE2_LAST_USED_TIMESTAMP:
+ scoring_spec.set_advanced_scoring_expression(
+ "this.usageLastUsedTimestamp(2)");
+ return scoring_spec;
+ case ScoringSpecProto::RankingStrategy::USAGE_TYPE3_LAST_USED_TIMESTAMP:
+ scoring_spec.set_advanced_scoring_expression(
+ "this.usageLastUsedTimestamp(3)");
+ return scoring_spec;
+ case ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE:
+ scoring_spec.set_advanced_scoring_expression("this.relevanceScore()");
+ return scoring_spec;
+ case ScoringSpecProto::RankingStrategy::NONE:
+ case ScoringSpecProto::RankingStrategy::JOIN_AGGREGATE_SCORE:
+ case ScoringSpecProto::RankingStrategy::ADVANCED_SCORING_EXPRESSION:
+ scoring_spec.set_rank_by(ranking_strategy);
+ return scoring_spec;
+ }
+}
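+
+// For example (illustrative), calling
+//   CreateScoringSpecForRankingStrategy(
+//       ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
+//       ScorerTestingMode::kAdvanced)
+// returns a spec with rank_by set to ADVANCED_SCORING_EXPRESSION and the
+// expression "this.documentScore()", so the same test body can exercise both
+// the normal and the advanced scorer implementations.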
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_SCORER_TEST_UTILS_H_
diff --git a/icing/scoring/scorer.cc b/icing/scoring/scorer.cc
deleted file mode 100644
index ab5308c..0000000
--- a/icing/scoring/scorer.cc
+++ /dev/null
@@ -1,100 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/scoring/scorer.h"
-
-#include <memory>
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/proto/scoring.pb.h"
-#include "icing/store/document-associated-score-data.h"
-#include "icing/store/document-id.h"
-#include "icing/store/document-store.h"
-#include "icing/util/status-macros.h"
-
-namespace icing {
-namespace lib {
-
-class DocumentScoreScorer : public Scorer {
- public:
- explicit DocumentScoreScorer(const DocumentStore* document_store,
- double default_score)
- : document_store_(*document_store), default_score_(default_score) {}
-
- double GetScore(DocumentId document_id) override {
- ICING_ASSIGN_OR_RETURN(
- DocumentAssociatedScoreData score_data,
- document_store_.GetDocumentAssociatedScoreData(document_id),
- default_score_);
-
- return static_cast<double>(score_data.document_score());
- }
-
- private:
- const DocumentStore& document_store_;
- double default_score_;
-};
-
-class DocumentCreationTimestampScorer : public Scorer {
- public:
- explicit DocumentCreationTimestampScorer(const DocumentStore* document_store,
- double default_score)
- : document_store_(*document_store), default_score_(default_score) {}
-
- double GetScore(DocumentId document_id) override {
- ICING_ASSIGN_OR_RETURN(
- DocumentAssociatedScoreData score_data,
- document_store_.GetDocumentAssociatedScoreData(document_id),
- default_score_);
-
- return static_cast<double>(score_data.creation_timestamp_ms());
- }
-
- private:
- const DocumentStore& document_store_;
- double default_score_;
-};
-
-// A special scorer which does nothing but assigns the default score to each
-// document. This is used especially when no scoring is required in a query.
-class NoScorer : public Scorer {
- public:
- explicit NoScorer(double default_score) : default_score_(default_score) {}
-
- double GetScore(DocumentId document_id) override { return default_score_; }
-
- private:
- double default_score_;
-};
-
-libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Scorer::Create(
- ScoringSpecProto::RankingStrategy::Code rank_by, double default_score,
- const DocumentStore* document_store) {
- ICING_RETURN_ERROR_IF_NULL(document_store);
-
- switch (rank_by) {
- case ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE:
- return std::make_unique<DocumentScoreScorer>(document_store,
- default_score);
- case ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP:
- return std::make_unique<DocumentCreationTimestampScorer>(document_store,
- default_score);
- case ScoringSpecProto::RankingStrategy::NONE:
- return std::make_unique<NoScorer>(default_score);
- }
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/scoring/scorer.h b/icing/scoring/scorer.h
index 55c6b5c..ec48502 100644
--- a/icing/scoring/scorer.h
+++ b/icing/scoring/scorer.h
@@ -16,11 +16,11 @@
#define ICING_SCORING_SCORER_H_
#include <memory>
+#include <unordered_map>
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/proto/scoring.pb.h"
-#include "icing/store/document-id.h"
-#include "icing/store/document-store.h"
namespace icing {
namespace lib {
@@ -30,34 +30,30 @@ class Scorer {
public:
virtual ~Scorer() = default;
- // Factory function to create a Scorer which does not take ownership of any
- // input components (DocumentStore), and all pointers must refer to valid
- // objects that outlive the created Scorer instance. The default score will be
- // returned only when the scorer fails to find or calculate a score for the
- // document.
- //
- // Returns:
- // A Scorer on success
- // FAILED_PRECONDITION on any null pointer input
- // INVALID_ARGUMENT if fails to create an instance
- static libtextclassifier3::StatusOr<std::unique_ptr<Scorer>> Create(
- ScoringSpecProto::RankingStrategy::Code rank_by, double default_score,
- const DocumentStore* document_store);
-
// Returns a non-negative score of a document. The score can be a
// document-associated score which comes from the DocumentProto directly, an
- // accumulated score, or even an inferred score. If it fails to find or
- // calculate a score, the user-provided default score will be returned.
+ // accumulated score, a relevance score, or even an inferred score. If it
+ // fails to find or calculate a score, the user-provided default score will be
+ // returned.
//
// Some examples of possible scores:
// 1. Document-associated scores: document score, creation timestamp score.
// 2. Accumulated scores: usage count score.
// 3. Inferred scores: a score calculated by a machine learning model.
+ // 4. Relevance score: computed as a BM25F score.
//
// NOTE: This method is performance-sensitive as it's called for every
// potential result document. We're trying to avoid returning StatusOr<double>
// to save a little more time and memory.
- virtual double GetScore(DocumentId document_id) = 0;
+ virtual double GetScore(const DocHitInfo& hit_info,
+ const DocHitInfoIterator* query_it = nullptr) = 0;
+
+ // Currently only overridden by the RelevanceScoreScorer.
+ // NOTE: the query_term_iterators map must outlive the scorer; see
+ // bm25f-calculator for more details.
+ virtual void PrepareToScore(
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>*
+ query_term_iterators) {}
};
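+
+// For relevance scoring, a minimal sketch of the expected call order
+// (illustrative; `hits` and `query_iterator` are hypothetical stand-ins for
+// the caller's data):
+//
+//   scorer->PrepareToScore(&query_term_iterators);
+//   for (const DocHitInfo& hit_info : hits) {
+//     double score = scorer->GetScore(hit_info, query_iterator);
+//   }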
} // namespace lib
diff --git a/icing/scoring/scorer_test.cc b/icing/scoring/scorer_test.cc
index 4dda603..5194c7f 100644
--- a/icing/scoring/scorer_test.cc
+++ b/icing/scoring/scorer_test.cc
@@ -21,10 +21,16 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/index/hit/doc-hit-info.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/schema-builder.h"
#include "icing/schema/schema-store.h"
+#include "icing/scoring/scorer-factory.h"
+#include "icing/scoring/scorer-test-utils.h"
+#include "icing/scoring/section-weights.h"
#include "icing/store/document-id.h"
#include "icing/store/document-store.h"
#include "icing/testing/common-matchers.h"
@@ -36,9 +42,8 @@ namespace lib {
namespace {
using ::testing::Eq;
-using ::testing::Test;
-class ScorerTest : public Test {
+class ScorerTest : public ::testing::TestWithParam<ScorerTestingMode> {
protected:
ScorerTest()
: test_dir_(GetTestTempDir() + "/icing"),
@@ -54,23 +59,34 @@ class ScorerTest : public Test {
fake_clock2_.SetSystemTimeMilliseconds(1572200000000);
ICING_ASSERT_OK_AND_ASSIGN(
- schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_));
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock1_));
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock1_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, doc_store_dir_, &fake_clock1_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(create_result.document_store);
// Creates a simple email schema
- SchemaProto test_email_schema;
- auto type_config = test_email_schema.add_types();
- type_config->set_schema_type("email");
- auto subject = type_config->add_properties();
- subject->set_property_name("subject");
- subject->set_data_type(PropertyConfigProto::DataType::STRING);
- subject->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
-
- ICING_ASSERT_OK(schema_store_->SetSchema(test_email_schema));
+ SchemaProto test_email_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED)))
+ .Build();
+
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ test_email_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
}
void TearDown() override {
@@ -81,10 +97,16 @@ class ScorerTest : public Test {
DocumentStore* document_store() { return document_store_.get(); }
+ SchemaStore* schema_store() { return schema_store_.get(); }
+
const FakeClock& fake_clock1() { return fake_clock1_; }
const FakeClock& fake_clock2() { return fake_clock2_; }
+ void SetFakeClock1Time(int64_t new_time) {
+ fake_clock1_.SetSystemTimeMilliseconds(new_time);
+ }
+
private:
const std::string test_dir_;
const std::string doc_store_dir_;
@@ -96,24 +118,53 @@ class ScorerTest : public Test {
FakeClock fake_clock2_;
};
-TEST_F(ScorerTest, CreationWithNullPointerShouldFail) {
- EXPECT_THAT(Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
- /*default_score=*/0, /*document_store=*/nullptr),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+UsageReport CreateUsageReport(std::string name_space, std::string uri,
+ int64_t timestamp_ms,
+ UsageReport::UsageType usage_type) {
+ UsageReport usage_report;
+ usage_report.set_document_namespace(name_space);
+ usage_report.set_document_uri(uri);
+ usage_report.set_usage_timestamp_ms(timestamp_ms);
+ usage_report.set_usage_type(usage_type);
+ return usage_report;
+}
+
+TEST_P(ScorerTest, CreationWithNullDocumentStoreShouldFail) {
+ EXPECT_THAT(
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, GetParam()),
+ /*default_score=*/0, /*document_store=*/nullptr, schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
-TEST_F(ScorerTest, ShouldGetDefaultScore) {
+TEST_P(ScorerTest, CreationWithNullSchemaStoreShouldFail) {
+ EXPECT_THAT(
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, GetParam()),
+ /*default_score=*/0, document_store(),
+ /*schema_store=*/nullptr, fake_clock1().GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_P(ScorerTest, ShouldGetDefaultScoreIfDocumentDoesntExist) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
- /*default_score=*/10, document_store()));
-
- DocumentId non_existing_document_id = 1;
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, GetParam()),
+ /*default_score=*/10, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+
+ // Non-existent document id.
+ DocHitInfo docHitInfo = DocHitInfo(/*document_id_in=*/1);
// The caller-provided default score is returned
- EXPECT_THAT(scorer->GetScore(non_existing_document_id), Eq(10));
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10));
}
-TEST_F(ScorerTest, ShouldGetDefaultDocumentScore) {
+TEST_P(ScorerTest, ShouldGetDefaultDocumentScore) {
// Creates a test document with the default document score 0
DocumentProto test_document =
DocumentBuilder()
@@ -127,13 +178,17 @@ TEST_F(ScorerTest, ShouldGetDefaultDocumentScore) {
document_store()->Put(test_document));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
- /*default_score=*/10, document_store()));
-
- EXPECT_THAT(scorer->GetScore(document_id), Eq(0));
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, GetParam()),
+ /*default_score=*/10, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(0));
}
-TEST_F(ScorerTest, ShouldGetCorrectDocumentScore) {
+TEST_P(ScorerTest, ShouldGetCorrectDocumentScore) {
// Creates a test document with document score 5
DocumentProto test_document =
DocumentBuilder()
@@ -148,13 +203,44 @@ TEST_F(ScorerTest, ShouldGetCorrectDocumentScore) {
document_store()->Put(test_document));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE,
- /*default_score=*/0, document_store()));
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, GetParam()),
+ /*default_score=*/0, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(5));
+}
+
+// See scoring-processor_test.cc and icing-search-engine_test.cc for more
+// thorough BM25F scoring tests.
+TEST_P(ScorerTest, QueryIteratorNullRelevanceScoreShouldReturnDefaultScore) {
+ // Creates a test document with document score 5
+ DocumentProto test_document =
+ DocumentBuilder()
+ .SetScore(5)
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetCreationTimestampMs(fake_clock2().GetSystemTimeMilliseconds())
+ .Build();
- EXPECT_THAT(scorer->GetScore(document_id), Eq(5));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store()->Put(test_document));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer,
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, GetParam()),
+ /*default_score=*/10, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+ EXPECT_THAT(scorer->GetScore(docHitInfo), Eq(10));
}
-TEST_F(ScorerTest, ShouldGetCorrectCreationTimestampScore) {
+TEST_P(ScorerTest, ShouldGetCorrectCreationTimestampScore) {
// Creates test_document1 with fake timestamp1
DocumentProto test_document1 =
DocumentBuilder()
@@ -178,34 +264,457 @@ TEST_F(ScorerTest, ShouldGetCorrectCreationTimestampScore) {
document_store()->Put(test_document2));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP,
- /*default_score=*/0, document_store()));
-
- EXPECT_THAT(scorer->GetScore(document_id1),
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP,
+ GetParam()),
+ /*default_score=*/0, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+
+ DocHitInfo docHitInfo1 = DocHitInfo(document_id1);
+ DocHitInfo docHitInfo2 = DocHitInfo(document_id2);
+ EXPECT_THAT(scorer->GetScore(docHitInfo1),
Eq(fake_clock1().GetSystemTimeMilliseconds()));
- EXPECT_THAT(scorer->GetScore(document_id2),
+ EXPECT_THAT(scorer->GetScore(docHitInfo2),
Eq(fake_clock2().GetSystemTimeMilliseconds()));
}
-TEST_F(ScorerTest, NoScorerShouldAlwaysReturnDefaultScore) {
+TEST_P(ScorerTest, ShouldGetCorrectUsageCountScoreForType1) {
+ DocumentProto test_document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetCreationTimestampMs(fake_clock1().GetSystemTimeMilliseconds())
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store()->Put(test_document));
+
+ // Create 3 scorers for 3 different usage types.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer1,
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT, GetParam()),
+ /*default_score=*/0, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer2,
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT, GetParam()),
+ /*default_score=*/0, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer3,
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT, GetParam()),
+ /*default_score=*/0, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+
+ // Report a type1 usage.
+ UsageReport usage_report_type1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1));
+
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(1));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+}
+
+TEST_P(ScorerTest, ShouldGetCorrectUsageCountScoreForType2) {
+ DocumentProto test_document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetCreationTimestampMs(fake_clock1().GetSystemTimeMilliseconds())
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store()->Put(test_document));
+
+ // Create 3 scorers for 3 different usage types.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Scorer> scorer,
- Scorer::Create(ScoringSpecProto::RankingStrategy::NONE,
- /*default_score=*/3, document_store()));
+ std::unique_ptr<Scorer> scorer1,
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT, GetParam()),
+ /*default_score=*/0, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer2,
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT, GetParam()),
+ /*default_score=*/0, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer3,
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT, GetParam()),
+ /*default_score=*/0, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+
+ // Report a type2 usage.
+ UsageReport usage_report_type2 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE2);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type2));
+
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(1));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+}
+
+TEST_P(ScorerTest, ShouldGetCorrectUsageCountScoreForType3) {
+ DocumentProto test_document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetCreationTimestampMs(fake_clock1().GetSystemTimeMilliseconds())
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store()->Put(test_document));
+
+ // Create 3 scorers for 3 different usage types.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer1,
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT, GetParam()),
+ /*default_score=*/0, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer2,
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE2_COUNT, GetParam()),
+ /*default_score=*/0, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer3,
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE3_COUNT, GetParam()),
+ /*default_score=*/0, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+
+ // Report a type3 usage.
+ UsageReport usage_report_type3 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE3);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type3));
+
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(1));
+}
+
+TEST_P(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType1) {
+ DocumentProto test_document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetCreationTimestampMs(fake_clock1().GetSystemTimeMilliseconds())
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store()->Put(test_document));
+
+ // Create 3 scorers for 3 different usage types.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer1,
+ scorer_factory::Create(CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::
+ USAGE_TYPE1_LAST_USED_TIMESTAMP,
+ GetParam()),
+ /*default_score=*/0, document_store(),
+ schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer2,
+ scorer_factory::Create(CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::
+ USAGE_TYPE2_LAST_USED_TIMESTAMP,
+ GetParam()),
+ /*default_score=*/0, document_store(),
+ schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer3,
+ scorer_factory::Create(CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::
+ USAGE_TYPE3_LAST_USED_TIMESTAMP,
+ GetParam()),
+ /*default_score=*/0, document_store(),
+ schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+
+ UsageReport usage_report_type1_time1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1_time1));
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(1000));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+
+ // Report usage with timestamp = 5000ms, score should be updated.
+ UsageReport usage_report_type1_time5 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/5000,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1_time5));
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(5000));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+
+ // Report usage with timestamp = 3000ms, score should not be updated.
+ UsageReport usage_report_type1_time3 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/3000,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1_time3));
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(5000));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+}
+
+TEST_P(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType2) {
+ DocumentProto test_document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetCreationTimestampMs(fake_clock1().GetSystemTimeMilliseconds())
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store()->Put(test_document));
+
+ // Create 3 scorers for 3 different usage types.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer1,
+ scorer_factory::Create(CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::
+ USAGE_TYPE1_LAST_USED_TIMESTAMP,
+ GetParam()),
+ /*default_score=*/0, document_store(),
+ schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer2,
+ scorer_factory::Create(CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::
+ USAGE_TYPE2_LAST_USED_TIMESTAMP,
+ GetParam()),
+ /*default_score=*/0, document_store(),
+ schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer3,
+ scorer_factory::Create(CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::
+ USAGE_TYPE3_LAST_USED_TIMESTAMP,
+ GetParam()),
+ /*default_score=*/0, document_store(),
+ schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+
+ UsageReport usage_report_type2_time1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE2);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type2_time1));
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(1000));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+
+ // Report usage with timestamp = 5000ms, score should be updated.
+ UsageReport usage_report_type2_time5 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/5000,
+ UsageReport::USAGE_TYPE2);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type2_time5));
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(5000));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+
+ // Report usage with timestamp = 3000ms, score should not be updated.
+ UsageReport usage_report_type2_time3 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/3000,
+ UsageReport::USAGE_TYPE2);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type2_time3));
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(5000));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+}
+
+TEST_P(ScorerTest, ShouldGetCorrectUsageTimestampScoreForType3) {
+ DocumentProto test_document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetCreationTimestampMs(fake_clock1().GetSystemTimeMilliseconds())
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store()->Put(test_document));
+
+ // Create 3 scorers for 3 different usage types.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer1,
+ scorer_factory::Create(CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::
+ USAGE_TYPE1_LAST_USED_TIMESTAMP,
+ GetParam()),
+ /*default_score=*/0, document_store(),
+ schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer2,
+ scorer_factory::Create(CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::
+ USAGE_TYPE2_LAST_USED_TIMESTAMP,
+ GetParam()),
+ /*default_score=*/0, document_store(),
+ schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer3,
+ scorer_factory::Create(CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::
+ USAGE_TYPE3_LAST_USED_TIMESTAMP,
+ GetParam()),
+ /*default_score=*/0, document_store(),
+ schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(0));
+
+ UsageReport usage_report_type3_time1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE3);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type3_time1));
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(1000));
+
+ // Report usage with timestamp = 5000ms, score should be updated.
+ UsageReport usage_report_type3_time5 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/5000,
+ UsageReport::USAGE_TYPE3);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type3_time5));
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(5000));
+
+ // Report usage with timestamp = 3000ms, score should not be updated.
+ UsageReport usage_report_type3_time3 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/3000,
+ UsageReport::USAGE_TYPE3);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type3_time3));
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer2->GetScore(docHitInfo), Eq(0));
+ EXPECT_THAT(scorer3->GetScore(docHitInfo), Eq(5000));
+}
- EXPECT_THAT(scorer->GetScore(/*document_id=*/0), Eq(3));
- EXPECT_THAT(scorer->GetScore(/*document_id=*/1), Eq(3));
- EXPECT_THAT(scorer->GetScore(/*document_id=*/2), Eq(3));
+TEST_P(ScorerTest, NoScorerShouldAlwaysReturnDefaultScore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer,
+ scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::NONE, GetParam()),
+ /*default_score=*/3, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+
+ DocHitInfo docHitInfo1 = DocHitInfo(/*document_id_in=*/0);
+ DocHitInfo docHitInfo2 = DocHitInfo(/*document_id_in=*/1);
+ DocHitInfo docHitInfo3 = DocHitInfo(/*document_id_in=*/2);
+ EXPECT_THAT(scorer->GetScore(docHitInfo1), Eq(3));
+ EXPECT_THAT(scorer->GetScore(docHitInfo2), Eq(3));
+ EXPECT_THAT(scorer->GetScore(docHitInfo3), Eq(3));
ICING_ASSERT_OK_AND_ASSIGN(
- scorer, Scorer::Create(ScoringSpecProto::RankingStrategy::NONE,
- /*default_score=*/111, document_store()));
+ scorer, scorer_factory::Create(
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::NONE, GetParam()),
+ /*default_score=*/111, document_store(), schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+
+ docHitInfo1 = DocHitInfo(/*document_id_in=*/4);
+ docHitInfo2 = DocHitInfo(/*document_id_in=*/5);
+ docHitInfo3 = DocHitInfo(/*document_id_in=*/6);
+ EXPECT_THAT(scorer->GetScore(docHitInfo1), Eq(111));
+ EXPECT_THAT(scorer->GetScore(docHitInfo2), Eq(111));
+ EXPECT_THAT(scorer->GetScore(docHitInfo3), Eq(111));
+}
- EXPECT_THAT(scorer->GetScore(/*document_id=*/4), Eq(111));
- EXPECT_THAT(scorer->GetScore(/*document_id=*/5), Eq(111));
- EXPECT_THAT(scorer->GetScore(/*document_id=*/6), Eq(111));
+TEST_P(ScorerTest, ShouldScaleUsageTimestampScoreForMaxTimestamp) {
+ DocumentProto test_document =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetCreationTimestampMs(fake_clock1().GetSystemTimeMilliseconds())
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store()->Put(test_document));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Scorer> scorer1,
+ scorer_factory::Create(CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::
+ USAGE_TYPE1_LAST_USED_TIMESTAMP,
+ GetParam()),
+ /*default_score=*/0, document_store(),
+ schema_store(),
+ fake_clock1().GetSystemTimeMilliseconds()));
+ DocHitInfo docHitInfo = DocHitInfo(document_id);
+
+ // Create usage report for the maximum allowable timestamp.
+ UsageReport usage_report_type1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1",
+ /*timestamp_ms=*/std::numeric_limits<uint32_t>::max() * 1000.0,
+ UsageReport::USAGE_TYPE1);
+
+ double max_int_usage_timestamp_score =
+ std::numeric_limits<uint32_t>::max() * 1000.0;
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_type1));
+ EXPECT_THAT(scorer1->GetScore(docHitInfo), Eq(max_int_usage_timestamp_score));
}
+INSTANTIATE_TEST_SUITE_P(ScorerTest, ScorerTest,
+ testing::Values(ScorerTestingMode::kNormal,
+ ScorerTestingMode::kAdvanced));
+
} // namespace
} // namespace lib
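CreateScoringSpecForRankingStrategy comes from icing/scoring/scorer-test-utils.h, which this snapshot does not include. The sketch below is only an assumption about its likely shape, inferred from the call sites above; the kAdvanced branch is left as a comment because the real mapping is not visible in this patch.

// Hypothetical sketch; the real helper lives in scorer-test-utils.h.
ScoringSpecProto CreateScoringSpecForRankingStrategy(
    ScoringSpecProto::RankingStrategy::Code rank_by, ScorerTestingMode mode) {
  ScoringSpecProto spec;
  spec.set_rank_by(rank_by);
  // In ScorerTestingMode::kAdvanced the helper presumably expresses the same
  // strategy through the advanced scoring path instead of the plain rank_by
  // field; that mapping is not shown in this patch.
  return spec;
}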
diff --git a/icing/scoring/scoring-processor.cc b/icing/scoring/scoring-processor.cc
index 0933094..b827bd8 100644
--- a/icing/scoring/scoring-processor.cc
+++ b/icing/scoring/scoring-processor.cc
@@ -14,7 +14,10 @@
#include "icing/scoring/scoring-processor.h"
+#include <limits>
#include <memory>
+#include <string>
+#include <unordered_map>
#include <utility>
#include <vector>
@@ -22,8 +25,10 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/index/hit/doc-hit-info.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/proto/scoring.pb.h"
#include "icing/scoring/ranker.h"
#include "icing/scoring/scored-document-hit.h"
+#include "icing/scoring/scorer-factory.h"
#include "icing/scoring/scorer.h"
#include "icing/store/document-store.h"
#include "icing/util/status-macros.h"
@@ -39,28 +44,36 @@ constexpr double kDefaultScoreInAscendingOrder =
libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>>
ScoringProcessor::Create(const ScoringSpecProto& scoring_spec,
- const DocumentStore* document_store) {
+ const DocumentStore* document_store,
+ const SchemaStore* schema_store,
+ int64_t current_time_ms,
+ const JoinChildrenFetcher* join_children_fetcher) {
ICING_RETURN_ERROR_IF_NULL(document_store);
+ ICING_RETURN_ERROR_IF_NULL(schema_store);
bool is_descending_order =
scoring_spec.order_by() == ScoringSpecProto::Order::DESC;
ICING_ASSIGN_OR_RETURN(
std::unique_ptr<Scorer> scorer,
- Scorer::Create(scoring_spec.rank_by(),
- is_descending_order ? kDefaultScoreInDescendingOrder
- : kDefaultScoreInAscendingOrder,
- document_store));
-
+ scorer_factory::Create(scoring_spec,
+ is_descending_order
+ ? kDefaultScoreInDescendingOrder
+ : kDefaultScoreInAscendingOrder,
+ document_store, schema_store, current_time_ms,
+ join_children_fetcher));
// Using `new` to access a non-public constructor.
return std::unique_ptr<ScoringProcessor>(
new ScoringProcessor(std::move(scorer)));
}
std::vector<ScoredDocumentHit> ScoringProcessor::Score(
- std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator,
- int num_to_score) {
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator, int num_to_score,
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>*
+ query_term_iterators,
+ QueryStatsProto::SearchStats* search_stats) {
std::vector<ScoredDocumentHit> scored_document_hits;
+ scorer_->PrepareToScore(query_term_iterators);
while (doc_hit_info_iterator->Advance().ok() && num_to_score-- > 0) {
const DocHitInfo& doc_hit_info = doc_hit_info_iterator->doc_hit_info();
@@ -69,11 +82,24 @@ std::vector<ScoredDocumentHit> ScoringProcessor::Score(
// The final score of the doc_hit_info = score of doc * demotion factor of
// hit.
double score =
- scorer_->GetScore(doc_hit_info.document_id()) * hit_demotion_factor;
+ scorer_->GetScore(doc_hit_info, doc_hit_info_iterator.get()) *
+ hit_demotion_factor;
scored_document_hits.emplace_back(
doc_hit_info.document_id(), doc_hit_info.hit_section_ids_mask(), score);
}
+ if (search_stats != nullptr) {
+ search_stats->set_num_documents_scored(scored_document_hits.size());
+ DocHitInfoIterator::CallStats iterator_call_stats =
+ doc_hit_info_iterator->GetCallStats();
+ search_stats->set_num_fetched_hits_lite_index(
+ iterator_call_stats.num_leaf_advance_calls_lite_index);
+ search_stats->set_num_fetched_hits_main_index(
+ iterator_call_stats.num_leaf_advance_calls_main_index);
+ search_stats->set_num_fetched_hits_integer_index(
+ iterator_call_stats.num_leaf_advance_calls_integer_index);
+ }
+
return scored_document_hits;
}
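A short sketch of a caller reading the new stats out-parameter, assuming scoring_processor, doc_hit_info_iterator, and query_term_iterators already exist (the names are illustrative, not from this patch):

QueryStatsProto::SearchStats search_stats;
std::vector<ScoredDocumentHit> hits = scoring_processor->Score(
    std::move(doc_hit_info_iterator), /*num_to_score=*/10,
    &query_term_iterators, &search_stats);
// After Score() returns, the stats carry how many documents were scored and
// how many hits each index fetched while the iterator advanced.
int32_t lite_hits = search_stats.num_fetched_hits_lite_index();
int32_t main_hits = search_stats.num_fetched_hits_main_index();
int32_t integer_hits = search_stats.num_fetched_hits_integer_index();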
diff --git a/icing/scoring/scoring-processor.h b/icing/scoring/scoring-processor.h
index 60c3b32..8634a22 100644
--- a/icing/scoring/scoring-processor.h
+++ b/icing/scoring/scoring-processor.h
@@ -15,13 +15,19 @@
#ifndef ICING_SCORING_SCORING_PROCESSOR_H_
#define ICING_SCORING_SCORING_PROCESSOR_H_
+#include <cstdint>
#include <memory>
+#include <string>
+#include <unordered_map>
#include <utility>
#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/index/iterator/doc-hit-info-iterator.h"
+#include "icing/join/join-children-fetcher.h"
+#include "icing/proto/logging.pb.h"
#include "icing/proto/scoring.pb.h"
+#include "icing/schema/schema-store.h"
#include "icing/scoring/scored-document-hit.h"
#include "icing/scoring/scorer.h"
#include "icing/store/document-store.h"
@@ -40,19 +46,24 @@ class ScoringProcessor {
// A ScoringProcessor on success
// FAILED_PRECONDITION on any null pointer input
static libtextclassifier3::StatusOr<std::unique_ptr<ScoringProcessor>> Create(
- const ScoringSpecProto& scoring_spec,
- const DocumentStore* document_store);
+ const ScoringSpecProto& scoring_spec, const DocumentStore* document_store,
+ const SchemaStore* schema_store, int64_t current_time_ms,
+ const JoinChildrenFetcher* join_children_fetcher = nullptr);
// Assigns scores to DocHitInfos from the given DocHitInfoIterator and returns
// a vector of ScoredDocumentHits. The size of results is no more than
// num_to_score. The order of results is the same as DocHitInfos from
// DocHitInfoIterator.
//
- // NOTE: if the scoring spec doesn't require a scoring strategy, all
+ // If necessary, query_term_iterators is used to compute the BM25F relevance
+ // score. NOTE: if the scoring spec doesn't require a scoring strategy, all
// ScoredDocumentHits will be assigned a default score 0.
std::vector<ScoredDocumentHit> Score(
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator,
- int num_to_score);
+ int num_to_score,
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>*
+ query_term_iterators = nullptr,
+ QueryStatsProto::SearchStats* search_stats = nullptr);
private:
explicit ScoringProcessor(std::unique_ptr<Scorer> scorer)
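Putting the two new signatures together, a minimal usage sketch, assuming scoring_spec, document_store, schema_store, a clock, and an iterator already exist; join_children_fetcher is left at its nullptr default, as for any non-join query:

ICING_ASSIGN_OR_RETURN(
    std::unique_ptr<ScoringProcessor> scoring_processor,
    ScoringProcessor::Create(scoring_spec, document_store, schema_store,
                             clock.GetSystemTimeMilliseconds()));
std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
    query_term_iterators;  // Only needed for RELEVANCE_SCORE (BM25F).
std::vector<ScoredDocumentHit> hits = scoring_processor->Score(
    std::move(doc_hit_info_iterator), /*num_to_score=*/10,
    &query_term_iterators);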
diff --git a/icing/scoring/scoring-processor_test.cc b/icing/scoring/scoring-processor_test.cc
index b93bf1a..deddff8 100644
--- a/icing/scoring/scoring-processor_test.cc
+++ b/icing/scoring/scoring-processor_test.cc
@@ -24,6 +24,10 @@
#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
#include "icing/proto/scoring.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/scoring/scorer-test-utils.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
#include "icing/testing/tmp-directory.h"
@@ -33,11 +37,13 @@ namespace lib {
namespace {
using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::Gt;
using ::testing::IsEmpty;
using ::testing::SizeIs;
-using ::testing::Test;
-class ScoringProcessorTest : public Test {
+class ScoringProcessorTest
+ : public ::testing::TestWithParam<ScorerTestingMode> {
protected:
ScoringProcessorTest()
: test_dir_(GetTestTempDir() + "/icing"),
@@ -50,24 +56,47 @@ class ScoringProcessorTest : public Test {
filesystem_.CreateDirectoryRecursively(doc_store_dir_.c_str());
filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
- ICING_ASSERT_OK_AND_ASSIGN(schema_store_,
- SchemaStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
ICING_ASSERT_OK_AND_ASSIGN(
- document_store_,
- DocumentStore::Create(&filesystem_, doc_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, doc_store_dir_, &fake_clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ document_store_ = std::move(create_result.document_store);
// Creates a simple email schema
- SchemaProto test_email_schema;
- auto type_config = test_email_schema.add_types();
- type_config->set_schema_type("email");
- auto subject = type_config->add_properties();
- subject->set_property_name("subject");
- subject->set_data_type(PropertyConfigProto::DataType::STRING);
- subject->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
-
- ICING_ASSERT_OK(schema_store_->SetSchema(test_email_schema));
+ SchemaProto test_email_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(
+ TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(
+ TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ test_email_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
}
void TearDown() override {
@@ -78,6 +107,10 @@ class ScoringProcessorTest : public Test {
DocumentStore* document_store() { return document_store_.get(); }
+ SchemaStore* schema_store() { return schema_store_.get(); }
+
+ const FakeClock& fake_clock() const { return fake_clock_; }
+
private:
const std::string test_dir_;
const std::string doc_store_dir_;
@@ -120,38 +153,84 @@ CreateAndInsertsDocumentsWithScores(DocumentStore* document_store,
return std::pair(doc_hit_infos, scored_document_hits);
}
-TEST_F(ScoringProcessorTest, CreationWithNullPointerShouldFail) {
+UsageReport CreateUsageReport(std::string name_space, std::string uri,
+ int64_t timestamp_ms,
+ UsageReport::UsageType usage_type) {
+ UsageReport usage_report;
+ usage_report.set_document_namespace(name_space);
+ usage_report.set_document_uri(uri);
+ usage_report.set_usage_timestamp_ms(timestamp_ms);
+ usage_report.set_usage_type(usage_type);
+ return usage_report;
+}
+
+TypePropertyWeights CreateTypePropertyWeights(
+ std::string schema_type, std::vector<PropertyWeight> property_weights) {
+ TypePropertyWeights type_property_weights;
+ type_property_weights.set_schema_type(std::move(schema_type));
+ type_property_weights.mutable_property_weights()->Reserve(
+ property_weights.size());
+
+ for (PropertyWeight& property_weight : property_weights) {
+ *type_property_weights.add_property_weights() = std::move(property_weight);
+ }
+
+ return type_property_weights;
+}
+
+PropertyWeight CreatePropertyWeight(std::string path, double weight) {
+ PropertyWeight property_weight;
+ property_weight.set_path(std::move(path));
+ property_weight.set_weight(weight);
+ return property_weight;
+}
+
+TEST_F(ScoringProcessorTest, CreationWithNullDocumentStoreShouldFail) {
ScoringSpecProto spec_proto;
- EXPECT_THAT(ScoringProcessor::Create(spec_proto, /*document_store=*/nullptr),
+ EXPECT_THAT(ScoringProcessor::Create(
+ spec_proto, /*document_store=*/nullptr, schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
-TEST_F(ScoringProcessorTest, ShouldCreateInstance) {
+TEST_F(ScoringProcessorTest, CreationWithNullSchemaStoreShouldFail) {
ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
- ICING_EXPECT_OK(ScoringProcessor::Create(spec_proto, document_store()));
+ EXPECT_THAT(
+ ScoringProcessor::Create(spec_proto, document_store(),
+ /*schema_store=*/nullptr,
+ fake_clock().GetSystemTimeMilliseconds()),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
-TEST_F(ScoringProcessorTest, ShouldHandleEmptyDocHitIterator) {
+TEST_P(ScoringProcessorTest, ShouldCreateInstance) {
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, GetParam());
+ ICING_EXPECT_OK(
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
+}
+
+TEST_P(ScoringProcessorTest, ShouldHandleEmptyDocHitIterator) {
// Creates an empty DocHitInfoIterator
std::vector<DocHitInfo> doc_hit_infos = {};
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, GetParam());
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store()));
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/5),
IsEmpty());
}
-TEST_F(ScoringProcessorTest, ShouldHandleNonPositiveNumToScore) {
+TEST_P(ScoringProcessorTest, ShouldHandleNonPositiveNumToScore) {
// Sets up documents
ICING_ASSERT_OK_AND_ASSIGN(
DocumentId document_id1,
@@ -164,13 +243,14 @@ TEST_F(ScoringProcessorTest, ShouldHandleNonPositiveNumToScore) {
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, GetParam());
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store()));
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/-1),
@@ -183,7 +263,7 @@ TEST_F(ScoringProcessorTest, ShouldHandleNonPositiveNumToScore) {
IsEmpty());
}
-TEST_F(ScoringProcessorTest, ShouldRespectNumToScore) {
+TEST_P(ScoringProcessorTest, ShouldRespectNumToScore) {
// Sets up documents
ICING_ASSERT_OK_AND_ASSIGN(
auto doc_hit_result_pair,
@@ -194,13 +274,14 @@ TEST_F(ScoringProcessorTest, ShouldRespectNumToScore) {
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, GetParam());
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store()));
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/2),
@@ -213,7 +294,7 @@ TEST_F(ScoringProcessorTest, ShouldRespectNumToScore) {
SizeIs(3));
}
-TEST_F(ScoringProcessorTest, ShouldScoreByDocumentScore) {
+TEST_P(ScoringProcessorTest, ShouldScoreByDocumentScore) {
// Creates input doc_hit_infos and expected output scored_document_hits
ICING_ASSERT_OK_AND_ASSIGN(
auto doc_hit_result_pair,
@@ -226,13 +307,14 @@ TEST_F(ScoringProcessorTest, ShouldScoreByDocumentScore) {
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, GetParam());
// Creates a ScoringProcessor
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store()));
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/3),
@@ -241,7 +323,574 @@ TEST_F(ScoringProcessorTest, ShouldScoreByDocumentScore) {
EqualsScoredDocumentHit(scored_document_hits.at(2))));
}
-TEST_F(ScoringProcessorTest, ShouldScoreByCreationTimestamp) {
+TEST_P(ScoringProcessorTest,
+ ShouldScoreByRelevanceScore_DocumentsWithDifferentLength) {
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document2 =
+ CreateDocument("icing", "email/2", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document3 =
+ CreateDocument("icing", "email/3", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store()->Put(document1, /*num_tokens=*/10));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store()->Put(document2, /*num_tokens=*/100));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id3,
+ document_store()->Put(document3, /*num_tokens=*/50));
+
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(document_id1);
+ doc_hit_info1.UpdateSection(/*section_id*/ 0, /*hit_term_frequency=*/1);
+ DocHitInfoTermFrequencyPair doc_hit_info2 = DocHitInfo(document_id2);
+ doc_hit_info2.UpdateSection(/*section_id*/ 0, /*hit_term_frequency=*/1);
+ DocHitInfoTermFrequencyPair doc_hit_info3 = DocHitInfo(document_id3);
+ doc_hit_info3.UpdateSection(/*section_id*/ 0, /*hit_term_frequency=*/1);
+
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = UINT64_C(1) << section_id;
+
+ // Creates input doc_hit_infos and expected output scored_document_hits
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos = {
+ doc_hit_info1, doc_hit_info2, doc_hit_info3};
+
+ // Creates a dummy DocHitInfoIterator with 3 results for the query "foo"
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, GetParam());
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
+
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
+ query_term_iterators;
+ query_term_iterators["foo"] =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+ // Since the three documents all contain the query term "foo" exactly once,
+ // the document's length determines the final score. Documents shorter than
+ // the average corpus length are slightly boosted.
+ ScoredDocumentHit expected_scored_doc_hit1(document_id1, section_id_mask,
+ /*score=*/0.187114);
+ ScoredDocumentHit expected_scored_doc_hit2(document_id2, section_id_mask,
+ /*score=*/0.084904);
+ ScoredDocumentHit expected_scored_doc_hit3(document_id3, section_id_mask,
+ /*score=*/0.121896);
+ EXPECT_THAT(
+ scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/3, &query_term_iterators),
+ ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit1),
+ EqualsScoredDocumentHit(expected_scored_doc_hit2),
+ EqualsScoredDocumentHit(expected_scored_doc_hit3)));
+}
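The expected scores above follow the BM25 family of ranking functions. For reference, the standard per-term BM25 contribution is

\[
  \mathrm{score}(D, t) = \mathrm{IDF}(t)\cdot
  \frac{f(t, D)\,(k_1 + 1)}
       {f(t, D) + k_1\left(1 - b + b\,\frac{|D|}{\mathrm{avgdl}}\right)}
\]

where f(t, D) is the frequency of term t in document D, |D| is the document length in tokens, and avgdl is the average document length in the corpus. The (1 - b + b |D| / avgdl) factor in the denominator is what gives shorter-than-average documents the slight boost described above. The exact parameterization icing uses, including BM25F's per-section weighting, lives in bm25f-calculator and is not part of this patch.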
+
+TEST_P(ScoringProcessorTest,
+ ShouldScoreByRelevanceScore_DocumentsWithSameLength) {
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document2 =
+ CreateDocument("icing", "email/2", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document3 =
+ CreateDocument("icing", "email/3", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store()->Put(document1, /*num_tokens=*/10));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store()->Put(document2, /*num_tokens=*/10));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id3,
+ document_store()->Put(document3, /*num_tokens=*/10));
+
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(document_id1);
+ doc_hit_info1.UpdateSection(/*section_id*/ 0, /*hit_term_frequency=*/1);
+ DocHitInfoTermFrequencyPair doc_hit_info2 = DocHitInfo(document_id2);
+ doc_hit_info2.UpdateSection(/*section_id*/ 0, /*hit_term_frequency=*/1);
+ DocHitInfoTermFrequencyPair doc_hit_info3 = DocHitInfo(document_id3);
+ doc_hit_info3.UpdateSection(/*section_id*/ 0, /*hit_term_frequency=*/1);
+
+ SectionId section_id = 0;
+ SectionIdMask section_id_mask = UINT64_C(1) << section_id;
+
+ // Creates input doc_hit_infos and expected output scored_document_hits
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos = {
+ doc_hit_info1, doc_hit_info2, doc_hit_info3};
+
+ // Creates a dummy DocHitInfoIterator with 3 results for the query "foo"
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, GetParam());
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
+
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
+ query_term_iterators;
+ query_term_iterators["foo"] =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+ // Since the three documents all contain the query term "foo" exactly once
+ // and they have the same length, they will have the same BM25F score.
+ ScoredDocumentHit expected_scored_doc_hit1(document_id1, section_id_mask,
+ /*score=*/0.118455);
+ ScoredDocumentHit expected_scored_doc_hit2(document_id2, section_id_mask,
+ /*score=*/0.118455);
+ ScoredDocumentHit expected_scored_doc_hit3(document_id3, section_id_mask,
+ /*score=*/0.118455);
+ EXPECT_THAT(
+ scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/3, &query_term_iterators),
+ ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit1),
+ EqualsScoredDocumentHit(expected_scored_doc_hit2),
+ EqualsScoredDocumentHit(expected_scored_doc_hit3)));
+}
+
+TEST_P(ScoringProcessorTest,
+ ShouldScoreByRelevanceScore_DocumentsWithDifferentQueryFrequency) {
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document2 =
+ CreateDocument("icing", "email/2", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document3 =
+ CreateDocument("icing", "email/3", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store()->Put(document1, /*num_tokens=*/10));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store()->Put(document2, /*num_tokens=*/10));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id3,
+ document_store()->Put(document3, /*num_tokens=*/10));
+
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(document_id1);
+ // Document 1 contains the query term "foo" 5 times
+ doc_hit_info1.UpdateSection(/*section_id*/ 0, /*hit_term_frequency=*/5);
+ DocHitInfoTermFrequencyPair doc_hit_info2 = DocHitInfo(document_id2);
+ // Document 2 contains the query term "foo" 1 time
+ doc_hit_info2.UpdateSection(/*section_id*/ 0, /*hit_term_frequency=*/1);
+ DocHitInfoTermFrequencyPair doc_hit_info3 = DocHitInfo(document_id3);
+ // Document 3 contains the query term "foo" 3 times
+ doc_hit_info3.UpdateSection(/*section_id*/ 0, /*hit_term_frequency=*/1);
+ doc_hit_info3.UpdateSection(/*section_id*/ 1, /*hit_term_frequency=*/2);
+
+ SectionIdMask section_id_mask1 = 0b00000001;
+ SectionIdMask section_id_mask2 = 0b00000001;
+ SectionIdMask section_id_mask3 = 0b00000011;
+
+ // Creates input doc_hit_infos and expected output scored_document_hits
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos = {
+ doc_hit_info1, doc_hit_info2, doc_hit_info3};
+
+ // Creates a dummy DocHitInfoIterator with 3 results for the query "foo"
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, GetParam());
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
+
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
+ query_term_iterators;
+ query_term_iterators["foo"] =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+ // Since the three documents all have the same length, the score is decided by
+ // the frequency of the query term "foo".
+ ScoredDocumentHit expected_scored_doc_hit1(document_id1, section_id_mask1,
+ /*score=*/0.226674);
+ ScoredDocumentHit expected_scored_doc_hit2(document_id2, section_id_mask2,
+ /*score=*/0.118455);
+ ScoredDocumentHit expected_scored_doc_hit3(document_id3, section_id_mask3,
+ /*score=*/0.196720);
+ EXPECT_THAT(
+ scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/3, &query_term_iterators),
+ ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit1),
+ EqualsScoredDocumentHit(expected_scored_doc_hit2),
+ EqualsScoredDocumentHit(expected_scored_doc_hit3)));
+}
+
+TEST_P(ScoringProcessorTest,
+ ShouldScoreByRelevanceScore_HitTermWithZeroFrequency) {
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store()->Put(document1, /*num_tokens=*/10));
+
+ // Document 1 contains the term "foo" 0 times in the "body" property
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(document_id1);
+ doc_hit_info1.UpdateSection(/*section_id*/ 0, /*hit_term_frequency=*/0);
+
+ // Creates input doc_hit_infos and expected output scored_document_hits
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos = {doc_hit_info1};
+
+ // Creates a dummy DocHitInfoIterator with 1 result for the query "foo"
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, GetParam());
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
+
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
+ query_term_iterators;
+ query_term_iterators["foo"] =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ SectionIdMask section_id_mask1 = 0b00000001;
+
+ // Since the document hit has zero frequency, expect a score of zero.
+ ScoredDocumentHit expected_scored_doc_hit1(document_id1, section_id_mask1,
+ /*score=*/0.000000);
+ EXPECT_THAT(
+ scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/1, &query_term_iterators),
+ ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit1)));
+}
+
+TEST_P(ScoringProcessorTest,
+ ShouldScoreByRelevanceScore_SameHitFrequencyDifferentPropertyWeights) {
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document2 =
+ CreateDocument("icing", "email/2", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store()->Put(document1, /*num_tokens=*/1));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store()->Put(document2, /*num_tokens=*/1));
+
+ // Document 1 contains the term "foo" 1 time in the "body" property
+ SectionId body_section_id = 0;
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(document_id1);
+ doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1);
+
+ // Document 2 contains the term "foo" 1 time in the "subject" property
+ SectionId subject_section_id = 1;
+ DocHitInfoTermFrequencyPair doc_hit_info2 = DocHitInfo(document_id2);
+ doc_hit_info2.UpdateSection(subject_section_id, /*hit_term_frequency=*/1);
+
+ // Creates input doc_hit_infos and expected output scored_document_hits
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos = {doc_hit_info1,
+ doc_hit_info2};
+
+ // Creates a dummy DocHitInfoIterator with 2 results for the query "foo"
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, GetParam());
+
+ PropertyWeight body_property_weight =
+ CreatePropertyWeight(/*path=*/"body", /*weight=*/0.5);
+ PropertyWeight subject_property_weight =
+ CreatePropertyWeight(/*path=*/"subject", /*weight=*/2.0);
+ *spec_proto.add_type_property_weights() = CreateTypePropertyWeights(
+ /*schema_type=*/"email", {body_property_weight, subject_property_weight});
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
+
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
+ query_term_iterators;
+ query_term_iterators["foo"] =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ SectionIdMask body_section_id_mask = 1U << body_section_id;
+ SectionIdMask subject_section_id_mask = 1U << subject_section_id;
+
+  // We expect document 2 to have a higher score than document 1 as it matches
+  // "foo" in the "subject" property, which is weighted higher than the "body"
+  // property. Final scores are computed with smoothing applied.
+ ScoredDocumentHit expected_scored_doc_hit1(document_id1, body_section_id_mask,
+ /*score=*/0.053624);
+ ScoredDocumentHit expected_scored_doc_hit2(document_id2,
+ subject_section_id_mask,
+ /*score=*/0.153094);
+ EXPECT_THAT(
+ scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/2, &query_term_iterators),
+ ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit1),
+ EqualsScoredDocumentHit(expected_scored_doc_hit2)));
+}
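+// Rough sketch of how the two expected scores above come about (illustrative
+// only; the exact BM25F math lives in the relevance scorer): each hit's term
+// frequency is scaled by its normalized property weight, with the max weight
+// (2.0) normalizing to 1.0:
+//
+//   document 1: tf 1 * (0.5 / 2.0) = 0.25  // "body" hit
+//   document 2: tf 1 * (2.0 / 2.0) = 1.00  // "subject" hit, ranks higher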
+
+TEST_P(ScoringProcessorTest,
+ ShouldScoreByRelevanceScore_WithImplicitPropertyWeight) {
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document2 =
+ CreateDocument("icing", "email/2", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store()->Put(document1, /*num_tokens=*/1));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store()->Put(document2, /*num_tokens=*/1));
+
+ // Document 1 contains the term "foo" 1 time in the "body" property
+ SectionId body_section_id = 0;
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(document_id1);
+ doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1);
+
+ // Document 2 contains the term "foo" 1 time in the "subject" property
+ SectionId subject_section_id = 1;
+ DocHitInfoTermFrequencyPair doc_hit_info2 = DocHitInfo(document_id2);
+ doc_hit_info2.UpdateSection(subject_section_id, /*hit_term_frequency=*/1);
+
+ // Creates input doc_hit_infos and expected output scored_document_hits
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos = {doc_hit_info1,
+ doc_hit_info2};
+
+ // Creates a dummy DocHitInfoIterator with 2 results for the query "foo"
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, GetParam());
+
+ PropertyWeight body_property_weight =
+ CreatePropertyWeight(/*path=*/"body", /*weight=*/0.5);
+ *spec_proto.add_type_property_weights() = CreateTypePropertyWeights(
+ /*schema_type=*/"email", {body_property_weight});
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
+
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
+ query_term_iterators;
+ query_term_iterators["foo"] =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ SectionIdMask body_section_id_mask = 1U << body_section_id;
+ SectionIdMask subject_section_id_mask = 1U << subject_section_id;
+
+  // We expect document 2 to have a higher score than document 1 as it matches
+  // "foo" in the "subject" property, which is weighted higher than the "body"
+  // property. This is because the "subject" property is implicitly given a
+  // weight of 1.0, the default weight value. Final scores are computed with
+  // smoothing applied.
+ ScoredDocumentHit expected_scored_doc_hit1(document_id1, body_section_id_mask,
+ /*score=*/0.094601);
+ ScoredDocumentHit expected_scored_doc_hit2(document_id2,
+ subject_section_id_mask,
+ /*score=*/0.153094);
+ EXPECT_THAT(
+ scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/2, &query_term_iterators),
+ ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit1),
+ EqualsScoredDocumentHit(expected_scored_doc_hit2)));
+}
+
+TEST_P(ScoringProcessorTest,
+ ShouldScoreByRelevanceScore_WithDefaultPropertyWeight) {
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store()->Put(document1, /*num_tokens=*/1));
+
+ // Document 1 contains the term "foo" 1 time in the "body" property
+ SectionId body_section_id = 0;
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(document_id1);
+ doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1);
+
+ // Creates input doc_hit_infos and expected output scored_document_hits
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos = {doc_hit_info1};
+
+ // Creates a dummy DocHitInfoIterator with 1 result for the query "foo"
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, GetParam());
+
+ *spec_proto.add_type_property_weights() =
+ CreateTypePropertyWeights(/*schema_type=*/"email", {});
+
+ // Creates a ScoringProcessor with no explicit weights set.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
+
+ ScoringSpecProto spec_proto_with_weights =
+ CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, GetParam());
+
+ PropertyWeight body_property_weight = CreatePropertyWeight(/*path=*/"body",
+ /*weight=*/1.0);
+ *spec_proto_with_weights.add_type_property_weights() =
+ CreateTypePropertyWeights(/*schema_type=*/"email",
+ {body_property_weight});
+
+ // Creates a ScoringProcessor with default weight set for "body" property.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor_with_weights,
+ ScoringProcessor::Create(spec_proto_with_weights, document_store(),
+ schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
+
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
+ query_term_iterators;
+ query_term_iterators["foo"] =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+  // Creates a separate set of query term iterators for the weighted scorer
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
+ query_term_iterators_scoring_with_weights;
+ query_term_iterators_scoring_with_weights["foo"] =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ SectionIdMask body_section_id_mask = 1U << body_section_id;
+
+  // We expect document 1 to have the same score whether a weight is explicitly
+  // set to 1.0 or implicitly scored with the default weight. Final scores are
+  // computed with smoothing applied.
+ ScoredDocumentHit expected_scored_doc_hit(document_id1, body_section_id_mask,
+ /*score=*/0.208191);
+ EXPECT_THAT(
+ scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/1, &query_term_iterators),
+ ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit)));
+
+  // Recreate the doc hit iterator and query term iterators, since the previous
+  // Score() call consumed them.
+ doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+ query_term_iterators["foo"] =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ EXPECT_THAT(scoring_processor_with_weights->Score(
+ std::move(doc_hit_info_iterator),
+ /*num_to_score=*/1, &query_term_iterators),
+ ElementsAre(EqualsScoredDocumentHit(expected_scored_doc_hit)));
+}
+
+TEST_P(ScoringProcessorTest,
+ ShouldScoreByRelevanceScore_WithZeroPropertyWeight) {
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document2 =
+ CreateDocument("icing", "email/2", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store()->Put(document1, /*num_tokens=*/1));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store()->Put(document2, /*num_tokens=*/1));
+
+ // Document 1 contains the term "foo" 1 time in the "body" property
+ SectionId body_section_id = 0;
+ DocHitInfoTermFrequencyPair doc_hit_info1 = DocHitInfo(document_id1);
+ doc_hit_info1.UpdateSection(body_section_id, /*hit_term_frequency=*/1);
+
+ // Document 2 contains the term "foo" 1 time in the "subject" property
+ SectionId subject_section_id = 1;
+ DocHitInfoTermFrequencyPair doc_hit_info2 = DocHitInfo(document_id2);
+ doc_hit_info2.UpdateSection(subject_section_id, /*hit_term_frequency=*/1);
+
+ // Creates input doc_hit_infos and expected output scored_document_hits
+ std::vector<DocHitInfoTermFrequencyPair> doc_hit_infos = {doc_hit_info1,
+ doc_hit_info2};
+
+ // Creates a dummy DocHitInfoIterator with 2 results for the query "foo"
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE, GetParam());
+
+ // Sets property weight for "body" to 0.0.
+ PropertyWeight body_property_weight =
+ CreatePropertyWeight(/*path=*/"body", /*weight=*/0.0);
+ // Sets property weight for "subject" to 1.0.
+ PropertyWeight subject_property_weight =
+ CreatePropertyWeight(/*path=*/"subject", /*weight=*/1.0);
+ *spec_proto.add_type_property_weights() = CreateTypePropertyWeights(
+ /*schema_type=*/"email", {body_property_weight, subject_property_weight});
+
+ // Creates a ScoringProcessor
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
+
+ std::unordered_map<std::string, std::unique_ptr<DocHitInfoIterator>>
+ query_term_iterators;
+ query_term_iterators["foo"] =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos, "foo");
+
+ std::vector<ScoredDocumentHit> scored_document_hits =
+ scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/2, &query_term_iterators);
+
+  // We expect document 1 to have a score of 0.0, as the query term "foo"
+  // matches in the "body" property, which has a weight of 0.0. This is a
+  // result of the weighted term frequency being scaled down to 0.0 for the
+  // hit. We expect document 2 to have a positive score, as the query term
+  // "foo" matches in the "subject" property, which has a weight of 1.0.
+ EXPECT_THAT(scored_document_hits, SizeIs(2));
+ EXPECT_THAT(scored_document_hits.at(0).document_id(), Eq(document_id1));
+ EXPECT_THAT(scored_document_hits.at(0).score(), Eq(0.0));
+ EXPECT_THAT(scored_document_hits.at(1).document_id(), Eq(document_id2));
+ EXPECT_THAT(scored_document_hits.at(1).score(), Gt(0.0));
+}
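+// The zero-weight case follows the same scaling sketch as above: a normalized
+// weight of 0.0 multiplies the hit's term frequency down to 0.0, which yields
+// a relevance score of exactly 0.0 for document 1.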
+
+TEST_P(ScoringProcessorTest, ShouldScoreByCreationTimestamp) {
DocumentProto document1 =
CreateDocument("icing", "email/1", kDefaultScore,
/*creation_timestamp_ms=*/1571100001111);
@@ -274,13 +923,14 @@ TEST_F(ScoringProcessorTest, ShouldScoreByCreationTimestamp) {
std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP);
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::CREATION_TIMESTAMP, GetParam());
// Creates a ScoringProcessor which ranks in descending order
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store()));
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/3),
@@ -289,7 +939,129 @@ TEST_F(ScoringProcessorTest, ShouldScoreByCreationTimestamp) {
EqualsScoredDocumentHit(scored_document_hit1)));
}
-TEST_F(ScoringProcessorTest, ShouldHandleNoScores) {
+TEST_P(ScoringProcessorTest, ShouldScoreByUsageCount) {
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document2 =
+ CreateDocument("icing", "email/2", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document3 =
+ CreateDocument("icing", "email/3", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store()->Put(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store()->Put(document2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store()->Put(document3));
+
+ // Report usage for doc1 once and doc2 twice.
+ UsageReport usage_report_doc1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ UsageReport usage_report_doc2 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/2", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_doc1));
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_doc2));
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_doc2));
+
+ DocHitInfo doc_hit_info1(document_id1);
+ DocHitInfo doc_hit_info2(document_id2);
+ DocHitInfo doc_hit_info3(document_id3);
+ ScoredDocumentHit scored_document_hit1(document_id1, kSectionIdMaskNone,
+ /*score=*/1);
+ ScoredDocumentHit scored_document_hit2(document_id2, kSectionIdMaskNone,
+ /*score=*/2);
+ ScoredDocumentHit scored_document_hit3(document_id3, kSectionIdMaskNone,
+ /*score=*/0);
+
+ // Creates a dummy DocHitInfoIterator with 3 results
+ std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1, doc_hit_info2,
+ doc_hit_info3};
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_COUNT, GetParam());
+
+ // Creates a ScoringProcessor which ranks in descending order
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
+
+ EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/3),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit1),
+ EqualsScoredDocumentHit(scored_document_hit2),
+ EqualsScoredDocumentHit(scored_document_hit3)));
+}
+
+TEST_P(ScoringProcessorTest, ShouldScoreByUsageTimestamp) {
+ DocumentProto document1 =
+ CreateDocument("icing", "email/1", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document2 =
+ CreateDocument("icing", "email/2", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+ DocumentProto document3 =
+ CreateDocument("icing", "email/3", kDefaultScore,
+ /*creation_timestamp_ms=*/kDefaultCreationTimestampMs);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
+ document_store()->Put(document1));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
+ document_store()->Put(document2));
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id3,
+ document_store()->Put(document3));
+
+ // Report usage for doc1 and doc2.
+ UsageReport usage_report_doc1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE1);
+ UsageReport usage_report_doc2 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/2", /*timestamp_ms=*/5000,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_doc1));
+ ICING_ASSERT_OK(document_store()->ReportUsage(usage_report_doc2));
+
+ DocHitInfo doc_hit_info1(document_id1);
+ DocHitInfo doc_hit_info2(document_id2);
+ DocHitInfo doc_hit_info3(document_id3);
+ ScoredDocumentHit scored_document_hit1(document_id1, kSectionIdMaskNone,
+ /*score=*/1000);
+ ScoredDocumentHit scored_document_hit2(document_id2, kSectionIdMaskNone,
+ /*score=*/5000);
+ ScoredDocumentHit scored_document_hit3(document_id3, kSectionIdMaskNone,
+ /*score=*/0);
+
+ // Creates a dummy DocHitInfoIterator with 3 results
+ std::vector<DocHitInfo> doc_hit_infos = {doc_hit_info1, doc_hit_info2,
+ doc_hit_info3};
+ std::unique_ptr<DocHitInfoIterator> doc_hit_info_iterator =
+ std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
+
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::USAGE_TYPE1_LAST_USED_TIMESTAMP,
+ GetParam());
+
+ // Creates a ScoringProcessor which ranks in descending order
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<ScoringProcessor> scoring_processor,
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
+
+ EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
+ /*num_to_score=*/3),
+ ElementsAre(EqualsScoredDocumentHit(scored_document_hit1),
+ EqualsScoredDocumentHit(scored_document_hit2),
+ EqualsScoredDocumentHit(scored_document_hit3)));
+}
+
+TEST_P(ScoringProcessorTest, ShouldHandleNoScores) {
// Creates input doc_hit_infos and corresponding scored_document_hits
ICING_ASSERT_OK_AND_ASSIGN(
auto doc_hit_result_pair,
@@ -310,13 +1082,14 @@ TEST_F(ScoringProcessorTest, ShouldHandleNoScores) {
ScoredDocumentHit scored_document_hit_default =
ScoredDocumentHit(4, kSectionIdMaskNone, /*score=*/0.0);
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE);
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::DOCUMENT_SCORE, GetParam());
// Creates a ScoringProcessor which ranks in descending order
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store()));
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/4),
ElementsAre(EqualsScoredDocumentHit(scored_document_hit_default),
@@ -325,7 +1098,7 @@ TEST_F(ScoringProcessorTest, ShouldHandleNoScores) {
EqualsScoredDocumentHit(scored_document_hits.at(2))));
}
-TEST_F(ScoringProcessorTest, ShouldWrapResultsWhenNoScoring) {
+TEST_P(ScoringProcessorTest, ShouldWrapResultsWhenNoScoring) {
DocumentProto document1 = CreateDocument("icing", "email/1", /*score=*/1,
kDefaultCreationTimestampMs);
DocumentProto document2 = CreateDocument("icing", "email/2", /*score=*/2,
@@ -359,13 +1132,14 @@ TEST_F(ScoringProcessorTest, ShouldWrapResultsWhenNoScoring) {
std::make_unique<DocHitInfoIteratorDummy>(doc_hit_infos);
// A ScoringSpecProto with no scoring strategy
- ScoringSpecProto spec_proto;
- spec_proto.set_rank_by(ScoringSpecProto::RankingStrategy::NONE);
+ ScoringSpecProto spec_proto = CreateScoringSpecForRankingStrategy(
+ ScoringSpecProto::RankingStrategy::NONE, GetParam());
// Creates a ScoringProcessor which ranks in descending order
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<ScoringProcessor> scoring_processor,
- ScoringProcessor::Create(spec_proto, document_store()));
+ ScoringProcessor::Create(spec_proto, document_store(), schema_store(),
+ fake_clock().GetSystemTimeMilliseconds()));
EXPECT_THAT(scoring_processor->Score(std::move(doc_hit_info_iterator),
/*num_to_score=*/3),
@@ -374,6 +1148,10 @@ TEST_F(ScoringProcessorTest, ShouldWrapResultsWhenNoScoring) {
EqualsScoredDocumentHit(scored_document_hit1)));
}
+INSTANTIATE_TEST_SUITE_P(ScoringProcessorTest, ScoringProcessorTest,
+ testing::Values(ScorerTestingMode::kNormal,
+ ScorerTestingMode::kAdvanced));
+
} // namespace
} // namespace lib
diff --git a/icing/scoring/section-weights.cc b/icing/scoring/section-weights.cc
new file mode 100644
index 0000000..ed7cd5e
--- /dev/null
+++ b/icing/scoring/section-weights.cc
@@ -0,0 +1,151 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/section-weights.h"
+
+#include <algorithm>
+#include <cfloat>
+#include <limits>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/schema/section.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Normalizes all weights in the map to be in range [0.0, 1.0], where the max
+// weight is normalized to 1.0. In the case that all weights are equal to 0.0,
+// the normalized weight for each will be 0.0.
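+//
+// For example (illustrative values): weights {0.1, 0.2, 0.4} with a max_weight
+// of 0.4 normalize to {0.25, 0.5, 1.0}, while an all-zero map is returned
+// unchanged.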
+inline void NormalizeSectionWeights(
+ double max_weight, std::unordered_map<SectionId, double>& section_weights) {
+ if (max_weight == 0.0) {
+ return;
+ }
+ for (auto& raw_weight : section_weights) {
+ raw_weight.second = raw_weight.second / max_weight;
+ }
+}
+} // namespace
+
+libtextclassifier3::StatusOr<std::unique_ptr<SectionWeights>>
+SectionWeights::Create(const SchemaStore* schema_store,
+ const ScoringSpecProto& scoring_spec) {
+ ICING_RETURN_ERROR_IF_NULL(schema_store);
+
+ std::unordered_map<SchemaTypeId, NormalizedSectionWeights>
+ schema_property_weight_map;
+ for (const TypePropertyWeights& type_property_weights :
+ scoring_spec.type_property_weights()) {
+ std::string_view schema_type = type_property_weights.schema_type();
+ auto schema_type_id_or = schema_store->GetSchemaTypeId(schema_type);
+ if (!schema_type_id_or.ok()) {
+ ICING_LOG(WARNING) << "No schema type id found for schema type: "
+ << schema_type;
+ continue;
+ }
+ SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
+ auto section_metadata_list_or =
+ schema_store->GetSectionMetadata(schema_type.data());
+ if (!section_metadata_list_or.ok()) {
+ ICING_LOG(WARNING) << "No metadata found for schema type: "
+ << schema_type;
+ continue;
+ }
+
+ const std::vector<SectionMetadata>* metadata_list =
+ section_metadata_list_or.ValueOrDie();
+
+ std::unordered_map<std::string, double> property_paths_weights;
+ for (const PropertyWeight& property_weight :
+ type_property_weights.property_weights()) {
+ double property_path_weight = property_weight.weight();
+
+ // Return error on negative weights.
+ if (property_path_weight < 0.0) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Property weight for property path \"%s\" is negative. Negative "
+ "weights are invalid.",
+ property_weight.path().c_str()));
+ }
+ property_paths_weights.insert(
+ {property_weight.path(), property_path_weight});
+ }
+ NormalizedSectionWeights normalized_section_weights =
+ ExtractNormalizedSectionWeights(property_paths_weights, *metadata_list);
+
+ schema_property_weight_map.insert(
+ {schema_type_id,
+       {/*section_weights=*/std::move(
+            normalized_section_weights.section_weights),
+        /*default_weight=*/normalized_section_weights.default_weight}});
+ }
+ // Using `new` to access a non-public constructor.
+ return std::unique_ptr<SectionWeights>(
+ new SectionWeights(std::move(schema_property_weight_map)));
+}
+
+double SectionWeights::GetNormalizedSectionWeight(SchemaTypeId schema_type_id,
+ SectionId section_id) const {
+ auto schema_type_map = schema_section_weight_map_.find(schema_type_id);
+ if (schema_type_map == schema_section_weight_map_.end()) {
+ // Return default weight if the schema type has no weights specified.
+ return kDefaultSectionWeight;
+ }
+
+ auto section_weight =
+ schema_type_map->second.section_weights.find(section_id);
+ if (section_weight == schema_type_map->second.section_weights.end()) {
+ // If there is no entry for SectionId, the weight is implicitly the
+ // normalized default weight.
+ return schema_type_map->second.default_weight;
+ }
+ return section_weight->second;
+}
+
+inline SectionWeights::NormalizedSectionWeights
+SectionWeights::ExtractNormalizedSectionWeights(
+ const std::unordered_map<std::string, double>& raw_weights,
+ const std::vector<SectionMetadata>& metadata_list) {
+ double max_weight = -std::numeric_limits<double>::infinity();
+ std::unordered_map<SectionId, double> section_weights;
+ for (const SectionMetadata& section_metadata : metadata_list) {
+ std::string_view metadata_path = section_metadata.path;
+ double section_weight = kDefaultSectionWeight;
+ auto iter = raw_weights.find(metadata_path.data());
+ if (iter != raw_weights.end()) {
+ section_weight = iter->second;
+ section_weights.insert({section_metadata.id, section_weight});
+ }
+ // Replace max if we see new max weight.
+ max_weight = std::max(max_weight, section_weight);
+ }
+
+ NormalizeSectionWeights(max_weight, section_weights);
+  // Set the normalized default weight to 1.0 in the degenerate case that
+  // there is no section metadata, i.e. max_weight is still -INF (this should
+  // not happen in practice).
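+  // (Illustrative: with raw weights {"subject": 10.0} the max is 10.0, so
+  // unlisted sections get a normalized default of 1.0 / 10.0 = 0.1.)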
+ double normalized_default_weight =
+ max_weight == -std::numeric_limits<double>::infinity()
+ ? kDefaultSectionWeight
+ : kDefaultSectionWeight / max_weight;
+ SectionWeights::NormalizedSectionWeights normalized_section_weights =
+ SectionWeights::NormalizedSectionWeights();
+ normalized_section_weights.section_weights = std::move(section_weights);
+ normalized_section_weights.default_weight = normalized_default_weight;
+ return normalized_section_weights;
+}
+} // namespace lib
+} // namespace icing
diff --git a/icing/scoring/section-weights.h b/icing/scoring/section-weights.h
new file mode 100644
index 0000000..ab69af2
--- /dev/null
+++ b/icing/scoring/section-weights.h
@@ -0,0 +1,96 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_SCORING_SECTION_WEIGHTS_H_
+#define ICING_SCORING_SECTION_WEIGHTS_H_
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-store.h"
+
+namespace icing {
+namespace lib {
+
+inline constexpr double kDefaultSectionWeight = 1.0;
+
+// Provides functions for setting and retrieving section weights for schema
+// type properties. Section weights are used to promote and demote term matches
+// in sections when scoring results. Section weights are provided by property
+// path and must fall within the range [0.0, DBL_MAX]. The SectionId is matched
+// to the property path by iterating over the schema type's section metadata.
+// Weights that correspond to a valid property path are normalized against the
+// maximum section weight and put into a map for quick access by scorers. By
+// default, a section is given a raw, pre-normalized weight of 1.0.
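+//
+// Example usage (a sketch; assumes a valid SchemaStore* `schema_store` and a
+// populated ScoringSpecProto `scoring_spec`):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<SectionWeights> section_weights,
+//       SectionWeights::Create(schema_store, scoring_spec));
+//   double weight = section_weights->GetNormalizedSectionWeight(
+//       schema_type_id, section_id);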
+class SectionWeights {
+ public:
+ // SectionWeights instances should not be copied.
+ SectionWeights(const SectionWeights&) = delete;
+ SectionWeights& operator=(const SectionWeights&) = delete;
+
+  // Factory function to create a SectionWeights instance. Raw weights are
+  // provided through the ScoringSpecProto. Provided property paths for weights
+  // are validated against the schema type's section metadata. If the property
+  // path doesn't exist, the property weight is ignored. If a weight is
+  // negative, an invalid argument error is returned. Raw weights are then
+  // normalized against the maximum weight for that schema type.
+  //
+  // Returns:
+  //   A SectionWeights instance on success
+  //   FAILED_PRECONDITION on any null pointer input
+  //   INVALID_ARGUMENT if a provided weight for a property path is negative
+ static libtextclassifier3::StatusOr<std::unique_ptr<SectionWeights>> Create(
+ const SchemaStore* schema_store, const ScoringSpecProto& scoring_spec);
+
+ // Returns the normalized section weight by SchemaTypeId and SectionId. If
+ // the SchemaTypeId, or the SectionId for a SchemaTypeId, is not found in the
+ // normalized weights map, the default weight is returned instead.
+ double GetNormalizedSectionWeight(SchemaTypeId schema_type_id,
+ SectionId section_id) const;
+
+ private:
+ // Holds the normalized section weights for a schema type, as well as the
+ // normalized default weight for sections that have no weight set.
+ struct NormalizedSectionWeights {
+ std::unordered_map<SectionId, double> section_weights;
+ double default_weight;
+ };
+
+ explicit SectionWeights(
+ const std::unordered_map<SchemaTypeId, NormalizedSectionWeights>
+ schema_section_weight_map)
+ : schema_section_weight_map_(std::move(schema_section_weight_map)) {}
+
+ // Creates a map of section ids to normalized weights from the raw property
+ // path weight map and section metadata and calculates the normalized default
+ // section weight.
+ static inline SectionWeights::NormalizedSectionWeights
+ ExtractNormalizedSectionWeights(
+ const std::unordered_map<std::string, double>& raw_weights,
+ const std::vector<SectionMetadata>& metadata_list);
+
+ // A map of (SchemaTypeId -> SectionId -> Normalized Weight), allows for fast
+ // look up of normalized weights. This is precomputed when creating a
+ // SectionWeights instance.
+ std::unordered_map<SchemaTypeId, NormalizedSectionWeights>
+ schema_section_weight_map_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_SCORING_SECTION_WEIGHTS_H_
diff --git a/icing/scoring/section-weights_test.cc b/icing/scoring/section-weights_test.cc
new file mode 100644
index 0000000..28b1797
--- /dev/null
+++ b/icing/scoring/section-weights_test.cc
@@ -0,0 +1,447 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/scoring/section-weights.h"
+
+#include <cfloat>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/scoring.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::Eq;
+
+class SectionWeightsTest : public testing::Test {
+ protected:
+ SectionWeightsTest()
+ : test_dir_(GetTestTempDir() + "/icing"),
+ schema_store_dir_(test_dir_ + "/schema_store") {}
+
+ void SetUp() override {
+ // Creates file directories
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+    ICING_ASSERT_OK_AND_ASSIGN(
+        schema_store_,
+        SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ SchemaTypeConfigProto sender_schema =
+ SchemaTypeConfigBuilder()
+ .SetType("sender")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(
+ TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))
+ .Build();
+ SchemaTypeConfigProto email_schema =
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(
+ TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(
+ TermMatchType::PREFIX,
+ StringIndexingConfig::TokenizerType::PLAIN)
+ .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("sender")
+ .SetDataTypeDocument("sender",
+ /*index_nested_properties=*/true)
+ .SetCardinality(PropertyConfigProto::Cardinality::OPTIONAL))
+ .Build();
+ SchemaProto schema =
+ SchemaBuilder().AddType(sender_schema).AddType(email_schema).Build();
+
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+ }
+
+ void TearDown() override {
+ schema_store_.reset();
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ SchemaStore *schema_store() { return schema_store_.get(); }
+
+ private:
+ const std::string test_dir_;
+ const std::string schema_store_dir_;
+ Filesystem filesystem_;
+ FakeClock fake_clock_;
+ std::unique_ptr<SchemaStore> schema_store_;
+};
+
+TEST_F(SectionWeightsTest, ShouldNormalizeSinglePropertyWeight) {
+ ScoringSpecProto spec_proto;
+
+ TypePropertyWeights *type_property_weights =
+ spec_proto.add_type_property_weights();
+ type_property_weights->set_schema_type("sender");
+
+ PropertyWeight *property_weight =
+ type_property_weights->add_property_weights();
+ property_weight->set_weight(5.0);
+ property_weight->set_path("name");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store(), spec_proto));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId sender_schema_type_id,
+ schema_store()->GetSchemaTypeId("sender"));
+
+ // section_id 0 corresponds to property "name".
+ // We expect 1.0 as there is only one property in the "sender" schema type
+ // so it should take the max normalized weight of 1.0.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(sender_schema_type_id,
+ /*section_id=*/0),
+ Eq(1.0));
+}
+
+TEST_F(SectionWeightsTest, ShouldAcceptMaxWeightValue) {
+ ScoringSpecProto spec_proto;
+
+ TypePropertyWeights *type_property_weights =
+ spec_proto.add_type_property_weights();
+ type_property_weights->set_schema_type("sender");
+
+ PropertyWeight *property_weight =
+ type_property_weights->add_property_weights();
+ property_weight->set_weight(DBL_MAX);
+ property_weight->set_path("name");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store(), spec_proto));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId sender_schema_type_id,
+ schema_store()->GetSchemaTypeId("sender"));
+
+ // section_id 0 corresponds to property "name".
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(sender_schema_type_id,
+ /*section_id=*/0),
+ Eq(1.0));
+}
+
+TEST_F(SectionWeightsTest, ShouldFailWithNegativeWeights) {
+ ScoringSpecProto spec_proto;
+
+ TypePropertyWeights *type_property_weights =
+ spec_proto.add_type_property_weights();
+ type_property_weights->set_schema_type("email");
+
+  PropertyWeight *body_property_weight =
+      type_property_weights->add_property_weights();
+  body_property_weight->set_weight(-100.0);
+  body_property_weight->set_path("body");
+
+ EXPECT_THAT(SectionWeights::Create(schema_store(), spec_proto).status(),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(SectionWeightsTest, ShouldAcceptZeroWeight) {
+ ScoringSpecProto spec_proto;
+
+ TypePropertyWeights *type_property_weights =
+ spec_proto.add_type_property_weights();
+ type_property_weights->set_schema_type("email");
+
+ PropertyWeight *body_property_weight =
+ type_property_weights->add_property_weights();
+ body_property_weight->set_weight(2.0);
+ body_property_weight->set_path("body");
+
+ PropertyWeight *subject_property_weight =
+ type_property_weights->add_property_weights();
+ subject_property_weight->set_weight(0.0);
+ subject_property_weight->set_path("subject");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store(), spec_proto));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+ schema_store()->GetSchemaTypeId("email"));
+
+ // Normalized weight for "body" property.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/0),
+ Eq(1.0));
+ // Normalized weight for "subject" property.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/2),
+ Eq(0.0));
+}
+
+TEST_F(SectionWeightsTest, ShouldNormalizeToZeroWhenAllWeightsZero) {
+ ScoringSpecProto spec_proto;
+
+ TypePropertyWeights *type_property_weights =
+ spec_proto.add_type_property_weights();
+ type_property_weights->set_schema_type("email");
+
+ PropertyWeight *body_property_weight =
+ type_property_weights->add_property_weights();
+ body_property_weight->set_weight(0.0);
+ body_property_weight->set_path("body");
+
+ PropertyWeight *sender_property_weight =
+ type_property_weights->add_property_weights();
+ sender_property_weight->set_weight(0.0);
+ sender_property_weight->set_path("sender.name");
+
+ PropertyWeight *subject_property_weight =
+ type_property_weights->add_property_weights();
+ subject_property_weight->set_weight(0.0);
+ subject_property_weight->set_path("subject");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store(), spec_proto));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+ schema_store()->GetSchemaTypeId("email"));
+
+ // Normalized weight for "body" property.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/0),
+ Eq(0.0));
+ // Normalized weight for "sender.name" property (the nested property).
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/1),
+ Eq(0.0));
+ // Normalized weight for "subject" property.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/2),
+ Eq(0.0));
+}
+
+TEST_F(SectionWeightsTest, ShouldReturnDefaultIfTypePropertyWeightsNotSet) {
+ ScoringSpecProto spec_proto;
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store(), spec_proto));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+ schema_store()->GetSchemaTypeId("email"));
+
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/0),
+ Eq(kDefaultSectionWeight));
+}
+
+TEST_F(SectionWeightsTest, ShouldSetNestedPropertyWeights) {
+ ScoringSpecProto spec_proto;
+
+ TypePropertyWeights *type_property_weights =
+ spec_proto.add_type_property_weights();
+ type_property_weights->set_schema_type("email");
+
+ PropertyWeight *body_property_weight =
+ type_property_weights->add_property_weights();
+ body_property_weight->set_weight(1.0);
+ body_property_weight->set_path("body");
+
+ PropertyWeight *subject_property_weight =
+ type_property_weights->add_property_weights();
+ subject_property_weight->set_weight(100.0);
+ subject_property_weight->set_path("subject");
+
+ PropertyWeight *nested_property_weight =
+ type_property_weights->add_property_weights();
+ nested_property_weight->set_weight(50.0);
+ nested_property_weight->set_path("sender.name");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store(), spec_proto));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+ schema_store()->GetSchemaTypeId("email"));
+
+ // Normalized weight for "body" property.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/0),
+ Eq(0.01));
+ // Normalized weight for "sender.name" property (the nested property).
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/1),
+ Eq(0.5));
+ // Normalized weight for "subject" property.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/2),
+ Eq(1.0));
+}
+
+TEST_F(SectionWeightsTest, ShouldNormalizeIfAllWeightsBelowOne) {
+ ScoringSpecProto spec_proto;
+
+ TypePropertyWeights *type_property_weights =
+ spec_proto.add_type_property_weights();
+ type_property_weights->set_schema_type("email");
+
+ PropertyWeight *body_property_weight =
+ type_property_weights->add_property_weights();
+ body_property_weight->set_weight(0.1);
+ body_property_weight->set_path("body");
+
+ PropertyWeight *sender_name_weight =
+ type_property_weights->add_property_weights();
+ sender_name_weight->set_weight(0.2);
+ sender_name_weight->set_path("sender.name");
+
+ PropertyWeight *subject_property_weight =
+ type_property_weights->add_property_weights();
+ subject_property_weight->set_weight(0.4);
+ subject_property_weight->set_path("subject");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store(), spec_proto));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+ schema_store()->GetSchemaTypeId("email"));
+
+ // Normalized weight for "body" property.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/0),
+ Eq(1.0 / 4.0));
+ // Normalized weight for "sender.name" property (the nested property).
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/1),
+ Eq(2.0 / 4.0));
+ // Normalized weight for "subject" property.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/2),
+ Eq(1.0));
+}
+
+TEST_F(SectionWeightsTest, ShouldSetNestedPropertyWeightSeparatelyForTypes) {
+ ScoringSpecProto spec_proto;
+
+ TypePropertyWeights *email_type_property_weights =
+ spec_proto.add_type_property_weights();
+ email_type_property_weights->set_schema_type("email");
+
+ PropertyWeight *body_property_weight =
+ email_type_property_weights->add_property_weights();
+ body_property_weight->set_weight(1.0);
+ body_property_weight->set_path("body");
+
+ PropertyWeight *subject_property_weight =
+ email_type_property_weights->add_property_weights();
+ subject_property_weight->set_weight(100.0);
+ subject_property_weight->set_path("subject");
+
+ PropertyWeight *sender_name_property_weight =
+ email_type_property_weights->add_property_weights();
+ sender_name_property_weight->set_weight(50.0);
+ sender_name_property_weight->set_path("sender.name");
+
+ TypePropertyWeights *sender_type_property_weights =
+ spec_proto.add_type_property_weights();
+ sender_type_property_weights->set_schema_type("sender");
+
+ PropertyWeight *sender_property_weight =
+ sender_type_property_weights->add_property_weights();
+ sender_property_weight->set_weight(25.0);
+ sender_property_weight->set_path("sender");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store(), spec_proto));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+ schema_store()->GetSchemaTypeId("email"));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId sender_schema_type_id,
+ schema_store()->GetSchemaTypeId("sender"));
+
+ // Normalized weight for "sender.name" property (the nested property)
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/1),
+ Eq(0.5));
+ // Normalized weight for "name" property for "sender" schema type. As it is
+ // the only property of the type, it should take the max normalized weight of
+ // 1.0.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(sender_schema_type_id,
+ /*section_id=*/2),
+ Eq(1.0));
+}
+
+TEST_F(SectionWeightsTest, ShouldSkipNonExistentPathWhenSettingWeights) {
+ ScoringSpecProto spec_proto;
+
+ TypePropertyWeights *type_property_weights =
+ spec_proto.add_type_property_weights();
+ type_property_weights->set_schema_type("email");
+
+ // If this property weight isn't skipped, then the max property weight would
+ // be set to 100.0 and all weights would be normalized against the max.
+ PropertyWeight *non_valid_property_weight =
+ type_property_weights->add_property_weights();
+ non_valid_property_weight->set_weight(100.0);
+ non_valid_property_weight->set_path("sender.organization");
+
+ PropertyWeight *subject_property_weight =
+ type_property_weights->add_property_weights();
+ subject_property_weight->set_weight(10.0);
+ subject_property_weight->set_path("subject");
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SectionWeights> section_weights,
+ SectionWeights::Create(schema_store(), spec_proto));
+ ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
+ schema_store()->GetSchemaTypeId("email"));
+
+ // Normalized weight for "body" property. Because the weight is not explicitly
+ // set, it is set to the default of 1.0 before being normalized.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/0),
+ Eq(0.1));
+ // Normalized weight for "sender.name" property (the nested property). Because
+ // the weight is not explicitly set, it is set to the default of 1.0 before
+ // being normalized.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/1),
+ Eq(0.1));
+ // Normalized weight for "subject" property. Because the invalid property path
+ // is skipped when assigning weights, subject takes the max normalized weight
+ // of 1.0 instead.
+ EXPECT_THAT(section_weights->GetNormalizedSectionWeight(email_schema_type_id,
+ /*section_id=*/2),
+ Eq(1.0));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/store/corpus-associated-scoring-data.h b/icing/store/corpus-associated-scoring-data.h
new file mode 100644
index 0000000..52be5cd
--- /dev/null
+++ b/icing/store/corpus-associated-scoring-data.h
@@ -0,0 +1,79 @@
+// Copyright (C) 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_TYPE_NAMESPACE_ASSOCIATED_SCORING_DATA_H_
+#define ICING_STORE_TYPE_NAMESPACE_ASSOCIATED_SCORING_DATA_H_
+
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+#include "icing/legacy/core/icing-packed-pod.h"
+
+namespace icing {
+namespace lib {
+
+// This is the cache entity of corpus-associated scores. The ground-truth data
+// is stored somewhere else. The cache includes:
+// 1. Number of documents contained in the corpus.
+// Positive values are required.
+// 2. The sum of the documents' lengths, in number of tokens.
+class CorpusAssociatedScoreData {
+ public:
+ explicit CorpusAssociatedScoreData(int num_docs = 0,
+ int64_t sum_length_in_tokens = 0)
+ : sum_length_in_tokens_(sum_length_in_tokens), num_docs_(num_docs) {}
+
+ bool operator==(const CorpusAssociatedScoreData& other) const {
+ return num_docs_ == other.num_docs() &&
+ sum_length_in_tokens_ == other.sum_length_in_tokens();
+ }
+
+ uint32_t num_docs() const { return num_docs_; }
+ void set_num_docs(uint32_t val) { num_docs_ = val; }
+
+ uint64_t sum_length_in_tokens() const { return sum_length_in_tokens_; }
+ void set_sum_length_in_tokens(uint64_t val) { sum_length_in_tokens_ = val; }
+
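+  // Note: the denominator is num_docs_ + 1, which avoids division by zero for
+  // an empty corpus and slightly damps the average for small corpora.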
+ float average_doc_length_in_tokens() const {
+ return sum_length_in_tokens_ / (1.0f + num_docs_);
+ }
+
+ // Adds a new document.
+ // Adds the document's length to the total length of the corpus,
+ // sum_length_in_tokens_.
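+  // The sum saturates at INT_MAX rather than overflowing; e.g. (illustrative)
+  // adding a 10-token document when the sum is INT_MAX - 5 leaves the sum
+  // clamped at INT_MAX.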
+ void AddDocument(uint32_t doc_length_in_tokens) {
+ ++num_docs_;
+ sum_length_in_tokens_ =
+ (std::numeric_limits<int>::max() - doc_length_in_tokens <
+ sum_length_in_tokens_)
+ ? std::numeric_limits<int>::max()
+ : sum_length_in_tokens_ + doc_length_in_tokens;
+ }
+
+ private:
+ // The sum total of the length of all documents in the corpus.
+ int sum_length_in_tokens_;
+ int num_docs_;
+} __attribute__((packed));
+
+static_assert(sizeof(CorpusAssociatedScoreData) == 8,
+ "Size of CorpusAssociatedScoreData should be 8");
+static_assert(icing_is_packed_pod<CorpusAssociatedScoreData>::value,
+ "go/icing-ubsan");
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_TYPE_NAMESPACE_ASSOCIATED_SCORING_DATA_H_
diff --git a/icing/store/corpus-id.h b/icing/store/corpus-id.h
new file mode 100644
index 0000000..01135b9
--- /dev/null
+++ b/icing/store/corpus-id.h
@@ -0,0 +1,32 @@
+// Copyright (C) 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_CORPUS_ID_H_
+#define ICING_STORE_CORPUS_ID_H_
+
+#include <cstdint>
+
+namespace icing {
+namespace lib {
+
+// Identifier for a corpus, i.e. a <namespace, schema_type> pair, in
+// DocumentProto. Generated in DocumentStore.
+using CorpusId = int32_t;
+
+inline constexpr CorpusId kInvalidCorpusId = -1;
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_CORPUS_ID_H_
diff --git a/icing/store/document-associated-score-data.h b/icing/store/document-associated-score-data.h
index b9039c5..9a711c8 100644
--- a/icing/store/document-associated-score-data.h
+++ b/icing/store/document-associated-score-data.h
@@ -19,6 +19,7 @@
#include <type_traits>
#include "icing/legacy/core/icing-packed-pod.h"
+#include "icing/store/corpus-id.h"
namespace icing {
namespace lib {
@@ -26,33 +27,46 @@ namespace lib {
// This is the cache entity of document-associated scores. It contains scores
// that are related to the document itself. The ground-truth data is stored
// somewhere else. The cache includes:
-// 1. Document score. It's defined in and passed from DocumentProto.score.
+// 1. Corpus Id.
+// 2. Document score. It's defined in and passed from DocumentProto.score.
// Positive values are required.
-// 2. Document creation timestamp. Unix timestamp of when the document is
+// 3. Document creation timestamp. Unix timestamp of when the document is
// created and inserted into Icing.
+// 4. Document length in number of tokens.
class DocumentAssociatedScoreData {
public:
- explicit DocumentAssociatedScoreData(int document_score,
- int64_t creation_timestamp_ms)
- : document_score_(document_score),
- creation_timestamp_ms_(creation_timestamp_ms) {}
+ explicit DocumentAssociatedScoreData(CorpusId corpus_id, int document_score,
+ int64_t creation_timestamp_ms,
+ int length_in_tokens = 0)
+ : creation_timestamp_ms_(creation_timestamp_ms),
+ corpus_id_(corpus_id),
+ document_score_(document_score),
+ length_in_tokens_(length_in_tokens) {}
bool operator==(const DocumentAssociatedScoreData& other) const {
return document_score_ == other.document_score() &&
- creation_timestamp_ms_ == other.creation_timestamp_ms();
+ creation_timestamp_ms_ == other.creation_timestamp_ms() &&
+ length_in_tokens_ == other.length_in_tokens() &&
+ corpus_id_ == other.corpus_id();
}
+ CorpusId corpus_id() const { return corpus_id_; }
+
int document_score() const { return document_score_; }
int64_t creation_timestamp_ms() const { return creation_timestamp_ms_; }
+ int length_in_tokens() const { return length_in_tokens_; }
+
private:
- int document_score_;
int64_t creation_timestamp_ms_;
+ CorpusId corpus_id_;
+ int document_score_;
+ int length_in_tokens_;
} __attribute__((packed));
-static_assert(sizeof(DocumentAssociatedScoreData) == 12,
- "Size of DocumentAssociatedScoreData should be 12");
+static_assert(sizeof(DocumentAssociatedScoreData) == 20,
+ "Size of DocumentAssociatedScoreData should be 20");
static_assert(icing_is_packed_pod<DocumentAssociatedScoreData>::value,
"go/icing-ubsan");
diff --git a/icing/store/document-filter-data.h b/icing/store/document-filter-data.h
index 198bc49..3970132 100644
--- a/icing/store/document-filter-data.h
+++ b/icing/store/document-filter-data.h
@@ -25,6 +25,7 @@ namespace icing {
namespace lib {
using SchemaTypeId = int16_t;
+inline constexpr SchemaTypeId kInvalidSchemaTypeId = -1;
class DocumentFilterData {
public:
diff --git a/icing/store/document-id.h b/icing/store/document-id.h
index cbe9959..7ea33b8 100644
--- a/icing/store/document-id.h
+++ b/icing/store/document-id.h
@@ -23,10 +23,11 @@ namespace lib {
// Id of a document
using DocumentId = int32_t;
-// We use 20 bits to encode document_ids and use the largest value (1M - 1) to
+// We use 22 bits to encode document_ids and use the largest value (2^22 - 1) to
// represent an invalid document_id.
-inline constexpr int kDocumentIdBits = 20;
-inline constexpr DocumentId kInvalidDocumentId = (1u << kDocumentIdBits) - 1;
+inline constexpr int kDocumentIdBits = 22;
+inline constexpr DocumentId kInvalidDocumentId =
+ (INT32_C(1) << kDocumentIdBits) - 1;
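+// (Illustrative: with 22 bits, kInvalidDocumentId == 4194303 and valid ids
+// span [0, 4194302].)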
inline constexpr DocumentId kMinDocumentId = 0;
inline constexpr DocumentId kMaxDocumentId = kInvalidDocumentId - 1;
diff --git a/icing/store/document-log-creator.cc b/icing/store/document-log-creator.cc
new file mode 100644
index 0000000..2abd315
--- /dev/null
+++ b/icing/store/document-log-creator.cc
@@ -0,0 +1,205 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/store/document-log-creator.h"
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/annotate.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/file-backed-proto-log.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Base filename of the document log; version suffixes are appended below.
+constexpr char kDocumentLogFilename[] = "document_log";
+
+std::string DocumentLogFilenameV0() {
+ // Originally only had this one version, no suffix.
+ return kDocumentLogFilename;
+}
+
+std::string DocumentLogFilenameV1() {
+ return absl_ports::StrCat(kDocumentLogFilename, "_v1");
+}
+
+std::string MakeDocumentLogFilenameV0(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", DocumentLogFilenameV0());
+}
+
+std::string MakeDocumentLogFilenameV1(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", DocumentLogFilenameV1());
+}
+
+} // namespace
+
+std::string DocumentLogCreator::GetDocumentLogFilename() {
+ // This should always return the latest version of the document log in use.
+ // The current latest version is V1.
+ return DocumentLogFilenameV1();
+}
+
+libtextclassifier3::StatusOr<DocumentLogCreator::CreateResult>
+DocumentLogCreator::Create(const Filesystem* filesystem,
+ const std::string& base_dir,
+ int32_t compression_level) {
+ bool v0_exists =
+ filesystem->FileExists(MakeDocumentLogFilenameV0(base_dir).c_str());
+ bool v1_exists =
+ filesystem->FileExists(MakeDocumentLogFilenameV1(base_dir).c_str());
+
+ bool new_file = false;
+ int preexisting_file_version = kCurrentVersion;
+ if (v0_exists && !v1_exists) {
+ ICING_RETURN_IF_ERROR(
+ MigrateFromV0ToV1(filesystem, base_dir, compression_level));
+
+ // Need to regenerate derived files since documents may be written to a
+ // different file offset in the log.
+ preexisting_file_version = 0;
+ } else if (!v1_exists) {
+ // First time initializing a v1 log. There are no existing derived files
+ // at this point, so "regenerate" here simply means "generate for the
+ // first time".
+ new_file = true;
+ }
+
+ ICING_ASSIGN_OR_RETURN(
+ PortableFileBackedProtoLog<DocumentWrapper>::CreateResult
+ log_create_result,
+ PortableFileBackedProtoLog<DocumentWrapper>::Create(
+ filesystem, MakeDocumentLogFilenameV1(base_dir),
+ PortableFileBackedProtoLog<DocumentWrapper>::Options(
+ /*compress_in=*/true,
+ PortableFileBackedProtoLog<DocumentWrapper>::kMaxProtoSize,
+ compression_level)));
+
+ CreateResult create_result = {std::move(log_create_result),
+ preexisting_file_version, new_file};
+ return create_result;
+}
+
+libtextclassifier3::Status DocumentLogCreator::MigrateFromV0ToV1(
+ const Filesystem* filesystem, const std::string& base_dir,
+ int32_t compression_level) {
+ ICING_VLOG(1) << "Migrating from v0 to v1 document log.";
+
+ // Our v0 proto log was non-portable, create it so we can read protos out from
+ // it.
+ auto v0_create_result_or = FileBackedProtoLog<DocumentWrapper>::Create(
+ filesystem, MakeDocumentLogFilenameV0(base_dir),
+ FileBackedProtoLog<DocumentWrapper>::Options(/*compress_in=*/true));
+ if (!v0_create_result_or.ok()) {
+ return absl_ports::Annotate(
+ v0_create_result_or.status(),
+ "Failed to initialize v0 document log while migrating.");
+ }
+ FileBackedProtoLog<DocumentWrapper>::CreateResult v0_create_result =
+ std::move(v0_create_result_or).ValueOrDie();
+ std::unique_ptr<FileBackedProtoLog<DocumentWrapper>> v0_proto_log =
+ std::move(v0_create_result.proto_log);
+
+ // Create a v1 portable proto log that we will write our protos to.
+ auto v1_create_result_or =
+ PortableFileBackedProtoLog<DocumentWrapper>::Create(
+ filesystem, MakeDocumentLogFilenameV1(base_dir),
+ PortableFileBackedProtoLog<DocumentWrapper>::Options(
+ /*compress_in=*/true,
+ /*max_proto_size_in=*/
+ PortableFileBackedProtoLog<DocumentWrapper>::kMaxProtoSize,
+ /*compression_level_in=*/compression_level));
+ if (!v1_create_result_or.ok()) {
+ return absl_ports::Annotate(
+ v1_create_result_or.status(),
+ "Failed to initialize v1 document log while migrating.");
+ }
+ PortableFileBackedProtoLog<DocumentWrapper>::CreateResult v1_create_result =
+ std::move(v1_create_result_or).ValueOrDie();
+ std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> v1_proto_log =
+ std::move(v1_create_result.proto_log);
+
+ // Dummy empty document to be used when copying over deleted documents.
+ DocumentProto empty_document;
+
+ // Start reading protos out of the old log and writing them into the new log.
+ ICING_ASSIGN_OR_RETURN(FileBackedProtoLog<DocumentWrapper>::Iterator iterator,
+ v0_proto_log->GetIterator());
+ auto iterator_status = iterator.Advance();
+ while (iterator_status.ok()) {
+ libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
+ v0_proto_log->ReadProto(iterator.GetOffset());
+
+ bool deleted_document = false;
+ DocumentWrapper document_wrapper;
+ if (absl_ports::IsNotFound(document_wrapper_or.status())) {
+ // Proto was erased. Write a placeholder document so document ids stay
+ // aligned; it is erased again from the new log below.
+ *document_wrapper.mutable_document() = empty_document;
+ deleted_document = true;
+ } else if (!document_wrapper_or.ok()) {
+ // Some real error, pass up
+ return document_wrapper_or.status();
+ } else {
+ document_wrapper = std::move(document_wrapper_or).ValueOrDie();
+ }
+
+ auto offset_or = v1_proto_log->WriteProto(document_wrapper);
+ if (!offset_or.ok()) {
+ return absl_ports::Annotate(
+ offset_or.status(),
+ "Failed to write proto to v1 document log while migrating.");
+ }
+
+ // If the original document was deleted, erase the proto we just wrote.
+ // We do this to maintain the document_ids, i.e. we still want document_id 2
+ // to point to a deleted document even though we may not have the document
+ // contents anymore. DocumentStore guarantees that the document_ids don't
+ // change unless an Optimize is triggered.
+ if (deleted_document) {
+ int64_t offset = offset_or.ValueOrDie();
+ auto erased_status = v1_proto_log->EraseProto(offset);
+ if (!erased_status.ok()) {
+ return absl_ports::Annotate(
+ erased_status,
+ "Failed to erase proto in v1 document log while migrating.");
+ }
+ }
+
+ iterator_status = iterator.Advance();
+ }
+
+ // Close out our file log pointers.
+ v0_proto_log.reset();
+ v1_proto_log.reset();
+
+ return libtextclassifier3::Status::OK;
+}
+
+} // namespace lib
+} // namespace icing
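The write-then-erase dance for deleted documents keeps ordinal positions
stable even though byte offsets change, e.g. (a toy illustration, not real
log contents):

  // v0 log: [doc0][erased][doc2]        -> document ids 0, 1, 2
  // v1 log: [doc0][placeholder][doc2]   -> document ids 0, 1, 2
  //                ^ written, then immediately erased via EraseProto()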
diff --git a/icing/store/document-log-creator.h b/icing/store/document-log-creator.h
new file mode 100644
index 0000000..0c2794a
--- /dev/null
+++ b/icing/store/document-log-creator.h
@@ -0,0 +1,85 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_DOCUMENT_LOG_CREATOR_H_
+#define ICING_STORE_DOCUMENT_LOG_CREATOR_H_
+
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/proto/document_wrapper.pb.h"
+
+namespace icing {
+namespace lib {
+
+// Handles creation of the document log and any underlying migrations that may
+// be necessary.
+class DocumentLogCreator {
+ public:
+ // Version 0 refers to FileBackedProtoLog.
+ // Version 1 refers to PortableFileBackedProtoLog with kFileFormatVersion = 0.
+ static constexpr int32_t kCurrentVersion = 1;
+ struct CreateResult {
+ // The create result passed up from the PortableFileBackedProtoLog::Create.
+ // Contains the document log.
+ PortableFileBackedProtoLog<DocumentWrapper>::CreateResult log_create_result;
+
+ // The version number of the pre-existing document log file.
+ // If there is no document log file, it will be set to kCurrentVersion.
+ int preexisting_file_version;
+
+ // Whether the created file is new.
+ bool new_file;
+ };
+
+ // Creates the document log in the base_dir. Will create one if it doesn't
+ // already exist.
+ //
+ // This also handles any potential migrations from old document log versions.
+ // At the end of this call, the most up-to-date log will be returned and will
+ // be usable.
+ //
+ // Returns:
+ // CreateResult on success.
+ // INTERNAL on any I/O error.
+ static libtextclassifier3::StatusOr<DocumentLogCreator::CreateResult> Create(
+ const Filesystem* filesystem, const std::string& base_dir,
+ int32_t compression_level);
+
+ // Returns the filename of the document log, without any directory prefixes.
+ // Used mainly for testing purposes.
+ static std::string GetDocumentLogFilename();
+
+ private:
+ // Handles migrating a v0 document log (not portable) to a v1 document log
+ // (portable). This will initialize the log in the beginning, and close it
+ // when migration is done. Callers will need to reinitialize the log on their
+ // own.
+ //
+ // Returns:
+ // OK on success.
+ // INVALID_ARGUMENT if some invalid option was passed to the document log.
+ // INTERNAL on I/O error.
+ static libtextclassifier3::Status MigrateFromV0ToV1(
+ const Filesystem* filesystem, const std::string& base_dir,
+ int32_t compression_level);
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_DOCUMENT_LOG_CREATOR_H_
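A minimal caller sketch for this API (error handling elided; `fs`, `dir`, and
the compression level 3 are hypothetical placeholders):

  auto result_or = DocumentLogCreator::Create(&fs, dir,
                                              /*compression_level=*/3);
  if (result_or.ok()) {
    DocumentLogCreator::CreateResult result =
        std::move(result_or).ValueOrDie();
    // Mirrors the conditions DocumentStore::Initialize checks before
    // rebuilding derived files.
    bool needs_regen =
        result.new_file ||
        result.preexisting_file_version !=
            DocumentLogCreator::kCurrentVersion ||
        result.log_create_result.has_data_loss();
  }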
diff --git a/icing/store/document-store.cc b/icing/store/document-store.cc
index 93cebaa..094eea1 100644
--- a/icing/store/document-store.cc
+++ b/icing/store/document-store.cc
@@ -17,8 +17,10 @@
#include <cstdint>
#include <limits>
#include <memory>
+#include <optional>
#include <string>
#include <string_view>
+#include <unordered_map>
#include <utility>
#include <vector>
@@ -32,19 +34,38 @@
#include "icing/file/file-backed-vector.h"
#include "icing/file/filesystem.h"
#include "icing/file/memory-mapped-file.h"
+#include "icing/file/portable-file-backed-proto-log.h"
#include "icing/legacy/core/icing-string-util.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/proto/optimize.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/usage.pb.h"
#include "icing/schema/schema-store.h"
+#include "icing/store/corpus-associated-scoring-data.h"
+#include "icing/store/corpus-id.h"
#include "icing/store/document-associated-score-data.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
-#include "icing/store/key-mapper.h"
+#include "icing/store/document-log-creator.h"
+#include "icing/store/dynamic-trie-key-mapper.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
#include "icing/store/namespace-id.h"
+#include "icing/store/persistent-hash-map-key-mapper.h"
+#include "icing/store/usage-store.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/util/clock.h"
#include "icing/util/crc32.h"
+#include "icing/util/data-loss.h"
+#include "icing/util/encode-util.h"
+#include "icing/util/fingerprint-util.h"
#include "icing/util/logging.h"
#include "icing/util/status-macros.h"
+#include "icing/util/tokenized-document.h"
namespace icing {
namespace lib {
@@ -53,18 +74,32 @@ namespace {
// Used in DocumentId mapper to mark a document as deleted
constexpr int64_t kDocDeletedFlag = -1;
-constexpr char kDocumentLogFilename[] = "document_log";
constexpr char kDocumentIdMapperFilename[] = "document_id_mapper";
+constexpr char kUriHashMapperWorkingPath[] = "uri_mapper";
constexpr char kDocumentStoreHeaderFilename[] = "document_store_header";
constexpr char kScoreCacheFilename[] = "score_cache";
+constexpr char kCorpusScoreCache[] = "corpus_score_cache";
constexpr char kFilterCacheFilename[] = "filter_cache";
constexpr char kNamespaceMapperFilename[] = "namespace_mapper";
-
-constexpr int32_t kUriMapperMaxSize = 12 * 1024 * 1024; // 12 MiB
-
-// 384 KiB for a KeyMapper would allow each internal array to have a max of
-// 128 KiB for storage.
+constexpr char kUsageStoreDirectoryName[] = "usage_store";
+constexpr char kCorpusIdMapperFilename[] = "corpus_mapper";
+
+// Determined through manual testing to allow for 4 million URIs, since we
+// allow up to 4 million DocumentIds.
+constexpr int32_t kUriDynamicTrieKeyMapperMaxSize =
+ 144 * 1024 * 1024; // 144 MiB
+
+constexpr int32_t kUriHashKeyMapperMaxNumEntries =
+ kMaxDocumentId + 1; // (1 << 22) - 1, ~4M
+// - Key: namespace_id_str (3 bytes) + fingerprinted_uri (10 bytes) + '\0' (1
+// byte)
+// - Value: DocumentId (4 bytes)
+constexpr int32_t kUriHashKeyMapperKVByteSize = 13 + 1 + sizeof(DocumentId);
+
+// 384 KiB for a DynamicTrieKeyMapper would allow each internal array to have a
+// max of 128 KiB for storage.
constexpr int32_t kNamespaceMapperMaxSize = 3 * 128 * 1024; // 384 KiB
+constexpr int32_t kCorpusMapperMaxSize = 3 * 128 * 1024; // 384 KiB
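Back-of-envelope for the sizing constants above (a sketch; the per-entry
bytes come straight from the comments, not from measurement):

  // ~4M entries * 18 B/entry (13 key bytes + 1 '\0' + 4 value bytes)
  //   = ~72 MiB of worst-case key/value payload for the hash mapper,
  //   half the 144 MiB budget reserved for the dynamic-trie variant.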
DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) {
DocumentWrapper document_wrapper;
@@ -72,49 +107,26 @@ DocumentWrapper CreateDocumentWrapper(DocumentProto&& document) {
return document_wrapper;
}
-DocumentWrapper CreateDocumentTombstone(std::string_view document_namespace,
- std::string_view document_uri) {
- DocumentWrapper document_wrapper;
- document_wrapper.set_deleted(true);
- DocumentProto* document = document_wrapper.mutable_document();
- document->set_namespace_(std::string(document_namespace));
- document->set_uri(std::string(document_uri));
- return document_wrapper;
-}
-
-DocumentWrapper CreateNamespaceTombstone(std::string_view document_namespace) {
- DocumentWrapper document_wrapper;
- document_wrapper.set_deleted(true);
- DocumentProto* document = document_wrapper.mutable_document();
- document->set_namespace_(std::string(document_namespace));
- return document_wrapper;
-}
-
-DocumentWrapper CreateSchemaTypeTombstone(
- std::string_view document_schema_type) {
- DocumentWrapper document_wrapper;
- document_wrapper.set_deleted(true);
- DocumentProto* document = document_wrapper.mutable_document();
- document->set_schema(std::string(document_schema_type));
- return document_wrapper;
-}
-
std::string MakeHeaderFilename(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kDocumentStoreHeaderFilename);
}
-std::string MakeDocumentIdMapperFilename(const std::string& base_dir) {
- return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename);
+std::string MakeUriHashMapperWorkingPath(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", kUriHashMapperWorkingPath);
}
-std::string MakeDocumentLogFilename(const std::string& base_dir) {
- return absl_ports::StrCat(base_dir, "/", kDocumentLogFilename);
+std::string MakeDocumentIdMapperFilename(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", kDocumentIdMapperFilename);
}
std::string MakeScoreCacheFilename(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kScoreCacheFilename);
}
+std::string MakeCorpusScoreCache(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", kCorpusScoreCache);
+}
+
std::string MakeFilterCacheFilename(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kFilterCacheFilename);
}
@@ -123,27 +135,12 @@ std::string MakeNamespaceMapperFilename(const std::string& base_dir) {
return absl_ports::StrCat(base_dir, "/", kNamespaceMapperFilename);
}
-// TODO(adorokhine): This class internally uses an 8-byte fingerprint of the
-// Key and stores the key/value in a file-backed-trie that adds an ~80 byte
-// overhead per key. As we know that these fingerprints are always 8-bytes in
-// length and that they're random, we might be able to store them more
-// compactly.
-std::string MakeFingerprint(std::string_view name_space, std::string_view uri) {
- // Using a 64-bit fingerprint to represent the key could lead to collisions.
- // But, even with 200K unique keys, the probability of collision is about
- // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack).
- uint64_t fprint =
- tc3farmhash::Fingerprint64(absl_ports::StrCat(name_space, uri));
+std::string MakeUsageStoreDirectoryName(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", kUsageStoreDirectoryName);
+}
- std::string encoded_fprint;
- // DynamicTrie cannot handle keys with '0' as bytes. So, we encode it in
- // base128 and add 1 to make sure that no byte is '0'. This increases the
- // size of the encoded_fprint from 8-bytes to 10-bytes.
- while (fprint) {
- encoded_fprint.push_back((fprint & 0x7F) + 1);
- fprint >>= 7;
- }
- return encoded_fprint;
+std::string MakeCorpusMapperFilename(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/", kCorpusIdMapperFilename);
}
int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
@@ -167,76 +164,275 @@ int64_t CalculateExpirationTimestampMs(int64_t creation_timestamp_ms,
return expiration_timestamp_ms;
}
+InitializeStatsProto::RecoveryCause GetRecoveryCause(
+ const DocumentLogCreator::CreateResult& create_result,
+ bool force_recovery_and_revalidate_documents) {
+ if (force_recovery_and_revalidate_documents) {
+ return InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC;
+ } else if (create_result.log_create_result.has_data_loss()) {
+ return InitializeStatsProto::DATA_LOSS;
+ } else if (create_result.preexisting_file_version !=
+ DocumentLogCreator::kCurrentVersion) {
+ return InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT;
+ }
+ return InitializeStatsProto::NONE;
+}
+
+InitializeStatsProto::DocumentStoreDataStatus GetDataStatus(
+ DataLoss data_loss) {
+ switch (data_loss) {
+ case DataLoss::PARTIAL:
+ return InitializeStatsProto::PARTIAL_LOSS;
+ case DataLoss::COMPLETE:
+ return InitializeStatsProto::COMPLETE_LOSS;
+ case DataLoss::NONE:
+ return InitializeStatsProto::NO_DATA_LOSS;
+ }
+}
+
+std::unordered_map<NamespaceId, std::string> GetNamespaceIdsToNamespaces(
+ const KeyMapper<NamespaceId>* key_mapper) {
+ std::unordered_map<NamespaceId, std::string> namespace_ids_to_namespaces;
+
+ std::unique_ptr<typename KeyMapper<NamespaceId>::Iterator> itr =
+ key_mapper->GetIterator();
+ while (itr->Advance()) {
+ namespace_ids_to_namespaces.insert(
+ {itr->GetValue(), std::string(itr->GetKey())});
+ }
+ return namespace_ids_to_namespaces;
+}
+
+libtextclassifier3::StatusOr<std::unique_ptr<
+ KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>>>
+CreateUriMapper(const Filesystem& filesystem, const std::string& base_dir,
+ bool pre_mapping_fbv, bool use_persistent_hash_map) {
+ std::string uri_hash_mapper_working_path =
+ MakeUriHashMapperWorkingPath(base_dir);
+ // Due to a historical issue, we use the document store's base_dir directly as
+ // DynamicTrieKeyMapper's working directory for uri mapper.
+ // DynamicTrieKeyMapper also creates a subdirectory "key_mapper_dir", so the
+ // actual files will be put under "<base_dir>/key_mapper_dir/".
+ bool dynamic_trie_key_mapper_dir_exists = filesystem.DirectoryExists(
+ absl_ports::StrCat(base_dir, "/key_mapper_dir").c_str());
+ bool persistent_hash_map_dir_exists =
+ filesystem.DirectoryExists(uri_hash_mapper_working_path.c_str());
+ if ((use_persistent_hash_map && dynamic_trie_key_mapper_dir_exists) ||
+ (!use_persistent_hash_map && persistent_hash_map_dir_exists)) {
+ // Return a failure here so that the caller can properly delete and rebuild
+ // this component.
+ return absl_ports::FailedPreconditionError("Key mapper type mismatch");
+ }
+
+ if (use_persistent_hash_map) {
+ return PersistentHashMapKeyMapper<
+ DocumentId, fingerprint_util::FingerprintStringFormatter>::
+ Create(filesystem, std::move(uri_hash_mapper_working_path),
+ pre_mapping_fbv,
+ /*max_num_entries=*/kUriHashKeyMapperMaxNumEntries,
+ /*average_kv_byte_size=*/kUriHashKeyMapperKVByteSize);
+ } else {
+ return DynamicTrieKeyMapper<DocumentId,
+ fingerprint_util::FingerprintStringFormatter>::
+ Create(filesystem, base_dir, kUriDynamicTrieKeyMapperMaxSize);
+ }
+}
+
} // namespace
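The mismatch check in CreateUriMapper above enumerates to this decision
table:

  // use_persistent_hash_map | trie dir exists | phm dir exists | outcome
  // true                    | yes             | any            | FAILED_PRECONDITION
  // false                   | any             | yes            | FAILED_PRECONDITION
  // true                    | no              | any            | PersistentHashMapKeyMapper
  // false                   | any             | no             | DynamicTrieKeyMapper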
+std::string DocumentStore::MakeFingerprint(
+ NamespaceId namespace_id, std::string_view namespace_,
+ std::string_view uri_or_schema) const {
+ if (!namespace_id_fingerprint_) {
+ // Using a 64-bit fingerprint to represent the key could lead to collisions.
+ // But, even with 200K unique keys, the probability of collision is about
+ // one-in-a-billion (https://en.wikipedia.org/wiki/Birthday_attack).
+ uint64_t fprint = tc3farmhash::Fingerprint64(
+ absl_ports::StrCat(namespace_, uri_or_schema));
+ return fingerprint_util::GetFingerprintString(fprint);
+ } else {
+ return NamespaceFingerprintIdentifier(namespace_id, uri_or_schema)
+ .EncodeToCString();
+ }
+}
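For reference, the encoding the removed inline code performed (and which
fingerprint_util::GetFingerprintString presumably still provides) maps each
7-bit group to a non-zero byte, since DynamicTrie keys cannot contain '\0':

  std::string EncodeFingerprint(uint64_t fprint) {
    std::string encoded;
    while (fprint) {
      encoded.push_back((fprint & 0x7F) + 1);  // values 1..128, never 0
      fprint >>= 7;
    }
    return encoded;  // an 8-byte fingerprint grows to at most 10 bytes
  }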
+
DocumentStore::DocumentStore(const Filesystem* filesystem,
const std::string_view base_dir,
const Clock* clock,
- const SchemaStore* schema_store)
+ const SchemaStore* schema_store,
+ bool namespace_id_fingerprint,
+ bool pre_mapping_fbv, bool use_persistent_hash_map,
+ int32_t compression_level)
: filesystem_(filesystem),
base_dir_(base_dir),
clock_(*clock),
schema_store_(schema_store),
- document_validator_(schema_store) {}
+ document_validator_(schema_store),
+ namespace_id_fingerprint_(namespace_id_fingerprint),
+ pre_mapping_fbv_(pre_mapping_fbv),
+ use_persistent_hash_map_(use_persistent_hash_map),
+ compression_level_(compression_level) {}
+
+libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
+ const DocumentProto& document, int32_t num_tokens,
+ PutDocumentStatsProto* put_document_stats) {
+ return Put(DocumentProto(document), num_tokens, put_document_stats);
+}
libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
- const DocumentProto& document) {
- return Put(DocumentProto(document));
+ DocumentProto&& document, int32_t num_tokens,
+ PutDocumentStatsProto* put_document_stats) {
+ document.mutable_internal_fields()->set_length_in_tokens(num_tokens);
+ return InternalPut(std::move(document), put_document_stats);
}
DocumentStore::~DocumentStore() {
if (initialized_) {
- if (!PersistToDisk().ok()) {
+ if (!PersistToDisk(PersistType::FULL).ok()) {
ICING_LOG(ERROR)
<< "Error persisting to disk in DocumentStore destructor";
}
}
}
-libtextclassifier3::StatusOr<std::unique_ptr<DocumentStore>>
-DocumentStore::Create(const Filesystem* filesystem, const std::string& base_dir,
- const Clock* clock, const SchemaStore* schema_store) {
+libtextclassifier3::StatusOr<DocumentStore::CreateResult> DocumentStore::Create(
+ const Filesystem* filesystem, const std::string& base_dir,
+ const Clock* clock, const SchemaStore* schema_store,
+ bool force_recovery_and_revalidate_documents, bool namespace_id_fingerprint,
+ bool pre_mapping_fbv, bool use_persistent_hash_map,
+ int32_t compression_level, InitializeStatsProto* initialize_stats) {
ICING_RETURN_ERROR_IF_NULL(filesystem);
ICING_RETURN_ERROR_IF_NULL(clock);
ICING_RETURN_ERROR_IF_NULL(schema_store);
- auto document_store = std::unique_ptr<DocumentStore>(
- new DocumentStore(filesystem, base_dir, clock, schema_store));
- ICING_RETURN_IF_ERROR(document_store->Initialize());
- return document_store;
+ auto document_store = std::unique_ptr<DocumentStore>(new DocumentStore(
+ filesystem, base_dir, clock, schema_store, namespace_id_fingerprint,
+ pre_mapping_fbv, use_persistent_hash_map, compression_level));
+ ICING_ASSIGN_OR_RETURN(
+ InitializeResult initialize_result,
+ document_store->Initialize(force_recovery_and_revalidate_documents,
+ initialize_stats));
+
+ CreateResult create_result;
+ create_result.document_store = std::move(document_store);
+ create_result.data_loss = initialize_result.data_loss;
+ create_result.derived_files_regenerated =
+ initialize_result.derived_files_regenerated;
+ return create_result;
}
-libtextclassifier3::Status DocumentStore::Initialize() {
- auto create_result_or = FileBackedProtoLog<DocumentWrapper>::Create(
- filesystem_, MakeDocumentLogFilename(base_dir_),
- FileBackedProtoLog<DocumentWrapper>::Options(
- /*compress_in=*/true));
+/* static */ libtextclassifier3::Status DocumentStore::DiscardDerivedFiles(
+ const Filesystem* filesystem, const std::string& base_dir) {
+ // Header
+ const std::string header_filename = MakeHeaderFilename(base_dir);
+ if (!filesystem->DeleteFile(header_filename.c_str())) {
+ return absl_ports::InternalError("Couldn't delete header file");
+ }
+
+ // Document key mapper. Doesn't hurt to delete both dynamic trie and
+ // persistent hash map without checking.
+ ICING_RETURN_IF_ERROR(
+ DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem, base_dir));
+ ICING_RETURN_IF_ERROR(PersistentHashMapKeyMapper<DocumentId>::Delete(
+ *filesystem, MakeUriHashMapperWorkingPath(base_dir)));
+
+ // Document id mapper
+ ICING_RETURN_IF_ERROR(FileBackedVector<int64_t>::Delete(
+ *filesystem, MakeDocumentIdMapperFilename(base_dir)));
+
+ // Document associated score cache
+ ICING_RETURN_IF_ERROR(FileBackedVector<DocumentAssociatedScoreData>::Delete(
+ *filesystem, MakeScoreCacheFilename(base_dir)));
+
+ // Filter cache
+ ICING_RETURN_IF_ERROR(FileBackedVector<DocumentFilterData>::Delete(
+ *filesystem, MakeFilterCacheFilename(base_dir)));
+
+ // Namespace mapper
+ ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<NamespaceId>::Delete(
+ *filesystem, MakeNamespaceMapperFilename(base_dir)));
+
+ // Corpus mapper
+ ICING_RETURN_IF_ERROR(DynamicTrieKeyMapper<CorpusId>::Delete(
+ *filesystem, MakeCorpusMapperFilename(base_dir)));
+
+ // Corpus associated score cache
+ ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
+ *filesystem, MakeCorpusScoreCache(base_dir)));
+
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::StatusOr<DocumentStore::InitializeResult>
+DocumentStore::Initialize(bool force_recovery_and_revalidate_documents,
+ InitializeStatsProto* initialize_stats) {
+ auto create_result_or =
+ DocumentLogCreator::Create(filesystem_, base_dir_, compression_level_);
+
// TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
if (!create_result_or.ok()) {
ICING_LOG(ERROR) << create_result_or.status().error_message()
- << "\nFailed to initialize DocumentLog";
+ << "\nFailed to initialize DocumentLog.";
return create_result_or.status();
}
- FileBackedProtoLog<DocumentWrapper>::CreateResult create_result =
+ DocumentLogCreator::CreateResult create_result =
std::move(create_result_or).ValueOrDie();
- document_log_ = std::move(create_result.proto_log);
- if (create_result.data_loss) {
- ICING_LOG(WARNING)
- << "Data loss in document log, regenerating derived files.";
- libtextclassifier3::Status status = RegenerateDerivedFiles();
+ document_log_ = std::move(create_result.log_create_result.proto_log);
+ InitializeStatsProto::RecoveryCause recovery_cause =
+ GetRecoveryCause(create_result, force_recovery_and_revalidate_documents);
+
+ bool derived_files_regenerated = false;
+ if (recovery_cause != InitializeStatsProto::NONE || create_result.new_file) {
+ ICING_LOG(INFO) << "Starting Document Store Recovery with cause="
+ << recovery_cause << ", and create result { new_file="
+ << create_result.new_file << ", preexisting_file_version="
+ << create_result.preexisting_file_version << ", data_loss="
+ << create_result.log_create_result.data_loss
+ << "} and kCurrentVersion="
+ << DocumentLogCreator::kCurrentVersion;
+ // We can't rely on any existing derived files. Recreate them from scratch.
+ // Currently happens if:
+ // 1) This is a new log and we don't have derived files yet
+ // 2) Client wanted us to force a regeneration.
+ // 3) Log has some data loss, can't rely on existing derived data.
+ std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
+ libtextclassifier3::Status status =
+ RegenerateDerivedFiles(force_recovery_and_revalidate_documents);
+ if (recovery_cause != InitializeStatsProto::NONE) {
+ // Only consider it a recovery if the client forced a recovery or there
+ // was data loss. Otherwise, this could just be the first time we're
+ // initializing and generating derived files.
+ derived_files_regenerated = true;
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_document_store_recovery_latency_ms(
+ document_recovery_timer->GetElapsedMilliseconds());
+ initialize_stats->set_document_store_recovery_cause(recovery_cause);
+ initialize_stats->set_document_store_data_status(
+ GetDataStatus(create_result.log_create_result.data_loss));
+ }
+ }
if (!status.ok()) {
ICING_LOG(ERROR)
<< "Failed to regenerate derived files for DocumentStore";
return status;
}
} else {
- if (!InitializeDerivedFiles().ok()) {
- ICING_VLOG(1)
+ if (!InitializeExistingDerivedFiles().ok()) {
+ ICING_LOG(WARNING)
<< "Couldn't find derived files or failed to initialize them, "
"regenerating derived files for DocumentStore.";
- libtextclassifier3::Status status = RegenerateDerivedFiles();
+ std::unique_ptr<Timer> document_recovery_timer = clock_.GetNewTimer();
+ derived_files_regenerated = true;
+ libtextclassifier3::Status status = RegenerateDerivedFiles(
+ /*force_recovery_and_revalidate_documents=*/false);
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_document_store_recovery_cause(
+ InitializeStatsProto::IO_ERROR);
+ initialize_stats->set_document_store_recovery_latency_ms(
+ document_recovery_timer->GetElapsedMilliseconds());
+ }
if (!status.ok()) {
ICING_LOG(ERROR)
<< "Failed to regenerate derived files for DocumentStore";
@@ -246,11 +442,17 @@ libtextclassifier3::Status DocumentStore::Initialize() {
}
initialized_ = true;
+ if (initialize_stats != nullptr) {
+ initialize_stats->set_num_documents(document_id_mapper_->num_elements());
+ }
- return libtextclassifier3::Status::OK;
+ InitializeResult initialize_result = {
+ .data_loss = create_result.log_create_result.data_loss,
+ .derived_files_regenerated = derived_files_regenerated};
+ return initialize_result;
}
-libtextclassifier3::Status DocumentStore::InitializeDerivedFiles() {
+libtextclassifier3::Status DocumentStore::InitializeExistingDerivedFiles() {
if (!HeaderExists()) {
// Without a header, we don't know if things are consistent between each
// other so the caller should just regenerate everything from ground
@@ -265,15 +467,16 @@ libtextclassifier3::Status DocumentStore::InitializeDerivedFiles() {
absl_ports::StrCat("Couldn't read: ", MakeHeaderFilename(base_dir_)));
}
- if (header.magic != DocumentStore::Header::kMagic) {
+ if (header.magic !=
+ DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_)) {
return absl_ports::InternalError(absl_ports::StrCat(
"Invalid header kMagic for file: ", MakeHeaderFilename(base_dir_)));
}
// TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
- auto document_key_mapper_or =
- KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize);
+ auto document_key_mapper_or = CreateUriMapper(
+ *filesystem_, base_dir_, pre_mapping_fbv_, use_persistent_hash_map_);
if (!document_key_mapper_or.ok()) {
ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
<< "Failed to initialize KeyMapper";
@@ -305,9 +508,32 @@ libtextclassifier3::Status DocumentStore::InitializeDerivedFiles() {
ICING_ASSIGN_OR_RETURN(
namespace_mapper_,
- KeyMapper<NamespaceId>::Create(*filesystem_,
- MakeNamespaceMapperFilename(base_dir_),
- kNamespaceMapperMaxSize));
+ DynamicTrieKeyMapper<NamespaceId>::Create(
+ *filesystem_, MakeNamespaceMapperFilename(base_dir_),
+ kNamespaceMapperMaxSize));
+
+ ICING_ASSIGN_OR_RETURN(
+ usage_store_,
+ UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
+
+ auto corpus_mapper_or =
+ DynamicTrieKeyMapper<CorpusId,
+ fingerprint_util::FingerprintStringFormatter>::
+ Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
+ kCorpusMapperMaxSize);
+ if (!corpus_mapper_or.ok()) {
+ return std::move(corpus_mapper_or).status();
+ }
+ corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();
+
+ ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
+ FileBackedVector<CorpusAssociatedScoreData>::Create(
+ *filesystem_, MakeCorpusScoreCache(base_dir_),
+ MemoryMappedFile::READ_WRITE_AUTO_SYNC));
+
+ // Ensure the usage store is the correct size.
+ ICING_RETURN_IF_ERROR(
+ usage_store_->TruncateTo(document_id_mapper_->num_elements()));
ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
if (checksum.Get() != header.checksum) {
@@ -318,137 +544,128 @@ libtextclassifier3::Status DocumentStore::InitializeDerivedFiles() {
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() {
+libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles(
+ bool revalidate_documents) {
ICING_RETURN_IF_ERROR(ResetDocumentKeyMapper());
ICING_RETURN_IF_ERROR(ResetDocumentIdMapper());
ICING_RETURN_IF_ERROR(ResetDocumentAssociatedScoreCache());
ICING_RETURN_IF_ERROR(ResetFilterCache());
ICING_RETURN_IF_ERROR(ResetNamespaceMapper());
+ ICING_RETURN_IF_ERROR(ResetCorpusMapper());
+ ICING_RETURN_IF_ERROR(ResetCorpusAssociatedScoreCache());
+
+ // Creates a new UsageStore instance. Note that we don't reset the data in
+ // usage store here because we're not able to regenerate the usage scores.
+ ICING_ASSIGN_OR_RETURN(
+ usage_store_,
+ UsageStore::Create(filesystem_, MakeUsageStoreDirectoryName(base_dir_)));
// Iterates through document log
auto iterator = document_log_->GetIterator();
auto iterator_status = iterator.Advance();
+ libtextclassifier3::StatusOr<int64_t> element_size =
+ document_log_->GetElementsFileSize();
+ libtextclassifier3::StatusOr<int64_t> disk_usage =
+ document_log_->GetDiskUsage();
+ if (element_size.ok() && disk_usage.ok()) {
+ ICING_VLOG(1) << "Starting recovery of document store. Document store "
+ "elements file size:"
+ << element_size.ValueOrDie()
+ << ", disk usage=" << disk_usage.ValueOrDie();
+ }
while (iterator_status.ok()) {
- ICING_ASSIGN_OR_RETURN(DocumentWrapper document_wrapper,
- document_log_->ReadProto(iterator.GetOffset()));
- if (document_wrapper.deleted()) {
- if (!document_wrapper.document().uri().empty()) {
- // Individual document deletion.
- auto document_id_or =
- GetDocumentId(document_wrapper.document().namespace_(),
- document_wrapper.document().uri());
- // Updates document_id mapper with deletion
- if (document_id_or.ok()) {
- ICING_RETURN_IF_ERROR(document_id_mapper_->Set(
- document_id_or.ValueOrDie(), kDocDeletedFlag));
- } else if (!absl_ports::IsNotFound(document_id_or.status())) {
- // Real error
- return absl_ports::Annotate(
- document_id_or.status(),
- absl_ports::StrCat("Failed to find document id. namespace: ",
- document_wrapper.document().namespace_(),
- ", uri: ", document_wrapper.document().uri()));
- }
- } else if (!document_wrapper.document().namespace_().empty()) {
- // Namespace deletion.
- ICING_RETURN_IF_ERROR(UpdateDerivedFilesNamespaceDeleted(
- document_wrapper.document().namespace_()));
-
- } else if (!document_wrapper.document().schema().empty()) {
- // SchemaType deletion.
- auto schema_type_id_or = schema_store_->GetSchemaTypeId(
- document_wrapper.document().schema());
-
- if (schema_type_id_or.ok()) {
- ICING_RETURN_IF_ERROR(UpdateDerivedFilesSchemaTypeDeleted(
- schema_type_id_or.ValueOrDie()));
- } else {
- // The deleted schema type doesn't have a SchemaTypeId we can refer
- // to in the FilterCache.
- //
- // TODO(cassiewang): We could avoid reading out all the documents.
- // When we see a schema type doesn't have a SchemaTypeId, assign the
- // unknown schema type a unique, temporary SchemaTypeId and store
- // that in the FilterCache. Then, when we see the schema type
- // tombstone here, we can look up its temporary SchemaTypeId and
- // just iterate through the FilterCache to mark those documents as
- // deleted.
- int size = document_id_mapper_->num_elements();
- for (DocumentId document_id = 0; document_id < size; document_id++) {
- auto document_or = Get(document_id);
- if (absl_ports::IsNotFound(document_or.status())) {
- // Skip nonexistent documents
- continue;
- } else if (!document_or.ok()) {
- // Real error, pass up
- return absl_ports::Annotate(
- document_or.status(),
- IcingStringUtil::StringPrintf(
- "Failed to retrieve Document for DocumentId %d",
- document_id));
- }
-
- // Guaranteed to have a document now.
- DocumentProto document = document_or.ValueOrDie();
-
- if (document.schema() == document_wrapper.document().schema()) {
- ICING_RETURN_IF_ERROR(
- document_id_mapper_->Set(document_id, kDocDeletedFlag));
- }
- }
- }
- } else {
- return absl_ports::InternalError(
- "Encountered an invalid tombstone during recovery!");
- }
- } else {
- // Updates key mapper and document_id mapper with the new document
- DocumentId new_document_id = document_id_mapper_->num_elements();
- ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
- MakeFingerprint(document_wrapper.document().namespace_(),
- document_wrapper.document().uri()),
- new_document_id));
- ICING_RETURN_IF_ERROR(
- document_id_mapper_->Set(new_document_id, iterator.GetOffset()));
-
- ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
- new_document_id,
- DocumentAssociatedScoreData(
- document_wrapper.document().score(),
- document_wrapper.document().creation_timestamp_ms())));
-
- SchemaTypeId schema_type_id;
- auto schema_type_id_or =
- schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
- if (absl_ports::IsNotFound(schema_type_id_or.status())) {
- // Didn't find a SchemaTypeId. This means that the DocumentStore and
- // the SchemaStore are out of sync. But DocumentStore can't do
- // anything about it so just ignore this for now. This should be
- // detected/handled by the owner of DocumentStore. Set it to some
- // arbitrary invalid value for now, it'll get updated to the correct
- // ID later.
- schema_type_id = -1;
- } else if (!schema_type_id_or.ok()) {
- // Real error. Pass it up
- return schema_type_id_or.status();
- } else {
- // We're guaranteed that SchemaTypeId is valid now
- schema_type_id = schema_type_id_or.ValueOrDie();
- }
+ ICING_VLOG(2) << "Attempting to read document at offset="
+ << iterator.GetOffset();
+ libtextclassifier3::StatusOr<DocumentWrapper> document_wrapper_or =
+ document_log_->ReadProto(iterator.GetOffset());
- ICING_ASSIGN_OR_RETURN(
- NamespaceId namespace_id,
- namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
- namespace_mapper_->num_keys()));
+ if (absl_ports::IsNotFound(document_wrapper_or.status())) {
+ // The erased document still occupies 1 document id.
+ DocumentId new_document_id = document_id_mapper_->num_elements();
+ ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
+ iterator_status = iterator.Advance();
+ continue;
+ } else if (!document_wrapper_or.ok()) {
+ return document_wrapper_or.status();
+ }
- int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
- document_wrapper.document().creation_timestamp_ms(),
- document_wrapper.document().ttl_ms());
+ DocumentWrapper document_wrapper =
+ std::move(document_wrapper_or).ValueOrDie();
+ // Revalidate that this document is still compatible if requested.
+ if (revalidate_documents) {
+ if (!document_validator_.Validate(document_wrapper.document()).ok()) {
+ // Document is no longer valid with the current schema. Mark it as
+ // deleted.
+ DocumentId new_document_id = document_id_mapper_->num_elements();
+ ICING_RETURN_IF_ERROR(document_log_->EraseProto(iterator.GetOffset()));
+ ICING_RETURN_IF_ERROR(ClearDerivedData(new_document_id));
+ iterator_status = iterator.Advance();
+ continue;
+ }
+ }
- ICING_RETURN_IF_ERROR(UpdateFilterCache(
- new_document_id, DocumentFilterData(namespace_id, schema_type_id,
- expiration_timestamp_ms)));
+ ICING_ASSIGN_OR_RETURN(
+ NamespaceId namespace_id,
+ namespace_mapper_->GetOrPut(document_wrapper.document().namespace_(),
+ namespace_mapper_->num_keys()));
+
+ // Updates key mapper and document_id mapper with the new document
+ DocumentId new_document_id = document_id_mapper_->num_elements();
+ ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
+ MakeFingerprint(namespace_id, document_wrapper.document().namespace_(),
+ document_wrapper.document().uri()),
+ new_document_id));
+ ICING_RETURN_IF_ERROR(
+ document_id_mapper_->Set(new_document_id, iterator.GetOffset()));
+
+ SchemaTypeId schema_type_id;
+ auto schema_type_id_or =
+ schema_store_->GetSchemaTypeId(document_wrapper.document().schema());
+ if (absl_ports::IsNotFound(schema_type_id_or.status())) {
+ // Didn't find a SchemaTypeId. This means that the DocumentStore and
+ // the SchemaStore are out of sync. But DocumentStore can't do
+ // anything about it so just ignore this for now. This should be
+ // detected/handled by the owner of DocumentStore. Set it to some
+ // arbitrary invalid value for now, it'll get updated to the correct
+ // ID later.
+ schema_type_id = -1;
+ } else if (!schema_type_id_or.ok()) {
+ // Real error. Pass it up
+ return schema_type_id_or.status();
+ } else {
+ // We're guaranteed that SchemaTypeId is valid now
+ schema_type_id = schema_type_id_or.ValueOrDie();
}
+
+ // Update corpus maps
+ std::string corpus =
+ MakeFingerprint(namespace_id, document_wrapper.document().namespace_(),
+ document_wrapper.document().schema());
+ ICING_ASSIGN_OR_RETURN(
+ CorpusId corpusId,
+ corpus_mapper_->GetOrPut(corpus, corpus_mapper_->num_keys()));
+
+ ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
+ GetCorpusAssociatedScoreDataToUpdate(corpusId));
+ scoring_data.AddDocument(
+ document_wrapper.document().internal_fields().length_in_tokens());
+
+ ICING_RETURN_IF_ERROR(
+ UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
+
+ ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+ new_document_id,
+ DocumentAssociatedScoreData(
+ corpusId, document_wrapper.document().score(),
+ document_wrapper.document().creation_timestamp_ms(),
+ document_wrapper.document().internal_fields().length_in_tokens())));
+
+ int64_t expiration_timestamp_ms = CalculateExpirationTimestampMs(
+ document_wrapper.document().creation_timestamp_ms(),
+ document_wrapper.document().ttl_ms());
+
+ ICING_RETURN_IF_ERROR(UpdateFilterCache(
+ new_document_id, DocumentFilterData(namespace_id, schema_type_id,
+ expiration_timestamp_ms)));
iterator_status = iterator.Advance();
}
@@ -460,6 +677,10 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() {
"Failed to iterate through proto log.");
}
+ // Shrink usage_store_ to the correct size.
+ ICING_RETURN_IF_ERROR(
+ usage_store_->TruncateTo(document_id_mapper_->num_elements()));
+
// Write the header
ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
ICING_RETURN_IF_ERROR(UpdateHeader(checksum));
@@ -468,22 +689,33 @@ libtextclassifier3::Status DocumentStore::RegenerateDerivedFiles() {
}
libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
+ // Only one type of KeyMapper (either DynamicTrieKeyMapper or
+ // PersistentHashMapKeyMapper) will actually exist at any moment, but it is ok
+ // to call Delete() for both since Delete() returns OK if any of them doesn't
+ // exist.
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
document_key_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status =
- KeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
+ DynamicTrieKeyMapper<DocumentId>::Delete(*filesystem_, base_dir_);
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete old key mapper";
+ << "Failed to delete old dynamic trie key mapper";
+ return status;
+ }
+ status = PersistentHashMapKeyMapper<DocumentId>::Delete(
+ *filesystem_, MakeUriHashMapperWorkingPath(base_dir_));
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message()
+ << "Failed to delete old persistent hash map key mapper";
return status;
}
- // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
- auto document_key_mapper_or =
- KeyMapper<DocumentId>::Create(*filesystem_, base_dir_, kUriMapperMaxSize);
+ auto document_key_mapper_or = CreateUriMapper(
+ *filesystem_, base_dir_, pre_mapping_fbv_, use_persistent_hash_map_);
if (!document_key_mapper_or.ok()) {
ICING_LOG(ERROR) << document_key_mapper_or.status().error_message()
<< "Failed to re-init key mapper";
@@ -496,7 +728,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentKeyMapper() {
libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
document_id_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
libtextclassifier3::Status status = FileBackedVector<int64_t>::Delete(
*filesystem_, MakeDocumentIdMapperFilename(base_dir_));
@@ -505,7 +737,7 @@ libtextclassifier3::Status DocumentStore::ResetDocumentIdMapper() {
<< "Failed to delete old document_id mapper";
return status;
}
- // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // TODO(b/216487496): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
auto document_id_mapper_or = FileBackedVector<int64_t>::Create(
*filesystem_, MakeDocumentIdMapperFilename(base_dir_),
@@ -531,6 +763,18 @@ libtextclassifier3::Status DocumentStore::ResetDocumentAssociatedScoreCache() {
return libtextclassifier3::Status::OK;
}
+libtextclassifier3::Status DocumentStore::ResetCorpusAssociatedScoreCache() {
+ // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
+ corpus_score_cache_.reset();
+ ICING_RETURN_IF_ERROR(FileBackedVector<CorpusAssociatedScoreData>::Delete(
+ *filesystem_, MakeCorpusScoreCache(base_dir_)));
+ ICING_ASSIGN_OR_RETURN(corpus_score_cache_,
+ FileBackedVector<CorpusAssociatedScoreData>::Create(
+ *filesystem_, MakeCorpusScoreCache(base_dir_),
+ MemoryMappedFile::READ_WRITE_AUTO_SYNC));
+ return libtextclassifier3::Status::OK;
+}
+
libtextclassifier3::Status DocumentStore::ResetFilterCache() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
filter_cache_.reset();
@@ -546,9 +790,9 @@ libtextclassifier3::Status DocumentStore::ResetFilterCache() {
libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
// TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
namespace_mapper_.reset();
- // TODO(b/144458732): Implement a more robust version of TC_RETURN_IF_ERROR
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
// that can support error logging.
- libtextclassifier3::Status status = KeyMapper<NamespaceId>::Delete(
+ libtextclassifier3::Status status = DynamicTrieKeyMapper<NamespaceId>::Delete(
*filesystem_, MakeNamespaceMapperFilename(base_dir_));
if (!status.ok()) {
ICING_LOG(ERROR) << status.error_message()
@@ -557,9 +801,33 @@ libtextclassifier3::Status DocumentStore::ResetNamespaceMapper() {
}
ICING_ASSIGN_OR_RETURN(
namespace_mapper_,
- KeyMapper<NamespaceId>::Create(*filesystem_,
- MakeNamespaceMapperFilename(base_dir_),
- kNamespaceMapperMaxSize));
+ DynamicTrieKeyMapper<NamespaceId>::Create(
+ *filesystem_, MakeNamespaceMapperFilename(base_dir_),
+ kNamespaceMapperMaxSize));
+ return libtextclassifier3::Status::OK;
+}
+
+libtextclassifier3::Status DocumentStore::ResetCorpusMapper() {
+ // TODO(b/139734457): Replace ptr.reset()->Delete->Create flow with Reset().
+ corpus_mapper_.reset();
+ // TODO(b/216487496): Implement a more robust version of TC_RETURN_IF_ERROR
+ // that can support error logging.
+ libtextclassifier3::Status status = DynamicTrieKeyMapper<CorpusId>::Delete(
+ *filesystem_, MakeCorpusMapperFilename(base_dir_));
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message()
+ << "Failed to delete old corpus_id mapper";
+ return status;
+ }
+ auto corpus_mapper_or =
+ DynamicTrieKeyMapper<CorpusId,
+ fingerprint_util::FingerprintStringFormatter>::
+ Create(*filesystem_, MakeCorpusMapperFilename(base_dir_),
+ kCorpusMapperMaxSize);
+ if (!corpus_mapper_or.ok()) {
+ return std::move(corpus_mapper_or).status();
+ }
+ corpus_mapper_ = std::move(corpus_mapper_or).ValueOrDie();
return libtextclassifier3::Status::OK;
}
@@ -576,7 +844,15 @@ libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const {
}
Crc32 document_log_checksum = std::move(checksum_or).ValueOrDie();
- Crc32 document_key_mapper_checksum = document_key_mapper_->ComputeChecksum();
+ // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // that can support error logging.
+ checksum_or = document_key_mapper_->ComputeChecksum();
+ if (!checksum_or.ok()) {
+ ICING_LOG(ERROR) << checksum_or.status().error_message()
+ << "Failed to compute checksum of DocumentKeyMapper";
+ return checksum_or.status();
+ }
+ Crc32 document_key_mapper_checksum = std::move(checksum_or).ValueOrDie();
// TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
@@ -608,7 +884,40 @@ libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const {
}
Crc32 filter_cache_checksum = std::move(checksum_or).ValueOrDie();
- Crc32 namespace_mapper_checksum = namespace_mapper_->ComputeChecksum();
+ // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // that can support error logging.
+ checksum_or = namespace_mapper_->ComputeChecksum();
+ if (!checksum_or.ok()) {
+ ICING_LOG(ERROR) << checksum_or.status().error_message()
+ << "Failed to compute checksum of namespace mapper";
+ return checksum_or.status();
+ }
+ Crc32 namespace_mapper_checksum = std::move(checksum_or).ValueOrDie();
+
+ // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // that can support error logging.
+ checksum_or = corpus_mapper_->ComputeChecksum();
+ if (!checksum_or.ok()) {
+ ICING_LOG(ERROR) << checksum_or.status().error_message()
+ << "Failed to compute checksum of corpus mapper";
+ return checksum_or.status();
+ }
+ Crc32 corpus_mapper_checksum = std::move(checksum_or).ValueOrDie();
+
+ // TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
+ // that can support error logging.
+ checksum_or = corpus_score_cache_->ComputeChecksum();
+ if (!checksum_or.ok()) {
+ ICING_LOG(ERROR) << checksum_or.status().error_message()
+ << "Failed to compute checksum of corpus score cache";
+ return checksum_or.status();
+ }
+ Crc32 corpus_score_cache_checksum = std::move(checksum_or).ValueOrDie();
+
+ // NOTE: We purposely don't include usage_store checksum here because we can't
+ // regenerate it from ground truth documents. If it gets corrupted, we'll just
+ // clear all usage reports, but we shouldn't throw everything else in the
+ // document store out.
total_checksum.Append(std::to_string(document_log_checksum.Get()));
total_checksum.Append(std::to_string(document_key_mapper_checksum.Get()));
@@ -616,6 +925,8 @@ libtextclassifier3::StatusOr<Crc32> DocumentStore::ComputeChecksum() const {
total_checksum.Append(std::to_string(score_cache_checksum.Get()));
total_checksum.Append(std::to_string(filter_cache_checksum.Get()));
total_checksum.Append(std::to_string(namespace_mapper_checksum.Get()));
+ total_checksum.Append(std::to_string(corpus_mapper_checksum.Get()));
+ total_checksum.Append(std::to_string(corpus_score_cache_checksum.Get()));
return total_checksum;
}
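How the total checksum composes, in miniature (a sketch; Crc32 is icing's
wrapper, and Append extends the running CRC over the given bytes; the
component names are hypothetical):

  Crc32 total;
  for (const Crc32& component :
       {log_crc, key_mapper_crc, id_mapper_crc, score_crc, filter_crc,
        namespace_crc, corpus_crc, corpus_score_crc}) {
    total.Append(std::to_string(component.Get()));
  }
  // Order matters, and any single component change perturbs the total,
  // which is what lets Initialize detect inconsistent derived files.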
@@ -635,27 +946,37 @@ bool DocumentStore::HeaderExists() {
libtextclassifier3::Status DocumentStore::UpdateHeader(const Crc32& checksum) {
// Write the header
DocumentStore::Header header;
- header.magic = DocumentStore::Header::kMagic;
+ header.magic =
+ DocumentStore::Header::GetCurrentMagic(namespace_id_fingerprint_);
header.checksum = checksum.Get();
// This should overwrite the header.
- if (!filesystem_->Write(MakeHeaderFilename(base_dir_).c_str(), &header,
- sizeof(header))) {
+ ScopedFd sfd(
+ filesystem_->OpenForWrite(MakeHeaderFilename(base_dir_).c_str()));
+ if (!sfd.is_valid() ||
+ !filesystem_->Write(sfd.get(), &header, sizeof(header)) ||
+ !filesystem_->DataSync(sfd.get())) {
return absl_ports::InternalError(absl_ports::StrCat(
"Failed to write DocStore header: ", MakeHeaderFilename(base_dir_)));
}
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
- DocumentProto&& document) {
+libtextclassifier3::StatusOr<DocumentId> DocumentStore::InternalPut(
+ DocumentProto&& document, PutDocumentStatsProto* put_document_stats) {
+ std::unique_ptr<Timer> put_timer = clock_.GetNewTimer();
ICING_RETURN_IF_ERROR(document_validator_.Validate(document));
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_document_size(document.ByteSizeLong());
+ }
+
// Copy fields needed before they are moved
std::string name_space = document.namespace_();
std::string uri = document.uri();
std::string schema = document.schema();
int document_score = document.score();
+ int32_t length_in_tokens = document.internal_fields().length_in_tokens();
int64_t creation_timestamp_ms = document.creation_timestamp_ms();
// Sets the creation timestamp if caller hasn't specified.
@@ -688,19 +1009,40 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
// Creates a new document id, updates key mapper and document_id mapper
DocumentId new_document_id = document_id_mapper_->num_elements();
- ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
- MakeFingerprint(name_space, uri), new_document_id));
- ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));
-
- ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
- new_document_id,
- DocumentAssociatedScoreData(document_score, creation_timestamp_ms)));
+ if (!IsDocumentIdValid(new_document_id)) {
+ return absl_ports::ResourceExhaustedError(
+ "Exceeded maximum number of documents. Try calling Optimize to reclaim "
+ "some space.");
+ }
// Update namespace maps
ICING_ASSIGN_OR_RETURN(
NamespaceId namespace_id,
namespace_mapper_->GetOrPut(name_space, namespace_mapper_->num_keys()));
+ // Updates key mapper and document_id mapper
+ ICING_RETURN_IF_ERROR(document_key_mapper_->Put(
+ MakeFingerprint(namespace_id, name_space, uri), new_document_id));
+ ICING_RETURN_IF_ERROR(document_id_mapper_->Set(new_document_id, file_offset));
+
+ // Update corpus maps
+ ICING_ASSIGN_OR_RETURN(CorpusId corpusId,
+ corpus_mapper_->GetOrPut(
+ MakeFingerprint(namespace_id, name_space, schema),
+ corpus_mapper_->num_keys()));
+
+ ICING_ASSIGN_OR_RETURN(CorpusAssociatedScoreData scoring_data,
+ GetCorpusAssociatedScoreDataToUpdate(corpusId));
+ scoring_data.AddDocument(length_in_tokens);
+
+ ICING_RETURN_IF_ERROR(
+ UpdateCorpusAssociatedScoreCache(corpusId, scoring_data));
+
+ ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+ new_document_id,
+ DocumentAssociatedScoreData(corpusId, document_score,
+ creation_timestamp_ms, length_in_tokens)));
+
ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
schema_store_->GetSchemaTypeId(schema));
@@ -709,21 +1051,40 @@ libtextclassifier3::StatusOr<DocumentId> DocumentStore::Put(
expiration_timestamp_ms)));
if (old_document_id_or.ok()) {
- // Mark the old document id as deleted.
- ICING_RETURN_IF_ERROR(document_id_mapper_->Set(
- old_document_id_or.ValueOrDie(), kDocDeletedFlag));
+ // The old document exists, copy over the usage scores and delete the old
+ // document.
+ DocumentId old_document_id = old_document_id_or.ValueOrDie();
+
+ ICING_RETURN_IF_ERROR(
+ usage_store_->CloneUsageScores(/*from_document_id=*/old_document_id,
+ /*to_document_id=*/new_document_id));
+
+ // Delete the old document. It's fine if it's not found since it might have
+ // been deleted previously.
+ auto delete_status =
+ Delete(old_document_id, clock_.GetSystemTimeMilliseconds());
+ if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
+ // Real error, pass it up.
+ return delete_status;
+ }
+ }
+
+ if (put_document_stats != nullptr) {
+ put_document_stats->set_document_store_latency_ms(
+ put_timer->GetElapsedMilliseconds());
}
return new_document_id;
}
libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
- const std::string_view name_space, const std::string_view uri) const {
+ const std::string_view name_space, const std::string_view uri,
+ bool clear_internal_fields) const {
// TODO(b/147231617): Make a better way to replace the error message in an
// existing Status.
auto document_id_or = GetDocumentId(name_space, uri);
if (absl_ports::IsNotFound(document_id_or.status())) {
- ICING_LOG(ERROR) << document_id_or.status().error_message();
+ ICING_VLOG(1) << document_id_or.status().error_message();
return libtextclassifier3::Status(
document_id_or.status().CanonicalCode(),
IcingStringUtil::StringPrintf("Document (%s, %s) not found.",
@@ -745,9 +1106,30 @@ libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
}
libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
- DocumentId document_id) const {
- ICING_ASSIGN_OR_RETURN(int64_t document_log_offset,
- DoesDocumentExistAndGetFileOffset(document_id));
+ DocumentId document_id, bool clear_internal_fields) const {
+ int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
+ auto document_filter_data_optional =
+ GetAliveDocumentFilterData(document_id, current_time_ms);
+ if (!document_filter_data_optional) {
+ // The document doesn't exist. Return InvalidArgumentError if the document
+ // id itself is invalid; otherwise return NOT_FOUND.
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Document id '%d' invalid.", document_id));
+ }
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Document id '%d' doesn't exist", document_id));
+ }
+
+ auto document_log_offset_or = document_id_mapper_->Get(document_id);
+ if (!document_log_offset_or.ok()) {
+ // We've just checked above that document_id is valid, so this lookup
+ // should never fail.
+ return absl_ports::InternalError("Failed to find document offset.");
+ }
+ int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
// TODO(b/144458732): Implement a more robust version of TC_ASSIGN_OR_RETURN
// that can support error logging.
@@ -759,30 +1141,57 @@ libtextclassifier3::StatusOr<DocumentProto> DocumentStore::Get(
}
DocumentWrapper document_wrapper =
std::move(document_wrapper_or).ValueOrDie();
+ if (clear_internal_fields) {
+ document_wrapper.mutable_document()->clear_internal_fields();
+ }
return std::move(*document_wrapper.mutable_document());
}
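+
+// Example (illustrative sketch only): internal fields such as
+// length_in_tokens are stripped by default; internal callers (e.g.
+// OptimizeInto below) pass clear_internal_fields=false to keep them.
+//
+//   ICING_ASSIGN_OR_RETURN(DocumentProto doc,
+//                          store->Get(id, /*clear_internal_fields=*/false));
+//   int32_t num_tokens = doc.internal_fields().length_in_tokens();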
libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
const std::string_view name_space, const std::string_view uri) const {
- auto document_id_or =
- document_key_mapper_->Get(MakeFingerprint(name_space, uri));
- if (!document_id_or.ok()) {
- return absl_ports::Annotate(
- document_id_or.status(),
- absl_ports::StrCat("Failed to find DocumentId by key: ", name_space,
- ", ", uri));
+ auto namespace_id_or = namespace_mapper_->Get(name_space);
+ libtextclassifier3::Status status = namespace_id_or.status();
+ if (status.ok()) {
+ NamespaceId namespace_id = namespace_id_or.ValueOrDie();
+ auto document_id_or = document_key_mapper_->Get(
+ MakeFingerprint(namespace_id, name_space, uri));
+ status = document_id_or.status();
+ if (status.ok()) {
+ // Guaranteed to have a DocumentId now
+ return document_id_or.ValueOrDie();
+ }
+ }
+ return absl_ports::Annotate(
+ status, absl_ports::StrCat(
+ "Failed to find DocumentId by key: ", name_space, ", ", uri));
+}
+
+libtextclassifier3::StatusOr<DocumentId> DocumentStore::GetDocumentId(
+ const NamespaceFingerprintIdentifier& namespace_fingerprint_identifier)
+ const {
+ if (!namespace_id_fingerprint_) {
+ return absl_ports::FailedPreconditionError(
+ "Cannot lookup document id by namespace id + fingerprint without "
+ "enabling it on uri_mapper");
}
- // Guaranteed to have a DocumentId now
- return document_id_or.ValueOrDie();
+ auto document_id_or = document_key_mapper_->Get(
+ namespace_fingerprint_identifier.EncodeToCString());
+ if (document_id_or.ok()) {
+ return document_id_or.ValueOrDie();
+ }
+ return absl_ports::Annotate(
+ std::move(document_id_or).status(),
+ "Failed to find DocumentId by namespace id + fingerprint");
}
std::vector<std::string> DocumentStore::GetAllNamespaces() const {
std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
- namespace_mapper_->GetValuesToKeys();
+ GetNamespaceIdsToNamespaces(namespace_mapper_.get());
std::unordered_set<NamespaceId> existing_namespace_ids;
+ int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
++document_id) {
// filter_cache_->Get can only fail if document_id is < 0
@@ -795,7 +1204,7 @@ std::vector<std::string> DocumentStore::GetAllNamespaces() const {
}
const DocumentFilterData* data = status_or_data.ValueOrDie();
- if (DoesDocumentExist(document_id)) {
+ if (GetAliveDocumentFilterData(document_id, current_time_ms)) {
existing_namespace_ids.insert(data->namespace_id());
}
}
@@ -808,44 +1217,54 @@ std::vector<std::string> DocumentStore::GetAllNamespaces() const {
return existing_namespaces;
}
-libtextclassifier3::StatusOr<int64_t>
-DocumentStore::DoesDocumentExistAndGetFileOffset(DocumentId document_id) const {
- if (!IsDocumentIdValid(document_id)) {
- return absl_ports::InvalidArgumentError(
- IcingStringUtil::StringPrintf("DocumentId %d is invalid", document_id));
+std::optional<DocumentFilterData> DocumentStore::GetAliveDocumentFilterData(
+ DocumentId document_id, int64_t current_time_ms) const {
+ if (IsDeleted(document_id)) {
+ return std::nullopt;
}
+ return GetNonExpiredDocumentFilterData(document_id, current_time_ms);
+}
+bool DocumentStore::IsDeleted(DocumentId document_id) const {
auto file_offset_or = document_id_mapper_->Get(document_id);
-
- bool deleted =
- file_offset_or.ok() && *file_offset_or.ValueOrDie() == kDocDeletedFlag;
- if (deleted || absl_ports::IsOutOfRange(file_offset_or.status())) {
- // Document has been deleted or doesn't exist
- return absl_ports::NotFoundError(
- IcingStringUtil::StringPrintf("Document %d not found", document_id));
+ if (!file_offset_or.ok()) {
+ // This can only happen if document_id is out of range of the
+ // document_id_mapper, i.e. we got an invalid document_id. Callers should
+ // already have validated their document_id. Regardless, return true since
+ // the document doesn't exist.
+ return true;
}
+ int64_t file_offset = *file_offset_or.ValueOrDie();
+ return file_offset == kDocDeletedFlag;
+}
- ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
- filter_cache_->Get(document_id));
- if (clock_.GetSystemTimeMilliseconds() >=
- filter_data->expiration_timestamp_ms()) {
- // Past the expiration time, so also return NOT FOUND since it *shouldn't*
- // exist anymore.
- return absl_ports::NotFoundError(
- IcingStringUtil::StringPrintf("Document %d not found", document_id));
+// Returns DocumentFilterData if the document is not expired. Otherwise,
+// std::nullopt.
+std::optional<DocumentFilterData>
+DocumentStore::GetNonExpiredDocumentFilterData(DocumentId document_id,
+ int64_t current_time_ms) const {
+ auto filter_data_or = filter_cache_->GetCopy(document_id);
+ if (!filter_data_or.ok()) {
+ // This can only happen if document_id is out of range of the
+ // filter_cache, i.e. we got an invalid document_id. Callers should
+ // already have validated their document_id. Regardless, return
+ // std::nullopt since the document doesn't exist.
+ return std::nullopt;
}
+ DocumentFilterData document_filter_data = filter_data_or.ValueOrDie();
- ICING_RETURN_IF_ERROR(file_offset_or.status());
- return *file_offset_or.ValueOrDie();
-}
-
-bool DocumentStore::DoesDocumentExist(DocumentId document_id) const {
- // If we can successfully get the document log offset, the document exists.
- return DoesDocumentExistAndGetFileOffset(document_id).ok();
+ // Check if it's past the expiration time
+ if (current_time_ms >= document_filter_data.expiration_timestamp_ms()) {
+ return std::nullopt;
+ }
+ return document_filter_data;
}
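+
+// To summarize the liveness checks above (a descriptive note, not new
+// behavior), a document id resolves to one of three states:
+//   deleted: document_id_mapper_ holds kDocDeletedFlag (or the id is out of
+//            range), so IsDeleted() returns true.
+//   expired: filter data exists but current_time_ms >= its
+//            expiration_timestamp_ms, so GetNonExpiredDocumentFilterData()
+//            returns std::nullopt.
+//   alive:   GetAliveDocumentFilterData() returns the DocumentFilterData.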
libtextclassifier3::Status DocumentStore::Delete(
- const std::string_view name_space, const std::string_view uri) {
+ const std::string_view name_space, const std::string_view uri,
+ int64_t current_time_ms) {
// Try to get the DocumentId first
auto document_id_or = GetDocumentId(name_space, uri);
if (!document_id_or.ok()) {
@@ -854,36 +1273,33 @@ libtextclassifier3::Status DocumentStore::Delete(
absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
", uri: ", uri));
}
+ return Delete(document_id_or.ValueOrDie(), current_time_ms);
+}
- // Check if the DocumentId's Document still exists.
- DocumentId document_id = document_id_or.ValueOrDie();
- auto file_offset_or = DoesDocumentExistAndGetFileOffset(document_id);
- if (!file_offset_or.ok()) {
- return absl_ports::Annotate(
- file_offset_or.status(),
- absl_ports::StrCat("Failed to delete Document. namespace: ", name_space,
- ", uri: ", uri));
+libtextclassifier3::Status DocumentStore::Delete(DocumentId document_id,
+ int64_t current_time_ms) {
+ auto document_filter_data_optional =
+ GetAliveDocumentFilterData(document_id, current_time_ms);
+ if (!document_filter_data_optional) {
+ // The document doesn't exist. Return InvalidArgumentError if the document
+ // id itself is invalid; otherwise return NOT_FOUND.
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Document id '%d' invalid.", document_id));
+ }
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "Document id '%d' doesn't exist", document_id));
}
- // Update ground truth first.
- // To delete a proto we don't directly remove it. Instead, we mark it as
- // deleted first by appending a tombstone of it and actually remove it from
- // file later in Optimize()
- // TODO(b/144458732): Implement a more robust version of ICING_RETURN_IF_ERROR
- // that can support error logging.
- libtextclassifier3::Status status =
- document_log_->WriteProto(CreateDocumentTombstone(name_space, uri))
- .status();
- if (!status.ok()) {
- return absl_ports::Annotate(
- status, absl_ports::StrCat("Failed to delete Document. namespace: ",
- name_space, ", uri: ", uri));
+ auto document_log_offset_or = document_id_mapper_->Get(document_id);
+ if (!document_log_offset_or.ok()) {
+ return absl_ports::InternalError("Failed to find document offset.");
}
+ int64_t document_log_offset = *document_log_offset_or.ValueOrDie();
- ICING_RETURN_IF_ERROR(
- document_id_mapper_->Set(document_id_or.ValueOrDie(), kDocDeletedFlag));
-
- return libtextclassifier3::Status::OK;
+ // Erases the document proto.
+ ICING_RETURN_IF_ERROR(document_log_->EraseProto(document_log_offset));
+ return ClearDerivedData(document_id);
}
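+
+// Example (hypothetical): deleting the same id twice surfaces NOT_FOUND on
+// the second call, which callers such as Put() above deliberately tolerate.
+//
+//   int64_t now = clock.GetSystemTimeMilliseconds();
+//   ICING_RETURN_IF_ERROR(store->Delete(id, now));
+//   auto status = store->Delete(id, now);  // absl_ports::IsNotFound(status)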
libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
@@ -891,163 +1307,269 @@ libtextclassifier3::StatusOr<NamespaceId> DocumentStore::GetNamespaceId(
return namespace_mapper_->Get(name_space);
}
+libtextclassifier3::StatusOr<CorpusId> DocumentStore::GetCorpusId(
+ const std::string_view name_space, const std::string_view schema) const {
+ ICING_ASSIGN_OR_RETURN(NamespaceId namespace_id,
+ namespace_mapper_->Get(name_space));
+ return corpus_mapper_->Get(MakeFingerprint(namespace_id, name_space, schema));
+}
+
+libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
+ ResultSpecProto::ResultGroupingType result_group_type,
+ const std::string_view name_space, const std::string_view schema) const {
+ auto namespace_id_or = GetNamespaceId(name_space);
+ auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema);
+ switch (result_group_type) {
+ case ResultSpecProto::NONE:
+ return absl_ports::InvalidArgumentError(
+ "Cannot group by ResultSpecProto::NONE");
+ case ResultSpecProto::SCHEMA_TYPE:
+ if (schema_type_id_or.ok()) {
+ return schema_type_id_or.ValueOrDie();
+ }
+ break;
+ case ResultSpecProto::NAMESPACE:
+ if (namespace_id_or.ok()) {
+ return namespace_id_or.ValueOrDie();
+ }
+ break;
+ case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
+ if (namespace_id.ok() && schema_type_id.ok()) {
+ // TODO(b/258715421): Temporary workaround to get a
+ // ResultGroupingEntryId given the Namespace string
+ // and Schema string.
+ return namespace_id_or.ValueOrDie() << 16 | schema_type_id_or.ValueOrDie();
+ }
+ break;
+ }
+ return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
+}
+
+libtextclassifier3::StatusOr<int32_t> DocumentStore::GetResultGroupingEntryId(
+ ResultSpecProto::ResultGroupingType result_group_type,
+ const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const {
+ switch (result_group_type) {
+ case ResultSpecProto::NONE:
+ return absl_ports::InvalidArgumentError(
+ "Cannot group by ResultSpecProto::NONE");
+ case ResultSpecProto::SCHEMA_TYPE:
+ return schema_type_id;
+ case ResultSpecProto::NAMESPACE:
+ return namespace_id;
+ case ResultSpecProto::NAMESPACE_AND_SCHEMA_TYPE:
+ // TODO(b/258715421): Temporary workaround to get a ResultGroupingEntryId
+ // given the Namespace Id and SchemaType Id.
+ return namespace_id << 16 | schema_type_id;
+ }
+ return absl_ports::NotFoundError("Cannot generate ResultGrouping Entry Id");
+}
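+
+// Example (sketch of the packing above): with namespace_id = 3 and
+// schema_type_id = 5, NAMESPACE_AND_SCHEMA_TYPE yields
+// (3 << 16) | 5 = 0x00030005. Ids wider than 16 bits would collide, which is
+// why the TODO above calls this a temporary workaround.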
+
libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
DocumentStore::GetDocumentAssociatedScoreData(DocumentId document_id) const {
- auto score_data_or = score_cache_->Get(document_id);
+ auto score_data_or = score_cache_->GetCopy(document_id);
if (!score_data_or.ok()) {
ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
<< " from score_cache_";
- return score_data_or.status();
+ return absl_ports::NotFoundError(
+ std::move(score_data_or).status().error_message());
}
- return *std::move(score_data_or).ValueOrDie();
+
+ DocumentAssociatedScoreData document_associated_score_data =
+ std::move(score_data_or).ValueOrDie();
+ return document_associated_score_data;
}
-libtextclassifier3::StatusOr<DocumentFilterData>
-DocumentStore::GetDocumentFilterData(DocumentId document_id) const {
- auto filter_data_or = filter_cache_->Get(document_id);
- if (!filter_data_or.ok()) {
- ICING_LOG(ERROR) << " while trying to access DocumentId " << document_id
- << " from filter_cache_";
- return filter_data_or.status();
+libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
+DocumentStore::GetCorpusAssociatedScoreData(CorpusId corpus_id) const {
+ auto score_data_or = corpus_score_cache_->GetCopy(corpus_id);
+ if (!score_data_or.ok()) {
+ return score_data_or.status();
}
- return *std::move(filter_data_or).ValueOrDie();
+
+ CorpusAssociatedScoreData corpus_associated_score_data =
+ std::move(score_data_or).ValueOrDie();
+ return corpus_associated_score_data;
}
-libtextclassifier3::Status DocumentStore::DeleteByNamespace(
- std::string_view name_space) {
- auto namespace_id_or = namespace_mapper_->Get(name_space);
- if (!namespace_id_or.ok()) {
- return absl_ports::Annotate(
- namespace_id_or.status(),
- absl_ports::StrCat("Failed to delete by namespace. namespace: ",
- name_space));
+libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
+DocumentStore::GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const {
+ auto corpus_scoring_data_or = GetCorpusAssociatedScoreData(corpus_id);
+ if (corpus_scoring_data_or.ok()) {
+ return std::move(corpus_scoring_data_or).ValueOrDie();
+ }
+ CorpusAssociatedScoreData scoring_data;
+ // OUT_OF_RANGE is the StatusCode returned when a corpus id is added to
+ // corpus_score_cache_ for the first time; return a default-constructed
+ // CorpusAssociatedScoreData in that case.
+ if (corpus_scoring_data_or.status().CanonicalCode() ==
+ libtextclassifier3::StatusCode::OUT_OF_RANGE) {
+ return scoring_data;
}
+ return corpus_scoring_data_or.status();
+}
- // Update ground truth first.
- // To delete an entire namespace, we append a tombstone that only contains
- // the deleted bit and the name of the deleted namespace.
- // TODO(b/144458732): Implement a more robust version of
- // ICING_RETURN_IF_ERROR that can support error logging.
- libtextclassifier3::Status status =
- document_log_->WriteProto(CreateNamespaceTombstone(name_space)).status();
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete namespace. namespace = "
- << name_space;
- return status;
+// TODO(b/273826815): Decide on and adopt a consistent pattern for handling
+// NOT_FOUND 'errors' returned by our internal classes.
+std::optional<UsageStore::UsageScores> DocumentStore::GetUsageScores(
+ DocumentId document_id, int64_t current_time_ms) const {
+ std::optional<DocumentFilterData> opt =
+ GetAliveDocumentFilterData(document_id, current_time_ms);
+ if (!opt) {
+ return std::nullopt;
+ }
+ if (document_id >= usage_store_->num_elements()) {
+ return std::nullopt;
}
+ auto usage_scores_or = usage_store_->GetUsageScores(document_id);
+ if (!usage_scores_or.ok()) {
+ ICING_LOG(ERROR) << "Error retrieving usage for " << document_id << ": "
+ << usage_scores_or.status().error_message();
+ return std::nullopt;
+ }
+ return std::move(usage_scores_or).ValueOrDie();
+}
- ICING_ASSIGN_OR_RETURN(bool updated_existing_document,
- UpdateDerivedFilesNamespaceDeleted(name_space));
- if (!updated_existing_document) {
- // Treat the fact that no existing documents had this namespace to be the
- // same as this namespace not existing at all.
- return absl_ports::NotFoundError(
- absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
+libtextclassifier3::Status DocumentStore::ReportUsage(
+ const UsageReport& usage_report) {
+ ICING_ASSIGN_OR_RETURN(DocumentId document_id,
+ GetDocumentId(usage_report.document_namespace(),
+ usage_report.document_uri()));
+ // We can use the internal version here because we got our document_id from
+ // our internal data structures; we would already have returned an error if
+ // the namespace and/or uri were incorrect.
+ int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
+ if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
+ // Document was probably deleted or expired.
+ return absl_ports::NotFoundError(absl_ports::StrCat(
+ "Couldn't report usage on a nonexistent document: (namespace: '",
+ usage_report.document_namespace(), "', uri: '",
+ usage_report.document_uri(), "')"));
}
- return libtextclassifier3::Status::OK;
+
+ return usage_store_->AddUsageReport(usage_report, document_id);
}
-libtextclassifier3::StatusOr<bool>
-DocumentStore::UpdateDerivedFilesNamespaceDeleted(std::string_view name_space) {
+DocumentStore::DeleteByGroupResult DocumentStore::DeleteByNamespace(
+ std::string_view name_space) {
+ DeleteByGroupResult result;
auto namespace_id_or = namespace_mapper_->Get(name_space);
if (!namespace_id_or.ok()) {
- return namespace_id_or.status();
+ result.status = absl_ports::Annotate(
+ namespace_id_or.status(),
+ absl_ports::StrCat("Failed to find namespace: ", name_space));
+ return result;
}
-
- // Guaranteed to have a NamespaceId now.
NamespaceId namespace_id = namespace_id_or.ValueOrDie();
+ auto num_deleted_or = BatchDelete(namespace_id, kInvalidSchemaTypeId);
+ if (!num_deleted_or.ok()) {
+ result.status = std::move(num_deleted_or).status();
+ return result;
+ }
- // Tracks if there were any existing documents with this namespace that we
- // will mark as deleted.
- bool updated_existing_document = false;
-
- // Traverse FilterCache and delete all docs that match namespace_id
- for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
- ++document_id) {
- // filter_cache_->Get can only fail if document_id is < 0
- // or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
- ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
- filter_cache_->Get(document_id));
- if (data->namespace_id() == namespace_id) {
- if (DoesDocumentExist(document_id)) {
- updated_existing_document = true;
- }
-
- // docid_mapper_->Set can only fail if document_id is < 0
- // or >= docid_mapper_->num_elements. So the only possible way to get an
- // error here would be if filter_cache_->num_elements >
- // docid_mapper_->num_elements, which SHOULD NEVER HAPPEN.
- ICING_RETURN_IF_ERROR(
- document_id_mapper_->Set(document_id, kDocDeletedFlag));
- }
+ result.num_docs_deleted = num_deleted_or.ValueOrDie();
+ if (result.num_docs_deleted <= 0) {
+ // Treat the case where no existing documents have this namespace the same
+ // as the namespace not existing at all.
+ result.status = absl_ports::NotFoundError(
+ absl_ports::StrCat("Namespace '", name_space, "' doesn't exist"));
+ return result;
}
- return updated_existing_document;
+ return result;
}
-libtextclassifier3::Status DocumentStore::DeleteBySchemaType(
+DocumentStore::DeleteByGroupResult DocumentStore::DeleteBySchemaType(
std::string_view schema_type) {
+ DeleteByGroupResult result;
auto schema_type_id_or = schema_store_->GetSchemaTypeId(schema_type);
if (!schema_type_id_or.ok()) {
- return absl_ports::Annotate(
+ result.status = absl_ports::Annotate(
schema_type_id_or.status(),
- absl_ports::StrCat("Failed to delete by schema type. schema_type: ",
+ absl_ports::StrCat("Failed to find schema type. schema_type: ",
schema_type));
+ return result;
}
-
- // Update ground truth first.
- // To delete an entire schema type, we append a tombstone that only contains
- // the deleted bit and the name of the deleted schema type.
- // TODO(b/144458732): Implement a more robust version of
- // ICING_RETURN_IF_ERROR that can support error logging.
- libtextclassifier3::Status status =
- document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type))
- .status();
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete schema_type. schema_type = "
- << schema_type;
- return status;
- }
-
- // Guaranteed to have a SchemaTypeId now
SchemaTypeId schema_type_id = schema_type_id_or.ValueOrDie();
+ auto num_deleted_or = BatchDelete(kInvalidNamespaceId, schema_type_id);
+ if (!num_deleted_or.ok()) {
+ result.status = std::move(num_deleted_or).status();
+ return result;
+ }
- ICING_RETURN_IF_ERROR(UpdateDerivedFilesSchemaTypeDeleted(schema_type_id));
+ result.num_docs_deleted = num_deleted_or.ValueOrDie();
+ if (result.num_docs_deleted <= 0) {
+ result.status = absl_ports::NotFoundError(absl_ports::StrCat(
+ "No documents found with schema type '", schema_type, "'"));
+ return result;
+ }
- return libtextclassifier3::Status::OK;
+ return result;
}
-libtextclassifier3::Status DocumentStore::UpdateDerivedFilesSchemaTypeDeleted(
- SchemaTypeId schema_type_id) {
- // Traverse FilterCache and delete all docs that match schema_type_id.
+libtextclassifier3::StatusOr<int> DocumentStore::BatchDelete(
+ NamespaceId namespace_id, SchemaTypeId schema_type_id) {
+ // Tracks the number of documents that matched the given namespace and/or
+ // schema type and were deleted.
+ int num_updated_documents = 0;
+
+ // Traverse FilterCache and delete all docs that match namespace_id and
+ // schema_type_id.
+ int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
++document_id) {
// filter_cache_->Get can only fail if document_id is < 0
// or >= filter_cache_->num_elements. So, this error SHOULD NEVER HAPPEN.
ICING_ASSIGN_OR_RETURN(const DocumentFilterData* data,
filter_cache_->Get(document_id));
- if (data->schema_type_id() == schema_type_id) {
- // docid_mapper_->Set can only fail if document_id is < 0
- // or >= docid_mapper_->num_elements. So the only possible way to get an
- // error here would be if filter_cache_->num_elements >
- // docid_mapper_->num_elements, which SHOULD NEVER HAPPEN.
- ICING_RETURN_IF_ERROR(
- document_id_mapper_->Set(document_id, kDocDeletedFlag));
+
+ // Check namespace only when the input namespace id is valid.
+ if (namespace_id != kInvalidNamespaceId &&
+ (data->namespace_id() == kInvalidNamespaceId ||
+ data->namespace_id() != namespace_id)) {
+ // The document has already been hard-deleted or isn't from the desired
+ // namespace.
+ continue;
+ }
+
+ // Check schema type only when the input schema type id is valid.
+ if (schema_type_id != kInvalidSchemaTypeId &&
+ (data->schema_type_id() == kInvalidSchemaTypeId ||
+ data->schema_type_id() != schema_type_id)) {
+ // The document has already been hard-deleted or doesn't have the
+ // desired schema type.
+ continue;
+ }
+
+ // The document has the desired namespace and schema type; it either
+ // exists or has expired.
+ libtextclassifier3::Status delete_status =
+ Delete(document_id, current_time_ms);
+ if (absl_ports::IsNotFound(delete_status)) {
+ continue;
+ } else if (!delete_status.ok()) {
+ // Real error, pass up.
+ return delete_status;
}
+ ++num_updated_documents;
}
- return libtextclassifier3::Status::OK;
+ return num_updated_documents;
}
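+
+// Example (descriptive sketch): kInvalidNamespaceId and kInvalidSchemaTypeId
+// act as wildcards in BatchDelete, so the two group deletes above reduce to:
+//
+//   BatchDelete(namespace_id, kInvalidSchemaTypeId);   // DeleteByNamespace
+//   BatchDelete(kInvalidNamespaceId, schema_type_id);  // DeleteBySchemaType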
-libtextclassifier3::Status DocumentStore::PersistToDisk() {
+libtextclassifier3::Status DocumentStore::PersistToDisk(
+ PersistType::Code persist_type) {
+ if (persist_type == PersistType::LITE) {
+ // Only persist the document log.
+ return document_log_->PersistToDisk();
+ }
ICING_RETURN_IF_ERROR(document_log_->PersistToDisk());
ICING_RETURN_IF_ERROR(document_key_mapper_->PersistToDisk());
ICING_RETURN_IF_ERROR(document_id_mapper_->PersistToDisk());
ICING_RETURN_IF_ERROR(score_cache_->PersistToDisk());
ICING_RETURN_IF_ERROR(filter_cache_->PersistToDisk());
ICING_RETURN_IF_ERROR(namespace_mapper_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(usage_store_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(corpus_mapper_->PersistToDisk());
+ ICING_RETURN_IF_ERROR(corpus_score_cache_->PersistToDisk());
// Update the combined checksum and write to header file.
ICING_ASSIGN_OR_RETURN(Crc32 checksum, ComputeChecksum());
@@ -1056,23 +1578,140 @@ libtextclassifier3::Status DocumentStore::PersistToDisk() {
return libtextclassifier3::Status::OK;
}
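+
+// Example (illustrative, assuming a DocumentStore* store): LITE persists only
+// the ground truth (document log), trading durability of derived files for
+// speed; FULL flushes every member and rewrites the header checksum.
+//
+//   ICING_RETURN_IF_ERROR(store->PersistToDisk(PersistType::LITE));  // cheap
+//   ICING_RETURN_IF_ERROR(store->PersistToDisk(PersistType::FULL));  // durable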
-libtextclassifier3::StatusOr<int64_t> DocumentStore::GetDiskUsage() const {
- ICING_ASSIGN_OR_RETURN(const int64_t document_log_disk_usage,
- document_log_->GetDiskUsage());
- ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_disk_usage,
- document_key_mapper_->GetDiskUsage());
- ICING_ASSIGN_OR_RETURN(const int64_t document_id_mapper_disk_usage,
- document_id_mapper_->GetDiskUsage());
- ICING_ASSIGN_OR_RETURN(const int64_t score_cache_disk_usage,
- score_cache_->GetDiskUsage());
- ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_disk_usage,
- filter_cache_->GetDiskUsage());
- ICING_ASSIGN_OR_RETURN(const int64_t namespace_mapper_disk_usage,
- namespace_mapper_->GetDiskUsage());
+int64_t GetValueOrDefault(const libtextclassifier3::StatusOr<int64_t>& value_or,
+ int64_t default_value) {
+ return (value_or.ok()) ? value_or.ValueOrDie() : default_value;
+}
- return document_log_disk_usage + document_key_mapper_disk_usage +
- document_id_mapper_disk_usage + score_cache_disk_usage +
- filter_cache_disk_usage + namespace_mapper_disk_usage;
+DocumentStorageInfoProto DocumentStore::GetMemberStorageInfo() const {
+ DocumentStorageInfoProto storage_info;
+ storage_info.set_document_log_size(
+ GetValueOrDefault(document_log_->GetDiskUsage(), -1));
+ storage_info.set_key_mapper_size(
+ GetValueOrDefault(document_key_mapper_->GetDiskUsage(), -1));
+ storage_info.set_document_id_mapper_size(
+ GetValueOrDefault(document_id_mapper_->GetDiskUsage(), -1));
+ storage_info.set_score_cache_size(
+ GetValueOrDefault(score_cache_->GetDiskUsage(), -1));
+ storage_info.set_filter_cache_size(
+ GetValueOrDefault(filter_cache_->GetDiskUsage(), -1));
+ storage_info.set_namespace_id_mapper_size(
+ GetValueOrDefault(namespace_mapper_->GetDiskUsage(), -1));
+ storage_info.set_corpus_mapper_size(
+ GetValueOrDefault(corpus_mapper_->GetDiskUsage(), -1));
+ storage_info.set_corpus_score_cache_size(
+ GetValueOrDefault(corpus_score_cache_->GetDiskUsage(), -1));
+ return storage_info;
+}
+
+DocumentStorageInfoProto DocumentStore::CalculateDocumentStatusCounts(
+ DocumentStorageInfoProto storage_info) const {
+ int total_num_alive = 0;
+ int total_num_expired = 0;
+ int total_num_deleted = 0;
+ std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
+ GetNamespaceIdsToNamespaces(namespace_mapper_.get());
+ std::unordered_map<std::string, NamespaceStorageInfoProto>
+ namespace_to_storage_info;
+
+ int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
+ for (DocumentId document_id = 0;
+ document_id < document_id_mapper_->num_elements(); ++document_id) {
+ // Check if it's deleted first.
+ if (IsDeleted(document_id)) {
+ // We don't have the namespace id of hard deleted documents anymore, so
+ // we can't add to our namespace storage info.
+ ++total_num_deleted;
+ continue;
+ }
+
+ // At this point, the document is either alive or expired, so we can get
+ // namespace info for it.
+ auto filter_data_or = filter_cache_->Get(document_id);
+ if (!filter_data_or.ok()) {
+ ICING_VLOG(1) << "Error trying to get filter data for document store "
+ "storage info counts.";
+ continue;
+ }
+ const DocumentFilterData* filter_data = filter_data_or.ValueOrDie();
+ auto itr = namespace_id_to_namespace.find(filter_data->namespace_id());
+ if (itr == namespace_id_to_namespace.end()) {
+ ICING_VLOG(1) << "Error trying to find namespace for document store "
+ "storage info counts.";
+ continue;
+ }
+ const std::string& name_space = itr->second;
+
+ // Always set the namespace; if the NamespaceStorageInfoProto didn't exist
+ // before, we'll get back a default instance of it.
+ NamespaceStorageInfoProto& namespace_storage_info =
+ namespace_to_storage_info[name_space];
+ namespace_storage_info.set_namespace_(name_space);
+
+ // Get usage scores
+ auto usage_scores_or = usage_store_->GetUsageScores(document_id);
+ if (!usage_scores_or.ok()) {
+ ICING_VLOG(1) << "Error trying to get usage scores for document store "
+ "storage info counts.";
+ continue;
+ }
+ UsageStore::UsageScores usage_scores = usage_scores_or.ValueOrDie();
+
+ // Update our stats
+ if (!GetNonExpiredDocumentFilterData(document_id, current_time_ms)) {
+ ++total_num_expired;
+ namespace_storage_info.set_num_expired_documents(
+ namespace_storage_info.num_expired_documents() + 1);
+ if (usage_scores.usage_type1_count > 0) {
+ namespace_storage_info.set_num_expired_documents_usage_type1(
+ namespace_storage_info.num_expired_documents_usage_type1() + 1);
+ }
+ if (usage_scores.usage_type2_count > 0) {
+ namespace_storage_info.set_num_expired_documents_usage_type2(
+ namespace_storage_info.num_expired_documents_usage_type2() + 1);
+ }
+ if (usage_scores.usage_type3_count > 0) {
+ namespace_storage_info.set_num_expired_documents_usage_type3(
+ namespace_storage_info.num_expired_documents_usage_type3() + 1);
+ }
+ } else {
+ ++total_num_alive;
+ namespace_storage_info.set_num_alive_documents(
+ namespace_storage_info.num_alive_documents() + 1);
+ if (usage_scores.usage_type1_count > 0) {
+ namespace_storage_info.set_num_alive_documents_usage_type1(
+ namespace_storage_info.num_alive_documents_usage_type1() + 1);
+ }
+ if (usage_scores.usage_type2_count > 0) {
+ namespace_storage_info.set_num_alive_documents_usage_type2(
+ namespace_storage_info.num_alive_documents_usage_type2() + 1);
+ }
+ if (usage_scores.usage_type3_count > 0) {
+ namespace_storage_info.set_num_alive_documents_usage_type3(
+ namespace_storage_info.num_alive_documents_usage_type3() + 1);
+ }
+ }
+ }
+
+ for (auto& itr : namespace_to_storage_info) {
+ storage_info.mutable_namespace_storage_info()->Add(std::move(itr.second));
+ }
+ storage_info.set_num_alive_documents(total_num_alive);
+ storage_info.set_num_deleted_documents(total_num_deleted);
+ storage_info.set_num_expired_documents(total_num_expired);
+ return storage_info;
+}
+
+DocumentStorageInfoProto DocumentStore::GetStorageInfo() const {
+ DocumentStorageInfoProto storage_info = GetMemberStorageInfo();
+ int64_t directory_size = filesystem_->GetDiskUsage(base_dir_.c_str());
+ if (directory_size != Filesystem::kBadFileSize) {
+ storage_info.set_document_store_size(directory_size);
+ } else {
+ storage_info.set_document_store_size(-1);
+ }
+ storage_info.set_num_namespaces(namespace_mapper_->num_keys());
+ return CalculateDocumentStatusCounts(std::move(storage_info));
}
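+
+// Note (descriptive): member sizes that cannot be computed are reported as -1
+// via GetValueOrDefault above rather than failing the whole storage-info
+// call; the same sentinel is used for document_store_size on a bad file size.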
libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
@@ -1082,6 +1721,7 @@ libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
document_validator_.UpdateSchemaStore(schema_store);
int size = document_id_mapper_->num_elements();
+ int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
for (DocumentId document_id = 0; document_id < size; document_id++) {
auto document_or = Get(document_id);
if (absl_ports::IsNotFound(document_or.status())) {
@@ -1103,12 +1743,16 @@ libtextclassifier3::Status DocumentStore::UpdateSchemaStore(
// Update the SchemaTypeId for this entry
ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
schema_store_->GetSchemaTypeId(document.schema()));
- filter_cache_->mutable_array()[document_id].set_schema_type_id(
- schema_type_id);
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<DocumentFilterData>::MutableView
+ doc_filter_data_view,
+ filter_cache_->GetMutable(document_id));
+ doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
} else {
// Document is no longer valid with the new SchemaStore. Mark as
// deleted
- auto delete_status = Delete(document.namespace_(), document.uri());
+ auto delete_status =
+ Delete(document.namespace_(), document.uri(), current_time_ms);
if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
// Real error, pass up
return delete_status;
@@ -1131,50 +1775,20 @@ libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore(
schema_store_ = schema_store;
document_validator_.UpdateSchemaStore(schema_store);
- // Append a tombstone for each deleted schema type. This way, we don't have
- // to read out each document, check if the schema type has been deleted, and
- // append a tombstone per-document.
- for (const auto& schema_type :
- set_schema_result.schema_types_deleted_by_name) {
- // TODO(b/144458732): Implement a more robust version of
- // ICING_RETURN_IF_ERROR that can support error logging.
- libtextclassifier3::Status status =
- document_log_->WriteProto(CreateSchemaTypeTombstone(schema_type))
- .status();
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
- << "Failed to delete schema_type. schema_type = "
- << schema_type;
- return status;
- }
- }
-
int size = document_id_mapper_->num_elements();
+ int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
for (DocumentId document_id = 0; document_id < size; document_id++) {
- auto exists_or = DoesDocumentExistAndGetFileOffset(document_id);
- if (absl_ports::IsNotFound(exists_or.status())) {
+ if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
// Skip nonexistent documents
continue;
- } else if (!exists_or.ok()) {
- // Real error, pass up
- return absl_ports::Annotate(
- exists_or.status(),
- IcingStringUtil::StringPrintf("Failed to retrieve DocumentId %d",
- document_id));
}
// Guaranteed that the document exists now.
ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
filter_cache_->Get(document_id));
- if (set_schema_result.schema_types_deleted_by_id.count(
- filter_data->schema_type_id()) != 0) {
- // We already created a tombstone for this deleted type. Just update the
- // derived files now.
- ICING_RETURN_IF_ERROR(
- document_id_mapper_->Set(document_id, kDocDeletedFlag));
- continue;
- }
+ bool delete_document = set_schema_result.schema_types_deleted_by_id.count(
+ filter_data->schema_type_id()) != 0;
// Check if we need to update the FilterCache entry for this document. It
// may have been assigned a different SchemaTypeId in the new SchemaStore.
@@ -1195,20 +1809,23 @@ libtextclassifier3::Status DocumentStore::OptimizedUpdateSchemaStore(
ICING_ASSIGN_OR_RETURN(
SchemaTypeId schema_type_id,
schema_store_->GetSchemaTypeId(document.schema()));
- filter_cache_->mutable_array()[document_id].set_schema_type_id(
- schema_type_id);
+ ICING_ASSIGN_OR_RETURN(
+ typename FileBackedVector<DocumentFilterData>::MutableView
+ doc_filter_data_view,
+ filter_cache_->GetMutable(document_id));
+ doc_filter_data_view.Get().set_schema_type_id(schema_type_id);
}
-
if (revalidate_document) {
- if (!document_validator_.Validate(document).ok()) {
- // Document is no longer valid with the new SchemaStore. Mark as
- // deleted
- auto delete_status = Delete(document.namespace_(), document.uri());
- if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
- // Real error, pass up
- return delete_status;
- }
- }
+ delete_document = !document_validator_.Validate(document).ok();
+ }
+ }
+
+ if (delete_document) {
+ // Document is no longer valid with the new SchemaStore. Mark as deleted
+ auto delete_status = Delete(document_id, current_time_ms);
+ if (!delete_status.ok() && !absl_ports::IsNotFound(delete_status)) {
+ // Real error, pass up
+ return delete_status;
}
}
}
@@ -1221,24 +1838,44 @@ libtextclassifier3::Status DocumentStore::Optimize() {
return libtextclassifier3::Status::OK;
}
-libtextclassifier3::Status DocumentStore::OptimizeInto(
- const std::string& new_directory) {
+libtextclassifier3::StatusOr<DocumentStore::OptimizeResult>
+DocumentStore::OptimizeInto(const std::string& new_directory,
+ const LanguageSegmenter* lang_segmenter,
+ OptimizeStatsProto* stats) const {
// Validates directory
if (new_directory == base_dir_) {
return absl_ports::InvalidArgumentError(
"New directory is the same as the current one.");
}
- ICING_ASSIGN_OR_RETURN(auto new_doc_store,
- DocumentStore::Create(filesystem_, new_directory,
- &clock_, schema_store_));
+ ICING_ASSIGN_OR_RETURN(
+ auto doc_store_create_result,
+ DocumentStore::Create(filesystem_, new_directory, &clock_, schema_store_,
+ /*force_recovery_and_revalidate_documents=*/false,
+ namespace_id_fingerprint_, pre_mapping_fbv_,
+ use_persistent_hash_map_, compression_level_,
+ /*initialize_stats=*/nullptr));
+ std::unique_ptr<DocumentStore> new_doc_store =
+ std::move(doc_store_create_result.document_store);
// Writes all valid docs into new document store (new directory)
- int size = document_id_mapper_->num_elements();
- for (DocumentId document_id = 0; document_id < size; document_id++) {
- auto document_or = Get(document_id);
+ int document_cnt = document_id_mapper_->num_elements();
+ int num_deleted_documents = 0;
+ int num_expired_documents = 0;
+ UsageStore::UsageScores default_usage;
+
+ OptimizeResult result;
+ result.document_id_old_to_new.resize(document_cnt, kInvalidDocumentId);
+ int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
+ for (DocumentId document_id = 0; document_id < document_cnt; document_id++) {
+ auto document_or = Get(document_id, /*clear_internal_fields=*/false);
if (absl_ports::IsNotFound(document_or.status())) {
- // Skip nonexistent documents
+ if (IsDeleted(document_id)) {
+ ++num_deleted_documents;
+ } else if (!GetNonExpiredDocumentFilterData(document_id,
+ current_time_ms)) {
+ ++num_expired_documents;
+ }
continue;
} else if (!document_or.ok()) {
// Real error, pass up
@@ -1249,20 +1886,104 @@ libtextclassifier3::Status DocumentStore::OptimizeInto(
}
// Guaranteed to have a document now.
- DocumentProto document_to_keep = document_or.ValueOrDie();
- // TODO(b/144458732): Implement a more robust version of
- // ICING_RETURN_IF_ERROR that can support error logging.
- libtextclassifier3::Status status =
- new_doc_store->Put(std::move(document_to_keep)).status();
- if (!status.ok()) {
- ICING_LOG(ERROR) << status.error_message()
+ DocumentProto document_to_keep = std::move(document_or).ValueOrDie();
+
+ libtextclassifier3::StatusOr<DocumentId> new_document_id_or;
+ if (document_to_keep.internal_fields().length_in_tokens() == 0) {
+ auto tokenized_document_or = TokenizedDocument::Create(
+ schema_store_, lang_segmenter, document_to_keep);
+ if (!tokenized_document_or.ok()) {
+ return absl_ports::Annotate(
+ tokenized_document_or.status(),
+ IcingStringUtil::StringPrintf(
+ "Failed to tokenize Document for DocumentId %d", document_id));
+ }
+ TokenizedDocument tokenized_document(
+ std::move(tokenized_document_or).ValueOrDie());
+ new_document_id_or = new_doc_store->Put(
+ std::move(document_to_keep), tokenized_document.num_string_tokens());
+ } else {
+ // TODO(b/144458732): Implement a more robust version of
+ // TC_ASSIGN_OR_RETURN that can support error logging.
+ new_document_id_or =
+ new_doc_store->InternalPut(std::move(document_to_keep));
+ }
+ if (!new_document_id_or.ok()) {
+ ICING_LOG(ERROR) << new_document_id_or.status().error_message()
<< "Failed to write into new document store";
- return status;
+ return new_document_id_or.status();
+ }
+
+ result.document_id_old_to_new[document_id] =
+ new_document_id_or.ValueOrDie();
+
+ // Copy over usage scores.
+ ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores usage_scores,
+ usage_store_->GetUsageScores(document_id));
+ // Only copy the usage scores over if they differ from the default (no
+ // usage). There's no need to possibly allocate storage when there's
+ // nothing interesting to store.
+ if (!(usage_scores == default_usage)) {
+ DocumentId new_document_id = new_document_id_or.ValueOrDie();
+ ICING_RETURN_IF_ERROR(
+ new_doc_store->SetUsageScores(new_document_id, usage_scores));
}
}
- ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk());
- return libtextclassifier3::Status::OK;
+ // Construct namespace_id_old_to_new
+ int namespace_cnt = namespace_mapper_->num_keys();
+ std::unordered_map<NamespaceId, std::string> old_namespaces =
+ GetNamespaceIdsToNamespaces(namespace_mapper_.get());
+ if (namespace_cnt != old_namespaces.size()) {
+ // This really shouldn't happen. If it does, then:
+ // - It won't block DocumentStore optimization, so don't return an error here.
+ // - Instead, write a warning log here and hint the caller to rebuild the index.
+ ICING_LOG(WARNING) << "Unexpected old namespace count " << namespace_cnt
+ << " vs " << old_namespaces.size();
+ result.should_rebuild_index = true;
+ } else {
+ result.namespace_id_old_to_new.resize(namespace_cnt, kInvalidNamespaceId);
+ for (const auto& [old_namespace_id, ns] : old_namespaces) {
+ if (old_namespace_id >= result.namespace_id_old_to_new.size()) {
+ // This really shouldn't happen. If it does, then:
+ // - It won't block DocumentStore optimization, so don't return an error
+ // here.
+ // - Instead, write a warning log here and hint the caller to rebuild the
+ // index.
+ ICING_LOG(WARNING) << "Found unexpected namespace id "
+ << old_namespace_id << ". Should be in range 0 to "
+ << result.namespace_id_old_to_new.size()
+ << " (exclusive).";
+ result.namespace_id_old_to_new.clear();
+ result.should_rebuild_index = true;
+ break;
+ }
+
+ auto new_namespace_id_or = new_doc_store->namespace_mapper_->Get(ns);
+ if (!new_namespace_id_or.ok()) {
+ if (absl_ports::IsNotFound(new_namespace_id_or.status())) {
+ continue;
+ }
+ // Real error, return it.
+ return std::move(new_namespace_id_or).status();
+ }
+
+ NamespaceId new_namespace_id = new_namespace_id_or.ValueOrDie();
+ // Safe to assign via operator[] given that we've checked the range above.
+ result.namespace_id_old_to_new[old_namespace_id] = new_namespace_id;
+ }
+ }
+
+ if (stats != nullptr) {
+ stats->set_num_original_documents(document_cnt);
+ stats->set_num_deleted_documents(num_deleted_documents);
+ stats->set_num_expired_documents(num_expired_documents);
+ stats->set_num_original_namespaces(namespace_cnt);
+ stats->set_num_deleted_namespaces(
+ namespace_cnt - new_doc_store->namespace_mapper_->num_keys());
+ }
+ ICING_RETURN_IF_ERROR(new_doc_store->PersistToDisk(PersistType::FULL));
+ return result;
}
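+
+// Example (hypothetical): with old documents {0: alive, 1: deleted, 2: alive},
+// OptimizeInto would return document_id_old_to_new = {0, kInvalidDocumentId,
+// 1}, letting callers remap index hits instead of rebuilding from scratch
+// (unless should_rebuild_index was set).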
libtextclassifier3::StatusOr<DocumentStore::OptimizeInfo>
@@ -1271,9 +1992,10 @@ DocumentStore::GetOptimizeInfo() const {
// Figure out our ratio of optimizable/total docs.
int32_t num_documents = document_id_mapper_->num_elements();
+ int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
for (DocumentId document_id = kMinDocumentId; document_id < num_documents;
++document_id) {
- if (!DoesDocumentExist(document_id)) {
+ if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
++optimize_info.optimizable_docs;
}
@@ -1298,26 +2020,39 @@ DocumentStore::GetOptimizeInfo() const {
score_cache_->GetElementsFileSize());
ICING_ASSIGN_OR_RETURN(const int64_t filter_cache_file_size,
filter_cache_->GetElementsFileSize());
+ ICING_ASSIGN_OR_RETURN(const int64_t corpus_score_cache_file_size,
+ corpus_score_cache_->GetElementsFileSize());
+
+ // Usage store might be sparse, but we'll still use file size for more
+ // accurate counting.
+ ICING_ASSIGN_OR_RETURN(const int64_t usage_store_file_size,
+ usage_store_->GetElementsFileSize());
- // We use a combined disk usage and file size for the KeyMapper because it's
- // backed by a trie, which has some sparse property bitmaps.
+ // We use a combined disk usage and file size for the DynamicTrieKeyMapper
+ // because it's backed by a trie, which has some sparse property bitmaps.
ICING_ASSIGN_OR_RETURN(const int64_t document_key_mapper_size,
document_key_mapper_->GetElementsSize());
- // We don't include the namespace mapper because it's not clear if we could
- // recover any space even if Optimize were called. Deleting 100s of documents
- // could still leave a few documents of a namespace, and then there would be
- // no change.
+ // We don't include the namespace_mapper or the corpus_mapper because it's
+ // not clear if we could recover any space even if Optimize were called.
+ // Deleting 100s of documents could still leave a few documents of a
+ // namespace, and then there would be no change.
int64_t total_size = document_log_file_size + document_key_mapper_size +
document_id_mapper_file_size + score_cache_file_size +
- filter_cache_file_size;
+ filter_cache_file_size + corpus_score_cache_file_size +
+ usage_store_file_size;
optimize_info.estimated_optimizable_bytes =
total_size * optimize_info.optimizable_docs / optimize_info.total_docs;
return optimize_info;
}
+libtextclassifier3::Status DocumentStore::UpdateCorpusAssociatedScoreCache(
+ CorpusId corpus_id, const CorpusAssociatedScoreData& score_data) {
+ return corpus_score_cache_->Set(corpus_id, score_data);
+}
+
libtextclassifier3::Status DocumentStore::UpdateDocumentAssociatedScoreCache(
DocumentId document_id, const DocumentAssociatedScoreData& score_data) {
return score_cache_->Set(document_id, score_data);
@@ -1328,5 +2063,92 @@ libtextclassifier3::Status DocumentStore::UpdateFilterCache(
return filter_cache_->Set(document_id, filter_data);
}
+libtextclassifier3::Status DocumentStore::ClearDerivedData(
+ DocumentId document_id) {
+ // We intentionally leave the data in key_mapper_ because locating that data
+ // requires fetching namespace and uri. Leaving data in key_mapper_ should
+ // be fine because the data is hashed.
+
+ ICING_RETURN_IF_ERROR(document_id_mapper_->Set(document_id, kDocDeletedFlag));
+
+ // Resets the score cache entry
+ ICING_RETURN_IF_ERROR(UpdateDocumentAssociatedScoreCache(
+ document_id, DocumentAssociatedScoreData(kInvalidCorpusId,
+ /*document_score=*/-1,
+ /*creation_timestamp_ms=*/-1,
+ /*length_in_tokens=*/0)));
+
+ // Resets the filter cache entry
+ ICING_RETURN_IF_ERROR(UpdateFilterCache(
+ document_id, DocumentFilterData(kInvalidNamespaceId, kInvalidSchemaTypeId,
+ /*expiration_timestamp_ms=*/-1)));
+
+ // Clears the usage scores.
+ return usage_store_->DeleteUsageScores(document_id);
+}
+
+libtextclassifier3::Status DocumentStore::SetUsageScores(
+ DocumentId document_id, const UsageStore::UsageScores& usage_scores) {
+ return usage_store_->SetUsageScores(document_id, usage_scores);
+}
+
+libtextclassifier3::StatusOr<
+ google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
+DocumentStore::CollectCorpusInfo() const {
+ google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo> corpus_info;
+ libtextclassifier3::StatusOr<const SchemaProto*> schema_proto_or =
+ schema_store_->GetSchema();
+ if (!schema_proto_or.ok()) {
+ return corpus_info;
+ }
+ // Maps from CorpusId to the corresponding protocol buffer in the result.
+ std::unordered_map<CorpusId, DocumentDebugInfoProto::CorpusInfo*> info_map;
+ std::unordered_map<NamespaceId, std::string> namespace_id_to_namespace =
+ GetNamespaceIdsToNamespaces(namespace_mapper_.get());
+ const SchemaProto* schema_proto = schema_proto_or.ValueOrDie();
+ int64_t current_time_ms = clock_.GetSystemTimeMilliseconds();
+ for (DocumentId document_id = 0; document_id < filter_cache_->num_elements();
+ ++document_id) {
+ if (!GetAliveDocumentFilterData(document_id, current_time_ms)) {
+ continue;
+ }
+ ICING_ASSIGN_OR_RETURN(const DocumentFilterData* filter_data,
+ filter_cache_->Get(document_id));
+ ICING_ASSIGN_OR_RETURN(const DocumentAssociatedScoreData* score_data,
+ score_cache_->Get(document_id));
+ const std::string& name_space =
+ namespace_id_to_namespace[filter_data->namespace_id()];
+ const std::string& schema =
+ schema_proto->types()[filter_data->schema_type_id()].schema_type();
+ auto iter = info_map.find(score_data->corpus_id());
+ if (iter == info_map.end()) {
+ DocumentDebugInfoProto::CorpusInfo* entry = corpus_info.Add();
+ entry->set_namespace_(name_space);
+ entry->set_schema(schema);
+ iter = info_map.insert({score_data->corpus_id(), entry}).first;
+ }
+ iter->second->set_total_documents(iter->second->total_documents() + 1);
+ iter->second->set_total_token(iter->second->total_token() +
+ score_data->length_in_tokens());
+ }
+ return corpus_info;
+}
+
+libtextclassifier3::StatusOr<DocumentDebugInfoProto>
+DocumentStore::GetDebugInfo(int verbosity) const {
+ DocumentDebugInfoProto debug_info;
+ *debug_info.mutable_document_storage_info() = GetStorageInfo();
+ ICING_ASSIGN_OR_RETURN(Crc32 crc, ComputeChecksum());
+ debug_info.set_crc(crc.Get());
+ if (verbosity > 0) {
+ ICING_ASSIGN_OR_RETURN(
+ google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>
+ corpus_info,
+ CollectCorpusInfo());
+ *debug_info.mutable_corpus_info() = std::move(corpus_info);
+ }
+ return debug_info;
+}
+
} // namespace lib
} // namespace icing
diff --git a/icing/store/document-store.h b/icing/store/document-store.h
index 3f4b72f..c228e8b 100644
--- a/icing/store/document-store.h
+++ b/icing/store/document-store.h
@@ -26,17 +26,32 @@
#include "icing/file/file-backed-proto-log.h"
#include "icing/file/file-backed-vector.h"
#include "icing/file/filesystem.h"
+#include "icing/file/portable-file-backed-proto-log.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/logging.pb.h"
+#include "icing/proto/optimize.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/usage.pb.h"
#include "icing/schema/schema-store.h"
+#include "icing/store/corpus-associated-scoring-data.h"
+#include "icing/store/corpus-id.h"
#include "icing/store/document-associated-score-data.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
#include "icing/store/key-mapper.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
#include "icing/store/namespace-id.h"
+#include "icing/store/usage-store.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/util/clock.h"
#include "icing/util/crc32.h"
+#include "icing/util/data-loss.h"
#include "icing/util/document-validator.h"
+#include "icing/util/fingerprint-util.h"
namespace icing {
namespace lib {
@@ -45,13 +60,19 @@ namespace lib {
class DocumentStore {
public:
struct Header {
- static constexpr int32_t kMagic = 0x746f7265;
+ static int32_t GetCurrentMagic(bool namespace_id_fingerprint) {
+ return namespace_id_fingerprint ? kNewMagic : kOldMagic;
+ }
// Holds the magic as a quick sanity check against file corruption.
int32_t magic;
// Checksum of the DocumentStore's sub-component's checksums.
uint32_t checksum;
+
+ private:
+ static constexpr int32_t kOldMagic = 0x746f7265;
+ static constexpr int32_t kNewMagic = 0x1b99c8b0;
};
struct OptimizeInfo {
@@ -68,6 +89,31 @@ class DocumentStore {
int32_t optimizable_docs = 0;
};
+ struct DeleteByGroupResult {
+ // Status representing whether or not the operation succeeded. See the
+ // comments above the function that returns this result for the possible
+ // statuses.
+ libtextclassifier3::Status status;
+
+ int num_docs_deleted = 0;
+ };
+
+ struct CreateResult {
+ // A successfully initialized document store.
+ std::unique_ptr<DocumentStore> document_store;
+
+ // The data status after initializing from a previous state. Data loss can
+ // happen if the file is corrupted or some previously added data was
+ // unpersisted. This may be used to signal that any derived data off of the
+ // document store may need to be regenerated.
+ DataLoss data_loss;
+
+ // A boolean flag indicating if derived files of the document store have
+ // been regenerated or not. This is usually a signal for callers to detect
+ // if any id assignment has changed (e.g. NamespaceId).
+ bool derived_files_regenerated;
+ };
+
// Not copyable
DocumentStore(const DocumentStore&) = delete;
DocumentStore& operator=(const DocumentStore&) = delete;
@@ -80,53 +126,89 @@ class DocumentStore {
// previously initialized with this directory, it will reload the files saved
// by the last instance.
//
- // Does not take any ownership, and all pointers must refer to valid objects
- // that outlive the one constructed.
+ // force_recovery_and_revalidate_documents=true will pre-emptively throw out
+ // the derived files and validate each document while recreating them. This
+ // can be used to indicate that the schema (and type ids) may have changed and
+ // those changes might not have been applied to the document store.
+ //
+ // If initialize_stats is present, the fields related to DocumentStore will be
+ // populated.
+ //
+ // Does not take any ownership, and all pointers except initialize_stats must
+ // refer to valid objects that outlive the one constructed.
//
// TODO(cassiewang): Consider returning a status indicating that derived files
// were regenerated. This may be helpful in logs.
//
// Returns:
- // A DocumentStore on success
+ // A DocumentStore::CreateResult on success
// FAILED_PRECONDITION on any null pointer input
// INTERNAL_ERROR on IO error
- static libtextclassifier3::StatusOr<std::unique_ptr<DocumentStore>> Create(
+ static libtextclassifier3::StatusOr<DocumentStore::CreateResult> Create(
const Filesystem* filesystem, const std::string& base_dir,
- const Clock* clock, const SchemaStore* schema_store);
+ const Clock* clock, const SchemaStore* schema_store,
+ bool force_recovery_and_revalidate_documents,
+ bool namespace_id_fingerprint, bool pre_mapping_fbv,
+ bool use_persistent_hash_map, int32_t compression_level,
+ InitializeStatsProto* initialize_stats);
+
+ // Discards all derived data in the document store.
+ //
+ // Returns:
+ // OK on success or nothing to discard
+ // INTERNAL_ERROR on any I/O errors
+ static libtextclassifier3::Status DiscardDerivedFiles(
+ const Filesystem* filesystem, const std::string& base_dir);
// Returns the maximum DocumentId that the DocumentStore has assigned. If
// there has not been any DocumentIds assigned, i.e. the DocumentStore is
// empty, then kInvalidDocumentId is returned. This does not filter out
- // DocumentIds of deleted documents.
- const DocumentId last_added_document_id() const {
+ // DocumentIds of deleted or expired documents.
+ DocumentId last_added_document_id() const {
if (document_id_mapper_->num_elements() == 0) {
return kInvalidDocumentId;
}
return document_id_mapper_->num_elements() - 1;
}
+ // Returns the number of documents. The result does not filter out DocumentIds
+ // of deleted or expired documents.
+ int num_documents() const { return document_id_mapper_->num_elements(); }
+
// Puts the document into document store.
//
+ // If put_document_stats is present, the fields related to DocumentStore will
+ // be populated.
+ //
// Returns:
// A newly generated document id on success
+ // RESOURCE_EXHAUSTED if it exceeds the maximum number of allowed documents
// FAILED_PRECONDITION if schema hasn't been set yet
// NOT_FOUND if the schema_type or a property config of the document doesn't
// exist in schema
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<DocumentId> Put(const DocumentProto& document);
- libtextclassifier3::StatusOr<DocumentId> Put(DocumentProto&& document);
+ libtextclassifier3::StatusOr<DocumentId> Put(
+ const DocumentProto& document, int32_t num_tokens = 0,
+ PutDocumentStatsProto* put_document_stats = nullptr);
+ libtextclassifier3::StatusOr<DocumentId> Put(
+ DocumentProto&& document, int32_t num_tokens = 0,
+ PutDocumentStatsProto* put_document_stats = nullptr);
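+
+ // Example (hypothetical caller, a sketch only):
+ //
+ //   PutDocumentStatsProto stats;
+ //   ICING_ASSIGN_OR_RETURN(
+ //       DocumentId id,
+ //       doc_store->Put(std::move(document), /*num_tokens=*/42, &stats));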
// Finds and returns the document identified by the given key (namespace +
- // uri)
+ // uri). If 'clear_internal_fields' is true, document-level data that's
+ // generated internally by DocumentStore is cleared.
//
// Returns:
// The document found on success
// NOT_FOUND if the key doesn't exist or document has been deleted
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<DocumentProto> Get(std::string_view name_space,
- std::string_view uri) const;
+ libtextclassifier3::StatusOr<DocumentProto> Get(
+ std::string_view name_space, std::string_view uri,
+ bool clear_internal_fields = true) const;
- // Finds and returns the document identified by the given document id
+ // Finds and returns the document identified by the given document id. If
+ // 'clear_internal_fields' is true, document level data that's generated
+ // internally by DocumentStore is cleared.
//
// Returns:
// The document found on success
@@ -134,30 +216,42 @@ class DocumentStore {
// maximum value
// NOT_FOUND if the document doesn't exist or has been deleted
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<DocumentProto> Get(DocumentId document_id) const;
+ libtextclassifier3::StatusOr<DocumentProto> Get(
+ DocumentId document_id, bool clear_internal_fields = true) const;
// Returns all namespaces which have at least 1 active document (not deleted
// or expired). Order of namespaces is undefined.
std::vector<std::string> GetAllNamespaces() const;
- // Check if a document exists. Existence means it hasn't been deleted and it
- // hasn't expired yet.
+ // Deletes the document identified by the given namespace and uri. The
+ // document proto will be erased immediately.
+ //
+ // NOTE:
+ // Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
- // boolean whether a document exists or not
- bool DoesDocumentExist(DocumentId document_id) const;
+ // OK on success
+ // NOT_FOUND if no document exists with namespace, uri
+ // INTERNAL_ERROR on IO error
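+  //
+  // e.g. (illustrative, mirroring the benchmark below):
+  //   doc_store->Delete("namespace", "uri",
+  //                     clock.GetSystemTimeMilliseconds());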
+ libtextclassifier3::Status Delete(std::string_view name_space,
+ std::string_view uri,
+ int64_t current_time_ms);
- // Deletes the document identified by the given namespace and uri
+ // Deletes the document identified by the given document_id. The document
+ // proto will be erased immediately.
//
- // NOTE: Space is not reclaimed for deleted documents until Optimize() is
- // called.
+ // NOTE:
+ // Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
// OK on success
- // NOT_FOUND if no document exists with namespace, uri
+ // NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status Delete(std::string_view name_space,
- std::string_view uri);
+ // INVALID_ARGUMENT if document_id is invalid.
+ libtextclassifier3::Status Delete(DocumentId document_id,
+ int64_t current_time_ms);
// Returns the NamespaceId of the string namespace
//
@@ -168,74 +262,163 @@ class DocumentStore {
libtextclassifier3::StatusOr<NamespaceId> GetNamespaceId(
std::string_view name_space) const;
+ // Helper method to find a DocumentId that is associated with the given
+ // namespace and uri.
+ //
+  // NOTE: The DocumentId may refer to an invalid document (deleted
+ // or expired). Callers can call DoesDocumentExist(document_id) to ensure it
+ // refers to a valid Document.
+ //
+ // Returns:
+ // A DocumentId on success
+ // NOT_FOUND if the key doesn't exist
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
+ std::string_view name_space, std::string_view uri) const;
+
+ // Helper method to find a DocumentId that is associated with the given
+ // NamespaceFingerprintIdentifier.
+ //
+  // NOTE: The DocumentId may refer to an invalid document (deleted
+ // or expired). Callers can call DoesDocumentExist(document_id) to ensure it
+ // refers to a valid Document.
+ //
+ // Returns:
+ // A DocumentId on success
+ // NOT_FOUND if the key doesn't exist
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
+ const NamespaceFingerprintIdentifier& namespace_fingerprint_identifier)
+ const;
+
+ // Returns the CorpusId associated with the given namespace and schema.
+ //
+ // Returns:
+ // A CorpusId on success
+ // NOT_FOUND if the key doesn't exist
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<CorpusId> GetCorpusId(
+ const std::string_view name_space, const std::string_view schema) const;
+
+ // Returns the ResultGroupingEntryId associated with the given namespace
+ // and schema.
+ //
+ // NOTE: ResultGroupingEntryIds that are generated by calls with different
+ // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds
+  // are only guaranteed to be unique within their own ResultGroupingType.
+ //
+ // Returns:
+ // A ResultGroupingEntryId on success
+ // NOT_FOUND if the key doesn't exist
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId(
+ ResultSpecProto::ResultGroupingType result_group_type,
+ const std::string_view name_space, const std::string_view schema) const;
+
+  // Returns the ResultGroupingEntryId associated with the given NamespaceId
+  // and SchemaTypeId.
+ //
+ // NOTE: ResultGroupingEntryIds that are generated by calls with different
+ // ResultGroupingTypes should not be compared. Returned ResultGroupingEntryIds
+  // are only guaranteed to be unique within their own ResultGroupingType.
+ //
+ // Returns:
+ // A ResultGroupingEntryId on success
+ // NOT_FOUND if the key doesn't exist
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int32_t> GetResultGroupingEntryId(
+ ResultSpecProto::ResultGroupingType result_group_type,
+ const NamespaceId namespace_id, const SchemaTypeId schema_type_id) const;
+
// Returns the DocumentAssociatedScoreData of the document specified by the
// DocumentId.
//
- // NOTE: This does not check if the document exists and will return the
- // DocumentFilterData of the document even if it has been deleted. Users
- // should check DoesDocumentExist(document_id) if they only want existing
- // documents' DocumentFilterData.
- //
// Returns:
// DocumentAssociatedScoreData on success
- // OUT_OF_RANGE if document_id is negative or exceeds previously seen
- // DocumentIds
+ // NOT_FOUND if the document or the score data is not found
libtextclassifier3::StatusOr<DocumentAssociatedScoreData>
GetDocumentAssociatedScoreData(DocumentId document_id) const;
- // Returns the DocumentFilterData of the document specified by the DocumentId.
+ // Returns the CorpusAssociatedScoreData of the corpus specified by the
+ // corpus_id.
//
- // NOTE: This does not check if the document exists and will return the
- // DocumentFilterData of the document even if it has been deleted. Users
- // should check DoesDocumentExist(document_id) if they only want existing
- // documents' DocumentFilterData.
+ // NOTE: This does not check if the corpus exists and will return the
+ // CorpusAssociatedScoreData of the corpus even if all documents belonging to
+ // that corpus have been deleted.
//
// Returns:
- // DocumentFilterData on success
- // OUT_OF_RANGE if document_id is negative or exceeds previously seen
- // DocumentIds
- libtextclassifier3::StatusOr<DocumentFilterData> GetDocumentFilterData(
- DocumentId document_id) const;
+ // CorpusAssociatedScoreData on success
+ // OUT_OF_RANGE if corpus_id is negative or exceeds previously seen
+ // CorpusIds
+ libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
+ GetCorpusAssociatedScoreData(CorpusId corpus_id) const;
- // Deletes all documents belonging to the given namespace.
+  // Gets the document filter data if a document exists. Otherwise, returns an
+  // empty optional.
//
- // NOTE: Space is not reclaimed for deleted documents until Optimize() is
- // called.
+ // Existence means it hasn't been deleted and it hasn't expired yet.
+ //
+ // Returns:
+  //   A DocumentFilterData if the given document exists.
+  //   std::nullopt if the given document doesn't exist.
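+  //
+  // e.g. (illustrative):
+  //   if (doc_store->GetAliveDocumentFilterData(
+  //           document_id, clock.GetSystemTimeMilliseconds())) {
+  //     // The document is alive (not deleted and not expired).
+  //   }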
+ std::optional<DocumentFilterData> GetAliveDocumentFilterData(
+ DocumentId document_id, int64_t current_time_ms) const;
+
+ // Gets the usage scores of a document.
+ //
+ // Returns:
+ // UsageScores on success
+ // nullopt if there are no usage scores stored for the requested docid.
+ std::optional<UsageStore::UsageScores> GetUsageScores(
+ DocumentId document_id, int64_t current_time_ms) const;
+
+ // Reports usage. The corresponding usage scores of the specified document in
+ // the report will be updated.
+ //
+ // Returns:
+ // OK on success
+  //   NOT_FOUND if the [namespace + uri] key in the report doesn't exist
+ // INTERNAL_ERROR on I/O errors.
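+  //
+  // A UsageReport is a plain proto; an illustrative sketch (field names as
+  // used by the tests below, USAGE_TYPE1 assumed from usage.proto):
+  //
+  //   UsageReport usage_report;
+  //   usage_report.set_document_namespace("namespace");
+  //   usage_report.set_document_uri("uri");
+  //   usage_report.set_usage_timestamp_ms(clock.GetSystemTimeMilliseconds());
+  //   usage_report.set_usage_type(UsageReport::USAGE_TYPE1);
+  //   ICING_RETURN_IF_ERROR(doc_store->ReportUsage(usage_report));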
+ libtextclassifier3::Status ReportUsage(const UsageReport& usage_report);
+
+ // Deletes all documents belonging to the given namespace. The documents will
+ // be erased immediately.
+ //
+ // NOTE:
+ // Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
// OK on success
// NOT_FOUND if namespace doesn't exist
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status DeleteByNamespace(std::string_view name_space);
+ DeleteByGroupResult DeleteByNamespace(std::string_view name_space);
- // Deletes all documents belonging to the given schema type
+ // Deletes all documents belonging to the given schema type. The documents
+ // will be erased immediately.
//
- // NOTE: Space is not reclaimed for deleted documents until Optimize() is
- // called.
+ // NOTE:
+ // Space is not reclaimed for deleted documents until Optimize() is
+ // called.
//
// Returns:
// OK on success
// NOT_FOUND if schema_type doesn't exist
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status DeleteBySchemaType(std::string_view schema_type);
+ DeleteByGroupResult DeleteBySchemaType(std::string_view schema_type);
// Syncs all the data and metadata changes to disk.
//
// Returns:
// OK on success
// INTERNAL on I/O error
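+  //
+  // e.g. doc_store->PersistToDisk(PersistType::FULL) for a full sync, or
+  // PersistType::LITE for a cheaper sync (both variants appear in the
+  // benchmarks below).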
- libtextclassifier3::Status PersistToDisk();
+ libtextclassifier3::Status PersistToDisk(PersistType::Code persist_type);
- // Calculates and returns the disk usage in bytes. Rounds up to the nearest
- // block size.
+ // Calculates the StorageInfo for the Document Store.
//
- // Returns:
- // Disk usage on success
- // INTERNAL_ERROR on IO error
- //
- // TODO(samzheng): consider returning a struct which has the breakdown of each
- // component.
- libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+ // If an IO error occurs while trying to calculate the value for a field, then
+ // that field will be set to -1.
+ DocumentStorageInfoProto GetStorageInfo() const;
// Update any derived data off of the SchemaStore with the new SchemaStore.
// This may include pointers, SchemaTypeIds, etc.
@@ -277,20 +460,39 @@ class DocumentStore {
// INTERNAL_ERROR on IO error
libtextclassifier3::Status Optimize();
+ struct OptimizeResult {
+ // A vector that maps old document id to new document id.
+ std::vector<DocumentId> document_id_old_to_new;
+
+ // A vector that maps old namespace id to new namespace id. Will be empty if
+ // should_rebuild_index is set to true.
+ std::vector<NamespaceId> namespace_id_old_to_new;
+
+    // A boolean flag that hints to the caller (usually IcingSearchEngine)
+    // whether it should rebuild the index instead of adopting the id changes
+    // via the two vectors above. It will be set to true if any id
+    // inconsistency is found.
+ bool should_rebuild_index = false;
+ };
// Copy data from current base directory into a new directory. Any outdated or
- // deleted data won't be copied. During the process, document ids will be
- // reassigned so any files / classes that are based on old document ids may be
- // outdated.
+ // deleted data won't be copied. During the process, document/namespace ids
+ // will be reassigned so any files / classes that are based on old
+ // document/namespace ids may be outdated.
+ //
+ // stats will be set if non-null.
//
// NOTE: The tasks in this method are too expensive to be executed in
// real-time. The caller should decide how frequently and when to call this
// method based on device usage.
//
// Returns:
- // OK on success
+ // OptimizeResult which contains a vector mapping from old document id to
+ // new document id and another vector mapping from old namespace id to new
+ // namespace id, on success
// INVALID_ARGUMENT if new_directory is same as current base directory
// INTERNAL_ERROR on IO error
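+  //
+  // Illustrative sketch of handling the result (see OptimizeResult above):
+  //
+  //   ICING_ASSIGN_OR_RETURN(
+  //       OptimizeResult result,
+  //       doc_store->OptimizeInto(new_directory, lang_segmenter));
+  //   if (result.should_rebuild_index) {
+  //     // Rebuild the index from scratch.
+  //   } else {
+  //     // Remap ids using result.document_id_old_to_new and
+  //     // result.namespace_id_old_to_new.
+  //   }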
- libtextclassifier3::Status OptimizeInto(const std::string& new_directory);
+ libtextclassifier3::StatusOr<OptimizeResult> OptimizeInto(
+ const std::string& new_directory, const LanguageSegmenter* lang_segmenter,
+ OptimizeStatsProto* stats = nullptr) const;
// Calculates status for a potential Optimize call. Includes how many docs
// there are vs how many would be optimized away. And also includes an
@@ -309,10 +511,25 @@ class DocumentStore {
// INTERNAL_ERROR on compute error
libtextclassifier3::StatusOr<Crc32> ComputeChecksum() const;
+  // Gets debug information for the document store.
+  // verbosity <= 0: returns the simplest debug information.
+  // verbosity > 0: also returns the total number of documents and tokens in
+  // each (namespace, schema type) pair.
+ //
+ // Returns:
+ // DocumentDebugInfoProto on success
+ // INTERNAL_ERROR on IO errors, crc compute error
+ libtextclassifier3::StatusOr<DocumentDebugInfoProto> GetDebugInfo(
+ int verbosity) const;
+
private:
// Use DocumentStore::Create() to instantiate.
- DocumentStore(const Filesystem* filesystem, std::string_view base_dir,
- const Clock* clock, const SchemaStore* schema_store);
+ explicit DocumentStore(const Filesystem* filesystem,
+ std::string_view base_dir, const Clock* clock,
+ const SchemaStore* schema_store,
+ bool namespace_id_fingerprint, bool pre_mapping_fbv,
+ bool use_persistent_hash_map,
+ int32_t compression_level);
const Filesystem* const filesystem_;
const std::string base_dir_;
@@ -325,20 +542,39 @@ class DocumentStore {
// Used to validate incoming documents
DocumentValidator document_validator_;
+ // Whether to use namespace id or namespace name to build up fingerprint for
+ // document_key_mapper_ and corpus_mapper_.
+ bool namespace_id_fingerprint_;
+
+ // Flag indicating whether memory map max possible file size for underlying
+ // FileBackedVector before growing the actual file size.
+ bool pre_mapping_fbv_;
+
+ // Flag indicating whether use persistent hash map as the key mapper (if
+ // false, then fall back to dynamic trie key mapper). Note: we only use
+ // persistent hash map for uri mapper if it is true.
+ bool use_persistent_hash_map_;
+
+ const int32_t compression_level_;
+
// A log used to store all documents, it serves as a ground truth of doc
// store. key_mapper_ and document_id_mapper_ can be regenerated from it.
- std::unique_ptr<FileBackedProtoLog<DocumentWrapper>> document_log_;
+ std::unique_ptr<PortableFileBackedProtoLog<DocumentWrapper>> document_log_;
// Key (namespace + uri) to DocumentId mapping
- std::unique_ptr<KeyMapper<DocumentId>> document_key_mapper_;
+ std::unique_ptr<
+ KeyMapper<DocumentId, fingerprint_util::FingerprintStringFormatter>>
+ document_key_mapper_;
// DocumentId to file offset mapping
std::unique_ptr<FileBackedVector<int64_t>> document_id_mapper_;
// A cache of document associated scores. The ground truth of the scores is
// DocumentProto stored in document_log_. This cache contains:
+ // - CorpusId
// - Document score
// - Document creation timestamp in seconds
+ // - Document length in number of tokens
std::unique_ptr<FileBackedVector<DocumentAssociatedScoreData>> score_cache_;
// A cache of data, indexed by DocumentId, used to filter documents. Currently
@@ -348,11 +584,31 @@ class DocumentStore {
// - Expiration timestamp in seconds
std::unique_ptr<FileBackedVector<DocumentFilterData>> filter_cache_;
+ // A cache of corpus associated scores. The ground truth of the scores is
+ // DocumentProto stored in document_log_. This cache contains:
+ // - Number of documents belonging to the corpus score
+ // - The sum of the documents' lengths, in number of tokens.
+ std::unique_ptr<FileBackedVector<CorpusAssociatedScoreData>>
+ corpus_score_cache_;
+
// Maps namespaces to a densely-assigned unique id. Namespaces are assigned an
// id when the first document belonging to that namespace is added to the
// DocumentStore. Namespaces may be removed from the mapper during compaction.
std::unique_ptr<KeyMapper<NamespaceId>> namespace_mapper_;
+ // Maps a corpus, i.e. a (namespace, schema type) pair, to a densely-assigned
+  // unique id. A corpus is assigned an id when the first document belonging
+  // to that corpus is added to the DocumentStore. Corpus ids may be removed
+  // from the mapper during compaction.
+ std::unique_ptr<
+ KeyMapper<CorpusId, fingerprint_util::FingerprintStringFormatter>>
+ corpus_mapper_;
+
+ // A storage class that caches all usage scores. Usage scores are not
+ // considered as ground truth. Usage scores are associated with document ids
+ // so they need to be updated when document ids change.
+ std::unique_ptr<UsageStore> usage_store_;
+
// Used internally to indicate whether the class has been initialized. This is
// to guard against cases where the object has been created, but Initialize
// fails in the constructor. If we have successfully exited the constructor,
@@ -360,16 +616,31 @@ class DocumentStore {
// worry about this field.
bool initialized_ = false;
- libtextclassifier3::Status Initialize();
+ struct InitializeResult {
+ DataLoss data_loss;
+
+ // A boolean flag indicating if derived files of the document store have
+ // been regenerated or not. This is usually a signal for callers to detect
+ // if any id assignment has changed (e.g. NamespaceId).
+ bool derived_files_regenerated;
+ };
+ libtextclassifier3::StatusOr<InitializeResult> Initialize(
+ bool force_recovery_and_revalidate_documents,
+ InitializeStatsProto* initialize_stats);
// Creates sub-components and verifies the integrity of each sub-component.
+  // This assumes that the underlying files already exist, and will return
+ // an error if it doesn't find what it's expecting.
//
// Returns an error if subcomponents failed to initialize successfully.
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status InitializeDerivedFiles();
+ libtextclassifier3::Status InitializeExistingDerivedFiles();
// Re-generates all files derived from the ground truth: the document log.
//
+  // revalidate_documents=true will also cause each document to be revalidated
+  // against the schema as it is read out of the document log.
+ //
// NOTE: if this function fails, the only thing we can do is to retry it until
// it succeeds or prevent the initialization of a DocumentStore. The
// DocumentStore object wouldn't work reliably if this fails.
@@ -380,7 +651,7 @@ class DocumentStore {
// document_id
// mapper.
// 3. Create header and store the updated combined checksum
- libtextclassifier3::Status RegenerateDerivedFiles();
+ libtextclassifier3::Status RegenerateDerivedFiles(bool revalidate_documents);
// Resets the unique_ptr to the document_key_mapper, deletes the underlying
// file, and re-creates a new instance of the document_key_mapper .
@@ -400,6 +671,12 @@ class DocumentStore {
// Returns OK or any IO errors.
libtextclassifier3::Status ResetDocumentAssociatedScoreCache();
+ // Resets the unique_ptr to the corpus_score_cache, deletes the underlying
+ // file, and re-creates a new instance of the corpus_score_cache.
+ //
+ // Returns OK or any IO errors.
+ libtextclassifier3::Status ResetCorpusAssociatedScoreCache();
+
// Resets the unique_ptr to the filter_cache, deletes the underlying file, and
// re-creates a new instance of the filter_cache.
//
@@ -412,82 +689,137 @@ class DocumentStore {
// Returns OK or any IO errors.
libtextclassifier3::Status ResetNamespaceMapper();
+ // Resets the unique_ptr to the corpus_mapper, deletes the underlying file,
+ // and re-creates a new instance of the corpus_mapper.
+ //
+ // Returns OK or any IO errors.
+ libtextclassifier3::Status ResetCorpusMapper();
+
// Checks if the header exists already. This does not create the header file
// if it doesn't exist.
bool HeaderExists();
- // Update and replace the header file. Creates the header file if it doesn't
- // exist.
+ // Update, replace and persist the header file. Creates the header file if it
+ // doesn't exist.
//
// Returns:
// OK on success
// INTERNAL on I/O error
libtextclassifier3::Status UpdateHeader(const Crc32& checksum);
- // Update derived files that `name_space` has been deleted. This is primarily
- // useful if we're trying to update derived files when we've already seen a
- // namespace tombstone, and don't need to write another tombstone.
- //
- // NOTE: Space is not reclaimed in the derived files until Optimize() is
- // called.
- //
- // Returns:
- // bool on whether an existing document was actually updated to be deleted
- // INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<bool> UpdateDerivedFilesNamespaceDeleted(
- std::string_view name_space);
+ libtextclassifier3::StatusOr<DocumentId> InternalPut(
+ DocumentProto&& document,
+ PutDocumentStatsProto* put_document_stats = nullptr);
- // Update derived files that the schema type schema_type_id has been deleted.
- // This is primarily useful if we're trying to update derived files when we've
- // already seen a schema type tombstone, and don't need to write another
- // tombstone.
+ // Helper function to do batch deletes. Documents with the given
+ // "namespace_id" and "schema_type_id" will be deleted. If callers don't need
+ // to specify the namespace or schema type, pass in kInvalidNamespaceId or
+ // kInvalidSchemaTypeId. The document protos with their derived data will be
+ // erased / cleared immediately.
//
// NOTE: Space is not reclaimed in the derived files until Optimize() is
// called.
//
// Returns:
- // OK on success
+ // Number of documents that were actually updated to be deleted
// INTERNAL_ERROR on IO error
- libtextclassifier3::Status UpdateDerivedFilesSchemaTypeDeleted(
- SchemaTypeId schema_type_id);
+ libtextclassifier3::StatusOr<int> BatchDelete(NamespaceId namespace_id,
+ SchemaTypeId schema_type_id);
- // Helper method to find a DocumentId that is associated with the given
- // namespace and uri.
+ // Returns the CorpusAssociatedScoreData of the corpus specified by the
+ // corpus_id.
//
- // NOTE: The DocumentId may refer to a invalid document (deleted
- // or expired). Callers can call DoesDocumentExist(document_id) to ensure it
- // refers to a valid Document.
+ // If the corpus_id has never been seen before, it returns a
+ // CorpusAssociatedScoreData with properties set to default values.
+ //
+ // NOTE: This does not check if the corpus exists and will return the
+ // CorpusAssociatedScoreData of the corpus even if all documents belonging to
+ // that corpus have been deleted.
//
// Returns:
- // A DocumentId on success
- // NOT_FOUND if the key doesn't exist
- // INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<DocumentId> GetDocumentId(
- std::string_view name_space, std::string_view uri) const;
+ // CorpusAssociatedScoreData on success
+ libtextclassifier3::StatusOr<CorpusAssociatedScoreData>
+ GetCorpusAssociatedScoreDataToUpdate(CorpusId corpus_id) const;
- // Helper method to validate the document id and return the file offset of the
- // associated document in document_log_.
- //
- // This can be a more informative call than just DoesDocumentExist because it
- // can return more status errors on whether the Document actually doesn't
- // exist or if there was an internal error while accessing files.
+ // Check if a document exists. Existence means it hasn't been deleted and it
+ // hasn't expired yet.
//
// Returns:
- // The file offset on success
+ // OK if the document exists
// INVALID_ARGUMENT if document_id is less than 0 or greater than the
// maximum value
// NOT_FOUND if the document doesn't exist (i.e. deleted or expired)
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<int64_t> DoesDocumentExistAndGetFileOffset(
+ libtextclassifier3::Status DoesDocumentExistWithStatus(
DocumentId document_id) const;
+  // Checks if a document has been deleted.
+ //
+ // This is for internal-use only because we assume that the document_id is
+ // already valid. If you're unsure if the document_id is valid, use
+ // DoesDocumentExist(document_id) instead, which will perform those additional
+ // checks.
+ bool IsDeleted(DocumentId document_id) const;
+
+ // Checks if a document has expired.
+ //
+ // This is for internal-use only because we assume that the document_id is
+ // already valid. If you're unsure if the document_id is valid, use
+ // DoesDocumentExist(document_id) instead, which will perform those additional
+ // checks.
+  //
+  // Returns:
+  //   A DocumentFilterData if the given document isn't expired.
+  //   std::nullopt if the given document is expired.
+ std::optional<DocumentFilterData> GetNonExpiredDocumentFilterData(
+ DocumentId document_id, int64_t current_time_ms) const;
+
// Updates the entry in the score cache for document_id.
libtextclassifier3::Status UpdateDocumentAssociatedScoreCache(
DocumentId document_id, const DocumentAssociatedScoreData& score_data);
+ // Updates the entry in the corpus score cache for corpus_id.
+ libtextclassifier3::Status UpdateCorpusAssociatedScoreCache(
+ CorpusId corpus_id, const CorpusAssociatedScoreData& score_data);
+
// Updates the entry in the filter cache for document_id.
libtextclassifier3::Status UpdateFilterCache(
DocumentId document_id, const DocumentFilterData& filter_data);
+
+ // Helper method to clear the derived data of a document
+ libtextclassifier3::Status ClearDerivedData(DocumentId document_id);
+
+ // Sets usage scores for the given document.
+ libtextclassifier3::Status SetUsageScores(
+ DocumentId document_id, const UsageStore::UsageScores& usage_scores);
+
+ // Returns:
+ // - on success, a DocumentStorageInfoProto with the fields relating to the
+ // size of Document Store member variables populated.
+ // - INTERNAL on failure to get file size
+ DocumentStorageInfoProto GetMemberStorageInfo() const;
+
+ // Returns:
+ // - on success, the storage_info that was passed in but with the number of
+ // alive, deleted and expired documents also set.
+ // - OUT_OF_RANGE, this should never happen. This could only be returned if
+ // the document_id_mapper somehow became larger than the filter cache.
+ DocumentStorageInfoProto CalculateDocumentStatusCounts(
+ DocumentStorageInfoProto storage_info) const;
+
+ // Returns:
+ // - on success, a RepeatedPtrField for CorpusInfo collected.
+ // - OUT_OF_RANGE, this should never happen.
+ libtextclassifier3::StatusOr<
+ google::protobuf::RepeatedPtrField<DocumentDebugInfoProto::CorpusInfo>>
+ CollectCorpusInfo() const;
+
+ // Build fingerprint for the keys of document_key_mapper_ and corpus_mapper_.
+ // Note that namespace_id_fingerprint_ controls the way that a fingerprint is
+ // built.
+ std::string MakeFingerprint(NamespaceId namespace_id,
+ std::string_view namespace_,
+ std::string_view uri_or_schema) const;
};
} // namespace lib
diff --git a/icing/store/document-store_benchmark.cc b/icing/store/document-store_benchmark.cc
new file mode 100644
index 0000000..46d76d8
--- /dev/null
+++ b/icing/store/document-store_benchmark.cc
@@ -0,0 +1,342 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <unistd.h>
+
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <ostream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <unordered_set>
+#include <vector>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/persist.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/schema-store.h"
+#include "icing/store/document-store.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/util/clock.h"
+
+// Run on a Linux workstation:
+// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/store:document-store_benchmark
+//
+// $ blaze-bin/icing/store/document-store_benchmark
+// --benchmark_filter=all --benchmark_memory_usage
+//
+// Run on an Android device:
+// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
+// --config=android_arm64 -c opt --dynamic_mode=off --copt=-gmlt
+// //icing/store:document-store_benchmark
+//
+// $ adb push blaze-bin/icing/store/document-store_benchmark
+// /data/local/tmp/
+//
+// $ adb shell /data/local/tmp/document-store_benchmark
+// --benchmark_filter=all
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+class DestructibleDirectory {
+ public:
+ explicit DestructibleDirectory(const Filesystem& filesystem,
+ const std::string& dir)
+ : filesystem_(filesystem), dir_(dir) {
+ filesystem_.CreateDirectoryRecursively(dir_.c_str());
+ }
+ ~DestructibleDirectory() {
+ filesystem_.DeleteDirectoryRecursively(dir_.c_str());
+ }
+
+ private:
+ Filesystem filesystem_;
+ std::string dir_;
+};
+
+DocumentProto CreateDocument(const std::string namespace_,
+ const std::string uri) {
+ return DocumentBuilder()
+ .SetKey(namespace_, uri)
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "body bar")
+ .Build();
+}
+
+SchemaProto CreateSchema() {
+ return SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+}
+
+std::unique_ptr<SchemaStore> CreateSchemaStore(Filesystem filesystem,
+ const std::string directory,
+ const Clock* clock) {
+ const std::string schema_store_dir = directory + "/schema";
+ filesystem.CreateDirectoryRecursively(schema_store_dir.data());
+ std::unique_ptr<SchemaStore> schema_store =
+ SchemaStore::Create(&filesystem, schema_store_dir, clock).ValueOrDie();
+
+ auto set_schema_status = schema_store->SetSchema(
+ CreateSchema(), /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false);
+ if (!set_schema_status.ok()) {
+ ICING_LOG(ERROR) << set_schema_status.status().error_message();
+ }
+
+ return schema_store;
+}
+
+libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore(
+ const Filesystem* filesystem, const std::string& base_dir,
+ const Clock* clock, const SchemaStore* schema_store) {
+ return DocumentStore::Create(
+ filesystem, base_dir, clock, schema_store,
+ /*force_recovery_and_revalidate_documents=*/false,
+ /*namespace_id_fingerprint=*/false, /*pre_mapping_fbv=*/false,
+ /*use_persistent_hash_map=*/false,
+ PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr);
+}
+
+void BM_DoesDocumentExistBenchmark(benchmark::State& state) {
+ Filesystem filesystem;
+ Clock clock;
+
+ std::string directory = GetTestTempDir() + "/icing";
+ DestructibleDirectory ddir(filesystem, directory);
+
+ std::string document_store_dir = directory + "/store";
+ std::unique_ptr<SchemaStore> schema_store =
+ CreateSchemaStore(filesystem, directory, &clock);
+
+ filesystem.CreateDirectoryRecursively(document_store_dir.data());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ int max_document_id = 300000;
+ for (int i = 0; i < max_document_id; ++i) {
+ // Put and delete a lot of documents to fill up our derived files with
+ // stuff.
+ ICING_ASSERT_OK(document_store->Put(
+ CreateDocument("namespace", /*uri=*/std::to_string(i))));
+ ICING_ASSERT_OK(document_store->Delete("namespace",
+ /*uri=*/std::to_string(i),
+ clock.GetSystemTimeMilliseconds()));
+ }
+
+ std::default_random_engine random;
+ std::uniform_int_distribution<> dist(1, max_document_id);
+ for (auto s : state) {
+    // Check random document ids to see if they exist, hopefully simulating
+    // page faults in different sections of our mmapped derived files.
+ int document_id = dist(random);
+ benchmark::DoNotOptimize(document_store->GetAliveDocumentFilterData(
+ document_id, clock.GetSystemTimeMilliseconds()));
+ }
+}
+BENCHMARK(BM_DoesDocumentExistBenchmark);
+
+void BM_Put(benchmark::State& state) {
+ Filesystem filesystem;
+ Clock clock;
+
+ std::string directory = GetTestTempDir() + "/icing";
+ DestructibleDirectory ddir(filesystem, directory);
+
+ std::string document_store_dir = directory + "/store";
+ std::unique_ptr<SchemaStore> schema_store =
+ CreateSchemaStore(filesystem, directory, &clock);
+
+ filesystem.CreateDirectoryRecursively(document_store_dir.data());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document = CreateDocument("namespace", "uri");
+
+ for (auto s : state) {
+ // It's ok that this is the same document over and over. We'll create a new
+ // document_id for it and still insert the proto into the underlying log.
+ benchmark::DoNotOptimize(document_store->Put(document));
+ }
+}
+BENCHMARK(BM_Put);
+
+void BM_GetSameDocument(benchmark::State& state) {
+ Filesystem filesystem;
+ Clock clock;
+
+ std::string directory = GetTestTempDir() + "/icing";
+ DestructibleDirectory ddir(filesystem, directory);
+
+ std::string document_store_dir = directory + "/store";
+ std::unique_ptr<SchemaStore> schema_store =
+ CreateSchemaStore(filesystem, directory, &clock);
+
+ filesystem.CreateDirectoryRecursively(document_store_dir.data());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK(document_store->Put(CreateDocument("namespace", "uri")));
+
+ for (auto s : state) {
+ benchmark::DoNotOptimize(document_store->Get("namespace", "uri"));
+ }
+}
+BENCHMARK(BM_GetSameDocument);
+
+void BM_Delete(benchmark::State& state) {
+ Filesystem filesystem;
+ Clock clock;
+
+ std::string directory = GetTestTempDir() + "/icing";
+ DestructibleDirectory ddir(filesystem, directory);
+
+ std::string document_store_dir = directory + "/store";
+ std::unique_ptr<SchemaStore> schema_store =
+ CreateSchemaStore(filesystem, directory, &clock);
+
+ filesystem.CreateDirectoryRecursively(document_store_dir.data());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document = CreateDocument("namespace", "uri");
+
+ for (auto s : state) {
+ state.PauseTiming();
+ ICING_ASSERT_OK(document_store->Put(document));
+ state.ResumeTiming();
+
+ benchmark::DoNotOptimize(document_store->Delete(
+ "namespace", "uri", clock.GetSystemTimeMilliseconds()));
+ }
+}
+BENCHMARK(BM_Delete);
+
+void BM_Create(benchmark::State& state) {
+ Filesystem filesystem;
+ Clock clock;
+
+ std::string directory = GetTestTempDir() + "/icing";
+ std::string document_store_dir = directory + "/store";
+
+ std::unique_ptr<SchemaStore> schema_store =
+ CreateSchemaStore(filesystem, directory, &clock);
+
+ // Create an initial document store and put some data in.
+ {
+ DestructibleDirectory ddir(filesystem, directory);
+
+ filesystem.CreateDirectoryRecursively(document_store_dir.data());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document = CreateDocument("namespace", "uri");
+ ICING_ASSERT_OK(document_store->Put(document));
+ ICING_ASSERT_OK(document_store->PersistToDisk(PersistType::FULL));
+ }
+
+ // Recreating it with some content to checksum over.
+ DestructibleDirectory ddir(filesystem, directory);
+
+ filesystem.CreateDirectoryRecursively(document_store_dir.data());
+
+ for (auto s : state) {
+ benchmark::DoNotOptimize(CreateDocumentStore(
+ &filesystem, document_store_dir, &clock, schema_store.get()));
+ }
+}
+BENCHMARK(BM_Create);
+
+void BM_ComputeChecksum(benchmark::State& state) {
+ Filesystem filesystem;
+ Clock clock;
+
+ std::string directory = GetTestTempDir() + "/icing";
+ DestructibleDirectory ddir(filesystem, directory);
+
+ std::string document_store_dir = directory + "/store";
+ std::unique_ptr<SchemaStore> schema_store =
+ CreateSchemaStore(filesystem, directory, &clock);
+
+ filesystem.CreateDirectoryRecursively(document_store_dir.data());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem, document_store_dir, &clock,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document = CreateDocument("namespace", "uri");
+ ICING_ASSERT_OK(document_store->Put(document));
+ ICING_ASSERT_OK(document_store->PersistToDisk(PersistType::LITE));
+
+ for (auto s : state) {
+ benchmark::DoNotOptimize(document_store->ComputeChecksum());
+ }
+}
+BENCHMARK(BM_ComputeChecksum);
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/store/document-store_test.cc b/icing/store/document-store_test.cc
index ad56b9a..2d4cd99 100644
--- a/icing/store/document-store_test.cc
+++ b/icing/store/document-store_test.cc
@@ -15,10 +15,13 @@
#include "icing/store/document-store.h"
#include <cstdint>
+#include <filesystem>
#include <limits>
#include <memory>
+#include <optional>
#include <string>
+#include "icing/text_classifier/lib3/utils/base/status.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
@@ -28,23 +31,44 @@
#include "icing/file/memory-mapped-file.h"
#include "icing/file/mock-filesystem.h"
#include "icing/portable/equals-proto.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/debug.pb.h"
#include "icing/proto/document.pb.h"
+#include "icing/proto/document_wrapper.pb.h"
+#include "icing/proto/logging.pb.h"
#include "icing/proto/schema.pb.h"
+#include "icing/proto/storage.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/schema-builder.h"
#include "icing/schema/schema-store.h"
+#include "icing/store/corpus-associated-scoring-data.h"
+#include "icing/store/corpus-id.h"
#include "icing/store/document-filter-data.h"
#include "icing/store/document-id.h"
+#include "icing/store/document-log-creator.h"
+#include "icing/store/namespace-fingerprint-identifier.h"
#include "icing/store/namespace-id.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
#include "icing/util/crc32.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
+namespace {
+
using ::icing::lib::portable_equals_proto::EqualsProto;
using ::testing::_;
+using ::testing::ElementsAre;
using ::testing::Eq;
+using ::testing::Ge;
using ::testing::Gt;
using ::testing::HasSubstr;
using ::testing::IsEmpty;
@@ -54,15 +78,70 @@ using ::testing::Not;
using ::testing::Return;
using ::testing::UnorderedElementsAre;
-class DocumentStoreTest : public ::testing::Test {
+const NamespaceStorageInfoProto& GetNamespaceStorageInfo(
+ const DocumentStorageInfoProto& storage_info,
+ const std::string& name_space) {
+ for (const NamespaceStorageInfoProto& namespace_storage_info :
+ storage_info.namespace_storage_info()) {
+ if (namespace_storage_info.namespace_() == name_space) {
+ return namespace_storage_info;
+ }
+ }
+ // Didn't find our namespace, fail the test.
+ EXPECT_TRUE(false) << "Failed to find namespace '" << name_space
+ << "' in DocumentStorageInfoProto.";
+ static const auto& default_namespace_storage_info =
+ *new NamespaceStorageInfoProto();
+ return default_namespace_storage_info;
+}
+
+UsageReport CreateUsageReport(std::string name_space, std::string uri,
+ int64_t timestamp_ms,
+ UsageReport::UsageType usage_type) {
+ UsageReport usage_report;
+ usage_report.set_document_namespace(name_space);
+ usage_report.set_document_uri(uri);
+ usage_report.set_usage_timestamp_ms(timestamp_ms);
+ usage_report.set_usage_type(usage_type);
+ return usage_report;
+}
+
+PortableFileBackedProtoLog<DocumentWrapper>::Header ReadDocumentLogHeader(
+ Filesystem filesystem, const std::string& file_path) {
+ PortableFileBackedProtoLog<DocumentWrapper>::Header header;
+ filesystem.PRead(file_path.c_str(), &header,
+ sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header),
+ /*offset=*/0);
+ return header;
+}
+
+void WriteDocumentLogHeader(
+ Filesystem filesystem, const std::string& file_path,
+ PortableFileBackedProtoLog<DocumentWrapper>::Header& header) {
+ filesystem.Write(file_path.c_str(), &header,
+ sizeof(PortableFileBackedProtoLog<DocumentWrapper>::Header));
+}
+
+struct DocumentStoreTestParam {
+ bool namespace_id_fingerprint;
+ bool pre_mapping_fbv;
+ bool use_persistent_hash_map;
+
+ explicit DocumentStoreTestParam(bool namespace_id_fingerprint_in,
+ bool pre_mapping_fbv_in,
+ bool use_persistent_hash_map_in)
+ : namespace_id_fingerprint(namespace_id_fingerprint_in),
+ pre_mapping_fbv(pre_mapping_fbv_in),
+ use_persistent_hash_map(use_persistent_hash_map_in) {}
+};
+
+class DocumentStoreTest
+ : public ::testing::TestWithParam<DocumentStoreTestParam> {
protected:
DocumentStoreTest()
: test_dir_(GetTestTempDir() + "/icing"),
document_store_dir_(test_dir_ + "/document_store"),
schema_store_dir_(test_dir_ + "/schema_store") {
- filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
- filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
- filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
test_document1_ =
DocumentBuilder()
.SetKey("icing", "email/1")
@@ -88,37 +167,86 @@ class DocumentStoreTest : public ::testing::Test {
}
void SetUp() override {
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
-
- auto subject = type_config->add_properties();
- subject->set_property_name("subject");
- subject->set_data_type(PropertyConfigProto::DataType::STRING);
- subject->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- subject->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- subject->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
-
- auto body = type_config->add_properties();
- body->set_property_name("body");
- body->set_data_type(PropertyConfigProto::DataType::STRING);
- body->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- body->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- body->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ // If we've specified using the reverse-JNI method for segmentation (i.e.
+ // not ICU), then we won't have the ICU data file included to set up.
+ // Technically, we could choose to use reverse-JNI for segmentation AND
+ // include an ICU data file, but that seems unlikely and our current BUILD
+ // setup doesn't do this.
+ // File generated via icu_data_file rule in //icing/BUILD.
+ std::string icu_data_file_path =
+ GetTestFilePath("icing/icu.dat");
+ ICING_ASSERT_OK(
+ icu_data_file_helper::SetUpICUDataFile(icu_data_file_path));
+ }
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
ICING_ASSERT_OK_AND_ASSIGN(
- schema_store_, SchemaStore::Create(&filesystem_, schema_store_dir_));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(segmenter_options)));
}
void TearDown() override {
+ lang_segmenter_.reset();
+ schema_store_.reset();
filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
}
+ void CorruptDocStoreHeaderChecksumFile() {
+ // Change the DocStore's header combined checksum so that it won't match the
+ // recalculated checksum on initialization. This will force a regeneration
+ // of derived files from ground truth.
+ const std::string header_file =
+ absl_ports::StrCat(document_store_dir_, "/document_store_header");
+ DocumentStore::Header header;
+ header.magic = DocumentStore::Header::GetCurrentMagic(
+ GetParam().namespace_id_fingerprint);
+ header.checksum = 10; // Arbitrary garbage checksum
+ filesystem_.DeleteFile(header_file.c_str());
+ filesystem_.Write(header_file.c_str(), &header, sizeof(header));
+ }
+
+ libtextclassifier3::StatusOr<DocumentStore::CreateResult> CreateDocumentStore(
+ const Filesystem* filesystem, const std::string& base_dir,
+ const Clock* clock, const SchemaStore* schema_store) {
+ return DocumentStore::Create(
+ filesystem, base_dir, clock, schema_store,
+ /*force_recovery_and_revalidate_documents=*/false,
+ GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv,
+ GetParam().use_persistent_hash_map,
+ PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr);
+ }
+
const Filesystem filesystem_;
const std::string test_dir_;
FakeClock fake_clock_;
@@ -127,6 +255,7 @@ class DocumentStoreTest : public ::testing::Test {
DocumentProto test_document1_;
DocumentProto test_document2_;
std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
// Document1 values
const int document1_score_ = 1;
@@ -142,34 +271,36 @@ class DocumentStoreTest : public ::testing::Test {
const int64_t document2_expiration_timestamp_ = 3; // creation + ttl
};
-TEST_F(DocumentStoreTest, CreationWithNullPointerShouldFail) {
- EXPECT_THAT(DocumentStore::Create(/*filesystem=*/nullptr, document_store_dir_,
- &fake_clock_, schema_store_.get()),
+TEST_P(DocumentStoreTest, CreationWithNullPointerShouldFail) {
+ EXPECT_THAT(CreateDocumentStore(/*filesystem=*/nullptr, document_store_dir_,
+ &fake_clock_, schema_store_.get()),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
- EXPECT_THAT(DocumentStore::Create(&filesystem_, document_store_dir_,
- /*clock=*/nullptr, schema_store_.get()),
+ EXPECT_THAT(CreateDocumentStore(&filesystem_, document_store_dir_,
+ /*clock=*/nullptr, schema_store_.get()),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
- EXPECT_THAT(DocumentStore::Create(&filesystem_, document_store_dir_,
- &fake_clock_, /*schema_store=*/nullptr),
+ EXPECT_THAT(CreateDocumentStore(&filesystem_, document_store_dir_,
+ &fake_clock_, /*schema_store=*/nullptr),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
-TEST_F(DocumentStoreTest, CreationWithBadFilesystemShouldFail) {
+TEST_P(DocumentStoreTest, CreationWithBadFilesystemShouldFail) {
MockFilesystem mock_filesystem;
ON_CALL(mock_filesystem, OpenForWrite(_)).WillByDefault(Return(false));
- EXPECT_THAT(DocumentStore::Create(&mock_filesystem, document_store_dir_,
- &fake_clock_, schema_store_.get()),
+ EXPECT_THAT(CreateDocumentStore(&mock_filesystem, document_store_dir_,
+ &fake_clock_, schema_store_.get()),
StatusIs(libtextclassifier3::StatusCode::INTERNAL));
}
-TEST_F(DocumentStoreTest, PutAndGetInSameNamespaceOk) {
+TEST_P(DocumentStoreTest, PutAndGetInSameNamespaceOk) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
// Both documents have namespace of "icing"
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
@@ -183,11 +314,13 @@ TEST_F(DocumentStoreTest, PutAndGetInSameNamespaceOk) {
IsOkAndHolds(EqualsProto(test_document2_)));
}
-TEST_F(DocumentStoreTest, PutAndGetAcrossNamespacesOk) {
+TEST_P(DocumentStoreTest, PutAndGetAcrossNamespacesOk) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
// Can handle different namespaces with same url
DocumentProto foo_document = DocumentBuilder()
@@ -214,11 +347,13 @@ TEST_F(DocumentStoreTest, PutAndGetAcrossNamespacesOk) {
// Validates that putting an document with the same key will overwrite previous
// document and old doc ids are not getting reused.
-TEST_F(DocumentStoreTest, PutSameKey) {
+TEST_P(DocumentStoreTest, PutSameKey) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
// Creates two documents with the same key (namespace + uri)
DocumentProto document1 = DocumentProto(test_document1_);
@@ -241,53 +376,64 @@ TEST_F(DocumentStoreTest, PutSameKey) {
EXPECT_THAT(doc_store->Put(document3), IsOkAndHolds(Not(document_id1)));
}
-TEST_F(DocumentStoreTest, IsDocumentExisting) {
+TEST_P(DocumentStoreTest, IsDocumentExistingWithoutStatus) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
doc_store->Put(DocumentProto(test_document1_)));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
doc_store->Put(DocumentProto(test_document2_)));
- EXPECT_THAT(doc_store->DoesDocumentExist(document_id1), IsTrue());
- EXPECT_THAT(doc_store->DoesDocumentExist(document_id2), IsTrue());
+ EXPECT_TRUE(doc_store->GetAliveDocumentFilterData(
+ document_id1, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_TRUE(doc_store->GetAliveDocumentFilterData(
+ document_id2, fake_clock_.GetSystemTimeMilliseconds()));
DocumentId invalid_document_id_negative = -1;
- EXPECT_THAT(doc_store->DoesDocumentExist(invalid_document_id_negative),
- IsFalse());
+ EXPECT_FALSE(doc_store->GetAliveDocumentFilterData(
+ invalid_document_id_negative, fake_clock_.GetSystemTimeMilliseconds()));
DocumentId invalid_document_id_greater_than_max = kMaxDocumentId + 2;
- EXPECT_THAT(
- doc_store->DoesDocumentExist(invalid_document_id_greater_than_max),
- IsFalse());
+ EXPECT_FALSE(doc_store->GetAliveDocumentFilterData(
+ invalid_document_id_greater_than_max,
+ fake_clock_.GetSystemTimeMilliseconds()));
- EXPECT_THAT(doc_store->DoesDocumentExist(kInvalidDocumentId), IsFalse());
+ EXPECT_FALSE(doc_store->GetAliveDocumentFilterData(
+ kInvalidDocumentId, fake_clock_.GetSystemTimeMilliseconds()));
DocumentId invalid_document_id_out_of_range = document_id2 + 1;
- EXPECT_THAT(doc_store->DoesDocumentExist(invalid_document_id_out_of_range),
- IsFalse());
+ EXPECT_FALSE(doc_store->GetAliveDocumentFilterData(
+ invalid_document_id_out_of_range,
+ fake_clock_.GetSystemTimeMilliseconds()));
}
-TEST_F(DocumentStoreTest, GetDeletedDocumentNotFound) {
+TEST_P(DocumentStoreTest, GetDeletedDocumentNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
ICING_EXPECT_OK(document_store->Put(DocumentProto(test_document1_)));
EXPECT_THAT(
document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
IsOkAndHolds(EqualsProto(test_document1_)));
- ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
- test_document1_.uri()));
+ ICING_EXPECT_OK(document_store->Delete(
+ test_document1_.namespace_(), test_document1_.uri(),
+ fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(
document_store->Get(test_document1_.namespace_(), test_document1_.uri()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, GetExpiredDocumentNotFound) {
+TEST_P(DocumentStoreTest, GetExpiredDocumentNotFound) {
DocumentProto document = DocumentBuilder()
.SetKey("namespace", "uri")
.SetSchema("email")
@@ -296,9 +442,12 @@ TEST_F(DocumentStoreTest, GetExpiredDocumentNotFound) {
.Build();
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
ICING_EXPECT_OK(document_store->Put(document));
EXPECT_THAT(document_store->Get("namespace", "uri"),
IsOkAndHolds(EqualsProto(document)));
@@ -319,11 +468,14 @@ TEST_F(DocumentStoreTest, GetExpiredDocumentNotFound) {
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, GetInvalidDocumentId) {
+TEST_P(DocumentStoreTest, GetInvalidDocumentId) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
doc_store->Put(DocumentProto(test_document1_)));
@@ -343,62 +495,90 @@ TEST_F(DocumentStoreTest, GetInvalidDocumentId) {
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteOk) {
+TEST_P(DocumentStoreTest, DeleteNonexistentDocumentNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
- // Get() after Delete() returns NOT_FOUND
- ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
- doc_store->Put(DocumentProto(test_document1_)));
- EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
- EXPECT_THAT(doc_store->Get(document_id),
+ // Validates that deleting a nonexistent document won't append anything to
+ // the ground truth.
+ int64_t document_log_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
+
+ EXPECT_THAT(document_store->Delete("nonexistent_namespace", "nonexistent_uri",
+ fake_clock_.GetSystemTimeMilliseconds()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ int64_t document_log_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
+ EXPECT_THAT(document_log_size_before, Eq(document_log_size_after));
}
-TEST_F(DocumentStoreTest, DeleteNonexistentDocumentNotFound) {
+TEST_P(DocumentStoreTest, DeleteNonexistentDocumentPrintableErrorMessage) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
// Validates that deleting a nonexistent document won't append anything to
// the ground truth.
- int64_t ground_truth_size_before = filesystem_.GetFileSize(
- absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
-
- EXPECT_THAT(
- document_store->Delete("nonexistent_namespace", "nonexistent_uri"),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ int64_t document_log_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
+
+ libtextclassifier3::Status status = document_store->Delete(
+ "android$contacts/", "661", fake_clock_.GetSystemTimeMilliseconds());
+ EXPECT_THAT(status, StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ for (char c : status.error_message()) {
+ EXPECT_THAT(std::isprint(static_cast<unsigned char>(c)), IsTrue());
+ }
- int64_t ground_truth_size_after = filesystem_.GetFileSize(
- absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
+ int64_t document_log_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
+ EXPECT_THAT(document_log_size_before, Eq(document_log_size_after));
}
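
// The loop above asserts that NOT_FOUND error messages stay printable even
// when the document key contains characters like '$' or '/'. A sketch of the
// same check as a reusable helper (hypothetical, not part of this change),
// assuming <cctype> and <string> are included; casting to unsigned char keeps
// std::isprint well-defined for all byte values.
bool IsPrintableMessage(const std::string& message) {
  for (char c : message) {
    if (!std::isprint(static_cast<unsigned char>(c))) {
      return false;
    }
  }
  return true;
}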
-TEST_F(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) {
+TEST_P(DocumentStoreTest, DeleteAlreadyDeletedDocumentNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
ICING_EXPECT_OK(document_store->Put(test_document1_));
// First time is OK
- ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
- test_document1_.uri()));
+ ICING_EXPECT_OK(document_store->Delete(
+ test_document1_.namespace_(), test_document1_.uri(),
+ fake_clock_.GetSystemTimeMilliseconds()));
// Deleting it again is NOT_FOUND
EXPECT_THAT(document_store->Delete(test_document1_.namespace_(),
- test_document1_.uri()),
+ test_document1_.uri(),
+ fake_clock_.GetSystemTimeMilliseconds()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceOk) {
+TEST_P(DocumentStoreTest, DeleteByNamespaceOk) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
DocumentProto document1 = test_document1_;
document1.set_namespace_("namespace.1");
@@ -422,7 +602,10 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceOk) {
// DELETE namespace.1. document1 and document4 should be deleted. document2
// and document3 should still be retrievable.
- ICING_EXPECT_OK(doc_store->DeleteByNamespace("namespace.1"));
+ DocumentStore::DeleteByGroupResult group_result =
+ doc_store->DeleteByNamespace("namespace.1");
+ EXPECT_THAT(group_result.status, IsOk());
+ EXPECT_THAT(group_result.num_docs_deleted, Eq(2));
EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(doc_store->Get(document2.namespace_(), document2.uri()),
@@ -433,42 +616,53 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceOk) {
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) {
+TEST_P(DocumentStoreTest, DeleteByNamespaceNonexistentNamespaceNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
// Validates that deleting a nonexistent document won't append anything to
// the ground truth.
- int64_t ground_truth_size_before = filesystem_.GetFileSize(
- absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ int64_t document_log_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
- EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace"),
+ EXPECT_THAT(doc_store->DeleteByNamespace("nonexistent_namespace").status,
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- int64_t ground_truth_size_after = filesystem_.GetFileSize(
- absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
+ int64_t document_log_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
+ EXPECT_THAT(document_log_size_before, Eq(document_log_size_after));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) {
+TEST_P(DocumentStoreTest, DeleteByNamespaceNoExistingDocumentsNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
ICING_EXPECT_OK(document_store->Put(test_document1_));
- ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
- test_document1_.uri()));
+ ICING_EXPECT_OK(document_store->Delete(
+ test_document1_.namespace_(), test_document1_.uri(),
+ fake_clock_.GetSystemTimeMilliseconds()));
// At this point, there are no existing documents with the namespace, even
// though Icing's derived files know about this namespace. We should still
// return NOT_FOUND since no existing document has this namespace.
- EXPECT_THAT(document_store->DeleteByNamespace(test_document1_.namespace_()),
- StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(
+ document_store->DeleteByNamespace(test_document1_.namespace_()).status,
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) {
+TEST_P(DocumentStoreTest, DeleteByNamespaceRecoversOk) {
DocumentProto document1 = test_document1_;
document1.set_namespace_("namespace.1");
document1.set_uri("uri1");
@@ -485,12 +679,15 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) {
document4.set_namespace_("namespace.1");
document4.set_uri("uri2");
- int64_t ground_truth_size_before;
+ int64_t document_log_size_before;
{
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
ICING_ASSERT_OK(doc_store->Put(document1));
ICING_ASSERT_OK(doc_store->Put(document2));
ICING_ASSERT_OK(doc_store->Put(document3));
@@ -498,33 +695,32 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) {
// DELETE namespace.1. document1 and document4 should be deleted. document2
// and document3 should still be retrievable.
- ICING_EXPECT_OK(doc_store->DeleteByNamespace("namespace.1"));
-
- ground_truth_size_before = filesystem_.GetFileSize(
- absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ DocumentStore::DeleteByGroupResult group_result =
+ doc_store->DeleteByNamespace("namespace.1");
+ EXPECT_THAT(group_result.status, IsOk());
+ EXPECT_THAT(group_result.num_docs_deleted, Eq(2));
+
+ document_log_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
} // Destructors should update checksum and persist all data to file.
- // Change the DocStore's header combined checksum so that it won't match the
- // recalculated checksum on initialization. This will force a regeneration of
- // derived files from ground truth.
- const std::string header_file =
- absl_ports::StrCat(document_store_dir_, "/document_store_header");
- DocumentStore::Header header;
- header.magic = DocumentStore::Header::kMagic;
- header.checksum = 10; // Arbitrary garbage checksum
- filesystem_.DeleteFile(header_file.c_str());
- filesystem_.Write(header_file.c_str(), &header, sizeof(header));
-
+ CorruptDocStoreHeaderChecksumFile();
// Successfully recover from a corrupt derived file issue.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
// Make sure we didn't add anything to the ground truth after we recovered.
- int64_t ground_truth_size_after = filesystem_.GetFileSize(
- absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- EXPECT_EQ(ground_truth_size_before, ground_truth_size_after);
+ int64_t document_log_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
+ EXPECT_EQ(document_log_size_before, document_log_size_after);
EXPECT_THAT(doc_store->Get(document1.namespace_(), document1.uri()),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
@@ -536,28 +732,31 @@ TEST_F(DocumentStoreTest, DeleteByNamespaceRecoversOk) {
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
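
// CorruptDocStoreHeaderChecksumFile() factors out the inline header
// corruption that the removed lines above performed. A sketch consistent
// with those removed lines (the real helper lives in the test fixture and
// may differ in detail):
void DocumentStoreTest::CorruptDocStoreHeaderChecksumFile() {
  // Overwrite the header with an arbitrary garbage checksum so that the
  // recalculated checksum won't match on initialization, forcing a
  // regeneration of derived files from the ground-truth document log.
  const std::string header_file =
      absl_ports::StrCat(document_store_dir_, "/document_store_header");
  DocumentStore::Header header;
  header.magic = DocumentStore::Header::kMagic;
  header.checksum = 10;  // Arbitrary garbage checksum
  filesystem_.DeleteFile(header_file.c_str());
  filesystem_.Write(header_file.c_str(), &header, sizeof(header));
}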
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) {
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
- type_config = schema.add_types();
- type_config->set_schema_type("message");
- type_config = schema.add_types();
- type_config->set_schema_type("person");
+TEST_P(DocumentStoreTest, DeleteBySchemaTypeOk) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .AddType(SchemaTypeConfigBuilder().SetType("person"))
+ .Build();
std::string schema_store_dir = schema_store_dir_ + "_custom";
filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir));
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
- ICING_ASSERT_OK(schema_store->SetSchema(schema));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
DocumentProto email_document_1 = DocumentBuilder()
.SetKey("namespace1", "1")
@@ -593,7 +792,10 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) {
// Delete the "email" type and ensure that it works across both
// email_document's namespaces. And that other documents aren't affected.
- ICING_EXPECT_OK(document_store->DeleteBySchemaType("email"));
+ DocumentStore::DeleteByGroupResult group_result =
+ document_store->DeleteBySchemaType("email");
+ EXPECT_THAT(group_result.status, IsOk());
+ EXPECT_THAT(group_result.num_docs_deleted, Eq(2));
EXPECT_THAT(document_store->Get(email_1_document_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(document_store->Get(email_2_document_id),
@@ -604,7 +806,9 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) {
IsOkAndHolds(EqualsProto(person_document)));
// Delete the "message" type and check that other documents aren't affected
- ICING_EXPECT_OK(document_store->DeleteBySchemaType("message"));
+ group_result = document_store->DeleteBySchemaType("message");
+ EXPECT_THAT(group_result.status, IsOk());
+ EXPECT_THAT(group_result.num_docs_deleted, Eq(1));
EXPECT_THAT(document_store->Get(email_1_document_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(document_store->Get(email_2_document_id),
@@ -615,56 +819,67 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeOk) {
IsOkAndHolds(EqualsProto(person_document)));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
+TEST_P(DocumentStoreTest, DeleteBySchemaTypeNonexistentSchemaTypeNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
// Validates that deleting a nonexistent document won't append anything to
// the ground truth.
- int64_t ground_truth_size_before = filesystem_.GetFileSize(
- absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ int64_t document_log_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
- EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type"),
+ EXPECT_THAT(document_store->DeleteBySchemaType("nonexistent_type").status,
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
- int64_t ground_truth_size_after = filesystem_.GetFileSize(
- absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ int64_t document_log_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
- EXPECT_THAT(ground_truth_size_before, Eq(ground_truth_size_after));
+ EXPECT_THAT(document_log_size_before, Eq(document_log_size_after));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsOk) {
+TEST_P(DocumentStoreTest, DeleteBySchemaTypeNoExistingDocumentsNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
ICING_EXPECT_OK(document_store->Put(test_document1_));
- ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
- test_document1_.uri()));
+ ICING_EXPECT_OK(document_store->Delete(
+ test_document1_.namespace_(), test_document1_.uri(),
+ fake_clock_.GetSystemTimeMilliseconds()));
- // At this point, there are no existing documents with the schema type, but we
- // still return OK because the SchemaStore is the ground truth on schemas and
- // knows about the type
- ICING_EXPECT_OK(document_store->DeleteBySchemaType(test_document1_.schema()));
+ EXPECT_THAT(
+ document_store->DeleteBySchemaType(test_document1_.schema()).status,
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) {
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
- type_config = schema.add_types();
- type_config->set_schema_type("message");
+TEST_P(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
std::string schema_store_dir = schema_store_dir_ + "_custom";
filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir));
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
- ICING_ASSERT_OK(schema_store->SetSchema(schema));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
DocumentId email_document_id;
DocumentId message_document_id;
@@ -680,12 +895,14 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) {
.SetSchema("message")
.SetCreationTimestampMs(1)
.Build();
- int64_t ground_truth_size_before;
+ int64_t document_log_size_before;
{
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
ICING_ASSERT_OK_AND_ASSIGN(email_document_id,
document_store->Put(email_document));
@@ -693,33 +910,32 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) {
document_store->Put(message_document));
// Delete "email". "message" documents should still be retrievable.
- ICING_EXPECT_OK(document_store->DeleteBySchemaType("email"));
-
- ground_truth_size_before = filesystem_.GetFileSize(
- absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ DocumentStore::DeleteByGroupResult group_result =
+ document_store->DeleteBySchemaType("email");
+ EXPECT_THAT(group_result.status, IsOk());
+ EXPECT_THAT(group_result.num_docs_deleted, Eq(1));
+
+ document_log_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
} // Destructors should update checksum and persist all data to file.
- // Change the DocumentStore's header combined checksum so that it won't match
- // the recalculated checksum on initialization. This will force a regeneration
- // of derived files from ground truth.
- const std::string header_file =
- absl_ports::StrCat(document_store_dir_, "/document_store_header");
- DocumentStore::Header header;
- header.magic = DocumentStore::Header::kMagic;
- header.checksum = 10; // Arbitrary garbage checksum
- filesystem_.DeleteFile(header_file.c_str());
- filesystem_.Write(header_file.c_str(), &header, sizeof(header));
-
+ CorruptDocStoreHeaderChecksumFile();
// Successfully recover from a corrupt derived file issue.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
// Make sure we didn't add anything to the ground truth after we recovered.
- int64_t ground_truth_size_after = filesystem_.GetFileSize(
- absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- EXPECT_EQ(ground_truth_size_before, ground_truth_size_after);
+ int64_t document_log_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
+ EXPECT_EQ(document_log_size_before, document_log_size_after);
EXPECT_THAT(document_store->Get(email_document_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
@@ -727,21 +943,37 @@ TEST_F(DocumentStoreTest, DeleteBySchemaTypeRecoversOk) {
IsOkAndHolds(EqualsProto(message_document)));
}
-TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) {
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
- type_config = schema.add_types();
- type_config->set_schema_type("message");
+TEST_P(DocumentStoreTest, PutDeleteThenPut) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+ ICING_EXPECT_OK(doc_store->Put(test_document1_));
+ ICING_EXPECT_OK(doc_store->Delete(test_document1_.namespace_(),
+ test_document1_.uri(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+ ICING_EXPECT_OK(doc_store->Put(test_document1_));
+}
+
+TEST_P(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
std::string schema_store_dir = schema_store_dir_ + "_custom";
filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir));
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
- ICING_ASSERT_OK(schema_store->SetSchema(schema));
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
DocumentId email_document_id;
DocumentId message_document_id;
@@ -757,12 +989,14 @@ TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) {
.SetSchema("message")
.SetCreationTimestampMs(1)
.Build();
- int64_t ground_truth_size_before;
+ int64_t document_log_size_before;
{
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
ICING_ASSERT_OK_AND_ASSIGN(email_document_id,
document_store->Put(email_document));
@@ -770,45 +1004,46 @@ TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) {
document_store->Put(message_document));
// Delete "email". "message" documents should still be retrievable.
- ICING_EXPECT_OK(document_store->DeleteBySchemaType("email"));
+ DocumentStore::DeleteByGroupResult group_result =
+ document_store->DeleteBySchemaType("email");
+ EXPECT_THAT(group_result.status, IsOk());
+ EXPECT_THAT(group_result.num_docs_deleted, Eq(1));
EXPECT_THAT(document_store->Get(email_document_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(document_store->Get(message_document_id),
IsOkAndHolds(EqualsProto(message_document)));
- ground_truth_size_before = filesystem_.GetFileSize(
- absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
+ document_log_size_before = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
} // Destructors should update checksum and persist all data to file.
- // Change the DocumentStore's header combined checksum so that it won't match
- // the recalculated checksum on initialization. This will force a regeneration
- // of derived files from ground truth.
- const std::string header_file =
- absl_ports::StrCat(document_store_dir_, "/document_store_header");
- DocumentStore::Header header;
- header.magic = DocumentStore::Header::kMagic;
- header.checksum = 10; // Arbitrary garbage checksum
- filesystem_.DeleteFile(header_file.c_str());
- filesystem_.Write(header_file.c_str(), &header, sizeof(header));
-
- SchemaProto new_schema;
- type_config = new_schema.add_types();
- type_config->set_schema_type("message");
+ CorruptDocStoreHeaderChecksumFile();
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
ICING_EXPECT_OK(schema_store->SetSchema(
- new_schema, /*ignore_errors_and_delete_documents=*/true));
+ new_schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
// Successfully recover from a corrupt derived file issue.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
// Make sure we didn't add anything to the ground truth after we recovered.
- int64_t ground_truth_size_after = filesystem_.GetFileSize(
- absl_ports::StrCat(document_store_dir_, "/document_log").c_str());
- EXPECT_EQ(ground_truth_size_before, ground_truth_size_after);
+ int64_t document_log_size_after = filesystem_.GetFileSize(
+ absl_ports::StrCat(document_store_dir_, "/",
+ DocumentLogCreator::GetDocumentLogFilename())
+ .c_str());
+ EXPECT_EQ(document_log_size_before, document_log_size_after);
EXPECT_THAT(document_store->Get(email_document_id),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
@@ -816,11 +1051,13 @@ TEST_F(DocumentStoreTest, DeletedSchemaTypeFromSchemaStoreRecoversOk) {
IsOkAndHolds(EqualsProto(message_document)));
}
-TEST_F(DocumentStoreTest, OptimizeInto) {
+TEST_P(DocumentStoreTest, OptimizeIntoSingleNamespace) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
DocumentProto document1 = DocumentBuilder()
.SetKey("namespace", "uri1")
@@ -850,33 +1087,50 @@ TEST_F(DocumentStoreTest, OptimizeInto) {
ICING_ASSERT_OK(doc_store->Put(document2));
ICING_ASSERT_OK(doc_store->Put(document3));
- std::string original_document_log = document_store_dir_ + "/document_log";
+ std::string original_document_log = absl_ports::StrCat(
+ document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename());
+
int64_t original_size =
filesystem_.GetFileSize(original_document_log.c_str());
// Optimizing into the same directory is not allowed
- EXPECT_THAT(doc_store->OptimizeInto(document_store_dir_),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
- HasSubstr("directory is the same")));
+ EXPECT_THAT(
+ doc_store->OptimizeInto(document_store_dir_, lang_segmenter_.get()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("directory is the same")));
std::string optimized_dir = document_store_dir_ + "_optimize";
- std::string optimized_document_log = optimized_dir + "/document_log";
+ std::string optimized_document_log =
+ optimized_dir + "/" + DocumentLogCreator::GetDocumentLogFilename();
// Validates that the optimized document log has the same size if nothing is
- // deleted
+ // deleted. Also namespace ids remain the same.
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
- ICING_ASSERT_OK(doc_store->OptimizeInto(optimized_dir));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::OptimizeResult optimize_result1,
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+ EXPECT_THAT(optimize_result1.document_id_old_to_new, ElementsAre(0, 1, 2));
+ EXPECT_THAT(optimize_result1.namespace_id_old_to_new, ElementsAre(0));
+ EXPECT_THAT(optimize_result1.should_rebuild_index, IsFalse());
int64_t optimized_size1 =
filesystem_.GetFileSize(optimized_document_log.c_str());
EXPECT_EQ(original_size, optimized_size1);
// Validates that the optimized document log has a smaller size if something
- // is deleted
+ // is deleted. Namespace ids remain the same.
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
- ICING_ASSERT_OK(doc_store->Delete("namespace", "uri1"));
- ICING_ASSERT_OK(doc_store->OptimizeInto(optimized_dir));
+ ICING_ASSERT_OK(doc_store->Delete("namespace", "uri1",
+ fake_clock_.GetSystemTimeMilliseconds()));
+ // DocumentId 0 is removed.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::OptimizeResult optimize_result2,
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+ EXPECT_THAT(optimize_result2.document_id_old_to_new,
+ ElementsAre(kInvalidDocumentId, 0, 1));
+ EXPECT_THAT(optimize_result2.namespace_id_old_to_new, ElementsAre(0));
+ EXPECT_THAT(optimize_result2.should_rebuild_index, IsFalse());
int64_t optimized_size2 =
filesystem_.GetFileSize(optimized_document_log.c_str());
EXPECT_THAT(original_size, Gt(optimized_size2));
@@ -886,32 +1140,307 @@ TEST_F(DocumentStoreTest, OptimizeInto) {
fake_clock_.SetSystemTimeMilliseconds(300);
// Validates that the optimized document log has a smaller size if something
- // expired
+ // expired. Namespace ids remain the same.
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ // DocumentId 0 is removed, and DocumentId 2 is expired.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::OptimizeResult optimize_result3,
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+ EXPECT_THAT(optimize_result3.document_id_old_to_new,
+ ElementsAre(kInvalidDocumentId, 0, kInvalidDocumentId));
+ EXPECT_THAT(optimize_result3.namespace_id_old_to_new, ElementsAre(0));
+ EXPECT_THAT(optimize_result3.should_rebuild_index, IsFalse());
+ int64_t optimized_size3 =
+ filesystem_.GetFileSize(optimized_document_log.c_str());
+ EXPECT_THAT(optimized_size2, Gt(optimized_size3));
+
+ // Delete the last document
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ ICING_ASSERT_OK(doc_store->Delete("namespace", "uri2",
+ fake_clock_.GetSystemTimeMilliseconds()));
+ // DocumentIds 0 and 1 are removed, and DocumentId 2 is expired. Since no
+ // document with the namespace is added into the new document store, the
+ // namespace id will be invalid.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::OptimizeResult optimize_result4,
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+ EXPECT_THAT(
+ optimize_result4.document_id_old_to_new,
+ ElementsAre(kInvalidDocumentId, kInvalidDocumentId, kInvalidDocumentId));
+ EXPECT_THAT(optimize_result4.namespace_id_old_to_new,
+ ElementsAre(kInvalidNamespaceId));
+ EXPECT_THAT(optimize_result4.should_rebuild_index, IsFalse());
+ int64_t optimized_size4 =
+ filesystem_.GetFileSize(optimized_document_log.c_str());
+ EXPECT_THAT(optimized_size3, Gt(optimized_size4));
+}
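
// OptimizeInto() now returns an OptimizeResult whose document_id_old_to_new
// vector maps each pre-optimize DocumentId to its compacted position, with
// kInvalidDocumentId marking deleted or expired slots. A sketch of how a
// caller might translate ids through the result, assuming
// document_id_old_to_new is a std::vector<DocumentId> (TranslateId is a
// hypothetical helper, not part of this change):
DocumentId TranslateId(const std::vector<DocumentId>& old_to_new,
                       DocumentId old_id) {
  // Ids past the end of the mapping were never assigned; treat them as
  // invalid, just like dropped documents.
  if (old_id < 0 || old_id >= static_cast<DocumentId>(old_to_new.size())) {
    return kInvalidDocumentId;
  }
  return old_to_new[old_id];
}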
+
+TEST_P(DocumentStoreTest, OptimizeIntoMultipleNamespaces) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document0 = DocumentBuilder()
+ .SetKey("namespace1", "uri0")
+ .SetSchema("email")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(1000)
+ .Build();
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(1000)
+ .Build();
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "uri2")
+ .SetSchema("email")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(1000)
+ .Build();
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace1", "uri3")
+ .SetSchema("email")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(1000)
+ .Build();
+
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace3", "uri4")
+ .SetSchema("email")
+ .SetCreationTimestampMs(100)
+ .SetTtlMs(1000)
+ .Build();
+
+ // Nothing should have expired yet.
+ fake_clock_.SetSystemTimeMilliseconds(100);
+
+ ICING_ASSERT_OK(doc_store->Put(document0));
+ ICING_ASSERT_OK(doc_store->Put(document1));
+ ICING_ASSERT_OK(doc_store->Put(document2));
+ ICING_ASSERT_OK(doc_store->Put(document3));
+ ICING_ASSERT_OK(doc_store->Put(document4));
+
+ std::string original_document_log = absl_ports::StrCat(
+ document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename());
+
+ int64_t original_size =
+ filesystem_.GetFileSize(original_document_log.c_str());
+
+ std::string optimized_dir = document_store_dir_ + "_optimize";
+ std::string optimized_document_log =
+ optimized_dir + "/" + DocumentLogCreator::GetDocumentLogFilename();
+
+ // Validates that the optimized document log has the same size if nothing is
+ // deleted. Also namespace ids remain the same.
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::OptimizeResult optimize_result1,
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+ EXPECT_THAT(optimize_result1.document_id_old_to_new,
+ ElementsAre(0, 1, 2, 3, 4));
+ EXPECT_THAT(optimize_result1.namespace_id_old_to_new, ElementsAre(0, 1, 2));
+ EXPECT_THAT(optimize_result1.should_rebuild_index, IsFalse());
+ int64_t optimized_size1 =
+ filesystem_.GetFileSize(optimized_document_log.c_str());
+ EXPECT_EQ(original_size, optimized_size1);
+
+ // Validates that the optimized document log has a smaller size if something
+ // is deleted.
ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
- ICING_ASSERT_OK(doc_store->OptimizeInto(optimized_dir));
+ // Delete DocumentId 0 with namespace1.
+ // - Before: ["namespace1#uri0", "namespace1#uri1", "namespace2#uri2",
+ // "namespace1#uri3", "namespace3#uri4"]
+ // - After: [nil, "namespace1#uri1", "namespace2#uri2", "namespace1#uri3",
+ // "namespace3#uri4"]
+ // In this case, new_doc_store will assign namespace ids in ["namespace1",
+ // "namespace2", "namespace3"] order. Since new_doc_store has the same order
+ // of namespace id assignment, namespace ids remain the same.
+ ICING_ASSERT_OK(doc_store->Delete("namespace1", "uri0",
+ fake_clock_.GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::OptimizeResult optimize_result2,
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+ EXPECT_THAT(optimize_result2.document_id_old_to_new,
+ ElementsAre(kInvalidDocumentId, 0, 1, 2, 3));
+ EXPECT_THAT(optimize_result2.namespace_id_old_to_new, ElementsAre(0, 1, 2));
+ EXPECT_THAT(optimize_result2.should_rebuild_index, IsFalse());
+ int64_t optimized_size2 =
+ filesystem_.GetFileSize(optimized_document_log.c_str());
+ EXPECT_THAT(original_size, Gt(optimized_size2));
+
+ // Validates that the optimized document log has a smaller size if something
+ // is deleted.
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ // Delete DocumentId 1 with namespace1.
+ // - Before: [nil, "namespace1#uri1", "namespace2#uri2", "namespace1#uri3",
+ // "namespace3#uri4"]
+ // - After: [nil, nil, "namespace2#uri2", "namespace1#uri3",
+ // "namespace3#uri4"]
+ // In this case, new_doc_store will assign namespace ids in ["namespace2",
+ // "namespace1", "namespace3"] order, so namespace_id_old_to_new should
+ // reflect the change.
+ ICING_ASSERT_OK(doc_store->Delete("namespace1", "uri1",
+ fake_clock_.GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::OptimizeResult optimize_result3,
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+ EXPECT_THAT(optimize_result3.document_id_old_to_new,
+ ElementsAre(kInvalidDocumentId, kInvalidDocumentId, 0, 1, 2));
+ EXPECT_THAT(optimize_result3.namespace_id_old_to_new, ElementsAre(1, 0, 2));
+ EXPECT_THAT(optimize_result3.should_rebuild_index, IsFalse());
int64_t optimized_size3 =
filesystem_.GetFileSize(optimized_document_log.c_str());
EXPECT_THAT(optimized_size2, Gt(optimized_size3));
+
+ // Validates that the optimized document log has a smaller size if something
+ // is deleted.
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ // Delete DocumentId 3 with namespace1.
+ // - Before: [nil, nil, "namespace2#uri2", "namespace1#uri3",
+ // "namespace3#uri4"]
+ // - After: [nil, nil, "namespace2#uri2", nil, "namespace3#uri4"]
+ // In this case, new_doc_store will assign namespace ids in ["namespace2",
+ // "namespace3"] order and "namespace1" will be never assigned, so
+ // namespace_id_old_to_new should reflect the change.
+ ICING_ASSERT_OK(doc_store->Delete("namespace1", "uri3",
+ fake_clock_.GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::OptimizeResult optimize_result4,
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+ EXPECT_THAT(optimize_result4.document_id_old_to_new,
+ ElementsAre(kInvalidDocumentId, kInvalidDocumentId, 0,
+ kInvalidDocumentId, 1));
+ EXPECT_THAT(optimize_result4.namespace_id_old_to_new,
+ ElementsAre(kInvalidNamespaceId, 0, 1));
+ EXPECT_THAT(optimize_result4.should_rebuild_index, IsFalse());
+ int64_t optimized_size4 =
+ filesystem_.GetFileSize(optimized_document_log.c_str());
+ EXPECT_THAT(optimized_size3, Gt(optimized_size4));
+
+ // Validates that the optimized document log has a smaller size if something
+ // is deleted.
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ // Delete DocumentId 4 with namespace3.
+ // - Before: [nil, nil, "namespace2#uri2", nil, "namespace3#uri4"]
+ // - After: [nil, nil, "namespace2#uri2", nil, nil]
+ // In this case, new_doc_store will assign namespace ids in ["namespace2"]
+ // order and "namespace1" and "namespace3" will never be assigned, so
+ // namespace_id_old_to_new should reflect the change.
+ ICING_ASSERT_OK(doc_store->Delete("namespace3", "uri4",
+ fake_clock_.GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::OptimizeResult optimize_result5,
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+ EXPECT_THAT(optimize_result5.document_id_old_to_new,
+ ElementsAre(kInvalidDocumentId, kInvalidDocumentId, 0,
+ kInvalidDocumentId, kInvalidDocumentId));
+ EXPECT_THAT(optimize_result5.namespace_id_old_to_new,
+ ElementsAre(kInvalidNamespaceId, 0, kInvalidNamespaceId));
+ EXPECT_THAT(optimize_result5.should_rebuild_index, IsFalse());
+ int64_t optimized_size5 =
+ filesystem_.GetFileSize(optimized_document_log.c_str());
+ EXPECT_THAT(optimized_size4, Gt(optimized_size5));
+
+ // Validates that the optimized document log has a smaller size if something
+ // is deleted.
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+ // Delete DocumentId 2 with namespace2.
+ // - Before: [nil, nil, "namespace2#uri2", nil, nil]
+ // - After: [nil, nil, nil, nil, nil]
+ // In this case, all documents were deleted, so there will be no namespace ids
+ // either. namespace_id_old_to_new should reflect the change.
+ ICING_ASSERT_OK(doc_store->Delete("namespace2", "uri2",
+ fake_clock_.GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::OptimizeResult optimize_result6,
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+ EXPECT_THAT(
+ optimize_result6.document_id_old_to_new,
+ ElementsAre(kInvalidDocumentId, kInvalidDocumentId, kInvalidDocumentId,
+ kInvalidDocumentId, kInvalidDocumentId));
+ EXPECT_THAT(optimize_result6.namespace_id_old_to_new,
+ ElementsAre(kInvalidNamespaceId, kInvalidNamespaceId,
+ kInvalidNamespaceId));
+ EXPECT_THAT(optimize_result6.should_rebuild_index, IsFalse());
+ int64_t optimized_size6 =
+ filesystem_.GetFileSize(optimized_document_log.c_str());
+ EXPECT_THAT(optimized_size5, Gt(optimized_size6));
}
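
// The comments in the test above describe the reassignment rule: the
// optimized store hands out namespace ids 0, 1, 2, ... in the order in which
// namespaces are first encountered among surviving documents, and namespaces
// with no surviving documents map to kInvalidNamespaceId. A sketch of
// computing the expected mapping under that rule (hypothetical helper, not
// part of this change):
std::vector<NamespaceId> ExpectedNamespaceMapping(
    const std::vector<NamespaceId>& surviving_docs_old_namespace_ids,
    int num_old_namespaces) {
  std::vector<NamespaceId> old_to_new(num_old_namespaces,
                                      kInvalidNamespaceId);
  NamespaceId next_new_id = 0;
  // Walk surviving documents in document-id order; each namespace gets its
  // new id the first time it is seen.
  for (NamespaceId old_id : surviving_docs_old_namespace_ids) {
    if (old_to_new[old_id] == kInvalidNamespaceId) {
      old_to_new[old_id] = next_new_id++;
    }
  }
  return old_to_new;
}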
-TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) {
+TEST_P(DocumentStoreTest, OptimizeIntoForEmptyDocumentStore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+ std::string optimized_dir = document_store_dir_ + "_optimize";
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
+ ASSERT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::OptimizeResult optimize_result,
+ doc_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+ EXPECT_THAT(optimize_result.document_id_old_to_new, IsEmpty());
+ EXPECT_THAT(optimize_result.namespace_id_old_to_new, IsEmpty());
+ EXPECT_THAT(optimize_result.should_rebuild_index, IsFalse());
+}
+
+TEST_P(DocumentStoreTest, ShouldRecoverFromDataLoss) {
DocumentId document_id1, document_id2;
{
// Can put and delete fine.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(document_id1,
- doc_store->Put(DocumentProto(test_document1_)));
- ICING_ASSERT_OK_AND_ASSIGN(document_id2,
- doc_store->Put(DocumentProto(test_document2_)));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id1,
+ doc_store->Put(DocumentProto(test_document1_), /*num_tokens=*/4));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id2,
+ doc_store->Put(DocumentProto(test_document2_), /*num_tokens=*/4));
EXPECT_THAT(doc_store->Get(document_id1),
IsOkAndHolds(EqualsProto(test_document1_)));
EXPECT_THAT(doc_store->Get(document_id2),
IsOkAndHolds(EqualsProto(test_document2_)));
- EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
+ // Checks derived score cache
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id1),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document1_score_, document1_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/2, /*sum_length_in_tokens=*/8)));
+
+ // Delete document 1
+ EXPECT_THAT(doc_store->Delete("icing", "email/1",
+ fake_clock_.GetSystemTimeMilliseconds()),
+ IsOk());
EXPECT_THAT(doc_store->Get(document_id1),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(doc_store->Get(document_id2),
@@ -924,59 +1453,98 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromDataLoss) {
DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
const std::string serialized_document = document.SerializeAsString();
- const std::string document_log_file =
- absl_ports::StrCat(document_store_dir_, "/document_log");
+ const std::string document_log_file = absl_ports::StrCat(
+ document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename());
int64_t file_size = filesystem_.GetFileSize(document_log_file.c_str());
filesystem_.PWrite(document_log_file.c_str(), file_size,
serialized_document.data(), serialized_document.size());
// Successfully recover from a data loss issue.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
EXPECT_THAT(doc_store->Get(document_id1),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(doc_store->Get(document_id2),
IsOkAndHolds(EqualsProto(test_document2_)));
-
// Checks derived filter cache
- EXPECT_THAT(doc_store->GetDocumentFilterData(document_id2),
- IsOkAndHolds(DocumentFilterData(
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ DocumentFilterData doc_filter_data,
+ doc_store->GetAliveDocumentFilterData(
+ document_id2, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(doc_filter_data,
+ Eq(DocumentFilterData(
/*namespace_id=*/0,
/*schema_type_id=*/0, document2_expiration_timestamp_)));
+
// Checks derived score cache
- EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2),
- IsOkAndHolds(DocumentAssociatedScoreData(
- document2_score_, document2_creation_timestamp_)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/1, /*sum_length_in_tokens=*/4)));
}
-TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) {
+TEST_P(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) {
DocumentId document_id1, document_id2;
{
// Can put and delete fine.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(document_id1,
- doc_store->Put(DocumentProto(test_document1_)));
- ICING_ASSERT_OK_AND_ASSIGN(document_id2,
- doc_store->Put(DocumentProto(test_document2_)));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id1,
+ doc_store->Put(DocumentProto(test_document1_), /*num_tokens=*/4));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id2,
+ doc_store->Put(DocumentProto(test_document2_), /*num_tokens=*/4));
EXPECT_THAT(doc_store->Get(document_id1),
IsOkAndHolds(EqualsProto(test_document1_)));
EXPECT_THAT(doc_store->Get(document_id2),
IsOkAndHolds(EqualsProto(test_document2_)));
- EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
+ // Checks derived score cache
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id1),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document1_score_, document1_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/2, /*sum_length_in_tokens=*/8)));
+ // Delete document 1
+ EXPECT_THAT(doc_store->Delete("icing", "email/1",
+ fake_clock_.GetSystemTimeMilliseconds()),
+ IsOk());
EXPECT_THAT(doc_store->Get(document_id1),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(doc_store->Get(document_id2),
IsOkAndHolds(EqualsProto(test_document2_)));
+
+ EXPECT_THAT(doc_store->ReportUsage(CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/2",
+ /*timestamp_ms=*/0, UsageReport::USAGE_TYPE1)),
+ IsOk());
}
- // "Corrupt" one of the derived files by adding non-checksummed data to
- // it. This will mess up the checksum and throw an error on the derived file's
- // initialization.
+ // "Corrupt" one of the derived files by modifying an existing data without
+ // calling PersistToDisk() or updating its checksum. This will mess up the
+ // checksum and throw an error on the derived file's initialization.
const std::string document_id_mapper_file =
absl_ports::StrCat(document_store_dir_, "/document_id_mapper");
ICING_ASSERT_OK_AND_ASSIGN(
@@ -984,94 +1552,243 @@ TEST_F(DocumentStoreTest, ShouldRecoverFromCorruptDerivedFile) {
FileBackedVector<int64_t>::Create(
filesystem_, document_id_mapper_file,
MemoryMappedFile::READ_WRITE_AUTO_SYNC));
- int64_t corrupt_document_id = 3;
- int64_t corrupt_offset = 3;
+ int64_t corrupt_document_id = 1;
+ int64_t corrupt_offset = 123456;
EXPECT_THAT(document_id_mapper->Set(corrupt_document_id, corrupt_offset),
IsOk());
+ // Initializing the document id mapper file will return an error, which
+ // triggers RegenerateDerivedFiles.
// Successfully recover from a corrupt derived file issue.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
EXPECT_THAT(doc_store->Get(document_id1),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(doc_store->Get(document_id2),
IsOkAndHolds(EqualsProto(test_document2_)));
// Checks derived filter cache
- EXPECT_THAT(doc_store->GetDocumentFilterData(document_id2),
- IsOkAndHolds(DocumentFilterData(
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ DocumentFilterData doc_filter_data,
+ doc_store->GetAliveDocumentFilterData(
+ document_id2, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(doc_filter_data,
+ Eq(DocumentFilterData(
/*namespace_id=*/0,
/*schema_type_id=*/0, document2_expiration_timestamp_)));
+
// Checks derived score cache
- EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2),
- IsOkAndHolds(DocumentAssociatedScoreData(
- document2_score_, document2_creation_timestamp_)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/1, /*sum_length_in_tokens=*/4)));
+
+ // Checks usage score data. Note that usage scores aren't regenerated from
+ // scratch.
+ UsageStore::UsageScores expected_scores;
+ expected_scores.usage_type1_count = 1;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ UsageStore::UsageScores actual_scores,
+ doc_store->GetUsageScores(document_id2,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
}
-TEST_F(DocumentStoreTest, ShouldRecoverFromBadChecksum) {
+TEST_P(DocumentStoreTest, ShouldRecoverFromDiscardDerivedFiles) {
DocumentId document_id1, document_id2;
{
// Can put and delete fine.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(document_id1,
- doc_store->Put(DocumentProto(test_document1_)));
- ICING_ASSERT_OK_AND_ASSIGN(document_id2,
- doc_store->Put(DocumentProto(test_document2_)));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id1,
+ doc_store->Put(DocumentProto(test_document1_), /*num_tokens=*/4));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id2,
+ doc_store->Put(DocumentProto(test_document2_), /*num_tokens=*/4));
EXPECT_THAT(doc_store->Get(document_id1),
IsOkAndHolds(EqualsProto(test_document1_)));
EXPECT_THAT(doc_store->Get(document_id2),
IsOkAndHolds(EqualsProto(test_document2_)));
- EXPECT_THAT(doc_store->Delete("icing", "email/1"), IsOk());
+ // Checks derived score cache
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id1),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document1_score_, document1_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/2, /*sum_length_in_tokens=*/8)));
+ // Delete document 1
+ EXPECT_THAT(doc_store->Delete("icing", "email/1",
+ fake_clock_.GetSystemTimeMilliseconds()),
+ IsOk());
EXPECT_THAT(doc_store->Get(document_id1),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(doc_store->Get(document_id2),
IsOkAndHolds(EqualsProto(test_document2_)));
+
+ EXPECT_THAT(doc_store->ReportUsage(CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/2",
+ /*timestamp_ms=*/0, UsageReport::USAGE_TYPE1)),
+ IsOk());
}
- // Change the DocStore's header combined checksum so that it won't match the
- // recalculated checksum on initialization. This will force a regeneration of
- // derived files from ground truth.
- const std::string header_file =
- absl_ports::StrCat(document_store_dir_, "/document_store_header");
- DocumentStore::Header header;
- header.magic = DocumentStore::Header::kMagic;
- header.checksum = 10; // Arbitrary garbage checksum
- filesystem_.DeleteFile(header_file.c_str());
- filesystem_.Write(header_file.c_str(), &header, sizeof(header));
+ // Discard all derived files.
+ ICING_ASSERT_OK(
+ DocumentStore::DiscardDerivedFiles(&filesystem_, document_store_dir_));
- // Successfully recover from a corrupt derived file issue.
+ // Successfully recover after discarding all derived files.
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ EXPECT_THAT(doc_store->Get(document_id1),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(doc_store->Get(document_id2),
+ IsOkAndHolds(EqualsProto(test_document2_)));
+
+ // Checks derived filter cache
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ DocumentFilterData doc_filter_data,
+ doc_store->GetAliveDocumentFilterData(
+ document_id2, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(doc_filter_data,
+ Eq(DocumentFilterData(
+ /*namespace_id=*/0,
+ /*schema_type_id=*/0, document2_expiration_timestamp_)));
+
+ // Checks derived score cache.
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/1, /*sum_length_in_tokens=*/4)));
+
+ // Checks usage score data - note that they aren't regenerated from
+ // scratch.
+ UsageStore::UsageScores expected_scores;
+ expected_scores.usage_type1_count = 1;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ UsageStore::UsageScores actual_scores,
+ doc_store->GetUsageScores(document_id2,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+}
+
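// A hedged summary, inferred from the test above rather than from the
// implementation: DiscardDerivedFiles removes everything that can be
// rebuilt from the document log (filter cache, document and corpus score
// caches, namespace and corpus mappers), and the next Create() regenerates
// them from ground truth. Usage scores have no ground truth to replay, so
// they persist in their own file; that is why document_id2's
// usage_type1_count of 1 survives the round trip.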
+TEST_P(DocumentStoreTest, ShouldRecoverFromBadChecksum) {
+ DocumentId document_id1, document_id2;
+ {
+ // Can put and delete fine.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id1,
+ doc_store->Put(DocumentProto(test_document1_), /*num_tokens=*/4));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id2,
+ doc_store->Put(DocumentProto(test_document2_), /*num_tokens=*/4));
+ EXPECT_THAT(doc_store->Get(document_id1),
+ IsOkAndHolds(EqualsProto(test_document1_)));
+ EXPECT_THAT(doc_store->Get(document_id2),
+ IsOkAndHolds(EqualsProto(test_document2_)));
+ // Checks derived score cache
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id1),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document1_score_, document1_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/2, /*sum_length_in_tokens=*/8)));
+ EXPECT_THAT(doc_store->Delete("icing", "email/1",
+ fake_clock_.GetSystemTimeMilliseconds()),
+ IsOk());
+ EXPECT_THAT(doc_store->Get(document_id1),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(doc_store->Get(document_id2),
+ IsOkAndHolds(EqualsProto(test_document2_)));
+ }
+
+ CorruptDocStoreHeaderChecksumFile();
+ // Successfully recover from a corrupt derived file issue.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
EXPECT_THAT(doc_store->Get(document_id1),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(doc_store->Get(document_id2),
IsOkAndHolds(EqualsProto(test_document2_)));
// Checks derived filter cache
- EXPECT_THAT(doc_store->GetDocumentFilterData(document_id2),
- IsOkAndHolds(DocumentFilterData(
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ DocumentFilterData doc_filter_data,
+ doc_store->GetAliveDocumentFilterData(
+ document_id2, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(doc_filter_data,
+ Eq(DocumentFilterData(
/*namespace_id=*/0,
/*schema_type_id=*/0, document2_expiration_timestamp_)));
// Checks derived score cache
- EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2),
- IsOkAndHolds(DocumentAssociatedScoreData(
- document2_score_, document2_creation_timestamp_)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/1, /*sum_length_in_tokens=*/4)));
}
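// For reference, a sketch of what CorruptDocStoreHeaderChecksumFile() is
// presumed to do, mirroring the inline block this change factors out of
// the tests: rewrite the header with the correct magic but a garbage
// checksum, so the next Create() detects the mismatch and regenerates
// derived files from ground truth.
void CorruptDocStoreHeaderChecksumFileSketch(const Filesystem& filesystem,
                                             const std::string& dir) {
  const std::string header_file =
      absl_ports::StrCat(dir, "/document_store_header");
  DocumentStore::Header header;
  header.magic = DocumentStore::Header::kMagic;
  header.checksum = 10;  // Arbitrary garbage checksum.
  filesystem.DeleteFile(header_file.c_str());
  filesystem.Write(header_file.c_str(), &header, sizeof(header));
}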
-TEST_F(DocumentStoreTest, GetDiskUsage) {
+TEST_P(DocumentStoreTest, GetStorageInfo) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
- ICING_ASSERT_OK_AND_ASSIGN(int64_t empty_doc_store_size,
- doc_store->GetDiskUsage());
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentStorageInfoProto doc_store_storage_info = doc_store->GetStorageInfo();
+ int64_t empty_doc_store_size = doc_store_storage_info.document_store_size();
EXPECT_THAT(empty_doc_store_size, Gt(0));
DocumentProto document = DocumentBuilder()
@@ -1080,33 +1797,38 @@ TEST_F(DocumentStoreTest, GetDiskUsage) {
.AddStringProperty("subject", "foo")
.Build();
- // Since our GetDiskUsage can only get sizes in increments of block_size, we
+ // Since GetStorageInfo can only get sizes in increments of block_size, we
// need to insert enough documents so the disk usage will increase by at least
  // 1 block size. The number 100 is somewhat arbitrary, chosen based on
  // manual testing.
for (int i = 0; i < 100; ++i) {
ICING_ASSERT_OK(doc_store->Put(document));
}
- EXPECT_THAT(doc_store->GetDiskUsage(),
- IsOkAndHolds(Gt(empty_doc_store_size)));
+ doc_store_storage_info = doc_store->GetStorageInfo();
+ EXPECT_THAT(doc_store_storage_info.document_store_size(),
+ Gt(empty_doc_store_size));
// Bad file system
MockFilesystem mock_filesystem;
- ON_CALL(mock_filesystem, GetDiskUsage(A<const char *>()))
+ ON_CALL(mock_filesystem, GetDiskUsage(A<const char*>()))
.WillByDefault(Return(Filesystem::kBadFileSize));
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store_with_mock_filesystem,
- DocumentStore::Create(&mock_filesystem, document_store_dir_, &fake_clock_,
- schema_store_.get()));
- EXPECT_THAT(doc_store_with_mock_filesystem->GetDiskUsage(),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ create_result, CreateDocumentStore(&mock_filesystem, document_store_dir_,
+ &fake_clock_, schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store_with_mock_filesystem =
+ std::move(create_result.document_store);
+
+ doc_store_storage_info = doc_store_with_mock_filesystem->GetStorageInfo();
+ EXPECT_THAT(doc_store_storage_info.document_store_size(), Eq(-1));
}
-TEST_F(DocumentStoreTest, MaxDocumentId) {
+TEST_P(DocumentStoreTest, MaxDocumentId) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
// Since the DocumentStore is empty, we get an invalid DocumentId
EXPECT_THAT(doc_store->last_added_document_id(), Eq(kInvalidDocumentId));
@@ -1116,7 +1838,8 @@ TEST_F(DocumentStoreTest, MaxDocumentId) {
EXPECT_THAT(doc_store->last_added_document_id(), Eq(document_id1));
// Still returns the last DocumentId even if it was deleted
- ICING_ASSERT_OK(doc_store->Delete("icing", "email/1"));
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1",
+ fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(doc_store->last_added_document_id(), Eq(document_id1));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id2,
@@ -1124,11 +1847,13 @@ TEST_F(DocumentStoreTest, MaxDocumentId) {
EXPECT_THAT(doc_store->last_added_document_id(), Eq(document_id2));
}
-TEST_F(DocumentStoreTest, GetNamespaceId) {
+TEST_P(DocumentStoreTest, GetNamespaceId) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
DocumentProto document_namespace1 =
DocumentBuilder().SetKey("namespace1", "1").SetSchema("email").Build();
@@ -1145,15 +1870,23 @@ TEST_F(DocumentStoreTest, GetNamespaceId) {
// DocumentStore
EXPECT_THAT(doc_store->GetNamespaceId("namespace2"), IsOkAndHolds(Eq(1)));
+ // DELETE namespace1 - document_namespace1 is deleted.
+ DocumentStore::DeleteByGroupResult group_result =
+ doc_store->DeleteByNamespace("namespace1");
+ EXPECT_THAT(group_result.status, IsOk());
+ EXPECT_THAT(group_result.num_docs_deleted, Eq(1));
+
// NamespaceMapper doesn't care if the document has been deleted
EXPECT_THAT(doc_store->GetNamespaceId("namespace1"), IsOkAndHolds(Eq(0)));
}
-TEST_F(DocumentStoreTest, GetDuplicateNamespaceId) {
+TEST_P(DocumentStoreTest, GetDuplicateNamespaceId) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
DocumentProto document1 =
DocumentBuilder().SetKey("namespace", "1").SetSchema("email").Build();
@@ -1167,43 +1900,431 @@ TEST_F(DocumentStoreTest, GetDuplicateNamespaceId) {
EXPECT_THAT(doc_store->GetNamespaceId("namespace"), IsOkAndHolds(Eq(0)));
}
-TEST_F(DocumentStoreTest, NonexistentNamespaceNotFound) {
+TEST_P(DocumentStoreTest, NonexistentNamespaceNotFound) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
EXPECT_THAT(doc_store->GetNamespaceId("nonexistent_namespace"),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
-TEST_F(DocumentStoreTest, FilterCacheHoldsDeletedDocumentData) {
+TEST_P(DocumentStoreTest, GetCorpusDuplicateCorpusId) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "1").SetSchema("email").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "2").SetSchema("email").Build();
+
+ ICING_ASSERT_OK(doc_store->Put(document1));
+ ICING_ASSERT_OK(doc_store->Put(document2));
+
+  // CorpusId of 0 since it was the first corpus seen by the DocumentStore
+ EXPECT_THAT(doc_store->GetCorpusId("namespace", "email"),
+ IsOkAndHolds(Eq(0)));
+}
+
+TEST_P(DocumentStoreTest, GetCorpusId) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document_corpus1 =
+ DocumentBuilder().SetKey("namespace1", "1").SetSchema("email").Build();
+ DocumentProto document_corpus2 =
+ DocumentBuilder().SetKey("namespace2", "2").SetSchema("email").Build();
+
+ ICING_ASSERT_OK(doc_store->Put(DocumentProto(document_corpus1)));
+ ICING_ASSERT_OK(doc_store->Put(DocumentProto(document_corpus2)));
+
+ // CorpusId of 0 since it was the first corpus seen by the DocumentStore
+ EXPECT_THAT(doc_store->GetCorpusId("namespace1", "email"),
+ IsOkAndHolds(Eq(0)));
+
+ // CorpusId of 1 since it was the second corpus seen by the
+ // DocumentStore
+ EXPECT_THAT(doc_store->GetCorpusId("namespace2", "email"),
+ IsOkAndHolds(Eq(1)));
+
+ // DELETE namespace1 - document_corpus1 is deleted.
+ DocumentStore::DeleteByGroupResult group_result =
+ doc_store->DeleteByNamespace("namespace1");
+ EXPECT_THAT(group_result.status, IsOk());
+ EXPECT_THAT(group_result.num_docs_deleted, Eq(1));
+
+ // CorpusMapper doesn't care if the document has been deleted
+  EXPECT_THAT(doc_store->GetCorpusId("namespace1", "email"),
+              IsOkAndHolds(Eq(0)));
+}
+
+TEST_P(DocumentStoreTest, NonexistentCorpusNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ EXPECT_THAT(
+ doc_store->GetCorpusId("nonexistent_namespace", "nonexistent_schema"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+
+ DocumentProto document_corpus =
+ DocumentBuilder().SetKey("namespace1", "1").SetSchema("email").Build();
+ ICING_ASSERT_OK(doc_store->Put(DocumentProto(document_corpus)));
+
+ EXPECT_THAT(doc_store->GetCorpusId("nonexistent_namespace", "email"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(doc_store->GetCorpusId("namespace1", "nonexistent_schema"),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_P(DocumentStoreTest, GetCorpusAssociatedScoreDataSameCorpus) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document1 =
+ DocumentBuilder().SetKey("namespace", "1").SetSchema("email").Build();
+ DocumentProto document2 =
+ DocumentBuilder().SetKey("namespace", "2").SetSchema("email").Build();
+
+ ICING_ASSERT_OK(doc_store->Put(document1, /*num_tokens=*/5));
+ ICING_ASSERT_OK(doc_store->Put(document2, /*num_tokens=*/7));
+
+  // CorpusId of 0 since it was the first corpus seen by the DocumentStore
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/2, /*sum_length_in_tokens=*/12)));
+ // Only one corpus exists
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_P(DocumentStoreTest, GetCorpusAssociatedScoreData) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document_corpus1 =
+ DocumentBuilder().SetKey("namespace1", "1").SetSchema("email").Build();
+ DocumentProto document_corpus2 =
+ DocumentBuilder().SetKey("namespace2", "2").SetSchema("email").Build();
+
+ ICING_ASSERT_OK(
+ doc_store->Put(DocumentProto(document_corpus1), /*num_tokens=*/5));
+ ICING_ASSERT_OK(
+ doc_store->Put(DocumentProto(document_corpus2), /*num_tokens=*/7));
+
+ // CorpusId of 0 since it was the first corpus seen by the DocumentStore
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/1, /*sum_length_in_tokens=*/5)));
+
+ // CorpusId of 1 since it was the second corpus seen by the
+ // DocumentStore
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/1),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/1, /*sum_length_in_tokens=*/7)));
+
+ // DELETE namespace1 - document_corpus1 is deleted.
+ ICING_EXPECT_OK(doc_store->DeleteByNamespace("namespace1").status);
+
+ // Corpus score cache doesn't care if the document has been deleted
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ IsOkAndHolds(CorpusAssociatedScoreData(
+ /*num_docs=*/1, /*sum_length_in_tokens=*/5)));
+}
+
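// A minimal sketch of the per-corpus bookkeeping these corpus-score tests
// rely on; the struct and helper are illustrative, not the real
// CorpusAssociatedScoreData implementation. Each Put(document, num_tokens)
// bumps the counters of the document's corpus, and deletes intentionally
// leave the aggregates untouched.
struct CorpusAggregatesSketch {
  int num_docs = 0;
  int64_t sum_length_in_tokens = 0;
};

void OnPutSketch(CorpusAggregatesSketch& corpus, int64_t num_tokens) {
  ++corpus.num_docs;
  corpus.sum_length_in_tokens += num_tokens;
}
// For example, two Puts with num_tokens 5 and 7 into one corpus yield
// num_docs == 2 and sum_length_in_tokens == 12, matching
// GetCorpusAssociatedScoreDataSameCorpus above.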
+TEST_P(DocumentStoreTest, NonexistentCorpusAssociatedScoreDataOutOfRange) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ EXPECT_THAT(doc_store->GetCorpusAssociatedScoreData(/*corpus_id=*/0),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_P(DocumentStoreTest, GetDocumentAssociatedScoreDataSameCorpus) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace", "1")
+ .SetSchema("email")
+ .SetScore(document1_score_)
+ .SetCreationTimestampMs(
+ document1_creation_timestamp_) // A random timestamp
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace", "2")
+ .SetSchema("email")
+ .SetScore(document2_score_)
+ .SetCreationTimestampMs(
+ document2_creation_timestamp_) // A random timestamp
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ doc_store->Put(DocumentProto(document1), /*num_tokens=*/5));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ doc_store->Put(DocumentProto(document2), /*num_tokens=*/7));
+
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id1),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document1_score_, document1_creation_timestamp_,
+ /*length_in_tokens=*/5)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/7)));
+}
+
+TEST_P(DocumentStoreTest, GetDocumentAssociatedScoreDataDifferentCorpus) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document1 =
+ DocumentBuilder()
+ .SetKey("namespace1", "1")
+ .SetSchema("email")
+ .SetScore(document1_score_)
+ .SetCreationTimestampMs(
+ document1_creation_timestamp_) // A random timestamp
+ .Build();
+ DocumentProto document2 =
+ DocumentBuilder()
+ .SetKey("namespace2", "2")
+ .SetSchema("email")
+ .SetScore(document2_score_)
+ .SetCreationTimestampMs(
+ document2_creation_timestamp_) // A random timestamp
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ doc_store->Put(DocumentProto(document1), /*num_tokens=*/5));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ doc_store->Put(DocumentProto(document2), /*num_tokens=*/7));
+
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id1),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0, document1_score_, document1_creation_timestamp_,
+ /*length_in_tokens=*/5)));
+ EXPECT_THAT(
+ doc_store->GetDocumentAssociatedScoreData(document_id2),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/1, document2_score_, document2_creation_timestamp_,
+ /*length_in_tokens=*/7)));
+}
+
+TEST_P(DocumentStoreTest, NonexistentDocumentAssociatedScoreDataNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(/*document_id=*/0),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+}
+
+TEST_P(DocumentStoreTest, NonexistentDocumentFilterDataNotFound) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ EXPECT_FALSE(doc_store->GetAliveDocumentFilterData(
+ /*document_id=*/0, fake_clock_.GetSystemTimeMilliseconds()));
+}
+
+TEST_P(DocumentStoreTest, DeleteClearsFilterCache) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
doc_store->Put(test_document1_));
- EXPECT_THAT(
- doc_store->GetDocumentFilterData(document_id),
- IsOkAndHolds(DocumentFilterData(
- /*namespace_id=*/0,
- /*schema_type_id=*/0,
- /*expiration_timestamp_ms=*/document1_expiration_timestamp_)));
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ DocumentFilterData doc_filter_data,
+ doc_store->GetAliveDocumentFilterData(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(doc_filter_data,
+ Eq(DocumentFilterData(
+ /*namespace_id=*/0,
+ /*schema_type_id=*/0, document1_expiration_timestamp_)));
- // FilterCache doesn't care if the document has been deleted
- ICING_ASSERT_OK(doc_store->Delete("icing", "email/1"));
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1",
+ fake_clock_.GetSystemTimeMilliseconds()));
+ // Associated entry of the deleted document is removed.
+ EXPECT_FALSE(doc_store->GetAliveDocumentFilterData(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+}
+
+TEST_P(DocumentStoreTest, DeleteClearsScoreCache) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(test_document1_, /*num_tokens=*/4));
+
+ EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id),
+ IsOkAndHolds(DocumentAssociatedScoreData(
+ /*corpus_id=*/0,
+ /*document_score=*/document1_score_,
+ /*creation_timestamp_ms=*/document1_creation_timestamp_,
+ /*length_in_tokens=*/4)));
+
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1",
+ fake_clock_.GetSystemTimeMilliseconds()));
+ // Associated entry of the deleted document is removed.
EXPECT_THAT(
- doc_store->GetDocumentFilterData(document_id),
- IsOkAndHolds(DocumentFilterData(
- /*namespace_id=*/0,
- /*schema_type_id=*/0,
- /*expiration_timestamp_ms=*/document1_expiration_timestamp_)));
+ doc_store->GetDocumentAssociatedScoreData(document_id),
+ IsOkAndHolds(DocumentAssociatedScoreData(kInvalidCorpusId,
+ /*document_score=*/-1,
+ /*creation_timestamp_ms=*/-1,
+ /*length_in_tokens=*/0)));
+}
+
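// Illustrative helper, an assumption drawn from the expectations above
// rather than from DocumentStore internals: deleting a document resets its
// score-cache slot to sentinel values instead of removing the entry, so a
// lookup by the old DocumentId still succeeds and returns this tombstone.
DocumentAssociatedScoreData DeletedScoreDataTombstoneSketch() {
  return DocumentAssociatedScoreData(kInvalidCorpusId,
                                     /*document_score=*/-1,
                                     /*creation_timestamp_ms=*/-1,
                                     /*length_in_tokens=*/0);
}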
+TEST_P(DocumentStoreTest, DeleteShouldPreventUsageScores) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(test_document1_));
+
+ // Report usage with type 1.
+ UsageReport usage_report_type1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(doc_store->ReportUsage(usage_report_type1));
+
+ UsageStore::UsageScores expected_scores;
+ expected_scores.usage_type1_count = 1;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ UsageStore::UsageScores actual_scores,
+ doc_store->GetUsageScores(document_id,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+
+ // Delete the document.
+ ICING_ASSERT_OK(doc_store->Delete("icing", "email/1",
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ // Can't report or get usage scores on the deleted document
+ ASSERT_THAT(
+ doc_store->ReportUsage(usage_report_type1),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+ HasSubstr("Couldn't report usage on a nonexistent document")));
+
+ EXPECT_FALSE(doc_store->GetUsageScores(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+}
+
+TEST_P(DocumentStoreTest, ExpirationShouldPreventUsageScores) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document = DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "body bar")
+ .SetScore(document1_score_)
+ .SetCreationTimestampMs(10)
+ .SetTtlMs(100)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, doc_store->Put(document));
+
+ // Some arbitrary time before the document's creation time (10) + ttl (100)
+ fake_clock_.SetSystemTimeMilliseconds(109);
+
+ // Report usage with type 1.
+ UsageReport usage_report_type1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(doc_store->ReportUsage(usage_report_type1));
+
+ UsageStore::UsageScores expected_scores;
+ expected_scores.usage_type1_count = 1;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ UsageStore::UsageScores actual_scores,
+ doc_store->GetUsageScores(document_id,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+
+ // Some arbitrary time past the document's creation time (10) + ttl (100)
+ fake_clock_.SetSystemTimeMilliseconds(200);
+
+ // Can't report or get usage scores on the expired document
+ ASSERT_THAT(
+ doc_store->ReportUsage(usage_report_type1),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND,
+ HasSubstr("Couldn't report usage on a nonexistent document")));
+
+ EXPECT_FALSE(doc_store->GetUsageScores(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
}
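// A hedged sketch of the "alive" gate both usage-score tests exercise; the
// helper name and the strictness of the comparison are assumptions, not
// the real GetAliveDocumentFilterData logic.
bool IsAliveSketch(bool deleted, int64_t expiration_timestamp_ms,
                   int64_t now_ms) {
  return !deleted && now_ms < expiration_timestamp_ms;
}
// In ExpirationShouldPreventUsageScores the document expires at creation
// (10) + ttl (100) = 110, so usage succeeds at now_ms == 109 and fails
// with NOT_FOUND at now_ms == 200.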
-TEST_F(DocumentStoreTest,
+TEST_P(DocumentStoreTest,
ExpirationTimestampIsSumOfNonZeroTtlAndCreationTimestamp) {
DocumentProto document = DocumentBuilder()
.SetKey("namespace1", "1")
@@ -1213,20 +2334,24 @@ TEST_F(DocumentStoreTest,
.Build();
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, doc_store->Put(document));
-
- EXPECT_THAT(
- doc_store->GetDocumentFilterData(document_id),
- IsOkAndHolds(DocumentFilterData(/*namespace_id=*/0,
- /*schema_type_id=*/0,
- /*expiration_timestamp_ms=*/1100)));
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ DocumentFilterData doc_filter_data,
+ doc_store->GetAliveDocumentFilterData(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(doc_filter_data, Eq(DocumentFilterData(
+ /*namespace_id=*/0,
+ /*schema_type_id=*/0,
+ /*expiration_timestamp_ms=*/1100)));
}
-TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxIfTtlIsZero) {
+TEST_P(DocumentStoreTest, ExpirationTimestampIsInt64MaxIfTtlIsZero) {
DocumentProto document = DocumentBuilder()
.SetKey("namespace1", "1")
.SetSchema("email")
@@ -1235,21 +2360,28 @@ TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxIfTtlIsZero) {
.Build();
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, doc_store->Put(document));
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ DocumentFilterData doc_filter_data,
+ doc_store->GetAliveDocumentFilterData(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+
EXPECT_THAT(
- doc_store->GetDocumentFilterData(document_id),
- IsOkAndHolds(DocumentFilterData(
+ doc_filter_data,
+ Eq(DocumentFilterData(
/*namespace_id=*/0,
/*schema_type_id=*/0,
/*expiration_timestamp_ms=*/std::numeric_limits<int64_t>::max())));
}
-TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxOnOverflow) {
+TEST_P(DocumentStoreTest, ExpirationTimestampIsInt64MaxOnOverflow) {
DocumentProto document =
DocumentBuilder()
.SetKey("namespace1", "1")
@@ -1259,21 +2391,28 @@ TEST_F(DocumentStoreTest, ExpirationTimestampIsInt64MaxOnOverflow) {
.Build();
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id, doc_store->Put(document));
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ DocumentFilterData doc_filter_data,
+ doc_store->GetAliveDocumentFilterData(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+
EXPECT_THAT(
- doc_store->GetDocumentFilterData(document_id),
- IsOkAndHolds(DocumentFilterData(
+ doc_filter_data,
+ Eq(DocumentFilterData(
/*namespace_id=*/0,
/*schema_type_id=*/0,
/*expiration_timestamp_ms=*/std::numeric_limits<int64_t>::max())));
}
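// A minimal sketch of the expiration rule the three tests above pin down.
// ComputeExpirationMsSketch is a hypothetical helper, not DocumentStore's
// real code; its behavior matches the tests' expectations: a ttl of 0
// means "never expires", and an overflowing sum saturates at int64 max.
int64_t ComputeExpirationMsSketch(int64_t creation_timestamp_ms,
                                  int64_t ttl_ms) {
  if (ttl_ms == 0) {
    return std::numeric_limits<int64_t>::max();  // No TTL: never expires.
  }
  if (creation_timestamp_ms >
      std::numeric_limits<int64_t>::max() - ttl_ms) {
    return std::numeric_limits<int64_t>::max();  // Saturate on overflow.
  }
  return creation_timestamp_ms + ttl_ms;  // e.g. 1000 + 100 == 1100.
}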
-TEST_F(DocumentStoreTest, CreationTimestampShouldBePopulated) {
+TEST_P(DocumentStoreTest, CreationTimestampShouldBePopulated) {
// Creates a document without a given creation timestamp
DocumentProto document_without_creation_timestamp =
DocumentBuilder()
@@ -1286,9 +2425,11 @@ TEST_F(DocumentStoreTest, CreationTimestampShouldBePopulated) {
int64_t fake_real_time = 100;
fake_clock_.SetSystemTimeMilliseconds(fake_real_time);
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
ICING_ASSERT_OK_AND_ASSIGN(
DocumentId document_id,
@@ -1302,7 +2443,7 @@ TEST_F(DocumentStoreTest, CreationTimestampShouldBePopulated) {
Eq(fake_real_time));
}
-TEST_F(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) {
+TEST_P(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) {
DocumentProto document1 = DocumentBuilder()
.SetKey("icing", "email/1")
.SetSchema("email")
@@ -1310,16 +2451,18 @@ TEST_F(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) {
// With default doc score 0
.Build();
DocumentProto document2 = DocumentBuilder()
- .SetKey("icing", "email/1")
+ .SetKey("icing", "email/2")
.SetSchema("email")
.AddStringProperty("subject", "subject foo")
.SetScore(5)
.Build();
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> doc_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id1,
doc_store->Put(document1));
@@ -1328,18 +2471,24 @@ TEST_F(DocumentStoreTest, ShouldWriteAndReadScoresCorrectly) {
EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id1),
IsOkAndHolds(DocumentAssociatedScoreData(
- /*document_score=*/0, /*creation_timestamp_ms=*/0)));
+ /*corpus_id=*/0,
+ /*document_score=*/0, /*creation_timestamp_ms=*/0,
+ /*length_in_tokens=*/0)));
EXPECT_THAT(doc_store->GetDocumentAssociatedScoreData(document_id2),
IsOkAndHolds(DocumentAssociatedScoreData(
- /*document_score=*/5, /*creation_timestamp_ms=*/0)));
+ /*corpus_id=*/0,
+ /*document_score=*/5, /*creation_timestamp_ms=*/0,
+ /*length_in_tokens=*/0)));
}
-TEST_F(DocumentStoreTest, ComputeChecksumSameBetweenCalls) {
+TEST_P(DocumentStoreTest, ComputeChecksumSameBetweenCalls) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
ICING_EXPECT_OK(document_store->Put(test_document1_));
ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, document_store->ComputeChecksum());
@@ -1348,11 +2497,13 @@ TEST_F(DocumentStoreTest, ComputeChecksumSameBetweenCalls) {
EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(checksum));
}
-TEST_F(DocumentStoreTest, ComputeChecksumSameAcrossInstances) {
+TEST_P(DocumentStoreTest, ComputeChecksumSameAcrossInstances) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
ICING_EXPECT_OK(document_store->Put(test_document1_));
ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, document_store->ComputeChecksum());
@@ -1360,17 +2511,20 @@ TEST_F(DocumentStoreTest, ComputeChecksumSameAcrossInstances) {
// Destroy the previous instance and recreate DocumentStore
document_store.reset();
ICING_ASSERT_OK_AND_ASSIGN(
- document_store, DocumentStore::Create(&filesystem_, document_store_dir_,
- &fake_clock_, schema_store_.get()));
+ create_result, CreateDocumentStore(&filesystem_, document_store_dir_,
+ &fake_clock_, schema_store_.get()));
+ document_store = std::move(create_result.document_store);
EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(checksum));
}
-TEST_F(DocumentStoreTest, ComputeChecksumChangesOnModification) {
+TEST_P(DocumentStoreTest, ComputeChecksumChangesOnNewDocument) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
ICING_EXPECT_OK(document_store->Put(test_document1_));
ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, document_store->ComputeChecksum());
@@ -1380,7 +2534,25 @@ TEST_F(DocumentStoreTest, ComputeChecksumChangesOnModification) {
IsOkAndHolds(Not(Eq(checksum))));
}
-TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) {
+TEST_P(DocumentStoreTest, ComputeChecksumDoesntChangeOnNewUsage) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ ICING_EXPECT_OK(document_store->Put(test_document1_));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum, document_store->ComputeChecksum());
+
+ UsageReport usage_report =
+ CreateUsageReport(test_document1_.namespace_(), test_document1_.uri(),
+ /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1);
+ ICING_EXPECT_OK(document_store->ReportUsage(usage_report));
+ EXPECT_THAT(document_store->ComputeChecksum(), IsOkAndHolds(Eq(checksum)));
+}
+
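// A hedged summary, inferred from the three checksum tests rather than
// from the implementation: the combined checksum covers the document log
// and its derived caches but excludes the usage store, so Put() changes
// the checksum while ReportUsage() leaves it unchanged.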
+TEST_P(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) {
const std::string schema_store_dir = schema_store_dir_ + "_custom";
DocumentId email_document_id;
@@ -1407,13 +2579,15 @@ TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) {
filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir));
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
- type_config = schema.add_types();
- type_config->set_schema_type("message");
- ICING_EXPECT_OK(schema_store->SetSchema(schema));
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
schema_store->GetSchemaTypeId("email"));
@@ -1421,18 +2595,21 @@ TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) {
schema_store->GetSchemaTypeId("message"));
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
// Insert and verify a "email "document
ICING_ASSERT_OK_AND_ASSIGN(
email_document_id, document_store->Put(DocumentProto(email_document)));
EXPECT_THAT(document_store->Get(email_document_id),
IsOkAndHolds(EqualsProto(email_document)));
- ICING_ASSERT_OK_AND_ASSIGN(
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
DocumentFilterData email_data,
- document_store->GetDocumentFilterData(email_document_id));
+ document_store->GetAliveDocumentFilterData(
+ email_document_id, fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(email_data.schema_type_id(), Eq(email_schema_type_id));
email_namespace_id = email_data.namespace_id();
email_expiration_timestamp = email_data.expiration_timestamp_ms();
@@ -1443,24 +2620,16 @@ TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) {
document_store->Put(DocumentProto(message_document)));
EXPECT_THAT(document_store->Get(message_document_id),
IsOkAndHolds(EqualsProto(message_document)));
- ICING_ASSERT_OK_AND_ASSIGN(
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
DocumentFilterData message_data,
- document_store->GetDocumentFilterData(message_document_id));
+ document_store->GetAliveDocumentFilterData(
+ message_document_id, fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(message_data.schema_type_id(), Eq(message_schema_type_id));
message_namespace_id = message_data.namespace_id();
message_expiration_timestamp = message_data.expiration_timestamp_ms();
} // Everything destructs and commits changes to file
- // Change the DocumentStore's header combined checksum so that it won't match
- // the recalculated checksum on initialization. This will force a regeneration
- // of derived files from ground truth.
- const std::string header_file =
- absl_ports::StrCat(document_store_dir_, "/document_store_header");
- DocumentStore::Header header;
- header.magic = DocumentStore::Header::kMagic;
- header.checksum = 10; // Arbitrary garbage checksum
- filesystem_.DeleteFile(header_file.c_str());
- filesystem_.Write(header_file.c_str(), &header, sizeof(header));
+ CorruptDocStoreHeaderChecksumFile();
// Change the schema so that we don't know of the Document's type anymore.
  // Since we can't set backwards-incompatible changes, we do some file-level
@@ -1470,11 +2639,14 @@ TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) {
filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir));
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
- ICING_EXPECT_OK(schema_store->SetSchema(schema));
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+
+ SchemaProto schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId email_schema_type_id,
schema_store->GetSchemaTypeId("email"));
@@ -1482,16 +2654,19 @@ TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) {
// Successfully recover from a corrupt derived file issue. We don't fail just
// because the "message" schema type is missing
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
// "email" document is fine
EXPECT_THAT(document_store->Get(email_document_id),
IsOkAndHolds(EqualsProto(email_document)));
- ICING_ASSERT_OK_AND_ASSIGN(
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
DocumentFilterData email_data,
- document_store->GetDocumentFilterData(email_document_id));
+ document_store->GetAliveDocumentFilterData(
+ email_document_id, fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(email_data.schema_type_id(), Eq(email_schema_type_id));
  // Make sure that all the other fields are still valid/the same
EXPECT_THAT(email_data.namespace_id(), Eq(email_namespace_id));
@@ -1501,9 +2676,10 @@ TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) {
// "message" document has an invalid SchemaTypeId
EXPECT_THAT(document_store->Get(message_document_id),
IsOkAndHolds(EqualsProto(message_document)));
- ICING_ASSERT_OK_AND_ASSIGN(
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
DocumentFilterData message_data,
- document_store->GetDocumentFilterData(message_document_id));
+ document_store->GetAliveDocumentFilterData(
+ message_document_id, fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(message_data.schema_type_id(), Eq(-1));
  // Make sure that all the other fields are still valid/the same
EXPECT_THAT(message_data.namespace_id(), Eq(message_namespace_id));
@@ -1511,22 +2687,24 @@ TEST_F(DocumentStoreTest, RegenerateDerivedFilesSkipsUnknownSchemaTypeIds) {
Eq(message_expiration_timestamp));
}
-TEST_F(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) {
+TEST_P(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) {
const std::string schema_store_dir = test_dir_ + "_custom";
filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
// Set a schema
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
- type_config = schema.add_types();
- type_config->set_schema_type("message");
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir));
- ICING_EXPECT_OK(schema_store->SetSchema(schema));
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_email_schema_type_id,
schema_store->GetSchemaTypeId("email"));
@@ -1547,32 +2725,38 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) {
// Add the documents and check SchemaTypeIds match
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
document_store->Put(email_document));
- ICING_ASSERT_OK_AND_ASSIGN(
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
DocumentFilterData email_data,
- document_store->GetDocumentFilterData(email_document_id));
+ document_store->GetAliveDocumentFilterData(
+ email_document_id, fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(email_data.schema_type_id(), Eq(old_email_schema_type_id));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
document_store->Put(message_document));
- ICING_ASSERT_OK_AND_ASSIGN(
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
DocumentFilterData message_data,
- document_store->GetDocumentFilterData(message_document_id));
+ document_store->GetAliveDocumentFilterData(
+ message_document_id, fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(message_data.schema_type_id(), Eq(old_message_schema_type_id));
// Rearrange the schema types. Since SchemaTypeId is assigned based on order,
// this should change the SchemaTypeIds.
- schema.clear_types();
- type_config = schema.add_types();
- type_config->set_schema_type("message");
- type_config = schema.add_types();
- type_config->set_schema_type("email");
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
- ICING_EXPECT_OK(schema_store->SetSchema(schema));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId new_email_schema_type_id,
schema_store->GetSchemaTypeId("email"));
@@ -1586,38 +2770,40 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreUpdatesSchemaTypeIds) {
ICING_EXPECT_OK(document_store->UpdateSchemaStore(schema_store.get()));
// Check that the FilterCache holds the new SchemaTypeIds
- ICING_ASSERT_OK_AND_ASSIGN(
- email_data, document_store->GetDocumentFilterData(email_document_id));
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ email_data,
+ document_store->GetAliveDocumentFilterData(
+ email_document_id, fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(email_data.schema_type_id(), Eq(new_email_schema_type_id));
- ICING_ASSERT_OK_AND_ASSIGN(
- message_data, document_store->GetDocumentFilterData(message_document_id));
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ message_data,
+ document_store->GetAliveDocumentFilterData(
+ message_document_id, fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(message_data.schema_type_id(), Eq(new_message_schema_type_id));
}
-TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) {
+TEST_P(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) {
const std::string schema_store_dir = test_dir_ + "_custom";
filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
// Set a schema
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
-
- auto property_config = type_config->add_properties();
- property_config->set_property_name("subject");
- property_config->set_data_type(PropertyConfigProto::DataType::STRING);
- property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property_config->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- property_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir));
- ICING_EXPECT_OK(schema_store->SetSchema(schema));
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
// Add two documents, with and without a subject
DocumentProto email_without_subject = DocumentBuilder()
@@ -1637,9 +2823,12 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) {
// Insert documents and check they're ok
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_without_subject_document_id,
document_store->Put(email_without_subject));
EXPECT_THAT(document_store->Get(email_without_subject_document_id),
@@ -1656,7 +2845,8 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) {
PropertyConfigProto::Cardinality::REQUIRED);
ICING_EXPECT_OK(schema_store->SetSchema(
- schema, /*ignore_errors_and_delete_documents=*/true));
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
ICING_EXPECT_OK(document_store->UpdateSchemaStore(schema_store.get()));
@@ -1669,23 +2859,25 @@ TEST_F(DocumentStoreTest, UpdateSchemaStoreDeletesInvalidDocuments) {
IsOkAndHolds(EqualsProto(email_with_subject)));
}
-TEST_F(DocumentStoreTest,
+TEST_P(DocumentStoreTest,
UpdateSchemaStoreDeletesDocumentsByDeletedSchemaType) {
const std::string schema_store_dir = test_dir_ + "_custom";
filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
// Set a schema
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
- type_config = schema.add_types();
- type_config->set_schema_type("message");
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir));
- ICING_EXPECT_OK(schema_store->SetSchema(schema));
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
// Add a "email" and "message" document
DocumentProto email_document = DocumentBuilder()
@@ -1704,9 +2896,12 @@ TEST_F(DocumentStoreTest,
// Insert documents and check they're ok
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
document_store->Put(email_document));
EXPECT_THAT(document_store->Get(email_document_id),
@@ -1717,13 +2912,15 @@ TEST_F(DocumentStoreTest,
EXPECT_THAT(document_store->Get(message_document_id),
IsOkAndHolds(EqualsProto(message_document)));
- SchemaProto new_schema;
- type_config = new_schema.add_types();
- type_config->set_schema_type("message");
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
ICING_EXPECT_OK(
schema_store->SetSchema(new_schema,
- /*ignore_errors_and_delete_documents=*/true));
+ /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
ICING_EXPECT_OK(document_store->UpdateSchemaStore(schema_store.get()));
@@ -1736,22 +2933,24 @@ TEST_F(DocumentStoreTest,
IsOkAndHolds(EqualsProto(message_document)));
}
-TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) {
+TEST_P(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) {
const std::string schema_store_dir = test_dir_ + "_custom";
filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
// Set a schema
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
- type_config = schema.add_types();
- type_config->set_schema_type("message");
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir));
- ICING_EXPECT_OK(schema_store->SetSchema(schema));
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId old_email_schema_type_id,
schema_store->GetSchemaTypeId("email"));
@@ -1772,33 +2971,40 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) {
// Add the documents and check SchemaTypeIds match
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
document_store->Put(email_document));
- ICING_ASSERT_OK_AND_ASSIGN(
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
DocumentFilterData email_data,
- document_store->GetDocumentFilterData(email_document_id));
+ document_store->GetAliveDocumentFilterData(
+ email_document_id, fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(email_data.schema_type_id(), Eq(old_email_schema_type_id));
ICING_ASSERT_OK_AND_ASSIGN(DocumentId message_document_id,
document_store->Put(message_document));
- ICING_ASSERT_OK_AND_ASSIGN(
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
DocumentFilterData message_data,
- document_store->GetDocumentFilterData(message_document_id));
+ document_store->GetAliveDocumentFilterData(
+ message_document_id, fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(message_data.schema_type_id(), Eq(old_message_schema_type_id));
// Rearrange the schema types. Since SchemaTypeId is assigned based on order,
// this should change the SchemaTypeIds.
- schema.clear_types();
- type_config = schema.add_types();
- type_config->set_schema_type("message");
- type_config = schema.add_types();
- type_config->set_schema_type("email");
+ schema = SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .Build();
- ICING_ASSERT_OK_AND_ASSIGN(SchemaStore::SetSchemaResult set_schema_result,
- schema_store->SetSchema(schema));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ SchemaStore::SetSchemaResult set_schema_result,
+ schema_store->SetSchema(schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
ICING_ASSERT_OK_AND_ASSIGN(SchemaTypeId new_email_schema_type_id,
schema_store->GetSchemaTypeId("email"));
@@ -1813,38 +3019,40 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreUpdatesSchemaTypeIds) {
schema_store.get(), set_schema_result));
// Check that the FilterCache holds the new SchemaTypeIds
- ICING_ASSERT_OK_AND_ASSIGN(
- email_data, document_store->GetDocumentFilterData(email_document_id));
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ email_data,
+ document_store->GetAliveDocumentFilterData(
+ email_document_id, fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(email_data.schema_type_id(), Eq(new_email_schema_type_id));
- ICING_ASSERT_OK_AND_ASSIGN(
- message_data, document_store->GetDocumentFilterData(message_document_id));
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ message_data,
+ document_store->GetAliveDocumentFilterData(
+ message_document_id, fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(message_data.schema_type_id(), Eq(new_message_schema_type_id));
}
-TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) {
+TEST_P(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) {
const std::string schema_store_dir = test_dir_ + "_custom";
filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
// Set a schema
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
-
- auto property_config = type_config->add_properties();
- property_config->set_property_name("subject");
- property_config->set_data_type(PropertyConfigProto::DataType::STRING);
- property_config->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- property_config->mutable_indexing_config()->set_term_match_type(
- TermMatchType::EXACT_ONLY);
- property_config->mutable_indexing_config()->set_tokenizer_type(
- IndexingConfig::TokenizerType::PLAIN);
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir));
- ICING_EXPECT_OK(schema_store->SetSchema(schema));
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
// Add two documents, with and without a subject
DocumentProto email_without_subject = DocumentBuilder()
@@ -1864,9 +3072,12 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) {
// Insert documents and check they're ok
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_without_subject_document_id,
document_store->Put(email_without_subject));
EXPECT_THAT(document_store->Get(email_without_subject_document_id),
@@ -1885,7 +3096,8 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) {
ICING_ASSERT_OK_AND_ASSIGN(
SchemaStore::SetSchemaResult set_schema_result,
schema_store->SetSchema(schema,
- /*ignore_errors_and_delete_documents=*/true));
+ /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
ICING_EXPECT_OK(document_store->OptimizedUpdateSchemaStore(
schema_store.get(), set_schema_result));
@@ -1899,23 +3111,25 @@ TEST_F(DocumentStoreTest, OptimizedUpdateSchemaStoreDeletesInvalidDocuments) {
IsOkAndHolds(EqualsProto(email_with_subject)));
}
-TEST_F(DocumentStoreTest,
+TEST_P(DocumentStoreTest,
OptimizedUpdateSchemaStoreDeletesDocumentsByDeletedSchemaType) {
const std::string schema_store_dir = test_dir_ + "_custom";
filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
// Set a schema
- SchemaProto schema;
- auto type_config = schema.add_types();
- type_config->set_schema_type("email");
- type_config = schema.add_types();
- type_config->set_schema_type("message");
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("email"))
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, schema_store_dir));
- ICING_EXPECT_OK(schema_store->SetSchema(schema));
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+ ICING_EXPECT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
// Add a "email" and "message" document
DocumentProto email_document = DocumentBuilder()
@@ -1934,9 +3148,12 @@ TEST_F(DocumentStoreTest,
// Insert documents and check they're ok
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
ICING_ASSERT_OK_AND_ASSIGN(DocumentId email_document_id,
document_store->Put(email_document));
EXPECT_THAT(document_store->Get(email_document_id),
@@ -1947,14 +3164,16 @@ TEST_F(DocumentStoreTest,
EXPECT_THAT(document_store->Get(message_document_id),
IsOkAndHolds(EqualsProto(message_document)));
- SchemaProto new_schema;
- type_config = new_schema.add_types();
- type_config->set_schema_type("message");
+ SchemaProto new_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder().SetType("message"))
+ .Build();
ICING_ASSERT_OK_AND_ASSIGN(
SchemaStore::SetSchemaResult set_schema_result,
schema_store->SetSchema(new_schema,
- /*ignore_errors_and_delete_documents=*/true));
+ /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false));
ICING_EXPECT_OK(document_store->OptimizedUpdateSchemaStore(
schema_store.get(), set_schema_result));
@@ -1968,11 +3187,13 @@ TEST_F(DocumentStoreTest,
IsOkAndHolds(EqualsProto(message_document)));
}
-TEST_F(DocumentStoreTest, GetOptimizeInfo) {
+TEST_P(DocumentStoreTest, GetOptimizeInfo) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
// Nothing should be optimizable yet
ICING_ASSERT_OK_AND_ASSIGN(DocumentStore::OptimizeInfo optimize_info,
@@ -1990,8 +3211,9 @@ TEST_F(DocumentStoreTest, GetOptimizeInfo) {
EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0));
// Delete a document. Now something is optimizable
- ICING_EXPECT_OK(document_store->Delete(test_document1_.namespace_(),
- test_document1_.uri()));
+ ICING_EXPECT_OK(document_store->Delete(
+ test_document1_.namespace_(), test_document1_.uri(),
+ fake_clock_.GetSystemTimeMilliseconds()));
ICING_ASSERT_OK_AND_ASSIGN(optimize_info, document_store->GetOptimizeInfo());
EXPECT_THAT(optimize_info.total_docs, Eq(1));
EXPECT_THAT(optimize_info.optimizable_docs, Eq(1));
@@ -2002,12 +3224,14 @@ TEST_F(DocumentStoreTest, GetOptimizeInfo) {
std::string optimized_dir = document_store_dir_ + "_optimize";
EXPECT_TRUE(filesystem_.DeleteDirectoryRecursively(optimized_dir.c_str()));
EXPECT_TRUE(filesystem_.CreateDirectoryRecursively(optimized_dir.c_str()));
- ICING_ASSERT_OK(document_store->OptimizeInto(optimized_dir));
+ ICING_ASSERT_OK(
+ document_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
document_store.reset();
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> optimized_document_store,
- DocumentStore::Create(&filesystem_, optimized_dir, &fake_clock_,
- schema_store_.get()));
+ create_result, CreateDocumentStore(&filesystem_, optimized_dir,
+ &fake_clock_, schema_store_.get()));
+ std::unique_ptr<DocumentStore> optimized_document_store =
+ std::move(create_result.document_store);
ICING_ASSERT_OK_AND_ASSIGN(optimize_info,
optimized_document_store->GetOptimizeInfo());
@@ -2016,11 +3240,13 @@ TEST_F(DocumentStoreTest, GetOptimizeInfo) {
EXPECT_THAT(optimize_info.estimated_optimizable_bytes, Eq(0));
}
-TEST_F(DocumentStoreTest, GetAllNamespaces) {
+TEST_P(DocumentStoreTest, GetAllNamespaces) {
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<DocumentStore> document_store,
- DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
- schema_store_.get()));
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
// Empty namespaces to start with
EXPECT_THAT(document_store->GetAllNamespaces(), IsEmpty());
@@ -2066,13 +3292,15 @@ TEST_F(DocumentStoreTest, GetAllNamespaces) {
// After deleting namespace2_uri1, there's still namespace2_uri2, so
// "namespace2" still shows up in results
- ICING_EXPECT_OK(document_store->Delete("namespace2", "uri1"));
+ ICING_EXPECT_OK(document_store->Delete(
+ "namespace2", "uri1", fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(document_store->GetAllNamespaces(),
UnorderedElementsAre("namespace1", "namespace2", "namespace3"));
  // After deleting namespace2_uri2, there are no more documents in "namespace2"
- ICING_EXPECT_OK(document_store->Delete("namespace2", "uri2"));
+ ICING_EXPECT_OK(document_store->Delete(
+ "namespace2", "uri2", fake_clock_.GetSystemTimeMilliseconds()));
EXPECT_THAT(document_store->GetAllNamespaces(),
UnorderedElementsAre("namespace1", "namespace3"));
@@ -2084,5 +3312,1587 @@ TEST_F(DocumentStoreTest, GetAllNamespaces) {
UnorderedElementsAre("namespace1"));
}
+TEST_P(DocumentStoreTest, ReportUsageWithDifferentTimestampsAndGetUsageScores) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store->Put(test_document1_));
+
+ // Report usage with type 1 and time 1.
+ UsageReport usage_report_type1_time1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type1_time1));
+
+ UsageStore::UsageScores expected_scores;
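+  // Usage timestamps are stored with second granularity, so the 1000-ms
+  // report above is expected to surface as 1 second here.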
+ expected_scores.usage_type1_last_used_timestamp_s = 1;
+ ++expected_scores.usage_type1_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ UsageStore::UsageScores actual_scores,
+ document_store->GetUsageScores(document_id,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+
+ // Report usage with type 1 and time 5, time should be updated.
+ UsageReport usage_report_type1_time5 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/5000,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type1_time5));
+
+ expected_scores.usage_type1_last_used_timestamp_s = 5;
+ ++expected_scores.usage_type1_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ actual_scores, document_store->GetUsageScores(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+
+ // Report usage with type 2 and time 1.
+ UsageReport usage_report_type2_time1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE2);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type2_time1));
+
+ expected_scores.usage_type2_last_used_timestamp_s = 1;
+ ++expected_scores.usage_type2_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ actual_scores, document_store->GetUsageScores(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+
+ // Report usage with type 2 and time 5.
+ UsageReport usage_report_type2_time5 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/5000,
+ UsageReport::USAGE_TYPE2);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type2_time5));
+
+ expected_scores.usage_type2_last_used_timestamp_s = 5;
+ ++expected_scores.usage_type2_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ actual_scores, document_store->GetUsageScores(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+
+ // Report usage with type 3 and time 1.
+ UsageReport usage_report_type3_time1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE3);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type3_time1));
+
+ expected_scores.usage_type3_last_used_timestamp_s = 1;
+ ++expected_scores.usage_type3_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ actual_scores, document_store->GetUsageScores(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+
+ // Report usage with type 3 and time 5.
+ UsageReport usage_report_type3_time5 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/5000,
+ UsageReport::USAGE_TYPE3);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type3_time5));
+
+ expected_scores.usage_type3_last_used_timestamp_s = 5;
+ ++expected_scores.usage_type3_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ actual_scores, document_store->GetUsageScores(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+}
+
+TEST_P(DocumentStoreTest, ReportUsageWithDifferentTypesAndGetUsageScores) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ document_store->Put(test_document1_));
+
+ // Report usage with type 1.
+ UsageReport usage_report_type1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type1));
+
+ UsageStore::UsageScores expected_scores;
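+  // A default-constructed UsageScores starts with zeroed counts, so each
+  // report below should bump exactly one counter.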
+ ++expected_scores.usage_type1_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ UsageStore::UsageScores actual_scores,
+ document_store->GetUsageScores(document_id,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+
+ // Report usage with type 2.
+ UsageReport usage_report_type2 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE2);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type2));
+
+ ++expected_scores.usage_type2_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ actual_scores, document_store->GetUsageScores(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+
+ // Report usage with type 3.
+ UsageReport usage_report_type3 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE3);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type3));
+
+ ++expected_scores.usage_type3_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ actual_scores, document_store->GetUsageScores(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+}
+
+TEST_P(DocumentStoreTest, UsageScoresShouldNotBeClearedOnChecksumMismatch) {
+ UsageStore::UsageScores expected_scores;
+ DocumentId document_id;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(document_id,
+ document_store->Put(test_document1_));
+
+ // Report usage with type 1.
+ UsageReport usage_report_type1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type1));
+
+ ++expected_scores.usage_type1_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ UsageStore::UsageScores actual_scores,
+ document_store->GetUsageScores(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+ }
+
+ CorruptDocStoreHeaderChecksumFile();
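+  // The usage store should survive regeneration of the derived files, so the
+  // scores recorded above must come back unchanged after recovery.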
+ // Successfully recover from a corrupt derived file issue.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ // Usage scores should be the same.
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ UsageStore::UsageScores actual_scores,
+ document_store->GetUsageScores(document_id,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+}
+
+TEST_P(DocumentStoreTest, UsageScoresShouldBeAvailableAfterDataLoss) {
+ UsageStore::UsageScores expected_scores;
+ DocumentId document_id;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ document_id, document_store->Put(DocumentProto(test_document1_)));
+
+ // Report usage with type 1.
+ UsageReport usage_report_type1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type1));
+
+ ++expected_scores.usage_type1_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ UsageStore::UsageScores actual_scores,
+ document_store->GetUsageScores(
+ document_id, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+ }
+
+ // "Corrupt" the content written in the log by adding non-checksummed data to
+ // it. This will mess up the checksum of the proto log, forcing it to rewind
+ // to the last saved point.
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+ const std::string serialized_document = document.SerializeAsString();
+
+ const std::string document_log_file = absl_ports::StrCat(
+ document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename());
+ int64_t file_size = filesystem_.GetFileSize(document_log_file.c_str());
+ filesystem_.PWrite(document_log_file.c_str(), file_size,
+ serialized_document.data(), serialized_document.size());
+
+ // Successfully recover from a data loss issue.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ // Usage scores should still be available.
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ UsageStore::UsageScores actual_scores,
+ document_store->GetUsageScores(document_id,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+}
+
+TEST_P(DocumentStoreTest, UsageScoresShouldBeCopiedOverToUpdatedDocument) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id,
+ document_store->Put(DocumentProto(test_document1_)));
+
+ // Report usage with type 1.
+ UsageReport usage_report_type1 = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/1", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report_type1));
+
+ UsageStore::UsageScores expected_scores;
+ ++expected_scores.usage_type1_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ UsageStore::UsageScores actual_scores,
+ document_store->GetUsageScores(document_id,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+
+ // Update the document.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId updated_document_id,
+ document_store->Put(DocumentProto(test_document1_)));
+ // We should get a different document id.
+ ASSERT_THAT(updated_document_id, Not(Eq(document_id)));
+
+ // Usage scores should be the same.
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ actual_scores,
+ document_store->GetUsageScores(updated_document_id,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+}
+
+TEST_P(DocumentStoreTest, UsageScoresShouldPersistOnOptimize) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id1,
+ document_store->Put(DocumentProto(test_document1_)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentId document_id2,
+ document_store->Put(DocumentProto(test_document2_)));
+ ICING_ASSERT_OK(document_store->Delete(
+ document_id1, fake_clock_.GetSystemTimeMilliseconds()));
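+  // Deleting document_id1 leaves a gap that OptimizeInto() will compact,
+  // shifting document_id2 down by one (checked below).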
+
+ // Report usage of document 2.
+ UsageReport usage_report = CreateUsageReport(
+ /*name_space=*/"icing", /*uri=*/"email/2", /*timestamp_ms=*/0,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(document_store->ReportUsage(usage_report));
+
+ UsageStore::UsageScores expected_scores;
+ ++expected_scores.usage_type1_count;
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ UsageStore::UsageScores actual_scores,
+ document_store->GetUsageScores(document_id2,
+ fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+
+ // Run optimize
+ std::string optimized_dir = document_store_dir_ + "/optimize_test";
+ filesystem_.CreateDirectoryRecursively(optimized_dir.c_str());
+ ICING_ASSERT_OK(
+ document_store->OptimizeInto(optimized_dir, lang_segmenter_.get()));
+
+ // Get optimized document store
+ ICING_ASSERT_OK_AND_ASSIGN(
+ create_result, CreateDocumentStore(&filesystem_, optimized_dir,
+ &fake_clock_, schema_store_.get()));
+ std::unique_ptr<DocumentStore> optimized_document_store =
+ std::move(create_result.document_store);
+
+ // Usage scores should be the same.
+ // The original document_id2 should have become document_id2 - 1.
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ actual_scores,
+ optimized_document_store->GetUsageScores(
+ document_id2 - 1, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(actual_scores, Eq(expected_scores));
+}
+
+TEST_P(DocumentStoreTest, DetectPartialDataLoss) {
+ {
+    // Can put and get documents fine.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+ EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE));
+ EXPECT_THAT(create_result.derived_files_regenerated, IsFalse());
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(DocumentProto(test_document1_)));
+ EXPECT_THAT(doc_store->Get(document_id),
+ IsOkAndHolds(EqualsProto(test_document1_)));
+ }
+
+ // "Corrupt" the content written in the log by adding non-checksummed data to
+ // it. This will mess up the checksum of the proto log, forcing it to rewind
+ // to the last saved point and triggering data loss.
+ DocumentProto document = DocumentBuilder().SetKey("namespace", "uri").Build();
+ const std::string serialized_document = document.SerializeAsString();
+
+  const std::string document_log_file = absl_ports::StrCat(
+      document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename());
+ int64_t file_size = filesystem_.GetFileSize(document_log_file.c_str());
+ filesystem_.PWrite(document_log_file.c_str(), file_size,
+ serialized_document.data(), serialized_document.size());
+
+ // Successfully recover from a data loss issue.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+ EXPECT_THAT(create_result.data_loss, Eq(DataLoss::PARTIAL));
+ EXPECT_THAT(create_result.derived_files_regenerated, IsTrue());
+}
+
+TEST_P(DocumentStoreTest, DetectCompleteDataLoss) {
+ int64_t corruptible_offset;
+ const std::string document_log_file = absl_ports::StrCat(
+ document_store_dir_, "/", DocumentLogCreator::GetDocumentLogFilename());
+ {
+    // Can put and get documents fine.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+ EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE));
+ EXPECT_THAT(create_result.derived_files_regenerated, IsFalse());
+
+    // There's some space at the beginning of the file (e.g. the header,
+    // kMagic, etc.) that is necessary to initialize the FileBackedProtoLog.
+    // We can't corrupt that region, so we need to figure out the offset at
+    // which documents will be written - which is the file size after
+    // initialization.
+ corruptible_offset = filesystem_.GetFileSize(document_log_file.c_str());
+
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(DocumentProto(test_document1_)));
+ EXPECT_THAT(doc_store->Get(document_id),
+ IsOkAndHolds(EqualsProto(test_document1_)));
+ }
+
+ // "Corrupt" the persisted content written in the log. We can't recover if
+ // the persisted data was corrupted.
+ std::string corruption = "abc";
+ filesystem_.PWrite(document_log_file.c_str(),
+ /*offset=*/corruptible_offset, corruption.data(),
+ corruption.size());
+
+ {
+ // "Corrupt" the content written in the log. Make the corrupt document
+ // smaller than our original one so we don't accidentally write past our
+ // file.
+ DocumentProto document =
+ DocumentBuilder().SetKey("invalid_namespace", "invalid_uri").Build();
+ std::string serialized_document = document.SerializeAsString();
+ ASSERT_TRUE(filesystem_.PWrite(
+ document_log_file.c_str(), corruptible_offset,
+ serialized_document.data(), serialized_document.size()));
+
+ PortableFileBackedProtoLog<DocumentWrapper>::Header header =
+ ReadDocumentLogHeader(filesystem_, document_log_file);
+
+ // Set dirty bit to true to reflect that something changed in the log.
+ header.SetDirtyFlag(true);
+ header.SetHeaderChecksum(header.CalculateHeaderChecksum());
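+    // With the header checksum valid again, initialization gets past the
+    // header and then hits the corrupted document region, which cannot be
+    // recovered from - hence DataLoss::COMPLETE below.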
+
+ WriteDocumentLogHeader(filesystem_, document_log_file, header);
+ }
+
+ // Successfully recover from a data loss issue.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+ EXPECT_THAT(create_result.data_loss, Eq(DataLoss::COMPLETE));
+ EXPECT_THAT(create_result.derived_files_regenerated, IsTrue());
+}
+
+TEST_P(DocumentStoreTest, LoadScoreCacheAndInitializeSuccessfully) {
+ // The directory testdata/score_cache_without_length_in_tokens/document_store
+ // contains only the scoring_cache and the document_store_header (holding the
+ // crc for the scoring_cache). If the current code is compatible with the
+ // format of the v0 scoring_cache, then an empty document store should be
+ // initialized, but the non-empty scoring_cache should be retained. The
+  // current document-associated-score-data has a new field with respect to the
+  // ones stored in testdata/score_cache_without_length_in_tokens, hence the
+ // document store's initialization requires regenerating its derived files.
+
+ // Create dst directory
+ ASSERT_THAT(filesystem_.CreateDirectory(document_store_dir_.c_str()), true);
+
+ // Get src files
+ std::string document_store_without_length_in_tokens;
+ if (IsAndroidArm() || IsIosPlatform()) {
+ document_store_without_length_in_tokens = GetTestFilePath(
+ "icing/testdata/score_cache_without_length_in_tokens/"
+ "document_store_android_ios_compatible");
+ } else if (IsAndroidX86()) {
+ document_store_without_length_in_tokens = GetTestFilePath(
+ "icing/testdata/score_cache_without_length_in_tokens/"
+ "document_store_android_x86");
+ } else {
+ document_store_without_length_in_tokens = GetTestFilePath(
+ "icing/testdata/score_cache_without_length_in_tokens/"
+ "document_store");
+ }
+ Filesystem filesystem;
+ ICING_LOG(INFO) << "Copying files "
+ << document_store_without_length_in_tokens;
+ ASSERT_THAT(
+ filesystem.CopyDirectory(document_store_without_length_in_tokens.c_str(),
+ document_store_dir_.c_str(), /*recursive=*/true),
+ true);
+
+ InitializeStatsProto initialize_stats;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, document_store_dir_, &fake_clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv,
+ GetParam().use_persistent_hash_map,
+ PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel,
+ &initialize_stats));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+  // The document log uses the legacy v0 format, so a migration is needed,
+  // which will also trigger regeneration.
+ EXPECT_THAT(initialize_stats.document_store_recovery_cause(),
+ Eq(InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT));
+ // There should be no data loss, but we still need to regenerate derived files
+  // since we migrated the document log from v0 to v1.
+ EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE));
+ EXPECT_THAT(create_result.derived_files_regenerated, IsTrue());
+}
+
+TEST_P(DocumentStoreTest, DocumentStoreStorageInfo) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ // Add three documents.
+ DocumentProto document1 = test_document1_;
+ document1.set_namespace_("namespace.1");
+ document1.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document1));
+
+ DocumentProto document2 = test_document1_;
+ document2.set_namespace_("namespace.1");
+ document2.set_uri("uri2");
+ document2.set_creation_timestamp_ms(fake_clock_.GetSystemTimeMilliseconds());
+ document2.set_ttl_ms(100);
+ ICING_ASSERT_OK(doc_store->Put(document2));
+
+ DocumentProto document3 = test_document1_;
+ document3.set_namespace_("namespace.1");
+ document3.set_uri("uri3");
+ ICING_ASSERT_OK(doc_store->Put(document3));
+
+ DocumentProto document4 = test_document1_;
+ document4.set_namespace_("namespace.2");
+ document4.set_uri("uri1");
+ ICING_ASSERT_OK(doc_store->Put(document4));
+
+ // Report usage with type 1 on document1
+ UsageReport usage_report_type1 = CreateUsageReport(
+ /*name_space=*/"namespace.1", /*uri=*/"uri1", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(doc_store->ReportUsage(usage_report_type1));
+
+ // Report usage with type 2 on document2
+ UsageReport usage_report_type2 = CreateUsageReport(
+ /*name_space=*/"namespace.1", /*uri=*/"uri2", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE2);
+ ICING_ASSERT_OK(doc_store->ReportUsage(usage_report_type2));
+
+ // Report usage with type 3 on document3
+ UsageReport usage_report_type3 = CreateUsageReport(
+ /*name_space=*/"namespace.1", /*uri=*/"uri3", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE3);
+ ICING_ASSERT_OK(doc_store->ReportUsage(usage_report_type3));
+
+ // Report usage with type 1 on document4
+ usage_report_type1 = CreateUsageReport(
+ /*name_space=*/"namespace.2", /*uri=*/"uri1", /*timestamp_ms=*/1000,
+ UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(doc_store->ReportUsage(usage_report_type1));
+
+ // Delete the first doc.
+ ICING_ASSERT_OK(doc_store->Delete(document1.namespace_(), document1.uri(),
+ fake_clock_.GetSystemTimeMilliseconds()));
+
+ // Expire the second doc.
+ fake_clock_.SetSystemTimeMilliseconds(document2.creation_timestamp_ms() +
+ document2.ttl_ms() + 1);
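+  // Expiration is evaluated lazily against the clock, so advancing the fake
+  // clock past creation + ttl is enough - no explicit delete is needed.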
+
+ // Check high level info
+ DocumentStorageInfoProto storage_info = doc_store->GetStorageInfo();
+ EXPECT_THAT(storage_info.num_alive_documents(), Eq(2));
+ EXPECT_THAT(storage_info.num_deleted_documents(), Eq(1));
+ EXPECT_THAT(storage_info.num_expired_documents(), Eq(1));
+ EXPECT_THAT(storage_info.document_store_size(), Ge(0));
+ EXPECT_THAT(storage_info.document_log_size(), Ge(0));
+ EXPECT_THAT(storage_info.key_mapper_size(), Ge(0));
+ EXPECT_THAT(storage_info.document_id_mapper_size(), Ge(0));
+ EXPECT_THAT(storage_info.score_cache_size(), Ge(0));
+ EXPECT_THAT(storage_info.filter_cache_size(), Ge(0));
+ EXPECT_THAT(storage_info.corpus_mapper_size(), Ge(0));
+ EXPECT_THAT(storage_info.corpus_score_cache_size(), Ge(0));
+ EXPECT_THAT(storage_info.namespace_id_mapper_size(), Ge(0));
+ EXPECT_THAT(storage_info.num_namespaces(), Eq(2));
+
+ // Check per-namespace info
+ EXPECT_THAT(storage_info.namespace_storage_info_size(), Eq(2));
+
+ NamespaceStorageInfoProto namespace_storage_info =
+ GetNamespaceStorageInfo(storage_info, "namespace.1");
+ EXPECT_THAT(namespace_storage_info.num_alive_documents(), Eq(1));
+ EXPECT_THAT(namespace_storage_info.num_expired_documents(), Eq(1));
+ EXPECT_THAT(namespace_storage_info.num_alive_documents_usage_type1(), Eq(0));
+ EXPECT_THAT(namespace_storage_info.num_alive_documents_usage_type2(), Eq(0));
+ EXPECT_THAT(namespace_storage_info.num_alive_documents_usage_type3(), Eq(1));
+ EXPECT_THAT(namespace_storage_info.num_expired_documents_usage_type1(),
+ Eq(0));
+ EXPECT_THAT(namespace_storage_info.num_expired_documents_usage_type2(),
+ Eq(1));
+ EXPECT_THAT(namespace_storage_info.num_expired_documents_usage_type3(),
+ Eq(0));
+
+ namespace_storage_info = GetNamespaceStorageInfo(storage_info, "namespace.2");
+ EXPECT_THAT(namespace_storage_info.num_alive_documents(), Eq(1));
+ EXPECT_THAT(namespace_storage_info.num_expired_documents(), Eq(0));
+ EXPECT_THAT(namespace_storage_info.num_alive_documents_usage_type1(), Eq(1));
+ EXPECT_THAT(namespace_storage_info.num_alive_documents_usage_type2(), Eq(0));
+ EXPECT_THAT(namespace_storage_info.num_alive_documents_usage_type3(), Eq(0));
+ EXPECT_THAT(namespace_storage_info.num_expired_documents_usage_type1(),
+ Eq(0));
+ EXPECT_THAT(namespace_storage_info.num_expired_documents_usage_type2(),
+ Eq(0));
+ EXPECT_THAT(namespace_storage_info.num_expired_documents_usage_type3(),
+ Eq(0));
+}
+
+TEST_P(DocumentStoreTest, InitializeForceRecoveryUpdatesTypeIds) {
+ // Start fresh and set the schema with one type.
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+ SchemaTypeConfigProto email_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ASSERT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+ // The typeid for "email" should be 0.
+ ASSERT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
+
+ DocumentId docid = kInvalidDocumentId;
+ {
+ // Create the document store the first time and add an email document.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto doc =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "body bar")
+ .SetScore(document1_score_)
+ .SetCreationTimestampMs(
+ document1_creation_timestamp_) // A random timestamp
+ .SetTtlMs(document1_ttl_)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(doc));
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ DocumentFilterData filter_data,
+ doc_store->GetAliveDocumentFilterData(
+ docid, fake_clock_.GetSystemTimeMilliseconds()));
+
+ ASSERT_THAT(filter_data.schema_type_id(), Eq(0));
+ }
+
+ // Add another type to the schema before the email type.
+ schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("alarm")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("time")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(email_type_config)
+ .Build();
+ ASSERT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+ // Adding a new type should cause ids to be reassigned. Ids are assigned in
+ // order of appearance so 'alarm' should be 0 and 'email' should be 1.
+ ASSERT_THAT(schema_store->GetSchemaTypeId("alarm"), IsOkAndHolds(0));
+ ASSERT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(1));
+
+ {
+ // Create the document store the second time and force recovery
+ InitializeStatsProto initialize_stats;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(),
+ /*force_recovery_and_revalidate_documents=*/true,
+ GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv,
+ GetParam().use_persistent_hash_map,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ &initialize_stats));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ // Ensure that the type id of the email document has been correctly updated.
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ DocumentFilterData filter_data,
+ doc_store->GetAliveDocumentFilterData(
+ docid, fake_clock_.GetSystemTimeMilliseconds()));
+ EXPECT_THAT(filter_data.schema_type_id(), Eq(1));
+ EXPECT_THAT(initialize_stats.document_store_recovery_cause(),
+ Eq(InitializeStatsProto::SCHEMA_CHANGES_OUT_OF_SYNC));
+ }
+}
+
+TEST_P(DocumentStoreTest, InitializeDontForceRecoveryDoesntUpdateTypeIds) {
+ // Start fresh and set the schema with one type.
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+ SchemaTypeConfigProto email_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ASSERT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+ // The typeid for "email" should be 0.
+ ASSERT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(0));
+
+ DocumentId docid = kInvalidDocumentId;
+ {
+ // Create the document store the first time and add an email document.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentProto doc =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "body bar")
+ .SetScore(document1_score_)
+ .SetCreationTimestampMs(
+ document1_creation_timestamp_) // A random timestamp
+ .SetTtlMs(document1_ttl_)
+ .Build();
+ ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(doc));
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ DocumentFilterData filter_data,
+ doc_store->GetAliveDocumentFilterData(
+ docid, fake_clock_.GetSystemTimeMilliseconds()));
+
+ ASSERT_THAT(filter_data.schema_type_id(), Eq(0));
+ }
+
+ // Add another type to the schema.
+ schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("alarm")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("time")
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(email_type_config)
+ .Build();
+ ASSERT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+ // Adding a new type should cause ids to be reassigned. Ids are assigned in
+ // order of appearance so 'alarm' should be 0 and 'email' should be 1.
+ ASSERT_THAT(schema_store->GetSchemaTypeId("alarm"), IsOkAndHolds(0));
+ ASSERT_THAT(schema_store->GetSchemaTypeId("email"), IsOkAndHolds(1));
+
+ {
+ // Create the document store the second time. Don't force recovery.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ // Check that the type id of the email document has not been updated.
+ ICING_ASSERT_HAS_VALUE_AND_ASSIGN(
+ DocumentFilterData filter_data,
+ doc_store->GetAliveDocumentFilterData(
+ docid, fake_clock_.GetSystemTimeMilliseconds()));
+ ASSERT_THAT(filter_data.schema_type_id(), Eq(0));
+ }
+}
+
+TEST_P(DocumentStoreTest, InitializeForceRecoveryDeletesInvalidDocument) {
+ // Start fresh and set the schema with one type.
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+ SchemaTypeConfigProto email_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ASSERT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ DocumentProto docWithBody =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "body bar")
+ .SetScore(document1_score_)
+ .SetCreationTimestampMs(
+ document1_creation_timestamp_) // A random timestamp
+ .SetTtlMs(document1_ttl_)
+ .Build();
+ DocumentProto docWithoutBody =
+ DocumentBuilder()
+ .SetKey("icing", "email/2")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetScore(document1_score_)
+ .SetCreationTimestampMs(
+ document1_creation_timestamp_) // A random timestamp
+ .SetTtlMs(document1_ttl_)
+ .Build();
+
+ {
+ // Create the document store the first time and add two email documents: one
+ // that has the 'body' section and one that doesn't.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentId docid = kInvalidDocumentId;
+ ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithBody));
+ ASSERT_NE(docid, kInvalidDocumentId);
+ docid = kInvalidDocumentId;
+ ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithoutBody));
+ ASSERT_NE(docid, kInvalidDocumentId);
+
+ ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()),
+ IsOkAndHolds(EqualsProto(docWithBody)));
+ ASSERT_THAT(
+ doc_store->Get(docWithoutBody.namespace_(), docWithoutBody.uri()),
+ IsOkAndHolds(EqualsProto(docWithoutBody)));
+ }
+
+ // Delete the 'body' property from the 'email' type, making all pre-existing
+ // documents with the 'body' property invalid.
+ email_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ schema = SchemaBuilder().AddType(email_type_config).Build();
+ ASSERT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ {
+ // Create the document store the second time and force recovery
+ CorruptDocStoreHeaderChecksumFile();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, document_store_dir_, &fake_clock_, schema_store.get(),
+ /*force_recovery_and_revalidate_documents=*/true,
+ GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv,
+ GetParam().use_persistent_hash_map,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ ASSERT_THAT(
+ doc_store->Get(docWithoutBody.namespace_(), docWithoutBody.uri()),
+ IsOkAndHolds(EqualsProto(docWithoutBody)));
+ }
+}
+
+TEST_P(DocumentStoreTest, InitializeDontForceRecoveryKeepsInvalidDocument) {
+ // Start fresh and set the schema with one type.
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+ SchemaTypeConfigProto email_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ SchemaProto schema = SchemaBuilder().AddType(email_type_config).Build();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+ ASSERT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ DocumentProto docWithBody =
+ DocumentBuilder()
+ .SetKey("icing", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .AddStringProperty("body", "body bar")
+ .SetScore(document1_score_)
+ .SetCreationTimestampMs(
+ document1_creation_timestamp_) // A random timestamp
+ .SetTtlMs(document1_ttl_)
+ .Build();
+ DocumentProto docWithoutBody =
+ DocumentBuilder()
+ .SetKey("icing", "email/2")
+ .SetSchema("email")
+ .AddStringProperty("subject", "subject foo")
+ .SetScore(document1_score_)
+ .SetCreationTimestampMs(
+ document1_creation_timestamp_) // A random timestamp
+ .SetTtlMs(document1_ttl_)
+ .Build();
+
+ {
+ // Create the document store the first time and add two email documents: one
+ // that has the 'body' section and one that doesn't.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ DocumentId docid = kInvalidDocumentId;
+ ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithBody));
+ ASSERT_NE(docid, kInvalidDocumentId);
+ docid = kInvalidDocumentId;
+ ICING_ASSERT_OK_AND_ASSIGN(docid, doc_store->Put(docWithoutBody));
+ ASSERT_NE(docid, kInvalidDocumentId);
+
+ ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()),
+ IsOkAndHolds(EqualsProto(docWithBody)));
+ ASSERT_THAT(
+ doc_store->Get(docWithoutBody.namespace_(), docWithoutBody.uri()),
+ IsOkAndHolds(EqualsProto(docWithoutBody)));
+ }
+
+ // Delete the 'body' property from the 'email' type, making all pre-existing
+ // documents with the 'body' property invalid.
+ email_type_config =
+ SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .Build();
+ schema = SchemaBuilder().AddType(email_type_config).Build();
+ ASSERT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/true,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ {
+ // Corrupt the document store header checksum so that we will perform
+ // recovery, but without revalidation.
+ CorruptDocStoreHeaderChecksumFile();
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+
+ ASSERT_THAT(doc_store->Get(docWithBody.namespace_(), docWithBody.uri()),
+ IsOkAndHolds(EqualsProto(docWithBody)));
+ ASSERT_THAT(
+ doc_store->Get(docWithoutBody.namespace_(), docWithoutBody.uri()),
+ IsOkAndHolds(EqualsProto(docWithoutBody)));
+ }
+}
+
+TEST_P(DocumentStoreTest, MigrateToPortableFileBackedProtoLog) {
+ // Set up schema.
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+
+ std::string schema_store_dir = schema_store_dir_ + "_migrate";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+
+ ASSERT_THAT(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
+
+ // Create dst directory that we'll initialize the DocumentStore over.
+ std::string document_store_dir = document_store_dir_ + "_migrate";
+ ASSERT_THAT(
+ filesystem_.DeleteDirectoryRecursively(document_store_dir.c_str()), true);
+ ASSERT_THAT(
+ filesystem_.CreateDirectoryRecursively(document_store_dir.c_str()), true);
+
+ // Copy the testdata files into our DocumentStore directory
+ std::string document_store_without_portable_log;
+ if (IsAndroidX86()) {
+ document_store_without_portable_log = GetTestFilePath(
+ "icing/testdata/not_portable_log/"
+ "icing_search_engine_android_x86/document_dir");
+ } else if (IsAndroidArm()) {
+ document_store_without_portable_log = GetTestFilePath(
+ "icing/testdata/not_portable_log/"
+ "icing_search_engine_android_arm/document_dir");
+ } else if (IsIosPlatform()) {
+ document_store_without_portable_log = GetTestFilePath(
+ "icing/testdata/not_portable_log/"
+ "icing_search_engine_ios/document_dir");
+ } else {
+ document_store_without_portable_log = GetTestFilePath(
+ "icing/testdata/not_portable_log/"
+ "icing_search_engine_linux/document_dir");
+ }
+
+ ASSERT_TRUE(filesystem_.CopyDirectory(
+ document_store_without_portable_log.c_str(), document_store_dir.c_str(),
+ /*recursive=*/true));
+
+ // Initialize the DocumentStore over our copied files.
+ InitializeStatsProto initialize_stats;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, document_store_dir, &fake_clock_, schema_store.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+          GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv,
+          GetParam().use_persistent_hash_map,
+ PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel,
+ &initialize_stats));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ // These are the documents that are stored in the testdata files. Do not
+ // change unless you're also updating the testdata files.
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "uri1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(10)
+ .AddStringProperty("subject", "foo")
+ .AddStringProperty("body", "bar")
+ .Build();
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace1", "uri2")
+ .SetSchema("email")
+ .SetCreationTimestampMs(20)
+ .SetScore(321)
+ .AddStringProperty("body", "baz bat")
+ .Build();
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace2", "uri1")
+ .SetSchema("email")
+ .SetCreationTimestampMs(30)
+ .SetScore(123)
+ .AddStringProperty("subject", "phoo")
+ .Build();
+
+ // Check that we didn't lose anything. A migration also doesn't technically
+ // count as data loss, but we still have to regenerate derived files after
+ // migration.
+ EXPECT_THAT(create_result.data_loss, Eq(DataLoss::NONE));
+ EXPECT_THAT(create_result.derived_files_regenerated, IsTrue());
+ EXPECT_EQ(initialize_stats.document_store_recovery_cause(),
+ InitializeStatsProto::LEGACY_DOCUMENT_LOG_FORMAT);
+
+  // Documents 1 and 3 were put normally, and document 2 was deleted in our
+  // testdata files.
+  //
+  // Check by (namespace, uri).
+ EXPECT_THAT(document_store->Get(document1.namespace_(), document1.uri()),
+ IsOkAndHolds(EqualsProto(document1)));
+ EXPECT_THAT(document_store->Get(document2.namespace_(), document2.uri()),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(document3.namespace_(), document3.uri()),
+ IsOkAndHolds(EqualsProto(document3)));
+
+ // Check by document_id
+ EXPECT_THAT(document_store->Get(/*document_id=*/0),
+ IsOkAndHolds(EqualsProto(document1)));
+ EXPECT_THAT(document_store->Get(/*document_id=*/1),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ EXPECT_THAT(document_store->Get(/*document_id=*/2),
+ IsOkAndHolds(EqualsProto(document3)));
+}
+
+TEST_P(DocumentStoreTest, GetDebugInfo) {
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType("email")
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("subject")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName("body")
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(SchemaTypeConfigBuilder().SetType("person").AddProperty(
+ PropertyConfigBuilder()
+ .SetName("name")
+ .SetDataTypeString(TERM_MATCH_EXACT, TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ std::string schema_store_dir = schema_store_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+
+ ICING_ASSERT_OK(schema_store->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+
+ DocumentProto document1 = DocumentBuilder()
+ .SetKey("namespace1", "email/1")
+ .SetSchema("email")
+ .AddStringProperty("subject", "aa bb cc")
+ .AddStringProperty("body", "dd ee")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK(document_store->Put(document1, 5));
+
+ DocumentProto document2 = DocumentBuilder()
+ .SetKey("namespace2", "email/2")
+ .SetSchema("email")
+ .AddStringProperty("subject", "aa bb")
+ .AddStringProperty("body", "cc")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK(document_store->Put(document2, 3));
+
+ DocumentProto document3 = DocumentBuilder()
+ .SetKey("namespace2", "email/3")
+ .SetSchema("email")
+ .AddStringProperty("subject", "aa")
+ .AddStringProperty("body", "")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK(document_store->Put(document3, 1));
+
+ DocumentProto document4 = DocumentBuilder()
+ .SetKey("namespace1", "person/1")
+ .SetSchema("person")
+ .AddStringProperty("name", "test test")
+ .SetCreationTimestampMs(1)
+ .Build();
+ ICING_ASSERT_OK(document_store->Put(document4, 2));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentDebugInfoProto out1,
+ document_store->GetDebugInfo(DebugInfoVerbosity::DETAILED));
+ EXPECT_THAT(out1.crc(), Gt(0));
+ EXPECT_THAT(out1.document_storage_info().num_alive_documents(), Eq(4));
+ EXPECT_THAT(out1.document_storage_info().num_deleted_documents(), Eq(0));
+ EXPECT_THAT(out1.document_storage_info().num_expired_documents(), Eq(0));
+
+ DocumentDebugInfoProto::CorpusInfo info1, info2, info3;
+ info1.set_namespace_("namespace1");
+ info1.set_schema("email");
+ info1.set_total_documents(1); // document1
+ info1.set_total_token(5);
+
+ info2.set_namespace_("namespace2");
+ info2.set_schema("email");
+ info2.set_total_documents(2); // document2 and document3
+ info2.set_total_token(4); // 3 + 1
+
+ info3.set_namespace_("namespace1");
+ info3.set_schema("person");
+ info3.set_total_documents(1); // document4
+ info3.set_total_token(2);
+
+ EXPECT_THAT(out1.corpus_info(),
+ UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2),
+ EqualsProto(info3)));
+
+ // Delete document3.
+ ICING_ASSERT_OK(document_store->Delete(
+ "namespace2", "email/3", fake_clock_.GetSystemTimeMilliseconds()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentDebugInfoProto out2,
+ document_store->GetDebugInfo(DebugInfoVerbosity::DETAILED));
+ EXPECT_THAT(out2.crc(), Gt(0));
+ EXPECT_THAT(out2.crc(), Not(Eq(out1.crc())));
+ EXPECT_THAT(out2.document_storage_info().num_alive_documents(), Eq(3));
+ EXPECT_THAT(out2.document_storage_info().num_deleted_documents(), Eq(1));
+ EXPECT_THAT(out2.document_storage_info().num_expired_documents(), Eq(0));
+ info2.set_total_documents(1); // document2
+ info2.set_total_token(3);
+ EXPECT_THAT(out2.corpus_info(),
+ UnorderedElementsAre(EqualsProto(info1), EqualsProto(info2),
+ EqualsProto(info3)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentDebugInfoProto out3,
+ document_store->GetDebugInfo(DebugInfoVerbosity::BASIC));
+ EXPECT_THAT(out3.corpus_info(), IsEmpty());
+}
+
+TEST_P(DocumentStoreTest, GetDebugInfoWithoutSchema) {
+ std::string schema_store_dir = schema_store_dir_ + "_custom";
+ filesystem_.DeleteDirectoryRecursively(schema_store_dir.c_str());
+ filesystem_.CreateDirectoryRecursively(schema_store_dir.c_str());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<SchemaStore> schema_store,
+ SchemaStore::Create(&filesystem_, schema_store_dir, &fake_clock_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentDebugInfoProto out,
+ document_store->GetDebugInfo(DebugInfoVerbosity::DETAILED));
+ EXPECT_THAT(out.crc(), Gt(0));
+ EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0));
+ EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0));
+ EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0));
+ EXPECT_THAT(out.corpus_info(), IsEmpty());
+}
+
+TEST_P(DocumentStoreTest, GetDebugInfoForEmptyDocumentStore) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ CreateDocumentStore(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get()));
+ std::unique_ptr<DocumentStore> document_store =
+ std::move(create_result.document_store);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentDebugInfoProto out,
+ document_store->GetDebugInfo(DebugInfoVerbosity::DETAILED));
+ EXPECT_THAT(out.crc(), Gt(0));
+ EXPECT_THAT(out.document_storage_info().num_alive_documents(), Eq(0));
+ EXPECT_THAT(out.document_storage_info().num_deleted_documents(), Eq(0));
+ EXPECT_THAT(out.document_storage_info().num_expired_documents(), Eq(0));
+ EXPECT_THAT(out.corpus_info(), IsEmpty());
+}
+
+TEST_P(DocumentStoreTest, SwitchKeyMapperTypeShouldRegenerateDerivedFiles) {
+ std::string dynamic_trie_uri_mapper_dir =
+ document_store_dir_ + "/key_mapper_dir";
+ std::string persistent_hash_map_uri_mapper_dir =
+ document_store_dir_ + "/uri_mapper";
+ DocumentId document_id1;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ GetParam().namespace_id_fingerprint,
+ GetParam().pre_mapping_fbv,
+ GetParam().use_persistent_hash_map,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+ ICING_ASSERT_OK_AND_ASSIGN(document_id1, doc_store->Put(test_document1_));
+
+ if (GetParam().use_persistent_hash_map) {
+ EXPECT_THAT(filesystem_.DirectoryExists(
+ persistent_hash_map_uri_mapper_dir.c_str()),
+ IsTrue());
+ EXPECT_THAT(
+ filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()),
+ IsFalse());
+ } else {
+ EXPECT_THAT(filesystem_.DirectoryExists(
+ persistent_hash_map_uri_mapper_dir.c_str()),
+ IsFalse());
+ EXPECT_THAT(
+ filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()),
+ IsTrue());
+ }
+ }
+
+  // Switch the key mapper type. We should get an I/O error, and derived files
+  // should be regenerated.
+ {
+ bool switch_key_mapper_flag = !GetParam().use_persistent_hash_map;
+ InitializeStatsProto initialize_stats;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv,
+ /*use_persistent_hash_map=*/switch_key_mapper_flag,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ &initialize_stats));
+ EXPECT_THAT(initialize_stats.document_store_recovery_cause(),
+ Eq(InitializeStatsProto::IO_ERROR));
+
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+ EXPECT_THAT(doc_store->GetDocumentId(test_document1_.namespace_(),
+ test_document1_.uri()),
+ IsOkAndHolds(document_id1));
+
+ if (switch_key_mapper_flag) {
+ EXPECT_THAT(filesystem_.DirectoryExists(
+ persistent_hash_map_uri_mapper_dir.c_str()),
+ IsTrue());
+ EXPECT_THAT(
+ filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()),
+ IsFalse());
+ } else {
+ EXPECT_THAT(filesystem_.DirectoryExists(
+ persistent_hash_map_uri_mapper_dir.c_str()),
+ IsFalse());
+ EXPECT_THAT(
+ filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()),
+ IsTrue());
+ }
+ }
+}
+
+TEST_P(DocumentStoreTest, SameKeyMapperTypeShouldNotRegenerateDerivedFiles) {
+ std::string dynamic_trie_uri_mapper_dir =
+ document_store_dir_ + "/key_mapper_dir";
+ std::string persistent_hash_map_uri_mapper_dir =
+ document_store_dir_ + "/uri_mapper";
+ DocumentId document_id1;
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ GetParam().namespace_id_fingerprint,
+ GetParam().pre_mapping_fbv,
+ GetParam().use_persistent_hash_map,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+ ICING_ASSERT_OK_AND_ASSIGN(document_id1, doc_store->Put(test_document1_));
+
+ if (GetParam().use_persistent_hash_map) {
+ EXPECT_THAT(filesystem_.DirectoryExists(
+ persistent_hash_map_uri_mapper_dir.c_str()),
+ IsTrue());
+ EXPECT_THAT(
+ filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()),
+ IsFalse());
+ } else {
+ EXPECT_THAT(filesystem_.DirectoryExists(
+ persistent_hash_map_uri_mapper_dir.c_str()),
+ IsFalse());
+ EXPECT_THAT(
+ filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()),
+ IsTrue());
+ }
+ }
+
+ // Use the same key mapper type. Derived files should not be regenerated.
+ {
+ InitializeStatsProto initialize_stats;
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
+ schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ GetParam().namespace_id_fingerprint,
+ GetParam().pre_mapping_fbv,
+ GetParam().use_persistent_hash_map,
+ PortableFileBackedProtoLog<
+ DocumentWrapper>::kDeflateCompressionLevel,
+ &initialize_stats));
+ EXPECT_THAT(initialize_stats.document_store_recovery_cause(),
+ Eq(InitializeStatsProto::NONE));
+
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+ EXPECT_THAT(doc_store->GetDocumentId(test_document1_.namespace_(),
+ test_document1_.uri()),
+ IsOkAndHolds(document_id1));
+
+ if (GetParam().use_persistent_hash_map) {
+ EXPECT_THAT(filesystem_.DirectoryExists(
+ persistent_hash_map_uri_mapper_dir.c_str()),
+ IsTrue());
+ EXPECT_THAT(
+ filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()),
+ IsFalse());
+ } else {
+ EXPECT_THAT(filesystem_.DirectoryExists(
+ persistent_hash_map_uri_mapper_dir.c_str()),
+ IsFalse());
+ EXPECT_THAT(
+ filesystem_.DirectoryExists(dynamic_trie_uri_mapper_dir.c_str()),
+ IsTrue());
+ }
+ }
+}
+
+TEST_P(DocumentStoreTest, GetDocumentIdByNamespaceFingerprintIdentifier) {
+ std::string dynamic_trie_uri_mapper_dir =
+ document_store_dir_ + "/key_mapper_dir";
+ std::string persistent_hash_map_uri_mapper_dir =
+ document_store_dir_ + "/uri_mapper";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ DocumentStore::CreateResult create_result,
+ DocumentStore::Create(
+ &filesystem_, document_store_dir_, &fake_clock_, schema_store_.get(),
+ /*force_recovery_and_revalidate_documents=*/false,
+ GetParam().namespace_id_fingerprint, GetParam().pre_mapping_fbv,
+ GetParam().use_persistent_hash_map,
+ PortableFileBackedProtoLog<DocumentWrapper>::kDeflateCompressionLevel,
+ /*initialize_stats=*/nullptr));
+
+ std::unique_ptr<DocumentStore> doc_store =
+ std::move(create_result.document_store);
+ ICING_ASSERT_OK_AND_ASSIGN(DocumentId document_id,
+ doc_store->Put(test_document1_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ NamespaceId namespace_id,
+ doc_store->GetNamespaceId(test_document1_.namespace_()));
+ NamespaceFingerprintIdentifier ns_fingerprint(
+ namespace_id,
+ /*target_str=*/test_document1_.uri());
+ if (GetParam().namespace_id_fingerprint) {
+ EXPECT_THAT(doc_store->GetDocumentId(ns_fingerprint),
+ IsOkAndHolds(document_id));
+
+ NamespaceFingerprintIdentifier non_existing_ns_fingerprint(
+ namespace_id + 1, /*target_str=*/test_document1_.uri());
+ EXPECT_THAT(doc_store->GetDocumentId(non_existing_ns_fingerprint),
+ StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
+ } else {
+ EXPECT_THAT(doc_store->GetDocumentId(ns_fingerprint),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+ DocumentStoreTest, DocumentStoreTest,
+ testing::Values(
+ DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/false,
+ /*pre_mapping_fbv_in=*/false,
+ /*use_persistent_hash_map_in=*/false),
+ DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/true,
+ /*pre_mapping_fbv_in=*/false,
+ /*use_persistent_hash_map_in=*/false),
+ DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/false,
+ /*pre_mapping_fbv_in=*/true,
+ /*use_persistent_hash_map_in=*/false),
+ DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/true,
+ /*pre_mapping_fbv_in=*/true,
+ /*use_persistent_hash_map_in=*/false),
+ DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/false,
+ /*pre_mapping_fbv_in=*/false,
+ /*use_persistent_hash_map_in=*/true),
+ DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/true,
+ /*pre_mapping_fbv_in=*/false,
+ /*use_persistent_hash_map_in=*/true),
+ DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/false,
+ /*pre_mapping_fbv_in=*/true,
+ /*use_persistent_hash_map_in=*/true),
+ DocumentStoreTestParam(/*namespace_id_fingerprint_in=*/true,
+ /*pre_mapping_fbv_in=*/true,
+ /*use_persistent_hash_map_in=*/true)));
+
+} // namespace
+
} // namespace lib
} // namespace icing
diff --git a/icing/store/dynamic-trie-key-mapper.h b/icing/store/dynamic-trie-key-mapper.h
new file mode 100644
index 0000000..63e8488
--- /dev/null
+++ b/icing/store/dynamic-trie-key-mapper.h
@@ -0,0 +1,334 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_DYNAMIC_TRIE_KEY_MAPPER_H_
+#define ICING_STORE_DYNAMIC_TRIE_KEY_MAPPER_H_
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <type_traits>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/absl_ports/str_join.h"
+#include "icing/file/filesystem.h"
+#include "icing/legacy/index/icing-dynamic-trie.h"
+#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/store/key-mapper.h"
+#include "icing/util/crc32.h"
+#include "icing/util/logging.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+// File-backed mapping between a string key and a trivially copyable value
+// type.
+//
+// DynamicTrieKeyMapper is thread-compatible.
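+//
+// Example usage (a minimal sketch; filesystem, base_dir, document_id, and the
+// 1 MiB budget are illustrative placeholders, not requirements of the API):
+//
+//   ICING_ASSIGN_OR_RETURN(
+//       std::unique_ptr<DynamicTrieKeyMapper<DocumentId>> mapper,
+//       DynamicTrieKeyMapper<DocumentId>::Create(
+//           filesystem, base_dir, /*maximum_size_bytes=*/1024 * 1024));
+//   ICING_RETURN_IF_ERROR(mapper->Put("default-google.com", document_id));
+//   ICING_ASSIGN_OR_RETURN(DocumentId id, mapper->Get("default-google.com"));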
+template <typename T, typename Formatter = absl_ports::DefaultFormatter>
+class DynamicTrieKeyMapper : public KeyMapper<T, Formatter> {
+ public:
+ // Returns an initialized instance of DynamicTrieKeyMapper that can
+ // immediately handle read/write operations.
+ // Returns any encountered IO errors.
+ //
+  // base_dir : Base directory used to save all the files required to persist
+  //            DynamicTrieKeyMapper. If this base_dir was previously used to
+  //            create a DynamicTrieKeyMapper, the existing data will be
+  //            loaded. Otherwise, an empty DynamicTrieKeyMapper will be
+  //            created.
+ // maximum_size_bytes : The maximum allowable size of the key mapper storage.
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<DynamicTrieKeyMapper<T, Formatter>>>
+ Create(const Filesystem& filesystem, std::string_view base_dir,
+ int maximum_size_bytes);
+
+ // Deletes all the files associated with the DynamicTrieKeyMapper.
+ //
+ // base_dir : Base directory used to save all the files required to persist
+ // DynamicTrieKeyMapper. Should be the same as passed into
+ // Create().
+ //
+ // Returns
+ // OK on success
+ // INTERNAL_ERROR on I/O error
+ static libtextclassifier3::Status Delete(const Filesystem& filesystem,
+ std::string_view base_dir);
+
+ ~DynamicTrieKeyMapper() override = default;
+
+ libtextclassifier3::Status Put(std::string_view key, T value) override;
+
+ libtextclassifier3::StatusOr<T> GetOrPut(std::string_view key,
+ T next_value) override;
+
+ libtextclassifier3::StatusOr<T> Get(std::string_view key) const override;
+
+ bool Delete(std::string_view key) override;
+
+ std::unique_ptr<typename KeyMapper<T, Formatter>::Iterator> GetIterator()
+ const override;
+
+ int32_t num_keys() const override { return trie_.size(); }
+
+ libtextclassifier3::Status PersistToDisk() override;
+
+ libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const override;
+
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const override;
+
+ libtextclassifier3::StatusOr<Crc32> ComputeChecksum() override;
+
+ private:
+ class Iterator : public KeyMapper<T, Formatter>::Iterator {
+ public:
+ explicit Iterator(const IcingDynamicTrie& trie)
+ : itr_(trie, /*prefix=*/""), start_(true) {}
+
+ ~Iterator() override = default;
+
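+    // The underlying trie iterator already points at the first entry (if any)
+    // when constructed, so the first call to Advance() only reports validity;
+    // subsequent calls actually move the iterator forward.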
+ bool Advance() override {
+ if (start_) {
+ start_ = false;
+ return itr_.IsValid();
+ }
+ return itr_.Advance();
+ }
+
+ std::string_view GetKey() const override {
+ const char* key = itr_.GetKey();
+ return std::string_view(key);
+ }
+
+ T GetValue() const override {
+ T value;
+ memcpy(&value, itr_.GetValue(), sizeof(T));
+ return value;
+ }
+
+ private:
+ IcingDynamicTrie::Iterator itr_;
+
+ // TODO(b/241784804): remove this flag after changing IcingDynamicTrie to
+ // follow the common iterator pattern in our codebase.
+ bool start_;
+ };
+
+ static constexpr char kDynamicTrieKeyMapperDir[] = "key_mapper_dir";
+ static constexpr char kDynamicTrieKeyMapperPrefix[] = "key_mapper";
+
+ // Use DynamicTrieKeyMapper::Create() to instantiate.
+ explicit DynamicTrieKeyMapper(std::string_view key_mapper_dir);
+
+  // Loads any existing DynamicTrieKeyMapper data from disk, or creates a new
+  // instance of DynamicTrieKeyMapper on disk, and gets ready to process
+  // read/write operations.
+ //
+ // Returns any encountered IO errors.
+ libtextclassifier3::Status Initialize(int maximum_size_bytes);
+
+ const std::string file_prefix_;
+
+  // TODO(adorokhine) Filesystem is a forked class that's available in both
+  // the icing and icing::lib namespaces. We will need icing::Filesystem in
+  // order to use IcingDynamicTrie. The Filesystem class should be fully
+  // refactored to have a single definition across both namespaces. Such a
+  // class should use icing (and general google3) coding conventions and
+  // behave like a proper C++ class.
+ const IcingFilesystem icing_filesystem_;
+ IcingDynamicTrie trie_;
+
+ static_assert(std::is_trivially_copyable<T>::value,
+ "T must be trivially copyable");
+};
+
+template <typename T, typename Formatter>
+libtextclassifier3::StatusOr<
+ std::unique_ptr<DynamicTrieKeyMapper<T, Formatter>>>
+DynamicTrieKeyMapper<T, Formatter>::Create(const Filesystem& filesystem,
+ std::string_view base_dir,
+ int maximum_size_bytes) {
+ // We create a subdirectory since the trie creates and stores multiple files.
+ // This makes it easier to isolate the trie files away from other files that
+ // could potentially be in the same base_dir, and makes it easier to delete.
+ const std::string key_mapper_dir =
+ absl_ports::StrCat(base_dir, "/", kDynamicTrieKeyMapperDir);
+ if (!filesystem.CreateDirectoryRecursively(key_mapper_dir.c_str())) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to create DynamicTrieKeyMapper directory: ", key_mapper_dir));
+ }
+ auto mapper = std::unique_ptr<DynamicTrieKeyMapper<T, Formatter>>(
+ new DynamicTrieKeyMapper<T, Formatter>(key_mapper_dir));
+ ICING_RETURN_IF_ERROR(mapper->Initialize(maximum_size_bytes));
+ return mapper;
+}
+
+template <typename T, typename Formatter>
+libtextclassifier3::Status DynamicTrieKeyMapper<T, Formatter>::Delete(
+ const Filesystem& filesystem, std::string_view base_dir) {
+ std::string key_mapper_dir =
+ absl_ports::StrCat(base_dir, "/", kDynamicTrieKeyMapperDir);
+ if (!filesystem.DeleteDirectoryRecursively(key_mapper_dir.c_str())) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to delete DynamicTrieKeyMapper directory: ", key_mapper_dir));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename T, typename Formatter>
+DynamicTrieKeyMapper<T, Formatter>::DynamicTrieKeyMapper(
+ std::string_view key_mapper_dir)
+ : file_prefix_(
+ absl_ports::StrCat(key_mapper_dir, "/", kDynamicTrieKeyMapperPrefix)),
+ trie_(file_prefix_,
+ IcingDynamicTrie::RuntimeOptions().set_storage_policy(
+ IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc),
+ &icing_filesystem_) {}
+
+template <typename T, typename Formatter>
+libtextclassifier3::Status DynamicTrieKeyMapper<T, Formatter>::Initialize(
+ int maximum_size_bytes) {
+ IcingDynamicTrie::Options options;
+ // Divide the max space between the three internal arrays: nodes, nexts and
+ // suffixes. MaxNodes and MaxNexts are in units of their own data structures.
+ // MaxSuffixesSize is in units of bytes.
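+  // Illustrative arithmetic (assuming a hypothetical 16-byte Node): with
+  // maximum_size_bytes = 3 MiB, max_nodes and max_nexts would each be
+  // 3 MiB / 48 B = 65536 entries, and max_suffixes_size would be
+  // 16 B * 65536 = 1 MiB.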
+ options.max_nodes = maximum_size_bytes / (3 * sizeof(IcingDynamicTrie::Node));
+ options.max_nexts = options.max_nodes;
+ options.max_suffixes_size =
+ sizeof(IcingDynamicTrie::Node) * options.max_nodes;
+ options.value_size = sizeof(T);
+
+ if (!trie_.CreateIfNotExist(options)) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to create DynamicTrieKeyMapper file: ", file_prefix_));
+ }
+ if (!trie_.Init()) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to init DynamicTrieKeyMapper file: ", file_prefix_));
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename T, typename Formatter>
+libtextclassifier3::StatusOr<T> DynamicTrieKeyMapper<T, Formatter>::GetOrPut(
+ std::string_view key, T next_value) {
+ std::string string_key(key);
+ uint32_t value_index;
+ libtextclassifier3::Status status =
+ trie_.Insert(string_key.c_str(), &next_value, &value_index,
+ /*replace=*/false);
+ if (!status.ok()) {
+ ICING_LOG(DBG) << "Unable to insert key " << string_key
+ << " into DynamicTrieKeyMapper " << file_prefix_ << ".\n"
+ << status.error_message();
+ return status;
+ }
+ // This memory address could be unaligned since we're just grabbing the value
+ // from somewhere in the trie's suffix array. The suffix array is filled with
+ // chars, so the address might not be aligned to T values.
+ const T* unaligned_value =
+ static_cast<const T*>(trie_.GetValueAtIndex(value_index));
+
+  // memcpy the value to ensure that the value returned here lives at a
+  // T-aligned address.
+ T aligned_value;
+ memcpy(&aligned_value, unaligned_value, sizeof(T));
+ return aligned_value;
+}
+
+template <typename T, typename Formatter>
+libtextclassifier3::Status DynamicTrieKeyMapper<T, Formatter>::Put(
+ std::string_view key, T value) {
+ std::string string_key(key);
+ libtextclassifier3::Status status = trie_.Insert(string_key.c_str(), &value);
+ if (!status.ok()) {
+ ICING_LOG(DBG) << "Unable to insert key " << string_key
+ << " into DynamicTrieKeyMapper " << file_prefix_ << ".\n"
+ << status.error_message();
+ return status;
+ }
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename T, typename Formatter>
+libtextclassifier3::StatusOr<T> DynamicTrieKeyMapper<T, Formatter>::Get(
+ std::string_view key) const {
+ std::string string_key(key);
+ T value;
+ if (!trie_.Find(string_key.c_str(), &value)) {
+ return absl_ports::NotFoundError(
+ absl_ports::StrCat("Key not found ", Formatter()(string_key),
+ " in DynamicTrieKeyMapper ", file_prefix_, "."));
+ }
+ return value;
+}
+
+template <typename T, typename Formatter>
+bool DynamicTrieKeyMapper<T, Formatter>::Delete(std::string_view key) {
+ return trie_.Delete(key);
+}
+
+template <typename T, typename Formatter>
+std::unique_ptr<typename KeyMapper<T, Formatter>::Iterator>
+DynamicTrieKeyMapper<T, Formatter>::GetIterator() const {
+ return std::make_unique<DynamicTrieKeyMapper<T, Formatter>::Iterator>(trie_);
+}
+
+template <typename T, typename Formatter>
+libtextclassifier3::Status DynamicTrieKeyMapper<T, Formatter>::PersistToDisk() {
+ if (!trie_.Sync()) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to sync DynamicTrieKeyMapper file: ", file_prefix_));
+ }
+
+ return libtextclassifier3::Status::OK;
+}
+
+template <typename T, typename Formatter>
+libtextclassifier3::StatusOr<int64_t>
+DynamicTrieKeyMapper<T, Formatter>::GetDiskUsage() const {
+ int64_t size = trie_.GetDiskUsage();
+ if (size == IcingFilesystem::kBadFileSize || size < 0) {
+ return absl_ports::InternalError("Failed to get disk usage of key mapper");
+ }
+ return size;
+}
+
+template <typename T, typename Formatter>
+libtextclassifier3::StatusOr<int64_t>
+DynamicTrieKeyMapper<T, Formatter>::GetElementsSize() const {
+ int64_t size = trie_.GetElementsSize();
+ if (size == IcingFilesystem::kBadFileSize || size < 0) {
+ return absl_ports::InternalError(
+ "Failed to get disk usage of elements in the key mapper");
+ }
+ return size;
+}
+
+template <typename T, typename Formatter>
+libtextclassifier3::StatusOr<Crc32>
+DynamicTrieKeyMapper<T, Formatter>::ComputeChecksum() {
+ return Crc32(trie_.UpdateCrc());
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_DYNAMIC_TRIE_KEY_MAPPER_H_
diff --git a/icing/store/dynamic-trie-key-mapper_test.cc b/icing/store/dynamic-trie-key-mapper_test.cc
new file mode 100644
index 0000000..fd56170
--- /dev/null
+++ b/icing/store/dynamic-trie-key-mapper_test.cc
@@ -0,0 +1,67 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/store/dynamic-trie-key-mapper.h"
+
+#include <limits>
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+constexpr int kMaxDynamicTrieKeyMapperSize = 3 * 1024 * 1024; // 3 MiB
+
+class DynamicTrieKeyMapperTest : public testing::Test {
+ protected:
+ void SetUp() override { base_dir_ = GetTestTempDir() + "/key_mapper"; }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ std::string base_dir_;
+ Filesystem filesystem_;
+};
+
+TEST_F(DynamicTrieKeyMapperTest, InvalidBaseDir) {
+ EXPECT_THAT(DynamicTrieKeyMapper<DocumentId>::Create(
+ filesystem_, "/dev/null", kMaxDynamicTrieKeyMapperSize),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(DynamicTrieKeyMapperTest, NegativeMaxKeyMapperSizeReturnsInternalError) {
+ EXPECT_THAT(
+ DynamicTrieKeyMapper<DocumentId>::Create(filesystem_, base_dir_, -1),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+TEST_F(DynamicTrieKeyMapperTest, TooLargeMaxKeyMapperSizeReturnsInternalError) {
+ EXPECT_THAT(DynamicTrieKeyMapper<DocumentId>::Create(
+ filesystem_, base_dir_, std::numeric_limits<int>::max()),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/store/key-mapper.h b/icing/store/key-mapper.h
index 4571df2..2767da8 100644
--- a/icing/store/key-mapper.h
+++ b/icing/store/key-mapper.h
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Google LLC
+// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -17,78 +17,83 @@
#include <cstdint>
#include <cstring>
-#include <memory>
#include <string>
#include <string_view>
#include <type_traits>
+#include <unordered_map>
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/absl_ports/str_cat.h"
-#include "icing/file/filesystem.h"
-#include "icing/legacy/index/icing-dynamic-trie.h"
-#include "icing/legacy/index/icing-filesystem.h"
+#include "icing/absl_ports/str_join.h"
#include "icing/util/crc32.h"
-#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
-// File-backed mapping between the string key and a trivially copyable value
-// type.
+// An interface for file-backed mapping between a string key and a trivially
+// copyable value type.
//
-// KeyMapper is thread-compatible
-template <typename T>
+// Implementations of KeyMapper should be thread-compatible.
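+//
+// Known implementations in this codebase include DynamicTrieKeyMapper and
+// PersistentHashMapKeyMapper.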
+template <typename T, typename Formatter = absl_ports::DefaultFormatter>
class KeyMapper {
public:
- // Returns an initialized instance of KeyMapper that can immediately handle
- // read/write operations.
- // Returns any encountered IO errors.
- //
- // base_dir : Base directory used to save all the files required to persist
- // KeyMapper. If this base_dir was previously used to create a
- // KeyMapper, then this existing data would be loaded. Otherwise,
- // an empty KeyMapper would be created.
- // maximum_size_bytes : The maximum allowable size of the key mapper storage.
- static libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<T>>> Create(
- const Filesystem& filesystem, std::string_view base_dir,
- int maximum_size_bytes);
+ class Iterator {
+ public:
+ virtual ~Iterator() = default;
- // Deletes all the files associated with the KeyMapper. Returns success or any
- // encountered IO errors
- //
- // base_dir : Base directory used to save all the files required to persist
- // KeyMapper. Should be the same as passed into Create().
- static libtextclassifier3::Status Delete(const Filesystem& filesystem,
- std::string_view base_dir);
+ // Advance to the next entry.
+ //
+ // Returns:
+ // True on success, otherwise false.
+ virtual bool Advance() = 0;
+
+ // Get the key.
+ //
+    // REQUIRES: The preceding call to Advance() returned true.
+ virtual std::string_view GetKey() const = 0;
- ~KeyMapper() = default;
+ // Get the value.
+ //
+    // REQUIRES: The preceding call to Advance() returned true.
+ virtual T GetValue() const = 0;
+ };
+
+ virtual ~KeyMapper() = default;
// Inserts/Updates value for key.
// Returns any encountered IO errors.
//
// NOTE: Put() doesn't automatically flush changes to disk and relies on
// either explicit calls to PersistToDisk() or a clean shutdown of the class.
- libtextclassifier3::Status Put(std::string_view key, T value);
+ virtual libtextclassifier3::Status Put(std::string_view key, T value) = 0;
// Finds the current value for key and returns it. If key is not present, it
// is inserted with next_value and next_value is returned.
//
// Returns any IO errors that may occur during Put.
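+  //
+  // For example: if "foo" is absent, GetOrPut("foo", 7) stores 7 and returns
+  // it; a later GetOrPut("foo", 9) returns 7 and leaves the stored value
+  // unchanged.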
- libtextclassifier3::StatusOr<T> GetOrPut(std::string_view key, T next_value);
+ virtual libtextclassifier3::StatusOr<T> GetOrPut(std::string_view key,
+ T next_value) = 0;
// Returns the value corresponding to the key.
//
// Returns NOT_FOUND error if the key was missing.
// Returns any encountered IO errors.
- libtextclassifier3::StatusOr<T> Get(std::string_view key) const;
+ virtual libtextclassifier3::StatusOr<T> Get(std::string_view key) const = 0;
+
+ // Deletes data related to the given key. Returns true on success.
+ virtual bool Delete(std::string_view key) = 0;
- // Returns a map of values to keys. Empty map if the mapper is empty.
- std::unordered_map<T, std::string> GetValuesToKeys() const;
+ // Returns an iterator of the key mapper.
+ //
+ // Example usage:
+ // auto itr = key_mapper->GetIterator();
+ // while (itr->Advance()) {
+ // std::cout << itr->GetKey() << " " << itr->GetValue() << std::endl;
+ // }
+ virtual std::unique_ptr<Iterator> GetIterator() const = 0;
// Count of unique keys stored in the KeyMapper.
- int32_t num_keys() const { return trie_.size(); }
+ virtual int32_t num_keys() const = 0;
// Syncs all the changes made to the KeyMapper to disk.
// Returns any encountered IO errors.
@@ -100,7 +105,7 @@ class KeyMapper {
// Returns:
// OK on success
// INTERNAL on I/O error
- libtextclassifier3::Status PersistToDisk();
+ virtual libtextclassifier3::Status PersistToDisk() = 0;
// Calculates and returns the disk usage in bytes. Rounds up to the nearest
// block size.
@@ -108,7 +113,7 @@ class KeyMapper {
// Returns:
// Disk usage on success
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+ virtual libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const = 0;
// Returns the size of the elements held in the key mapper. This excludes the
// size of any internal metadata of the key mapper, e.g. the key mapper's
@@ -117,192 +122,16 @@ class KeyMapper {
// Returns:
// File size on success
// INTERNAL_ERROR on IO error
- libtextclassifier3::StatusOr<int64_t> GetElementsSize() const;
+ virtual libtextclassifier3::StatusOr<int64_t> GetElementsSize() const = 0;
// Computes and returns the checksum of the header and contents.
- Crc32 ComputeChecksum();
+ virtual libtextclassifier3::StatusOr<Crc32> ComputeChecksum() = 0;
private:
- static constexpr char kKeyMapperDir[] = "key_mapper_dir";
- static constexpr char kKeyMapperPrefix[] = "key_mapper";
-
- // Use KeyMapper::Create() to instantiate.
- explicit KeyMapper(std::string_view key_mapper_dir);
-
- // Load any existing KeyMapper data from disk, or creates a new instance
- // of KeyMapper on disk and gets ready to process read/write operations.
- //
- // Returns any encountered IO errors.
- libtextclassifier3::Status Initialize(int maximum_size_bytes);
-
- const std::string file_prefix_;
-
- // TODO(adorokhine) Filesystem is a forked class that's available both in
- // icing and icing namespaces. We will need icing::Filesystem in order
- // to use IcingDynamicTrie. Filesystem class should be fully refactored
- // to have a single definition across both namespaces. Such a class should
- // use icing (and general google3) coding conventions and behave like
- // a proper C++ class.
- const IcingFilesystem icing_filesystem_;
- IcingDynamicTrie trie_;
-
static_assert(std::is_trivially_copyable<T>::value,
"T must be trivially copyable");
};
-template <typename T>
-libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<T>>>
-KeyMapper<T>::Create(const Filesystem& filesystem, std::string_view base_dir,
- int maximum_size_bytes) {
- // We create a subdirectory since the trie creates and stores multiple files.
- // This makes it easier to isolate the trie files away from other files that
- // could potentially be in the same base_dir, and makes it easier to delete.
- const std::string key_mapper_dir =
- absl_ports::StrCat(base_dir, "/", kKeyMapperDir);
- if (!filesystem.CreateDirectoryRecursively(key_mapper_dir.c_str())) {
- return absl_ports::InternalError(absl_ports::StrCat(
- "Failed to create KeyMapper directory: ", key_mapper_dir));
- }
- auto mapper = std::unique_ptr<KeyMapper<T>>(new KeyMapper<T>(key_mapper_dir));
- ICING_RETURN_IF_ERROR(mapper->Initialize(maximum_size_bytes));
- return mapper;
-}
-
-template <typename T>
-libtextclassifier3::Status KeyMapper<T>::Delete(const Filesystem& filesystem,
- std::string_view base_dir) {
- std::string key_mapper_dir = absl_ports::StrCat(base_dir, "/", kKeyMapperDir);
- if (!filesystem.DeleteDirectoryRecursively(key_mapper_dir.c_str())) {
- return absl_ports::InternalError(absl_ports::StrCat(
- "Failed to delete KeyMapper directory: ", key_mapper_dir));
- }
- return libtextclassifier3::Status::OK;
-}
-
-template <typename T>
-KeyMapper<T>::KeyMapper(std::string_view key_mapper_dir)
- : file_prefix_(absl_ports::StrCat(key_mapper_dir, "/", kKeyMapperPrefix)),
- trie_(file_prefix_,
- IcingDynamicTrie::RuntimeOptions().set_storage_policy(
- IcingDynamicTrie::RuntimeOptions::kMapSharedWithCrc),
- &icing_filesystem_) {}
-
-template <typename T>
-libtextclassifier3::Status KeyMapper<T>::Initialize(int maximum_size_bytes) {
- IcingDynamicTrie::Options options;
- // Divide the max space between the three internal arrays: nodes, nexts and
- // suffixes. MaxNodes and MaxNexts are in units of their own data structures.
- // MaxSuffixesSize is in units of bytes.
- options.max_nodes = maximum_size_bytes / (3 * sizeof(IcingDynamicTrie::Node));
- options.max_nexts = options.max_nodes;
- options.max_suffixes_size =
- sizeof(IcingDynamicTrie::Node) * options.max_nodes;
- options.value_size = sizeof(T);
-
- if (!trie_.CreateIfNotExist(options)) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Failed to create KeyMapper file: ", file_prefix_));
- }
- if (!trie_.Init()) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Failed to init KeyMapper file: ", file_prefix_));
- }
- return libtextclassifier3::Status::OK;
-}
-
-template <typename T>
-libtextclassifier3::StatusOr<T> KeyMapper<T>::GetOrPut(std::string_view key,
- T next_value) {
- std::string string_key(key);
- uint32_t value_index;
- if (!trie_.Insert(string_key.c_str(), &next_value, &value_index,
- /*replace=*/false)) {
- return absl_ports::InternalError(absl_ports::StrCat(
- "Unable to insert key ", key, " into KeyMapper ", file_prefix_, "."));
- }
- // This memory address could be unaligned since we're just grabbing the value
- // from somewhere in the trie's suffix array. The suffix array is filled with
- // chars, so the address might not be aligned to T values.
- const T* unaligned_value =
- static_cast<const T*>(trie_.GetValueAtIndex(value_index));
-
- // memcpy the value to ensure that the returned value here is in a T-aligned
- // address
- T aligned_value;
- memcpy(&aligned_value, unaligned_value, sizeof(T));
- return aligned_value;
-}
-
-template <typename T>
-libtextclassifier3::Status KeyMapper<T>::Put(std::string_view key, T value) {
- std::string string_key(key);
- if (!trie_.Insert(string_key.c_str(), &value)) {
- return absl_ports::InternalError(absl_ports::StrCat(
- "Unable to insert key ", key, " into KeyMapper ", file_prefix_, "."));
- }
- return libtextclassifier3::Status::OK;
-}
-
-template <typename T>
-libtextclassifier3::StatusOr<T> KeyMapper<T>::Get(std::string_view key) const {
- std::string string_key(key);
- T value;
- if (!trie_.Find(string_key.c_str(), &value)) {
- return absl_ports::NotFoundError(absl_ports::StrCat(
- "Key not found ", key, " in KeyMapper ", file_prefix_, "."));
- }
- return value;
-}
-
-template <typename T>
-std::unordered_map<T, std::string> KeyMapper<T>::GetValuesToKeys() const {
- std::unordered_map<T, std::string> values_to_keys;
- for (IcingDynamicTrie::Iterator itr(trie_, /*prefix=*/""); itr.IsValid();
- itr.Advance()) {
- if (itr.IsValid()) {
- T value;
- memcpy(&value, itr.GetValue(), sizeof(T));
- values_to_keys.insert({value, itr.GetKey()});
- }
- }
-
- return values_to_keys;
-}
-
-template <typename T>
-libtextclassifier3::Status KeyMapper<T>::PersistToDisk() {
- if (!trie_.Sync()) {
- return absl_ports::InternalError(
- absl_ports::StrCat("Failed to sync KeyMapper file: ", file_prefix_));
- }
-
- return libtextclassifier3::Status::OK;
-}
-
-template <typename T>
-libtextclassifier3::StatusOr<int64_t> KeyMapper<T>::GetDiskUsage() const {
- int64_t size = trie_.GetDiskUsage();
- if (size == IcingFilesystem::kBadFileSize || size < 0) {
- return absl_ports::InternalError("Failed to get disk usage of key mapper");
- }
- return size;
-}
-
-template <typename T>
-libtextclassifier3::StatusOr<int64_t> KeyMapper<T>::GetElementsSize() const {
- int64_t size = trie_.GetElementsSize();
- if (size == IcingFilesystem::kBadFileSize || size < 0) {
- return absl_ports::InternalError(
- "Failed to get disk usage of elements in the key mapper");
- }
- return size;
-}
-
-template <typename T>
-Crc32 KeyMapper<T>::ComputeChecksum() {
- return Crc32(trie_.UpdateCrc());
-}
-
} // namespace lib
} // namespace icing
diff --git a/icing/store/key-mapper_benchmark.cc b/icing/store/key-mapper_benchmark.cc
new file mode 100644
index 0000000..c25fe30
--- /dev/null
+++ b/icing/store/key-mapper_benchmark.cc
@@ -0,0 +1,323 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <random>
+#include <string>
+#include <string_view>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "testing/base/public/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/destructible-directory.h"
+#include "icing/file/filesystem.h"
+#include "icing/store/dynamic-trie-key-mapper.h"
+#include "icing/store/key-mapper.h"
+#include "icing/store/persistent-hash-map-key-mapper.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/random-string.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::IsTrue;
+using ::testing::Not;
+
+class KeyMapperBenchmark {
+ public:
+ static constexpr int kKeyLength = 20;
+
+ explicit KeyMapperBenchmark()
+ : clock(std::make_unique<Clock>()),
+ base_dir(GetTestTempDir() + "/key_mapper_benchmark"),
+ random_engine(/*seed=*/12345) {}
+
+ std::string GenerateUniqueRandomKeyValuePair(int val,
+ std::string_view prefix = "") {
+ std::string rand_str = absl_ports::StrCat(
+ prefix, RandomString(kAlNumAlphabet, kKeyLength, &random_engine));
+ while (random_kvps_map.find(rand_str) != random_kvps_map.end()) {
+      rand_str = absl_ports::StrCat(
+          prefix, RandomString(kAlNumAlphabet, kKeyLength, &random_engine));
+ }
+ std::pair<std::string, int> entry(rand_str, val);
+ random_kvps.push_back(entry);
+ random_kvps_map.insert(entry);
+ return rand_str;
+ }
+
+ template <typename UnknownKeyMapperType>
+ libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<int>>> CreateKeyMapper(
+ int max_num_entries) {
+ return absl_ports::InvalidArgumentError("Unknown type");
+ }
+
+ template <>
+ libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<int>>>
+ CreateKeyMapper<DynamicTrieKeyMapper<int>>(int max_num_entries) {
+ return DynamicTrieKeyMapper<int>::Create(
+ filesystem, base_dir,
+ /*maximum_size_bytes=*/128 * 1024 * 1024);
+ }
+
+ template <>
+ libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<int>>>
+ CreateKeyMapper<PersistentHashMapKeyMapper<int>>(int max_num_entries) {
+ std::string working_path =
+ absl_ports::StrCat(base_dir, "/", "key_mapper_dir");
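+    // The average_kv_byte_size hint below presumes each entry stores a
+    // kKeyLength-char key plus a one-byte terminator and a sizeof(int) value.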
+ return PersistentHashMapKeyMapper<int>::Create(
+ filesystem, std::move(working_path), /*pre_mapping_fbv=*/true,
+ max_num_entries, /*average_kv_byte_size=*/kKeyLength + 1 + sizeof(int),
+ /*max_load_factor_percent=*/100);
+ }
+
+ std::unique_ptr<Clock> clock;
+
+ Filesystem filesystem;
+ std::string base_dir;
+
+ std::default_random_engine random_engine;
+ std::vector<std::pair<std::string, int>> random_kvps;
+ std::unordered_map<std::string, int> random_kvps_map;
+};
+
+// Benchmark the total time of putting num_keys (specified by Arg) unique random
+// key value pairs.
+template <typename KeyMapperType>
+void BM_PutMany(benchmark::State& state) {
+ int num_keys = state.range(0);
+
+ KeyMapperBenchmark benchmark;
+ for (int i = 0; i < num_keys; ++i) {
+ benchmark.GenerateUniqueRandomKeyValuePair(i);
+ }
+
+ for (auto _ : state) {
+ state.PauseTiming();
+ benchmark.filesystem.DeleteDirectoryRecursively(benchmark.base_dir.c_str());
+ DestructibleDirectory ddir(&benchmark.filesystem, benchmark.base_dir);
+ ASSERT_THAT(ddir.is_valid(), IsTrue());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<int>> key_mapper,
+ benchmark.CreateKeyMapper<KeyMapperType>(num_keys));
+ ASSERT_THAT(key_mapper->num_keys(), Eq(0));
+ state.ResumeTiming();
+
+ for (int i = 0; i < num_keys; ++i) {
+ ICING_ASSERT_OK(key_mapper->Put(benchmark.random_kvps[i].first,
+ benchmark.random_kvps[i].second));
+ }
+
+    // Explicitly call PersistToDisk.
+ ICING_ASSERT_OK(key_mapper->PersistToDisk());
+
+ state.PauseTiming();
+ ASSERT_THAT(key_mapper->num_keys(), Eq(num_keys));
+    // The destructor of IcingDynamicTrie doesn't implicitly call
+    // PersistToDisk, while PersistentHashMap's does. Thus, we reset the
+    // unique pointer to invoke the destructor inside the paused-timing block,
+    // so PersistToDisk is included in the benchmark only once in either case.
+ key_mapper.reset();
+ state.ResumeTiming();
+ }
+}
+BENCHMARK(BM_PutMany<DynamicTrieKeyMapper<int>>)
+ ->Arg(1 << 10)
+ ->Arg(1 << 11)
+ ->Arg(1 << 12)
+ ->Arg(1 << 13)
+ ->Arg(1 << 14)
+ ->Arg(1 << 15)
+ ->Arg(1 << 16)
+ ->Arg(1 << 17)
+ ->Arg(1 << 18)
+ ->Arg(1 << 19)
+ ->Arg(1 << 20);
+BENCHMARK(BM_PutMany<PersistentHashMapKeyMapper<int>>)
+ ->Arg(1 << 10)
+ ->Arg(1 << 11)
+ ->Arg(1 << 12)
+ ->Arg(1 << 13)
+ ->Arg(1 << 14)
+ ->Arg(1 << 15)
+ ->Arg(1 << 16)
+ ->Arg(1 << 17)
+ ->Arg(1 << 18)
+ ->Arg(1 << 19)
+ ->Arg(1 << 20);
+
+// Benchmark the average time of putting 1 unique random key value pair. The
+// result is affected by the number of iterations, so use
+// --benchmark_max_iters=k and --benchmark_min_iters=k to pin the iteration
+// count.
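+//
+// For example, to pin the count to 1000 iterations (the binary name is
+// illustrative):
+//   key-mapper_benchmark --benchmark_min_iters=1000 --benchmark_max_iters=1000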
+template <typename KeyMapperType>
+void BM_Put(benchmark::State& state) {
+ KeyMapperBenchmark benchmark;
+ benchmark.filesystem.DeleteDirectoryRecursively(benchmark.base_dir.c_str());
+ DestructibleDirectory ddir(&benchmark.filesystem, benchmark.base_dir);
+ ASSERT_THAT(ddir.is_valid(), IsTrue());
+
+  // The overhead of state.PauseTiming is large enough to skew the benchmark
+  // result, so pre-generate enough kvps to avoid calling state.PauseTiming
+  // around GenerateUniqueRandomKeyValuePair too many times inside the
+  // benchmark for-loop.
+  constexpr int kMaxPregenKvps = 1 << 22;
+  for (int i = 0; i < kMaxPregenKvps; ++i) {
+ benchmark.GenerateUniqueRandomKeyValuePair(i);
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<int>> key_mapper,
+ benchmark.CreateKeyMapper<KeyMapperType>(/*max_num_entries=*/1 << 22));
+ ASSERT_THAT(key_mapper->num_keys(), Eq(0));
+
+ int cnt = 0;
+ for (auto _ : state) {
+    if (cnt >= kMaxPregenKvps) {
+ state.PauseTiming();
+ benchmark.GenerateUniqueRandomKeyValuePair(cnt);
+ state.ResumeTiming();
+ }
+
+ ICING_ASSERT_OK(key_mapper->Put(benchmark.random_kvps[cnt].first,
+ benchmark.random_kvps[cnt].second));
+ ++cnt;
+ }
+}
+BENCHMARK(BM_Put<DynamicTrieKeyMapper<int>>);
+BENCHMARK(BM_Put<PersistentHashMapKeyMapper<int>>);
+
+// Benchmark the average time of getting 1 existing key value pair from the key
+// mapper with size num_keys (specified by Arg).
+template <typename KeyMapperType>
+void BM_Get(benchmark::State& state) {
+ int num_keys = state.range(0);
+
+ KeyMapperBenchmark benchmark;
+ benchmark.filesystem.DeleteDirectoryRecursively(benchmark.base_dir.c_str());
+ DestructibleDirectory ddir(&benchmark.filesystem, benchmark.base_dir);
+ ASSERT_THAT(ddir.is_valid(), IsTrue());
+
+ // Create a key mapper with num_keys entries.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<int>> key_mapper,
+ benchmark.CreateKeyMapper<KeyMapperType>(num_keys));
+ for (int i = 0; i < num_keys; ++i) {
+ ICING_ASSERT_OK(
+ key_mapper->Put(benchmark.GenerateUniqueRandomKeyValuePair(i), i));
+ }
+ ASSERT_THAT(key_mapper->num_keys(), Eq(num_keys));
+
+ std::uniform_int_distribution<> distrib(0, num_keys - 1);
+ std::default_random_engine e(/*seed=*/12345);
+ for (auto _ : state) {
+ int idx = distrib(e);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ int val, key_mapper->Get(benchmark.random_kvps[idx].first));
+ ASSERT_THAT(val, Eq(benchmark.random_kvps[idx].second));
+ }
+}
+BENCHMARK(BM_Get<DynamicTrieKeyMapper<int>>)
+ ->Arg(1 << 10)
+ ->Arg(1 << 11)
+ ->Arg(1 << 12)
+ ->Arg(1 << 13)
+ ->Arg(1 << 14)
+ ->Arg(1 << 15)
+ ->Arg(1 << 16)
+ ->Arg(1 << 17)
+ ->Arg(1 << 18)
+ ->Arg(1 << 19)
+ ->Arg(1 << 20);
+BENCHMARK(BM_Get<PersistentHashMapKeyMapper<int>>)
+ ->Arg(1 << 10)
+ ->Arg(1 << 11)
+ ->Arg(1 << 12)
+ ->Arg(1 << 13)
+ ->Arg(1 << 14)
+ ->Arg(1 << 15)
+ ->Arg(1 << 16)
+ ->Arg(1 << 17)
+ ->Arg(1 << 18)
+ ->Arg(1 << 19)
+ ->Arg(1 << 20);
+
+// Benchmark the total time of iterating through all key value pairs of the key
+// mapper with size num_keys (specified by Arg).
+template <typename KeyMapperType>
+void BM_Iterator(benchmark::State& state) {
+ int num_keys = state.range(0);
+
+ KeyMapperBenchmark benchmark;
+ benchmark.filesystem.DeleteDirectoryRecursively(benchmark.base_dir.c_str());
+ DestructibleDirectory ddir(&benchmark.filesystem, benchmark.base_dir);
+ ASSERT_THAT(ddir.is_valid(), IsTrue());
+
+ // Create a key mapper with num_keys entries.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<KeyMapper<int>> key_mapper,
+ benchmark.CreateKeyMapper<KeyMapperType>(num_keys));
+ for (int i = 0; i < num_keys; ++i) {
+ ICING_ASSERT_OK(
+ key_mapper->Put(benchmark.GenerateUniqueRandomKeyValuePair(i), i));
+ }
+ ASSERT_THAT(key_mapper->num_keys(), Eq(num_keys));
+
+ for (auto _ : state) {
+ auto iter = key_mapper->GetIterator();
+ int cnt = 0;
+ while (iter->Advance()) {
+ ++cnt;
+ std::string key(iter->GetKey());
+ int value = iter->GetValue();
+ auto it = benchmark.random_kvps_map.find(key);
+ ASSERT_THAT(it, Not(Eq(benchmark.random_kvps_map.end())));
+ ASSERT_THAT(it->second, Eq(value));
+ }
+ ASSERT_THAT(cnt, Eq(num_keys));
+ }
+}
+BENCHMARK(BM_Iterator<DynamicTrieKeyMapper<int>>)
+ ->Arg(1 << 10)
+ ->Arg(1 << 11)
+ ->Arg(1 << 12)
+ ->Arg(1 << 13)
+ ->Arg(1 << 14)
+ ->Arg(1 << 15)
+ ->Arg(1 << 16)
+ ->Arg(1 << 17)
+ ->Arg(1 << 18)
+ ->Arg(1 << 19)
+ ->Arg(1 << 20);
+BENCHMARK(BM_Iterator<PersistentHashMapKeyMapper<int>>)
+ ->Arg(1 << 10)
+ ->Arg(1 << 11)
+ ->Arg(1 << 12)
+ ->Arg(1 << 13)
+ ->Arg(1 << 14)
+ ->Arg(1 << 15)
+ ->Arg(1 << 16)
+ ->Arg(1 << 17)
+ ->Arg(1 << 18)
+ ->Arg(1 << 19)
+ ->Arg(1 << 20);
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/store/key-mapper_test.cc b/icing/store/key-mapper_test.cc
index 4e3dd8a..fa7d1e8 100644
--- a/icing/store/key-mapper_test.cc
+++ b/icing/store/key-mapper_test.cc
@@ -1,4 +1,4 @@
-// Copyright (C) 2019 Google LLC
+// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -14,65 +14,113 @@
#include "icing/store/key-mapper.h"
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
#include "icing/store/document-id.h"
+#include "icing/store/dynamic-trie-key-mapper.h"
+#include "icing/store/persistent-hash-map-key-mapper.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/tmp-directory.h"
-using ::testing::_;
-using ::testing::HasSubstr;
using ::testing::IsEmpty;
+using ::testing::IsTrue;
using ::testing::Pair;
using ::testing::UnorderedElementsAre;
namespace icing {
namespace lib {
+
namespace {
-constexpr int kMaxKeyMapperSize = 3 * 1024 * 1024; // 3 MiB
-class KeyMapperTest : public testing::Test {
+constexpr int kMaxDynamicTrieKeyMapperSize = 3 * 1024 * 1024; // 3 MiB
+
+enum class KeyMapperType {
+ kDynamicTrie,
+ kPersistentHashMap,
+};
+
+struct KeyMapperTestParam {
+ KeyMapperType key_mapper_type;
+ bool pre_mapping_fbv;
+
+ explicit KeyMapperTestParam(KeyMapperType key_mapper_type_in,
+ bool pre_mapping_fbv_in)
+ : key_mapper_type(key_mapper_type_in),
+ pre_mapping_fbv(pre_mapping_fbv_in) {}
+};
+
+class KeyMapperTest : public ::testing::TestWithParam<KeyMapperTestParam> {
protected:
- void SetUp() override { base_dir_ = GetTestTempDir() + "/key_mapper"; }
+ void SetUp() override {
+ base_dir_ = GetTestTempDir() + "/icing";
+ ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
+ IsTrue());
+
+ working_dir_ = base_dir_ + "/key_mapper";
+ }
void TearDown() override {
filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
}
+ libtextclassifier3::StatusOr<std::unique_ptr<KeyMapper<DocumentId>>>
+ CreateKeyMapper() {
+ const KeyMapperTestParam& param = GetParam();
+ switch (param.key_mapper_type) {
+ case KeyMapperType::kDynamicTrie:
+ return DynamicTrieKeyMapper<DocumentId>::Create(
+ filesystem_, working_dir_, kMaxDynamicTrieKeyMapperSize);
+ case KeyMapperType::kPersistentHashMap:
+ return PersistentHashMapKeyMapper<DocumentId>::Create(
+ filesystem_, working_dir_, param.pre_mapping_fbv);
+ }
+ }
+
+ libtextclassifier3::Status DeleteKeyMapper() {
+ const KeyMapperTestParam& param = GetParam();
+ switch (param.key_mapper_type) {
+ case KeyMapperType::kDynamicTrie:
+ return DynamicTrieKeyMapper<DocumentId>::Delete(filesystem_,
+ working_dir_);
+ case KeyMapperType::kPersistentHashMap:
+ return PersistentHashMapKeyMapper<DocumentId>::Delete(filesystem_,
+ working_dir_);
+ }
+ }
+
std::string base_dir_;
+ std::string working_dir_;
Filesystem filesystem_;
};
-TEST_F(KeyMapperTest, InvalidBaseDir) {
- ASSERT_THAT(
- KeyMapper<DocumentId>::Create(filesystem_, "/dev/null", kMaxKeyMapperSize)
- .status()
- .error_message(),
- HasSubstr("Failed to create KeyMapper"));
-}
-
-TEST_F(KeyMapperTest, NegativeMaxKeyMapperSizeReturnsInternalError) {
- ASSERT_THAT(KeyMapper<DocumentId>::Create(filesystem_, base_dir_, -1),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
-}
+std::unordered_map<std::string, DocumentId> GetAllKeyValuePairs(
+ const KeyMapper<DocumentId>* key_mapper) {
+ std::unordered_map<std::string, DocumentId> ret;
-TEST_F(KeyMapperTest, TooLargeMaxKeyMapperSizeReturnsInternalError) {
- ASSERT_THAT(KeyMapper<DocumentId>::Create(filesystem_, base_dir_,
- std::numeric_limits<int>::max()),
- StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+ std::unique_ptr<typename KeyMapper<DocumentId>::Iterator> itr =
+ key_mapper->GetIterator();
+ while (itr->Advance()) {
+ ret.emplace(itr->GetKey(), itr->GetValue());
+ }
+ return ret;
}
-TEST_F(KeyMapperTest, CreateNewKeyMapper) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
- KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+TEST_P(KeyMapperTest, CreateNewKeyMapper) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ CreateKeyMapper());
EXPECT_THAT(key_mapper->num_keys(), 0);
}
-TEST_F(KeyMapperTest, CanUpdateSameKeyMultipleTimes) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
- KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+TEST_P(KeyMapperTest, CanUpdateSameKeyMultipleTimes) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ CreateKeyMapper());
ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100));
ICING_EXPECT_OK(key_mapper->Put("default-youtube.com", 50));
@@ -88,10 +136,9 @@ TEST_F(KeyMapperTest, CanUpdateSameKeyMultipleTimes) {
EXPECT_THAT(key_mapper->num_keys(), 2);
}
-TEST_F(KeyMapperTest, GetOrPutOk) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
- KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+TEST_P(KeyMapperTest, GetOrPutOk) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ CreateKeyMapper());
EXPECT_THAT(key_mapper->Get("foo"),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
@@ -99,15 +146,15 @@ TEST_F(KeyMapperTest, GetOrPutOk) {
EXPECT_THAT(key_mapper->Get("foo"), IsOkAndHolds(1));
}
-TEST_F(KeyMapperTest, CanPersistToDiskRegularly) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
- KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
- // Can persist an empty KeyMapper.
+TEST_P(KeyMapperTest, CanPersistToDiskRegularly) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ CreateKeyMapper());
+
+ // Can persist an empty key mapper.
ICING_EXPECT_OK(key_mapper->PersistToDisk());
EXPECT_THAT(key_mapper->num_keys(), 0);
- // Can persist the smallest KeyMapper.
+ // Can persist the smallest key mapper.
ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100));
ICING_EXPECT_OK(key_mapper->PersistToDisk());
EXPECT_THAT(key_mapper->num_keys(), 1);
@@ -124,17 +171,15 @@ TEST_F(KeyMapperTest, CanPersistToDiskRegularly) {
EXPECT_THAT(key_mapper->num_keys(), 2);
}
-TEST_F(KeyMapperTest, CanUseAcrossMultipleInstances) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
- KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+TEST_P(KeyMapperTest, CanUseAcrossMultipleInstances) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ CreateKeyMapper());
ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100));
ICING_EXPECT_OK(key_mapper->PersistToDisk());
key_mapper.reset();
- ICING_ASSERT_OK_AND_ASSIGN(
- key_mapper,
- KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+
+ ICING_ASSERT_OK_AND_ASSIGN(key_mapper, CreateKeyMapper());
EXPECT_THAT(key_mapper->num_keys(), 1);
EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(100));
@@ -146,43 +191,49 @@ TEST_F(KeyMapperTest, CanUseAcrossMultipleInstances) {
EXPECT_THAT(key_mapper->Get("default-google.com"), IsOkAndHolds(300));
}
-TEST_F(KeyMapperTest, CanDeleteAndRestartKeyMapping) {
+TEST_P(KeyMapperTest, CanDeleteAndRestartKeyMapping) {
// Can delete even if there's nothing there
- ICING_EXPECT_OK(KeyMapper<DocumentId>::Delete(filesystem_, base_dir_));
+ ICING_EXPECT_OK(DeleteKeyMapper());
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
- KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ CreateKeyMapper());
ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100));
ICING_EXPECT_OK(key_mapper->PersistToDisk());
- ICING_EXPECT_OK(KeyMapper<DocumentId>::Delete(filesystem_, base_dir_));
+ ICING_EXPECT_OK(DeleteKeyMapper());
key_mapper.reset();
- ICING_ASSERT_OK_AND_ASSIGN(
- key_mapper,
- KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
+ ICING_ASSERT_OK_AND_ASSIGN(key_mapper, CreateKeyMapper());
EXPECT_THAT(key_mapper->num_keys(), 0);
ICING_EXPECT_OK(key_mapper->Put("default-google.com", 100));
EXPECT_THAT(key_mapper->num_keys(), 1);
}
-TEST_F(KeyMapperTest, GetValuesToKeys) {
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
- KeyMapper<DocumentId>::Create(filesystem_, base_dir_, kMaxKeyMapperSize));
- EXPECT_THAT(key_mapper->GetValuesToKeys(), IsEmpty());
+TEST_P(KeyMapperTest, Iterator) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<KeyMapper<DocumentId>> key_mapper,
+ CreateKeyMapper());
+ EXPECT_THAT(GetAllKeyValuePairs(key_mapper.get()), IsEmpty());
ICING_EXPECT_OK(key_mapper->Put("foo", /*value=*/1));
ICING_EXPECT_OK(key_mapper->Put("bar", /*value=*/2));
- EXPECT_THAT(key_mapper->GetValuesToKeys(),
- UnorderedElementsAre(Pair(1, "foo"), Pair(2, "bar")));
+ EXPECT_THAT(GetAllKeyValuePairs(key_mapper.get()),
+ UnorderedElementsAre(Pair("foo", 1), Pair("bar", 2)));
ICING_EXPECT_OK(key_mapper->Put("baz", /*value=*/3));
EXPECT_THAT(
- key_mapper->GetValuesToKeys(),
- UnorderedElementsAre(Pair(1, "foo"), Pair(2, "bar"), Pair(3, "baz")));
+ GetAllKeyValuePairs(key_mapper.get()),
+ UnorderedElementsAre(Pair("foo", 1), Pair("bar", 2), Pair("baz", 3)));
}
+INSTANTIATE_TEST_SUITE_P(
+ KeyMapperTest, KeyMapperTest,
+ testing::Values(KeyMapperTestParam(KeyMapperType::kDynamicTrie,
+ /*pre_mapping_fbv_in=*/true),
+ KeyMapperTestParam(KeyMapperType::kPersistentHashMap,
+ /*pre_mapping_fbv_in=*/true),
+ KeyMapperTestParam(KeyMapperType::kPersistentHashMap,
+ /*pre_mapping_fbv_in=*/false)));
+
} // namespace
+
} // namespace lib
} // namespace icing
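
The hunk above converts KeyMapperTest from TEST_F to TEST_P so that one suite exercises both the DynamicTrie and PersistentHashMap implementations. Below is a minimal, self-contained sketch of that gtest value-parameterized pattern; the names BackendType and MiniTest are illustrative, not from icing, and the binary is assumed to link against gtest_main.

#include "gtest/gtest.h"

enum class BackendType { kA, kB };

// Each TEST_P body runs once per value passed to INSTANTIATE_TEST_SUITE_P.
class MiniTest : public ::testing::TestWithParam<BackendType> {};

TEST_P(MiniTest, ParamIsVisible) {
  // GetParam() returns the value this instantiation was created with.
  BackendType type = GetParam();
  EXPECT_TRUE(type == BackendType::kA || type == BackendType::kB);
}

INSTANTIATE_TEST_SUITE_P(AllBackends, MiniTest,
                         ::testing::Values(BackendType::kA, BackendType::kB));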
diff --git a/icing/store/namespace-fingerprint-identifier.cc b/icing/store/namespace-fingerprint-identifier.cc
new file mode 100644
index 0000000..3910105
--- /dev/null
+++ b/icing/store/namespace-fingerprint-identifier.cc
@@ -0,0 +1,73 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/store/namespace-fingerprint-identifier.h"
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/text_classifier/lib3/utils/hash/farmhash.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/store/namespace-id.h"
+#include "icing/util/encode-util.h"
+
+namespace icing {
+namespace lib {
+
+/* static */ libtextclassifier3::StatusOr<NamespaceFingerprintIdentifier>
+NamespaceFingerprintIdentifier::DecodeFromCString(
+ std::string_view encoded_cstr) {
+ if (encoded_cstr.size() < kMinEncodedLength) {
+ return absl_ports::InvalidArgumentError("Invalid length");
+ }
+
+ NamespaceId namespace_id = encode_util::DecodeIntFromCString(
+ encoded_cstr.substr(0, kEncodedNamespaceIdLength));
+ uint64_t fingerprint = encode_util::DecodeIntFromCString(
+ encoded_cstr.substr(kEncodedNamespaceIdLength));
+ return NamespaceFingerprintIdentifier(namespace_id, fingerprint);
+}
+
+NamespaceFingerprintIdentifier::NamespaceFingerprintIdentifier(
+ NamespaceId namespace_id, std::string_view target_str)
+ : namespace_id_(namespace_id),
+ fingerprint_(tc3farmhash::Fingerprint64(target_str)) {}
+
+std::string NamespaceFingerprintIdentifier::EncodeToCString() const {
+ // encoded_namespace_id_str should be 1 to 3 bytes based on the value of
+ // namespace_id.
+ std::string encoded_namespace_id_str =
+ encode_util::EncodeIntToCString(namespace_id_);
+ // Pad encoded_namespace_id_str to exactly kEncodedNamespaceIdLength bytes.
+ while (encoded_namespace_id_str.size() < kEncodedNamespaceIdLength) {
+ // A C string cannot contain 0x00 bytes, so we pad with 0x01 instead, just
+ // as encode_util::EncodeIntToCString does.
+ //
+ // This works because DecodeIntFromCString decodes a byte value of 0x01 as
+ // 0x00. When EncodeIntToCString returns an encoded namespace id shorter
+ // than kEncodedNamespaceIdLength bytes, the id has unencoded leading 0x00
+ // bytes, so we explicitly encode those bytes as 0x01 here.
+ encoded_namespace_id_str.push_back(1);
+ }
+
+ return absl_ports::StrCat(encoded_namespace_id_str,
+ encode_util::EncodeIntToCString(fingerprint_));
+}
+
+} // namespace lib
+} // namespace icing
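
EncodeToCString above pads the encoded namespace id with 0x01 bytes so that the result never contains a NUL. The sketch below shows an integer encoding consistent with the test vectors in namespace-fingerprint-identifier_test.cc: each byte carries 7 bits of the value plus one, least-significant group first, so 0 encodes as "\x01". The real encode_util implementation is assumed, not quoted, here.

#include <cstdint>
#include <iostream>
#include <string>
#include <string_view>

std::string EncodeIntToCStringSketch(uint64_t value) {
  std::string out;
  do {
    // Emit 7 bits at a time, offset by 1 so no byte is ever 0x00.
    out.push_back(static_cast<char>((value & 0x7F) + 1));
    value >>= 7;
  } while (value != 0);
  return out;
}

uint64_t DecodeIntFromCStringSketch(std::string_view encoded) {
  uint64_t value = 0;
  // Walk from the most significant group (last byte) back to the first.
  for (int i = static_cast<int>(encoded.size()) - 1; i >= 0; --i) {
    value = (value << 7) | (static_cast<uint8_t>(encoded[i]) - 1);
  }
  return value;
}

int main() {
  for (uint64_t v : {uint64_t{0}, uint64_t{1}, uint64_t{32767}}) {
    std::string encoded = EncodeIntToCStringSketch(v);
    std::cout << v << " -> " << encoded.size() << " byte(s), round-trips: "
              << (DecodeIntFromCStringSketch(encoded) == v) << "\n";
  }
  return 0;
}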
diff --git a/icing/store/namespace-fingerprint-identifier.h b/icing/store/namespace-fingerprint-identifier.h
new file mode 100644
index 0000000..d91ef94
--- /dev/null
+++ b/icing/store/namespace-fingerprint-identifier.h
@@ -0,0 +1,72 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_NAMESPACE_FINGERPRINT_IDENTIFIER_H_
+#define ICING_STORE_NAMESPACE_FINGERPRINT_IDENTIFIER_H_
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/store/namespace-id.h"
+
+namespace icing {
+namespace lib {
+
+class NamespaceFingerprintIdentifier {
+ public:
+ static constexpr int kEncodedNamespaceIdLength = 3;
+ static constexpr int kMinEncodedLength = kEncodedNamespaceIdLength + 1;
+
+ static libtextclassifier3::StatusOr<NamespaceFingerprintIdentifier>
+ DecodeFromCString(std::string_view encoded_cstr);
+
+ explicit NamespaceFingerprintIdentifier()
+ : namespace_id_(0), fingerprint_(0) {}
+
+ explicit NamespaceFingerprintIdentifier(NamespaceId namespace_id,
+ uint64_t fingerprint)
+ : namespace_id_(namespace_id), fingerprint_(fingerprint) {}
+
+ explicit NamespaceFingerprintIdentifier(NamespaceId namespace_id,
+ std::string_view target_str);
+
+ std::string EncodeToCString() const;
+
+ bool operator<(const NamespaceFingerprintIdentifier& other) const {
+ if (namespace_id_ != other.namespace_id_) {
+ return namespace_id_ < other.namespace_id_;
+ }
+ return fingerprint_ < other.fingerprint_;
+ }
+
+ bool operator==(const NamespaceFingerprintIdentifier& other) const {
+ return namespace_id_ == other.namespace_id_ &&
+ fingerprint_ == other.fingerprint_;
+ }
+
+ NamespaceId namespace_id() const { return namespace_id_; }
+ uint64_t fingerprint() const { return fingerprint_; }
+
+ private:
+ NamespaceId namespace_id_;
+ uint64_t fingerprint_;
+} __attribute__((packed));
+static_assert(sizeof(NamespaceFingerprintIdentifier) == 10, "");
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_NAMESPACE_FINGERPRINT_IDENTIFIER_H_
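
The static_assert above expects 10 bytes because __attribute__((packed)), a GCC/Clang extension, removes the padding the compiler would otherwise insert between the int16_t and the 8-byte-aligned uint64_t. A standalone illustration:

#include <cstdint>
#include <iostream>

struct Unpacked {
  int16_t id;           // followed by 6 bytes of padding
  uint64_t fingerprint;
};

struct Packed {
  int16_t id;           // no padding: members are laid out back to back
  uint64_t fingerprint;
} __attribute__((packed));

int main() {
  std::cout << sizeof(Unpacked) << "\n";  // typically 16
  std::cout << sizeof(Packed) << "\n";    // 10 == 2 + 8
  return 0;
}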
diff --git a/icing/store/namespace-fingerprint-identifier_test.cc b/icing/store/namespace-fingerprint-identifier_test.cc
new file mode 100644
index 0000000..5f86156
--- /dev/null
+++ b/icing/store/namespace-fingerprint-identifier_test.cc
@@ -0,0 +1,148 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/store/namespace-fingerprint-identifier.h"
+
+#include <cstdint>
+#include <limits>
+#include <string>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/store/namespace-id.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::Eq;
+
+TEST(NamespaceFingerprintIdentifierTest, EncodeToCString) {
+ NamespaceFingerprintIdentifier identifier1(/*namespace_id=*/0,
+ /*fingerprint=*/0);
+ EXPECT_THAT(identifier1.EncodeToCString(), Eq("\x01\x01\x01\x01"));
+
+ NamespaceFingerprintIdentifier identifier2(/*namespace_id=*/0,
+ /*fingerprint=*/1);
+ EXPECT_THAT(identifier2.EncodeToCString(), Eq("\x01\x01\x01\x02"));
+
+ NamespaceFingerprintIdentifier identifier3(
+ /*namespace_id=*/0, /*fingerprint=*/std::numeric_limits<uint64_t>::max());
+ EXPECT_THAT(identifier3.EncodeToCString(),
+ Eq("\x01\x01\x01\x80\x80\x80\x80\x80\x80\x80\x80\x80\x02"));
+
+ NamespaceFingerprintIdentifier identifier4(/*namespace_id=*/1,
+ /*fingerprint=*/0);
+ EXPECT_THAT(identifier4.EncodeToCString(), Eq("\x02\x01\x01\x01"));
+
+ NamespaceFingerprintIdentifier identifier5(/*namespace_id=*/1,
+ /*fingerprint=*/1);
+ EXPECT_THAT(identifier5.EncodeToCString(), Eq("\x02\x01\x01\x02"));
+
+ NamespaceFingerprintIdentifier identifier6(
+ /*namespace_id=*/1, /*fingerprint=*/std::numeric_limits<uint64_t>::max());
+ EXPECT_THAT(identifier6.EncodeToCString(),
+ Eq("\x02\x01\x01\x80\x80\x80\x80\x80\x80\x80\x80\x80\x02"));
+
+ NamespaceFingerprintIdentifier identifier7(
+ /*namespace_id=*/std::numeric_limits<NamespaceId>::max(),
+ /*fingerprint=*/0);
+ EXPECT_THAT(identifier7.EncodeToCString(), Eq("\x80\x80\x02\x01"));
+
+ NamespaceFingerprintIdentifier identifier8(
+ /*namespace_id=*/std::numeric_limits<NamespaceId>::max(),
+ /*fingerprint=*/1);
+ EXPECT_THAT(identifier8.EncodeToCString(), Eq("\x80\x80\x02\x02"));
+
+ NamespaceFingerprintIdentifier identifier9(
+ /*namespace_id=*/std::numeric_limits<NamespaceId>::max(),
+ /*fingerprint=*/std::numeric_limits<uint64_t>::max());
+ EXPECT_THAT(identifier9.EncodeToCString(),
+ Eq("\x80\x80\x02\x80\x80\x80\x80\x80\x80\x80\x80\x80\x02"));
+}
+
+TEST(NamespaceFingerprintIdentifierTest,
+ MultipleCStringConversionsAreReversible) {
+ NamespaceFingerprintIdentifier identifier1(/*namespace_id=*/0,
+ /*fingerprint=*/0);
+ EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString(
+ identifier1.EncodeToCString()),
+ IsOkAndHolds(identifier1));
+
+ NamespaceFingerprintIdentifier identifier2(/*namespace_id=*/0,
+ /*fingerprint=*/1);
+ EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString(
+ identifier2.EncodeToCString()),
+ IsOkAndHolds(identifier2));
+
+ NamespaceFingerprintIdentifier identifier3(
+ /*namespace_id=*/0, /*fingerprint=*/std::numeric_limits<uint64_t>::max());
+ EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString(
+ identifier3.EncodeToCString()),
+ IsOkAndHolds(identifier3));
+
+ NamespaceFingerprintIdentifier identifier4(/*namespace_id=*/1,
+ /*fingerprint=*/0);
+ EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString(
+ identifier4.EncodeToCString()),
+ IsOkAndHolds(identifier4));
+
+ NamespaceFingerprintIdentifier identifier5(/*namespace_id=*/1,
+ /*fingerprint=*/1);
+ EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString(
+ identifier5.EncodeToCString()),
+ IsOkAndHolds(identifier5));
+
+ NamespaceFingerprintIdentifier identifier6(
+ /*namespace_id=*/1, /*fingerprint=*/std::numeric_limits<uint64_t>::max());
+ EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString(
+ identifier6.EncodeToCString()),
+ IsOkAndHolds(identifier6));
+
+ NamespaceFingerprintIdentifier identifier7(
+ /*namespace_id=*/std::numeric_limits<NamespaceId>::max(),
+ /*fingerprint=*/0);
+ EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString(
+ identifier7.EncodeToCString()),
+ IsOkAndHolds(identifier7));
+
+ NamespaceFingerprintIdentifier identifier8(
+ /*namespace_id=*/std::numeric_limits<NamespaceId>::max(),
+ /*fingerprint=*/1);
+ EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString(
+ identifier8.EncodeToCString()),
+ IsOkAndHolds(identifier8));
+
+ NamespaceFingerprintIdentifier identifier9(
+ /*namespace_id=*/std::numeric_limits<NamespaceId>::max(),
+ /*fingerprint=*/std::numeric_limits<uint64_t>::max());
+ EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString(
+ identifier9.EncodeToCString()),
+ IsOkAndHolds(identifier9));
+}
+
+TEST(NamespaceFingerprintIdentifierTest,
+ DecodeFromCStringInvalidLengthShouldReturnError) {
+ std::string invalid_str = "\x01\x01\x01";
+ EXPECT_THAT(NamespaceFingerprintIdentifier::DecodeFromCString(invalid_str),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/store/namespace-id.h b/icing/store/namespace-id.h
index 4225be3..374e7a8 100644
--- a/icing/store/namespace-id.h
+++ b/icing/store/namespace-id.h
@@ -22,6 +22,7 @@ namespace lib {
// Id of unique namespace in DocumentProto. Generated in DocumentStore.
using NamespaceId = int16_t;
+inline constexpr NamespaceId kInvalidNamespaceId = -1;
} // namespace lib
} // namespace icing
diff --git a/icing/store/persistent-hash-map-key-mapper.h b/icing/store/persistent-hash-map-key-mapper.h
new file mode 100644
index 0000000..0596fe3
--- /dev/null
+++ b/icing/store/persistent-hash-map-key-mapper.h
@@ -0,0 +1,206 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_PERSISTENT_HASH_MAP_KEY_MAPPER_H_
+#define ICING_STORE_PERSISTENT_HASH_MAP_KEY_MAPPER_H_
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+#include <type_traits>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/str_join.h"
+#include "icing/file/filesystem.h"
+#include "icing/file/persistent-hash-map.h"
+#include "icing/store/key-mapper.h"
+#include "icing/util/crc32.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+// File-backed mapping from string keys to a trivially copyable value
+// type.
+template <typename T, typename Formatter = absl_ports::DefaultFormatter>
+class PersistentHashMapKeyMapper : public KeyMapper<T, Formatter> {
+ public:
+ static constexpr int32_t kDefaultMaxNumEntries =
+ PersistentHashMap::Entry::kMaxNumEntries;
+ static constexpr int32_t kDefaultAverageKVByteSize =
+ PersistentHashMap::Options::kDefaultAverageKVByteSize;
+ static constexpr int32_t kDefaultMaxLoadFactorPercent =
+ PersistentHashMap::Options::kDefaultMaxLoadFactorPercent;
+
+ // Returns an initialized instance of PersistentHashMapKeyMapper that can
+ // immediately handle read/write operations.
+ // Returns any encountered IO errors.
+ //
+ // filesystem: Object to make system level calls
+ // working_path: Working directory used to save all the files required to
+ // persist PersistentHashMapKeyMapper. If this working_path was
+ // previously used to create a PersistentHashMapKeyMapper, then
+ // this existing data would be loaded. Otherwise, an empty
+ // PersistentHashMapKeyMapper would be created. See
+ // PersistentStorage for more details about the concept of
+ // working_path.
+ // pre_mapping_fbv: flag indicating whether to memory-map the maximum
+ // possible file size for the underlying FileBackedVector before growing
+ // the actual file size.
+ // max_num_entries: max # of key-value pairs. Used to compute the size of
+ // the 3 underlying storages.
+ // average_kv_byte_size: average byte size of a single key + serialized
+ // value. Used to compute the kv_storage size.
+ // max_load_factor_percent: percentage of the max loading for the hash map.
+ // load_factor_percent = 100 * num_keys / num_buckets
+ // If load_factor_percent exceeds
+ // max_load_factor_percent, then rehash will be
+ // invoked (and # of buckets will be doubled).
+ // Note that load_factor_percent exceeding 100 is
+ // considered valid.
+ static libtextclassifier3::StatusOr<
+ std::unique_ptr<PersistentHashMapKeyMapper<T, Formatter>>>
+ Create(const Filesystem& filesystem, std::string working_path,
+ bool pre_mapping_fbv, int32_t max_num_entries = kDefaultMaxNumEntries,
+ int32_t average_kv_byte_size = kDefaultAverageKVByteSize,
+ int32_t max_load_factor_percent = kDefaultMaxLoadFactorPercent);
+
+ // Deletes working_path (and all the files under it recursively) associated
+ // with the PersistentHashMapKeyMapper.
+ //
+ // working_path: Working directory used to save all the files required to
+ // persist PersistentHashMapKeyMapper. Should be the same as
+ // passed into Create().
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on I/O error
+ static libtextclassifier3::Status Delete(const Filesystem& filesystem,
+ const std::string& working_path);
+
+ ~PersistentHashMapKeyMapper() override = default;
+
+ libtextclassifier3::Status Put(std::string_view key, T value) override {
+ return persistent_hash_map_->Put(key, &value);
+ }
+
+ libtextclassifier3::StatusOr<T> GetOrPut(std::string_view key,
+ T next_value) override {
+ ICING_RETURN_IF_ERROR(persistent_hash_map_->GetOrPut(key, &next_value));
+ return next_value;
+ }
+
+ libtextclassifier3::StatusOr<T> Get(std::string_view key) const override {
+ T value;
+ ICING_RETURN_IF_ERROR(persistent_hash_map_->Get(key, &value));
+ return value;
+ }
+
+ bool Delete(std::string_view key) override {
+ return persistent_hash_map_->Delete(key).ok();
+ }
+
+ std::unique_ptr<typename KeyMapper<T, Formatter>::Iterator> GetIterator()
+ const override {
+ return std::make_unique<PersistentHashMapKeyMapper<T, Formatter>::Iterator>(
+ persistent_hash_map_.get());
+ }
+
+ int32_t num_keys() const override { return persistent_hash_map_->size(); }
+
+ libtextclassifier3::Status PersistToDisk() override {
+ return persistent_hash_map_->PersistToDisk();
+ }
+
+ libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const override {
+ return persistent_hash_map_->GetDiskUsage();
+ }
+
+ libtextclassifier3::StatusOr<int64_t> GetElementsSize() const override {
+ return persistent_hash_map_->GetElementsSize();
+ }
+
+ libtextclassifier3::StatusOr<Crc32> ComputeChecksum() override {
+ return persistent_hash_map_->UpdateChecksums();
+ }
+
+ private:
+ class Iterator : public KeyMapper<T, Formatter>::Iterator {
+ public:
+ explicit Iterator(const PersistentHashMap* persistent_hash_map)
+ : itr_(persistent_hash_map->GetIterator()) {}
+
+ ~Iterator() override = default;
+
+ bool Advance() override { return itr_.Advance(); }
+
+ std::string_view GetKey() const override { return itr_.GetKey(); }
+
+ T GetValue() const override {
+ T value;
+ memcpy(&value, itr_.GetValue(), sizeof(T));
+ return value;
+ }
+
+ private:
+ PersistentHashMap::Iterator itr_;
+ };
+
+ // Use PersistentHashMapKeyMapper::Create() to instantiate.
+ explicit PersistentHashMapKeyMapper(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map)
+ : persistent_hash_map_(std::move(persistent_hash_map)) {}
+
+ std::unique_ptr<PersistentHashMap> persistent_hash_map_;
+
+ static_assert(std::is_trivially_copyable<T>::value,
+ "T must be trivially copyable");
+};
+
+template <typename T, typename Formatter>
+/* static */ libtextclassifier3::StatusOr<
+ std::unique_ptr<PersistentHashMapKeyMapper<T, Formatter>>>
+PersistentHashMapKeyMapper<T, Formatter>::Create(
+ const Filesystem& filesystem, std::string working_path,
+ bool pre_mapping_fbv, int32_t max_num_entries, int32_t average_kv_byte_size,
+ int32_t max_load_factor_percent) {
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<PersistentHashMap> persistent_hash_map,
+ PersistentHashMap::Create(
+ filesystem, std::move(working_path),
+ PersistentHashMap::Options(
+ /*value_type_size_in=*/sizeof(T),
+ /*max_num_entries_in=*/max_num_entries,
+ /*max_load_factor_percent_in=*/max_load_factor_percent,
+ /*average_kv_byte_size_in=*/average_kv_byte_size,
+ /*init_num_buckets_in=*/
+ PersistentHashMap::Options::kDefaultInitNumBuckets,
+ /*pre_mapping_fbv_in=*/pre_mapping_fbv)));
+ return std::unique_ptr<PersistentHashMapKeyMapper<T, Formatter>>(
+ new PersistentHashMapKeyMapper<T, Formatter>(
+ std::move(persistent_hash_map)));
+}
+
+template <typename T, typename Formatter>
+/* static */ libtextclassifier3::Status
+PersistentHashMapKeyMapper<T, Formatter>::Delete(
+ const Filesystem& filesystem, const std::string& working_path) {
+ return PersistentHashMap::Discard(filesystem, working_path);
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_PERSISTENT_HASH_MAP_KEY_MAPPER_H_
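
A hedged usage sketch of the key mapper declared above: the working path and values are illustrative, and the calls follow the StatusOr-based signatures in this header rather than any documented example.

#include <memory>

#include "icing/file/filesystem.h"
#include "icing/store/document-id.h"
#include "icing/store/persistent-hash-map-key-mapper.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

libtextclassifier3::Status KeyMapperUsageSketch(const Filesystem& filesystem) {
  // Create a new mapper, or reload one previously persisted under this path.
  ICING_ASSIGN_OR_RETURN(
      std::unique_ptr<PersistentHashMapKeyMapper<DocumentId>> mapper,
      PersistentHashMapKeyMapper<DocumentId>::Create(
          filesystem, "/tmp/icing/key_mapper", /*pre_mapping_fbv=*/false));
  ICING_RETURN_IF_ERROR(mapper->Put("namespace#uri", /*value=*/42));
  ICING_ASSIGN_OR_RETURN(DocumentId id, mapper->Get("namespace#uri"));
  (void)id;  // id == 42
  // Flush the underlying PersistentHashMap so the data survives a restart.
  return mapper->PersistToDisk();
}

}  // namespace lib
}  // namespace icing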
diff --git a/icing/store/persistent-hash-map-key-mapper_test.cc b/icing/store/persistent-hash-map-key-mapper_test.cc
new file mode 100644
index 0000000..0d610e9
--- /dev/null
+++ b/icing/store/persistent-hash-map-key-mapper_test.cc
@@ -0,0 +1,52 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/store/persistent-hash-map-key-mapper.h"
+
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/file/filesystem.h"
+#include "icing/store/document-id.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+class PersistentHashMapKeyMapperTest : public testing::Test {
+ protected:
+ void SetUp() override { base_dir_ = GetTestTempDir() + "/key_mapper"; }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
+ }
+
+ std::string base_dir_;
+ Filesystem filesystem_;
+};
+
+TEST_F(PersistentHashMapKeyMapperTest, InvalidBaseDir) {
+ EXPECT_THAT(PersistentHashMapKeyMapper<DocumentId>::Create(
+ filesystem_, "/dev/null", /*pre_mapping_fbv=*/false),
+ StatusIs(libtextclassifier3::StatusCode::INTERNAL));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/store/suggestion-result-checker-impl.h b/icing/store/suggestion-result-checker-impl.h
new file mode 100644
index 0000000..4e01f81
--- /dev/null
+++ b/icing/store/suggestion-result-checker-impl.h
@@ -0,0 +1,154 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_SUGGESTION_RESULT_CHECKER_IMPL_H_
+#define ICING_STORE_SUGGESTION_RESULT_CHECKER_IMPL_H_
+
+#include <cstdint>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/document-store.h"
+#include "icing/store/namespace-id.h"
+#include "icing/store/suggestion-result-checker.h"
+
+namespace icing {
+namespace lib {
+
+class SuggestionResultCheckerImpl : public SuggestionResultChecker {
+ public:
+ explicit SuggestionResultCheckerImpl(
+ const DocumentStore* document_store, const SchemaStore* schema_store,
+ std::unordered_set<NamespaceId> target_namespace_ids,
+ std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>
+ document_id_filter_map,
+ std::unordered_set<SchemaTypeId> target_schema_type_ids,
+ std::unordered_map<SchemaTypeId, SectionIdMask> property_filter_map,
+ std::string target_section, std::unordered_set<DocumentId> search_base,
+ int64_t current_time_ms)
+ : document_store_(*document_store),
+ schema_store_(*schema_store),
+ target_namespace_ids_(std::move(target_namespace_ids)),
+ document_id_filter_map_(std::move(document_id_filter_map)),
+ target_schema_type_ids_(std::move(target_schema_type_ids)),
+ property_filter_map_(std::move(property_filter_map)),
+ target_section_(std::move(target_section)),
+ search_base_(std::move(search_base)),
+ current_time_ms_(current_time_ms) {}
+
+ bool MatchesTargetNamespace(NamespaceId namespace_id) const {
+ return target_namespace_ids_.empty() ||
+ target_namespace_ids_.find(namespace_id) !=
+ target_namespace_ids_.end();
+ }
+
+ bool MatchesTargetDocumentIds(NamespaceId namespace_id,
+ DocumentId document_id) const {
+ if (document_id_filter_map_.empty()) {
+ return true;
+ }
+ auto document_ids_itr = document_id_filter_map_.find(namespace_id);
+ // Return true if the client didn't restrict document ids in this
+ // namespace, or if this document is one of the requested ids.
+ return document_ids_itr == document_id_filter_map_.end() ||
+ document_ids_itr->second.find(document_id) !=
+ document_ids_itr->second.end();
+ }
+
+ bool MatchesTargetSchemaType(SchemaTypeId schema_type_id) const {
+ return target_schema_type_ids_.empty() ||
+ target_schema_type_ids_.find(schema_type_id) !=
+ target_schema_type_ids_.end();
+ }
+
+ bool MatchesTargetSection(SchemaTypeId schema_type_id,
+ SectionId section_id) const {
+ if (target_section_.empty()) {
+ return true;
+ }
+ auto section_metadata_or =
+ schema_store_.GetSectionMetadata(schema_type_id, section_id);
+ if (!section_metadata_or.ok()) {
+ // cannot find the target section metadata.
+ return false;
+ }
+ const SectionMetadata* section_metadata = section_metadata_or.ValueOrDie();
+ return section_metadata->path == target_section_;
+ }
+
+ bool MatchesSearchBase(DocumentId document_id) const {
+ return search_base_.empty() ||
+ search_base_.find(document_id) != search_base_.end();
+ }
+
+ bool MatchesPropertyFilter(SchemaTypeId schema_type_id,
+ SectionId section_id) const {
+ if (property_filter_map_.empty()) {
+ return true;
+ }
+ auto section_mask_itr = property_filter_map_.find(schema_type_id);
+ return section_mask_itr == property_filter_map_.end() ||
+ (section_mask_itr->second & (UINT64_C(1) << section_id)) != 0;
+ }
+
+ bool BelongsToTargetResults(DocumentId document_id,
+ SectionId section_id) const override {
+ // Get the document filter data first.
+ auto document_filter_data_optional =
+ document_store_.GetAliveDocumentFilterData(document_id,
+ current_time_ms_);
+ if (!document_filter_data_optional) {
+ // The document doesn't exist.
+ return false;
+ }
+ DocumentFilterData document_filter_data =
+ document_filter_data_optional.value();
+
+ if (!MatchesTargetNamespace(document_filter_data.namespace_id())) {
+ return false;
+ }
+ if (!MatchesTargetDocumentIds(document_filter_data.namespace_id(),
+ document_id)) {
+ return false;
+ }
+ if (!MatchesTargetSchemaType(document_filter_data.schema_type_id())) {
+ return false;
+ }
+ if (!MatchesTargetSection(document_filter_data.schema_type_id(),
+ section_id)) {
+ return false;
+ }
+ if (!MatchesSearchBase(document_id)) {
+ return false;
+ }
+ if (!MatchesPropertyFilter(document_filter_data.schema_type_id(),
+ section_id)) {
+ return false;
+ }
+ return true;
+ }
+
+ const DocumentStore& document_store_;
+ const SchemaStore& schema_store_;
+ std::unordered_set<NamespaceId> target_namespace_ids_;
+ std::unordered_map<NamespaceId, std::unordered_set<DocumentId>>
+ document_id_filter_map_;
+ std::unordered_set<SchemaTypeId> target_schema_type_ids_;
+ std::unordered_map<SchemaTypeId, SectionIdMask> property_filter_map_;
+ std::string target_section_;
+ std::unordered_set<DocumentId> search_base_;
+ int64_t current_time_ms_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_SUGGESTION_RESULT_CHECKER_IMPL_H_
\ No newline at end of file
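
MatchesPropertyFilter above reduces a per-schema-type property filter to a single 64-bit SectionIdMask in which a set bit N means section id N passes. A standalone illustration of the bit test, using a stand-in alias rather than icing's real SectionIdMask:

#include <cstdint>
#include <iostream>

using SectionIdMaskSketch = uint64_t;  // stand-in for icing's SectionIdMask

bool SectionAllowed(SectionIdMaskSketch mask, int section_id) {
  // Same expression as in MatchesPropertyFilter: test bit `section_id`.
  return (mask & (UINT64_C(1) << section_id)) != 0;
}

int main() {
  // Allow only sections 0 and 5.
  SectionIdMaskSketch mask = (UINT64_C(1) << 0) | (UINT64_C(1) << 5);
  std::cout << SectionAllowed(mask, 0) << SectionAllowed(mask, 1)
            << SectionAllowed(mask, 5) << "\n";  // prints 101
  return 0;
}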
diff --git a/icing/store/suggestion-result-checker.h b/icing/store/suggestion-result-checker.h
new file mode 100644
index 0000000..8fadd3e
--- /dev/null
+++ b/icing/store/suggestion-result-checker.h
@@ -0,0 +1,44 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_SUGGESTION_RESULT_CHECKER_H_
+#define ICING_STORE_SUGGESTION_RESULT_CHECKER_H_
+
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+class SuggestionResultChecker {
+ public:
+ virtual ~SuggestionResultChecker() = default;
+
+ // Check whether the given document id belongs to the target namespaces.
+ // Returns:
+ //   - true: the given document id belongs to the target results
+ //   - false: the given document id doesn't belong to the target results,
+ //     e.g. it fails a filter, its id is out of the previously seen range,
+ //     or the document or its filter data is not found
+ virtual bool BelongsToTargetResults(DocumentId document_id,
+ SectionId section_id) const = 0;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_SUGGESTION_RESULT_CHECKER_H_
diff --git a/icing/store/usage-store.cc b/icing/store/usage-store.cc
new file mode 100644
index 0000000..546067d
--- /dev/null
+++ b/icing/store/usage-store.cc
@@ -0,0 +1,262 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/store/usage-store.h"
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_cat.h"
+#include "icing/file/file-backed-vector.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+std::string MakeUsageScoreCacheFilename(const std::string& base_dir) {
+ return absl_ports::StrCat(base_dir, "/usage-scores");
+}
+} // namespace
+
+libtextclassifier3::StatusOr<std::unique_ptr<UsageStore>> UsageStore::Create(
+ const Filesystem* filesystem, const std::string& base_dir) {
+ ICING_RETURN_ERROR_IF_NULL(filesystem);
+
+ if (!filesystem->CreateDirectoryRecursively(base_dir.c_str())) {
+ return absl_ports::InternalError(absl_ports::StrCat(
+ "Failed to create UsageStore directory: ", base_dir));
+ }
+
+ const std::string score_cache_filename =
+ MakeUsageScoreCacheFilename(base_dir);
+
+ auto usage_score_cache_or = FileBackedVector<UsageScores>::Create(
+ *filesystem, score_cache_filename,
+ MemoryMappedFile::READ_WRITE_AUTO_SYNC);
+
+ if (absl_ports::IsFailedPrecondition(usage_score_cache_or.status())) {
+ // File checksum doesn't match the stored checksum. Delete and recreate the
+ // file.
+ ICING_RETURN_IF_ERROR(
+ FileBackedVector<UsageScores>::Delete(*filesystem, score_cache_filename));
+
+ ICING_VLOG(1) << "The score cache file in UsageStore is corrupted, all "
+ "scores have been reset.";
+
+ usage_score_cache_or = FileBackedVector<UsageScores>::Create(
+ *filesystem, score_cache_filename,
+ MemoryMappedFile::READ_WRITE_AUTO_SYNC);
+ }
+
+ if (!usage_score_cache_or.ok()) {
+ ICING_LOG(ERROR) << usage_score_cache_or.status().error_message()
+ << "Failed to initialize usage_score_cache";
+ return usage_score_cache_or.status();
+ }
+
+ return std::unique_ptr<UsageStore>(new UsageStore(
+ std::move(usage_score_cache_or).ValueOrDie(), *filesystem, base_dir));
+}
+
+libtextclassifier3::Status UsageStore::AddUsageReport(const UsageReport& report,
+ DocumentId document_id) {
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Document id %d is invalid.", document_id));
+ }
+
+ // We don't need a copy here because we'll set the value at the same index.
+ // This won't unintentionally grow the underlying file since we already have
+ // enough space for the current index.
+ auto usage_scores_or = usage_score_cache_->Get(document_id);
+
+ // OutOfRange means that the score cache hasn't seen this document id
+ // before; it's not an error here.
+ UsageScores usage_scores;
+ if (usage_scores_or.ok()) {
+ usage_scores = *std::move(usage_scores_or).ValueOrDie();
+ } else if (!absl_ports::IsOutOfRange(usage_scores_or.status())) {
+ // Real error
+ return usage_scores_or.status();
+ }
+
+ // Update last used timestamps and type counts. The counts won't be
+ // incremented if they are already the maximum values. The timestamp from
+ // UsageReport is in milliseconds, we need to convert it to seconds.
+ int64_t report_timestamp_s = report.usage_timestamp_ms() / 1000;
+
+ switch (report.usage_type()) {
+ case UsageReport::USAGE_TYPE1:
+ if (report_timestamp_s > std::numeric_limits<uint32_t>::max()) {
+ usage_scores.usage_type1_last_used_timestamp_s =
+ std::numeric_limits<uint32_t>::max();
+ } else if (report_timestamp_s >
+ usage_scores.usage_type1_last_used_timestamp_s) {
+ usage_scores.usage_type1_last_used_timestamp_s = report_timestamp_s;
+ }
+
+ if (usage_scores.usage_type1_count < std::numeric_limits<int>::max()) {
+ ++usage_scores.usage_type1_count;
+ }
+ break;
+ case UsageReport::USAGE_TYPE2:
+ if (report_timestamp_s > std::numeric_limits<uint32_t>::max()) {
+ usage_scores.usage_type2_last_used_timestamp_s =
+ std::numeric_limits<uint32_t>::max();
+ } else if (report_timestamp_s >
+ usage_scores.usage_type2_last_used_timestamp_s) {
+ usage_scores.usage_type2_last_used_timestamp_s = report_timestamp_s;
+ }
+
+ if (usage_scores.usage_type2_count < std::numeric_limits<int>::max()) {
+ ++usage_scores.usage_type2_count;
+ }
+ break;
+ case UsageReport::USAGE_TYPE3:
+ if (report_timestamp_s > std::numeric_limits<uint32_t>::max()) {
+ usage_scores.usage_type3_last_used_timestamp_s =
+ std::numeric_limits<uint32_t>::max();
+ } else if (report_timestamp_s >
+ usage_scores.usage_type3_last_used_timestamp_s) {
+ usage_scores.usage_type3_last_used_timestamp_s = report_timestamp_s;
+ }
+
+ if (usage_scores.usage_type3_count < std::numeric_limits<int>::max()) {
+ ++usage_scores.usage_type3_count;
+ }
+ }
+
+ // Write updated usage scores to file.
+ return usage_score_cache_->Set(document_id, usage_scores);
+}
+
+libtextclassifier3::Status UsageStore::DeleteUsageScores(
+ DocumentId document_id) {
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Document id %d is invalid.", document_id));
+ }
+
+ if (document_id >= usage_score_cache_->num_elements()) {
+ // Nothing to delete.
+ return libtextclassifier3::Status::OK;
+ }
+
+ // Clear all the scores of the document.
+ return usage_score_cache_->Set(document_id, UsageScores());
+}
+
+libtextclassifier3::StatusOr<UsageStore::UsageScores>
+UsageStore::GetUsageScores(DocumentId document_id) {
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Document id %d is invalid.", document_id));
+ }
+
+ auto usage_scores_or = usage_score_cache_->GetCopy(document_id);
+ if (absl_ports::IsOutOfRange(usage_scores_or.status())) {
+ // No usage scores found. Return the default scores.
+ return UsageScores();
+ } else if (!usage_scores_or.ok()) {
+ // Pass up any other errors.
+ return usage_scores_or.status();
+ }
+
+ return std::move(usage_scores_or).ValueOrDie();
+}
+
+libtextclassifier3::Status UsageStore::SetUsageScores(
+ DocumentId document_id, const UsageScores& usage_scores) {
+ if (!IsDocumentIdValid(document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Document id %d is invalid.", document_id));
+ }
+
+ return usage_score_cache_->Set(document_id, usage_scores);
+}
+
+libtextclassifier3::Status UsageStore::CloneUsageScores(
+ DocumentId from_document_id, DocumentId to_document_id) {
+ if (!IsDocumentIdValid(from_document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "from_document_id %d is invalid.", from_document_id));
+ }
+
+ if (!IsDocumentIdValid(to_document_id)) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "to_document_id %d is invalid.", to_document_id));
+ }
+
+ auto usage_scores_or = usage_score_cache_->GetCopy(from_document_id);
+ if (usage_scores_or.ok()) {
+ return usage_score_cache_->Set(to_document_id,
+ std::move(usage_scores_or).ValueOrDie());
+ } else if (absl_ports::IsOutOfRange(usage_scores_or.status())) {
+ // No usage scores found. Set default scores to to_document_id.
+ return usage_score_cache_->Set(to_document_id, UsageScores());
+ }
+
+ // Real error
+ return usage_scores_or.status();
+}
+
+libtextclassifier3::Status UsageStore::PersistToDisk() {
+ return usage_score_cache_->PersistToDisk();
+}
+
+libtextclassifier3::StatusOr<Crc32> UsageStore::ComputeChecksum() {
+ return usage_score_cache_->ComputeChecksum();
+}
+
+libtextclassifier3::StatusOr<int64_t> UsageStore::GetElementsFileSize() const {
+ return usage_score_cache_->GetElementsFileSize();
+}
+
+libtextclassifier3::StatusOr<int64_t> UsageStore::GetDiskUsage() const {
+ return usage_score_cache_->GetDiskUsage();
+}
+
+libtextclassifier3::Status UsageStore::TruncateTo(DocumentId num_documents) {
+ if (num_documents >= usage_score_cache_->num_elements()) {
+ // No need to truncate
+ return libtextclassifier3::Status::OK;
+ }
+ // Keep only the elements for document ids in [0, num_documents).
+ return usage_score_cache_->TruncateTo(num_documents);
+}
+
+libtextclassifier3::Status UsageStore::Reset() {
+ // We delete all the scores by deleting the whole file.
+ libtextclassifier3::Status status = FileBackedVector<UsageScores>::Delete(
+ filesystem_, MakeUsageScoreCacheFilename(base_dir_));
+ if (!status.ok()) {
+ ICING_LOG(ERROR) << status.error_message()
+ << "Failed to delete usage_score_cache";
+ return status;
+ }
+
+ // Create a new usage_score_cache
+ auto usage_score_cache_or = FileBackedVector<UsageScores>::Create(
+ filesystem_, MakeUsageScoreCacheFilename(base_dir_),
+ MemoryMappedFile::READ_WRITE_AUTO_SYNC);
+ if (!usage_score_cache_or.ok()) {
+ ICING_LOG(ERROR) << usage_score_cache_or.status().error_message()
+ << "Failed to re-create usage_score_cache";
+ return usage_score_cache_or.status();
+ }
+ usage_score_cache_ = std::move(usage_score_cache_or).ValueOrDie();
+
+ return PersistToDisk();
+}
+
+} // namespace lib
+} // namespace icing
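
The three USAGE_TYPE branches in AddUsageReport above repeat a single update rule. The hypothetical helper below (not part of the source) captures it: keep the newest timestamp, clamped to the uint32_t range, and saturate the count at INT_MAX instead of overflowing.

#include <cstdint>
#include <limits>

void UpdateUsageScoreSketch(int64_t report_timestamp_s, uint32_t* last_used_s,
                            int* count) {
  if (report_timestamp_s > std::numeric_limits<uint32_t>::max()) {
    // Clamp timestamps that don't fit in 32 bits.
    *last_used_s = std::numeric_limits<uint32_t>::max();
  } else if (report_timestamp_s > *last_used_s) {
    // Only move the "last used" time forward.
    *last_used_s = static_cast<uint32_t>(report_timestamp_s);
  }
  if (*count < std::numeric_limits<int>::max()) {
    ++*count;  // saturating increment: stop at INT_MAX
  }
}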
diff --git a/icing/store/usage-store.h b/icing/store/usage-store.h
new file mode 100644
index 0000000..3c7a55e
--- /dev/null
+++ b/icing/store/usage-store.h
@@ -0,0 +1,205 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_USAGE_STORE_H_
+#define ICING_STORE_USAGE_STORE_H_
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "icing/file/file-backed-vector.h"
+#include "icing/proto/usage.pb.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// A storage class that maintains scores that are calculated based on usage
+// reports.
+class UsageStore {
+ public:
+ // Factory function to create a UsageStore instance. The base directory is
+ // used to persist usage scores. If a usage store was previously created with
+ // this directory, it will reload the files saved by the last instance.
+ //
+ // TODO(b/169594617): consider returning StatusOr<UsageStore>
+ //
+ // Returns:
+ // A UsageStore on success
+ // FAILED_PRECONDITION on any null pointer input
+ // INTERNAL_ERROR on I/O error
+ static libtextclassifier3::StatusOr<std::unique_ptr<UsageStore>> Create(
+ const Filesystem* filesystem, const std::string& base_dir);
+
+ // The scores here reflect the timestamps and usage types defined in
+ // usage.proto.
+ struct UsageScores {
+ // The latest timestamp in seconds reported with custom usage type 1.
+ uint32_t usage_type1_last_used_timestamp_s = 0;
+
+ // The latest timestamp in seconds reported with custom usage type 2.
+ uint32_t usage_type2_last_used_timestamp_s = 0;
+
+ // The latest timestamp in seconds reported with custom usage type 3.
+ uint32_t usage_type3_last_used_timestamp_s = 0;
+
+ // Count of reports with custom usage type 1
+ int usage_type1_count = 0;
+
+ // Count of reports with custom usage type 2
+ int usage_type2_count = 0;
+
+ // Count of reports with custom usage type 3
+ int usage_type3_count = 0;
+
+ bool operator==(const UsageScores& other) const {
+ return usage_type1_last_used_timestamp_s ==
+ other.usage_type1_last_used_timestamp_s &&
+ usage_type2_last_used_timestamp_s ==
+ other.usage_type2_last_used_timestamp_s &&
+ usage_type3_last_used_timestamp_s ==
+ other.usage_type3_last_used_timestamp_s &&
+ usage_type1_count == other.usage_type1_count &&
+ usage_type2_count == other.usage_type2_count &&
+ usage_type3_count == other.usage_type3_count;
+ }
+ };
+
+ // Adds one usage report. The corresponding usage scores of the specified
+ // document will be updated.
+ //
+ // Note: changes are written to disk automatically, callers can also call
+ // PersistToDisk() to flush changes immediately.
+ //
+ // Returns:
+ // OK on success
+ // INVALID_ARGUMENT if document_id is invalid
+ // INTERNAL_ERROR on I/O errors.
+ libtextclassifier3::Status AddUsageReport(const UsageReport& report,
+ DocumentId document_id);
+
+ // Deletes the usage scores of a document.
+ //
+ // Note: changes are written to disk automatically, callers can also call
+ // PersistToDisk() to flush changes immediately.
+ //
+ // Returns:
+ // OK on success
+ // INVALID_ARGUMENT if document_id is invalid
+ // INTERNAL_ERROR on I/O errors
+ libtextclassifier3::Status DeleteUsageScores(DocumentId document_id);
+
+ // Gets the usage scores of a document.
+ //
+ // Returns:
+ // UsageScores on success
+ // INVALID_ARGUMENT if document_id is invalid
+ // INTERNAL_ERROR on I/O errors
+ //
+ // TODO(b/169433395): return a pointer instead of an object.
+ libtextclassifier3::StatusOr<UsageScores> GetUsageScores(
+ DocumentId document_id);
+
+ // Sets the usage scores of a document.
+ //
+ // Note: changes are written to disk automatically, callers can also call
+ // PersistToDisk() to flush changes immediately.
+ //
+ // Returns:
+ // OK on success
+ // INVALID_ARGUMENT if document_id is invalid
+ // INTERNAL_ERROR on I/O errors
+ libtextclassifier3::Status SetUsageScores(DocumentId document_id,
+ const UsageScores& usage_scores);
+
+ // Clones the usage scores from one document to another.
+ //
+ // Returns:
+ // OK on success
+ // INVALID_ARGUMENT if any of the document ids is invalid
+ // INTERNAL_ERROR on I/O errors
+ //
+ // TODO(b/169433395): We can remove this method once GetUsageScores() returns
+ // a pointer.
+ libtextclassifier3::Status CloneUsageScores(DocumentId from_document_id,
+ DocumentId to_document_id);
+
+ // Syncs data to disk.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL on I/O error
+ libtextclassifier3::Status PersistToDisk();
+
+ // Updates checksum of the usage scores and returns it.
+ //
+ // Returns:
+ // A Crc32 on success
+ // INTERNAL_ERROR if the internal state is inconsistent
+ libtextclassifier3::StatusOr<Crc32> ComputeChecksum();
+
+ // Returns the file size of all the elements held in the UsageStore. File
+ // size is in bytes. This excludes the size of any internal metadata, e.g. any
+ // internal headers.
+ //
+ // Returns:
+ // File size on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetElementsFileSize() const;
+
+ // Calculates and returns the disk usage in bytes. Rounds up to the nearest
+ // block size.
+ //
+ // Returns:
+ // Disk usage on success
+ // INTERNAL_ERROR on IO error
+ libtextclassifier3::StatusOr<int64_t> GetDiskUsage() const;
+
+ // Resizes the storage so that only the usage scores of the first
+ // num_documents documents are stored.
+ //
+ // Returns:
+ // OK on success
+ // OUT_OF_RANGE_ERROR if num_documents is negative
+ libtextclassifier3::Status TruncateTo(DocumentId num_documents);
+
+ // Deletes all usage data and re-initializes the storage.
+ //
+ // Returns:
+ // OK on success
+ // INTERNAL_ERROR on I/O error
+ libtextclassifier3::Status Reset();
+
+ int32_t num_elements() const { return usage_score_cache_->num_elements(); }
+
+ private:
+ explicit UsageStore(std::unique_ptr<FileBackedVector<UsageScores>>
+ document_id_to_scores_mapper,
+ const Filesystem& filesystem, std::string base_dir)
+ : filesystem_(filesystem),
+ base_dir_(std::move(base_dir)),
+ usage_score_cache_(std::move(document_id_to_scores_mapper)) {}
+
+ const Filesystem& filesystem_;
+
+ // Base directory where the files are located.
+ const std::string base_dir_;
+
+ // Used to store the usage scores of documents.
+ std::unique_ptr<FileBackedVector<UsageScores>> usage_score_cache_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_USAGE_STORE_H_
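
A hedged end-to-end sketch of the UsageStore API declared above; the directory is illustrative and the flow mirrors the factory, report, and query methods in this header.

#include <memory>

#include "icing/file/filesystem.h"
#include "icing/proto/usage.pb.h"
#include "icing/store/usage-store.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

libtextclassifier3::Status UsageStoreSketch(const Filesystem* filesystem) {
  ICING_ASSIGN_OR_RETURN(
      std::unique_ptr<UsageStore> store,
      UsageStore::Create(filesystem, "/tmp/icing/usage_store"));

  UsageReport report;
  report.set_usage_timestamp_ms(5000);  // stored as 5 seconds
  report.set_usage_type(UsageReport::USAGE_TYPE1);
  ICING_RETURN_IF_ERROR(store->AddUsageReport(report, /*document_id=*/0));

  ICING_ASSIGN_OR_RETURN(UsageStore::UsageScores scores,
                         store->GetUsageScores(/*document_id=*/0));
  (void)scores;  // usage_type1_count == 1, last used timestamp == 5s
  return store->PersistToDisk();
}

}  // namespace lib
}  // namespace icing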
diff --git a/icing/store/usage-store_test.cc b/icing/store/usage-store_test.cc
new file mode 100644
index 0000000..07fe2c5
--- /dev/null
+++ b/icing/store/usage-store_test.cc
@@ -0,0 +1,628 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/store/usage-store.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/tmp-directory.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::Not;
+
+class UsageStoreTest : public testing::Test {
+ protected:
+ UsageStoreTest() : test_dir_(GetTestTempDir() + "/usage-store-test") {}
+
+ void SetUp() override {
+ filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
+ }
+
+ void TearDown() override {
+ filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
+ }
+
+ const Filesystem filesystem_;
+ const std::string test_dir_;
+};
+
+UsageReport CreateUsageReport(std::string name_space, std::string uri,
+ int64_t timestamp_ms,
+ UsageReport::UsageType usage_type) {
+ UsageReport usage_report;
+ usage_report.set_document_namespace(name_space);
+ usage_report.set_document_uri(uri);
+ usage_report.set_usage_timestamp_ms(timestamp_ms);
+ usage_report.set_usage_type(usage_type);
+ return usage_report;
+}
+
+UsageStore::UsageScores CreateUsageScores(uint32_t type1_timestamp,
+ uint32_t type2_timestamp,
+ uint32_t type3_timestamp,
+ int type1_count, int type2_count,
+ int type3_count) {
+ UsageStore::UsageScores scores;
+ scores.usage_type1_last_used_timestamp_s = type1_timestamp;
+ scores.usage_type2_last_used_timestamp_s = type2_timestamp;
+ scores.usage_type3_last_used_timestamp_s = type3_timestamp;
+ scores.usage_type1_count = type1_count;
+ scores.usage_type2_count = type2_count;
+ scores.usage_type3_count = type3_count;
+
+ return scores;
+}
+
+TEST_F(UsageStoreTest, CreationShouldSucceed) {
+ EXPECT_THAT(UsageStore::Create(&filesystem_, test_dir_), IsOk());
+}
+
+TEST_F(UsageStoreTest, CreationShouldFailOnNullPointer) {
+ EXPECT_THAT(UsageStore::Create(nullptr, test_dir_),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_F(UsageStoreTest, UsageScoresShouldBeComparable) {
+ UsageStore::UsageScores scores1;
+ UsageStore::UsageScores scores2;
+ EXPECT_THAT(scores1, Eq(scores2));
+
+ // operator== should compare usage_type1_last_used_timestamp_s.
+ ++scores1.usage_type1_last_used_timestamp_s;
+ EXPECT_THAT(scores1, Not(Eq(scores2)));
+ ++scores2.usage_type1_last_used_timestamp_s;
+ EXPECT_THAT(scores1, Eq(scores2));
+
+ // operator== should compare usage_type2_last_used_timestamp_s.
+ ++scores1.usage_type2_last_used_timestamp_s;
+ EXPECT_THAT(scores1, Not(Eq(scores2)));
+ ++scores2.usage_type2_last_used_timestamp_s;
+ EXPECT_THAT(scores1, Eq(scores2));
+
+ // operator== should compare usage_type3_last_used_timestamp_s.
+ ++scores1.usage_type3_last_used_timestamp_s;
+ EXPECT_THAT(scores1, Not(Eq(scores2)));
+ ++scores2.usage_type3_last_used_timestamp_s;
+ EXPECT_THAT(scores1, Eq(scores2));
+
+ // operator== should compare usage_type1_count.
+ ++scores1.usage_type1_count;
+ EXPECT_THAT(scores1, Not(Eq(scores2)));
+ ++scores2.usage_type1_count;
+ EXPECT_THAT(scores1, Eq(scores2));
+
+ // operator== should compare usage_type2_count.
+ ++scores1.usage_type2_count;
+ EXPECT_THAT(scores1, Not(Eq(scores2)));
+ ++scores2.usage_type2_count;
+ EXPECT_THAT(scores1, Eq(scores2));
+
+ // operator== should compare usage_type3_count.
+ ++scores1.usage_type3_count;
+ EXPECT_THAT(scores1, Not(Eq(scores2)));
+ ++scores2.usage_type3_count;
+ EXPECT_THAT(scores1, Eq(scores2));
+}
+
+TEST_F(UsageStoreTest, InvalidDocumentIdShouldReturnError) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ DocumentId invalid_document_id = -1;
+
+ EXPECT_THAT(usage_store->AddUsageReport(UsageReport(), invalid_document_id),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(usage_store->DeleteUsageScores(invalid_document_id),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(usage_store->GetUsageScores(invalid_document_id),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+
+ EXPECT_THAT(usage_store->SetUsageScores(invalid_document_id,
+ UsageStore::UsageScores()),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+}
+
+TEST_F(UsageStoreTest, AddUsageReportShouldUpdateLastUsedTimestamp) {
+ // Create 3 reports with different timestamps.
+ UsageReport usage_report_time1 = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1);
+ UsageReport usage_report_time5 = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/5000, UsageReport::USAGE_TYPE1);
+ UsageReport usage_report_time10 = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/10000, UsageReport::USAGE_TYPE1);
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Report a usage with timestamp 5.
+ ICING_ASSERT_OK(
+ usage_store->AddUsageReport(usage_report_time5, /*document_id=*/1));
+ UsageStore::UsageScores expected_scores = CreateUsageScores(
+ /*type1_timestamp=*/5, /*type2_timestamp=*/0, /*type3_timestamp=*/0,
+ /*type1_count=*/1, /*type2_count=*/0, /*type3_count=*/0);
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+
+ // Report a usage with timestamp 1. The timestamp won't be updated.
+ ICING_ASSERT_OK(
+ usage_store->AddUsageReport(usage_report_time1, /*document_id=*/1));
+ ++expected_scores.usage_type1_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+
+ // Report a usage with timestamp 10. The timestamp should be updated.
+ ICING_ASSERT_OK(
+ usage_store->AddUsageReport(usage_report_time10, /*document_id=*/1));
+ expected_scores.usage_type1_last_used_timestamp_s = 10;
+ ++expected_scores.usage_type1_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+}
+
+TEST_F(UsageStoreTest, AddUsageReportShouldUpdateCounts) {
+ // Create 3 reports with different usage types.
+ UsageReport usage_report_type1 = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE1);
+ UsageReport usage_report_type2 = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE2);
+ UsageReport usage_report_type3 = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE3);
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Report a usage with type 1.
+ ICING_ASSERT_OK(
+ usage_store->AddUsageReport(usage_report_type1, /*document_id=*/1));
+ UsageStore::UsageScores expected_scores = CreateUsageScores(
+ /*type1_timestamp=*/0, /*type2_timestamp=*/0, /*type3_timestamp=*/0,
+ /*type1_count=*/1, /*type2_count=*/0, /*type3_count=*/0);
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+ // Report another usage with type 1.
+ ICING_ASSERT_OK(
+ usage_store->AddUsageReport(usage_report_type1, /*document_id=*/1));
+ ++expected_scores.usage_type1_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+
+ // Report a usage with type 2.
+ ICING_ASSERT_OK(
+ usage_store->AddUsageReport(usage_report_type2, /*document_id=*/1));
+ ++expected_scores.usage_type2_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+ // Report another usage with type 2.
+ ICING_ASSERT_OK(
+ usage_store->AddUsageReport(usage_report_type2, /*document_id=*/1));
+ ++expected_scores.usage_type2_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+
+ // Report a usage with type 3.
+ ICING_ASSERT_OK(
+ usage_store->AddUsageReport(usage_report_type3, /*document_id=*/1));
+ ++expected_scores.usage_type3_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+ // Report another usage with type 3.
+ ICING_ASSERT_OK(
+ usage_store->AddUsageReport(usage_report_type3, /*document_id=*/1));
+ ++expected_scores.usage_type3_count;
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+}
+
+TEST_F(UsageStoreTest, GetNonExistingDocumentShouldReturnDefaultScores) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(UsageStore::UsageScores()));
+}
+
+TEST_F(UsageStoreTest, SetAndGetUsageScores) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers.
+ UsageStore::UsageScores scores = CreateUsageScores(
+ /*type1_timestamp=*/7, /*type2_timestamp=*/9, /*type3_timestamp=*/1,
+ /*type1_count=*/3, /*type2_count=*/4, /*type3_count=*/9);
+
+ // Verify that set and get results are consistent.
+ ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(scores));
+}
+
+TEST_F(UsageStoreTest, ImplicitlyInitializedScoresShouldBeZero) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Explicitly set scores for document 2.
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/2,
+ UsageStore::UsageScores()));
+
+ // Now the scores of document 1 have been implicitly initialized. The scores
+ // should all be 0.
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(UsageStore::UsageScores()));
+}
+
+TEST_F(UsageStoreTest, DeleteUsageScores) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers.
+ UsageStore::UsageScores scores = CreateUsageScores(
+ /*type1_timestamp=*/7, /*type2_timestamp=*/9, /*type3_timestamp=*/1,
+ /*type1_count=*/3, /*type2_count=*/4, /*type3_count=*/9);
+ ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+
+ // Delete the usage scores of document 1; afterwards, all of its scores
+ // should be 0.
+ ICING_EXPECT_OK(usage_store->DeleteUsageScores(/*document_id=*/1));
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(UsageStore::UsageScores()));
+}
+
+TEST_F(UsageStoreTest, CloneUsageScores) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers and assign them to
+ // document 1.
+ UsageStore::UsageScores scores_a = CreateUsageScores(
+ /*type1_timestamp=*/7, /*type2_timestamp=*/9, /*type3_timestamp=*/1,
+ /*type1_count=*/3, /*type2_count=*/4, /*type3_count=*/9);
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores_a));
+
+ // Create another set of usage scores with some random numbers and assign them
+ // to document 2.
+ UsageStore::UsageScores scores_b = CreateUsageScores(
+ /*type1_timestamp=*/111, /*type2_timestamp=*/666, /*type3_timestamp=*/333,
+ /*type1_count=*/50, /*type2_count=*/30, /*type3_count=*/100);
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/2, scores_b));
+
+ // Clone scores from document 1 to document 3.
+ EXPECT_THAT(usage_store->CloneUsageScores(/*from_document_id=*/1,
+ /*to_document_id=*/3),
+ IsOk());
+
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/3),
+ IsOkAndHolds(scores_a));
+
+ // Clone scores from document 2 to document 3.
+ EXPECT_THAT(usage_store->CloneUsageScores(/*from_document_id=*/2,
+ /*to_document_id=*/3),
+ IsOk());
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/3),
+ IsOkAndHolds(scores_b));
+
+ // Clone scores from document 4 to document 3. Since document 4 has no
+ // scores, document 3's scores should be reset to the default.
+ EXPECT_THAT(usage_store->CloneUsageScores(/*from_document_id=*/4,
+ /*to_document_id=*/3),
+ IsOk());
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/3),
+ IsOkAndHolds(UsageStore::UsageScores()));
+}
+
+TEST_F(UsageStoreTest, PersistToDisk) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers.
+ UsageStore::UsageScores scores = CreateUsageScores(
+ /*type1_timestamp=*/7, /*type2_timestamp=*/9, /*type3_timestamp=*/1,
+ /*type1_count=*/3, /*type2_count=*/4, /*type3_count=*/9);
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+
+ EXPECT_THAT(usage_store->PersistToDisk(), IsOk());
+}
+
+TEST_F(UsageStoreTest, ComputeChecksum) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum1, usage_store->ComputeChecksum());
+
+ // Create usage scores with some random numbers.
+ UsageStore::UsageScores scores = CreateUsageScores(
+ /*type1_timestamp=*/7, /*type2_timestamp=*/9, /*type3_timestamp=*/1,
+ /*type1_count=*/3, /*type2_count=*/4, /*type3_count=*/9);
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum2, usage_store->ComputeChecksum());
+
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/2, scores));
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum3, usage_store->ComputeChecksum());
+
+ EXPECT_THAT(checksum1, Not(Eq(checksum2)));
+ EXPECT_THAT(checksum1, Not(Eq(checksum3)));
+ EXPECT_THAT(checksum2, Not(Eq(checksum3)));
+
+ // Without changing the store, checksum should be the same.
+ ICING_ASSERT_OK_AND_ASSIGN(Crc32 checksum4, usage_store->ComputeChecksum());
+ EXPECT_THAT(checksum3, Eq(checksum4));
+}
+
+TEST_F(UsageStoreTest, TruncateTo) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers and set scores for
+ // documents 0, 1, and 2.
+ UsageStore::UsageScores scores = CreateUsageScores(
+ /*type1_timestamp=*/7, /*type2_timestamp=*/9, /*type3_timestamp=*/1,
+ /*type1_count=*/3, /*type2_count=*/4, /*type3_count=*/9);
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/0, scores));
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/2, scores));
+
+ // Truncate the store to 2 documents; the scores of document 2 should be
+ // gone.
+ EXPECT_THAT(usage_store->TruncateTo(/*num_documents=*/2), IsOk());
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/0),
+ IsOkAndHolds(scores));
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(scores));
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/2),
+ IsOkAndHolds(UsageStore::UsageScores()));
+}
+
+TEST_F(UsageStoreTest, TruncateToALargeNumberShouldDoNothing) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers and set scores for
+ // documents 0 and 1.
+ UsageStore::UsageScores scores = CreateUsageScores(
+ /*type1_timestamp=*/7, /*type2_timestamp=*/9, /*type3_timestamp=*/1,
+ /*type1_count=*/3, /*type2_count=*/4, /*type3_count=*/9);
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/0, scores));
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+
+ ASSERT_THAT(usage_store->GetUsageScores(/*document_id=*/0),
+ IsOkAndHolds(scores));
+ ASSERT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(scores));
+ ASSERT_THAT(usage_store->GetUsageScores(/*document_id=*/2),
+ IsOkAndHolds(UsageStore::UsageScores()));
+
+ // Truncate to a number that is greater than the number of documents. Scores
+ // should be the same.
+ EXPECT_THAT(usage_store->TruncateTo(1000), IsOk());
+
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/0),
+ IsOkAndHolds(scores));
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(scores));
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/2),
+ IsOkAndHolds(UsageStore::UsageScores()));
+}
+
+TEST_F(UsageStoreTest, TruncateToNegativeNumberShouldReturnError) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ EXPECT_THAT(usage_store->TruncateTo(-1),
+ StatusIs(libtextclassifier3::StatusCode::OUT_OF_RANGE));
+}
+
+TEST_F(UsageStoreTest, Reset) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers.
+ UsageStore::UsageScores scores = CreateUsageScores(
+ /*type1_timestamp=*/7, /*type2_timestamp=*/9, /*type3_timestamp=*/1,
+ /*type1_count=*/3, /*type2_count=*/4, /*type3_count=*/9);
+
+ // Set scores for document 1 and document 2.
+ ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+ ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/2, scores));
+
+ EXPECT_THAT(usage_store->Reset(), IsOk());
+
+ // After resetting, all the scores are cleared.
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(UsageStore::UsageScores()));
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/2),
+ IsOkAndHolds(UsageStore::UsageScores()));
+ }
+
+ // New instances should be created successfully after Reset().
+ EXPECT_THAT(UsageStore::Create(&filesystem_, test_dir_).status(), IsOk());
+}
+
+TEST_F(UsageStoreTest, TimestampInSecondsShouldNotOverflow) {
+ // Create a report with the max value of timestamps.
+ UsageReport usage_report = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/std::numeric_limits<int64_t>::max(),
+ UsageReport::USAGE_TYPE1);
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // The stored timestamp in seconds should be clamped to the max value of
+ // uint32.
+ ICING_ASSERT_OK(usage_store->AddUsageReport(usage_report, /*document_id=*/1));
+ UsageStore::UsageScores expected_scores = CreateUsageScores(
+ /*type1_timestamp=*/std::numeric_limits<uint32_t>::max(),
+ /*type2_timestamp=*/0, /*type3_timestamp=*/0,
+ /*type1_count=*/1, /*type2_count=*/0, /*type3_count=*/0);
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(expected_scores));
+}
+
+TEST_F(UsageStoreTest, CountsShouldNotOverflow) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with the max value of int.
+ UsageStore::UsageScores scores = CreateUsageScores(
+ /*type1_timestamp=*/0, /*type2_timestamp=*/0, /*type3_timestamp=*/0,
+ /*type1_count=*/std::numeric_limits<int>::max(), /*type2_count=*/0,
+ /*type3_count=*/0);
+
+ ICING_ASSERT_OK(usage_store->SetUsageScores(/*document_id=*/1, scores));
+ ASSERT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(scores));
+
+ // Report another usage with type 1.
+ UsageReport usage_report = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/0, UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(usage_store->AddUsageReport(usage_report, /*document_id=*/1));
+
+ // usage_type1_count should not change because it's already the max value.
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/1),
+ IsOkAndHolds(scores));
+}
+
+TEST_F(UsageStoreTest, StoreShouldBeResetOnVectorChecksumMismatch) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers.
+ UsageStore::UsageScores scores = CreateUsageScores(
+ /*type1_timestamp=*/7, /*type2_timestamp=*/9, /*type3_timestamp=*/1,
+ /*type1_count=*/3, /*type2_count=*/4, /*type3_count=*/9);
+ ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/0, scores));
+ ASSERT_THAT(usage_store->GetUsageScores(/*document_id=*/0),
+ IsOkAndHolds(scores));
+ }
+
+ // Modify the header to trigger a vector checksum mismatch.
+ const std::string score_cache_file_path =
+ absl_ports::StrCat(test_dir_, "/usage-scores");
+ FileBackedVector<UsageStore::UsageScores>::Header header{};
+ filesystem_.PRead(
+ score_cache_file_path.c_str(), /*buf=*/&header,
+ /*buf_size=*/sizeof(FileBackedVector<UsageStore::UsageScores>::Header),
+ /*offset=*/0);
+ header.vector_checksum = 10; // Arbitrary garbage checksum
+ header.header_checksum = header.CalculateHeaderChecksum();
+ filesystem_.PWrite(
+ score_cache_file_path.c_str(), /*offset=*/0, /*data=*/&header,
+ /*data_size=*/sizeof(FileBackedVector<UsageStore::UsageScores>::Header));
+
+ // Recover from checksum mismatch.
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+ // Previous data should be cleared.
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/0),
+ IsOkAndHolds(UsageStore::UsageScores()));
+}
+
+TEST_F(UsageStoreTest, StoreShouldBeResetOnHeaderChecksumMismatch) {
+ {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // Create usage scores with some random numbers.
+ UsageStore::UsageScores scores = CreateUsageScores(
+ /*type1_timestamp=*/7, /*type2_timestamp=*/9, /*type3_timestamp=*/1,
+ /*type1_count=*/3, /*type2_count=*/4, /*type3_count=*/9);
+ ICING_EXPECT_OK(usage_store->SetUsageScores(/*document_id=*/0, scores));
+ ASSERT_THAT(usage_store->GetUsageScores(/*document_id=*/0),
+ IsOkAndHolds(scores));
+ }
+
+ // Modify the header to trigger a header checksum mismatch.
+ const std::string score_cache_file_path =
+ absl_ports::StrCat(test_dir_, "/usage-scores");
+ FileBackedVector<UsageStore::UsageScores>::Header header{};
+ filesystem_.PRead(
+ score_cache_file_path.c_str(), /*buf=*/&header,
+ /*buf_size=*/sizeof(FileBackedVector<UsageStore::UsageScores>::Header),
+ /*offset=*/0);
+ header.header_checksum = 10; // Arbitrary garbage checksum
+ filesystem_.PWrite(
+ score_cache_file_path.c_str(), /*offset=*/0, /*data=*/&header,
+ /*data_size=*/sizeof(FileBackedVector<UsageStore::UsageScores>::Header));
+
+ // Recover from checksum mismatch.
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+ // Previous data should be cleared.
+ EXPECT_THAT(usage_store->GetUsageScores(/*document_id=*/0),
+ IsOkAndHolds(UsageStore::UsageScores()));
+}
+
+TEST_F(UsageStoreTest, GetElementsFileSize) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t empty_file_size,
+ usage_store->GetElementsFileSize());
+ EXPECT_THAT(empty_file_size, Eq(0));
+
+ UsageReport usage_report = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1);
+ ICING_ASSERT_OK(usage_store->AddUsageReport(usage_report, /*document_id=*/1));
+
+ EXPECT_THAT(usage_store->GetElementsFileSize(),
+ IsOkAndHolds(Gt(empty_file_size)));
+}
+
+TEST_F(UsageStoreTest, GetDiskUsageEmpty) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // There's some internal metadata, so our disk usage will round up to 1 block.
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t empty_disk_usage,
+ usage_store->GetDiskUsage());
+ EXPECT_THAT(empty_disk_usage, Gt(0));
+}
+
+TEST_F(UsageStoreTest, GetDiskUsageNonEmpty) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<UsageStore> usage_store,
+ UsageStore::Create(&filesystem_, test_dir_));
+
+ // There's some internal metadata, so our disk usage will round up to 1 block.
+ ICING_ASSERT_OK_AND_ASSIGN(int64_t empty_disk_usage,
+ usage_store->GetDiskUsage());
+
+ // Since GetDiskUsage can only report sizes in increments of block_size, we
+ // need to insert enough usage reports that the disk usage grows by at least
+ // one block. The number 200 is somewhat arbitrary; it was found through
+ // manual testing.
+ UsageReport usage_report = CreateUsageReport(
+ "namespace", "uri", /*timestamp_ms=*/1000, UsageReport::USAGE_TYPE1);
+ for (int i = 0; i < 200; ++i) {
+ ICING_ASSERT_OK(
+ usage_store->AddUsageReport(usage_report, /*document_id=*/i));
+ }
+
+ // We need to persist since iOS won't see the new disk allocations until after
+ // everything gets written.
+ ICING_ASSERT_OK(usage_store->PersistToDisk());
+
+ EXPECT_THAT(usage_store->GetDiskUsage(), IsOkAndHolds(Gt(empty_disk_usage)));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/always-false-suggestion-result-checker-impl.h b/icing/testing/always-false-suggestion-result-checker-impl.h
new file mode 100644
index 0000000..2f956de
--- /dev/null
+++ b/icing/testing/always-false-suggestion-result-checker-impl.h
@@ -0,0 +1,36 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_ALWAYS_FALSE_SUGGESTION_RESULT_CHECKER_IMPL_H_
+#define ICING_TESTING_ALWAYS_FALSE_SUGGESTION_RESULT_CHECKER_IMPL_H_
+
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/suggestion-result-checker.h"
+
+namespace icing {
+namespace lib {
+
+class AlwaysFalseSuggestionResultCheckerImpl : public SuggestionResultChecker {
+ public:
+ bool BelongsToTargetResults(DocumentId document_id,
+ SectionId section_id) const override {
+ return false;
+ }
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_ALWAYS_FALSE_SUGGESTION_RESULT_CHECKER_IMPL_H_
\ No newline at end of file
diff --git a/icing/testing/always-true-suggestion-result-checker-impl.h b/icing/testing/always-true-suggestion-result-checker-impl.h
new file mode 100644
index 0000000..d25c39c
--- /dev/null
+++ b/icing/testing/always-true-suggestion-result-checker-impl.h
@@ -0,0 +1,36 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_ALWAYS_TRUE_SUGGESTION_RESULT_CHECKER_IMPL_H_
+#define ICING_TESTING_ALWAYS_TRUE_SUGGESTION_RESULT_CHECKER_IMPL_H_
+
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+#include "icing/store/suggestion-result-checker.h"
+
+namespace icing {
+namespace lib {
+
+class AlwaysTrueSuggestionResultCheckerImpl : public SuggestionResultChecker {
+ public:
+ bool BelongsToTargetResults(DocumentId document_id,
+ SectionId section_id) const override {
+ return true;
+ }
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_ALWAYS_TRUE_SUGGESTION_RESULT_CHECKER_IMPL_H_
\ No newline at end of file
diff --git a/icing/testing/common-matchers.cc b/icing/testing/common-matchers.cc
new file mode 100644
index 0000000..cd4e446
--- /dev/null
+++ b/icing/testing/common-matchers.cc
@@ -0,0 +1,124 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+
+ExtractTermFrequenciesResult ExtractTermFrequencies(
+ const std::unordered_map<SectionId, Hit::TermFrequency>&
+ section_ids_tf_map) {
+ ExtractTermFrequenciesResult result;
+ for (const auto& [section_id, tf] : section_ids_tf_map) {
+ result.term_frequencies[section_id] = tf;
+ result.section_mask |= UINT64_C(1) << section_id;
+ }
+ return result;
+}
+
+CheckTermFrequencyResult CheckTermFrequency(
+ const std::array<Hit::TermFrequency, kTotalNumSections>&
+ expected_term_frequencies,
+ const std::array<Hit::TermFrequency, kTotalNumSections>&
+ actual_term_frequencies) {
+ CheckTermFrequencyResult result;
+ for (SectionId section_id = 0; section_id < kTotalNumSections; ++section_id) {
+ if (expected_term_frequencies.at(section_id) !=
+ actual_term_frequencies.at(section_id)) {
+ result.term_frequencies_match = false;
+ }
+ }
+ result.actual_term_frequencies_str =
+ absl_ports::StrCat("[",
+ absl_ports::StrJoin(actual_term_frequencies, ",",
+ absl_ports::NumberFormatter()),
+ "]");
+ result.expected_term_frequencies_str =
+ absl_ports::StrCat("[",
+ absl_ports::StrJoin(expected_term_frequencies, ",",
+ absl_ports::NumberFormatter()),
+ "]");
+ return result;
+}
+
+std::string StatusCodeToString(libtextclassifier3::StatusCode code) {
+ switch (code) {
+ case libtextclassifier3::StatusCode::OK:
+ return "OK";
+ case libtextclassifier3::StatusCode::CANCELLED:
+ return "CANCELLED";
+ case libtextclassifier3::StatusCode::UNKNOWN:
+ return "UNKNOWN";
+ case libtextclassifier3::StatusCode::INVALID_ARGUMENT:
+ return "INVALID_ARGUMENT";
+ case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED:
+ return "DEADLINE_EXCEEDED";
+ case libtextclassifier3::StatusCode::NOT_FOUND:
+ return "NOT_FOUND";
+ case libtextclassifier3::StatusCode::ALREADY_EXISTS:
+ return "ALREADY_EXISTS";
+ case libtextclassifier3::StatusCode::PERMISSION_DENIED:
+ return "PERMISSION_DENIED";
+ case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED:
+ return "RESOURCE_EXHAUSTED";
+ case libtextclassifier3::StatusCode::FAILED_PRECONDITION:
+ return "FAILED_PRECONDITION";
+ case libtextclassifier3::StatusCode::ABORTED:
+ return "ABORTED";
+ case libtextclassifier3::StatusCode::OUT_OF_RANGE:
+ return "OUT_OF_RANGE";
+ case libtextclassifier3::StatusCode::UNIMPLEMENTED:
+ return "UNIMPLEMENTED";
+ case libtextclassifier3::StatusCode::INTERNAL:
+ return "INTERNAL";
+ case libtextclassifier3::StatusCode::UNAVAILABLE:
+ return "UNAVAILABLE";
+ case libtextclassifier3::StatusCode::DATA_LOSS:
+ return "DATA_LOSS";
+ case libtextclassifier3::StatusCode::UNAUTHENTICATED:
+ return "UNAUTHENTICATED";
+ default:
+ return "";
+ }
+}
+
+std::string ProtoStatusCodeToString(StatusProto::Code code) {
+ switch (code) {
+ case StatusProto::OK:
+ return "OK";
+ case StatusProto::UNKNOWN:
+ return "UNKNOWN";
+ case StatusProto::INVALID_ARGUMENT:
+ return "INVALID_ARGUMENT";
+ case StatusProto::NOT_FOUND:
+ return "NOT_FOUND";
+ case StatusProto::ALREADY_EXISTS:
+ return "ALREADY_EXISTS";
+ case StatusProto::OUT_OF_SPACE:
+ return "OUT_OF_SPACE";
+ case StatusProto::FAILED_PRECONDITION:
+ return "FAILED_PRECONDITION";
+ case StatusProto::ABORTED:
+ return "ABORTED";
+ case StatusProto::INTERNAL:
+ return "INTERNAL";
+ case StatusProto::WARNING_DATA_LOSS:
+ return "WARNING_DATA_LOSS";
+ default:
+ return "";
+ }
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/common-matchers.h b/icing/testing/common-matchers.h
index 25f6249..7d8e0cb 100644
--- a/icing/testing/common-matchers.h
+++ b/icing/testing/common-matchers.h
@@ -15,27 +15,40 @@
#ifndef ICING_TESTING_COMMON_MATCHERS_H_
#define ICING_TESTING_COMMON_MATCHERS_H_
+#include <algorithm>
+#include <cinttypes>
+#include <cmath>
+#include <string>
+#include <vector>
+
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/status_macros.h"
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_join.h"
#include "icing/index/hit/doc-hit-info.h"
+#include "icing/index/hit/hit.h"
+#include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
+#include "icing/index/iterator/doc-hit-info-iterator.h"
#include "icing/legacy/core/icing-string-util.h"
+#include "icing/portable/equals-proto.h"
+#include "icing/proto/search.pb.h"
+#include "icing/proto/status.pb.h"
+#include "icing/schema/joinable-property.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
-#include "icing/util/status-macros.h"
+#include "icing/scoring/scored-document-hit.h"
namespace icing {
namespace lib {
// Used to match Token(Token::Type type, std::string_view text)
MATCHER_P2(EqualsToken, type, text, "") {
+ std::string arg_string(arg.text.data(), arg.text.length());
if (arg.type != type || arg.text != text) {
*result_listener << IcingStringUtil::StringPrintf(
"(Expected: type=%d, text=\"%s\". Actual: type=%d, text=\"%s\")", type,
- &text[0], arg.type, arg.text.data());
+ text, arg.type, arg_string.c_str());
return false;
}
return true;
@@ -46,30 +59,199 @@ MATCHER_P2(EqualsDocHitInfo, document_id, section_ids, "") {
const DocHitInfo& actual = arg;
SectionIdMask section_mask = kSectionIdMaskNone;
for (SectionId section_id : section_ids) {
- section_mask |= 1U << section_id;
+ section_mask |= UINT64_C(1) << section_id;
}
*result_listener << IcingStringUtil::StringPrintf(
- "(actual is {document_id=%d, section_mask=%d}, but expected was "
- "{document_id=%d, section_mask=%d}.)",
+ "(actual is {document_id=%d, section_mask=%" PRIu64
+ "}, but expected was "
+ "{document_id=%d, section_mask=%" PRIu64 "}.)",
actual.document_id(), actual.hit_section_ids_mask(), document_id,
section_mask);
return actual.document_id() == document_id &&
actual.hit_section_ids_mask() == section_mask;
}
+// Used to match a DocHitInfoIterator::CallStats
+MATCHER_P5(EqualsDocHitInfoIteratorCallStats, num_leaf_advance_calls_lite_index,
+ num_leaf_advance_calls_main_index,
+ num_leaf_advance_calls_integer_index,
+ num_leaf_advance_calls_no_index, num_blocks_inspected, "") {
+ const DocHitInfoIterator::CallStats& actual = arg;
+ *result_listener << IcingStringUtil::StringPrintf(
+ "(actual is {num_leaf_advance_calls_lite_index=%d, "
+ "num_leaf_advance_calls_main_index=%d, "
+ "num_leaf_advance_calls_integer_index=%d, "
+ "num_leaf_advance_calls_no_index=%d, num_blocks_inspected=%d}, but "
+ "expected was {num_leaf_advance_calls_lite_index=%d, "
+ "num_leaf_advance_calls_main_index=%d, "
+ "num_leaf_advance_calls_integer_index=%d, "
+ "num_leaf_advance_calls_no_index=%d, num_blocks_inspected=%d}.)",
+ actual.num_leaf_advance_calls_lite_index,
+ actual.num_leaf_advance_calls_main_index,
+ actual.num_leaf_advance_calls_integer_index,
+ actual.num_leaf_advance_calls_no_index, actual.num_blocks_inspected,
+ num_leaf_advance_calls_lite_index, num_leaf_advance_calls_main_index,
+ num_leaf_advance_calls_integer_index, num_leaf_advance_calls_no_index,
+ num_blocks_inspected);
+ return actual.num_leaf_advance_calls_lite_index ==
+ num_leaf_advance_calls_lite_index &&
+ actual.num_leaf_advance_calls_main_index ==
+ num_leaf_advance_calls_main_index &&
+ actual.num_leaf_advance_calls_integer_index ==
+ num_leaf_advance_calls_integer_index &&
+ actual.num_leaf_advance_calls_no_index ==
+ num_leaf_advance_calls_no_index &&
+ actual.num_blocks_inspected == num_blocks_inspected;
+}
+
+struct ExtractTermFrequenciesResult {
+ std::array<Hit::TermFrequency, kTotalNumSections> term_frequencies = {0};
+ SectionIdMask section_mask = kSectionIdMaskNone;
+};
+// Extracts the term frequencies represented by the section_ids_tf_map.
+// Returns:
+// - a SectionIdMask representing all sections that appear as entries in the
+// map, even if they have an entry with term_frequency==0
+// - an array representing the term frequencies for each section. Sections not
+// present in section_ids_tf_map have a term frequency of 0.
+ExtractTermFrequenciesResult ExtractTermFrequencies(
+ const std::unordered_map<SectionId, Hit::TermFrequency>&
+ section_ids_tf_map);
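+
+// Illustrative example (values are hypothetical): a section_ids_tf_map of
+// {{/*section_id=*/2, /*tf=*/5}, {/*section_id=*/4, /*tf=*/1}} yields
+// term_frequencies[2] == 5, term_frequencies[4] == 1, all other entries 0,
+// and section_mask == ((UINT64_C(1) << 2) | (UINT64_C(1) << 4)).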
+
+struct CheckTermFrequencyResult {
+ std::string expected_term_frequencies_str;
+ std::string actual_term_frequencies_str;
+ bool term_frequencies_match = true;
+};
+// Checks that the term frequencies in actual_term_frequencies match those in
+// expected_term_frequencies; sections with no expected usage should hold a
+// term frequency of 0.
+// Returns:
+// - a bool indicating if the term frequencies match
+// - debug strings representing the contents of the actual and expected term
+//   frequency arrays.
+CheckTermFrequencyResult CheckTermFrequency(
+ const std::array<Hit::TermFrequency, kTotalNumSections>&
+ expected_term_frequencies,
+ const std::array<Hit::TermFrequency, kTotalNumSections>&
+ actual_term_frequencies);
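+
+// Illustrative example (values are hypothetical): if expected_term_frequencies
+// begins {5, 0, ...} and actual_term_frequencies begins {5, 3, ...}, then
+// term_frequencies_match is false and the *_str fields hold debug strings like
+// "[5,0,...]" and "[5,3,...]" built with absl_ports::StrJoin.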
+
+// Used to match a DocHitInfo
+MATCHER_P2(EqualsDocHitInfoWithTermFrequency, document_id,
+ section_ids_to_term_frequencies_map, "") {
+ const DocHitInfoTermFrequencyPair& actual = arg;
+ std::array<Hit::TermFrequency, kTotalNumSections> actual_tf_array;
+ for (SectionId section_id = 0; section_id < kTotalNumSections; ++section_id) {
+ actual_tf_array[section_id] = actual.hit_term_frequency(section_id);
+ }
+ ExtractTermFrequenciesResult expected =
+ ExtractTermFrequencies(section_ids_to_term_frequencies_map);
+ CheckTermFrequencyResult check_tf_result =
+ CheckTermFrequency(expected.term_frequencies, actual_tf_array);
+
+ *result_listener << IcingStringUtil::StringPrintf(
+ "(actual is {document_id=%d, section_mask=%" PRIu64
+ ", term_frequencies=%s}, but expected was "
+ "{document_id=%d, section_mask=%" PRIu64 ", term_frequencies=%s}.)",
+ actual.doc_hit_info().document_id(),
+ actual.doc_hit_info().hit_section_ids_mask(),
+ check_tf_result.actual_term_frequencies_str.c_str(), document_id,
+ expected.section_mask,
+ check_tf_result.expected_term_frequencies_str.c_str());
+ return actual.doc_hit_info().document_id() == document_id &&
+ actual.doc_hit_info().hit_section_ids_mask() ==
+ expected.section_mask &&
+ check_tf_result.term_frequencies_match;
+}
+
+MATCHER_P2(EqualsTermMatchInfo, term, section_ids_to_term_frequencies_map, "") {
+ const TermMatchInfo& actual = arg;
+ std::string term_str(term);
+ ExtractTermFrequenciesResult expected =
+ ExtractTermFrequencies(section_ids_to_term_frequencies_map);
+ CheckTermFrequencyResult check_tf_result =
+ CheckTermFrequency(expected.term_frequencies, actual.term_frequencies);
+ *result_listener << IcingStringUtil::StringPrintf(
+ "(actual is {term=%s, section_mask=%" PRIu64
+ ", term_frequencies=%s}, but expected was "
+ "{term=%s, section_mask=%" PRIu64 ", term_frequencies=%s}.)",
+ actual.term.data(), actual.section_ids_mask,
+ check_tf_result.actual_term_frequencies_str.c_str(), term_str.data(),
+ expected.section_mask,
+ check_tf_result.expected_term_frequencies_str.c_str());
+ return actual.term == term &&
+ actual.section_ids_mask == expected.section_mask &&
+ check_tf_result.term_frequencies_match;
+}
+
+class ScoredDocumentHitFormatter {
+ public:
+ std::string operator()(const ScoredDocumentHit& scored_document_hit) {
+ return IcingStringUtil::StringPrintf(
+ "(document_id=%d, hit_section_id_mask=%" PRId64 ", score=%.2f)",
+ scored_document_hit.document_id(),
+ scored_document_hit.hit_section_id_mask(), scored_document_hit.score());
+ }
+};
+
+class ScoredDocumentHitEqualComparator {
+ public:
+ bool operator()(const ScoredDocumentHit& lhs,
+ const ScoredDocumentHit& rhs) const {
+ return lhs.document_id() == rhs.document_id() &&
+ lhs.hit_section_id_mask() == rhs.hit_section_id_mask() &&
+ std::fabs(lhs.score() - rhs.score()) < 1e-6;
+ }
+};
+
// Used to match a ScoredDocumentHit
MATCHER_P(EqualsScoredDocumentHit, expected_scored_document_hit, "") {
- if (arg.document_id() != expected_scored_document_hit.document_id() ||
- arg.hit_section_id_mask() !=
- expected_scored_document_hit.hit_section_id_mask() ||
- arg.score() != expected_scored_document_hit.score()) {
+ ScoredDocumentHitEqualComparator equal_comparator;
+ if (!equal_comparator(arg, expected_scored_document_hit)) {
+ ScoredDocumentHitFormatter formatter;
+ *result_listener << "Expected: " << formatter(expected_scored_document_hit)
+ << ". Actual: " << formatter(arg);
+ return false;
+ }
+ return true;
+}
+
+// Used to match a JoinedScoredDocumentHit
+MATCHER_P(EqualsJoinedScoredDocumentHit, expected_joined_scored_document_hit,
+ "") {
+ ScoredDocumentHitEqualComparator equal_comparator;
+ if (std::fabs(arg.final_score() -
+ expected_joined_scored_document_hit.final_score()) > 1e-6 ||
+ !equal_comparator(
+ arg.parent_scored_document_hit(),
+ expected_joined_scored_document_hit.parent_scored_document_hit()) ||
+ arg.child_scored_document_hits().size() !=
+ expected_joined_scored_document_hit.child_scored_document_hits()
+ .size() ||
+ !std::equal(
+ arg.child_scored_document_hits().cbegin(),
+ arg.child_scored_document_hits().cend(),
+ expected_joined_scored_document_hit.child_scored_document_hits()
+ .cbegin(),
+ equal_comparator)) {
+ ScoredDocumentHitFormatter formatter;
+
*result_listener << IcingStringUtil::StringPrintf(
- "Expected: document_id=%d, hit_section_id_mask=%d, score=%.2f. Actual: "
- "document_id=%d, hit_section_id_mask=%d, score=%.2f",
- expected_scored_document_hit.document_id(),
- expected_scored_document_hit.hit_section_id_mask(),
- expected_scored_document_hit.score(), arg.document_id(),
- arg.hit_section_id_mask(), arg.score());
+ "Expected: final_score=%.2f, parent_scored_document_hit=%s, "
+ "child_scored_document_hits=[%s]. Actual: final_score=%.2f, "
+ "parent_scored_document_hit=%s, child_scored_document_hits=[%s]",
+ expected_joined_scored_document_hit.final_score(),
+ formatter(
+ expected_joined_scored_document_hit.parent_scored_document_hit())
+ .c_str(),
+ absl_ports::StrJoin(
+ expected_joined_scored_document_hit.child_scored_document_hits(),
+ ",", formatter)
+ .c_str(),
+ arg.final_score(), formatter(arg.parent_scored_document_hit()).c_str(),
+ absl_ports::StrJoin(arg.child_scored_document_hits(), ",", formatter)
+ .c_str());
return false;
}
return true;
@@ -79,7 +261,6 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") {
const SchemaStore::SetSchemaResult& actual = arg;
if (actual.success == expected.success &&
- actual.index_incompatible == expected.index_incompatible &&
actual.old_schema_type_ids_changed ==
expected.old_schema_type_ids_changed &&
actual.schema_types_deleted_by_name ==
@@ -89,7 +270,14 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") {
actual.schema_types_incompatible_by_name ==
expected.schema_types_incompatible_by_name &&
actual.schema_types_incompatible_by_id ==
- expected.schema_types_incompatible_by_id) {
+ expected.schema_types_incompatible_by_id &&
+ actual.schema_types_new_by_name == expected.schema_types_new_by_name &&
+ actual.schema_types_changed_fully_compatible_by_name ==
+ expected.schema_types_changed_fully_compatible_by_name &&
+ actual.schema_types_index_incompatible_by_name ==
+ expected.schema_types_index_incompatible_by_name &&
+ actual.schema_types_join_incompatible_by_name ==
+ expected.schema_types_join_incompatible_by_name) {
return true;
}
@@ -149,81 +337,133 @@ MATCHER_P(EqualsSetSchemaResult, expected, "") {
absl_ports::NumberFormatter()),
"]");
+ // Format schema_types_new_by_name
+ std::string actual_schema_types_new_by_name = absl_ports::StrCat(
+ "[", absl_ports::StrJoin(actual.schema_types_new_by_name, ","), "]");
+
+ std::string expected_schema_types_new_by_name = absl_ports::StrCat(
+ "[", absl_ports::StrJoin(expected.schema_types_new_by_name, ","), "]");
+
+ // Format schema_types_changed_fully_compatible_by_name
+ std::string actual_schema_types_changed_fully_compatible_by_name =
+ absl_ports::StrCat(
+ "[",
+ absl_ports::StrJoin(
+ actual.schema_types_changed_fully_compatible_by_name, ","),
+ "]");
+
+ std::string expected_schema_types_changed_fully_compatible_by_name =
+ absl_ports::StrCat(
+ "[",
+ absl_ports::StrJoin(
+ expected.schema_types_changed_fully_compatible_by_name, ","),
+ "]");
+
+ // Format schema_types_index_incompatible_by_name
+ std::string actual_schema_types_index_incompatible_by_name =
+ absl_ports::StrCat(
+ "[",
+ absl_ports::StrJoin(actual.schema_types_index_incompatible_by_name,
+ ","),
+ "]");
+
+ std::string expected_schema_types_index_incompatible_by_name =
+ absl_ports::StrCat(
+ "[",
+ absl_ports::StrJoin(expected.schema_types_index_incompatible_by_name,
+ ","),
+ "]");
+
+ // Format schema_types_join_incompatible_by_name
+ std::string actual_schema_types_join_incompatible_by_name =
+ absl_ports::StrCat(
+ "[",
+ absl_ports::StrJoin(actual.schema_types_join_incompatible_by_name,
+ ","),
+ "]");
+
+ std::string expected_schema_types_join_incompatible_by_name =
+ absl_ports::StrCat(
+ "[",
+ absl_ports::StrJoin(expected.schema_types_join_incompatible_by_name,
+ ","),
+ "]");
+
*result_listener << IcingStringUtil::StringPrintf(
"\nExpected {\n"
"\tsuccess=%d,\n"
- "\tindex_incompatible=%d,\n"
"\told_schema_type_ids_changed=%s,\n"
"\tschema_types_deleted_by_name=%s,\n"
"\tschema_types_deleted_by_id=%s,\n"
"\tschema_types_incompatible_by_name=%s,\n"
"\tschema_types_incompatible_by_id=%s\n"
+ "\tschema_types_new_by_name=%s,\n"
+ "\tschema_types_changed_fully_compatible_by_name=%s\n"
+ "\tschema_types_index_incompatible_by_name=%s,\n"
+ "\tschema_types_join_incompatible_by_name=%s\n"
"}\n"
"Actual {\n"
"\tsuccess=%d,\n"
- "\tindex_incompatible=%d,\n"
"\told_schema_type_ids_changed=%s,\n"
"\tschema_types_deleted_by_name=%s,\n"
"\tschema_types_deleted_by_id=%s,\n"
"\tschema_types_incompatible_by_name=%s,\n"
"\tschema_types_incompatible_by_id=%s\n"
+ "\tschema_types_new_by_name=%s,\n"
+ "\tschema_types_changed_fully_compatible_by_name=%s\n"
+ "\tschema_types_index_incompatible_by_name=%s,\n"
+ "\tschema_types_join_incompatible_by_name=%s\n"
"}\n",
- expected.success, expected.index_incompatible,
- expected_old_schema_type_ids_changed.c_str(),
+ expected.success, expected_old_schema_type_ids_changed.c_str(),
expected_schema_types_deleted_by_name.c_str(),
expected_schema_types_deleted_by_id.c_str(),
expected_schema_types_incompatible_by_name.c_str(),
- expected_schema_types_incompatible_by_id.c_str(), actual.success,
- actual.index_incompatible, actual_old_schema_type_ids_changed.c_str(),
+ expected_schema_types_incompatible_by_id.c_str(),
+ expected_schema_types_new_by_name.c_str(),
+ expected_schema_types_changed_fully_compatible_by_name.c_str(),
+ expected_schema_types_index_incompatible_by_name.c_str(),
+ expected_schema_types_join_incompatible_by_name.c_str(), actual.success,
+ actual_old_schema_type_ids_changed.c_str(),
actual_schema_types_deleted_by_name.c_str(),
actual_schema_types_deleted_by_id.c_str(),
actual_schema_types_incompatible_by_name.c_str(),
- actual_schema_types_incompatible_by_id.c_str());
-
+ actual_schema_types_incompatible_by_id.c_str(),
+ actual_schema_types_new_by_name.c_str(),
+ actual_schema_types_changed_fully_compatible_by_name.c_str(),
+ actual_schema_types_index_incompatible_by_name.c_str(),
+ actual_schema_types_join_incompatible_by_name.c_str());
return false;
}
-std::string StatusCodeToString(libtextclassifier3::StatusCode code) {
- switch (code) {
- case libtextclassifier3::StatusCode::OK:
- return "OK";
- case libtextclassifier3::StatusCode::CANCELLED:
- return "CANCELLED";
- case libtextclassifier3::StatusCode::UNKNOWN:
- return "UNKNOWN";
- case libtextclassifier3::StatusCode::INVALID_ARGUMENT:
- return "INVALID_ARGUMENT";
- case libtextclassifier3::StatusCode::DEADLINE_EXCEEDED:
- return "DEADLINE_EXCEEDED";
- case libtextclassifier3::StatusCode::NOT_FOUND:
- return "NOT_FOUND";
- case libtextclassifier3::StatusCode::ALREADY_EXISTS:
- return "ALREADY_EXISTS";
- case libtextclassifier3::StatusCode::PERMISSION_DENIED:
- return "PERMISSION_DENIED";
- case libtextclassifier3::StatusCode::RESOURCE_EXHAUSTED:
- return "RESOURCE_EXHAUSTED";
- case libtextclassifier3::StatusCode::FAILED_PRECONDITION:
- return "FAILED_PRECONDITION";
- case libtextclassifier3::StatusCode::ABORTED:
- return "ABORTED";
- case libtextclassifier3::StatusCode::OUT_OF_RANGE:
- return "OUT_OF_RANGE";
- case libtextclassifier3::StatusCode::UNIMPLEMENTED:
- return "UNIMPLEMENTED";
- case libtextclassifier3::StatusCode::INTERNAL:
- return "INTERNAL";
- case libtextclassifier3::StatusCode::UNAVAILABLE:
- return "UNAVAILABLE";
- case libtextclassifier3::StatusCode::DATA_LOSS:
- return "DATA_LOSS";
- case libtextclassifier3::StatusCode::UNAUTHENTICATED:
- return "UNAUTHENTICATED";
- default:
- return "";
- }
+MATCHER_P3(EqualsSectionMetadata, expected_id, expected_property_path,
+ expected_property_config_proto, "") {
+ const SectionMetadata& actual = arg;
+ return actual.id == expected_id && actual.path == expected_property_path &&
+ actual.data_type == expected_property_config_proto.data_type() &&
+ actual.tokenizer ==
+ expected_property_config_proto.string_indexing_config()
+ .tokenizer_type() &&
+ actual.term_match_type ==
+ expected_property_config_proto.string_indexing_config()
+ .term_match_type() &&
+ actual.numeric_match_type ==
+ expected_property_config_proto.integer_indexing_config()
+ .numeric_match_type();
+}
+
+MATCHER_P3(EqualsJoinablePropertyMetadata, expected_id, expected_property_path,
+ expected_property_config_proto, "") {
+ const JoinablePropertyMetadata& actual = arg;
+ return actual.id == expected_id && actual.path == expected_property_path &&
+ actual.data_type == expected_property_config_proto.data_type() &&
+ actual.value_type ==
+ expected_property_config_proto.joinable_config().value_type();
}
+std::string StatusCodeToString(libtextclassifier3::StatusCode code);
+
+std::string ProtoStatusCodeToString(StatusProto::Code code);
+
MATCHER(IsOk, "") {
libtextclassifier3::StatusAdapter adapter(arg);
if (adapter.status().ok()) {
@@ -274,6 +514,68 @@ MATCHER_P2(StatusIs, status_code, error_matcher, "") {
result_listener);
}
+MATCHER(ProtoIsOk, "") {
+ if (arg.code() == StatusProto::OK) {
+ return true;
+ }
+ *result_listener << IcingStringUtil::StringPrintf(
+ "Expected OK, actual was (%s:%s)",
+ ProtoStatusCodeToString(arg.code()).c_str(), arg.message().c_str());
+ return false;
+}
+
+MATCHER_P(ProtoStatusIs, status_code, "") {
+ if (arg.code() == status_code) {
+ return true;
+ }
+ *result_listener << IcingStringUtil::StringPrintf(
+ "Expected (%s:), actual was (%s:%s)",
+ ProtoStatusCodeToString(status_code).c_str(),
+ ProtoStatusCodeToString(arg.code()).c_str(), arg.message().c_str());
+ return false;
+}
+
+MATCHER_P2(ProtoStatusIs, status_code, error_matcher, "") {
+ if (arg.code() != status_code) {
+ *result_listener << IcingStringUtil::StringPrintf(
+ "Expected (%s:), actual was (%s:%s)",
+ ProtoStatusCodeToString(status_code).c_str(),
+ ProtoStatusCodeToString(arg.code()).c_str(), arg.message().c_str());
+ return false;
+ }
+ return ExplainMatchResult(error_matcher, arg.message(), result_listener);
+}
+
+MATCHER_P(EqualsSearchResultIgnoreStatsAndScores, expected, "") {
+ SearchResultProto actual_copy = arg;
+ actual_copy.clear_query_stats();
+ actual_copy.clear_debug_info();
+ for (SearchResultProto::ResultProto& result :
+ *actual_copy.mutable_results()) {
+ // Joined results
+ for (SearchResultProto::ResultProto& joined_result :
+ *result.mutable_joined_results()) {
+ joined_result.clear_score();
+ }
+ result.clear_score();
+ }
+
+ SearchResultProto expected_copy = expected;
+ expected_copy.clear_query_stats();
+ expected_copy.clear_debug_info();
+ for (SearchResultProto::ResultProto& result :
+ *expected_copy.mutable_results()) {
+ // Joined results
+ for (SearchResultProto::ResultProto& joined_result :
+ *result.mutable_joined_results()) {
+ joined_result.clear_score();
+ }
+ result.clear_score();
+ }
+ return ExplainMatchResult(portable_equals_proto::EqualsProto(expected_copy),
+ actual_copy, result_listener);
+}
+
// TODO(tjbarron) Remove this once icing has switched to depend on TC3 Status
#define ICING_STATUS_MACROS_CONCAT_NAME(x, y) \
ICING_STATUS_MACROS_CONCAT_IMPL(x, y)
@@ -290,6 +592,10 @@ MATCHER_P2(StatusIs, status_code, error_matcher, "") {
ICING_ASSERT_OK(statusor.status()); \
lhs = std::move(statusor).ValueOrDie()
+#define ICING_ASSERT_HAS_VALUE_AND_ASSIGN(lhs, rexpr) \
+ ASSERT_TRUE(rexpr); \
+ lhs = rexpr.value()
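+
+// Example usage of the macro above (illustrative; maybe_hit is a hypothetical
+// std::optional<Hit>):
+//   ICING_ASSERT_HAS_VALUE_AND_ASSIGN(Hit hit, maybe_hit);
+// Note that rexpr is evaluated twice (once by ASSERT_TRUE and once by
+// value()), so pass a variable rather than an expression with side effects.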
+
} // namespace lib
} // namespace icing
diff --git a/icing/testing/fake-clock.h b/icing/testing/fake-clock.h
index 54b56c3..f451753 100644
--- a/icing/testing/fake-clock.h
+++ b/icing/testing/fake-clock.h
@@ -20,6 +20,22 @@
namespace icing {
namespace lib {
+// A fake timer class for tests. It returns a fixed elapsed time that tests
+// can set explicitly via SetElapsedMilliseconds.
+class FakeTimer : public Timer {
+ public:
+ int64_t GetElapsedMilliseconds() const override {
+ return fake_elapsed_milliseconds_;
+ }
+
+ void SetElapsedMilliseconds(int64_t elapsed_milliseconds) {
+ fake_elapsed_milliseconds_ = elapsed_milliseconds;
+ }
+
+ private:
+ int64_t fake_elapsed_milliseconds_ = 0;
+};
+
// Wrapper around real-time clock functions. This is separated primarily so
// tests can override this clock and inject it into the class under test.
class FakeClock : public Clock {
@@ -30,8 +46,17 @@ class FakeClock : public Clock {
milliseconds_ = milliseconds;
}
+ std::unique_ptr<Timer> GetNewTimer() const override {
+ return std::make_unique<FakeTimer>(fake_timer_);
+ }
+
+ void SetTimerElapsedMilliseconds(int64_t timer_elapsed_milliseconds) {
+ fake_timer_.SetElapsedMilliseconds(timer_elapsed_milliseconds);
+ }
+
private:
int64_t milliseconds_ = 0;
+ FakeTimer fake_timer_;
};
} // namespace lib
diff --git a/icing/testing/fake-clock_test.cc b/icing/testing/fake-clock_test.cc
index 3c75ae9..4b36727 100644
--- a/icing/testing/fake-clock_test.cc
+++ b/icing/testing/fake-clock_test.cc
@@ -24,7 +24,7 @@ namespace {
using ::testing::Eq;
-TEST(FakeClockTest, GetSetOk) {
+TEST(FakeClockTest, GetSetSystemTimeOk) {
FakeClock fake_clock;
EXPECT_THAT(fake_clock.GetSystemTimeMilliseconds(), Eq(0));
@@ -35,6 +35,17 @@ TEST(FakeClockTest, GetSetOk) {
EXPECT_THAT(fake_clock.GetSystemTimeMilliseconds(), Eq(-1));
}
+TEST(FakeClockTest, GetSetTimerElapsedTimeOk) {
+ FakeClock fake_clock;
+ EXPECT_THAT(fake_clock.GetNewTimer()->GetElapsedMilliseconds(), Eq(0));
+
+ fake_clock.SetTimerElapsedMilliseconds(10);
+ EXPECT_THAT(fake_clock.GetNewTimer()->GetElapsedMilliseconds(), Eq(10));
+
+ fake_clock.SetTimerElapsedMilliseconds(-1);
+ EXPECT_THAT(fake_clock.GetNewTimer()->GetElapsedMilliseconds(), Eq(-1));
+}
+
} // namespace
} // namespace lib
diff --git a/icing/testing/hit-test-utils.cc b/icing/testing/hit-test-utils.cc
new file mode 100644
index 0000000..7ad8a64
--- /dev/null
+++ b/icing/testing/hit-test-utils.cc
@@ -0,0 +1,59 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/testing/hit-test-utils.h"
+
+namespace icing {
+namespace lib {
+
+// Returns a hit that has a delta of desired_byte_length from last_hit.
+Hit CreateHit(Hit last_hit, int desired_byte_length) {
+ Hit hit = (last_hit.section_id() == kMinSectionId)
+ ? Hit(kMaxSectionId, last_hit.document_id() + 1,
+ last_hit.term_frequency())
+ : Hit(last_hit.section_id() - 1, last_hit.document_id(),
+ last_hit.term_frequency());
+ uint8_t buf[5];
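+ // VarInt::Encode writes at most 5 bytes for a 32-bit delta. Keep stepping
+ // the hit forward (to the previous section id, or to the next document id
+ // once the sections are exhausted) until the delta from last_hit encodes to
+ // at least desired_byte_length bytes.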
+ while (VarInt::Encode(last_hit.value() - hit.value(), buf) <
+ desired_byte_length) {
+ hit = (hit.section_id() == kMinSectionId)
+ ? Hit(kMaxSectionId, hit.document_id() + 1, hit.term_frequency())
+ : Hit(hit.section_id() - 1, hit.document_id(),
+ hit.term_frequency());
+ }
+ return hit;
+}
+
+// Returns a vector of num_hits Hits with the first hit starting at start_docid
+// and with desired_byte_length deltas.
+std::vector<Hit> CreateHits(DocumentId start_docid, int num_hits,
+ int desired_byte_length) {
+ std::vector<Hit> hits;
+ if (num_hits < 1) {
+ return hits;
+ }
+ hits.push_back(Hit(/*section_id=*/1, /*document_id=*/start_docid,
+ Hit::kDefaultTermFrequency));
+ while (hits.size() < num_hits) {
+ hits.push_back(CreateHit(hits.back(), desired_byte_length));
+ }
+ return hits;
+}
+
+std::vector<Hit> CreateHits(int num_hits, int desired_byte_length) {
+ return CreateHits(/*start_docid=*/0, num_hits, desired_byte_length);
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/hit-test-utils.h b/icing/testing/hit-test-utils.h
new file mode 100644
index 0000000..e236ec0
--- /dev/null
+++ b/icing/testing/hit-test-utils.h
@@ -0,0 +1,43 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_HIT_TEST_UTILS_H_
+#define ICING_TESTING_HIT_TEST_UTILS_H_
+
+#include <vector>
+
+#include "icing/index/hit/hit.h"
+#include "icing/legacy/index/icing-bit-util.h"
+#include "icing/schema/section.h"
+#include "icing/store/document-id.h"
+
+namespace icing {
+namespace lib {
+
+// Returns a hit that has a delta of desired_byte_length from last_hit.
+Hit CreateHit(Hit last_hit, int desired_byte_length);
+
+// Returns a vector of num_hits Hits with the first hit starting at start_docid
+// and with desired_byte_length deltas.
+std::vector<Hit> CreateHits(DocumentId start_docid, int num_hits,
+ int desired_byte_length);
+
+// Returns a vector of num_hits Hits with the first hit starting at 0 and each
+// with desired_byte_length deltas.
+std::vector<Hit> CreateHits(int num_hits, int desired_byte_length);
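+
+// Illustrative example (values are hypothetical):
+//   CreateHits(/*num_hits=*/3, /*desired_byte_length=*/2) returns 3 hits, the
+//   first on document id 0, where each consecutive pair of hit values has a
+//   delta that takes 2 bytes to VarInt-encode.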
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_HIT_TEST_UTILS_H_
diff --git a/icing/helpers/icu/icu-data-file-helper.cc b/icing/testing/icu-data-file-helper.cc
index 5cf6a1d..aaeb738 100644
--- a/icing/helpers/icu/icu-data-file-helper.cc
+++ b/icing/testing/icu-data-file-helper.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/testing/icu-data-file-helper.h"
#include <sys/mman.h>
@@ -49,8 +49,6 @@ libtextclassifier3::Status SetUpICUDataFile(
return absl_ports::InternalError("Unable to open file at provided path");
}
- // TODO(samzheng): figure out why icing::MemoryMappedFile causes
- // segmentation fault here.
const void* data =
mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd.get(), 0);
diff --git a/icing/helpers/icu/icu-data-file-helper.h b/icing/testing/icu-data-file-helper.h
index 90f5bc7..d0276e7 100644
--- a/icing/helpers/icu/icu-data-file-helper.h
+++ b/icing/testing/icu-data-file-helper.h
@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
-#define ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
+#ifndef ICING_TESTING_ICU_DATA_FILE_HELPER
+#define ICING_TESTING_ICU_DATA_FILE_HELPER
#include "icing/text_classifier/lib3/utils/base/status.h"
@@ -40,4 +40,4 @@ libtextclassifier3::Status SetUpICUDataFile(
} // namespace lib
} // namespace icing
-#endif // ICING_HELPERS_ICU_ICU_DATA_FILE_HELPER
+#endif // ICING_TESTING_ICU_DATA_FILE_HELPER
diff --git a/icing/testing/icu-i18n-test-utils.cc b/icing/testing/icu-i18n-test-utils.cc
index 09878db..50dc26c 100644
--- a/icing/testing/icu-i18n-test-utils.cc
+++ b/icing/testing/icu-i18n-test-utils.cc
@@ -29,7 +29,7 @@ std::string UCharToString(UChar32 uchar) {
uint8_t utf8_buffer[4]; // U8_APPEND writes 0 to 4 bytes
int utf8_index = 0;
- UBool has_error = FALSE;
+ UBool has_error = false;
// utf8_index is advanced to the end of the contents if successful
U8_APPEND(utf8_buffer, utf8_index, sizeof(utf8_buffer), uchar, has_error);
diff --git a/icing/testing/jni-test-helpers.h b/icing/testing/jni-test-helpers.h
index adc469a..67a98c3 100644
--- a/icing/testing/jni-test-helpers.h
+++ b/icing/testing/jni-test-helpers.h
@@ -15,6 +15,8 @@
#ifndef ICING_TESTING_JNI_TEST_HELPERS_H_
#define ICING_TESTING_JNI_TEST_HELPERS_H_
+#include <memory>
+
#include "icing/jni/jni-cache.h"
#ifdef ICING_REVERSE_JNI_SEGMENTATION
diff --git a/icing/testing/numeric/normal-distribution-number-generator.h b/icing/testing/numeric/normal-distribution-number-generator.h
new file mode 100644
index 0000000..73cdd1f
--- /dev/null
+++ b/icing/testing/numeric/normal-distribution-number-generator.h
@@ -0,0 +1,42 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_NUMERIC_NORMAL_DISTRIBUTION_NUMBER_GENERATOR_H_
+#define ICING_TESTING_NUMERIC_NORMAL_DISTRIBUTION_NUMBER_GENERATOR_H_
+
+#include <cmath>
+#include <random>
+
+#include "icing/testing/numeric/number-generator.h"
+
+namespace icing {
+namespace lib {
+
+template <typename T>
+class NormalDistributionNumberGenerator : public NumberGenerator<T> {
+ public:
+ explicit NormalDistributionNumberGenerator(int seed, double mean,
+ double stddev)
+ : NumberGenerator<T>(seed), distribution_(mean, stddev) {}
+
+ T Generate() override { return std::round(distribution_(this->engine_)); }
+
+ private:
+ std::normal_distribution<> distribution_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_NUMERIC_NORMAL_DISTRIBUTION_NUMBER_GENERATOR_H_
diff --git a/icing/testing/numeric/number-generator.h b/icing/testing/numeric/number-generator.h
new file mode 100644
index 0000000..bb601b4
--- /dev/null
+++ b/icing/testing/numeric/number-generator.h
@@ -0,0 +1,39 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_NUMERIC_NUMBER_GENERATOR_H_
+#define ICING_TESTING_NUMERIC_NUMBER_GENERATOR_H_
+
+#include <random>
+
+namespace icing {
+namespace lib {
+
+template <typename T>
+class NumberGenerator {
+ public:
+ virtual ~NumberGenerator() = default;
+
+ virtual T Generate() = 0;
+
+ protected:
+ explicit NumberGenerator(int seed) : engine_(seed) {}
+
+ std::default_random_engine engine_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_NUMERIC_NUMBER_GENERATOR_H_
diff --git a/icing/testing/numeric/uniform-distribution-integer-generator.h b/icing/testing/numeric/uniform-distribution-integer-generator.h
new file mode 100644
index 0000000..569eebd
--- /dev/null
+++ b/icing/testing/numeric/uniform-distribution-integer-generator.h
@@ -0,0 +1,41 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_NUMERIC_UNIFORM_DISTRIBUTION_INTEGER_GENERATOR_H_
+#define ICING_TESTING_NUMERIC_UNIFORM_DISTRIBUTION_INTEGER_GENERATOR_H_
+
+#include <random>
+
+#include "icing/testing/numeric/number-generator.h"
+
+namespace icing {
+namespace lib {
+
+template <typename T>
+class UniformDistributionIntegerGenerator : public NumberGenerator<T> {
+ public:
+ explicit UniformDistributionIntegerGenerator(int seed, T range_lower,
+ T range_upper)
+ : NumberGenerator<T>(seed), distribution_(range_lower, range_upper) {}
+
+ T Generate() override { return distribution_(this->engine_); }
+
+ private:
+ std::uniform_int_distribution<T> distribution_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_NUMERIC_UNIFORM_DISTRIBUTION_INTEGER_GENERATOR_H_
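
A short sketch of the two distributions above in use (a hypothetical snippet; both generators share the engine seeded through the NumberGenerator base):

#include <cstdint>
#include "icing/testing/numeric/normal-distribution-number-generator.h"
#include "icing/testing/numeric/uniform-distribution-integer-generator.h"

void GeneratorSketch() {
  using icing::lib::NormalDistributionNumberGenerator;
  using icing::lib::UniformDistributionIntegerGenerator;

  // Deterministic for a fixed seed.
  UniformDistributionIntegerGenerator<int64_t> uniform(
      /*seed=*/42, /*range_lower=*/0, /*range_upper=*/100);
  NormalDistributionNumberGenerator<int64_t> normal(
      /*seed=*/42, /*mean=*/50.0, /*stddev=*/10.0);

  int64_t u = uniform.Generate();  // uniform draw in [0, 100]
  int64_t n = normal.Generate();   // rounded draw from N(50, 10)
}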
diff --git a/icing/testing/random-string.cc b/icing/testing/random-string.cc
new file mode 100644
index 0000000..27f83bc
--- /dev/null
+++ b/icing/testing/random-string.cc
@@ -0,0 +1,54 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/testing/random-string.h"
+
+namespace icing {
+namespace lib {
+
+std::vector<std::string> GenerateUniqueTerms(int num_terms) {
+ char before_a = 'a' - 1;
+ std::string term(1, before_a);
+ std::vector<std::string> terms;
+ int current_char = 0;
+ for (int permutation = 0; permutation < num_terms; ++permutation) {
+ if (term[current_char] != 'z') {
+ ++term[current_char];
+ } else {
+ if (current_char < term.length() - 1) {
+ // The string currently looks something like "zzzaa".
+ // 1. Find the first char after this one that isn't 'z'.
+ current_char = term.find_first_not_of('z', current_char);
+ if (current_char != std::string::npos) {
+ // 2. Increment that character
+ ++term[current_char];
+
+ // 3. Set every character prior to current_char to 'a'
+ term.replace(0, current_char, current_char, 'a');
+ } else {
+ // Every character in this string is a 'z'. We need to grow.
+ term = std::string(term.length() + 1, 'a');
+ }
+ } else {
+ term = std::string(term.length() + 1, 'a');
+ }
+ current_char = 0;
+ }
+ terms.push_back(term);
+ }
+ return terms;
+}
+
+} // namespace lib
+} // namespace icing
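
To make the carry logic above concrete, a worked trace (this only exercises the function as declared in random-string.h; the expected values match the expectations in random-string_test.cc below):

#include <string>
#include <vector>
#include "icing/testing/random-string.h"

void UniqueTermsSketch() {
  // GenerateUniqueTerms treats the term as a little-endian counter over
  // 'a'..'z'. Carry example: to increment past "zzzaa", find the first
  // non-'z' (index 3), bump it to 'b', and reset the 'z' prefix to 'a's,
  // yielding "aaaba".
  std::vector<std::string> terms = icing::lib::GenerateUniqueTerms(28);
  // terms[0] == "a", terms[25] == "z", terms[26] == "aa", terms[27] == "ba"
}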
diff --git a/icing/testing/random-string.h b/icing/testing/random-string.h
index 1510e15..a313c1c 100644
--- a/icing/testing/random-string.h
+++ b/icing/testing/random-string.h
@@ -15,6 +15,7 @@
#ifndef ICING_TESTING_RANDOM_STRING_H_
#define ICING_TESTING_RANDOM_STRING_H_
+#include <algorithm>
#include <random>
#include <string>
@@ -24,10 +25,19 @@ namespace lib {
inline constexpr std::string_view kAlNumAlphabet =
"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
+// The average English word is 4.7 characters long.
+inline constexpr int kAvgTokenLen = 5;
+// Made-up value. This yields a fairly reasonable language: the majority of
+// generated words are 3-9 characters, ~3% of words are >=20 characters, and
+// the longest are 27 characters (roughly consistent with the longest
+// non-contrived English words; see
+// https://en.wikipedia.org/wiki/Longest_word_in_English).
+inline constexpr int kTokenStdDev = 7;
+
template <typename Gen>
std::string RandomString(const std::string_view alphabet, size_t len,
Gen* gen) {
- std::uniform_int_distribution<size_t> uniform(0u, alphabet.size());
+ std::uniform_int_distribution<size_t> uniform(0u, alphabet.size() - 1);
std::string result(len, '\0');
std::generate(
std::begin(result), std::end(result),
@@ -36,6 +46,26 @@ std::string RandomString(const std::string_view alphabet, size_t len,
return result;
}
+// Creates a vector containing num_words randomly-generated words for use by
+// documents.
+template <typename Rand>
+std::vector<std::string> CreateLanguages(int num_words, Rand* r) {
+ std::vector<std::string> language;
+ std::normal_distribution<> norm_dist(kAvgTokenLen, kTokenStdDev);
+ while (--num_words >= 0) {
+ int word_length = 0;
+ while (word_length < 1) {
+ word_length = std::round(norm_dist(*r));
+ }
+ language.push_back(RandomString(kAlNumAlphabet, word_length, r));
+ }
+ return language;
+}
+
+// Returns a vector containing num_terms unique terms. Terms are generated in a
+// deterministic order - "a" through "z", then "aa", "ba", ... "za", "ab",
+// "bb", etc. - with the first character varying fastest.
+std::vector<std::string> GenerateUniqueTerms(int num_terms);
+
} // namespace lib
} // namespace icing
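
A hedged sketch of how RandomString and CreateLanguages compose (the engine type is the caller's choice; std::default_random_engine is assumed here):

#include <random>
#include <string>
#include <vector>
#include "icing/testing/random-string.h"

void RandomStringSketch() {
  std::default_random_engine engine(/*seed=*/1234);
  // A 10-character token drawn uniformly from kAlNumAlphabet.
  std::string token =
      icing::lib::RandomString(icing::lib::kAlNumAlphabet, /*len=*/10, &engine);
  // 1000 words whose lengths follow N(kAvgTokenLen, kTokenStdDev), clamped
  // to at least one character.
  std::vector<std::string> language =
      icing::lib::CreateLanguages(/*num_words=*/1000, &engine);
}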
diff --git a/icing/testing/random-string_test.cc b/icing/testing/random-string_test.cc
new file mode 100644
index 0000000..759fec0
--- /dev/null
+++ b/icing/testing/random-string_test.cc
@@ -0,0 +1,54 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/testing/random-string.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+TEST(RandomStringTest, GenerateUniqueTerms) {
+ EXPECT_THAT(GenerateUniqueTerms(0), IsEmpty());
+ EXPECT_THAT(GenerateUniqueTerms(1), ElementsAre("a"));
+ EXPECT_THAT(GenerateUniqueTerms(4), ElementsAre("a", "b", "c", "d"));
+ EXPECT_THAT(GenerateUniqueTerms(29),
+ ElementsAre("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k",
+ "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v",
+ "w", "x", "y", "z", "aa", "ba", "ca"));
+ EXPECT_THAT(GenerateUniqueTerms(56),
+ ElementsAre("a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k",
+ "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v",
+ "w", "x", "y", "z", "aa", "ba", "ca", "da", "ea",
+ "fa", "ga", "ha", "ia", "ja", "ka", "la", "ma", "na",
+ "oa", "pa", "qa", "ra", "sa", "ta", "ua", "va", "wa",
+ "xa", "ya", "za", "ab", "bb", "cb", "db"));
+ EXPECT_THAT(GenerateUniqueTerms(56).at(54), Eq("cb"));
+ EXPECT_THAT(GenerateUniqueTerms(26 * 26 * 26).at(26), Eq("aa"));
+ EXPECT_THAT(GenerateUniqueTerms(26 * 26 * 26).at(26 * 27), Eq("aaa"));
+ EXPECT_THAT(GenerateUniqueTerms(26 * 26 * 26).at(26 * 27 - 6), Eq("uz"));
+ EXPECT_THAT(GenerateUniqueTerms(26 * 26 * 26).at(26 * 27 + 5), Eq("faa"));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/testing/schema-generator.h b/icing/testing/schema-generator.h
index e733612..8de8d06 100644
--- a/icing/testing/schema-generator.h
+++ b/icing/testing/schema-generator.h
@@ -18,8 +18,8 @@
#include <random>
#include <string>
-#include "icing/proto/schema.proto.h"
#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
namespace icing {
namespace lib {
@@ -31,13 +31,16 @@ class ExactStringPropertyGenerator {
prop.set_property_name(name.data(), name.length());
prop.set_data_type(PropertyConfigProto::DataType::STRING);
prop.set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
- IndexingConfig* indexing_config = prop.mutable_indexing_config();
- indexing_config->set_term_match_type(TermMatchType::EXACT_ONLY);
- indexing_config->set_tokenizer_type(IndexingConfig::TokenizerType::PLAIN);
+ StringIndexingConfig* string_indexing_config =
+ prop.mutable_string_indexing_config();
+ string_indexing_config->set_term_match_type(TermMatchType::EXACT_ONLY);
+ string_indexing_config->set_tokenizer_type(
+ StringIndexingConfig::TokenizerType::PLAIN);
return prop;
}
};
+// Schema generator with a random number of properties per type.
template <typename Rand, typename PropertyGenerator>
class RandomSchemaGenerator {
public:
@@ -69,6 +72,37 @@ class RandomSchemaGenerator {
PropertyGenerator* prop_generator_;
};
+// Schema generator with a fixed number of properties specified by the caller.
+template <typename PropertyGenerator>
+class SchemaGenerator {
+ public:
+ explicit SchemaGenerator(int num_properties,
+ PropertyGenerator* prop_generator)
+ : num_properties_(num_properties), prop_generator_(prop_generator) {}
+
+ SchemaProto GenerateSchema(int num_types) {
+ SchemaProto schema;
+ while (--num_types >= 0) {
+ SetType(schema.add_types(), "Type" + std::to_string(num_types),
+ num_properties_);
+ }
+ return schema;
+ }
+
+ private:
+ void SetType(SchemaTypeConfigProto* type_config, std::string_view name,
+ int num_properties) const {
+ type_config->set_schema_type(name.data(), name.length());
+ while (--num_properties >= 0) {
+ std::string prop_name = "Prop" + std::to_string(num_properties);
+ (*type_config->add_properties()) = (*prop_generator_)(prop_name);
+ }
+ }
+
+ int num_properties_;
+ PropertyGenerator* prop_generator_;
+};
+
} // namespace lib
} // namespace icing
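
For illustration, a sketch of the fixed-count generator above in use (ExactStringPropertyGenerator's call operator is assumed to take the property name, as SetType's use of (*prop_generator_)(prop_name) suggests):

#include "icing/proto/schema.pb.h"
#include "icing/testing/schema-generator.h"

void SchemaGeneratorSketch() {
  icing::lib::ExactStringPropertyGenerator prop_gen;
  // Every generated type gets exactly three string properties.
  icing::lib::SchemaGenerator<icing::lib::ExactStringPropertyGenerator>
      generator(/*num_properties=*/3, &prop_gen);
  icing::lib::SchemaProto schema = generator.GenerateSchema(/*num_types=*/5);
  // schema now holds types "Type4" .. "Type0", each with "Prop2" .. "Prop0".
}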
diff --git a/icing/testing/snippet-helpers.cc b/icing/testing/snippet-helpers.cc
deleted file mode 100644
index fde0004..0000000
--- a/icing/testing/snippet-helpers.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/testing/snippet-helpers.h"
-
-#include <algorithm>
-#include <string_view>
-
-#include "icing/proto/search.pb.h"
-
-namespace icing {
-namespace lib {
-
-const SnippetMatchProto* GetSnippetMatch(const SnippetProto& snippet_proto,
- const std::string& property_name,
- int snippet_index) {
- auto iterator = std::find_if(
- snippet_proto.entries().begin(), snippet_proto.entries().end(),
- [&property_name](const SnippetProto::EntryProto& entry) {
- return entry.property_name() == property_name;
- });
- if (iterator == snippet_proto.entries().end() ||
- iterator->snippet_matches_size() <= snippet_index) {
- return nullptr;
- }
- return &iterator->snippet_matches(snippet_index);
-}
-
-const PropertyProto* GetProperty(const DocumentProto& document,
- const std::string& property_name) {
- const PropertyProto* property = nullptr;
- for (const PropertyProto& prop : document.properties()) {
- if (prop.name() == property_name) {
- property = &prop;
- }
- }
- return property;
-}
-
-std::string GetWindow(const DocumentProto& document,
- const SnippetProto& snippet_proto,
- const std::string& property_name, int snippet_index) {
- const SnippetMatchProto* match =
- GetSnippetMatch(snippet_proto, property_name, snippet_index);
- const PropertyProto* property = GetProperty(document, property_name);
- if (match == nullptr || property == nullptr) {
- return "";
- }
- std::string_view value = property->string_values(match->values_index());
- return std::string(
- value.substr(match->window_position(), match->window_bytes()));
-}
-
-std::string GetMatch(const DocumentProto& document,
- const SnippetProto& snippet_proto,
- const std::string& property_name, int snippet_index) {
- const SnippetMatchProto* match =
- GetSnippetMatch(snippet_proto, property_name, snippet_index);
- const PropertyProto* property = GetProperty(document, property_name);
- if (match == nullptr || property == nullptr) {
- return "";
- }
- std::string_view value = property->string_values(match->values_index());
- return std::string(
- value.substr(match->exact_match_position(), match->exact_match_bytes()));
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/testing/snippet-helpers.h b/icing/testing/snippet-helpers.h
deleted file mode 100644
index 124e421..0000000
--- a/icing/testing/snippet-helpers.h
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TESTING_SNIPPET_HELPERS_H_
-#define ICING_TESTING_SNIPPET_HELPERS_H_
-
-#include <string>
-
-#include "icing/proto/document.pb.h"
-#include "icing/proto/search.pb.h"
-
-namespace icing {
-namespace lib {
-
-// Retrieve pointer to the snippet_index'th SnippetMatchProto within the
-// EntryProto identified by property_name within snippet_proto.
-// Returns nullptr
-// - if there is no EntryProto within snippet_proto corresponding to
-// property_name.
-// - if there is no SnippetMatchProto at snippet_index within the EntryProto
-const SnippetMatchProto* GetSnippetMatch(const SnippetProto& snippet_proto,
- const std::string& property_name,
- int snippet_index);
-
-// Retrieve pointer to the PropertyProto identified by property_name.
-// Returns nullptr if no such property exists.
-const PropertyProto* GetProperty(const DocumentProto& document,
- const std::string& property_name);
-
-// Retrieves the window defined by the SnippetMatchProto returned by
-// GetSnippetMatch(snippet_proto, property_name, snippet_index) for the property
-// returned by GetProperty(document, property_name).
-// Returns "" if no such property, snippet or window exists.
-std::string GetWindow(const DocumentProto& document,
- const SnippetProto& snippet_proto,
- const std::string& property_name, int snippet_index);
-
-// Retrieves the match defined by the SnippetMatchProto returned by
-// GetSnippetMatch(snippet_proto, property_name, snippet_index) for the property
-// returned by GetProperty(document, property_name).
-// Returns "" if no such property or snippet exists.
-std::string GetMatch(const DocumentProto& document,
- const SnippetProto& snippet_proto,
- const std::string& property_name, int snippet_index);
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_TESTING_SNIPPET_HELPERS_H_
diff --git a/icing/text_classifier/lib3/utils/base/logging.h b/icing/text_classifier/lib3/utils/base/logging.h
index bf02f65..92d775e 100644
--- a/icing/text_classifier/lib3/utils/base/logging.h
+++ b/icing/text_classifier/lib3/utils/base/logging.h
@@ -22,7 +22,6 @@
#include "icing/text_classifier/lib3/utils/base/logging_levels.h"
#include "icing/text_classifier/lib3/utils/base/port.h"
-
namespace libtextclassifier3 {
namespace logging {
diff --git a/icing/text_classifier/lib3/utils/base/statusor.h b/icing/text_classifier/lib3/utils/base/statusor.h
index f5fae7a..aa1e598 100644
--- a/icing/text_classifier/lib3/utils/base/statusor.h
+++ b/icing/text_classifier/lib3/utils/base/statusor.h
@@ -86,6 +86,8 @@ class StatusOr {
// Conversion assignment operator, T must be assignable from U
template <typename U>
inline StatusOr& operator=(const StatusOr<U>& other);
+ template <typename U>
+ inline StatusOr& operator=(StatusOr<U>&& other);
inline ~StatusOr();
@@ -134,6 +136,40 @@ class StatusOr {
friend class StatusOr;
private:
+ void Clear() {
+ if (ok()) {
+ value_.~T();
+ }
+ }
+
+ // Construct the value through placement new with the passed argument.
+ template <typename... Arg>
+ void MakeValue(Arg&&... arg) {
+ new (&value_) T(std::forward<Arg>(arg)...);
+ }
+
+ // Creates a valid instance of type T constructed with U and assigns it to
+ // value_. Handles how to properly assign to value_ if value_ was never
+ // actually initialized (if this is currently non-OK).
+ template <typename U>
+ void AssignValue(U&& value) {
+ if (ok()) {
+ value_ = std::forward<U>(value);
+ } else {
+ MakeValue(std::forward<U>(value));
+ status_ = Status::OK;
+ }
+ }
+
+ // Creates a status constructed with U and assigns it to status_. It also
+ // properly destroys value_ if this is OK and value_ represents a valid
+ // instance of T.
+ template <typename U>
+ void AssignStatus(U&& v) {
+ Clear();
+ status_ = static_cast<Status>(std::forward<U>(v));
+ }
+
Status status_;
// The members of unions do not require initialization and are not destructed
// unless specifically called. This allows us to construct instances of
@@ -165,12 +201,19 @@ template <typename T>
inline StatusOr<T>::StatusOr(T&& value) : value_(std::move(value)) {}
template <typename T>
-inline StatusOr<T>::StatusOr(const StatusOr& other)
- : status_(other.status_), value_(other.value_) {}
+inline StatusOr<T>::StatusOr(const StatusOr& other) : status_(other.status_) {
+ if (other.ok()) {
+ MakeValue(other.value_);
+ }
+}
template <typename T>
inline StatusOr<T>::StatusOr(StatusOr&& other)
- : status_(other.status_), value_(std::move(other.value_)) {}
+ : status_(std::move(other.status_)) {
+ if (other.ok()) {
+ MakeValue(std::move(other.value_));
+ }
+}
template <typename T>
template <
@@ -180,7 +223,11 @@ template <
std::is_convertible<const U&, T>>::value,
int>>
inline StatusOr<T>::StatusOr(const StatusOr<U>& other)
- : status_(other.status_), value_(other.value_) {}
+ : status_(other.status_) {
+ if (other.ok()) {
+ MakeValue(other.value_);
+ }
+}
template <typename T>
template <typename U,
@@ -189,7 +236,11 @@ template <typename U,
std::is_convertible<U&&, T>>::value,
int>>
inline StatusOr<T>::StatusOr(StatusOr<U>&& other)
- : status_(other.status_), value_(std::move(other.value_)) {}
+ : status_(std::move(other.status_)) {
+ if (other.ok()) {
+ MakeValue(std::move(other.value_));
+ }
+}
template <typename T>
template <
@@ -210,35 +261,47 @@ inline StatusOr<T>::StatusOr(U&& value) : StatusOr(T(std::forward<U>(value))) {}
template <typename T>
inline StatusOr<T>& StatusOr<T>::operator=(const StatusOr& other) {
- status_ = other.status_;
- if (status_.ok()) {
- value_ = other.value_;
+ if (other.ok()) {
+ AssignValue(other.value_);
+ } else {
+ AssignStatus(other.status_);
}
return *this;
}
template <typename T>
inline StatusOr<T>& StatusOr<T>::operator=(StatusOr&& other) {
- status_ = other.status_;
- if (status_.ok()) {
- value_ = std::move(other.value_);
+ if (other.ok()) {
+ AssignValue(std::move(other.value_));
+ } else {
+ AssignStatus(std::move(other.status_));
}
return *this;
}
template <typename T>
inline StatusOr<T>::~StatusOr() {
- if (ok()) {
- value_.~T();
- }
+ Clear();
}
template <typename T>
template <typename U>
inline StatusOr<T>& StatusOr<T>::operator=(const StatusOr<U>& other) {
- status_ = other.status_;
- if (status_.ok()) {
- value_ = other.value_;
+ if (other.ok()) {
+ AssignValue(other.value_);
+ } else {
+ AssignStatus(other.status_);
+ }
+ return *this;
+}
+
+template <typename T>
+template <typename U>
+inline StatusOr<T>& StatusOr<T>::operator=(StatusOr<U>&& other) {
+ if (other.ok()) {
+ AssignValue(std::move(other.value_));
+ } else {
+ AssignStatus(std::move(other.status_));
}
return *this;
}
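
The reason for the MakeValue/AssignValue split above: value_ lives in a union, so while this StatusOr holds an error no T has ever been constructed in that storage, and plain assignment would invoke T::operator= on raw memory. A standalone sketch of the same pattern (illustrative only, not the library's code):

#include <new>
#include <utility>

template <typename T>
struct Slot {
  bool engaged = false;
  union {
    T value;  // no T lives here until one is explicitly constructed
  };

  Slot() {}  // the union member is intentionally left unconstructed
  ~Slot() {
    if (engaged) value.~T();  // destroy only if a T was ever constructed
  }

  template <typename U>
  void Assign(U&& v) {
    if (engaged) {
      value = std::forward<U>(v);  // safe: a live T already exists here
    } else {
      new (&value) T(std::forward<U>(v));  // placement new on raw storage
      engaged = true;
    }
  }
};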
diff --git a/icing/text_classifier/lib3/utils/java/jni-base.cc b/icing/text_classifier/lib3/utils/java/jni-base.cc
index 897628c..e97e8b9 100644
--- a/icing/text_classifier/lib3/utils/java/jni-base.cc
+++ b/icing/text_classifier/lib3/utils/java/jni-base.cc
@@ -22,11 +22,13 @@ bool EnsureLocalCapacity(JNIEnv* env, int capacity) {
return env->EnsureLocalCapacity(capacity) == JNI_OK;
}
-bool JniExceptionCheckAndClear(JNIEnv* env) {
+bool JniExceptionCheckAndClear(JNIEnv* env, bool print_exception_on_error) {
TC3_CHECK(env != nullptr);
const bool result = env->ExceptionCheck();
if (result) {
- env->ExceptionDescribe();
+ if (print_exception_on_error) {
+ env->ExceptionDescribe();
+ }
env->ExceptionClear();
}
return result;
diff --git a/icing/text_classifier/lib3/utils/java/jni-base.h b/icing/text_classifier/lib3/utils/java/jni-base.h
index 5876eba..f86434b 100644
--- a/icing/text_classifier/lib3/utils/java/jni-base.h
+++ b/icing/text_classifier/lib3/utils/java/jni-base.h
@@ -17,6 +17,7 @@
#include <jni.h>
+#include <memory>
#include <string>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
@@ -63,7 +64,8 @@ namespace libtextclassifier3 {
bool EnsureLocalCapacity(JNIEnv* env, int capacity);
// Returns true if there was an exception. Also it clears the exception.
-bool JniExceptionCheckAndClear(JNIEnv* env);
+bool JniExceptionCheckAndClear(JNIEnv* env,
+ bool print_exception_on_error = true);
// A deleter to be used with std::unique_ptr to delete JNI global references.
class GlobalRefDeleter {
diff --git a/icing/text_classifier/lib3/utils/java/jni-helper.h b/icing/text_classifier/lib3/utils/java/jni-helper.h
index 907ad0d..4e548ec 100644
--- a/icing/text_classifier/lib3/utils/java/jni-helper.h
+++ b/icing/text_classifier/lib3/utils/java/jni-helper.h
@@ -150,8 +150,10 @@ class JniHelper {
jmethodID method_id, ...);
template <class T>
- static StatusOr<T> CallStaticIntMethod(JNIEnv* env, jclass clazz,
- jmethodID method_id, ...);
+ static StatusOr<T> CallStaticIntMethod(JNIEnv* env,
+ bool print_exception_on_error,
+ jclass clazz, jmethodID method_id,
+ ...);
};
template <typename T>
@@ -167,14 +169,19 @@ StatusOr<ScopedLocalRef<T>> JniHelper::GetObjectArrayElement(JNIEnv* env,
}
template <class T>
-StatusOr<T> JniHelper::CallStaticIntMethod(JNIEnv* env, jclass clazz,
- jmethodID method_id, ...) {
+StatusOr<T> JniHelper::CallStaticIntMethod(JNIEnv* env,
+ bool print_exception_on_error,
+ jclass clazz, jmethodID method_id,
+ ...) {
va_list args;
va_start(args, method_id);
jint result = env->CallStaticIntMethodV(clazz, method_id, args);
va_end(args);
- TC3_NO_EXCEPTION_OR_RETURN;
+ if (JniExceptionCheckAndClear(env, print_exception_on_error)) {
+ return {Status::UNKNOWN};
+ }
+
return result;
}
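
A sketch of the new parameter at a call site (hypothetical snippet; env, clazz, and method_id are assumed to be already resolved elsewhere):

#include <jni.h>
#include "icing/text_classifier/lib3/utils/java/jni-helper.h"

libtextclassifier3::Status CallQuietly(JNIEnv* env, jclass clazz,
                                       jmethodID method_id) {
  // Suppress ExceptionDescribe() logging for an expected, recoverable
  // failure; the pending Java exception is still cleared either way.
  libtextclassifier3::StatusOr<jint> result =
      libtextclassifier3::JniHelper::CallStaticIntMethod<jint>(
          env, /*print_exception_on_error=*/false, clazz, method_id);
  if (!result.ok()) {
    return result.status();  // recover without spamming the log
  }
  return libtextclassifier3::Status::OK;
}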
diff --git a/icing/tokenization/combined-tokenizer_test.cc b/icing/tokenization/combined-tokenizer_test.cc
new file mode 100644
index 0000000..0e400e2
--- /dev/null
+++ b/icing/tokenization/combined-tokenizer_test.cc
@@ -0,0 +1,262 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string_view>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/tokenizer-factory.h"
+#include "icing/tokenization/tokenizer.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::testing::ElementsAre;
+
+// This test exists to ensure that the different tokenizers treat different
+// segments of text in the same manner.
+class CombinedTokenizerTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+ jni_cache_ = GetTestJniCache();
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+ }
+
+ std::unique_ptr<const JniCache> jni_cache_;
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+};
+
+std::vector<std::string> GetTokenTerms(const std::vector<Token>& tokens) {
+ std::vector<std::string> terms;
+ terms.reserve(tokens.size());
+ for (const Token& token : tokens) {
+ if (token.type == Token::Type::REGULAR) {
+ terms.push_back(std::string(token.text));
+ }
+ }
+ return terms;
+}
+
+} // namespace
+
+TEST_F(CombinedTokenizerTest, SpecialCharacters) {
+ const std::string_view kText = "😊 Hello! Goodbye?";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("😊", "Hello", "Goodbye"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("😊", "Hello", "Goodbye"));
+}
+
+TEST_F(CombinedTokenizerTest, Parentheses) {
+ const std::string_view kText = "((paren1)(paren2) (last paren))";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("paren1", "paren2", "last", "paren"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("paren1", "paren2", "last", "paren"));
+}
+
+TEST_F(CombinedTokenizerTest, Negation) {
+ const std::string_view kText = "-foo -bar -baz";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz"));
+}
+
+// TODO(b/254874614): Handle colon word breaks in ICU 72+
+TEST_F(CombinedTokenizerTest, Colons) {
+ const std::string_view kText = ":foo: :bar baz:";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz"));
+}
+
+// TODO(b/254874614): Handle colon word breaks in ICU 72+
+TEST_F(CombinedTokenizerTest, ColonsPropertyRestricts) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ if (GetIcuTokenizationVersion() >= 72) {
+ // In ICU 72 and above, ':' is no longer considered a word connector. The
+ // query tokenizer should still treat the first ':' as a property restrict.
+ constexpr std::string_view kText = "foo:bar";
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("bar"));
+
+ // This difference, however, should only apply to the first ':'. Both should
+ // consider a second ':' to be a word break.
+ constexpr std::string_view kText2 = "foo:bar:baz";
+ ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText2));
+ indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ query_tokenizer->TokenizeAll(kText2));
+ query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("bar", "baz"));
+ } else {
+ // This is a difference between the two tokenizers. "foo:bar" is a single
+ // token to the plain tokenizer because ':' is a word connector. But
+ // "foo:bar" is a property restrict to the query tokenizer - so "foo" is the
+ // property and "bar" is the only text term.
+ constexpr std::string_view kText = "foo:bar";
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo:bar"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("bar"));
+
+ // This difference, however, should only apply to the first ':'. A
+ // second ':' should be treated by both tokenizers as a word connector.
+ constexpr std::string_view kText2 = "foo:bar:baz";
+ ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText2));
+ indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("foo:bar:baz"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ query_tokenizer->TokenizeAll(kText2));
+ query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("bar:baz"));
+ }
+}
+
+TEST_F(CombinedTokenizerTest, Punctuation) {
+ const std::string_view kText = "Who? What!? Why & How.";
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> indexing_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> query_tokenizer,
+ CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
+ lang_segmenter_.get()));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
+ indexing_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
+ EXPECT_THAT(indexing_terms, ElementsAre("Who", "What", "Why", "How"));
+
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ query_tokenizer->TokenizeAll(kText));
+ std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
+ EXPECT_THAT(query_terms, ElementsAre("Who", "What", "Why", "How"));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/tokenization/icu/icu-language-segmenter-factory.cc b/icing/tokenization/icu/icu-language-segmenter-factory.cc
index 0ef1824..7b095b4 100644
--- a/icing/tokenization/icu/icu-language-segmenter-factory.cc
+++ b/icing/tokenization/icu/icu-language-segmenter-factory.cc
@@ -15,6 +15,7 @@
#include "icing/tokenization/icu/icu-language-segmenter.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -31,7 +32,7 @@ constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX";
// A LanguageSegmenter on success
// INVALID_ARGUMENT if locale string is invalid
//
-// TODO(samzheng): Figure out if we want to verify locale strings and notify
+// TODO(b/156383798): Figure out if we want to verify locale strings and notify
// users. Right now illegal locale strings will be ignored by ICU. ICU
// components will be created with its default locale.
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
@@ -46,7 +47,7 @@ libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
<< " not supported. Converting to locale " << ULOC_US;
options.locale = ULOC_US;
}
- return std::make_unique<IcuLanguageSegmenter>(std::move(options.locale));
+ return IcuLanguageSegmenter::Create(std::move(options.locale));
}
} // namespace language_segmenter_factory
diff --git a/icing/tokenization/icu/icu-language-segmenter.cc b/icing/tokenization/icu/icu-language-segmenter.cc
index d43a78d..cac12f7 100644
--- a/icing/tokenization/icu/icu-language-segmenter.cc
+++ b/icing/tokenization/icu/icu-language-segmenter.cc
@@ -24,7 +24,9 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/mutex.h"
#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
#include "unicode/ubrk.h"
@@ -47,9 +49,11 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// INTERNAL_ERROR if unable to create
static libtextclassifier3::StatusOr<
std::unique_ptr<LanguageSegmenter::Iterator>>
- Create(std::string_view text, std::string_view locale) {
+ Create(const IcuLanguageSegmenter* creator, UBreakIterator* break_iterator,
+ std::string_view text, std::string_view locale) {
std::unique_ptr<IcuLanguageSegmenterIterator> iterator(
- new IcuLanguageSegmenterIterator(text, locale));
+ new IcuLanguageSegmenterIterator(creator, break_iterator, text,
+ locale));
if (iterator->Initialize()) {
return iterator;
}
@@ -57,8 +61,8 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
}
~IcuLanguageSegmenterIterator() {
- ubrk_close(break_iterator_);
- utext_close(&u_text_);
+ utext_close(u_text_);
+ creator_.ReturnBreakIterator(break_iterator_);
}
// Advances to the next term. Returns false if it has reached the end.
@@ -82,9 +86,6 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return false;
}
- if (!IsValidSegment()) {
- return Advance();
- }
return true;
}
@@ -101,78 +102,175 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return text_.substr(term_start_index_, term_length);
}
- libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart()
+ override {
+ if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ return offset_iterator_;
+ }
+
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive()
+ override {
+ if (!offset_iterator_.MoveToUtf8(term_end_index_exclusive_)) {
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ return offset_iterator_;
+ }
+
+ libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
int32_t offset) override {
- if (offset < 0 || offset >= text_.length()) {
+ if (offset < 0) {
+ // Very simple. The first term start after a negative offset is the first
+ // term. So just reset to start and Advance.
+ return ResetToStartUtf32();
+ }
+
+ // 1. Find the unicode character that contains the byte at offset.
+ if (!offset_iterator_.MoveToUtf32(offset)) {
+ // An error occurred. Mark as DONE
+ if (offset_iterator_.utf8_index() != text_.length()) {
+ // We returned false for some reason other than hitting the end. This is
+ // a real error. Just return.
+ MarkAsDone();
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ }
+ if (offset_iterator_.utf8_index() == text_.length()) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Illegal offset provided! Offset %d is not within bounds of string "
- "of length %zu",
- offset, text_.length()));
+ "Illegal offset provided! Offset utf-32:%d, utf-8:%d is not within "
+ "bounds of string of length %zu",
+ offset_iterator_.utf32_index(), offset_iterator_.utf8_index(),
+ text_.length()));
}
- term_start_index_ = ubrk_following(break_iterator_, offset);
- if (term_start_index_ == UBRK_DONE) {
+
+ // 2. We've got the unicode character containing the byte at offset. Now, we
+ // need to point to the segment that starts after this character.
+ int following_utf8_index =
+ ubrk_following(break_iterator_, offset_iterator_.utf8_index());
+ if (following_utf8_index == UBRK_DONE) {
MarkAsDone();
return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
"No segments begin after provided offset %d.", offset));
}
- term_end_index_exclusive_ = ubrk_next(break_iterator_);
- if (term_end_index_exclusive_ == UBRK_DONE) {
- MarkAsDone();
+ term_end_index_exclusive_ = following_utf8_index;
+
+ // 3. The term_end_exclusive_ points to the start of the term that we want
+ // to return. We need to Advance so that term_start_ will now point to this
+ // term.
+ if (!Advance()) {
return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
"No segments begin after provided offset %d.", offset));
}
- if (!IsValidSegment()) {
- if (!Advance()) {
- return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
- "No segments begin after provided offset %d.", offset));
- }
+ if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
}
- return term_start_index_;
+ return offset_iterator_.utf32_index();
}
- libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
+ libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
int32_t offset) override {
- if (offset < 0 || offset >= text_.length()) {
+ if (offset < 0) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Illegal offset provided! Offset %d is not within bounds of string "
"of length %zu",
offset, text_.length()));
}
- ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(offset));
- if (term_end_index_exclusive_ > offset) {
- // This term ends after offset. So we need to get the term just before
- // this one.
- ICING_RETURN_IF_ERROR(ResetToTermStartingBefore(term_start_index_));
+
+ if (!offset_iterator_.MoveToUtf32(offset)) {
+ // An error occurred. Mark as DONE
+ if (offset_iterator_.utf8_index() != text_.length()) {
+ // We returned false for some reason other than hitting the end. This is
+ // a real error. Just return.
+ MarkAsDone();
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ // If it returned false because we hit the end, that's fine. We'll just
+ // treat it as if the request was for the end.
}
- return term_start_index_;
+
+ // 2. We've got the unicode character containing the byte at offset. Now, we
+ // need to point to the segment that ends before this character.
+ int starting_utf8_index =
+ ubrk_preceding(break_iterator_, offset_iterator_.utf8_index());
+ if (starting_utf8_index == UBRK_DONE) {
+ // Rewind the end indices.
+ MarkAsDone();
+ return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
+ "No segments end before provided offset %d.", offset));
+ }
+ term_start_index_ = starting_utf8_index;
+
+ // 3. We've correctly set the start index and the iterator currently points
+ // to that position. Now we need to find the correct end position and
+ // advance the iterator to that position.
+ int ending_utf8_index = ubrk_next(break_iterator_);
+ if (ending_utf8_index == UBRK_DONE) {
+ // This shouldn't ever happen.
+ MarkAsDone();
+ return absl_ports::AbortedError(IcingStringUtil::StringPrintf(
+ "No segments end before provided offset %d.", offset));
+ }
+ term_end_index_exclusive_ = ending_utf8_index;
+
+ // 4. The start and end indices point to a segment, but we need to ensure
+ // that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
+ // need a segment prior to this one.
+ CharacterIterator term_start_iterator = offset_iterator_;
+ if (!term_start_iterator.MoveToUtf8(term_start_index_)) {
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ if (term_end_index_exclusive_ > offset_iterator_.utf8_index()) {
+ return ResetToTermEndingBeforeUtf32(term_start_iterator.utf32_index());
+ }
+ return term_start_iterator.utf32_index();
}
- libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
+ libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() override {
term_start_index_ = 0;
term_end_index_exclusive_ = 0;
if (!Advance()) {
- return absl_ports::NotFoundError("");
+ return absl_ports::NotFoundError(
+ "Unable to find any valid terms in text.");
+ }
+ if (!offset_iterator_.MoveToUtf8(term_start_index_)) {
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
}
- return term_start_index_;
+ return offset_iterator_.utf32_index();
}
private:
- explicit IcuLanguageSegmenterIterator(std::string_view text,
+ explicit IcuLanguageSegmenterIterator(const IcuLanguageSegmenter* creator,
+ UBreakIterator* break_iterator,
+ std::string_view text,
std::string_view locale)
- : break_iterator_(nullptr),
+ : creator_(*creator),
+ break_iterator_(break_iterator),
text_(text),
locale_(locale),
- u_text_(UTEXT_INITIALIZER),
+ u_text_(nullptr),
+ offset_iterator_(text),
term_start_index_(0),
term_end_index_exclusive_(0) {}
// Returns true on success
bool Initialize() {
+ if (break_iterator_ == nullptr) {
+ return false;
+ }
UErrorCode status = U_ZERO_ERROR;
- utext_openUTF8(&u_text_, text_.data(), /*length=*/-1, &status);
- break_iterator_ = ubrk_open(UBRK_WORD, locale_.data(), /*text=*/nullptr,
- /*textLength=*/0, &status);
- ubrk_setUText(break_iterator_, &u_text_, &status);
+ u_text_ = utext_openUTF8(nullptr, text_.data(), text_.length(), &status);
+ if (u_text_ == nullptr) {
+ return false;
+ }
+ ubrk_setUText(break_iterator_, u_text_, &status);
return !U_FAILURE(status);
}
@@ -199,26 +297,11 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
term_start_index_ = 0;
}
- bool IsValidSegment() const {
- // Rule 1: all ASCII terms will be returned.
- // We know it's a ASCII term by checking the first char.
- if (i18n_utils::IsAscii(text_[term_start_index_])) {
- return true;
- }
-
- UChar32 uchar32 = i18n_utils::GetUChar32At(text_.data(), text_.length(),
- term_start_index_);
- // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
- // We know it's an alphabetic term by checking the first unicode character.
- if (u_isUAlphabetic(uchar32)) {
- return true;
- }
- return false;
- }
+ const IcuLanguageSegmenter& creator_; // Does not own.
// The underlying class that does the segmentation, ubrk_close() must be
// called after using.
- UBreakIterator* break_iterator_;
+ UBreakIterator* break_iterator_; // Does not own.
// Text to be segmented
std::string_view text_;
@@ -229,8 +312,17 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
std::string_view locale_;
// A thin wrapper around the input UTF8 text, needed by break_iterator_.
- // utext_close() must be called after using.
- UText u_text_;
+ // Allocated by calling utext_openUTF8() and freed by calling utext_close().
+ UText* u_text_;
+
+ // Offset iterator. This iterator is not guaranteed to point to any particular
+ // character, but is guaranteed to point to a valid UTF character sequence.
+ //
+ // This iterator is used to save some amount of linear traversal when seeking
+ // to a specific UTF-32 offset; without it, each function would have to create
+ // a CharacterIterator at the beginning of the text and traverse forward from
+ // there on every call.
+ CharacterIterator offset_iterator_;
// The start and end indices are used to track the positions of current
// term.
@@ -238,18 +330,61 @@ class IcuLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
int term_end_index_exclusive_;
};
-IcuLanguageSegmenter::IcuLanguageSegmenter(std::string locale)
- : locale_(std::move(locale)) {}
+/* static */ libtextclassifier3::StatusOr<std::unique_ptr<IcuLanguageSegmenter>>
+IcuLanguageSegmenter::Create(std::string&& locale) {
+ UErrorCode status = U_ZERO_ERROR;
+ UBreakIterator* break_iterator = ubrk_open(
+ UBRK_WORD, locale.c_str(), /*text=*/nullptr, /*textLength=*/0, &status);
+ if (U_FAILURE(status) || break_iterator == nullptr) {
+ return absl_ports::AbortedError(
+ "Unable to create ICU break_iterator for language segmentation");
+ }
+ return std::unique_ptr<IcuLanguageSegmenter>(
+ new IcuLanguageSegmenter(std::move(locale), break_iterator));
+}
+
+UBreakIterator* IcuLanguageSegmenter::ProduceBreakIterator() const {
+ UBreakIterator* itr = nullptr;
+ {
+ absl_ports::unique_lock l(&mutex_);
+ if (cached_break_iterator_ != nullptr) {
+ itr = cached_break_iterator_;
+ cached_break_iterator_ = nullptr;
+ }
+ }
+ if (itr == nullptr) {
+ UErrorCode status = U_ZERO_ERROR;
+ itr = ubrk_open(UBRK_WORD, locale_.c_str(), /*text=*/nullptr,
+ /*textLength=*/0, &status);
+ if (U_FAILURE(status)) {
+ itr = nullptr;
+ }
+ }
+ return itr;
+}
+
+void IcuLanguageSegmenter::ReturnBreakIterator(UBreakIterator* itr) const {
+ {
+ absl_ports::unique_lock l(&mutex_);
+ if (cached_break_iterator_ == nullptr) {
+ cached_break_iterator_ = itr;
+ return;
+ }
+ }
+ ubrk_close(itr);
+}
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
IcuLanguageSegmenter::Segment(const std::string_view text) const {
- return IcuLanguageSegmenterIterator::Create(text, locale_);
+ return IcuLanguageSegmenterIterator::Create(this, ProduceBreakIterator(),
+ text, locale_);
}
libtextclassifier3::StatusOr<std::vector<std::string_view>>
IcuLanguageSegmenter::GetAllTerms(const std::string_view text) const {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
- Segment(text));
+ ICING_ASSIGN_OR_RETURN(
+ std::unique_ptr<LanguageSegmenter::Iterator> iterator,
+ Segment(text));
std::vector<std::string_view> terms;
while (iterator->Advance()) {
terms.push_back(iterator->GetTerm());
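
Why the UTF-32/UTF-8 bookkeeping above is needed: the two indexings diverge as soon as the text contains a multi-byte character. A small worked sketch (assuming CharacterIterator's constructor and MoveToUtf32/utf8_index behave as they are used in this file):

#include "icing/util/character-iterator.h"

void Utf32VersusUtf8Sketch() {
  // "añb": 'a' is one UTF-8 byte, 'ñ' is two, 'b' is one - so the text has
  // three code points but four bytes.
  //   UTF-32 indices: 0 ('a'), 1 ('ñ'), 2 ('b')
  //   UTF-8 indices:  0 ('a'), 1 ('ñ'), 3 ('b')
  icing::lib::CharacterIterator it("añb");
  if (it.MoveToUtf32(2)) {              // seek to the third code point, 'b'
    int byte_offset = it.utf8_index();  // 3, not 2, because 'ñ' is two bytes
  }
}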
diff --git a/icing/tokenization/icu/icu-language-segmenter.h b/icing/tokenization/icu/icu-language-segmenter.h
index 4115461..44de5a2 100644
--- a/icing/tokenization/icu/icu-language-segmenter.h
+++ b/icing/tokenization/icu/icu-language-segmenter.h
@@ -22,7 +22,9 @@
#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/mutex.h"
#include "icing/tokenization/language-segmenter.h"
+#include "unicode/ubrk.h"
namespace icing {
namespace lib {
@@ -41,7 +43,14 @@ namespace lib {
// class. Other special tokenization logic will be in each tokenizer.
class IcuLanguageSegmenter : public LanguageSegmenter {
public:
- explicit IcuLanguageSegmenter(std::string locale);
+ static libtextclassifier3::StatusOr<std::unique_ptr<IcuLanguageSegmenter>>
+ Create(std::string&& locale);
+
+ ~IcuLanguageSegmenter() override {
+ if (cached_break_iterator_ != nullptr) {
+ ubrk_close(cached_break_iterator_);
+ }
+ }
IcuLanguageSegmenter(const IcuLanguageSegmenter&) = delete;
IcuLanguageSegmenter& operator=(const IcuLanguageSegmenter&) = delete;
@@ -69,8 +78,32 @@ class IcuLanguageSegmenter : public LanguageSegmenter {
std::string_view text) const override;
private:
+ // Declared a friend so that it can call ReturnBreakIterator().
+ friend class IcuLanguageSegmenterIterator;
+
+ explicit IcuLanguageSegmenter(std::string&& locale, UBreakIterator* iterator)
+ : locale_(std::move(locale)), cached_break_iterator_(iterator) {}
+
+ // Returns a UBreakIterator that the caller owns.
+ // If cached_break_iterator_ is non-null, transfers ownership to caller and
+ // sets cached_break_iterator_ to null.
+ // If cached_break_iterator is null, creates a new UBreakIterator and
+ // transfers ownership to caller.
+ UBreakIterator* ProduceBreakIterator() const;
+
+ // Caller transfers ownership of itr to IcuLanguageSegmenter.
+ // If cached_break_iterator_ is null, itr becomes the cached_break_iterator_
+ // If cached_break_iterator_ is non-null, then itr will be closed.
+ void ReturnBreakIterator(UBreakIterator* itr) const;
+
// Used to help segment text
const std::string locale_;
+
+ // The underlying class that does the segmentation. ubrk_close() must be
+ // called on it when this segmenter is destroyed.
+ mutable UBreakIterator* cached_break_iterator_ ICING_GUARDED_BY(mutex_);
+
+ mutable absl_ports::shared_mutex mutex_;
};
} // namespace lib
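
The Produce/Return pair above is a single-slot cache: at most one idle UBreakIterator is retained, a concurrent Segment() call that misses the cache simply opens a fresh iterator, and a returned iterator that finds the slot occupied is closed instead of stacked. A minimal sketch of the same pattern (illustrative; Resource and the open/close helpers are stand-ins, and std::mutex stands in for absl_ports::shared_mutex):

#include <mutex>

struct Resource {};  // stand-in for UBreakIterator
Resource* OpenNewResource() { return new Resource(); }   // stand-in: ubrk_open
void CloseResource(Resource* r) { delete r; }            // stand-in: ubrk_close

class SingleSlotCache {
 public:
  Resource* Produce() {
    {
      std::lock_guard<std::mutex> l(mu_);
      if (cached_ != nullptr) {
        Resource* r = cached_;
        cached_ = nullptr;  // hand the idle instance to the caller
        return r;
      }
    }
    return OpenNewResource();  // cache miss: allocate outside the lock
  }

  void Return(Resource* r) {
    {
      std::lock_guard<std::mutex> l(mu_);
      if (cached_ == nullptr) {
        cached_ = r;  // keep exactly one idle instance around
        return;
      }
    }
    CloseResource(r);  // slot already occupied: drop the extra
  }

 private:
  std::mutex mu_;
  Resource* cached_ = nullptr;
};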
diff --git a/icing/tokenization/icu/icu-language-segmenter_test.cc b/icing/tokenization/icu/icu-language-segmenter_test.cc
index 31c2726..a7f7419 100644
--- a/icing/tokenization/icu/icu-language-segmenter_test.cc
+++ b/icing/tokenization/icu/icu-language-segmenter_test.cc
@@ -12,24 +12,40 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+#include <memory>
+#include <string_view>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/jni/jni-cache.h"
+#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/character-iterator.h"
#include "unicode/uloc.h"
namespace icing {
namespace lib {
-namespace {
+
using ::testing::ElementsAre;
using ::testing::Eq;
using ::testing::IsEmpty;
+namespace {
+
+language_segmenter_factory::SegmenterOptions GetSegmenterOptions(
+ const std::string& locale, const JniCache* jni_cache) {
+ return language_segmenter_factory::SegmenterOptions(locale, jni_cache);
+}
+
// Returns a vector containing all terms retrieved by Advancing on the iterator.
std::vector<std::string_view> GetAllTermsAdvance(
LanguageSegmenter::Iterator* itr) {
@@ -40,70 +56,61 @@ std::vector<std::string_view> GetAllTermsAdvance(
return terms;
}
-// Returns a vector containing all terms retrieved by calling
-// ResetToStart/ResetAfter with the current position to simulate Advancing on
-// the iterator.
-std::vector<std::string_view> GetAllTermsResetAfter(
+// Returns a vector containing all terms retrieved by calling ResetAfter with
+// the UTF-32 position of the current term start to simulate Advancing on the
+// iterator.
+std::vector<std::string_view> GetAllTermsResetAfterUtf32(
LanguageSegmenter::Iterator* itr) {
std::vector<std::string_view> terms;
- if (!itr->ResetToStart().ok()) {
- return terms;
- }
- terms.push_back(itr->GetTerm());
- const char* text_begin = itr->GetTerm().data();
- // Calling ResetToTermStartingAfter with the current position should get the
- // very next term in the sequence.
- for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok();
- current_pos = itr->GetTerm().data() - text_begin) {
+ // Calling ResetToTermStartingAfterUtf32 with -1 should get the first term in
+ // the sequence.
+ bool is_ok = itr->ResetToTermStartingAfterUtf32(-1).ok();
+ while (is_ok) {
terms.push_back(itr->GetTerm());
+ // Calling ResetToTermStartingAfterUtf32 with the current position should
+ // get the very next term in the sequence.
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+ is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok();
}
return terms;
}
// Returns a vector containing all terms retrieved by alternating calls to
-// Advance and calls to ResetAfter with the current position to simulate
-// Advancing.
-std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter(
+// Advance and calls to ResetAfter with the UTF-32 position of the current term
+// start to simulate Advancing.
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfterUtf32(
LanguageSegmenter::Iterator* itr) {
- const char* text_begin = itr->GetTerm().data();
std::vector<std::string_view> terms;
-
- bool is_ok = true;
- int current_pos = 0;
+ bool is_ok = itr->Advance();
while (is_ok) {
+ terms.push_back(itr->GetTerm());
// Alternate between using Advance and ResetToTermAfter.
if (terms.size() % 2 == 0) {
is_ok = itr->Advance();
} else {
- // Calling ResetToTermStartingAfter with the current position should get
- // the very next term in the sequence.
- current_pos = itr->GetTerm().data() - text_begin;
- is_ok = itr->ResetToTermStartingAfter(current_pos).ok();
- }
- if (is_ok) {
- terms.push_back(itr->GetTerm());
+ // Calling ResetToTermStartingAfterUtf32 with the current position should
+ // get the very next term in the sequence.
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+ is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok();
}
}
return terms;
}
// Returns a vector containing all terms retrieved by calling ResetBefore with
-// the current position, starting at the end of the text. This vector should be
-// in reverse order of GetAllTerms and missing the last term.
-std::vector<std::string_view> GetAllTermsResetBefore(
+// the UTF-32 position of the current term start, starting at the end of the
+// text. This vector should be in reverse order of GetAllTerms and missing the
+// last term.
+std::vector<std::string_view> GetAllTermsResetBeforeUtf32(
LanguageSegmenter::Iterator* itr) {
- const char* text_begin = itr->GetTerm().data();
- int last_pos = 0;
- while (itr->Advance()) {
- last_pos = itr->GetTerm().data() - text_begin;
- }
std::vector<std::string_view> terms;
- // Calling ResetToTermEndingBefore with the current position should get the
- // previous term in the sequence.
- for (int current_pos = last_pos;
- itr->ResetToTermEndingBefore(current_pos).ok();
- current_pos = itr->GetTerm().data() - text_begin) {
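+  // 1000 is an arbitrary offset safely past the end of every test string, so
+  // the first reset positions the iterator at the last term in the text.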
+ bool is_ok = itr->ResetToTermEndingBeforeUtf32(1000).ok();
+ while (is_ok) {
terms.push_back(itr->GetTerm());
+ // Calling ResetToTermEndingBeforeUtf32 with the current position should get
+ // the previous term in the sequence.
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+ is_ok = itr->ResetToTermEndingBeforeUtf32(char_itr.utf32_index()).ok();
}
return terms;
}
@@ -112,6 +119,9 @@ class IcuLanguageSegmenterAllLocalesTest
: public testing::TestWithParam<const char*> {
protected:
void SetUp() override {
+ if (!IsIcuTokenization()) {
+ GTEST_SKIP() << "ICU tokenization not enabled!";
+ }
ICING_ASSERT_OK(
// File generated via icu_data_file rule in //icing/BUILD.
icu_data_file_helper::SetUpICUDataFile(
@@ -119,27 +129,34 @@ class IcuLanguageSegmenterAllLocalesTest
}
static std::string GetLocale() { return GetParam(); }
- static language_segmenter_factory::SegmenterOptions GetOptions() {
- return language_segmenter_factory::SegmenterOptions(GetLocale());
- }
+
+ std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
};
+} // namespace
+
TEST_P(IcuLanguageSegmenterAllLocalesTest, EmptyText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, SimpleText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
IsOkAndHolds(ElementsAre("Hello", " ", "World")));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// ASCII punctuation marks are kept
EXPECT_THAT(
language_segmenter->GetAllTerms("Hello, World!!!"),
@@ -153,8 +170,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_Punctuation) {
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// ASCII special characters are kept
EXPECT_THAT(language_segmenter->GetAllTerms("Pay $1000"),
IsOkAndHolds(ElementsAre("Pay", " ", "$", "1000")));
@@ -169,19 +188,23 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ASCII_SpecialCharacter) {
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Non_ASCII_Non_Alphabetic) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Full-width (non-ASCII) punctuation marks and special characters are kept
// as individual terms.
EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"),
- IsOkAndHolds(ElementsAre("Hello")));
+ IsOkAndHolds(ElementsAre("。", "?", "·", "Hello", "!", "×")));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
- EXPECT_THAT(language_segmenter->GetAllTerms("U.S. Bank"),
- IsOkAndHolds(ElementsAre("U.S", ".", " ", "Bank")));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ EXPECT_THAT(language_segmenter->GetAllTerms("U.S.𡔖 Bank"),
+ IsOkAndHolds(ElementsAre("U.S", ".", "𡔖", " ", "Bank")));
EXPECT_THAT(language_segmenter->GetAllTerms("I.B.M."),
IsOkAndHolds(ElementsAre("I.B.M", ".")));
EXPECT_THAT(language_segmenter->GetAllTerms("I,B,M"),
@@ -191,8 +214,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Acronym) {
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// According to unicode word break rules
// WB6(https://unicode.org/reports/tr29/#WB6),
// WB7(https://unicode.org/reports/tr29/#WB7), and a few others, some
@@ -202,16 +227,42 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
// Word connectors
EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android"),
IsOkAndHolds(ElementsAre("com.google.android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"),
- IsOkAndHolds(ElementsAre("com:google:android")));
EXPECT_THAT(language_segmenter->GetAllTerms("com'google'android"),
IsOkAndHolds(ElementsAre("com'google'android")));
EXPECT_THAT(language_segmenter->GetAllTerms("com_google_android"),
IsOkAndHolds(ElementsAre("com_google_android")));
// Word connectors can be mixed
- EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"),
- IsOkAndHolds(ElementsAre("com.google.android:icing")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android_icing"),
+ IsOkAndHolds(ElementsAre("com.google.android_icing")));
+
+  // Connectors that don't have valid terms on both sides of them are not
+ // considered connectors.
+ EXPECT_THAT(language_segmenter->GetAllTerms("'bar'baz"),
+ IsOkAndHolds(ElementsAre("'", "bar'baz")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("bar.baz."),
+ IsOkAndHolds(ElementsAre("bar.baz", ".")));
+
+  // Connectors that don't have valid terms on both sides of them are not
+ // considered connectors.
+ EXPECT_THAT(language_segmenter->GetAllTerms(" .bar.baz"),
+ IsOkAndHolds(ElementsAre(" ", ".", "bar.baz")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("bar'baz' "),
+ IsOkAndHolds(ElementsAre("bar'baz", "'", " ")));
+
+  // Connectors don't connect if one side is not a valid term.
+ EXPECT_THAT(language_segmenter->GetAllTerms("bar.baz.?"),
+ IsOkAndHolds(ElementsAre("bar.baz", ".", "?")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("?'bar'baz"),
+ IsOkAndHolds(ElementsAre("?", "'", "bar'baz")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("私'は"),
+ IsOkAndHolds(ElementsAre("私", "'", "は")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("我.每"),
+ IsOkAndHolds(ElementsAre("我", ".", "每")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("เดิน'ไป"),
+ IsOkAndHolds(ElementsAre("เดิน'ไป")));
// Any leading and trailing characters are not connectors
EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."),
@@ -226,8 +277,6 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
IsOkAndHolds(ElementsAre("com", "+", "google", "+", "android")));
EXPECT_THAT(language_segmenter->GetAllTerms("com*google*android"),
IsOkAndHolds(ElementsAre("com", "*", "google", "*", "android")));
- EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
- IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
EXPECT_THAT(language_segmenter->GetAllTerms("com^google^android"),
IsOkAndHolds(ElementsAre("com", "^", "google", "^", "android")));
EXPECT_THAT(language_segmenter->GetAllTerms("com&google&android"),
@@ -241,11 +290,43 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WordConnector) {
EXPECT_THAT(
language_segmenter->GetAllTerms("com\"google\"android"),
IsOkAndHolds(ElementsAre("com", "\"", "google", "\"", "android")));
+
+ // In ICU 72, there were a few changes:
+ // 1. ':' stopped being a word connector
+ // 2. '@' became a word connector
+  // 3. <numeric><word-connector><numeric> such as "3'14" is now considered
+ // a single token.
+ if (GetIcuTokenizationVersion() >= 72) {
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("com:google:android"),
+ IsOkAndHolds(ElementsAre("com", ":", "google", ":", "android")));
+ // In ICU 74, the rules for '@' were reverted.
+ if (GetIcuTokenizationVersion() >= 74) {
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("com@google@android"),
+ IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
+ } else {
+ EXPECT_THAT(language_segmenter->GetAllTerms("com@google@android"),
+ IsOkAndHolds(ElementsAre("com@google@android")));
+ }
+ EXPECT_THAT(language_segmenter->GetAllTerms("3'14"),
+ IsOkAndHolds(ElementsAre("3'14")));
+ } else {
+ EXPECT_THAT(language_segmenter->GetAllTerms("com:google:android"),
+ IsOkAndHolds(ElementsAre("com:google:android")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("com@google@android"),
+ IsOkAndHolds(ElementsAre("com", "@", "google", "@", "android")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("3'14"),
+ IsOkAndHolds(ElementsAre("3", "'", "14")));
+ }
}
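
// Illustrative sketch only, not part of this change: GetIcuTokenizationVersion
// above is the project's own helper, but an ICU major version can also be read
// through the public ICU C API u_getVersion() declared in unicode/uversion.h.
inline int GetIcuMajorVersionSketch() {
  UVersionInfo version_info;   // uint8_t[4]: {major, minor, milli, micro}.
  u_getVersion(version_info);  // Filled with the linked ICU library's version.
  return version_info[0];      // e.g. 72 for ICU 72.x, 74 for ICU 74.x.
}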
TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms("It's ok."),
IsOkAndHolds(ElementsAre("It's", " ", "ok", ".")));
EXPECT_THAT(language_segmenter->GetAllTerms("He'll be back."),
@@ -265,8 +346,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Apostrophes) {
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms("(Hello)"),
IsOkAndHolds(ElementsAre("(", "Hello", ")")));
@@ -276,8 +359,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Parentheses) {
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms("\"Hello\""),
IsOkAndHolds(ElementsAre("\"", "Hello", "\"")));
@@ -287,8 +372,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Quotes) {
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Alphanumeric terms are allowed
EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
@@ -296,8 +383,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Alphanumeric) {
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Numbers are segmented as individual terms
EXPECT_THAT(
@@ -311,9 +400,20 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, Number) {
IsOkAndHolds(ElementsAre("-", "123")));
}
+TEST_P(IcuLanguageSegmenterAllLocalesTest, FullWidthNumbers) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
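+  // Full-width digits (U+FF10 through U+FF19) are segmented as one numeric
+  // term, just like ASCII digits.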
+ EXPECT_THAT(language_segmenter->GetAllTerms("0123456789"),
+ IsOkAndHolds(ElementsAre("0123456789")));
+}
+
TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Multiple continuous whitespaces are treated as one.
const int kNumSeparators = 256;
std::string text_with_spaces =
@@ -337,21 +437,24 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ContinuousWhitespaces) {
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that don't
// use whitespace as a word delimiter.
// Chinese
- EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"),
- IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("我每天走路去上班。"),
+ IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班", "。")));
// Japanese
EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"),
IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩",
- "い", "てい", "ます")));
+ "い", "てい", "ます", "。")));
// Khmer
EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
- IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ")));
+ IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ", "។")));
// Thai
EXPECT_THAT(
language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
@@ -359,16 +462,19 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, CJKT) {
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, LatinLettersWithAccents) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms("āăąḃḅḇčćç"),
IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
}
-// TODO(samzheng): test cases for more languages (e.g. top 20 in the world)
TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Turkish
EXPECT_THAT(language_segmenter->GetAllTerms("merhaba dünya"),
IsOkAndHolds(ElementsAre("merhaba", " ", "dünya")));
@@ -378,10 +484,11 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, WhitespaceSplitLanguages) {
IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", ".")));
}
-// TODO(samzheng): more mixed languages test cases
TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
EXPECT_THAT(language_segmenter->GetAllTerms("How are you你好吗お元気ですか"),
IsOkAndHolds(ElementsAre("How", " ", "are", " ", "you", "你好",
"吗", "お", "元気", "です", "か")));
@@ -392,8 +499,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguages) {
}
TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Validates that the input strings are not copied
const std::string text = "Hello World";
const char* word1_address = text.c_str();
@@ -409,26 +518,141 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, NotCopyStrings) {
EXPECT_THAT(word2_address, Eq(word2_result_address));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToStartUtf32WordConnector) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "com.google.android is package";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "com.google.android is package"
+ // ^ ^^ ^^
+ // UTF-8 idx: 0 18 19 21 22
+ // UTF-32 idx: 0 18 19 21 22
+ auto position_or = itr->ResetToStartUtf32();
+ EXPECT_THAT(position_or, IsOk());
+ ASSERT_THAT(itr->GetTerm(), Eq("com.google.android"));
+}
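
// Illustrative sketch only, not part of this change: the UTF-8/UTF-32 index
// pairs annotated in these tests differ because a multi-byte UTF-8 code point
// still advances the UTF-32 index by exactly one. A hypothetical conversion
// helper, using only ICU's public U8_NEXT macro from unicode/utf8.h:
inline int Utf8OffsetToUtf32Index(std::string_view text, int32_t utf8_offset) {
  int32_t i = 0;        // Current byte position within the UTF-8 text.
  int utf32_index = 0;  // Number of code points consumed so far.
  while (i < utf8_offset && i < static_cast<int32_t>(text.length())) {
    UChar32 c;
    U8_NEXT(text.data(), i, static_cast<int32_t>(text.length()), c);
    ++utf32_index;
  }
  return utf32_index;
}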
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, NewIteratorResetToStartUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ IteratorOneAdvanceResetToStartUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_TRUE(itr->Advance()); // itr points to 'How'
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ IteratorMultipleAdvancesResetToStartUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance()); // itr points to ' '
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, IteratorDoneResetToStartUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ while (itr->Advance()) {
+ // Do nothing.
+ }
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32WordConnector) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "package com.google.android name";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "package com.google.android name"
+ // ^ ^^ ^^
+ // UTF-8 idx: 0 7 8 26 27
+ // UTF-32 idx: 0 7 8 26 27
+ auto position_or = itr->ResetToTermStartingAfterUtf32(8);
+ EXPECT_THAT(position_or, IsOk());
+ EXPECT_THAT(position_or.ValueOrDie(), Eq(26));
+ ASSERT_THAT(itr->GetTerm(), Eq(" "));
+
+ position_or = itr->ResetToTermStartingAfterUtf32(7);
+ EXPECT_THAT(position_or, IsOk());
+ EXPECT_THAT(position_or.ValueOrDie(), Eq(8));
+ ASSERT_THAT(itr->GetTerm(), Eq("com.google.android"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterUtf32OutOfBounds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
ASSERT_THAT(itr->GetTerm(), Eq("you"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(-1),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(itr->GetTerm(), Eq("you"));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(-1), IsOk());
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(21),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(itr->GetTerm(), Eq("you"));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
}
// Tests that ResetToTermAfter and Advance produce the same output. With the
@@ -437,9 +661,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermAfterOutOfBounds) {
// terms produced by ResetToTermAfter calls with the current position
// provided as the argument.
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- MixedLanguagesResetToTermAfterEquivalentToAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ MixedLanguagesResetToTermAfterUtf32EquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -451,16 +676,17 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kText));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetAfter(reset_to_term_itr.get());
+ GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- ThaiResetToTermAfterEquivalentToAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ThaiResetToTermAfterUtf32EquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -472,16 +698,17 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetAfter(reset_to_term_itr.get());
+ GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- KoreanResetToTermAfterEquivalentToAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ KoreanResetToTermAfterUtf32EquivalentToAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKorean = "나는 매일 출근합니다.";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -493,7 +720,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetAfter(reset_to_term_itr.get());
+ GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
@@ -504,9 +731,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
// should be able to mix ResetToTermAfter(current_position) calls and Advance
// calls to mimic calling Advance.
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- MixedLanguagesResetToTermAfterInteroperableWithAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ MixedLanguagesResetToTermAfterUtf32InteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -518,7 +746,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
segmenter->Segment(kText));
std::vector<std::string_view> advance_and_reset_terms =
- GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+ GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
EXPECT_THAT(advance_and_reset_terms,
testing::ElementsAreArray(advance_terms));
@@ -526,9 +754,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- ThaiResetToTermAfterInteroperableWithAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ThaiResetToTermAfterUtf32InteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -540,7 +769,7 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> advance_and_reset_terms =
- GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+ GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
EXPECT_THAT(advance_and_reset_terms,
testing::ElementsAreArray(advance_terms));
@@ -548,9 +777,10 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- KoreanResetToTermAfterInteroperableWithAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ KoreanResetToTermAfterUtf32InteroperableWithAdvance) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKorean = "나는 매일 출근합니다.";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
@@ -562,190 +792,243 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> advance_and_reset_terms =
- GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+ GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
EXPECT_THAT(advance_and_reset_terms,
testing::ElementsAreArray(advance_terms));
EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ MixedLanguagesResetToTermAfterUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment("How are you你好吗お元気ですか"));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(11)));
EXPECT_THAT(itr->GetTerm(), Eq("你好"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("you"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(18), IsOkAndHolds(Eq(19)));
EXPECT_THAT(itr->GetTerm(), Eq("か"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
EXPECT_THAT(itr->GetTerm(), Eq("吗"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(35),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- ContinuousWhitespacesResetToTermAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ContinuousWhitespacesResetToTermAfterUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Multiple continuous whitespaces are treated as one.
constexpr std::string_view kTextWithSpace = "Hello World";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kTextWithSpace));
- // String: "Hello World"
- // ^ ^ ^
- // Bytes: 0 5 15
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5)));
+ // String: "Hello World"
+ // ^ ^ ^
+ // UTF-8 idx: 0 5 15
+ // UTF-32 idx: 0 5 15
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(15)));
EXPECT_THAT(itr->GetTerm(), Eq("World"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(5), IsOkAndHolds(Eq(15)));
EXPECT_THAT(itr->GetTerm(), Eq("World"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(15),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(17),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermAfterUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
// don't use whitespace as a word delimiter. Chinese
constexpr std::string_view kChinese = "我每天走路去上班。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
- // String: "我每天走路去上班。"
- // ^ ^ ^ ^^
- // Bytes: 0 3 9 15 18
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^ ^
+ // UTF-8 idx: 0 3 9 15 18 24
+  // UTF-32 idx: 0  1   3   5 6  8
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("每天"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("走路"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("。"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(8),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermAfterUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Japanese
constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
- // String: "私は毎日仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 6 12 18212427 33
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18212427 33 39
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11 13
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(33),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(13),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("仕事"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
+ EXPECT_THAT(itr->GetTerm(), Eq("。"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermAfterUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
- // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
- // ^ ^ ^ ^
- // Bytes: 0 9 24 45
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45 69
+ // UTF-32 idx: 0 3 8 15 23
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(47),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->GetTerm(), Eq("។"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(23),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermAfterUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Thai
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kThai));
- // String: "ฉันเดินไปทำงานทุกวัน"
- // ^ ^ ^ ^ ^ ^
- // Bytes: 0 9 21 27 42 51
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 21 27 42 51
+ // UTF-32 idx: 0 3 7 9 14 17
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("เดิน"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(51),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(7)));
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(14)));
EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBounds) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ ResetToTermBeforeWordConnectorUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "package name com.google.android!";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "package name com.google.android!"
+ // ^ ^^ ^^ ^
+ // UTF-8 idx: 0 7 8 12 13 31
+ // UTF-32 idx: 0 7 8 12 13 31
+ auto position_or = itr->ResetToTermEndingBeforeUtf32(31);
+ EXPECT_THAT(position_or, IsOk());
+ EXPECT_THAT(position_or.ValueOrDie(), Eq(13));
+ ASSERT_THAT(itr->GetTerm(), Eq("com.google.android"));
+
+ position_or = itr->ResetToTermEndingBeforeUtf32(21);
+ EXPECT_THAT(position_or, IsOk());
+ EXPECT_THAT(position_or.ValueOrDie(), Eq(12));
+ ASSERT_THAT(itr->GetTerm(), Eq(" "));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBoundsUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4)));
ASSERT_THAT(itr->GetTerm(), Eq("are"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(-1),
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(-1),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
EXPECT_THAT(itr->GetTerm(), Eq("are"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(itr->GetTerm(), Eq("are"));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(29), IsOk());
+ EXPECT_THAT(itr->GetTerm(), Eq("か"));
}
// Tests that ResetToTermBefore and Advance produce the same output. With the
@@ -754,26 +1037,22 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest, ResetToTermBeforeOutOfBounds) {
// terms produced by ResetToTermBefore calls with the current position
// provided as the argument (after their order has been reversed).
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- MixedLanguagesResetToTermBeforeEquivalentToAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ MixedLanguagesResetToTermBeforeEquivalentToAdvanceUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kText = "How are𡔖 you你好吗お元気ですか";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
segmenter->Segment(kText));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
- // Can't produce the last term via calls to ResetToTermBefore. So skip
- // past that one.
- auto itr = advance_terms.begin();
- std::advance(itr, advance_terms.size() - 1);
- advance_terms.erase(itr);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kText));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetBefore(reset_to_term_itr.get());
+ GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
@@ -782,26 +1061,22 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- ThaiResetToTermBeforeEquivalentToAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ThaiResetToTermBeforeEquivalentToAdvanceUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
- // Can't produce the last term via calls to ResetToTermBefore. So skip
- // past that one.
- auto itr = advance_terms.begin();
- std::advance(itr, advance_terms.size() - 1);
- advance_terms.erase(itr);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetBefore(reset_to_term_itr.get());
+ GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
@@ -809,189 +1084,263 @@ TEST_P(IcuLanguageSegmenterAllLocalesTest,
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- KoreanResetToTermBeforeEquivalentToAdvance) {
- ICING_ASSERT_OK_AND_ASSIGN(auto segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ KoreanResetToTermBeforeEquivalentToAdvanceUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKorean = "나는 매일 출근합니다.";
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> advance_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
- // Can't produce the last term via calls to ResetToTermBefore. So skip
- // past that one.
- auto itr = advance_terms.begin();
- std::advance(itr, advance_terms.size() - 1);
- advance_terms.erase(itr);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetBefore(reset_to_term_itr.get());
+ GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, MixedLanguagesResetToTermBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest,
+ MixedLanguagesResetToTermBeforeUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment("How are you你好吗お元気ですか"));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(7)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("are"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(18), IsOkAndHolds(Eq(15)));
EXPECT_THAT(itr->GetTerm(), Eq("元気"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(12), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("you"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(17)));
EXPECT_THAT(itr->GetTerm(), Eq("です"));
}
TEST_P(IcuLanguageSegmenterAllLocalesTest,
- ContinuousWhitespacesResetToTermBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+ ContinuousWhitespacesResetToTermBeforeUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Multiple continuous whitespaces are treated as one.
constexpr std::string_view kTextWithSpace = "Hello World";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kTextWithSpace));
- // String: "Hello World"
- // ^ ^ ^
- // Bytes: 0 5 15
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "Hello World"
+ // ^ ^ ^
+ // UTF-8 idx: 0 5 15
+ // UTF-32 idx: 0 5 15
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(15), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ChineseResetToTermBeforeUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// CJKT (Chinese, Japanese, Khmer, Thai) are the 4 main languages that
// don't use whitespace as a word delimiter. Chinese
constexpr std::string_view kChinese = "我每天走路去上班。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
- // String: "我每天走路去上班。"
- // ^ ^ ^ ^^
- // Bytes: 0 3 9 15 18
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF-8 idx: 0 3 9 15 18
+ // UTF-32 idx: 0 1 3 5 6
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("我"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq("去"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, JapaneseResetToTermBeforeUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Japanese
constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
- // String: "私は毎日仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 6 12 18212427 33
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18212427 33
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(9)));
EXPECT_THAT(itr->GetTerm(), Eq("てい"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(3), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, KhmerResetToTermBeforeUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
- // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
- // ^ ^ ^ ^
- // Bytes: 0 9 24 45
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45
+ // UTF-32 idx: 0 3 8 15
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(16), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("ញុំ"));
}
-TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create(GetOptions()));
+TEST_P(IcuLanguageSegmenterAllLocalesTest, ThaiResetToTermBeforeUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
// Thai
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kThai));
- // String: "ฉันเดินไปทำงานทุกวัน"
- // ^ ^ ^ ^ ^ ^
- // Bytes: 0 9 21 27 42 51
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 21 27 42 51
+ // UTF-32 idx: 0 3 7 9 14 17
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(14)));
EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(4), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("ฉัน"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(7)));
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
}
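The UTF-8 and UTF-32 index tables in the comments above track the same term
boundaries in two units: byte offsets versus code point offsets. A minimal
standalone sketch of that mapping (the helper name is illustrative, not part
of the icing API):

#include <cstdint>
#include <string_view>

// Returns the UTF-32 (code point) offset for a UTF-8 byte offset by counting
// UTF-8 lead bytes; continuation bytes match the bit pattern 10xxxxxx.
int32_t Utf8ToUtf32Offset(std::string_view utf8_text, int32_t utf8_offset) {
  int32_t utf32_offset = 0;
  for (int32_t i = 0;
       i < utf8_offset && i < static_cast<int32_t>(utf8_text.size()); ++i) {
    if ((static_cast<uint8_t>(utf8_text[i]) & 0xC0) != 0x80) {
      ++utf32_offset;
    }
  }
  return utf32_offset;
}

// For the Chinese sample above, Utf8ToUtf32Offset(kChinese, 15) returns 5,
// matching the index table in ChineseResetToTermBeforeUtf32.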
+TEST_P(IcuLanguageSegmenterAllLocalesTest, QuerySyntax) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+  // The returned terms are string_views into the input, so the input string
+  // is not copied; validates how query syntax characters are segmented.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(
+ "(-term1 OR term2) AND property1.subproperty2:term3"));
+ EXPECT_THAT(terms, ElementsAre("(", "-", "term1", " ", "OR", " ", "term2",
+ ")", " ", "AND", " ", "property1", ".",
+ "subproperty2", ":", "term3"));
+}
+
+TEST_P(IcuLanguageSegmenterAllLocalesTest, MultipleLangSegmentersTest) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> iterator_one,
+ language_segmenter->Segment("foo bar baz"));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<LanguageSegmenter::Iterator> iterator_two,
+ language_segmenter->Segment("abra kadabra alakazam"));
+
+ ASSERT_TRUE(iterator_one->Advance());
+ ASSERT_TRUE(iterator_two->Advance());
+ EXPECT_THAT(iterator_one->GetTerm(), Eq("foo"));
+ EXPECT_THAT(iterator_two->GetTerm(), Eq("abra"));
+
+ ASSERT_TRUE(iterator_one->Advance());
+ ASSERT_TRUE(iterator_two->Advance());
+ EXPECT_THAT(iterator_one->GetTerm(), Eq(" "));
+ EXPECT_THAT(iterator_two->GetTerm(), Eq(" "));
+
+ ASSERT_TRUE(iterator_one->Advance());
+ EXPECT_THAT(iterator_one->GetTerm(), Eq("bar"));
+ EXPECT_THAT(iterator_two->GetTerm(), Eq(" "));
+ ASSERT_TRUE(iterator_two->Advance());
+ EXPECT_THAT(iterator_one->GetTerm(), Eq("bar"));
+ EXPECT_THAT(iterator_two->GetTerm(), Eq("kadabra"));
+
+ ASSERT_TRUE(iterator_one->Advance());
+ ASSERT_TRUE(iterator_two->Advance());
+ EXPECT_THAT(iterator_one->GetTerm(), Eq(" "));
+ EXPECT_THAT(iterator_two->GetTerm(), Eq(" "));
+
+ ASSERT_TRUE(iterator_two->Advance());
+ ASSERT_TRUE(iterator_one->Advance());
+ EXPECT_THAT(iterator_one->GetTerm(), Eq("baz"));
+ EXPECT_THAT(iterator_two->GetTerm(), Eq("alakazam"));
+
+ ASSERT_FALSE(iterator_two->Advance());
+ ASSERT_FALSE(iterator_one->Advance());
+}
+
INSTANTIATE_TEST_SUITE_P(
LocaleName, IcuLanguageSegmenterAllLocalesTest,
testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
@@ -1011,6 +1360,5 @@ INSTANTIATE_TEST_SUITE_P(
"" // Will fall back to ICU default locale
));
-} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/tokenization/language-segmenter-factory.h b/icing/tokenization/language-segmenter-factory.h
index ce50d0b..2505a07 100644
--- a/icing/tokenization/language-segmenter-factory.h
+++ b/icing/tokenization/language-segmenter-factory.h
@@ -18,11 +18,9 @@
#include <memory>
#include <string_view>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/jni/jni-cache.h"
#include "icing/tokenization/language-segmenter.h"
-#include "icing/util/i18n-utils.h"
-#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -30,7 +28,7 @@ namespace lib {
namespace language_segmenter_factory {
struct SegmenterOptions {
- explicit SegmenterOptions(std::string locale = ULOC_US,
+ explicit SegmenterOptions(std::string locale,
const JniCache* jni_cache = nullptr)
: locale(std::move(locale)), jni_cache(jni_cache) {}
@@ -46,7 +44,7 @@ struct SegmenterOptions {
// A LanguageSegmenter on success
// INVALID_ARGUMENT if locale string is invalid
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
- SegmenterOptions options = SegmenterOptions());
+ SegmenterOptions options);
} // namespace language_segmenter_factory
diff --git a/icing/tokenization/language-segmenter-iterator-test-jni-layer.cc b/icing/tokenization/language-segmenter-iterator-test-jni-layer.cc
new file mode 100644
index 0000000..3a94af3
--- /dev/null
+++ b/icing/tokenization/language-segmenter-iterator-test-jni-layer.cc
@@ -0,0 +1,37 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_jni_LanguageSegmenterIteratorJniTest_testsMain(JNIEnv* env,
+ jclass ignored) {
+ g_jenv = env;
+
+ std::vector<char*> my_argv;
+ char arg[] = "jni-test-lib";
+ my_argv.push_back(arg);
+ int argc = 1;
+ char** argv = &(my_argv[0]);
+ testing::InitGoogleTest(&argc, argv);
+ testing::UnitTest::GetInstance()->listeners().Append(
+ new icing::lib::LoggingEventListener());
+ return RUN_ALL_TESTS() == 0;
+}
diff --git a/icing/tokenization/language-segmenter-iterator_test.cc b/icing/tokenization/language-segmenter-iterator_test.cc
index c7b068d..3aff45c 100644
--- a/icing/tokenization/language-segmenter-iterator_test.cc
+++ b/icing/tokenization/language-segmenter-iterator_test.cc
@@ -15,8 +15,10 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
@@ -35,16 +37,23 @@ using ::testing::Eq;
class LanguageSegmenterIteratorTest : public testing::Test {
protected:
void SetUp() override {
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
}
+
+ std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
};
TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
@@ -61,111 +70,135 @@ TEST_F(LanguageSegmenterIteratorTest, AdvanceAndGetTerm) {
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermStartingAfterWithOffsetInText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ ResetToTermStartingAfterUtf32WithOffsetInText) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/0),
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/0),
IsOkAndHolds(3)); // The term " "
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/3),
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/3),
IsOkAndHolds(4)); // The term "bar"
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/4),
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/4),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermStartingAfterWithNegativeOffsetNotOk) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ ResetToTermStartingAfterUtf32WithNegativeOffsetNotOk) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-1),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/-1), IsOk());
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/-100),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/-100), IsOk());
- EXPECT_THAT(iterator->ResetToStart(), IsOkAndHolds(0));
+ EXPECT_THAT(iterator->ResetToStartUtf32(), IsOkAndHolds(0));
EXPECT_THAT(iterator->GetTerm(), Eq("foo"));
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermStartingAfterWithTextLengthOffsetInvalidArgument) {
+ ResetToTermStartingAfterUtf32WithTextLengthOffsetInvalidArgument) {
std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/text.size()),
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/text.length()),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermStartingAfterWithOffsetPastTextLengthInvalidArgument) {
+ ResetToTermStartingAfterUtf32WithOffsetPastTextLengthInvalidArgument) {
std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
- EXPECT_THAT(iterator->ResetToTermStartingAfter(/*offset=*/100),
+ EXPECT_THAT(iterator->ResetToTermStartingAfterUtf32(/*offset=*/100),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
-TEST_F(LanguageSegmenterIteratorTest, ResetToTermEndingBeforeWithOffsetInText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+TEST_F(LanguageSegmenterIteratorTest,
+ ResetToTermEndingBeforeUtf32WithOffsetInText) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/6),
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/6),
IsOkAndHolds(3)); // The term " "
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/3),
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/3),
IsOkAndHolds(0)); // The term "foo"
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/2),
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/2),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermEndingBeforeWithZeroNotFound) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ ResetToTermEndingBeforeUtf32WithZeroNotFound) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
// Zero is a valid argument, but there aren't any terms that end before it.
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/0),
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermEndingBeforeWithNegativeOffsetInvalidArgument) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ ResetToTermEndingBeforeUtf32WithNegativeOffsetInvalidArgument) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator,
language_segmenter->Segment("foo bar"));
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-1),
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/-1),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/-100),
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/-100),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
}
TEST_F(LanguageSegmenterIteratorTest,
- ResetToTermEndingBeforeWithOffsetPastTextEndInvalidArgument) {
+ ResetToTermEndingBeforeUtf32WithOffsetPastTextEndInvalidArgument) {
std::string text = "foo bar";
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(auto iterator, language_segmenter->Segment(text));
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length()),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(iterator->ResetToTermEndingBeforeUtf32(/*offset=*/text.length()),
+ IsOk());
- EXPECT_THAT(iterator->ResetToTermEndingBefore(/*offset=*/text.length() + 1),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
+ EXPECT_THAT(
+ iterator->ResetToTermEndingBeforeUtf32(/*offset=*/text.length() + 1),
+ IsOk());
}
} // namespace
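The revised expectations above loosen the boundary checks on the Utf32 reset
methods: negative offsets to ResetToTermStartingAfterUtf32 and past-the-end
offsets to ResetToTermEndingBeforeUtf32 now succeed, while each method still
rejects the opposite extreme (offsets at or past the end, and negative
offsets, respectively). A minimal sketch of the implied clamping (these
helper names are assumptions, not part of the icing API):

#include <cstdint>

// Any negative offset behaves like "just before the first code point", so
// the reset lands on the first term.
int32_t ClampStartingAfterOffset(int32_t utf32_offset) {
  return utf32_offset < 0 ? -1 : utf32_offset;
}

// Any offset past the end behaves like "at the end of the text", so the
// reset lands on the last term.
int32_t ClampEndingBeforeOffset(int32_t utf32_offset, int32_t utf32_length) {
  return utf32_offset > utf32_length ? utf32_length : utf32_offset;
}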
diff --git a/icing/tokenization/language-segmenter.h b/icing/tokenization/language-segmenter.h
index fdb1846..913386a 100644
--- a/icing/tokenization/language-segmenter.h
+++ b/icing/tokenization/language-segmenter.h
@@ -21,6 +21,8 @@
#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
+#include "icing/util/character-iterator.h"
namespace icing {
namespace lib {
@@ -56,53 +58,100 @@ class LanguageSegmenter {
// true.
virtual std::string_view GetTerm() const = 0;
- // Resets the iterator to point to the first term that starts after offset.
+ // RETURNS:
+ // On success, a CharacterIterator pointing to the beginning of the
+ // current term.
+ // ABORTED if an invalid unicode character is encountered while
+ // calculating the term start.
+ virtual libtextclassifier3::StatusOr<CharacterIterator>
+ CalculateTermStart() {
+ return absl_ports::UnimplementedError("");
+ }
+
+ // RETURNS:
+ // On success, a CharacterIterator pointing just past the end of the
+ // current term.
+ // ABORTED if an invalid unicode character is encountered while
+ // calculating the term end.
+ virtual libtextclassifier3::StatusOr<CharacterIterator>
+ CalculateTermEndExclusive() {
+ return absl_ports::UnimplementedError("");
+ }
+
+  // Resets the iterator to point to the first term that starts after the
+  // given UTF-32 offset.
// GetTerm will now return that term. For example:
//
// language_segmenter = language_segmenter_factory::Create(type);
// iterator = language_segmenter->Segment("foo bar baz");
- // iterator.ResetToTermStartingAfter(4);
+ // iterator.ResetToTermStartingAfterUtf32(4);
// iterator.GetTerm() // returns "baz";
//
// Return types of OK and NOT_FOUND indicate that the function call was
// valid and the state of the iterator has changed. Return type of
- // INVALID_ARGUMENT will leave the iterator unchanged.
+ // INVALID_ARGUMENT will leave the iterator unchanged. Lastly, a return type
+ // of ABORTED means that the iterator may be left in an undefined state and
+ // no longer be usable.
//
// Returns:
- // On success, the starting position of the first term that starts after
+ // On success, the UTF-32 offset of the first term that starts after
// offset.
// NOT_FOUND if an error occurred or there are no terms that start after
// offset.
- // INVALID_ARGUMENT if offset is out of bounds for the provided text.
+ // INVALID_ARGUMENT if offset is beyond the end of the text.
// ABORTED if an invalid unicode character is encountered while
// traversing the text.
- virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
- int32_t offset) = 0;
+ virtual libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
+ int32_t offset) {
+ return absl_ports::UnimplementedError("");
+ }
- // Resets the iterator to point to the first term that ends before offset.
+  // Resets the iterator to point to the first term that ends before the
+  // given UTF-32 offset.
// GetTerm will now return that term. For example:
//
// language_segmenter = language_segmenter_factory::Create(type);
// iterator = language_segmenter->Segment("foo bar baz");
- // iterator.ResetToTermEndingBefore(7);
+ // iterator.ResetToTermEndingBeforeUtf32(7);
// iterator.GetTerm() // returns "bar";
//
// Return types of OK and NOT_FOUND indicate that the function call was
// valid and the state of the iterator has changed. Return type of
- // INVALID_ARGUMENT will leave the iterator unchanged.
+ // INVALID_ARGUMENT will leave the iterator unchanged. Lastly, a return type
+ // of ABORTED means that the iterator may be left in an undefined state and
+ // no longer be usable.
//
// Returns:
- // On success, the starting position of the first term that ends before
+ // On success, the UTF-32 offset of the first term that ends before
// offset.
  //   NOT_FOUND if an error occurred or there are no terms that end before
// offset.
- // INVALID_ARGUMENT if offset is out of bounds for the provided text.
+  //   INVALID_ARGUMENT if offset is negative.
// ABORTED if an invalid unicode character is encountered while
// traversing the text.
- virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
- int32_t offset) = 0;
+ virtual libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
+ int32_t offset) {
+ return absl_ports::UnimplementedError("");
+ }
- virtual libtextclassifier3::StatusOr<int32_t> ResetToStart() = 0;
+ // Resets the iterator to point to the first term.
+ // GetTerm will now return that term. For example:
+ //
+ // language_segmenter = language_segmenter_factory::Create(type);
+ // iterator = language_segmenter->Segment("foo bar baz");
+ // iterator.Advance();
+ // iterator.ResetToStartUtf32();
+ // iterator.GetTerm() // returns "foo";
+ //
+ // Return types of OK and NOT_FOUND indicate that the function call was
+ // valid and the state of the iterator has changed.
+ //
+ // Returns:
+ // On success, the starting position of the first term.
+ // NOT_FOUND if an error occurred or there are no valid terms in the text.
+ // ABORTED if an invalid unicode character is encountered while
+ // traversing the text.
+ virtual libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() = 0;
};
// Segments the input text into terms.
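Pulling the documented examples together, a minimal usage sketch of the Utf32
reset API (error handling elided; the returned terms are taken from the doc
comments above):

// Sketch only: assumes language_segmenter was created via
// language_segmenter_factory::Create as documented above.
auto iterator = language_segmenter->Segment("foo bar baz").ValueOrDie();
iterator->ResetToTermStartingAfterUtf32(4);  // GetTerm() now returns "baz".
iterator->ResetToTermEndingBeforeUtf32(7);   // GetTerm() now returns "bar".
iterator->ResetToStartUtf32();               // GetTerm() now returns "foo".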
diff --git a/icing/tokenization/language-segmenter_benchmark.cc b/icing/tokenization/language-segmenter_benchmark.cc
index 49ddfca..748a322 100644
--- a/icing/tokenization/language-segmenter_benchmark.cc
+++ b/icing/tokenization/language-segmenter_benchmark.cc
@@ -14,19 +14,20 @@
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/transform/normalizer.h"
+#include "unicode/uloc.h"
// Run on a Linux workstation:
// $ blaze build -c opt --dynamic_mode=off --copt=-gmlt
// //icing/tokenization:language-segmenter_benchmark
//
// $ blaze-bin/icing/tokenization/language-segmenter_benchmark
-// --benchmarks=all
+// --benchmark_filter=all
//
// Run on an Android device:
// Make target //icing/tokenization:language-segmenter depend on
@@ -40,7 +41,7 @@
// blaze-bin/icing/tokenization/language-segmenter_benchmark
// /data/local/tmp/
//
-// $ adb shell /data/local/tmp/language-segmenter_benchmark --benchmarks=all
+// $ adb shell /data/local/tmp/language-segmenter_benchmark --benchmark_filter=all
// --adb
// Flag to tell the benchmark that it'll be run on an Android device via adb,
@@ -59,8 +60,9 @@ void BM_SegmentNoSpace(benchmark::State& state) {
GetTestFilePath("icing/icu.dat")));
}
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::string input_string(state.range(0), 'A');
@@ -95,8 +97,9 @@ void BM_SegmentWithSpaces(benchmark::State& state) {
GetTestFilePath("icing/icu.dat")));
}
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::string input_string(state.range(0), 'A');
for (int i = 1; i < input_string.length(); i += 2) {
@@ -134,8 +137,9 @@ void BM_SegmentCJK(benchmark::State& state) {
GetTestFilePath("icing/icu.dat")));
}
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
std::unique_ptr<LanguageSegmenter> language_segmenter =
- language_segmenter_factory::Create().ValueOrDie();
+ language_segmenter_factory::Create(std::move(options)).ValueOrDie();
std::string input_string;
while (input_string.length() < state.range(0)) {
diff --git a/icing/tokenization/plain-tokenizer-test-jni-layer.cc b/icing/tokenization/plain-tokenizer-test-jni-layer.cc
new file mode 100644
index 0000000..efa6427
--- /dev/null
+++ b/icing/tokenization/plain-tokenizer-test-jni-layer.cc
@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_jni_PlainTokenizerJniTest_testsMain(JNIEnv* env, jclass ignored) {
+ g_jenv = env;
+
+ std::vector<char*> my_argv;
+ char arg[] = "jni-test-lib";
+ my_argv.push_back(arg);
+ int argc = 1;
+ char** argv = &(my_argv[0]);
+ testing::InitGoogleTest(&argc, argv);
+ testing::UnitTest::GetInstance()->listeners().Append(
+ new icing::lib::LoggingEventListener());
+ return RUN_ALL_TESTS() == 0;
+}
diff --git a/icing/tokenization/plain-tokenizer.cc b/icing/tokenization/plain-tokenizer.cc
index 6e54af9..d40022b 100644
--- a/icing/tokenization/plain-tokenizer.cc
+++ b/icing/tokenization/plain-tokenizer.cc
@@ -14,10 +14,13 @@
#include "icing/tokenization/plain-tokenizer.h"
+#include <algorithm>
#include <cstdint>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
@@ -63,15 +66,26 @@ class PlainTokenIterator : public Tokenizer::Iterator {
return found_next_valid_term;
}
- Token GetToken() const override {
- if (current_term_.empty()) {
- return Token(Token::INVALID);
+ std::vector<Token> GetTokens() const override {
+ std::vector<Token> result;
+ if (!current_term_.empty()) {
+ result.push_back(Token(Token::Type::REGULAR, current_term_));
}
- return Token(Token::REGULAR, current_term_);
+ return result;
}
- bool ResetToTokenAfter(int32_t offset) override {
- if (!base_iterator_->ResetToTermStartingAfter(offset).ok()) {
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
+ override {
+ return base_iterator_->CalculateTermStart();
+ }
+
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
+ override {
+ return base_iterator_->CalculateTermEndExclusive();
+ }
+
+ bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
+ if (!base_iterator_->ResetToTermStartingAfterUtf32(utf32_offset).ok()) {
return false;
}
current_term_ = base_iterator_->GetTerm();
@@ -82,22 +96,24 @@ class PlainTokenIterator : public Tokenizer::Iterator {
return true;
}
- bool ResetToTokenBefore(int32_t offset) override {
+ bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
ICING_ASSIGN_OR_RETURN(
- offset, base_iterator_->ResetToTermEndingBefore(offset), false);
+ utf32_offset,
+ base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
current_term_ = base_iterator_->GetTerm();
while (!IsValidTerm(current_term_)) {
// Haven't found a valid term yet. Retrieve the term prior to this one
// from the segmenter.
ICING_ASSIGN_OR_RETURN(
- offset, base_iterator_->ResetToTermEndingBefore(offset), false);
+ utf32_offset,
+ base_iterator_->ResetToTermEndingBeforeUtf32(utf32_offset), false);
current_term_ = base_iterator_->GetTerm();
}
return true;
}
bool ResetToStart() override {
- if (!base_iterator_->ResetToStart().ok()) {
+ if (!base_iterator_->ResetToStartUtf32().ok()) {
return false;
}
current_term_ = base_iterator_->GetTerm();
@@ -127,7 +143,8 @@ libtextclassifier3::StatusOr<std::vector<Token>> PlainTokenizer::TokenizeAll(
Tokenize(text));
std::vector<Token> tokens;
while (iterator->Advance()) {
- tokens.push_back(iterator->GetToken());
+ std::vector<Token> batch_tokens = iterator->GetTokens();
+ tokens.insert(tokens.end(), batch_tokens.begin(), batch_tokens.end());
}
return tokens;
}
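With GetToken() replaced by the batch-oriented GetTokens(), callers
accumulate the tokens produced by each Advance(), exactly as TokenizeAll does
above. A minimal caller-side sketch (setup elided; PlainTokenizer yields at
most one token per batch, but other tokenizers may yield several):

// Sketch only: plain_tokenizer is assumed to be created via
// tokenizer_factory::CreateIndexingTokenizer as in the tests below.
ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
                       plain_tokenizer->Tokenize("Hello World"));
std::vector<Token> all_tokens;
while (iterator->Advance()) {
  std::vector<Token> batch = iterator->GetTokens();
  all_tokens.insert(all_tokens.end(), batch.begin(), batch.end());
}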
diff --git a/icing/tokenization/plain-tokenizer_test.cc b/icing/tokenization/plain-tokenizer_test.cc
index f2fc678..6c426da 100644
--- a/icing/tokenization/plain-tokenizer_test.cc
+++ b/icing/tokenization/plain-tokenizer_test.cc
@@ -18,12 +18,15 @@
#include "gmock/gmock.h"
#include "icing/absl_ports/str_cat.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/jni-test-helpers.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -34,59 +37,111 @@ using ::testing::IsEmpty;
class PlainTokenizerTest : public ::testing::Test {
protected:
void SetUp() override {
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
}
+
+ std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
};
TEST_F(PlainTokenizerTest, CreationWithNullPointerShouldFail) {
- EXPECT_THAT(
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, /*lang_segmenter=*/nullptr),
- StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+ EXPECT_THAT(tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ /*lang_segmenter=*/nullptr),
+ StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
+}
+
+TEST_F(PlainTokenizerTest, NoTokensBeforeAdvancing) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ ICING_ASSERT_OK_AND_ASSIGN(auto token_iterator,
+ plain_tokenizer->Tokenize(kText));
+
+  // GetTokens() should return nothing when called before the first Advance().
+ EXPECT_THAT(token_iterator->GetTokens(), IsEmpty());
+}
+
+TEST_F(PlainTokenizerTest, LastTokenAfterFullyAdvanced) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ ICING_ASSERT_OK_AND_ASSIGN(auto token_iterator,
+ plain_tokenizer->Tokenize(kText));
+
+ while (token_iterator->Advance()) {}
+
+  // After Advance() returns false, GetTokens() keeps returning the last token.
+ EXPECT_THAT(token_iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::REGULAR, "!")));
}
TEST_F(PlainTokenizerTest, Simple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
EXPECT_THAT(plain_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty()));
- EXPECT_THAT(plain_tokenizer->TokenizeAll("Hello World"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("Hello World"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
EXPECT_THAT(
plain_tokenizer->TokenizeAll(
"Lorem ipsum dolor sit amet, consectetur adipiscing elit. "
"Duis efficitur iaculis auctor."),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Lorem"),
- EqualsToken(Token::REGULAR, "ipsum"),
- EqualsToken(Token::REGULAR, "dolor"),
- EqualsToken(Token::REGULAR, "sit"),
- EqualsToken(Token::REGULAR, "amet"),
- EqualsToken(Token::REGULAR, "consectetur"),
- EqualsToken(Token::REGULAR, "adipiscing"),
- EqualsToken(Token::REGULAR, "elit"),
- EqualsToken(Token::REGULAR, "Duis"),
- EqualsToken(Token::REGULAR, "efficitur"),
- EqualsToken(Token::REGULAR, "iaculis"),
- EqualsToken(Token::REGULAR, "auctor"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Lorem"),
+ EqualsToken(Token::Type::REGULAR, "ipsum"),
+ EqualsToken(Token::Type::REGULAR, "dolor"),
+ EqualsToken(Token::Type::REGULAR, "sit"),
+ EqualsToken(Token::Type::REGULAR, "amet"),
+ EqualsToken(Token::Type::REGULAR, "consectetur"),
+ EqualsToken(Token::Type::REGULAR, "adipiscing"),
+ EqualsToken(Token::Type::REGULAR, "elit"),
+ EqualsToken(Token::Type::REGULAR, "Duis"),
+ EqualsToken(Token::Type::REGULAR, "efficitur"),
+ EqualsToken(Token::Type::REGULAR, "iaculis"),
+ EqualsToken(Token::Type::REGULAR, "auctor"))));
}
TEST_F(PlainTokenizerTest, Whitespace) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
  // There are many Unicode whitespace characters; here we use tabs to stand
  // in for the rest.
@@ -94,168 +149,249 @@ TEST_F(PlainTokenizerTest, Whitespace) {
  // 0x0009 is a horizontal tab, which is considered whitespace
std::string text_with_horizontal_tab =
absl_ports::StrCat("Hello", UCharToString(0x0009), "World");
- EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_horizontal_tab),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(text_with_horizontal_tab),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
  // 0x000B is a vertical tab, which is considered whitespace
std::string text_with_vertical_tab =
absl_ports::StrCat("Hello", UCharToString(0x000B), "World");
- EXPECT_THAT(plain_tokenizer->TokenizeAll(text_with_vertical_tab),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(text_with_vertical_tab),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
}
TEST_F(PlainTokenizerTest, Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
// Half-width punctuation marks are filtered out.
- EXPECT_THAT(plain_tokenizer->TokenizeAll(
- "Hello, World! Hello: World. \"Hello\" World?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"),
- EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"),
- EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(
+ "Hello, World! Hello: World. \"Hello\" World?"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"),
+ EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"),
+ EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
// Full-width punctuation marks are filtered out.
- EXPECT_THAT(
- plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "你好"),
- EqualsToken(Token::REGULAR, "世界"),
- EqualsToken(Token::REGULAR, "你好"),
- EqualsToken(Token::REGULAR, "世界"),
- EqualsToken(Token::REGULAR, "你好"),
- EqualsToken(Token::REGULAR, "世界"))));
+ std::vector<std::string_view> exp_tokens;
+ if (IsCfStringTokenization()) {
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你"),
+ EqualsToken(Token::Type::REGULAR, "好"),
+ EqualsToken(Token::Type::REGULAR, "世界"),
+ EqualsToken(Token::Type::REGULAR, "你"),
+ EqualsToken(Token::Type::REGULAR, "好"),
+ EqualsToken(Token::Type::REGULAR, "世界"),
+ EqualsToken(Token::Type::REGULAR, "你"),
+ EqualsToken(Token::Type::REGULAR, "好"),
+ EqualsToken(Token::Type::REGULAR, "世界"))));
+ } else {
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("你好,世界!你好:世界。“你好”世界?"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "你好"),
+ EqualsToken(Token::Type::REGULAR, "世界"),
+ EqualsToken(Token::Type::REGULAR, "你好"),
+ EqualsToken(Token::Type::REGULAR, "世界"),
+ EqualsToken(Token::Type::REGULAR, "你好"),
+ EqualsToken(Token::Type::REGULAR, "世界"))));
+ }
}
TEST_F(PlainTokenizerTest, SpecialCharacters) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
  // Right now we don't have special logic for these characters; we just
  // output them as tokens.
- EXPECT_THAT(plain_tokenizer->TokenizeAll("1+1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "1"),
- EqualsToken(Token::REGULAR, "+"),
- EqualsToken(Token::REGULAR, "1"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("1+1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "1"),
+ EqualsToken(Token::Type::REGULAR, "+"),
+ EqualsToken(Token::Type::REGULAR, "1"))));
- EXPECT_THAT(plain_tokenizer->TokenizeAll("$50"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "$"),
- EqualsToken(Token::REGULAR, "50"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("$50"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "$"),
+ EqualsToken(Token::Type::REGULAR, "50"))));
}
TEST_F(PlainTokenizerTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
- ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
-
  // In the plain tokenizer, CJKT characters are handled the same way as
  // non-CJKT characters; these tests are just sanity checks.
-
// Chinese
- EXPECT_THAT(plain_tokenizer->TokenizeAll("我每天走路去上班。"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "我"),
- EqualsToken(Token::REGULAR, "每天"),
- EqualsToken(Token::REGULAR, "走路"),
- EqualsToken(Token::REGULAR, "去"),
- EqualsToken(Token::REGULAR, "上班"))));
- // Japanese
+ language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
EXPECT_THAT(
- plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "私"), EqualsToken(Token::REGULAR, "は"),
- EqualsToken(Token::REGULAR, "毎日"),
- EqualsToken(Token::REGULAR, "仕事"),
- EqualsToken(Token::REGULAR, "に"), EqualsToken(Token::REGULAR, "歩"),
- EqualsToken(Token::REGULAR, "い"),
- EqualsToken(Token::REGULAR, "てい"),
- EqualsToken(Token::REGULAR, "ます"))));
+ plain_tokenizer->TokenizeAll("我每天走路去上班。"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "我"),
+ EqualsToken(Token::Type::REGULAR, "每天"),
+ EqualsToken(Token::Type::REGULAR, "走路"),
+ EqualsToken(Token::Type::REGULAR, "去"),
+ EqualsToken(Token::Type::REGULAR, "上班"))));
+ // Japanese
+ options = language_segmenter_factory::SegmenterOptions(ULOC_JAPANESE,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
+ if (IsCfStringTokenization()) {
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"),
+ EqualsToken(Token::Type::REGULAR, "は"),
+ EqualsToken(Token::Type::REGULAR, "毎日"),
+ EqualsToken(Token::Type::REGULAR, "仕事"),
+ EqualsToken(Token::Type::REGULAR, "に"),
+ EqualsToken(Token::Type::REGULAR, "歩い"),
+ EqualsToken(Token::Type::REGULAR, "て"),
+ EqualsToken(Token::Type::REGULAR, "い"),
+ EqualsToken(Token::Type::REGULAR, "ます"))));
+ } else {
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("私は毎日仕事に歩いています。"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "私"),
+ EqualsToken(Token::Type::REGULAR, "は"),
+ EqualsToken(Token::Type::REGULAR, "毎日"),
+ EqualsToken(Token::Type::REGULAR, "仕事"),
+ EqualsToken(Token::Type::REGULAR, "に"),
+ EqualsToken(Token::Type::REGULAR, "歩"),
+ EqualsToken(Token::Type::REGULAR, "い"),
+ EqualsToken(Token::Type::REGULAR, "てい"),
+ EqualsToken(Token::Type::REGULAR, "ます"))));
+ }
+
// Khmer
- EXPECT_THAT(plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ញុំ"),
- EqualsToken(Token::REGULAR, "ដើរទៅ"),
- EqualsToken(Token::REGULAR, "ធ្វើការ"),
- EqualsToken(Token::REGULAR, "រាល់ថ្ងៃ"))));
- // Korean
EXPECT_THAT(
- plain_tokenizer->TokenizeAll("나는 매일 출근합니다."),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "나는"),
- EqualsToken(Token::REGULAR, "매일"),
- EqualsToken(Token::REGULAR, "출근합니다"))));
+ plain_tokenizer->TokenizeAll("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ញុំ"),
+ EqualsToken(Token::Type::REGULAR, "ដើរទៅ"),
+ EqualsToken(Token::Type::REGULAR, "ធ្វើការ"),
+ EqualsToken(Token::Type::REGULAR, "រាល់ថ្ងៃ"))));
+ // Korean
+ EXPECT_THAT(plain_tokenizer->TokenizeAll("나는 매일 출근합니다."),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::REGULAR, "나는"),
+ EqualsToken(Token::Type::REGULAR, "매일"),
+ EqualsToken(Token::Type::REGULAR, "출근합니다"))));
// Thai
- EXPECT_THAT(plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "ฉัน"),
- EqualsToken(Token::REGULAR, "เดิน"),
- EqualsToken(Token::REGULAR, "ไป"),
- EqualsToken(Token::REGULAR, "ทำงาน"),
- EqualsToken(Token::REGULAR, "ทุก"),
- EqualsToken(Token::REGULAR, "วัน"))));
+  // DIFFERENCE: the CFString and ICU tokenizers disagree over how to segment
+  // "ทุกวัน" (iOS groups it into a single term). This difference persists even
+  // when the locale is set to THAI.
+ if (IsCfStringTokenization()) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<Token> tokens,
+ plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"));
+
+ EXPECT_THAT(tokens, ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"),
+ EqualsToken(Token::Type::REGULAR, "เดิน"),
+ EqualsToken(Token::Type::REGULAR, "ไป"),
+ EqualsToken(Token::Type::REGULAR, "ทำงาน"),
+ EqualsToken(Token::Type::REGULAR, "ทุกวัน")));
+ } else {
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll("ฉันเดินไปทำงานทุกวัน"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "ฉัน"),
+ EqualsToken(Token::Type::REGULAR, "เดิน"),
+ EqualsToken(Token::Type::REGULAR, "ไป"),
+ EqualsToken(Token::Type::REGULAR, "ทำงาน"),
+ EqualsToken(Token::Type::REGULAR, "ทุก"),
+ EqualsToken(Token::Type::REGULAR, "วัน"))));
+ }
}
-TEST_F(PlainTokenizerTest, ResetToTokenAfterSimple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+TEST_F(PlainTokenizerTest, ResetToTokenStartingAfterSimple) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
constexpr std::string_view kText = "f b";
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
- EXPECT_TRUE(iterator->ResetToTokenAfter(0));
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "b"));
+ EXPECT_TRUE(iterator->ResetToTokenStartingAfter(0));
+ EXPECT_THAT(iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::REGULAR, "b")));
- EXPECT_FALSE(iterator->ResetToTokenAfter(2));
+ EXPECT_FALSE(iterator->ResetToTokenStartingAfter(2));
}
-TEST_F(PlainTokenizerTest, ResetToTokenBeforeSimple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+TEST_F(PlainTokenizerTest, ResetToTokenEndingBeforeSimple) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
constexpr std::string_view kText = "f b";
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
- EXPECT_TRUE(iterator->ResetToTokenBefore(2));
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "f"));
+ EXPECT_TRUE(iterator->ResetToTokenEndingBefore(2));
+ EXPECT_THAT(iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::REGULAR, "f")));
- EXPECT_FALSE(iterator->ResetToTokenBefore(0));
+ EXPECT_FALSE(iterator->ResetToTokenEndingBefore(0));
}
-TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+TEST_F(PlainTokenizerTest, ResetToTokenStartingAfter) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
constexpr std::string_view kText = " foo . bar baz.. bat ";
- EXPECT_THAT(plain_tokenizer->TokenizeAll(kText),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"),
- EqualsToken(Token::REGULAR, "bar"),
- EqualsToken(Token::REGULAR, "baz"),
- EqualsToken(Token::REGULAR, "bat"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(kText),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"),
+ EqualsToken(Token::Type::REGULAR, "bar"),
+ EqualsToken(Token::Type::REGULAR, "baz"),
+ EqualsToken(Token::Type::REGULAR, "bat"))));
std::vector<std::string> expected_text = {
"foo", // 0: " foo . bar"
"bar", // 1: "foo . bar "
@@ -278,32 +414,38 @@ TEST_F(PlainTokenizerTest, ResetToTokenAfter) {
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
EXPECT_TRUE(iterator->Advance());
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo"));
+ EXPECT_THAT(iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::REGULAR, "foo")));
for (int i = 0; i < kText.length(); ++i) {
if (i < expected_text.size()) {
- EXPECT_TRUE(iterator->ResetToTokenAfter(i));
- EXPECT_THAT(iterator->GetToken(),
- EqualsToken(Token::REGULAR, expected_text[i]));
+ EXPECT_TRUE(iterator->ResetToTokenStartingAfter(i));
+ EXPECT_THAT(
+ iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::REGULAR, expected_text[i])));
} else {
- EXPECT_FALSE(iterator->ResetToTokenAfter(i));
+ EXPECT_FALSE(iterator->ResetToTokenStartingAfter(i));
}
}
}
-TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+TEST_F(PlainTokenizerTest, ResetToTokenEndingBefore) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
ICING_ASSERT_OK_AND_ASSIGN(
- std::unique_ptr<Tokenizer> plain_tokenizer,
- tokenizer_factory::CreateIndexingTokenizer(
- IndexingConfig::TokenizerType::PLAIN, language_segmenter.get()));
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> plain_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::PLAIN,
+ language_segmenter.get()));
constexpr std::string_view kText = " foo . bar baz.. bat ";
- EXPECT_THAT(plain_tokenizer->TokenizeAll(kText),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "foo"),
- EqualsToken(Token::REGULAR, "bar"),
- EqualsToken(Token::REGULAR, "baz"),
- EqualsToken(Token::REGULAR, "bat"))));
+ EXPECT_THAT(
+ plain_tokenizer->TokenizeAll(kText),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "foo"),
+ EqualsToken(Token::Type::REGULAR, "bar"),
+ EqualsToken(Token::Type::REGULAR, "baz"),
+ EqualsToken(Token::Type::REGULAR, "bat"))));
std::vector<std::string> expected_text = {
"bat", // 20: "baz.. bat "
"baz", // 19: " baz.. bat"
@@ -326,15 +468,17 @@ TEST_F(PlainTokenizerTest, ResetToTokenBefore) {
auto iterator = plain_tokenizer->Tokenize(kText).ValueOrDie();
EXPECT_TRUE(iterator->Advance());
- EXPECT_THAT(iterator->GetToken(), EqualsToken(Token::REGULAR, "foo"));
+ EXPECT_THAT(iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::REGULAR, "foo")));
for (int i = kText.length() - 1; i >= 0; --i) {
int expected_index = kText.length() - 1 - i;
if (expected_index < expected_text.size()) {
- EXPECT_TRUE(iterator->ResetToTokenBefore(i));
- EXPECT_THAT(iterator->GetToken(),
- EqualsToken(Token::REGULAR, expected_text[expected_index]));
+ EXPECT_TRUE(iterator->ResetToTokenEndingBefore(i));
+ EXPECT_THAT(iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::REGULAR,
+ expected_text[expected_index])));
} else {
- EXPECT_FALSE(iterator->ResetToTokenBefore(i));
+ EXPECT_FALSE(iterator->ResetToTokenEndingBefore(i));
}
}
}
diff --git a/icing/tokenization/raw-query-tokenizer.cc b/icing/tokenization/raw-query-tokenizer.cc
index 8b2edc9..1dcbf9b 100644
--- a/icing/tokenization/raw-query-tokenizer.cc
+++ b/icing/tokenization/raw-query-tokenizer.cc
@@ -14,9 +14,8 @@
#include "icing/tokenization/raw-query-tokenizer.h"
-#include <stddef.h>
-
#include <cctype>
+#include <cstddef>
#include <memory>
#include <string>
#include <string_view>
@@ -26,6 +25,9 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/absl_ports/str_join.h"
+#include "icing/schema/property-util.h"
+#include "icing/schema/schema-util.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer.h"
@@ -70,7 +72,7 @@ constexpr char kColon = ':';
constexpr char kLeftParentheses = '(';
constexpr char kRightParentheses = ')';
constexpr char kExclusion = '-';
-constexpr char kOrOperator[] = "OR";
+constexpr std::string_view kOrOperator = "OR";
enum State {
// Ready to process any terms
@@ -100,10 +102,14 @@ enum State {
// When seeing right parentheses
CLOSING_PARENTHESES = 8,
+ PROCESSING_NON_ASCII_ALPHANUMERIC_TERM = 9,
+
+ PROCESSING_PROPERTY_TERM_APPENDING = 10,
+
// Valid state count
- STATE_COUNT = 9,
+ STATE_COUNT = 11,
- INVALID = 10
+ INVALID = 12
};
enum TermType {
@@ -111,27 +117,29 @@ enum TermType {
WHITESPACE = 0,
// A term that consists of unicode alphabetic and numeric characters
- ALPHANUMERIC_TERM = 1,
+ ASCII_ALPHANUMERIC_TERM = 1,
+
+ NON_ASCII_ALPHANUMERIC_TERM = 2,
// "("
- LEFT_PARENTHESES = 2,
+ LEFT_PARENTHESES = 3,
// ")"
- RIGHT_PARENTHESES = 3,
+ RIGHT_PARENTHESES = 4,
// "-"
- EXCLUSION_OPERATOR = 4,
+ EXCLUSION_OPERATOR = 5,
// "OR"
- OR_OPERATOR = 5,
+ OR_OPERATOR = 6,
// ":"
- COLON = 6,
+ COLON = 7,
// All the other characters seen that are not the types above
- OTHER = 7,
+ OTHER = 8,
- TYPE_COUNT = 8
+ TYPE_COUNT = 9
};
enum ActionOrError {
@@ -145,6 +153,9 @@ enum ActionOrError {
// Ignore / throw away the current term
IGNORE = 2,
+ // Concatenate with next term
+ CONCATENATE = 3,
+
// Errors
ERROR_UNKNOWN = 100,
ERROR_NO_WHITESPACE_AROUND_OR = 101,
@@ -154,6 +165,7 @@ enum ActionOrError {
ERROR_EXCLUSION_PROPERTY_TOGETHER = 105,
ERROR_EXCLUSION_OR_TOGETHER = 106,
ERROR_PROPERTY_OR_TOGETHER = 107,
+ ERROR_NON_ASCII_AS_PROPERTY_NAME = 108,
};
std::string_view GetErrorMessage(ActionOrError maybe_error) {
@@ -175,6 +187,8 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
return "Exclusion and OR operators can't be used together";
case ERROR_PROPERTY_OR_TOGETHER:
return "Property restriction and OR operators can't be used together";
+ case ERROR_NON_ASCII_AS_PROPERTY_NAME:
+ return "Characters in property name must all be ASCII.";
default:
return "";
}
@@ -186,7 +200,7 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
// States:
//
// READY = 0
-// PROCESSING_ALPHANUMERIC_TERM = 1
+// PROCESSING_ASCII_ALPHANUMERIC_TERM = 1
// PROCESSING_EXCLUSION = 2
// PROCESSING_EXCLUSION_TERM = 3
// PROCESSING_PROPERTY_RESTRICT = 4
@@ -194,24 +208,28 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
// PROCESSING_OR = 6
// OPENING_PARENTHESES = 7
// CLOSING_PARENTHESES = 8
+// PROCESSING_NON_ASCII_ALPHANUMERIC_TERM = 9
+// PROCESSING_PROPERTY_TERM_APPENDING = 10
//
// Actions:
//
// OUTPUT = a
// KEEP = b
// IGNORE = c
+// CONCATENATE = d, concatenate the current term and the new term.
//
-// ========================================================
-// Transition Table || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
-// ===========================================================================
-// WHITESPACE || 0,c | 0,a | 0,c | 0,a | 0,a | 0,a | 0,a | 0,a | 0,a |
-// ALPHANUMERIC_TERM || 1,c | 1,a | 3,a | 1,a | 5,a | 1,a |ERROR| 1,a | 1,a |
-// LEFT_PARENTHESES || 7,c | 7,a |ERROR| 7,a |ERROR| 7,a | 7,a | 7,a | 7,a |
-// RIGHT_PARENTHESES || 8,c | 8,a | 8,c | 8,a | 8,a | 8,a | 8,c | 8,a | 8,a |
-// EXCLUSION_OPERATOR || 2,c | 0,a | 2,c | 0,a |ERROR| 0,a |ERROR| 2,a | 2,a |
-// OR_OPERATOR || 6,c |ERROR|ERROR|ERROR|ERROR|ERROR|ERROR| 7,b | 6,a |
-// COLON || 0,c | 4,b |ERROR|ERROR| 4,b | 0,a |ERROR| 0,a |ERROR|
-// OTHER || 0,c | 0,a | 0,c | 0,a | 0,a | 0,a | 0,a | 0,a | 0,a |
+// =============================================================================
+// Transition || 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
+// =============================================================================
+// WHITESPACE || 0,c| 0,a| 0,c| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a|
+// ASCII_ALPHA || 1,c| 1,d| 3,a| 1,a| 5,a| 1,a|ERR | 1,a| 1,a| 1,a|10,d|
+// NONASCII_ALPHA || 9,c| 9,a| 3,a| 9,a| 5,a| 9,a|ERR | 9,a| 9,a| 9,a|10,d|
+// LEFT_PAREN || 7,c| 7,a|ERR | 7,a|ERR | 7,a| 7,a| 7,a| 7,a| 7,a| 7,a|
+// RIGHT_PAREN || 8,c| 8,a| 8,c| 8,a| 8,a| 8,a| 8,c| 8,a| 8,a| 8,a| 8,a|
+// EXCLUSION_OP || 2,c| 0,a| 2,c| 0,a|ERR | 0,a|ERR | 2,a| 2,a| 0,a| 0,a|
+// OR_OPERATOR || 6,c|ERR |ERR |ERR |ERR |ERR |ERR | 7,b| 6,a|ERR |ERR |
+// COLON || 0,c| 4,b|ERR |ERR | 4,b|10,d|ERR | 0,a|ERR |ERR |10,d|
+// OTHER || 0,c| 0,a| 0,c| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a| 0,a|
//
// Each cell is a rule that consists of 4 things:
// [current state] + [next term type] -> [new state] + [action]
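+// Example: in the table above, the cell at row COLON, column 1 is "4,b", i.e.
+// PROCESSING_ASCII_ALPHANUMERIC_TERM + COLON ->
+// PROCESSING_PROPERTY_RESTRICT + KEEP: the term seen so far is kept because
+// it may turn out to be a property name.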
@@ -228,39 +246,56 @@ std::string_view GetErrorMessage(ActionOrError maybe_error) {
//
// NOTE: Please update the state transition table above if this is updated.
//
-// TODO(samzheng): support syntax "-property1:term1", right now we don't allow
+// TODO(tjbarron): support syntax "-property1:term1", right now we don't allow
// exclusion and property restriction applied on the same term.
// TODO(b/141007791): figure out how we'd like to support special characters
// like "+", "&", "@", "#" in indexing and query tokenizers.
constexpr State state_transition_rules[STATE_COUNT][TYPE_COUNT] = {
/*State: Ready*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+ {READY, PROCESSING_ALPHANUMERIC_TERM,
+ PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, READY, READY},
/*State: PROCESSING_ALPHANUMERIC_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+ {READY, PROCESSING_ALPHANUMERIC_TERM,
+ PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
CLOSING_PARENTHESES, READY, INVALID, PROCESSING_PROPERTY_RESTRICT, READY},
/*State: PROCESSING_EXCLUSION*/
- {READY, PROCESSING_EXCLUSION_TERM, INVALID, CLOSING_PARENTHESES,
- PROCESSING_EXCLUSION, INVALID, INVALID, READY},
+ {READY, PROCESSING_EXCLUSION_TERM, PROCESSING_EXCLUSION_TERM, INVALID,
+ CLOSING_PARENTHESES, PROCESSING_EXCLUSION, INVALID, INVALID, READY},
/*State: PROCESSING_EXCLUSION_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+ {READY, PROCESSING_ALPHANUMERIC_TERM,
+ PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
CLOSING_PARENTHESES, READY, INVALID, INVALID, READY},
/*State: PROCESSING_PROPERTY_RESTRICT*/
- {READY, PROCESSING_PROPERTY_TERM, INVALID, CLOSING_PARENTHESES, INVALID,
- INVALID, PROCESSING_PROPERTY_RESTRICT, READY},
+ {READY, PROCESSING_PROPERTY_TERM, PROCESSING_PROPERTY_TERM, INVALID,
+ CLOSING_PARENTHESES, INVALID, INVALID, PROCESSING_PROPERTY_RESTRICT,
+ READY},
/*State: PROCESSING_PROPERTY_TERM*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
- CLOSING_PARENTHESES, READY, INVALID, READY, READY},
+ {READY, PROCESSING_ALPHANUMERIC_TERM,
+ PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+ CLOSING_PARENTHESES, READY, INVALID, PROCESSING_PROPERTY_TERM_APPENDING,
+ READY},
/*State: PROCESSING_OR*/
- {READY, INVALID, OPENING_PARENTHESES, CLOSING_PARENTHESES, INVALID, INVALID,
- INVALID, READY},
+ {READY, INVALID, INVALID, OPENING_PARENTHESES, CLOSING_PARENTHESES, INVALID,
+ INVALID, INVALID, READY},
/*State: OPENING_PARENTHESES*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+ {READY, PROCESSING_ALPHANUMERIC_TERM,
+ PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
CLOSING_PARENTHESES, PROCESSING_EXCLUSION, OPENING_PARENTHESES, READY,
READY},
/*State: CLOSING_PARENTHESES*/
- {READY, PROCESSING_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
- CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, INVALID, READY}};
+ {READY, PROCESSING_ALPHANUMERIC_TERM,
+ PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+ CLOSING_PARENTHESES, PROCESSING_EXCLUSION, PROCESSING_OR, INVALID, READY},
+ /*State: PROCESSING_NON_ASCII_ALPHANUMERIC_TERM*/
+ {READY, PROCESSING_ALPHANUMERIC_TERM,
+ PROCESSING_NON_ASCII_ALPHANUMERIC_TERM, OPENING_PARENTHESES,
+ CLOSING_PARENTHESES, READY, INVALID, INVALID, READY},
+ /*State: PROCESSING_PROPERTY_TERM_APPENDING*/
+ {READY, PROCESSING_PROPERTY_TERM_APPENDING,
+ PROCESSING_PROPERTY_TERM_APPENDING, OPENING_PARENTHESES,
+ CLOSING_PARENTHESES, READY, INVALID, PROCESSING_PROPERTY_TERM_APPENDING,
+ READY}};
// We use a 2D array to encode the action rules,
// The value of action_rules[state1][term_type1] means "what action we need to
@@ -269,62 +304,150 @@ constexpr State state_transition_rules[STATE_COUNT][TYPE_COUNT] = {
// NOTE: Please update the state transition table above if this is updated.
constexpr ActionOrError action_rules[STATE_COUNT][TYPE_COUNT] = {
/*State: Ready*/
- {IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE},
+ {IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE, IGNORE},
/*State: PROCESSING_ALPHANUMERIC_TERM*/
- {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR,
- KEEP, OUTPUT},
+ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
+ ERROR_NO_WHITESPACE_AROUND_OR, KEEP, OUTPUT},
/*State: PROCESSING_EXCLUSION*/
- {IGNORE, OUTPUT, ERROR_GROUP_AFTER_EXCLUSION, IGNORE, IGNORE,
+ {IGNORE, OUTPUT, OUTPUT, ERROR_GROUP_AFTER_EXCLUSION, IGNORE, IGNORE,
ERROR_EXCLUSION_OR_TOGETHER, ERROR_EXCLUSION_PROPERTY_TOGETHER, IGNORE},
/*State: PROCESSING_EXCLUSION_TERM*/
- {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR,
- ERROR_EXCLUSION_PROPERTY_TOGETHER, OUTPUT},
+ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
+ ERROR_NO_WHITESPACE_AROUND_OR, ERROR_EXCLUSION_PROPERTY_TOGETHER, OUTPUT},
/*State: PROCESSING_PROPERTY_RESTRICT*/
- {OUTPUT, OUTPUT, ERROR_GROUP_AFTER_PROPERTY_RESTRICTION, OUTPUT,
+ {OUTPUT, OUTPUT, OUTPUT, ERROR_GROUP_AFTER_PROPERTY_RESTRICTION, OUTPUT,
ERROR_EXCLUSION_PROPERTY_TOGETHER, ERROR_PROPERTY_OR_TOGETHER, KEEP,
OUTPUT},
/*State: PROCESSING_PROPERTY_TERM*/
- {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR,
- OUTPUT, OUTPUT},
+ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
+ ERROR_NO_WHITESPACE_AROUND_OR, CONCATENATE, OUTPUT},
/*State: PROCESSING_OR*/
- {OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR, OUTPUT, IGNORE,
- ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NO_WHITESPACE_AROUND_OR,
- ERROR_NO_WHITESPACE_AROUND_OR, OUTPUT},
+ {OUTPUT, ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NO_WHITESPACE_AROUND_OR,
+ OUTPUT, IGNORE, ERROR_NO_WHITESPACE_AROUND_OR,
+ ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NO_WHITESPACE_AROUND_OR, OUTPUT},
/*State: OPENING_PARENTHESES*/
- {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, KEEP, OUTPUT, OUTPUT},
+ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, KEEP, OUTPUT, OUTPUT},
/*State: CLOSING_PARENTHESES*/
+ {OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
+ ERROR_GROUP_AS_PROPERTY_NAME, OUTPUT},
+ /*State: PROCESSING_NON_ASCII_ALPHANUMERIC_TERM*/
{OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT, OUTPUT,
- ERROR_GROUP_AS_PROPERTY_NAME, OUTPUT}};
-
-// Helper function to get the TermType of the input term.
-TermType GetTermType(std::string_view term) {
- if (term.length() == 1) {
- // Must be an ASCII char
- const char& first_term_char = term[0];
- if (first_term_char == kWhitespace) {
- return WHITESPACE;
- } else if (first_term_char == kColon) {
- return COLON;
- } else if (first_term_char == kLeftParentheses) {
- return LEFT_PARENTHESES;
- } else if (first_term_char == kRightParentheses) {
- return RIGHT_PARENTHESES;
- } else if (first_term_char == kExclusion) {
- return EXCLUSION_OPERATOR;
- }
- } else if (term.length() == 2 && term == kOrOperator) {
- return OR_OPERATOR;
+ ERROR_NO_WHITESPACE_AROUND_OR, ERROR_NON_ASCII_AS_PROPERTY_NAME, OUTPUT},
+ /*State: PROCESSING_PROPERTY_TERM_APPENDING*/
+ {OUTPUT, CONCATENATE, CONCATENATE, OUTPUT, OUTPUT, OUTPUT,
+ ERROR_NO_WHITESPACE_AROUND_OR, CONCATENATE, OUTPUT}};
+
+// Determines the length of the whitespace term beginning at text[pos] and
+// returns a pair with the WHITESPACE TermType and a string_view of the
+// whitespace term.
+std::pair<TermType, std::string_view> GetWhitespaceTerm(std::string_view text,
+ size_t pos) {
+ size_t cur = pos;
+ while (cur < text.length() && text[cur] == kWhitespace) {
+ ++cur;
+ }
+ return std::make_pair(WHITESPACE, text.substr(pos, cur - pos));
+}
+
+TermType GetContentTermType(std::string_view text, size_t pos) {
+ if (i18n_utils::IsPunctuationAt(text, pos)) {
+ return OTHER;
+ } else if (i18n_utils::IsAscii(text[pos])) {
+ return ASCII_ALPHANUMERIC_TERM;
+ }
+ return NON_ASCII_ALPHANUMERIC_TERM;
+}
+
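+// Returns true for term types that may carry searchable content (and so may
+// need to be segmented), and false for operators and separators.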
+bool IsContentTermType(TermType term_type) {
+ switch (term_type) {
+ case ASCII_ALPHANUMERIC_TERM:
+ [[fallthrough]];
+ case NON_ASCII_ALPHANUMERIC_TERM:
+ [[fallthrough]];
+ case OTHER:
+ return true;
+ case WHITESPACE:
+ [[fallthrough]];
+ case LEFT_PARENTHESES:
+ [[fallthrough]];
+ case RIGHT_PARENTHESES:
+ [[fallthrough]];
+ case EXCLUSION_OPERATOR:
+ [[fallthrough]];
+ case OR_OPERATOR:
+ [[fallthrough]];
+ case COLON:
+ [[fallthrough]];
+ case TYPE_COUNT:
+ return false;
}
+}
+
+// Determines the length of the potential content term beginning at text[pos]
+// and returns a pair with the appropriate TermType and a string_view of the
+// content term.
+//
+// NOTE: The potential content term could be multiple content terms
+// (segmentation is needed to determine this) or a property restriction
+// (depending on other neighboring tokens). It could also be multiple content
+// terms surrounding an OR operator (segmentation is also needed to determine
+// this).
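+//
+// Example (illustrative): for text "foo:bar" and pos 0, the scan stops at the
+// colon and the result is (ASCII_ALPHANUMERIC_TERM, "foo").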
+std::pair<TermType, std::string_view> GetContentTerm(std::string_view text,
+ size_t pos) {
+ size_t len = 0;
// Checks the first char to see if it's an ASCII term
- if (i18n_utils::IsAscii(term[0])) {
- if (std::isalnum(term[0])) {
- return ALPHANUMERIC_TERM;
+ TermType type = GetContentTermType(text, pos);
+ for (size_t cur = pos; cur < text.length() && len == 0; ++cur) {
+ switch (text[cur]) {
+ case kLeftParentheses:
+ [[fallthrough]];
+ case kRightParentheses:
+ [[fallthrough]];
+ case kExclusion:
+ [[fallthrough]];
+ case kWhitespace:
+ [[fallthrough]];
+ case kColon:
+        // If we reach any of our special characters (parentheses, exclusion,
+        // whitespace or colon), then we've reached the end of the content
+        // term. Set len, which also ends the loop.
+ len = cur - pos;
+ break;
+ default:
+ break;
}
- return OTHER;
}
- // All non-ASCII terms are alphabetic since language segmenter already
- // filters out non-ASCII and non-alphabetic terms
- return ALPHANUMERIC_TERM;
+ if (len == 0) {
+ // If len isn't set, then we must have reached the end of the string.
+ len = text.length() - pos;
+ }
+ return std::make_pair(type, text.substr(pos, len));
+}
+
+// Determines the type and length of the term beginning at text[pos].
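+// Example (illustrative): for text "-foo", GetTerm(text, 0) returns
+// (EXCLUSION_OPERATOR, "-") and GetTerm(text, 1) returns
+// (ASCII_ALPHANUMERIC_TERM, "foo").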
+std::pair<TermType, std::string_view> GetTerm(std::string_view text,
+ size_t pos) {
+ switch (text[pos]) {
+ case kLeftParentheses:
+ return std::make_pair(LEFT_PARENTHESES, text.substr(pos, 1));
+ case kRightParentheses:
+ return std::make_pair(RIGHT_PARENTHESES, text.substr(pos, 1));
+ case kExclusion:
+ return std::make_pair(EXCLUSION_OPERATOR, text.substr(pos, 1));
+ case kWhitespace:
+ // Get length of whitespace
+ return GetWhitespaceTerm(text, pos);
+ case kColon:
+ return std::make_pair(COLON, text.substr(pos, 1));
+ case kOrOperator[0]:
+ if (text.length() >= pos + kOrOperator.length() &&
+ text.substr(pos, kOrOperator.length()) == kOrOperator) {
+ return std::make_pair(OR_OPERATOR,
+ text.substr(pos, kOrOperator.length()));
+ }
+ [[fallthrough]];
+ default:
+ return GetContentTerm(text, pos);
+ }
}
// Helper function to remove the last token if it's OR operator. This is used to
@@ -332,7 +455,7 @@ TermType GetTermType(std::string_view term) {
// and [(cat OR)]. This helps assert extra rule 3: "OR" is ignored if there's no
// valid token on its right.
void RemoveLastTokenIfOrOperator(std::vector<Token>* tokens) {
- if (!tokens->empty() && tokens->back().type == Token::QUERY_OR) {
+ if (!tokens->empty() && tokens->back().type == Token::Type::QUERY_OR) {
tokens->pop_back();
}
}
@@ -346,11 +469,11 @@ libtextclassifier3::Status OutputOrOperatorToken(std::vector<Token>* tokens) {
}
Token::Type last_token_type = tokens->back().type;
switch (last_token_type) {
- case Token::REGULAR:
- case Token::QUERY_RIGHT_PARENTHESES:
- tokens->emplace_back(Token::QUERY_OR);
+ case Token::Type::REGULAR:
+ case Token::Type::QUERY_RIGHT_PARENTHESES:
+ tokens->emplace_back(Token::Type::QUERY_OR);
break;
- case Token::QUERY_OR:
+ case Token::Type::QUERY_OR:
// Ignores "OR" because there's already an "OR", e.g. "term1 OR OR term2"
break;
default:
@@ -378,28 +501,34 @@ libtextclassifier3::Status OutputToken(State new_state,
TermType current_term_type,
std::vector<Token>* tokens) {
switch (current_term_type) {
- case ALPHANUMERIC_TERM:
+ case ASCII_ALPHANUMERIC_TERM:
+ [[fallthrough]];
+ case NON_ASCII_ALPHANUMERIC_TERM:
if (new_state == PROCESSING_PROPERTY_TERM) {
- // Asserts extra rule 1: property name must be in ASCII
- if (!i18n_utils::IsAscii(current_term[0])) {
- return absl_ports::InvalidArgumentError(
- "Characters in property name must all be ASCII.");
+ // Asserts extra rule 1: each property name in the property path is a
+ // valid term.
+ for (std::string_view property :
+ property_util::SplitPropertyPathExpr(current_term)) {
+ if (!SchemaUtil::ValidatePropertyName(property).ok()) {
+ return absl_ports::InvalidArgumentError(
+ GetErrorMessage(ERROR_NON_ASCII_AS_PROPERTY_NAME));
+ }
}
- tokens->emplace_back(Token::QUERY_PROPERTY, current_term);
+ tokens->emplace_back(Token::Type::QUERY_PROPERTY, current_term);
} else {
- tokens->emplace_back(Token::REGULAR, current_term);
+ tokens->emplace_back(Token::Type::REGULAR, current_term);
}
break;
case LEFT_PARENTHESES:
- tokens->emplace_back(Token::QUERY_LEFT_PARENTHESES);
+ tokens->emplace_back(Token::Type::QUERY_LEFT_PARENTHESES);
break;
case RIGHT_PARENTHESES:
// Ignores "OR" if it's followed by right parentheses.
RemoveLastTokenIfOrOperator(tokens);
- tokens->emplace_back(Token::QUERY_RIGHT_PARENTHESES);
+ tokens->emplace_back(Token::Type::QUERY_RIGHT_PARENTHESES);
break;
case EXCLUSION_OPERATOR:
- tokens->emplace_back(Token::QUERY_EXCLUSION);
+ tokens->emplace_back(Token::Type::QUERY_EXCLUSION);
break;
case OR_OPERATOR:
return OutputOrOperatorToken(tokens);
@@ -416,13 +545,11 @@ libtextclassifier3::Status OutputToken(State new_state,
// Returns:
// OK on success
// INVALID_ARGUMENT with error message on invalid query syntax
-libtextclassifier3::Status ProcessTerm(State* current_state,
- std::string_view* current_term,
- TermType* current_term_type,
- int* unclosed_parentheses_count,
- const std::string_view next_term,
- TermType next_term_type,
- std::vector<Token>* tokens) {
+libtextclassifier3::Status ProcessTerm(
+ State* current_state, std::string_view* current_term,
+ TermType* current_term_type, int* unclosed_parentheses_count,
+ const std::string_view next_term, TermType next_term_type,
+ const LanguageSegmenter* language_segmenter, std::vector<Token>* tokens) {
// Asserts extra rule 4: parentheses must appear in pairs.
if (next_term_type == LEFT_PARENTHESES) {
++(*unclosed_parentheses_count);
@@ -440,8 +567,23 @@ libtextclassifier3::Status ProcessTerm(State* current_state,
}
switch (action_or_error) {
case OUTPUT:
- ICING_RETURN_IF_ERROR(
- OutputToken(new_state, *current_term, *current_term_type, tokens));
+ if (*current_state == PROCESSING_PROPERTY_TERM_APPENDING) {
+      // We appended multiple terms together in case they were joined by
+      // colon connectors (e.g. "foo:bar" in "property1:foo:bar").
+      // Re-segment the appended term and output each content term.
+ ICING_ASSIGN_OR_RETURN(std::vector<std::string_view> content_terms,
+ language_segmenter->GetAllTerms(*current_term));
+ for (std::string_view term : content_terms) {
+ TermType type = GetContentTermType(term, 0);
+ if (type == OTHER) {
+ // Skip OTHER tokens here.
+ continue;
+ }
+ ICING_RETURN_IF_ERROR(OutputToken(new_state, term, type, tokens));
+ }
+ } else {
+ ICING_RETURN_IF_ERROR(
+ OutputToken(new_state, *current_term, *current_term_type, tokens));
+ }
[[fallthrough]];
case IGNORE:
*current_term = next_term;
@@ -449,6 +591,11 @@ libtextclassifier3::Status ProcessTerm(State* current_state,
break;
case KEEP:
break;
+ case CONCATENATE:
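+      // current_term and next_term are both views into the same backing query
+      // text, so extend current_term through the end of next_term (including
+      // any characters, such as a joining colon, between them).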
+ *current_term = std::string_view(
+ current_term->data(),
+ next_term.data() - current_term->data() + next_term.length());
+ break;
default:
return absl_ports::InvalidArgumentError(GetErrorMessage(ERROR_UNKNOWN));
}
@@ -463,56 +610,50 @@ libtextclassifier3::Status ProcessTerm(State* current_state,
// A list of tokens on success
// INVALID_ARGUMENT with error message on invalid query syntax
libtextclassifier3::StatusOr<std::vector<Token>> ProcessTerms(
- std::unique_ptr<LanguageSegmenter::Iterator> base_iterator) {
+ const LanguageSegmenter* language_segmenter,
+ std::vector<std::pair<TermType, std::string_view>> prescanned_terms) {
std::vector<Token> tokens;
State current_state = READY;
std::string_view current_term;
TermType current_term_type;
int unclosed_parentheses_count = 0;
- while (base_iterator->Advance()) {
- const std::string_view next_term = base_iterator->GetTerm();
- size_t colon_position = next_term.find(kColon);
- // Since colon ":" is a word connector per ICU's rule
- // (https://unicode.org/reports/tr29/#Word_Boundaries), strings like
- // "foo:bar" are returned by LanguageSegmenter as one term. Here we're
- // trying to find the first colon as it represents property restriction in
- // raw query.
- if (colon_position == std::string_view::npos) {
- // No colon found
- ICING_RETURN_IF_ERROR(ProcessTerm(&current_state, &current_term,
- &current_term_type,
- &unclosed_parentheses_count, next_term,
- GetTermType(next_term), &tokens));
- } else if (next_term.size() == 1 && next_term[0] == kColon) {
- // The whole term is a colon
+ for (int i = 0; i < prescanned_terms.size(); ++i) {
+ const std::pair<TermType, std::string_view>& prescanned_term =
+ prescanned_terms.at(i);
+ if (!IsContentTermType(prescanned_term.first)) {
+ // This can't be a property restrict. Just pass it in.
ICING_RETURN_IF_ERROR(
ProcessTerm(&current_state, &current_term, &current_term_type,
- &unclosed_parentheses_count, next_term, COLON, &tokens));
+ &unclosed_parentheses_count, prescanned_term.second,
+ prescanned_term.first, language_segmenter, &tokens));
} else {
- // String before the colon is the property name
- std::string_view property_name = next_term.substr(0, colon_position);
- ICING_RETURN_IF_ERROR(
- ProcessTerm(&current_state, &current_term, &current_term_type,
- &unclosed_parentheses_count, property_name,
- GetTermType(property_name), &tokens));
- ICING_RETURN_IF_ERROR(
- ProcessTerm(&current_state, &current_term, &current_term_type,
- &unclosed_parentheses_count, std::string_view(&kColon, 1),
- COLON, &tokens));
- // String after the colon is the term that property restriction is applied
- // on.
- std::string_view property_term = next_term.substr(colon_position + 1);
- ICING_RETURN_IF_ERROR(
- ProcessTerm(&current_state, &current_term, &current_term_type,
- &unclosed_parentheses_count, property_term,
- GetTermType(property_term), &tokens));
+      // This is a content term, so we need to segment it.
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<std::string_view> content_terms,
+ language_segmenter->GetAllTerms(prescanned_term.second));
+ for (std::string_view term : content_terms) {
+ TermType type = GetContentTermType(term, 0);
+ if (term == kOrOperator) {
+ // TODO(tjbarron) Decide whether we should revise this and other
+ // handled syntax. This is used to allow queries like "term1,OR,term2"
+ // to succeed. It's not clear if we should allow this or require
+ // clients to ensure that OR operators are always surrounded by
+ // whitespace.
+ // Override the type if this is actually an OR operator.
+ type = OR_OPERATOR;
+ }
+ ICING_RETURN_IF_ERROR(ProcessTerm(&current_state, &current_term,
+ &current_term_type,
+ &unclosed_parentheses_count, term,
+ type, language_segmenter, &tokens));
+ }
}
}
// Adds a fake whitespace at the end to flush the last term.
- ICING_RETURN_IF_ERROR(
- ProcessTerm(&current_state, &current_term, &current_term_type,
- &unclosed_parentheses_count,
- std::string_view(&kWhitespace, 1), WHITESPACE, &tokens));
+ ICING_RETURN_IF_ERROR(ProcessTerm(
+ &current_state, &current_term, &current_term_type,
+ &unclosed_parentheses_count, std::string_view(&kWhitespace, 1),
+ WHITESPACE, language_segmenter, &tokens));
if (unclosed_parentheses_count > 0) {
return absl_ports::InvalidArgumentError("Unclosed left parentheses.");
}
@@ -531,11 +672,14 @@ class RawQueryTokenIterator : public Tokenizer::Iterator {
bool Advance() override { return ++current_ < tokens_.size(); }
- Token GetToken() const override {
- if (current_ < 0 || current_ >= tokens_.size()) {
- return Token(Token::INVALID);
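+  // Returns a vector holding the single token at the current position, or an
+  // empty vector if Advance() has not yet been called or has returned false.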
+ std::vector<Token> GetTokens() const override {
+ std::vector<Token> result;
+
+ if (current_ >= 0 && current_ < tokens_.size()) {
+ result.push_back(tokens_.at(current_));
}
- return tokens_.at(current_);
+
+ return result;
}
private:
@@ -553,10 +697,16 @@ RawQueryTokenizer::Tokenize(std::string_view text) const {
libtextclassifier3::StatusOr<std::vector<Token>> RawQueryTokenizer::TokenizeAll(
std::string_view text) const {
- ICING_ASSIGN_OR_RETURN(
- std::unique_ptr<LanguageSegmenter::Iterator> base_iterator,
- language_segmenter_.Segment(text));
- return ProcessTerms(std::move(base_iterator));
+ // 1. Prescan all terms in the text, to determine which ones are potentially
+ // content and which ones are not.
+ std::vector<std::pair<TermType, std::string_view>> prescanned_terms;
+ for (size_t pos = 0; pos < text.length();) {
+ std::pair<TermType, std::string_view> term_pair = GetTerm(text, pos);
+ pos += term_pair.second.length();
+ prescanned_terms.push_back(term_pair);
+ }
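+  // For example (illustrative), the text "foo:bar baz" prescans to
+  // [(ASCII_ALPHANUMERIC_TERM, "foo"), (COLON, ":"),
+  //  (ASCII_ALPHANUMERIC_TERM, "bar"), (WHITESPACE, " "),
+  //  (ASCII_ALPHANUMERIC_TERM, "baz")].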
+ // 2. Process the prescanned terms, segmenting content terms as needed.
+ return ProcessTerms(&language_segmenter_, std::move(prescanned_terms));
}
} // namespace lib
diff --git a/icing/tokenization/raw-query-tokenizer_test.cc b/icing/tokenization/raw-query-tokenizer_test.cc
index 351f7c1..39cc0ed 100644
--- a/icing/tokenization/raw-query-tokenizer_test.cc
+++ b/icing/tokenization/raw-query-tokenizer_test.cc
@@ -16,26 +16,31 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
+#include "icing/portable/platform.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
namespace {
using ::testing::ElementsAre;
+using ::testing::IsEmpty;
using ::testing::HasSubstr;
class RawQueryTokenizerTest : public ::testing::Test {
protected:
void SetUp() override {
- ICING_ASSERT_OK(
- // File generated via icu_data_file rule in //icing/BUILD.
- icu_data_file_helper::SetUpICUDataFile(
- GetTestFilePath("icing/icu.dat")));
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
}
};
@@ -44,106 +49,168 @@ TEST_F(RawQueryTokenizerTest, CreationWithNullPointerShouldFail) {
tokenizer_factory::RAW_QUERY, /*lang_segmenter=*/nullptr),
StatusIs(libtextclassifier3::StatusCode::FAILED_PRECONDITION));
}
+TEST_F(RawQueryTokenizerTest, NoTokensBeforeAdvancing) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> raw_query_tokenizer,
+ tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
+ language_segmenter.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ ICING_ASSERT_OK_AND_ASSIGN(auto token_iterator,
+ raw_query_tokenizer->Tokenize(kText));
+
+  // We should get no tokens if we ask for tokens before advancing.
+ EXPECT_THAT(token_iterator->GetTokens(), IsEmpty());
+}
TEST_F(RawQueryTokenizerTest, Simple) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("Hello World!"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "Hello"),
- EqualsToken(Token::REGULAR, "World"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("Hello World!"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "World"))));
+
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("hElLo WORLD"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "hElLo"),
+ EqualsToken(Token::Type::REGULAR, "WORLD"))));
}
-TEST_F(RawQueryTokenizerTest, Parentheses) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+TEST_F(RawQueryTokenizerTest, Emoji) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("()"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
-
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( )"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("😊 Hello! Goodbye?"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "😊"),
+ EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "Goodbye"))));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 term2)"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("Hello😊 ! Goodbye?"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "Hello"),
+ EqualsToken(Token::Type::REGULAR, "😊"),
+ EqualsToken(Token::Type::REGULAR, "Goodbye"))));
+}
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term3"),
- EqualsToken(Token::REGULAR, "term4"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
-
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1(term2)"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+TEST_F(RawQueryTokenizerTest, Parentheses) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Tokenizer> raw_query_tokenizer,
+ tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
+ language_segmenter.get()));
+ ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
+ raw_query_tokenizer->TokenizeAll("()"));
EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("(term1)term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ raw_query_tokenizer->TokenizeAll("( )"));
+ EXPECT_THAT(
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ raw_query_tokenizer->TokenizeAll("(term1 term2)"));
EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("(term1)-term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
+ ICING_ASSERT_OK_AND_ASSIGN(
+ query_tokens,
+ raw_query_tokenizer->TokenizeAll("((term1 term2) (term3 term4))"));
EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term3"),
+ EqualsToken(Token::Type::REGULAR, "term4"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
+
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ raw_query_tokenizer->TokenizeAll("term1(term2)"));
+ EXPECT_THAT(
+ query_tokens,
+ ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
+
+ ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
+ raw_query_tokenizer->TokenizeAll("(term1)term2"));
+ EXPECT_THAT(query_tokens,
+ ElementsAre(EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2")));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)(term2)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)-term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1)OR(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1):term2"),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
@@ -158,52 +225,59 @@ TEST_F(RawQueryTokenizerTest, Parentheses) {
HasSubstr("Too many right parentheses")));
}
-TEST_F(RawQueryTokenizerTest, Exclustion) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+TEST_F(RawQueryTokenizerTest, Exclusion) {
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(-term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Exclusion operator is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("- term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("- term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// Exclusion operator is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1- term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1- term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
// Exclusion operator is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 -)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// First exclusion operator is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("--term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("--term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
// First "-" is exclusion operator, second is not and will be discarded.
// In other words, exclusion only applies to the term right after it.
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1-term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-term1-term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-(term1)"),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
@@ -226,74 +300,94 @@ TEST_F(RawQueryTokenizerTest, Exclustion) {
}
TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(property1:term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Colon is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll(":term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll(":term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// Colon is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(:term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Colon is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1:"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1:"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// property name can be a path
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("email.title:hello"),
- IsOkAndHolds(
- ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "email.title"),
- EqualsToken(Token::REGULAR, "hello"))));
-
- // The first colon ":" triggers property restriction, the second colon is used
- // as a word connector per ICU's rule
- // (https://unicode.org/reports/tr29/#Word_Boundaries).
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property:foo:bar"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property"),
- EqualsToken(Token::REGULAR, "foo:bar"))));
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "email.title"),
+ EqualsToken(Token::Type::REGULAR, "hello"))));
+
+  // The first colon ":" triggers property restriction. Before ICU 72, ':' was
+  // considered a word connector, so the second ':' is interpreted as a
+  // connector. In ICU 72 and above, ':' is no longer considered a connector.
+ // TODO(b/254874614): Handle colon word breaks in ICU 72+
+ if (GetIcuTokenizationVersion() >= 72) {
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property:foo:bar"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property"),
+ EqualsToken(Token::Type::REGULAR, "foo"),
+ EqualsToken(Token::Type::REGULAR, "bar"))));
+ } else {
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property:foo:bar"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property"),
+ EqualsToken(Token::Type::REGULAR, "foo:bar"))));
+ }
// Property restriction only applies to the term right after it.
// Note: "term1:term2" is not a term but 2 terms because word connectors
// don't apply to numbers and alphabets.
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:term1:term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1:term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:term1-"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:今天:天气"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::REGULAR, "天气"))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1-"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
  // Multiple consecutive colons will still be recognized as a property
  // restriction operator
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1::term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1::term1"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("property1:(term1)"),
@@ -314,112 +408,118 @@ TEST_F(RawQueryTokenizerTest, PropertyRestriction) {
}
TEST_F(RawQueryTokenizerTest, OR) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1 OR term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
  // Two consecutive "OR"s are treated as one
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
-
EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("(term1) OR term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ raw_query_tokenizer->TokenizeAll("term1 OR OR term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1) OR term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR (term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("((term1) OR (term2))"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Only "OR" (all in uppercase) is the operator
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("term1 or term2 Or term3 oR term4"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "or"),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::REGULAR, "Or"),
- EqualsToken(Token::REGULAR, "term3"),
- EqualsToken(Token::REGULAR, "oR"),
- EqualsToken(Token::REGULAR, "term4"))));
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "or"),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::REGULAR, "Or"),
+ EqualsToken(Token::Type::REGULAR, "term3"),
+ EqualsToken(Token::Type::REGULAR, "oR"),
+ EqualsToken(Token::Type::REGULAR, "term4"))));
// "OR" is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("OR term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("OR term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// "OR" is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1 OR"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(OR term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR term1)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(term1 OR )"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// "OR" is ignored
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("( OR )"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1 OR(term2)"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term2"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("term1 OR-term2"),
@@ -435,34 +535,56 @@ TEST_F(RawQueryTokenizerTest, OR) {
// CJKT are treated the same way by language segmenter and raw tokenizer, so
// here we test Chinese and Japanese to represent CJKT.
TEST_F(RawQueryTokenizerTest, CJKT) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
// Exclusion only applies to the term right after it.
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-今天天气很好"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "今天"),
- EqualsToken(Token::REGULAR, "天气"),
- EqualsToken(Token::REGULAR, "很好"))));
+ if (IsCfStringTokenization()) {
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-今天天气很好"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::REGULAR, "天气"),
+ EqualsToken(Token::Type::REGULAR, "很"),
+ EqualsToken(Token::Type::REGULAR, "好"))));
+ } else {
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-今天天气很好"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::REGULAR, "天气"),
+ EqualsToken(Token::Type::REGULAR, "很好"))));
+ }
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:你好"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "你好"))));
+ if (IsCfStringTokenization()) {
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "你"),
+ EqualsToken(Token::Type::REGULAR, "好"))));
+ } else {
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:你好"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "你好"))));
+ }
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("标题:你好"),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
HasSubstr("Characters in property name must all be ASCII")));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "cat"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "ねこ"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("cat OR ねこ"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "cat"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "ねこ"))));
EXPECT_THAT(
raw_query_tokenizer->TokenizeAll("cat ORねこ"),
@@ -488,73 +610,104 @@ TEST_F(RawQueryTokenizerTest, CJKT) {
// The raw tokenizer classifies all characters it doesn't recognize as the
// OTHER type, so we can choose comma "," to represent all OTHER characters.
TEST_F(RawQueryTokenizerTest, OtherChars) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
// Comma is ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll(",term1, ,"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll(",term1, ,"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
EXPECT_THAT(raw_query_tokenizer->TokenizeAll("(,term1),"),
IsOkAndHolds(ElementsAre(
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
// Exclusion operator and comma are ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-,term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-,term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"))));
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("-term1,"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "term1"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("-term1,"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
// Colon and comma are ignored
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:,term1"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "property1"),
- EqualsToken(Token::REGULAR, "term1"))));
-
EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll("property1:term1,term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::QUERY_PROPERTY, "property1"),
- EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::REGULAR, "term2"))));
+ raw_query_tokenizer->TokenizeAll("property1:,term1"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"))));
+
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll("property1:term1,term2"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::QUERY_PROPERTY, "property1"),
+ EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
   // This is a special case for OR: unknown chars before and after OR are
   // treated the same as whitespaces.
- EXPECT_THAT(raw_query_tokenizer->TokenizeAll("term1,OR,term2"),
- IsOkAndHolds(ElementsAre(EqualsToken(Token::REGULAR, "term1"),
- EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::REGULAR, "term2"))));
+ EXPECT_THAT(
+ raw_query_tokenizer->TokenizeAll("term1,OR,term2"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::REGULAR, "term1"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::REGULAR, "term2"))));
}
TEST_F(RawQueryTokenizerTest, Mix) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(std::move(options)));
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Tokenizer> raw_query_tokenizer,
tokenizer_factory::CreateQueryTokenizer(tokenizer_factory::RAW_QUERY,
language_segmenter.get()));
- EXPECT_THAT(
- raw_query_tokenizer->TokenizeAll(
- "こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"),
- IsOkAndHolds(ElementsAre(
- EqualsToken(Token::REGULAR, "こんにちは"),
- EqualsToken(Token::REGULAR, "good"),
- EqualsToken(Token::REGULAR, "afternoon"),
- EqualsToken(Token::QUERY_PROPERTY, "title"),
- EqualsToken(Token::REGULAR, "今天"), EqualsToken(Token::QUERY_OR, ""),
- EqualsToken(Token::QUERY_LEFT_PARENTHESES, ""),
- EqualsToken(Token::REGULAR, "ใน"), EqualsToken(Token::REGULAR, "วัน"),
- EqualsToken(Token::REGULAR, "นี้"),
- EqualsToken(Token::QUERY_EXCLUSION, ""),
- EqualsToken(Token::REGULAR, "B12"),
- EqualsToken(Token::QUERY_RIGHT_PARENTHESES, ""))));
+ if (IsCfStringTokenization()) {
+ EXPECT_THAT(raw_query_tokenizer->TokenizeAll(
+ "こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::REGULAR, "こんにちは"),
+ EqualsToken(Token::Type::REGULAR, "good"),
+ EqualsToken(Token::Type::REGULAR, "afternoon"),
+ EqualsToken(Token::Type::QUERY_PROPERTY, "title"),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "ใน"),
+ EqualsToken(Token::Type::REGULAR, "วันนี้"),
+ EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "B12"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, ""))));
+ } else {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<Token> tokens,
+ raw_query_tokenizer->TokenizeAll(
+ "こんにちはgood afternoon, title:今天 OR (ในวันนี้ -B12)"));
+ EXPECT_THAT(
+ tokens,
+ ElementsAre(EqualsToken(Token::Type::REGULAR, "こんにちは"),
+ EqualsToken(Token::Type::REGULAR, "good"),
+ EqualsToken(Token::Type::REGULAR, "afternoon"),
+ EqualsToken(Token::Type::QUERY_PROPERTY, "title"),
+ EqualsToken(Token::Type::REGULAR, "今天"),
+ EqualsToken(Token::Type::QUERY_OR, ""),
+ EqualsToken(Token::Type::QUERY_LEFT_PARENTHESES, ""),
+ EqualsToken(Token::Type::REGULAR, "ใน"),
+ EqualsToken(Token::Type::REGULAR, "วัน"),
+ EqualsToken(Token::Type::REGULAR, "นี้"),
+ EqualsToken(Token::Type::QUERY_EXCLUSION, ""),
+ EqualsToken(Token::Type::REGULAR, "B12"),
+ EqualsToken(Token::Type::QUERY_RIGHT_PARENTHESES, "")));
+ }
}
} // namespace
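These expectations are built on the EqualsToken matcher from icing/testing/common-matchers.h, which this diff does not show. A plausible sketch of its shape, assuming Token exposes type and text members (the real definition may differ):

    // Hypothetical sketch of the EqualsToken matcher used in the tests above;
    // the real one lives in icing/testing/common-matchers.h.
    MATCHER_P2(EqualsToken, type, text, "") {
      return arg.type == type && arg.text == text;
    }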
diff --git a/icing/jni/reverse-jni-break-iterator.cc b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
index 1a8a799..dbd7f5a 100644
--- a/icing/jni/reverse-jni-break-iterator.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.cc
@@ -12,20 +12,20 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/jni/reverse-jni-break-iterator.h"
+#include "icing/tokenization/reverse_jni/reverse-jni-break-iterator.h"
#include <jni.h>
-#include <math.h>
#include <cassert>
#include <cctype>
+#include <cmath>
#include <map>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/text_classifier/lib3/utils/java/jni-base.h"
#include "icing/text_classifier/lib3/utils/java/jni-helper.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/jni/jni-cache.h"
#include "icing/util/status-macros.h"
namespace icing {
diff --git a/icing/jni/reverse-jni-break-iterator.h b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h
index c1f05f4..537666c 100644
--- a/icing/jni/reverse-jni-break-iterator.h
+++ b/icing/tokenization/reverse_jni/reverse-jni-break-iterator.h
@@ -12,16 +12,16 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#ifndef ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
-#define ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
+#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
+#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
#include <jni.h>
#include <queue>
#include <string>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/java/jni-base.h"
+#include "icing/jni/jni-cache.h"
namespace icing {
namespace lib {
@@ -121,4 +121,4 @@ class ReverseJniBreakIterator {
} // namespace lib
} // namespace icing
-#endif // ICING_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
+#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_BREAK_ITERATOR_H_
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
index f79bc68..a251f90 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-factory.cc
@@ -13,9 +13,11 @@
// limitations under the License.
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/jni/jni-cache.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h"
#include "icing/util/logging.h"
+#include "unicode/uloc.h"
namespace icing {
namespace lib {
@@ -32,7 +34,7 @@ constexpr std::string_view kLocaleAmericanEnglishComputer = "en_US_POSIX";
// A LanguageSegmenter on success
// INVALID_ARGUMENT if locale string is invalid
//
-// TODO(samzheng): Figure out if we want to verify locale strings and notify
+// TODO(b/156383798): Figure out if we want to verify locale strings and notify
// users. Right now illegal locale strings will be ignored by ICU. ICU
// components will be created with its default locale.
libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc
new file mode 100644
index 0000000..5f5202c
--- /dev/null
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test-jni-layer.cc
@@ -0,0 +1,37 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <jni.h>
+
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "icing/testing/logging-event-listener.h"
+
+// Global variable used so that the test implementation can access the JNIEnv.
+JNIEnv* g_jenv = nullptr;
+
+extern "C" JNIEXPORT jboolean JNICALL
+Java_icing_jni_ReverseJniLanguageSegmenterJniTest_testsMain(JNIEnv* env,
+ jclass ignored) {
+ g_jenv = env;
+
+ std::vector<char*> my_argv;
+ char arg[] = "jni-test-lib";
+ my_argv.push_back(arg);
+ int argc = 1;
+ char** argv = &(my_argv[0]);
+ testing::InitGoogleTest(&argc, argv);
+ testing::UnitTest::GetInstance()->listeners().Append(
+ new icing::lib::LoggingEventListener());
+ return RUN_ALL_TESTS() == 0;
+}
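The helper that consumes g_jenv is GetTestJniCache() from icing/testing/jni-test-helpers.h, which this diff does not include. A minimal sketch of its plausible shape, mirroring the initialization that the fixture header deleted below performed inline:

    #include <jni.h>

    #include <memory>

    #include "icing/jni/jni-cache.h"

    extern JNIEnv* g_jenv;  // Set by the JNI test entry point above.

    // Hypothetical sketch; the real helper lives in jni-test-helpers.h.
    std::unique_ptr<const icing::lib::JniCache> GetTestJniCache() {
      return icing::lib::JniCache::Create(g_jenv).ValueOrDie();
    }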
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h
deleted file mode 100644
index 64b68ec..0000000
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h
+++ /dev/null
@@ -1,46 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
-#define ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
-
-#include <jni.h>
-
-#include "icing/jni/jni-cache.h"
-#include "gtest/gtest.h"
-
-extern JNIEnv* g_jenv;
-
-namespace icing {
-namespace lib {
-
-namespace test_internal {
-
-class ReverseJniLanguageSegmenterTest
- : public testing::TestWithParam<const char*> {
- protected:
- ReverseJniLanguageSegmenterTest()
- : jni_cache_(std::move(JniCache::Create(g_jenv)).ValueOrDie()) {}
-
- static std::string GetLocale() { return GetParam(); }
-
- std::unique_ptr<JniCache> jni_cache_;
-};
-
-} // namespace test_internal
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_TOKENIZATION_REVERSE_JNI_REVERSE_JNI_LANGUAGE_SEGMENTER_TEST_H_
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
index 2256022..bd80718 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.cc
@@ -19,169 +19,18 @@
#include <string>
#include <string_view>
-#include "icing/jni/reverse-jni-break-iterator.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/absl_ports/canonical_errors.h"
#include "icing/legacy/core/icing-string-util.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/reverse_jni/reverse-jni-break-iterator.h"
+#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/status-macros.h"
namespace icing {
namespace lib {
-namespace {
-
-// Returns the lead byte of the UTF-8 character that includes the byte at
-// current_byte_index within it.
-int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
- while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
- --current_byte_index;
- }
- return current_byte_index;
-}
-
-class CharacterIterator {
- public:
- explicit CharacterIterator(std::string_view text)
- : CharacterIterator(text, 0, 0) {}
- CharacterIterator(std::string_view text, int utf8_index, int utf16_index)
- : text_(text), utf8_index_(utf8_index), utf16_index_(utf16_index) {}
-
- // Moves from current position to the character that includes the specified
- // UTF-8 index.
- // REQUIRES: desired_utf8_index <= text_.length()
- // desired_utf8_index is allowed to point one index past the end, but no
- // further.
- bool AdvanceToUtf8(int desired_utf8_index) {
- if (desired_utf8_index > text_.length()) {
- // Enforce the requirement.
- return false;
- }
- // Need to work forwards.
- while (utf8_index_ < desired_utf8_index) {
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- int utf8_length = i18n_utils::GetUtf8Length(uchar32);
- if (utf8_index_ + utf8_length > desired_utf8_index) {
- // Ah! Don't go too far!
- break;
- }
- utf8_index_ += utf8_length;
- utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
- }
- return true;
- }
-
- // Moves from current position to the character that includes the specified
- // UTF-8 index.
- // REQUIRES: 0 <= desired_utf8_index
- bool RewindToUtf8(int desired_utf8_index) {
- if (desired_utf8_index < 0) {
- // Enforce the requirement.
- return false;
- }
- // Need to work backwards.
- while (utf8_index_ > desired_utf8_index) {
- --utf8_index_;
- utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
- if (utf8_index_ < 0) {
- // Somehow, there wasn't a single UTF-8 lead byte at
- // requested_byte_index or an earlier byte.
- return false;
- }
- // We've found the start of a unicode char!
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
- }
- return true;
- }
-
- // Advances current position to desired_utf16_index.
- // REQUIRES: desired_utf16_index <= text_.utf16_length()
- // desired_utf16_index is allowed to point one index past the end, but no
- // further.
- bool AdvanceToUtf16(int desired_utf16_index) {
- while (utf16_index_ < desired_utf16_index) {
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- int utf16_length = i18n_utils::GetUtf16Length(uchar32);
- if (utf16_index_ + utf16_length > desired_utf16_index) {
- // Ah! Don't go too far!
- break;
- }
- int utf8_length = i18n_utils::GetUtf8Length(uchar32);
- if (utf8_index_ + utf8_length > text_.length()) {
- // Enforce the requirement.
- return false;
- }
- utf8_index_ += utf8_length;
- utf16_index_ += utf16_length;
- }
- return true;
- }
-
- // Rewinds current position to desired_utf16_index.
- // REQUIRES: 0 <= desired_utf16_index
- bool RewindToUtf16(int desired_utf16_index) {
- if (desired_utf16_index < 0) {
- return false;
- }
- while (utf16_index_ > desired_utf16_index) {
- --utf8_index_;
- utf8_index_ = GetUTF8StartPosition(text_, utf8_index_);
- // We've found the start of a unicode char!
- UChar32 uchar32 =
- i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- // Unable to retrieve a valid UTF-32 character at the previous position.
- return false;
- }
- utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
- }
- return true;
- }
-
- bool IsValidCharacter() const {
- // Rule 1: all ASCII terms will be returned.
- // We know it's a ASCII term by checking the first char.
- if (i18n_utils::IsAscii(text_[utf8_index_])) {
- return true;
- }
-
- // Rule 2: for non-ASCII terms, only the alphabetic terms are returned.
- // We know it's an alphabetic term by checking the first unicode character.
- if (i18n_utils::IsAlphabeticAt(text_, utf8_index_)) {
- return true;
- }
-
- return false;
- }
-
- int utf8_index() const { return utf8_index_; }
- int utf16_index() const { return utf16_index_; }
-
- private:
- std::string_view text_;
- int utf8_index_;
- int utf16_index_;
-};
-
-} // namespace
-
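The deleted GetUTF8StartPosition above walks backwards until i18n_utils::IsLeadUtf8Byte reports a lead byte. The check itself is plain UTF-8, sketched here on the assumption that Icing's helper matches the spec: continuation bytes always carry the bit pattern 10xxxxxx, so any other top-two-bit pattern begins a code point. In "你" (E4 BD A0), for instance, rewinding from byte index 2 stops at index 0, the E4 lead byte.

    // Sketch of the standard UTF-8 lead-byte test (assumed to match
    // i18n_utils::IsLeadUtf8Byte): continuation bytes are 0b10xxxxxx.
    inline bool IsLeadUtf8Byte(unsigned char byte) {
      return (byte & 0xC0) != 0x80;  // anything but a continuation byte
    }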
class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
public:
explicit ReverseJniLanguageSegmenterIterator(
@@ -195,16 +44,16 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// Advances to the next term. Returns false if it has reached the end.
bool Advance() override {
// Prerequisite check
- if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+ if (IsDone()) {
return false;
}
if (term_end_exclusive_.utf16_index() == 0) {
int first = break_iterator_->First();
- if (!term_start_.AdvanceToUtf16(first)) {
- // First is guaranteed to succeed and return a position within bonds. So
- // the only possible failure could be an invalid sequence. Mark as DONE
- // and return.
+ if (!term_start_.MoveToUtf16(first)) {
+      // First is guaranteed to succeed and return a position within bounds.
+      // So the only possible failure could be an invalid sequence. Mark as
+      // DONE and return.
MarkAsDone();
return false;
}
@@ -218,7 +67,7 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
MarkAsDone();
return false;
}
- if (!term_end_exclusive_.AdvanceToUtf16(next_utf16_index_exclusive)) {
+ if (!term_end_exclusive_.MoveToUtf16(next_utf16_index_exclusive)) {
      // next_utf16_index_exclusive is guaranteed to be within bounds thanks to
// the check for kDone above. So the only possible failure could be an
// invalid sequence. Mark as DONE and return.
@@ -226,18 +75,15 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return false;
}
- // Check if the current term is valid. We consider any term valid if its
- // first character is valid. If it's not valid, then we need to advance to
- // the next term.
- if (term_start_.IsValidCharacter()) {
- return true;
- }
- return Advance();
+ return true;
}
// Returns the current term. It can be called only when Advance() returns
// true.
std::string_view GetTerm() const override {
+ if (IsDone()) {
+ return text_.substr(0, 0);
+ }
int term_length =
term_end_exclusive_.utf8_index() - term_start_.utf8_index();
if (term_length > 0 && std::isspace(text_[term_start_.utf8_index()])) {
@@ -247,6 +93,16 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
return text_.substr(term_start_.utf8_index(), term_length);
}
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTermStart()
+ override {
+ return term_start_;
+ }
+
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTermEndExclusive()
+ override {
+ return term_end_exclusive_;
+ }
+
// Resets the iterator to point to the first term that starts after offset.
// GetTerm will now return that term.
//
@@ -258,15 +114,14 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// INVALID_ARGUMENT if offset is out of bounds for the provided text.
// ABORTED if an invalid unicode character is encountered while
// traversing the text.
- libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
+ libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfterUtf32(
int32_t offset) override {
- if (offset < 0 || offset >= text_.length()) {
- return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
- "Illegal offset provided! Offset %d is not within bounds of string "
- "of length %zu",
- offset, text_.length()));
+ if (offset < 0) {
+      // Very simple: the first term starting after a negative offset is just
+      // the first term. So reset to the start.
+ return ResetToStartUtf32();
}
- if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+ if (IsDone()) {
// We're done. Need to start from the beginning if we're going to reset
// properly.
term_start_ = CharacterIterator(text_);
@@ -274,43 +129,48 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
}
// 1. Find the unicode character that contains the byte at offset.
- CharacterIterator offset_iterator = term_end_exclusive_;
- bool success = (offset > offset_iterator.utf8_index())
- ? offset_iterator.AdvanceToUtf8(offset)
- : offset_iterator.RewindToUtf8(offset);
- if (!success) {
- // Offset is guaranteed to be within bounds thanks to the check above. So
- // the only possible failure could be an invalid sequence. Mark as DONE
- // and return.
- MarkAsDone();
- return absl_ports::AbortedError("Encountered invalid UTF sequence!");
+ CharacterIterator offset_iterator = (offset < term_start_.utf32_index())
+ ? term_start_
+ : term_end_exclusive_;
+ if (!offset_iterator.MoveToUtf32(offset)) {
+ if (offset_iterator.utf8_index() != text_.length()) {
+ // We returned false for some reason other than hitting the end. This is
+ // a real error. Just return.
+ MarkAsDone();
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+ }
+ // Check to see if offset is past the end of the text. If it is, then
+ // there's no term starting after it. Return an invalid argument.
+ if (offset_iterator.utf8_index() == text_.length()) {
+ return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
+ "Illegal offset provided! Offset utf-32:%d, utf-8:%d is not within "
+ "bounds of string of length %zu",
+ offset_iterator.utf32_index(), offset_iterator.utf8_index(),
+ text_.length()));
}
// 2. We've got the unicode character containing byte offset. Now, we need
// to point to the segment that starts after this character.
int following_utf16_index =
break_iterator_->Following(offset_iterator.utf16_index());
- if (following_utf16_index == ReverseJniBreakIterator::kDone) {
+ if (following_utf16_index == ReverseJniBreakIterator::kDone ||
+ !offset_iterator.MoveToUtf16(following_utf16_index)) {
MarkAsDone();
return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
"No segments begin after provided offset %d.", offset));
}
- if (!offset_iterator.AdvanceToUtf16(following_utf16_index)) {
- // following_utf16_index is guaranteed to be within bonds thanks to the
- // check for kDone above. So the only possible failure could be an invalid
- // sequence. Mark as DONE and return.
- MarkAsDone();
- return absl_ports::AbortedError("Encountered invalid UTF sequence!");
- }
term_end_exclusive_ = offset_iterator;
- // 3. The term_end_exclusive_ points to the term that we want to return. We
- // need to Advance so that term_start_ will now point to this term.
+ // 3. The term_end_exclusive_ points to the start of the term that we want
+ // to return. We need to Advance so that term_start_ will now point to this
+ // term.
if (!Advance()) {
return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
"No segments begin after provided offset %d.", offset));
}
- return term_start_.utf8_index();
+ return term_start_.utf32_index();
}
// Resets the iterator to point to the first term that ends before offset.
@@ -324,52 +184,48 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// INVALID_ARGUMENT if offset is out of bounds for the provided text.
// ABORTED if an invalid unicode character is encountered while
// traversing the text.
- libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
+ libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBeforeUtf32(
int32_t offset) override {
- if (offset < 0 || offset >= text_.length()) {
+ if (offset < 0) {
return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
"Illegal offset provided! Offset %d is not within bounds of string "
"of length %zu",
offset, text_.length()));
}
- if (term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone) {
+ if (IsDone()) {
// We're done. Need to start from the beginning if we're going to reset
// properly.
term_start_ = CharacterIterator(text_);
term_end_exclusive_ = CharacterIterator(text_);
}
- // 1. Find the unicode character that contains the byte at offset.
- CharacterIterator offset_iterator = term_end_exclusive_;
- bool success = (offset > offset_iterator.utf8_index())
- ? offset_iterator.AdvanceToUtf8(offset)
- : offset_iterator.RewindToUtf8(offset);
- if (!success) {
- // Offset is guaranteed to be within bounds thanks to the check above. So
- // the only possible failure could be an invalid sequence. Mark as DONE
- // and return.
- MarkAsDone();
- return absl_ports::AbortedError(
- "Could not retrieve valid utf8 character!");
+ CharacterIterator offset_iterator = (offset < term_start_.utf32_index())
+ ? term_start_
+ : term_end_exclusive_;
+ if (!offset_iterator.MoveToUtf32(offset)) {
+      // MoveToUtf32 failed. If it is a real error, mark as DONE.
+ if (offset_iterator.utf8_index() != text_.length()) {
+ // We returned false for some reason other than hitting the end. This is
+ // a real error. Just return.
+ MarkAsDone();
+ return absl_ports::AbortedError(
+ "Could not retrieve valid utf8 character!");
+ }
+      // If it returned false because we hit the end, then that's fine. We'll
+      // just treat it as if the request was for the end.
}
// 2. We've got the unicode character containing byte offset. Now, we need
- // to point to the segment that starts before this character.
+ // to point to the segment that ends before this character.
int starting_utf16_index =
break_iterator_->Preceding(offset_iterator.utf16_index());
- if (starting_utf16_index == ReverseJniBreakIterator::kDone) {
+ if (starting_utf16_index == ReverseJniBreakIterator::kDone ||
+ !offset_iterator.MoveToUtf16(starting_utf16_index)) {
// Rewind the end indices.
MarkAsDone();
return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
"No segments end before provided offset %d.", offset));
}
- if (!offset_iterator.RewindToUtf16(starting_utf16_index)) {
- // starting_utf16_index is guaranteed to be within bonds thanks to the
- // check for kDone above. So the only possible failure could be an invalid
- // sequence. Mark as DONE and return.
- MarkAsDone();
- return absl_ports::AbortedError("Encountered invalid UTF sequence!");
- }
term_start_ = offset_iterator;
// 3. We've correctly set the start index and the iterator currently points
@@ -377,25 +233,25 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// advance the iterator to that position.
int end_utf16_index = break_iterator_->Next();
term_end_exclusive_ = term_start_;
- term_end_exclusive_.AdvanceToUtf16(end_utf16_index);
+ term_end_exclusive_.MoveToUtf16(end_utf16_index);
// 4. The start and end indices point to a segment, but we need to ensure
// that this segment is 1) valid and 2) ends before offset. Otherwise, we'll
// need a segment prior to this one.
- if (term_end_exclusive_.utf8_index() > offset ||
- !term_start_.IsValidCharacter()) {
- return ResetToTermEndingBefore(term_start_.utf8_index());
+ if (term_end_exclusive_.utf32_index() > offset) {
+ return ResetToTermEndingBeforeUtf32(term_start_.utf32_index());
}
- return term_start_.utf8_index();
+ return term_start_.utf32_index();
}
- libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
+ libtextclassifier3::StatusOr<int32_t> ResetToStartUtf32() override {
term_start_ = CharacterIterator(text_);
term_end_exclusive_ = CharacterIterator(text_);
if (!Advance()) {
- return absl_ports::NotFoundError("");
+ return absl_ports::NotFoundError(
+ "Unable to find any valid terms in text.");
}
- return term_start_.utf8_index();
+ return term_start_.utf32_index();
}
private:
@@ -407,11 +263,19 @@ class ReverseJniLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
// break_iterator_ may be in any state.
void MarkAsDone() {
term_start_ =
- CharacterIterator(text_, /*utf8_index=*/0,
- /*utf16_index=*/ReverseJniBreakIterator::kDone);
+ CharacterIterator(text_, /*utf8_index=*/ReverseJniBreakIterator::kDone,
+ /*utf16_index=*/ReverseJniBreakIterator::kDone,
+ /*utf32_index=*/ReverseJniBreakIterator::kDone);
term_end_exclusive_ =
- CharacterIterator(text_, /*utf8_index=*/0,
- /*utf16_index=*/ReverseJniBreakIterator::kDone);
+ CharacterIterator(text_, /*utf8_index=*/ReverseJniBreakIterator::kDone,
+ /*utf16_index=*/ReverseJniBreakIterator::kDone,
+ /*utf32_index=*/ReverseJniBreakIterator::kDone);
+ }
+ bool IsDone() const {
+    // We could just as easily check the other utf indices or the values in
+    // term_start_. There's no particular reason to choose any one of them,
+    // since they should all hold kDone.
+ return term_end_exclusive_.utf16_index() == ReverseJniBreakIterator::kDone;
}
// All of ReverseJniBreakIterator's functions return UTF-16 boundaries. So
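Taken together, the Utf32 reset methods let a caller walk every term without invoking Advance() directly, which is exactly what the rewritten tests below exercise. A minimal usage sketch, where Process() is a hypothetical consumer and all offsets are UTF-32 code-point indices rather than bytes:

    // Resetting after the current term's start yields the very next term, so
    // this loop visits each term exactly once.
    auto pos_or = itr->ResetToTermStartingAfterUtf32(-1);  // first term
    while (pos_or.ok()) {
      Process(itr->GetTerm());
      pos_or = itr->ResetToTermStartingAfterUtf32(pos_or.ValueOrDie());
    }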
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
index f06dac9..29df4ee 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter.h
@@ -21,8 +21,8 @@
#include <string_view>
#include <vector>
-#include "icing/jni/jni-cache.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/jni/jni-cache.h"
#include "icing/tokenization/language-segmenter.h"
namespace icing {
diff --git a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
index a01d944..47a01fe 100644
--- a/icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.cc
+++ b/icing/tokenization/reverse_jni/reverse-jni-language-segmenter_test.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "icing/tokenization/reverse_jni/reverse-jni-language-segmenter-test.h"
+#include <jni.h>
#include <memory>
#include <string_view>
@@ -21,10 +21,13 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "gmock/gmock.h"
#include "icing/absl_ports/str_cat.h"
+#include "icing/jni/jni-cache.h"
#include "icing/testing/common-matchers.h"
#include "icing/testing/icu-i18n-test-utils.h"
+#include "icing/testing/jni-test-helpers.h"
#include "icing/tokenization/language-segmenter-factory.h"
#include "icing/tokenization/language-segmenter.h"
+#include "icing/util/character-iterator.h"
#include "unicode/uloc.h"
namespace icing {
@@ -54,72 +57,72 @@ std::vector<std::string_view> GetAllTermsAdvance(
}
// Returns a vector containing all terms retrieved by calling ResetAfter with
-// the current position to simulate Advancing on the iterator.
-std::vector<std::string_view> GetAllTermsResetAfter(
+// the UTF-32 position of the current term start to simulate Advancing on the
+// iterator.
+std::vector<std::string_view> GetAllTermsResetAfterUtf32(
LanguageSegmenter::Iterator* itr) {
std::vector<std::string_view> terms;
- if (!itr->ResetToStart().ok()) {
- return terms;
- }
- terms.push_back(itr->GetTerm());
- const char* text_begin = itr->GetTerm().data();
- // Calling ResetToTermStartingAfter with the current position should get the
- // very next term in the sequence.
- for (int current_pos = 0; itr->ResetToTermStartingAfter(current_pos).ok();
- current_pos = itr->GetTerm().data() - text_begin) {
+ // Calling ResetToTermStartingAfterUtf32 with -1 should get the first term in
+ // the sequence.
+ bool is_ok = itr->ResetToTermStartingAfterUtf32(-1).ok();
+ while (is_ok) {
terms.push_back(itr->GetTerm());
+ // Calling ResetToTermStartingAfterUtf32 with the current position should
+ // get the very next term in the sequence.
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+ is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok();
}
return terms;
}
// Returns a vector containing all terms retrieved by alternating calls to
-// Advance and calls to ResetAfter with the current position to simulate
-// Advancing.
-std::vector<std::string_view> GetAllTermsAdvanceAndResetAfter(
+// Advance and calls to ResetAfter with the UTF-32 position of the current term
+// start to simulate Advancing.
+std::vector<std::string_view> GetAllTermsAdvanceAndResetAfterUtf32(
LanguageSegmenter::Iterator* itr) {
- const char* text_begin = itr->GetTerm().data();
std::vector<std::string_view> terms;
-
- bool is_ok = true;
- int current_pos = 0;
+ bool is_ok = itr->Advance();
while (is_ok) {
+ terms.push_back(itr->GetTerm());
// Alternate between using Advance and ResetToTermAfter.
if (terms.size() % 2 == 0) {
is_ok = itr->Advance();
} else {
- // Calling ResetToTermStartingAfter with the current position should get
- // the very next term in the sequence.
- current_pos = itr->GetTerm().data() - text_begin;
- is_ok = itr->ResetToTermStartingAfter(current_pos).ok();
- }
- if (is_ok) {
- terms.push_back(itr->GetTerm());
+ // Calling ResetToTermStartingAfterUtf32 with the current position should
+ // get the very next term in the sequence.
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+ is_ok = itr->ResetToTermStartingAfterUtf32(char_itr.utf32_index()).ok();
}
}
return terms;
}
// Returns a vector containing all terms retrieved by calling ResetBefore with
-// the current position, starting at the end of the text. This vector should be
-// in reverse order of GetAllTerms and missing the last term.
-std::vector<std::string_view> GetAllTermsResetBefore(
+// the UTF-32 position of the current term start, starting at the end of the
+// text. This vector should be in reverse order of GetAllTerms and missing the
+// last term.
+std::vector<std::string_view> GetAllTermsResetBeforeUtf32(
LanguageSegmenter::Iterator* itr) {
- const char* text_begin = itr->GetTerm().data();
- int last_pos = 0;
- while (itr->Advance()) {
- last_pos = itr->GetTerm().data() - text_begin;
- }
std::vector<std::string_view> terms;
- // Calling ResetToTermEndingBefore with the current position should get the
- // previous term in the sequence.
- for (int current_pos = last_pos;
- itr->ResetToTermEndingBefore(current_pos).ok();
- current_pos = itr->GetTerm().data() - text_begin) {
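+  // 1000 is simply an offset past the end of every text used in these tests.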
+ bool is_ok = itr->ResetToTermEndingBeforeUtf32(1000).ok();
+ while (is_ok) {
terms.push_back(itr->GetTerm());
+ // Calling ResetToTermEndingBeforeUtf32 with the current position should get
+ // the previous term in the sequence.
+ CharacterIterator char_itr = itr->CalculateTermStart().ValueOrDie();
+ is_ok = itr->ResetToTermEndingBeforeUtf32(char_itr.utf32_index()).ok();
}
return terms;
}
+class ReverseJniLanguageSegmenterTest
+ : public testing::TestWithParam<const char*> {
+ protected:
+ static std::string GetLocale() { return GetParam(); }
+
+ std::unique_ptr<const JniCache> jni_cache_ = GetTestJniCache();
+};
+
} // namespace
TEST_P(ReverseJniLanguageSegmenterTest, EmptyText) {
@@ -182,7 +185,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, Non_ASCII_Non_Alphabetic) {
   // Full-width (non-ASCII) punctuation marks and special characters are now
   // returned as individual terms.
EXPECT_THAT(language_segmenter->GetAllTerms("。?·Hello!×"),
- IsOkAndHolds(ElementsAre("Hello")));
+ IsOkAndHolds(ElementsAre("。", "?", "·", "Hello", "!", "×")));
}
TEST_P(ReverseJniLanguageSegmenterTest, Acronym) {
@@ -225,6 +228,36 @@ TEST_P(ReverseJniLanguageSegmenterTest, WordConnector) {
EXPECT_THAT(language_segmenter->GetAllTerms("com.google.android:icing"),
IsOkAndHolds(ElementsAre("com.google.android:icing")));
+  // A connector that doesn't have valid terms on both sides of it is not
+  // considered a connector.
+ EXPECT_THAT(language_segmenter->GetAllTerms(":bar:baz"),
+ IsOkAndHolds(ElementsAre(":", "bar:baz")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:"),
+ IsOkAndHolds(ElementsAre("bar:baz", ":")));
+
+  // A connector that doesn't have valid terms on both sides of it is not
+  // considered a connector.
+ EXPECT_THAT(language_segmenter->GetAllTerms(" :bar:baz"),
+ IsOkAndHolds(ElementsAre(" ", ":", "bar:baz")));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz: "),
+ IsOkAndHolds(ElementsAre("bar:baz", ":", " ")));
+
+ // Connectors don't connect if one side is an invalid term (?)
+ EXPECT_THAT(language_segmenter->GetAllTerms("bar:baz:?"),
+ IsOkAndHolds(ElementsAre("bar:baz", ":", "?")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("?:bar:baz"),
+ IsOkAndHolds(ElementsAre("?", ":", "bar:baz")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("3:14"),
+ IsOkAndHolds(ElementsAre("3", ":", "14")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("私:は"),
+ IsOkAndHolds(ElementsAre("私", ":", "は")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("我:每"),
+ IsOkAndHolds(ElementsAre("我", ":", "每")));
+ EXPECT_THAT(language_segmenter->GetAllTerms("เดิน:ไป"),
+ IsOkAndHolds(ElementsAre("เดิน:ไป")));
+
   // Any leading and trailing characters are not connectors
EXPECT_THAT(language_segmenter->GetAllTerms(".com.google.android."),
IsOkAndHolds(ElementsAre(".", "com.google.android", ".")));
@@ -333,6 +366,17 @@ TEST_P(ReverseJniLanguageSegmenterTest, Number) {
IsOkAndHolds(ElementsAre("-", "123")));
}
+TEST_P(ReverseJniLanguageSegmenterTest, FullWidthNumbers) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+
+ EXPECT_THAT(language_segmenter->GetAllTerms("0123456789"),
+ IsOkAndHolds(ElementsAre("0", "1", "2", "3", "4", "5", "6",
+ "7", "8", "9")));
+}
+
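Each full-width digit (U+FF10 through U+FF19) occupies three UTF-8 bytes, and unlike its ASCII counterpart it is segmented as a term of its own above. A quick compile-time check of that encoding width:

    // "０" is U+FF10 (FULLWIDTH DIGIT ZERO): three UTF-8 bytes plus the NUL.
    static_assert(sizeof(u8"０") == 4, "U+FF10 encodes as 3 UTF-8 bytes");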
TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespaces) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
@@ -369,15 +413,16 @@ TEST_P(ReverseJniLanguageSegmenterTest, CJKT) {
// have whitespaces as word delimiter.
// Chinese
- EXPECT_THAT(language_segmenter->GetAllTerms("我每天走路去上班。"),
- IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班")));
+ EXPECT_THAT(
+ language_segmenter->GetAllTerms("我每天走路去上班。"),
+ IsOkAndHolds(ElementsAre("我", "每天", "走路", "去", "上班", "。")));
// Japanese
EXPECT_THAT(language_segmenter->GetAllTerms("私は毎日仕事に歩いています。"),
IsOkAndHolds(ElementsAre("私", "は", "毎日", "仕事", "に", "歩",
- "い", "てい", "ます")));
+ "い", "てい", "ます", "。")));
// Khmer
EXPECT_THAT(language_segmenter->GetAllTerms("ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"),
- IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ")));
+ IsOkAndHolds(ElementsAre("ញុំ", "ដើរទៅ", "ធ្វើការ", "រាល់ថ្ងៃ", "។")));
// Thai
EXPECT_THAT(
language_segmenter->GetAllTerms("ฉันเดินไปทำงานทุกวัน"),
@@ -393,7 +438,6 @@ TEST_P(ReverseJniLanguageSegmenterTest, LatinLettersWithAccents) {
IsOkAndHolds(ElementsAre("āăąḃḅḇčćç")));
}
-// TODO(samzheng): test cases for more languages (e.g. top 20 in the world)
TEST_P(ReverseJniLanguageSegmenterTest, WhitespaceSplitLanguages) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
@@ -408,7 +452,6 @@ TEST_P(ReverseJniLanguageSegmenterTest, WhitespaceSplitLanguages) {
IsOkAndHolds(ElementsAre("나는", " ", "매일", " ", "출근합니다", ".")));
}
-// TODO(samzheng): more mixed languages test cases
TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguages) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
@@ -443,7 +486,78 @@ TEST_P(ReverseJniLanguageSegmenterTest, NotCopyStrings) {
EXPECT_THAT(word2_address, Eq(word2_result_address));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) {
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToStartUtf32WordConnector) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "com:google:android is package";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "com:google:android is package"
+ // ^ ^^ ^^
+ // UTF-8 idx: 0 18 19 21 22
+ // UTF-32 idx: 0 18 19 21 22
+ auto position_or = itr->ResetToStartUtf32();
+ EXPECT_THAT(position_or, IsOk());
+ ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, NewIteratorResetToStartUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
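The UTF-8/UTF-32 rows in these comments follow from straight code-point counting. A small sketch that reproduces both columns, assuming ICU's unicode/utf8.h macros, which this library already depends on:

    #include <cstdint>
    #include <string_view>
    #include <utility>
    #include <vector>

    #include "unicode/utf8.h"  // U8_NEXT

    // Returns (utf8_offset, utf32_index) for each code point in text. For the
    // string above, it maps byte offset 35 (か) to code-point index 19.
    std::vector<std::pair<int32_t, int32_t>> CodePointOffsets(
        std::string_view text) {
      std::vector<std::pair<int32_t, int32_t>> offsets;
      int32_t i = 0;
      int32_t cp = 0;
      const int32_t len = static_cast<int32_t>(text.length());
      while (i < len) {
        offsets.push_back({i, cp++});
        UChar32 c;
        U8_NEXT(text.data(), i, len, c);  // advances i past one code point
      }
      return offsets;
    }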
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorOneAdvanceResetToStartUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_TRUE(itr->Advance()); // itr points to 'How'
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest,
+ IteratorMultipleAdvancesResetToStartUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance());
+ ASSERT_TRUE(itr->Advance()); // itr points to ' '
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, IteratorDoneResetToStartUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -451,19 +565,61 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- ASSERT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ while (itr->Advance()) {
+ // Do nothing.
+ }
+ EXPECT_THAT(itr->ResetToStartUtf32(), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterUtf32WordConnector) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "package com:google:android name";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "package com:google:android name"
+ // ^ ^^ ^^
+ // UTF-8 idx: 0 7 8 26 27
+ // UTF-32 idx: 0 7 8 26 27
+ auto position_or = itr->ResetToTermStartingAfterUtf32(8);
+ EXPECT_THAT(position_or, IsOk());
+ EXPECT_THAT(position_or.ValueOrDie(), Eq(26));
+ ASSERT_THAT(itr->GetTerm(), Eq(" "));
+
+ position_or = itr->ResetToTermStartingAfterUtf32(7);
+ EXPECT_THAT(position_or, IsOk());
+ EXPECT_THAT(position_or.ValueOrDie(), Eq(8));
+ ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterUtf32OutOfBounds) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "How are you你好吗お元気ですか";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
ASSERT_THAT(itr->GetTerm(), Eq("you"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(-1),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(itr->GetTerm(), Eq("you"));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(-1), IsOk());
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(kText.length()),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(21),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(itr->GetTerm(), Eq("you"));
+ EXPECT_THAT(itr->GetTerm(), Eq("How"));
}
// Tests that ResetToTermAfter and Advance produce the same output. With the
@@ -472,7 +628,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermAfterOutOfBounds) {
// terms produced by ResetToTermAfter calls with the current position
// provided as the argument.
TEST_P(ReverseJniLanguageSegmenterTest,
- MixedLanguagesResetToTermAfterEquivalentToAdvance) {
+ MixedLanguagesResetToTermAfterUtf32EquivalentToAdvance) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -487,14 +643,14 @@ TEST_P(ReverseJniLanguageSegmenterTest,
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kText));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetAfter(reset_to_term_itr.get());
+ GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
TEST_P(ReverseJniLanguageSegmenterTest,
- ThaiResetToTermAfterEquivalentToAdvance) {
+ ThaiResetToTermAfterUtf32EquivalentToAdvance) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -509,14 +665,14 @@ TEST_P(ReverseJniLanguageSegmenterTest,
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetAfter(reset_to_term_itr.get());
+ GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
TEST_P(ReverseJniLanguageSegmenterTest,
- KoreanResetToTermAfterEquivalentToAdvance) {
+ KoreanResetToTermAfterUtf32EquivalentToAdvance) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -531,7 +687,7 @@ TEST_P(ReverseJniLanguageSegmenterTest,
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetAfter(reset_to_term_itr.get());
+ GetAllTermsResetAfterUtf32(reset_to_term_itr.get());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
@@ -542,7 +698,7 @@ TEST_P(ReverseJniLanguageSegmenterTest,
// should be able to mix ResetToTermAfter(current_position) calls and Advance
// calls to mimic calling Advance.
TEST_P(ReverseJniLanguageSegmenterTest,
- MixedLanguagesResetToTermAfterInteroperableWithAdvance) {
+ MixedLanguagesResetToTermAfterUtf32InteroperableWithAdvance) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -557,7 +713,7 @@ TEST_P(ReverseJniLanguageSegmenterTest,
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
segmenter->Segment(kText));
std::vector<std::string_view> advance_and_reset_terms =
- GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+ GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
EXPECT_THAT(advance_and_reset_terms,
testing::ElementsAreArray(advance_terms));
@@ -565,7 +721,7 @@ TEST_P(ReverseJniLanguageSegmenterTest,
}
TEST_P(ReverseJniLanguageSegmenterTest,
- ThaiResetToTermAfterInteroperableWithAdvance) {
+ ThaiResetToTermAfterUtf32InteroperableWithAdvance) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -580,7 +736,7 @@ TEST_P(ReverseJniLanguageSegmenterTest,
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> advance_and_reset_terms =
- GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+ GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
EXPECT_THAT(advance_and_reset_terms,
testing::ElementsAreArray(advance_terms));
@@ -588,7 +744,7 @@ TEST_P(ReverseJniLanguageSegmenterTest,
}
TEST_P(ReverseJniLanguageSegmenterTest,
- KoreanResetToTermAfterInteroperableWithAdvance) {
+ KoreanResetToTermAfterUtf32InteroperableWithAdvance) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -603,14 +759,14 @@ TEST_P(ReverseJniLanguageSegmenterTest,
std::unique_ptr<LanguageSegmenter::Iterator> advance_and_reset_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> advance_and_reset_terms =
- GetAllTermsAdvanceAndResetAfter(advance_and_reset_itr.get());
+ GetAllTermsAdvanceAndResetAfterUtf32(advance_and_reset_itr.get());
EXPECT_THAT(advance_and_reset_terms,
testing::ElementsAreArray(advance_terms));
EXPECT_THAT(advance_and_reset_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
-TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -619,33 +775,35 @@ TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermAfter) {
std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment("How are you你好吗お元気ですか"));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(3)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(11)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(11)));
EXPECT_THAT(itr->GetTerm(), Eq("你好"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("you"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(32), IsOkAndHolds(Eq(35)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(18), IsOkAndHolds(Eq(19)));
EXPECT_THAT(itr->GetTerm(), Eq("か"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(17)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
EXPECT_THAT(itr->GetTerm(), Eq("吗"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(35),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
-TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespacesResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest,
+ ContinuousWhitespacesResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -655,35 +813,36 @@ TEST_P(ReverseJniLanguageSegmenterTest, ContinuousWhitespacesResetToTermAfter) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kTextWithSpace));
- // String: "Hello World"
- // ^ ^ ^
- // Bytes: 0 5 15
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(5)));
+ // String: "Hello World"
+ // ^ ^ ^
+ // UTF-8 idx: 0 5 15
+ // UTF-32 idx: 0 5 15
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(2), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermStartingAfter(10), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(10), IsOkAndHolds(Eq(15)));
EXPECT_THAT(itr->GetTerm(), Eq("World"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(5), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(5), IsOkAndHolds(Eq(15)));
EXPECT_THAT(itr->GetTerm(), Eq("World"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(15),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(17),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(19),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
-TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -693,21 +852,25 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermAfter) {
constexpr std::string_view kChinese = "我每天走路去上班。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
- // String: "我每天走路去上班。"
- // ^ ^ ^ ^^
- // Bytes: 0 3 9 15 18
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^ ^
+ // UTF-8 idx: 0 3 9 15 18 24
+  // UTF-32 idx:   0 1   3  5  6  8
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("每天"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(9)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(2), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("走路"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(19),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(7), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->GetTerm(), Eq("。"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(8),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
}
-TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -716,21 +879,25 @@ TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermAfter) {
constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
- // String: "私は毎日仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 6 12 18212427 33
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(3)));
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18212427 33 39
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11 13
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(33),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(13),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(7), IsOkAndHolds(Eq(12)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(3), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("仕事"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(13)));
+ EXPECT_THAT(itr->GetTerm(), Eq("。"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -738,21 +905,25 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermAfter) {
constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
- // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
- // ^ ^ ^ ^
- // Bytes: 0 9 24 45
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45 69
+ // UTF-32 idx: 0 3 8 15 23
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("ដើរទៅ"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(47),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(15), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->GetTerm(), Eq("។"));
+
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(23),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(14), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfter) {
+TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfterUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -761,24 +932,48 @@ TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermAfter) {
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kThai));
- // String: "ฉันเดินไปทำงานทุกวัน"
- // ^ ^ ^ ^ ^ ^
- // Bytes: 0 9 21 27 42 51
- EXPECT_THAT(itr->ResetToTermStartingAfter(0), IsOkAndHolds(Eq(9)));
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 21 27 42 51
+ // UTF-32 idx: 0 3 7 9 14 17
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(0), IsOkAndHolds(Eq(3)));
EXPECT_THAT(itr->GetTerm(), Eq("เดิน"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(51),
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(17),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermStartingAfter(13), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(6), IsOkAndHolds(Eq(7)));
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
- EXPECT_THAT(itr->ResetToTermStartingAfter(34), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->ResetToTermStartingAfterUtf32(12), IsOkAndHolds(Eq(14)));
EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBounds) {
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeWordConnectorUtf32) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto segmenter, language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ constexpr std::string_view kText = "package name com:google:android!";
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
+ segmenter->Segment(kText));
+
+ // String: "package name com:google:android!"
+ // ^ ^^ ^^ ^
+ // UTF-8 idx: 0 7 8 12 13 31
+ // UTF-32 idx: 0 7 8 12 13 31
+ auto position_or = itr->ResetToTermEndingBeforeUtf32(31);
+ EXPECT_THAT(position_or, IsOk());
+ EXPECT_THAT(position_or.ValueOrDie(), Eq(13));
+ ASSERT_THAT(itr->GetTerm(), Eq("com:google:android"));
+
+ position_or = itr->ResetToTermEndingBeforeUtf32(21);
+ EXPECT_THAT(position_or, IsOk());
+ EXPECT_THAT(position_or.ValueOrDie(), Eq(12));
+ ASSERT_THAT(itr->GetTerm(), Eq(" "));
+}
+
+TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBoundsUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -786,19 +981,19 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBounds) {
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
segmenter->Segment(kText));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- ASSERT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ ASSERT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4)));
ASSERT_THAT(itr->GetTerm(), Eq("are"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(-1),
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(-1),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
EXPECT_THAT(itr->GetTerm(), Eq("are"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(kText.length()),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(itr->GetTerm(), Eq("are"));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(29), IsOk());
+ EXPECT_THAT(itr->GetTerm(), Eq("か"));
}
// Tests that ResetToTermBefore and Advance produce the same output. With the
@@ -807,7 +1002,7 @@ TEST_P(ReverseJniLanguageSegmenterTest, ResetToTermBeforeOutOfBounds) {
// terms produced by ResetToTermBefore calls with the current position
// provided as the argument (after their order has been reversed).
TEST_P(ReverseJniLanguageSegmenterTest,
- MixedLanguagesResetToTermBeforeEquivalentToAdvance) {
+ MixedLanguagesResetToTermBeforeEquivalentToAdvanceUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -817,17 +1012,12 @@ TEST_P(ReverseJniLanguageSegmenterTest,
segmenter->Segment(kText));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
- // Can't produce the last term via calls to ResetToTermBefore. So skip
- // past that one.
- auto itr = advance_terms.begin();
- std::advance(itr, advance_terms.size() - 1);
- advance_terms.erase(itr);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kText));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetBefore(reset_to_term_itr.get());
+ GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
@@ -836,7 +1026,7 @@ TEST_P(ReverseJniLanguageSegmenterTest,
}
TEST_P(ReverseJniLanguageSegmenterTest,
- ThaiResetToTermBeforeEquivalentToAdvance) {
+ ThaiResetToTermBeforeEquivalentToAdvanceUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -846,17 +1036,12 @@ TEST_P(ReverseJniLanguageSegmenterTest,
segmenter->Segment(kThai));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
- // Can't produce the last term via calls to ResetToTermBefore. So skip
- // past that one.
- auto itr = advance_terms.begin();
- std::advance(itr, advance_terms.size() - 1);
- advance_terms.erase(itr);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kThai));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetBefore(reset_to_term_itr.get());
+ GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
@@ -864,7 +1049,7 @@ TEST_P(ReverseJniLanguageSegmenterTest,
}
TEST_P(ReverseJniLanguageSegmenterTest,
- KoreanResetToTermBeforeEquivalentToAdvance) {
+ KoreanResetToTermBeforeEquivalentToAdvanceUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto segmenter, language_segmenter_factory::Create(
GetSegmenterOptions(GetLocale(), jni_cache_.get())));
@@ -874,24 +1059,19 @@ TEST_P(ReverseJniLanguageSegmenterTest,
segmenter->Segment(kKorean));
std::vector<std::string_view> advance_terms =
GetAllTermsAdvance(advance_itr.get());
- // Can't produce the last term via calls to ResetToTermBefore. So skip
- // past that one.
- auto itr = advance_terms.begin();
- std::advance(itr, advance_terms.size() - 1);
- advance_terms.erase(itr);
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<LanguageSegmenter::Iterator> reset_to_term_itr,
segmenter->Segment(kKorean));
std::vector<std::string_view> reset_terms =
- GetAllTermsResetBefore(reset_to_term_itr.get());
+ GetAllTermsResetBeforeUtf32(reset_to_term_itr.get());
std::reverse(reset_terms.begin(), reset_terms.end());
EXPECT_THAT(reset_terms, testing::ElementsAreArray(advance_terms));
EXPECT_THAT(reset_to_term_itr->GetTerm(), Eq(advance_itr->GetTerm()));
}
-TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBeforeUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -900,35 +1080,36 @@ TEST_P(ReverseJniLanguageSegmenterTest, MixedLanguagesResetToTermBefore) {
std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment("How are you你好吗お元気ですか"));
- // String: "How are you你好吗お元気ですか"
- // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 4 7 8 11 172023 29 35
- EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ // String: "How are you你好吗お元気ですか"
+ // ^ ^^ ^^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 4 7 8 11 172023 29 35
+ // UTF-32 idx: 0 3 4 7 8 11 131415 17 19
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(7)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(7)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(4)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(4)));
EXPECT_THAT(itr->GetTerm(), Eq("are"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(32), IsOkAndHolds(Eq(23)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(18), IsOkAndHolds(Eq(15)));
EXPECT_THAT(itr->GetTerm(), Eq("元気"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(8)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(12), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("you"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(35), IsOkAndHolds(Eq(29)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(17)));
EXPECT_THAT(itr->GetTerm(), Eq("です"));
}
TEST_P(ReverseJniLanguageSegmenterTest,
- ContinuousWhitespacesResetToTermBefore) {
+ ContinuousWhitespacesResetToTermBeforeUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -938,34 +1119,35 @@ TEST_P(ReverseJniLanguageSegmenterTest,
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kTextWithSpace));
- // String: "Hello World"
- // ^ ^ ^
- // Bytes: 0 5 15
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "Hello World"
+ // ^ ^ ^
+ // UTF-8 idx: 0 5 15
+ // UTF-32 idx: 0 5 15
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(2),
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(10), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(10), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(5), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("Hello"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(15), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(15), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermEndingBefore(17), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
- EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(5)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(19), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq(" "));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBeforeUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -975,21 +1157,22 @@ TEST_P(ReverseJniLanguageSegmenterTest, ChineseResetToTermBefore) {
constexpr std::string_view kChinese = "我每天走路去上班。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kChinese));
- // String: "我每天走路去上班。"
- // ^ ^ ^ ^^
- // Bytes: 0 3 9 15 18
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "我每天走路去上班。"
+ // ^ ^ ^ ^^
+ // UTF-8 idx: 0 3 9 15 18
+ // UTF-32 idx: 0 1 3 5 6
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(2), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("我"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(19), IsOkAndHolds(Eq(15)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(7), IsOkAndHolds(Eq(5)));
EXPECT_THAT(itr->GetTerm(), Eq("去"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBeforeUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -998,21 +1181,22 @@ TEST_P(ReverseJniLanguageSegmenterTest, JapaneseResetToTermBefore) {
constexpr std::string_view kJapanese = "私は毎日仕事に歩いています。";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kJapanese));
- // String: "私は毎日仕事に歩いています。"
- // ^ ^ ^ ^ ^ ^ ^ ^ ^
- // Bytes: 0 3 6 12 18212427 33
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "私は毎日仕事に歩いています。"
+ // ^ ^ ^ ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 3 6 12 18212427 33
+ // UTF-32 idx: 0 1 2 4 6 7 8 9 11
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(33), IsOkAndHolds(Eq(27)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(9)));
EXPECT_THAT(itr->GetTerm(), Eq("てい"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(7), IsOkAndHolds(Eq(3)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(3), IsOkAndHolds(Eq(1)));
EXPECT_THAT(itr->GetTerm(), Eq("は"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBeforeUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -1020,21 +1204,22 @@ TEST_P(ReverseJniLanguageSegmenterTest, KhmerResetToTermBefore) {
constexpr std::string_view kKhmer = "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kKhmer));
- // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
- // ^ ^ ^ ^
- // Bytes: 0 9 24 45
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "ញុំដើរទៅធ្វើការរាល់ថ្ងៃ។"
+ // ^ ^ ^ ^
+ // UTF-8 idx: 0 9 24 45
+ // UTF-32 idx: 0 3 8 15
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(47), IsOkAndHolds(Eq(24)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(16), IsOkAndHolds(Eq(8)));
EXPECT_THAT(itr->GetTerm(), Eq("ធ្វើការ"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(14), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(5), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("ញុំ"));
}
-TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBefore) {
+TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBeforeUtf32) {
ICING_ASSERT_OK_AND_ASSIGN(
auto language_segmenter,
language_segmenter_factory::Create(
@@ -1043,23 +1228,39 @@ TEST_P(ReverseJniLanguageSegmenterTest, ThaiResetToTermBefore) {
constexpr std::string_view kThai = "ฉันเดินไปทำงานทุกวัน";
ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<LanguageSegmenter::Iterator> itr,
language_segmenter->Segment(kThai));
- // String: "ฉันเดินไปทำงานทุกวัน"
- // ^ ^ ^ ^ ^ ^
- // Bytes: 0 9 21 27 42 51
- EXPECT_THAT(itr->ResetToTermEndingBefore(0),
+ // String: "ฉันเดินไปทำงานทุกวัน"
+ // ^ ^ ^ ^ ^ ^
+ // UTF-8 idx: 0 9 21 27 42 51
+ // UTF-32 idx: 0 3 7 9 14 17
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(0),
StatusIs(libtextclassifier3::StatusCode::NOT_FOUND));
EXPECT_THAT(itr->GetTerm(), IsEmpty());
- EXPECT_THAT(itr->ResetToTermEndingBefore(51), IsOkAndHolds(Eq(42)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(17), IsOkAndHolds(Eq(14)));
EXPECT_THAT(itr->GetTerm(), Eq("ทุก"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(13), IsOkAndHolds(Eq(0)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(4), IsOkAndHolds(Eq(0)));
EXPECT_THAT(itr->GetTerm(), Eq("ฉัน"));
- EXPECT_THAT(itr->ResetToTermEndingBefore(34), IsOkAndHolds(Eq(21)));
+ EXPECT_THAT(itr->ResetToTermEndingBeforeUtf32(11), IsOkAndHolds(Eq(7)));
EXPECT_THAT(itr->GetTerm(), Eq("ไป"));
}
+TEST_P(ReverseJniLanguageSegmenterTest, QuerySyntax) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ auto language_segmenter,
+ language_segmenter_factory::Create(
+ GetSegmenterOptions(GetLocale(), jni_cache_.get())));
+ // Validates that query syntax characters are segmented as separate terms.
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::vector<std::string_view> terms,
+ language_segmenter->GetAllTerms(
+ "(-term1 OR term2) AND property1.subproperty2:term3"));
+ EXPECT_THAT(terms, ElementsAre("(", "-", "term1", " ", "OR", " ", "term2",
+ ")", " ", "AND", " ", "property1", ".",
+ "subproperty2", ":", "term3"));
+}
+
INSTANTIATE_TEST_SUITE_P(
LocaleName, ReverseJniLanguageSegmenterTest,
testing::Values(ULOC_US, ULOC_UK, ULOC_CANADA, ULOC_CANADA_FRENCH,
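
Note on the Utf32 APIs exercised above: they take UTF-32 (code point) offsets, not UTF-8 byte offsets. A minimal sketch (not part of this patch) of deriving one from the other, assuming only the CharacterIterator calls used later in this patch (AdvanceToUtf8 and utf32_index):

#include <cstdint>
#include <string_view>

#include "icing/util/character-iterator.h"

// Sketch: maps a UTF-8 byte offset in `text` to the corresponding UTF-32
// (code point) offset; error handling elided.
int32_t Utf8ToUtf32Offset(std::string_view text, int utf8_offset) {
  icing::lib::CharacterIterator it(text);
  it.AdvanceToUtf8(utf8_offset);  // Decodes forward one code point at a time.
  return it.utf32_index();
}

For "我每天走路去上班。", this maps byte offset 3 to code-point offset 1, consistent with the index tables in the tests above.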
diff --git a/icing/tokenization/rfc822-tokenizer.cc b/icing/tokenization/rfc822-tokenizer.cc
new file mode 100644
index 0000000..13c58c5
--- /dev/null
+++ b/icing/tokenization/rfc822-tokenizer.cc
@@ -0,0 +1,798 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/rfc822-tokenizer.h"
+
+#include <algorithm>
+#include <deque>
+#include <queue>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/tokenization/token.h"
+#include "icing/tokenization/tokenizer.h"
+#include "icing/util/character-iterator.h"
+#include "icing/util/i18n-utils.h"
+#include "icing/util/status-macros.h"
+#include "unicode/umachine.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+bool IsDelimiter(UChar32 c) { return c == ',' || c == ';' || c == '\n'; }
+} // namespace
+
+class Rfc822TokenIterator : public Tokenizer::Iterator {
+ public:
+ // The cursor is the index into the string_view; text_end_ is its length.
+ explicit Rfc822TokenIterator(std::string_view text)
+ : text_(std::move(text)),
+ iterator_(text, 0, 0, 0),
+ text_end_(text.length()),
+ token_index_(-1) {}
+
+ // Advance moves token_index_ forward, possibly past the end of tokens_.
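+ // For example (mirroring the EmptyMiddleToken test in this patch), given
+ // "<alex>,,<tom>": the first Advance() stops on the RFC822_TOKEN "<alex>",
+ // the second stops on "<tom>", and a third call returns false once the
+ // input is exhausted.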
+ bool Advance() override {
+ // Stop the token index on an RFC822 token, or one past the end, where the
+ // next RFC822 token will be if more are generated.
+ do {
+ token_index_++;
+ } while (token_index_ < tokens_.size() &&
+ tokens_[token_index_].type != Token::Type::RFC822_TOKEN);
+
+ // There is still something left; this is possible if we rewound and then called Advance.
+ if (token_index_ < tokens_.size()) {
+ return true;
+ }
+
+ // Done with the entire string_view.
+ if (iterator_.utf8_index() >= text_end_) {
+ return false;
+ }
+
+ // Parsing a new email; skip any leading whitespace first.
+ AdvancePastWhitespace();
+
+ // This may return false, as in the case of "<alex>,,", where after
+ // processing <alex>, there are no more tokens.
+ return GetNextRfc822Token();
+ }
+
+ // Returns the current token group: an RFC822_TOKEN along with all of its
+ // subtokens. For example, "tim@google.com" will return all tokens generated
+ // from that text.
+ //
+ // Returns:
+ // A vector of Tokens on success
+ // An empty vector if the token list is empty
+ // An empty vector if the index is past the end of the token list
+ std::vector<Token> GetTokens() const override {
+ std::vector<Token> result;
+ if (token_index_ < tokens_.size() && token_index_ >= 0) {
+ int index = token_index_;
+ do {
+ result.push_back(tokens_[index]);
+ } while (++index < tokens_.size() &&
+ tokens_[index].type != Token::Type::RFC822_TOKEN);
+ }
+ return result;
+ }
+
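+ // Resets token_index_ to the first RFC822_TOKEN parsed so far whose text
+ // starts after the given UTF-32 offset. Returns false if there is no such
+ // token.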
+ bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
+ CharacterIterator tracker(text_);
+ for (int new_index = 0; new_index < tokens_.size(); ++new_index) {
+ const Token& t = tokens_[new_index];
+ if (t.type != Token::Type::RFC822_TOKEN) {
+ continue;
+ }
+
+ tracker.AdvanceToUtf8(t.text.begin() - text_.begin());
+ if (tracker.utf32_index() > utf32_offset) {
+ token_index_ = new_index;
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ // Attempts to reset token_index_ to point to the last RFC822 token ending
+ // before the given offset. If it fails because no tokens end before the
+ // offset, token_index_ will become -1.
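+ // For example, in "<alex>,,<tom>" a call with utf32_offset 7 resets the
+ // iterator to the RFC822_TOKEN "<alex>", whose text ends at offset 6.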
+ bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
+ // First, advance until we pass offset or Advance is false
+ if (tokens_.empty()) {
+ if (!Advance()) {
+ // No tokens available, and Advancing doesn't get more, so return false.
+ return false;
+ }
+ }
+
+ CharacterIterator tracker(text_);
+
+ // Keep advancing until we parse all the emails, or run past the offset.
+ // Advance will always make token_index_ point to an RFC822_TOKEN, so we can
+ // look at that tokens text end to determine if it ends before the offset.
+ // This first loop will guarantee that we end up either past the offset or
+ // at the end.
+ do {
+ tracker.AdvanceToUtf8(tokens_[token_index_].text.end() - text_.begin());
+
+ // When we Advance and have to convert names to email addresses, it's
+ // possible that multiple RFC822 tokens are added. We need to advance
+ // through these one at a time; we cannot skip to the top of the line.
+ } while (tracker.utf32_index() <= utf32_offset && Advance());
+
+ // We are either past the offset or at the end. Either way, we now work
+ // backwards and reset to the first (highest index) RFC822_TOKEN we find.
+ while (--token_index_ >= 0) {
+ if (tokens_[token_index_].type != Token::Type::RFC822_TOKEN) {
+ continue;
+ }
+
+ tracker.MoveToUtf8(tokens_[token_index_].text.end() - text_.begin());
+ if (tracker.utf32_index() <= utf32_offset) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ // Returns a character iterator to the start of the token.
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
+ override {
+ CharacterIterator token_start = iterator_;
+ token_start.MoveToUtf8(GetTokens().at(0).text.begin() - text_.begin());
+ return token_start;
+ }
+
+ // Returns a character iterator to right after the end of the token.
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
+ override {
+ CharacterIterator token_end = iterator_;
+ token_end.MoveToUtf8(GetTokens().at(0).text.end() - text_.begin());
+ return token_end;
+ }
+
+ // Reset to start moves to the state we're in after the first Advance().
+ bool ResetToStart() override {
+ token_index_ = -1;
+ return Advance();
+ }
+
+ private:
+ // Advance until the next email delimiter, generating as many tokens as
+ // necessary.
+ bool GetNextRfc822Token() {
+ if (iterator_.utf8_index() >= text_end_) {
+ return false;
+ }
+
+ int token_start = iterator_.utf8_index();
+ bool address_found = false;
+ bool name_found = false;
+ std::vector<Token> next_tokens;
+ Token rfc822(Token::Type::RFC822_TOKEN);
+
+ // We start in the unquoted state and run until one of: " , ; \n < (
+ while (iterator_.utf8_index() < text_end_) {
+ UChar32 c = iterator_.GetCurrentChar();
+ if (IsDelimiter(c)) {
+ // End of the token, advance cursor past all delimiters then quit.
+ rfc822.text =
+ text_.substr(token_start, iterator_.utf8_index() - token_start);
+
+ UChar32 delimiter;
+ do {
+ AdvanceCursor();
+ delimiter = iterator_.GetCurrentChar();
+ // GetCurrentChar at the end of the text is not a delimiter, so this
+ // loop will terminate.
+ } while (IsDelimiter(delimiter));
+
+ break;
+ }
+
+ std::vector<Token> consume_result;
+ if (c == '"') {
+ consume_result = ConsumeQuotedSection();
+ name_found |= !consume_result.empty();
+ } else if (c == '(') {
+ consume_result = ConsumeParenthesizedSection();
+ } else if (c == '<') {
+ // Only set address_found to true if ConsumeAddress returns tokens.
+ // Otherwise, keep address_found as is so that a later empty result
+ // cannot flip it back to false.
+ consume_result = ConsumeAddress();
+ address_found |= !consume_result.empty();
+ } else {
+ consume_result = ConsumeUnquotedSection();
+ name_found |= !consume_result.empty();
+ }
+ next_tokens.insert(next_tokens.end(), consume_result.begin(),
+ consume_result.end());
+ }
+ if (iterator_.utf8_index() >= text_end_) {
+ rfc822.text = text_.substr(token_start, text_end_ - token_start);
+ }
+
+ // If an address is found, use the tokens we have.
+ // If an address isn't found, and a name isn't found, also use the tokens
+ // we have.
+ // If an address isn't found but a name is, convert name Tokens to email
+ // Tokens.
+ if (!address_found && name_found) {
+ // We don't add the rfc822 token, as it will be handled by
+ // ConvertNameToEmail.
+ std::vector<Token> converted_tokens = ConvertNameToEmail(next_tokens);
+ tokens_.insert(tokens_.end(), converted_tokens.begin(),
+ converted_tokens.end());
+ } else {
+ if (next_tokens.empty()) {
+ // Tokens may not be generated in the case of ",,,,,,"
+ return false;
+ } else {
+ // If tokens were generated, push back the RFC822 token for them
+ tokens_.push_back(rfc822);
+ tokens_.insert(tokens_.end(), next_tokens.begin(), next_tokens.end());
+ }
+ }
+
+ return true;
+ }
+
+ // We allow for the "First Last <email>" format, but if there is no email in
+ // brackets, we won't allow for unquoted spaces. For example, the input
+ // "alex@google.com tim@google.com" has an unquoted space, so we will split
+ // it into two emails. We don't need to find more tokens; we just need to
+ // find @ signs and spaces and convert name tokens to parts of the email.
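+ // For example (consistent with the NoBrackets test in this patch),
+ // "alex@google.com tim@google.com" yields two RFC822_TOKENs, each followed
+ // by its own RFC822_ADDRESS_COMPONENT_LOCAL and _HOST tokens and then its
+ // RFC822_ADDRESS, RFC822_LOCAL_ADDRESS, and RFC822_HOST_ADDRESS tokens.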
+ std::vector<Token> ConvertNameToEmail(std::vector<Token>& name_tokens) {
+ if (name_tokens.empty()) {
+ return name_tokens;
+ }
+
+ // There will only be names and comments, and they will be in order.
+ std::vector<Token> converted_tokens;
+
+ // Start at the beginning of the current email.
+ CharacterIterator scanner(text_);
+
+ scanner.MoveToUtf8(name_tokens[0].text.begin() - text_.begin());
+ int token_processed_index = 0;
+
+ bool in_quote = false;
+ // at_sign_index starts as nullptr; it is only set to something else if
+ // we find an @ sign.
+ const char* at_sign_index = nullptr;
+
+ // Run to the end
+ while (scanner.utf8_index() < iterator_.utf8_index()) {
+ const char* end_of_token = nullptr;
+ UChar32 c = scanner.GetCurrentChar();
+ if (c == '\\') {
+ // Skip the backslash, as well as the character it escapes.
+ scanner.AdvanceToUtf32(scanner.utf32_index() + 1);
+ scanner.AdvanceToUtf32(scanner.utf32_index() + 1);
+ continue;
+ }
+ if (c == '"') {
+ in_quote = !in_quote;
+ }
+ if (c == '@') {
+ at_sign_index = text_.begin() + scanner.utf8_index();
+ }
+
+ // If the next character is the end OR we hit an unquoted space.
+ if (scanner.utf8_index() + i18n_utils::GetUtf8Length(c) ==
+ iterator_.utf8_index() ||
+ (!in_quote && c == ' ')) {
+ if (!in_quote && c == ' ') {
+ end_of_token = text_.begin() + scanner.utf8_index();
+ } else {
+ end_of_token = text_.begin() + iterator_.utf8_index();
+ }
+ std::deque<Token> more_tokens = ConvertOneNameToEmail(
+ name_tokens, at_sign_index, end_of_token, token_processed_index);
+ converted_tokens.insert(converted_tokens.end(), more_tokens.begin(),
+ more_tokens.end());
+ // Reset the at_sign_index
+ at_sign_index = nullptr;
+ }
+ scanner.AdvanceToUtf32(scanner.utf32_index() + 1);
+ }
+
+ // Flush any remaining tokens that were not handled in the loop above.
+ if (token_processed_index < name_tokens.size()) {
+ std::deque<Token> more_tokens =
+ ConvertOneNameToEmail(name_tokens, at_sign_index,
+ name_tokens[name_tokens.size() - 1].text.end(),
+ token_processed_index);
+ converted_tokens.insert(converted_tokens.end(), more_tokens.begin(),
+ more_tokens.end());
+ }
+
+ return converted_tokens;
+ }
+
+ // Once a name is determined to be an address, convert its tokens to address
+ // tokens.
+ std::deque<Token> ConvertOneNameToEmail(const std::vector<Token>& name_tokens,
+ const char* at_sign_index,
+ const char* end_of_token,
+ int& token_processed_index) {
+ const char* address_start = nullptr;
+ const char* local_address_end = nullptr;
+ const char* host_address_start = nullptr;
+ const char* address_end = nullptr;
+ const char* token_start = nullptr;
+ const char* token_end = nullptr;
+ std::deque<Token> converted_tokens;
+
+ // Transform tokens up to the end_of_token pointer.
+
+ for (; token_processed_index < name_tokens.size();
+ ++token_processed_index) {
+ const Token& token = name_tokens[token_processed_index];
+
+ if (token.text.end() > end_of_token) {
+ break;
+ }
+ std::string_view text = token.text;
+ // We need to do this both for comment and name tokens. Comment tokens
+ // will get a corresponding RFC822 token, but not an address or local
+ // address.
+ if (token_start == nullptr) {
+ token_start = text.begin();
+ }
+ token_end = text.end();
+
+ if (token.type == Token::Type::RFC822_COMMENT) {
+ // Comment tokens will stay as they are.
+ converted_tokens.push_back(token);
+ } else if (token.type == Token::Type::RFC822_NAME) {
+ // Names need to be converted to address tokens. We keep the order in
+ // which the name tokens appeared. Name tokens that appear before an
+ // @ sign in the name will become RFC822_ADDRESS_COMPONENT_LOCAL, and
+ // those after will become RFC822_ADDRESS_COMPONENT_HOST. We aren't
+ // able to determine RFC822_ADDRESS, RFC822_LOCAL_ADDRESS, and
+ // RFC822_HOST_ADDRESS before checking the name tokens, so they will
+ // be added after the component tokens.
+ if (address_start == nullptr) {
+ address_start = text.begin();
+ }
+ address_end = text.end();
+ if (text.begin() > at_sign_index) {
+ if (host_address_start == nullptr) {
+ host_address_start = text.begin();
+ }
+ // Once this is hit, we switch to COMPONENT_HOST and mark end of the
+ // local address
+ converted_tokens.push_back(
+ Token(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, token.text));
+ } else {
+ local_address_end = text.end();
+ converted_tokens.push_back(
+ Token(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, token.text));
+ }
+ }
+ }
+
+ if (address_start != nullptr) {
+ converted_tokens.push_back(
+ Token(Token::Type::RFC822_ADDRESS,
+ std::string_view(address_start, address_end - address_start)));
+ if (local_address_end != nullptr) {
+ converted_tokens.push_back(
+ Token(Token::Type::RFC822_LOCAL_ADDRESS,
+ std::string_view(address_start,
+ local_address_end - address_start)));
+ }
+ }
+
+ if (host_address_start != nullptr && host_address_start < address_end) {
+ converted_tokens.push_back(
+ Token(Token::Type::RFC822_HOST_ADDRESS,
+ text_.substr(host_address_start - text_.begin(),
+ address_end - host_address_start)));
+ }
+
+ if (token_start != nullptr) {
+ converted_tokens.push_front(
+ Token(Token::Type::RFC822_TOKEN,
+ std::string_view(token_start, token_end - token_start)));
+ }
+
+ return converted_tokens;
+ }
+
+ // Returns name tokens in an unquoted section. This is useful in case we do
+ // not find an address and have to use the name. An unquoted section may look
+ // like "Alex Sav", or "alex@google.com". In the absense of a bracketed email
+ // address, the unquoted section will be used as the email address along with
+ // the quoted section.
+ std::vector<Token> ConsumeUnquotedSection() {
+ UChar32 c;
+
+ int token_start = -1;
+ std::vector<Token> next_tokens;
+
+ // Advance to another state or to a character marking the end of the
+ // token: one of \n , ;
+ while (iterator_.utf8_index() < text_end_) {
+ c = iterator_.GetCurrentChar();
+
+ if (i18n_utils::IsAlphaNumeric(c)) {
+ if (token_start == -1) {
+ // Start recording
+ token_start = iterator_.utf8_index();
+ }
+ AdvanceCursor();
+
+ } else {
+ if (token_start != -1) {
+ // The character is non-alphanumeric; save the pending token.
+ next_tokens.push_back(Token(
+ Token::Type::RFC822_NAME,
+ text_.substr(token_start, iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+
+ if (c == '"' || c == '<' || c == '(' || IsDelimiter(c)) {
+ // Stay on the token.
+ break;
+ }
+
+ AdvanceCursor();
+ }
+ }
+ if (token_start != -1) {
+ next_tokens.push_back(Token(
+ Token::Type::RFC822_NAME,
+ text_.substr(token_start, iterator_.utf8_index() - token_start)));
+ }
+ return next_tokens;
+ }
+
+ // Names that are within quotes should have all characters blindly
+ // unescaped. When a name is made into an address, it isn't re-escaped.
+
+ // Returns name tokens found in a quoted section. This is useful in case we do
+ // not find an address and have to use the name. The quoted section may
+ // contain whitespace.
+ std::vector<Token> ConsumeQuotedSection() {
+ // Get past the first quote.
+ AdvanceCursor();
+
+ bool end_quote_found = false;
+ std::vector<Token> next_tokens;
+ UChar32 c;
+
+ int token_start = -1;
+
+ while (!end_quote_found && (iterator_.utf8_index() < text_end_)) {
+ c = iterator_.GetCurrentChar();
+
+ if (i18n_utils::IsAlphaNumeric(c)) {
+ if (token_start == -1) {
+ // Start tracking the token.
+ token_start = iterator_.utf8_index();
+ }
+ AdvanceCursor();
+
+ } else {
+ // Non-alphanumeric.
+ if (c == '\\') {
+ // A backslash, let's look at the next character.
+ CharacterIterator temp = iterator_;
+ temp.AdvanceToUtf32(iterator_.utf32_index() + 1);
+ UChar32 n = temp.GetCurrentChar();
+ if (i18n_utils::IsAlphaNumeric(n)) {
+ // The next character is alphabetic, skip the slash and don't end
+ // the last token. For quoted sections, the only things that are
+ // escaped are double quotes and slashes. For example, in "a\lex",
+ // an l appears after the slash. We want to treat this as if it
+ // was just "alex". So we tokenize it as <RFC822_NAME, "a\lex">.
+ AdvanceCursor();
+ } else {
+ // Not alphabetic, so save the last token if necessary.
+ if (token_start != -1) {
+ next_tokens.push_back(
+ Token(Token::Type::RFC822_NAME,
+ text_.substr(token_start,
+ iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+
+ // Skip the backslash.
+ AdvanceCursor();
+
+ if (n == '"' || n == '\\' || n == '@') {
+ // Skip these too if they're next.
+ AdvanceCursor();
+ }
+ }
+ } else {
+ // Not a backslash.
+
+ if (token_start != -1) {
+ next_tokens.push_back(
+ Token(Token::Type::RFC822_NAME,
+ text_.substr(token_start,
+ iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+
+ if (c == '"') {
+ end_quote_found = true;
+ }
+ // Advance one more time to get past the non-alphabetic character.
+ AdvanceCursor();
+ }
+ }
+ }
+ if (token_start != -1) {
+ next_tokens.push_back(Token(
+ Token::Type::RFC822_NAME,
+ text_.substr(token_start, iterator_.utf8_index() - token_start)));
+ }
+ return next_tokens;
+ }
+
+ // '(', ')', '\\' chars should be escaped. All other escaped chars should be
+ // unescaped.
+ std::vector<Token> ConsumeParenthesizedSection() {
+ // Skip the initial (
+ AdvanceCursor();
+
+ int paren_layer = 1;
+ UChar32 c;
+ std::vector<Token> next_tokens;
+
+ int token_start = -1;
+
+ while (paren_layer > 0 && (iterator_.utf8_index() < text_end_)) {
+ c = iterator_.GetCurrentChar();
+
+ if (i18n_utils::IsAlphaNumeric(c)) {
+ if (token_start == -1) {
+ // Start tracking a token.
+ token_start = iterator_.utf8_index();
+ }
+ AdvanceCursor();
+ } else {
+ // Non-alphanumeric.
+ if (c == '\\') {
+ // A backslash, let's look at the next character.
+ UChar32 n = i18n_utils::GetUChar32At(text_.begin(), text_.length(),
+ iterator_.utf8_index() + 1);
+ if (i18n_utils::IsAlphaNumeric(n)) {
+ // Alphabetic, skip the slash and don't end the last token.
+ AdvanceCursor();
+ } else {
+ // Not alphabetic, save the last token if necessary.
+ if (token_start != -1) {
+ next_tokens.push_back(
+ Token(Token::Type::RFC822_COMMENT,
+ text_.substr(token_start,
+ iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+
+ // Skip the backslash.
+ AdvanceCursor();
+
+ if (n == ')' || n == '(' || n == '\\') {
+ // Skip these too if they're next.
+ AdvanceCursor();
+ }
+ }
+ } else {
+ // Not a backslash.
+ if (token_start != -1) {
+ next_tokens.push_back(
+ Token(Token::Type::RFC822_COMMENT,
+ text_.substr(token_start,
+ iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+
+ if (c == '(') {
+ paren_layer++;
+ } else if (c == ')') {
+ paren_layer--;
+ }
+ AdvanceCursor();
+ }
+ }
+ }
+
+ if (token_start != -1) {
+ // Ran past the end of text_ without getting the last token.
+
+ // substr returns "a view of the substring [pos, pos + rcount), where
+ // rcount is the smaller of count and size() - pos", so the count
+ // argument can be any value >= iterator_.utf8_index() - token_start.
+ // Hence we ignore the mutation warning.
+ next_tokens.push_back(Token(
+ Token::Type::RFC822_COMMENT,
+ text_.substr(token_start, iterator_.utf8_index() - token_start)));
+ }
+ return next_tokens;
+ }
+
+ // Returns tokens found in the address.
+ std::vector<Token> ConsumeAddress() {
+ // Skip the first <.
+ AdvanceCursor();
+
+ // Save the start position.
+ CharacterIterator address_start_iterator = iterator_;
+ std::vector<Token> next_tokens;
+
+ // at_sign starts as -1; if no @ is found it is later placed on the '<',
+ // so that the entire address is treated as the host part.
+ int at_sign = -1;
+ int address_end = -1;
+
+ UChar32 c = iterator_.GetCurrentChar();
+ // Quick scan for @ and > signs.
+ while (c != '>' && iterator_.utf8_index() < text_end_) {
+ AdvanceCursor();
+ c = iterator_.GetCurrentChar();
+ if (c == '@') {
+ at_sign = iterator_.utf8_index();
+ }
+ }
+
+ if (iterator_.utf8_index() <= address_start_iterator.utf8_index()) {
+ // There is nothing between the brackets, either we have "<" or "<>".
+ return next_tokens;
+ }
+
+ // Either we find a > or run to the end; either way this is the end of the
+ // address. The closing bracket will be handled by ConsumeUnquotedSection.
+ address_end = iterator_.utf8_index();
+
+ // Reset to the start.
+ iterator_ = address_start_iterator;
+
+ int address_start = address_start_iterator.utf8_index();
+
+ Token::Type type = Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL;
+
+ // Create a local address token.
+ if (at_sign != -1) {
+ next_tokens.push_back(
+ Token(Token::Type::RFC822_LOCAL_ADDRESS,
+ text_.substr(address_start, at_sign - address_start)));
+ } else {
+ // All the tokens in the address are host components.
+ type = Token::Type::RFC822_ADDRESS_COMPONENT_HOST;
+ // If no @ is found, treat the entire address as the host address.
+ at_sign = address_start - 1;
+ }
+
+ // The only case where we don't have a host address part is something like
+ // <localaddress@>. If there is no @, at_sign was set to address_start - 1
+ // above, so the host address is [address_start, address_end).
+ int host_address_start = at_sign + 1;
+ if (host_address_start < address_end) {
+ next_tokens.push_back(Token(
+ Token::Type::RFC822_HOST_ADDRESS,
+ text_.substr(host_address_start, address_end - host_address_start)));
+ }
+
+ next_tokens.push_back(
+ Token(Token::Type::RFC822_ADDRESS,
+ text_.substr(address_start, address_end - address_start)));
+
+ int token_start = -1;
+
+ while (iterator_.utf8_index() < address_end) {
+ c = iterator_.GetCurrentChar();
+
+ if (i18n_utils::IsAlphaNumeric(c)) {
+ if (token_start == -1) {
+ token_start = iterator_.utf8_index();
+ }
+ } else {
+ // Non-alphanumeric.
+ if (c == '\\') {
+ // A backslash, let's look at the next character.
+ CharacterIterator temp = iterator_;
+ temp.AdvanceToUtf32(iterator_.utf32_index() + 1);
+ UChar32 n = temp.GetCurrentChar();
+ if (!i18n_utils::IsAlphaNumeric(n)) {
+ // Not alphabetic, end the last token if necessary.
+ if (token_start != -1) {
+ next_tokens.push_back(Token(
+ type, text_.substr(token_start,
+ iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+ }
+ } else {
+ // Not backslash.
+ if (token_start != -1) {
+ next_tokens.push_back(Token(
+ type, text_.substr(token_start,
+ iterator_.utf8_index() - token_start)));
+ token_start = -1;
+ }
+ // Switch to host component tokens.
+ if (iterator_.utf8_index() == at_sign) {
+ type = Token::Type::RFC822_ADDRESS_COMPONENT_HOST;
+ }
+ }
+ }
+ AdvanceCursor();
+ }
+ if (token_start != -1) {
+ next_tokens.push_back(Token(
+ type,
+ text_.substr(token_start, iterator_.utf8_index() - token_start)));
+ }
+ // ConsumeUnquotedSection will handle the closing bracket > if there is one.
+ return next_tokens;
+ }
+
+ void AdvanceCursor() {
+ iterator_.AdvanceToUtf32(iterator_.utf32_index() + 1);
+ }
+
+ void AdvancePastWhitespace() {
+ while (i18n_utils::IsWhitespaceAt(text_, iterator_.utf8_index())) {
+ AdvanceCursor();
+ }
+ }
+
+ std::string_view text_;
+ CharacterIterator iterator_;
+ int text_end_;
+
+ // A temporary store of Tokens. As we advance through the provided string,
+ // we parse entire addresses at a time rather than one token at a time.
+ // However, since we call the tokenizer with Advance() alternating with
+ // GetTokens(), we need to store tokens for subsequent GetTokens() calls if
+ // Advance generates multiple tokens (it usually does). A vector is used as
+ // we need to iterate back and forth through tokens during snippeting. It is
+ // cleared by the destructor.
+ std::vector<Token> tokens_;
+ // Index to keep track of where we are in tokens_. This will always be set to
+ // point to an RFC822_TOKEN, or one past the end of the tokens_ vector. The
+ // only exception is before the first Advance call.
+ int token_index_;
+};
+
+libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
+Rfc822Tokenizer::Tokenize(std::string_view text) const {
+ return std::make_unique<Rfc822TokenIterator>(text);
+}
+
+libtextclassifier3::StatusOr<std::vector<Token>> Rfc822Tokenizer::TokenizeAll(
+ std::string_view text) const {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
+ Tokenize(text));
+ std::vector<Token> tokens;
+ while (iterator->Advance()) {
+ std::vector<Token> batch_tokens = iterator->GetTokens();
+ tokens.insert(tokens.end(), batch_tokens.begin(), batch_tokens.end());
+ }
+ return tokens;
+}
+
+} // namespace lib
+} // namespace icing
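
For orientation, a minimal usage sketch (not part of this patch) driving the tokenizer through the surface shown in the tests below, assuming only Tokenize(), Advance(), GetTokens(), and StatusOr::ValueOrDie():

#include <memory>
#include <string_view>

#include "icing/tokenization/rfc822-tokenizer.h"

// Sketch: visits one RFC822 address group per Advance() call.
void VisitRfc822Tokens(std::string_view text) {
  icing::lib::Rfc822Tokenizer tokenizer;
  std::unique_ptr<icing::lib::Tokenizer::Iterator> it =
      tokenizer.Tokenize(text).ValueOrDie();
  while (it->Advance()) {
    // GetTokens() returns the current RFC822_TOKEN plus all of its subtokens;
    // token.text is a std::string_view into the original input.
    for (const icing::lib::Token& token : it->GetTokens()) {
      (void)token;  // Inspect token.type and token.text here.
    }
  }
}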
diff --git a/icing/tokenization/rfc822-tokenizer.h b/icing/tokenization/rfc822-tokenizer.h
new file mode 100644
index 0000000..09e4624
--- /dev/null
+++ b/icing/tokenization/rfc822-tokenizer.h
@@ -0,0 +1,38 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_RFC822_TOKENIZER_H_
+#define ICING_TOKENIZATION_RFC822_TOKENIZER_H_
+
+#include <vector>
+
+#include "icing/tokenization/tokenizer.h"
+
+namespace icing {
+namespace lib {
+
+class Rfc822Tokenizer : public Tokenizer {
+ public:
+ libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize(
+ std::string_view text) const override;
+
+ libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
+ std::string_view text) const override;
+
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_RFC822_TOKENIZER_H_
diff --git a/icing/tokenization/rfc822-tokenizer_test.cc b/icing/tokenization/rfc822-tokenizer_test.cc
new file mode 100644
index 0000000..ee3a95d
--- /dev/null
+++ b/icing/tokenization/rfc822-tokenizer_test.cc
@@ -0,0 +1,992 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/rfc822-tokenizer.h"
+
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/common-matchers.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::ElementsAre;
+using ::testing::IsEmpty;
+
+TEST(Rfc822TokenizerTest, StartingState) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "a@g.c";
+ auto token_iterator = rfc822_tokenizer.Tokenize(text).ValueOrDie();
+
+ ASSERT_THAT(token_iterator->GetTokens(), IsEmpty());
+ ASSERT_TRUE(token_iterator->Advance());
+ ASSERT_THAT(token_iterator->GetTokens(), Not(IsEmpty()));
+}
+
+TEST(Rfc822TokenizerTest, EmptyMiddleToken) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string s("<alex>,,<tom>");
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "<alex>"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "alex"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<tom>"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "tom"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "tom"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "tom"))));
+}
+
+TEST(Rfc822TokenizerTest, Simple) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("<你alex@google.com>");
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "<你alex@google.com>"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "你alex"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "你alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "你alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"))));
+}
+
+TEST(Rfc822TokenizerTest, Small) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string s = "\"a\"";
+
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "a"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "a"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "a"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "a"))));
+
+ s = "\"a\", \"b\"";
+
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "a"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "a"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "a"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "a"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "b"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "b"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "b"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "b"))));
+
+ s = "(a)";
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::RFC822_TOKEN, "(a)"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "a"))));
+}
+
+TEST(Rfc822TokenizerTest, PB) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("peanut (comment) butter, <alex@google.com>");
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "peanut"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "peanut"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "peanut"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "peanut"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "comment"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "butter"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "butter"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "butter"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "butter"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<alex@google.com>"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"))));
+}
+
+TEST(Rfc822TokenizerTest, NoBrackets) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("alex@google.com");
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"))));
+}
+
+TEST(Rfc822TokenizerTest, TwoAddresses) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("<你alex@google.com>; <alexsav@gmail.com>");
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "<你alex@google.com>"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "你alex"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "你alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "你alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<alexsav@gmail.com>"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alexsav"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "gmail.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alexsav@gmail.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alexsav"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "gmail"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"))));
+}
+
+TEST(Rfc822TokenizerTest, Comment) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("(a comment) <alex@google.com>");
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ "(a comment) <alex@google.com>"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "a"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"))));
+}
+
+TEST(Rfc822TokenizerTest, NameAndComment) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string_view s("\"a name\" also a name <alex@google.com>");
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ "\"a name\" also a name <alex@google.com>"),
+ EqualsToken(Token::Type::RFC822_NAME, "a"),
+ EqualsToken(Token::Type::RFC822_NAME, "name"),
+ EqualsToken(Token::Type::RFC822_NAME, "also"),
+ EqualsToken(Token::Type::RFC822_NAME, "a"),
+ EqualsToken(Token::Type::RFC822_NAME, "name"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"))));
+}
+
+// Test from tokenizer_test.cc.
+TEST(Rfc822TokenizerTest, Rfc822SanityCheck) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string addr1("A name (A comment) <address@domain.com>");
+ std::string addr2(
+ "\"(Another name)\" (A different comment) "
+ "<bob-loblaw@foo.bar.com>");
+ std::string addr3("<no.at.sign.present>");
+ std::string addr4("<double@at@signs.present>");
+ std::string rfc822 = addr1 + ", " + addr2 + ", " + addr3 + ", " + addr4;
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(rfc822),
+ IsOkAndHolds(ElementsAre(
+
+ EqualsToken(Token::Type::RFC822_TOKEN, addr1),
+ EqualsToken(Token::Type::RFC822_NAME, "A"),
+ EqualsToken(Token::Type::RFC822_NAME, "name"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "A"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "domain.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "address@domain.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "domain"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+
+ EqualsToken(Token::Type::RFC822_TOKEN, addr2),
+ EqualsToken(Token::Type::RFC822_NAME, "Another"),
+ EqualsToken(Token::Type::RFC822_NAME, "name"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "A"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "different"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "bob-loblaw"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "foo.bar.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "bob-loblaw@foo.bar.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "bob"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "loblaw"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "bar"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+
+ EqualsToken(Token::Type::RFC822_TOKEN, addr3),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "no.at.sign.present"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "no.at.sign.present"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "no"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "at"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "sign"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "present"),
+
+ EqualsToken(Token::Type::RFC822_TOKEN, addr4),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "double@at"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "signs.present"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "double@at@signs.present"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "double"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "at"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "signs"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "present"))));
+}
+
+// Tests ported from the rfc822 converter.
+TEST(Rfc822TokenizerTest, SimpleRfcText) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string test_string =
+ "foo@google.com,bar@google.com,baz@google.com,foo+hello@google.com,baz@"
+ "corp.google.com";
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(test_string),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "foo@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "foo@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "foo"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+
+ EqualsToken(Token::Type::RFC822_TOKEN, "bar@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "bar"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "bar@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "bar"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+
+ EqualsToken(Token::Type::RFC822_TOKEN, "baz@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "baz"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "baz@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "baz"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+
+ EqualsToken(Token::Type::RFC822_TOKEN, "foo+hello@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "hello"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "foo+hello@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "foo+hello"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+
+ EqualsToken(Token::Type::RFC822_TOKEN, "baz@corp.google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "baz"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "corp"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "baz@corp.google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "baz"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "corp.google.com"))));
+}
+
+TEST(Rfc822TokenizerTest, ComplicatedRfcText) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string test_string =
+ R"raw("Weird, But&(Also)\\Valid" Name (!With, "an" \\odd\\ cmt too¡) <Foo B(a)r,Baz@g.co>
+ <easy@google.com>)raw";
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(test_string),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(
+ Token::Type::RFC822_TOKEN,
+ R"raw("Weird, But&(Also)\\Valid" Name (!With, "an" \\odd\\ cmt too¡) <Foo B(a)r,Baz@g.co>)raw"),
+ EqualsToken(Token::Type::RFC822_NAME, "Weird"),
+ EqualsToken(Token::Type::RFC822_NAME, "But"),
+ EqualsToken(Token::Type::RFC822_NAME, "Also"),
+ EqualsToken(Token::Type::RFC822_NAME, "Valid"),
+ EqualsToken(Token::Type::RFC822_NAME, "Name"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "With"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "an"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "odd"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "cmt"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "too"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "Foo B(a)r,Baz"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "g.co"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "Foo B(a)r,Baz@g.co"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "Foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "B"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "a"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "r"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "Baz"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "g"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "co"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "<easy@google.com>"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "easy"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "easy@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "easy"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"))));
+}
+
+TEST(Rfc822TokenizerTest, FromHtmlBugs) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ // This input used to cause an HTML parsing exception. We no longer do HTML
+ // parsing (b/8388100), so we just check that tokenizing does not crash and
+ // that the input is retained.
+
+ // http://b/8988210. Puts the crashing string "&\r" x 100 into the name and
+ // comment fields of the rfc822 token.
+
+ std::string s("\"");
+ for (int i = 0; i < 100; i++) {
+ s.append("&\r");
+ }
+ s.append("\" (");
+ for (int i = 0; i < 100; i++) {
+ s.append("&\r");
+ }
+ s.append(") <foo@google.com>");
+
+ // Tokenizing shouldn't alter the input; the RFC822_TOKEN retains it as-is.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, s),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "foo"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "foo@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"))));
+}
+
+TEST(Rfc822TokenizerTest, EmptyComponentsTest) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(""),
+ IsOkAndHolds(testing::IsEmpty()));
+
+ // Name is considered the address if address is empty.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("name<>"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "name"))));
+
+ // An empty name and address means that there is no address token.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("(a long comment with nothing else)"),
+ IsOkAndHolds(
+ ElementsAre(EqualsToken(Token::Type::RFC822_TOKEN,
+ "(a long comment with nothing else)"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "a"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "long"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "with"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "nothing"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "else"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("name ()"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "name"))));
+
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(R"((comment) "")"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "(comment) \"\""),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"))));
+}
+
+TEST(Rfc822TokenizerTest, NameTest) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ // Name spread between address or comment.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("peanut <address> butter"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "peanut <address> butter"),
+ EqualsToken(Token::Type::RFC822_NAME, "peanut"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "address"),
+ EqualsToken(Token::Type::RFC822_NAME, "butter"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("peanut (comment) butter"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "peanut"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "peanut"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "peanut"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "peanut"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "comment"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "butter"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "butter"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "butter"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "butter"))));
+
+ // Dropping quotes when they're not needed.
+ std::string s = R"(peanut <address> "butter")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, s),
+ EqualsToken(Token::Type::RFC822_NAME, "peanut"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "address"),
+ EqualsToken(Token::Type::RFC822_NAME, "butter"))));
+
+ s = R"(peanut "butter")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(s),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "peanut"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "peanut"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "peanut"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "peanut"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "butter"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "butter"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "butter"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "butter"))));
+ // Adding quotes when they are needed.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("ple@se quote this <addr>"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "ple@se quote this <addr>"),
+ EqualsToken(Token::Type::RFC822_NAME, "ple"),
+ EqualsToken(Token::Type::RFC822_NAME, "se"),
+ EqualsToken(Token::Type::RFC822_NAME, "quote"),
+ EqualsToken(Token::Type::RFC822_NAME, "this"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "addr"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "addr"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "addr"))));
+}
+
+TEST(Rfc822TokenizerTest, CommentEscapeTest) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ // '(', ')', '\\' chars should be escaped. All other escaped chars should be
+ // unescaped.
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(R"((co\)mm\\en\(t))"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, R"((co\)mm\\en\(t))"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "co"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "mm"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "en"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "t"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"((c\om\ment) name)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, R"(c\om\ment)"),
+ EqualsToken(Token::Type::RFC822_COMMENT, R"(c\om\ment)"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "name"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"((co(m\))ment) name)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, R"(co(m\))ment)"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "co"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "m"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "ment"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "name"))));
+}
+
+TEST(Rfc822TokenizerTest, QuoteEscapeTest) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ // All names that include non-alphanumeric chars must be quoted and have '\\'
+ // and '"' chars escaped.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"(n\\a\me <addr>)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, R"(n\\a\me <addr>)"),
+ EqualsToken(Token::Type::RFC822_NAME, "n"),
+ EqualsToken(Token::Type::RFC822_NAME, "a"),
+ EqualsToken(Token::Type::RFC822_NAME, "me"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "addr"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "addr"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "addr"))));
+
+ // Names that are within quotes should have all characters blindly unescaped.
+ // When a name is made into an address, it isn't re-escaped.
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"("n\\a\m\"e")"),
+ // <n\am"e>
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, R"(n\\a\m\"e)"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "n"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "a\\m"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "e"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, R"(n\\a\m\"e)"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, R"(n\\a\m\"e)"))));
+}
+
+TEST(Rfc822TokenizerTest, UnterminatedComponentTest) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll("name (comment"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "comment"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"(half of "the name)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "half"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "half"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "half"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "half"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "of"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "of"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "of"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "of"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "the name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "the"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "the name"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "the name"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"("name\)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "name"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"(name (comment\)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "comment"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"(<addr> "name\)"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "<addr> \"name\\"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "addr"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "addr"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "addr"),
+ EqualsToken(Token::Type::RFC822_NAME, "name"))));
+
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(R"(name (comment\))"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "comment"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"))));
+}
+
+TEST(Rfc822TokenizerTest, Tokenize) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ std::string text =
+ R"raw("Berg" (home) <berg\@google.com>, tom\@google.com (work))raw";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ R"("Berg" (home) <berg\@google.com>)"),
+ EqualsToken(Token::Type::RFC822_NAME, "Berg"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "home"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "berg\\"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "berg\\@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "berg"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "tom\\@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "tom"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "tom\\@google.com"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "tom\\@google.com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "work"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "work"))));
+
+ text = R"raw(Foo Bar (something) <foo\@google.com>, )raw"
+ R"raw(blah\@google.com (something))raw";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ "Foo Bar (something) <foo\\@google.com>"),
+ EqualsToken(Token::Type::RFC822_NAME, "Foo"),
+ EqualsToken(Token::Type::RFC822_NAME, "Bar"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "something"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "foo\\"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "foo\\@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "blah\\@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "blah"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "blah\\@google.com"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "blah\\@google.com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "something"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "something"))));
+}
+
+TEST(Rfc822TokenizerTest, EdgeCases) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+
+ // Text that triggers the scenario where a non-alphabetic character followed
+ // by a \ followed by another non-alphabetic character ends an in-address token.
+ std::string text = R"raw(<be.\&rg@google.com>)raw";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN,
+ R"raw(<be.\&rg@google.com>)raw"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "be.\\&rg"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "be.\\&rg@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "be"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "rg"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"))));
+
+ // A \ followed by an alphabetic character shouldn't end the token.
+ text = "<a\\lex@google.com>";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "<a\\lex@google.com>"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "a\\lex"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "a\\lex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "a\\lex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"))));
+
+ // \\ or \" in a quoted section.
+ text = R"("al\\ex@goo\"<idk>gle.com")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, R"(al\\ex@goo\"<idk>gle.com)"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "al"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "ex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "goo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "idk"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "gle"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS,
+ R"(al\\ex@goo\"<idk>gle.com)"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "al\\\\ex"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "goo\\\"<idk>gle.com"))));
+
+ text = "<alex@google.com";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "<alex@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"))));
+}
+
+TEST(Rfc822TokenizerTest, NumberInAddress) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "<3alex@google.com>";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "<3alex@google.com>"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "3alex"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "3alex@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "3alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"))));
+}
+
+TEST(Rfc822TokenizerTest, DoubleQuoteDoubleSlash) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = R"("alex\"")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "alex"))));
+
+ text = R"("alex\\\a")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, R"(alex\\\a)"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "a"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, R"(alex\\\a)"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, R"(alex\\\a)"))));
+}
+
+TEST(Rfc822TokenizerTest, TwoEmails) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "tjbarron@google.com alexsav@google.com";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "tjbarron@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "tjbarron"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "tjbarron@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "tjbarron"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "alexsav@google.com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alexsav"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alexsav@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alexsav"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"))));
+}
+
+TEST(Rfc822TokenizerTest, BackSlashes) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = R"("\name")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "name"))));
+
+ text = R"("name@foo\@gmail")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "name@foo\\@gmail"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "name"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "gmail"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "name@foo\\@gmail"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "name"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "foo\\@gmail"))));
+}
+
+TEST(Rfc822TokenizerTest, BigWhitespace) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "\"quoted\" <address>";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, text),
+ EqualsToken(Token::Type::RFC822_NAME, "quoted"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "address"))));
+}
+
+TEST(Rfc822TokenizerTest, AtSignFirst) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "\"@foo\"";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "foo"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "foo"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "foo"))));
+}
+
+TEST(Rfc822TokenizerTest, SlashThenUnicode) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = R"("quoted\你cjk")";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "quoted\\你cjk"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST,
+ "quoted\\你cjk"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "quoted\\你cjk"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "quoted\\你cjk"))));
+}
+
+TEST(Rfc822TokenizerTest, AddressEmptyAddress) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "<address> <> Name";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, text),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "address"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "address"),
+ EqualsToken(Token::Type::RFC822_NAME, "Name"))));
+}
+
+TEST(Rfc822TokenizerTest, ProperComment) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "(comment)alex@google.com";
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "comment)alex@google.com"),
+ EqualsToken(Token::Type::RFC822_COMMENT, "comment"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "google"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "com"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex@google.com"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "google.com"))));
+}
+
+TEST(Rfc822TokenizerTest, SmallNameToEmail) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "a@g.c,b@g.c";
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "a@g.c"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "a"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "g"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "c"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "a@g.c"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "a"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "g.c"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "b@g.c"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "b"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "g"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "c"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "b@g.c"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "b"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "g.c"))));
+
+ text = "a\\\\@g.c";
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "a\\\\@g.c"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "a"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "g"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_HOST, "c"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "a\\\\@g.c"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "a"),
+ EqualsToken(Token::Type::RFC822_HOST_ADDRESS, "g.c"))));
+}
+
+TEST(Rfc822TokenizerTest, AtSignLast) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string_view text("<alex@>, tim@");
+ EXPECT_THAT(
+ rfc822_tokenizer.TokenizeAll(text),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::RFC822_TOKEN, "<alex@>"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "alex"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "alex@"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "alex"),
+ EqualsToken(Token::Type::RFC822_TOKEN, "tim"),
+ EqualsToken(Token::Type::RFC822_ADDRESS_COMPONENT_LOCAL, "tim"),
+ EqualsToken(Token::Type::RFC822_ADDRESS, "tim"),
+ EqualsToken(Token::Type::RFC822_LOCAL_ADDRESS, "tim"))));
+}
+
+TEST(Rfc822TokenizerTest, Commas) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = ",,,,,,,,,,,,,,,,,,,,,,,,,,;";
+ EXPECT_THAT(rfc822_tokenizer.TokenizeAll(text), IsOkAndHolds(IsEmpty()));
+}
+
+TEST(Rfc822TokenizerTest, ResetToTokenStartingAfter) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "a@g.c,b@g.c";
+ auto token_iterator = rfc822_tokenizer.Tokenize(text).ValueOrDie();
+ ASSERT_TRUE(token_iterator->Advance());
+ ASSERT_TRUE(token_iterator->Advance());
+
+ ASSERT_TRUE(token_iterator->ResetToTokenStartingAfter(-1));
+ EXPECT_THAT(token_iterator->GetTokens().at(0).text, "a@g.c");
+
+ ASSERT_TRUE(token_iterator->ResetToTokenStartingAfter(5));
+ EXPECT_THAT(token_iterator->GetTokens().at(0).text, "b@g.c");
+
+ ASSERT_FALSE(token_iterator->ResetToTokenStartingAfter(6));
+}
+
+TEST(Rfc822TokenizerTest, ResetToTokenEndingBefore) {
+ Rfc822Tokenizer rfc822_tokenizer = Rfc822Tokenizer();
+ std::string text = "a@g.c,b@g.c";
+ auto token_iterator = rfc822_tokenizer.Tokenize(text).ValueOrDie();
+ token_iterator->Advance();
+
+ ASSERT_TRUE(token_iterator->ResetToTokenEndingBefore(5));
+ EXPECT_THAT(token_iterator->GetTokens().at(0).text, "a@g.c");
+
+ ASSERT_FALSE(token_iterator->ResetToTokenEndingBefore(4));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
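A minimal usage sketch of the iterator API the tests above exercise, assuming only the declarations visible in this change; the WalkRfc822Tokens helper is illustrative and not part of the patch.

#include <memory>
#include <string_view>

#include "icing/tokenization/rfc822-tokenizer.h"
#include "icing/tokenization/token.h"

namespace icing {
namespace lib {

// Sketch: walk every token the RFC822 tokenizer produces for `text`.
inline void WalkRfc822Tokens(std::string_view text) {
  Rfc822Tokenizer tokenizer;
  // Mirrors the tests above, which call ValueOrDie() on the returned StatusOr.
  auto token_iterator = tokenizer.Tokenize(text).ValueOrDie();
  while (token_iterator->Advance()) {
    // Each Advance() can surface several tokens at once (e.g. an RFC822_TOKEN
    // plus its RFC822_ADDRESS_COMPONENT_* breakdown), which is why GetTokens()
    // returns a vector rather than a single Token.
    for (const Token& token : token_iterator->GetTokens()) {
      (void)token;  // Inspect token.type and token.text here.
    }
  }
}

}  // namespace lib
}  // namespace icing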
diff --git a/icing/tokenization/simple/space-language-segmenter-factory.cc b/icing/tokenization/simple/space-language-segmenter-factory.cc
deleted file mode 100644
index 1cca603..0000000
--- a/icing/tokenization/simple/space-language-segmenter-factory.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/tokenization/language-segmenter-factory.h"
-#include "icing/tokenization/simple/space-language-segmenter.h"
-#include "icing/util/logging.h"
-
-namespace icing {
-namespace lib {
-
-namespace language_segmenter_factory {
-
-// Creates a language segmenter with the given locale.
-//
-// Returns:
-// A LanguageSegmenter on success
-// INVALID_ARGUMENT if locale string is invalid
-//
-// TODO(samzheng): Figure out if we want to verify locale strings and notify
-// users. Right now illegal locale strings will be ignored by ICU. ICU
-// components will be created with its default locale.
-libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter>> Create(
- SegmenterOptions) {
- return std::make_unique<SpaceLanguageSegmenter>();
-}
-
-} // namespace language_segmenter_factory
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/tokenization/simple/space-language-segmenter.cc b/icing/tokenization/simple/space-language-segmenter.cc
deleted file mode 100644
index 7e301ec..0000000
--- a/icing/tokenization/simple/space-language-segmenter.cc
+++ /dev/null
@@ -1,205 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/tokenization/simple/space-language-segmenter.h"
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <string_view>
-#include <utility>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/status.h"
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/util/status-macros.h"
-
-namespace icing {
-namespace lib {
-
-namespace {
-constexpr char kASCIISpace = ' ';
-} // namespace
-
-class SpaceLanguageSegmenterIterator : public LanguageSegmenter::Iterator {
- public:
- SpaceLanguageSegmenterIterator(std::string_view text)
- : text_(text), term_start_index_(0), term_end_index_exclusive_(0) {}
-
- // Advances to the next term. Returns false if it has reached the end.
- bool Advance() override {
- if (term_end_index_exclusive_ >= text_.size() ||
- term_start_index_ >= text_.size()) {
- // Reached the end
- return false;
- }
-
- // Next term starts where we left off.
- term_start_index_ = term_end_index_exclusive_;
-
- // We know a term is at least one length, so we can +1 first.
- term_end_index_exclusive_++;
-
- // We alternate terms between space and non-space. Figure out what type of
- // term we're currently on so we know how to stop.
- bool is_space = text_[term_start_index_] == kASCIISpace;
-
- while (term_end_index_exclusive_ < text_.size()) {
- bool end_is_space = text_[term_end_index_exclusive_] == kASCIISpace;
- if (is_space != end_is_space) {
- // We finally see a different type of character, reached the end.
- break;
- }
- // We're still seeing the same types of characters (saw a space and
- // still seeing spaces, or saw a non-space and still seeing non-spaces).
- // Haven't reached the next term yet, keep advancing.
- term_end_index_exclusive_++;
- }
-
- return true;
- }
-
- // Returns the current term. It can be called only when Advance() returns
- // true.
- std::string_view GetTerm() const override {
- if (text_[term_start_index_] == kASCIISpace) {
- // Rule: multiple continuous whitespaces are treated as one.
- return std::string_view(&text_[term_start_index_], 1);
- }
- return text_.substr(term_start_index_,
- term_end_index_exclusive_ - term_start_index_);
- }
-
- libtextclassifier3::StatusOr<int32_t> ResetToTermStartingAfter(
- int32_t offset) override {
- if (offset < 0) {
- // Start over from the beginning to find the first term.
- term_start_index_ = 0;
- term_end_index_exclusive_ = 0;
- } else {
- // Offset points to a term right now. Advance to get past the current
- // term.
- term_end_index_exclusive_ = offset;
- if (!Advance()) {
- return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
- "No term found in '%s' that starts after offset %d",
- std::string(text_).c_str(), offset));
- }
- }
-
- // Advance again so we can point to the next term.
- if (!Advance()) {
- return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
- "No term found in '%s' that starts after offset %d",
- std::string(text_).c_str(), offset));
- }
-
- return term_start_index_;
- }
-
- libtextclassifier3::StatusOr<int32_t> ResetToTermEndingBefore(
- int32_t offset) override {
- if (offset <= 0 || offset > text_.size()) {
- return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
- "No term found in '%s' that ends before offset %d",
- std::string(text_).c_str(), offset));
- }
-
- if (offset == text_.size()) {
- // Special-case if the offset is the text length, this is the last term in
- // the text, which is also considered to be "ending before" the offset.
- term_end_index_exclusive_ = offset;
- ICING_ASSIGN_OR_RETURN(term_start_index_, GetTermStartingBefore(offset));
- return term_start_index_;
- }
-
- // Otherwise, this is just the end of the previous term and we still need to
- // find the start of the previous term.
- ICING_ASSIGN_OR_RETURN(term_end_index_exclusive_,
- GetTermStartingBefore(offset));
-
- if (term_end_index_exclusive_ == 0) {
- // The current term starts at the beginning of the underlying text_.
- // There is no term before this.
- return absl_ports::NotFoundError(IcingStringUtil::StringPrintf(
- "No term found in '%s' that ends before offset %d",
- std::string(text_).c_str(), offset));
- }
-
- // Reset ourselves to find the term before the end.
- ICING_ASSIGN_OR_RETURN(
- term_start_index_,
- GetTermStartingBefore(term_end_index_exclusive_ - 1));
- return term_start_index_;
- }
-
- libtextclassifier3::StatusOr<int32_t> ResetToStart() override {
- term_start_index_ = 0;
- term_end_index_exclusive_ = 0;
- if (!Advance()) {
- return absl_ports::NotFoundError("");
- }
- return term_start_index_;
- }
-
- private:
- // Return the start offset of the term starting right before the given offset.
- libtextclassifier3::StatusOr<int32_t> GetTermStartingBefore(int32_t offset) {
- bool is_space = text_[offset] == kASCIISpace;
-
- // Special-case that if offset was the text length, then we're already at
- // the "end" of our current term.
- if (offset == text_.size()) {
- is_space = text_[--offset] == kASCIISpace;
- }
-
- // While it's the same type of character (space vs non-space), we're in the
- // same term. So keep iterating backwards until we see a change.
- while (offset >= 0 && (text_[offset] == kASCIISpace) == is_space) {
- --offset;
- }
-
- // +1 is because offset was off-by-one to exit the while-loop.
- return ++offset;
- }
-
- // Text to be segmented
- std::string_view text_;
-
- // The start and end indices are used to track the positions of current
- // term.
- int term_start_index_;
- int term_end_index_exclusive_;
-};
-
-libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
-SpaceLanguageSegmenter::Segment(const std::string_view text) const {
- return std::make_unique<SpaceLanguageSegmenterIterator>(text);
-}
-
-libtextclassifier3::StatusOr<std::vector<std::string_view>>
-SpaceLanguageSegmenter::GetAllTerms(const std::string_view text) const {
- ICING_ASSIGN_OR_RETURN(std::unique_ptr<LanguageSegmenter::Iterator> iterator,
- Segment(text));
- std::vector<std::string_view> terms;
- while (iterator->Advance()) {
- terms.push_back(iterator->GetTerm());
- }
- return terms;
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/tokenization/simple/space-language-segmenter.h b/icing/tokenization/simple/space-language-segmenter.h
deleted file mode 100644
index de0a6d3..0000000
--- a/icing/tokenization/simple/space-language-segmenter.h
+++ /dev/null
@@ -1,58 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
-#define ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
-
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <string_view>
-#include <vector>
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/tokenization/language-segmenter.h"
-
-namespace icing {
-namespace lib {
-
-// Simple segmenter that splits on spaces, regardless of language. Continuous
-// whitespaces will be returned as a single whitespace character.
-class SpaceLanguageSegmenter : public LanguageSegmenter {
- public:
- SpaceLanguageSegmenter() = default;
- SpaceLanguageSegmenter(const SpaceLanguageSegmenter&) = delete;
- SpaceLanguageSegmenter& operator=(const SpaceLanguageSegmenter&) = delete;
-
- // Segmentation is based purely on whitespace; does not take into account the
- // language of the text.
- //
- // Returns:
- // An iterator of terms on success
- libtextclassifier3::StatusOr<std::unique_ptr<LanguageSegmenter::Iterator>>
- Segment(std::string_view text) const override;
-
- // Does not take into account the language of the text.
- //
- // Returns:
- // A list of terms on success
- // INTERNAL_ERROR if any error occurs
- libtextclassifier3::StatusOr<std::vector<std::string_view>> GetAllTerms(
- std::string_view text) const override;
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_TOKENIZATION_SIMPLE_SPACE_LANGUAGE_SEGMENTER_H_
diff --git a/icing/tokenization/simple/space-language-segmenter_test.cc b/icing/tokenization/simple/space-language-segmenter_test.cc
deleted file mode 100644
index 8ed38b2..0000000
--- a/icing/tokenization/simple/space-language-segmenter_test.cc
+++ /dev/null
@@ -1,114 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/absl_ports/str_cat.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/tokenization/language-segmenter-factory.h"
-#include "icing/tokenization/language-segmenter.h"
-
-namespace icing {
-namespace lib {
-namespace {
-
-using ::testing::ElementsAre;
-using ::testing::Eq;
-using ::testing::IsEmpty;
-
-TEST(SpaceLanguageSegmenterTest, EmptyText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
- EXPECT_THAT(language_segmenter->GetAllTerms(""), IsOkAndHolds(IsEmpty()));
-}
-
-TEST(SpaceLanguageSegmenterTest, SimpleText) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
- EXPECT_THAT(language_segmenter->GetAllTerms("Hello World"),
- IsOkAndHolds(ElementsAre("Hello", " ", "World")));
-}
-
-TEST(SpaceLanguageSegmenterTest, Punctuation) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
-
- EXPECT_THAT(language_segmenter->GetAllTerms("Hello, World!!!"),
- IsOkAndHolds(ElementsAre("Hello,", " ", "World!!!")));
- EXPECT_THAT(language_segmenter->GetAllTerms("Open-source project"),
- IsOkAndHolds(ElementsAre("Open-source", " ", "project")));
- EXPECT_THAT(language_segmenter->GetAllTerms("100%"),
- IsOkAndHolds(ElementsAre("100%")));
- EXPECT_THAT(language_segmenter->GetAllTerms("(A&B)"),
- IsOkAndHolds(ElementsAre("(A&B)")));
-}
-
-TEST(SpaceLanguageSegmenterTest, Alphanumeric) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
-
- // Alphanumeric terms are allowed
- EXPECT_THAT(language_segmenter->GetAllTerms("Se7en A4 3a"),
- IsOkAndHolds(ElementsAre("Se7en", " ", "A4", " ", "3a")));
-}
-
-TEST(SpaceLanguageSegmenterTest, Number) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
-
- // Alphanumeric terms are allowed
- EXPECT_THAT(
- language_segmenter->GetAllTerms("3.141592653589793238462643383279"),
- IsOkAndHolds(ElementsAre("3.141592653589793238462643383279")));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("3,456.789"),
- IsOkAndHolds(ElementsAre("3,456.789")));
-
- EXPECT_THAT(language_segmenter->GetAllTerms("-123"),
- IsOkAndHolds(ElementsAre("-123")));
-}
-
-TEST(SpaceLanguageSegmenterTest, ContinuousWhitespaces) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
-
- // Multiple continuous whitespaces are treated as one.
- const int kNumSeparators = 256;
- const std::string text_with_spaces =
- absl_ports::StrCat("Hello", std::string(kNumSeparators, ' '), "World");
- EXPECT_THAT(language_segmenter->GetAllTerms(text_with_spaces),
- IsOkAndHolds(ElementsAre("Hello", " ", "World")));
-}
-
-TEST(SpaceLanguageSegmenterTest, NotCopyStrings) {
- ICING_ASSERT_OK_AND_ASSIGN(auto language_segmenter,
- language_segmenter_factory::Create());
- // Validates that the input strings are not copied
- const std::string text = "Hello World";
- const char* word1_address = text.c_str();
- const char* word2_address = text.c_str() + 6;
- ICING_ASSERT_OK_AND_ASSIGN(std::vector<std::string_view> terms,
- language_segmenter->GetAllTerms(text));
- ASSERT_THAT(terms, ElementsAre("Hello", " ", "World"));
- const char* word1_result_address = terms.at(0).data();
- const char* word2_result_address = terms.at(2).data();
-
- // The underlying char* should be the same
- EXPECT_THAT(word1_address, Eq(word1_result_address));
- EXPECT_THAT(word2_address, Eq(word2_result_address));
-}
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/tokenization/token.h b/icing/tokenization/token.h
index 0bb3aaf..05d6fe4 100644
--- a/icing/tokenization/token.h
+++ b/icing/tokenization/token.h
@@ -20,16 +20,25 @@
namespace icing {
namespace lib {
-// TODO(samzheng) Add group id support if needed. Right now in raw query we
-// don't need group ids since all our query operators (OR, Exclusion, Property
-// Restriction) only apply to the token right after them (vs. applying to
-// multiple tokens after them). The "groups" of tokens can be easily recognized.
struct Token {
- enum Type {
+ enum class Type {
// Common types
REGULAR, // A token without special meanings, the value of it will be
// indexed or searched directly
+ VERBATIM, // A token that should be indexed and searched without any
+ // modifications to the raw text
+
+ // An RFC822 section with the content in RFC822_TOKEN tokenizes as follows:
+ RFC822_NAME, // "User", "Johnsson"
+ RFC822_COMMENT, // "A", "comment", "here"
+ RFC822_LOCAL_ADDRESS, // "user.name"
+ RFC822_HOST_ADDRESS, // "domain.name.com"
+ RFC822_ADDRESS, // "user.name@domain.name.com"
+ RFC822_ADDRESS_COMPONENT_LOCAL, // "user", "name"
+ RFC822_ADDRESS_COMPONENT_HOST, // "domain", "name", "com"
+ RFC822_TOKEN, // "User Johnsson (A comment) <user.name@domain.name.com>"
+
// Types only used in raw query
QUERY_OR, // Indicates OR logic between its left and right tokens
QUERY_EXCLUSION, // Indicates exclusion operation on next token
@@ -37,6 +46,20 @@ struct Token {
QUERY_LEFT_PARENTHESES, // Left parentheses
QUERY_RIGHT_PARENTHESES, // Right parentheses
+ // Types used in URL tokenization
+ URL_SCHEME, // "http", "https", "ftp", "content"
+ URL_USERNAME,
+ URL_PASSWORD,
+ URL_HOST_COMMON_PART, // Hosts are split into two types, common and
+ // significant. Common parts are e.g. "www", "ww2", ".com".
+ URL_HOST_SIGNIFICANT_PART,
+ URL_PORT,
+ URL_PATH_PART, // Tokenized path, e.g. /abc-d/e.fg -> [abc-d], [e.fg]
+ URL_QUERY, // After ?, before #, e.g. "param1=value-1&param2=value-2"
+ URL_REF, // Anything after #; may be arbitrary text
+ URL_SUFFIX,
+ URL_SUFFIX_INNERMOST,
+
// Indicates errors
INVALID,
};
@@ -46,10 +69,10 @@ struct Token {
: type(type_in), text(text_in) {}
// The type of token
- const Type type;
+ Type type;
// The content of token
- const std::string_view text;
+ std::string_view text;
};
} // namespace lib
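Since Token::Type is now a scoped enum and Token's members are no longer const, tokens are assignable and call sites name the types explicitly. A hedged sketch of a consumer follows; the IsRfc822Address helper is illustrative, not part of this change.

#include "icing/tokenization/token.h"

namespace icing {
namespace lib {

// Sketch: true for the address-level RFC822 token types defined above.
inline bool IsRfc822Address(const Token& token) {
  switch (token.type) {
    case Token::Type::RFC822_ADDRESS:
    case Token::Type::RFC822_LOCAL_ADDRESS:
    case Token::Type::RFC822_HOST_ADDRESS:
      return true;
    default:
      return false;
  }
}

}  // namespace lib
}  // namespace icing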
diff --git a/icing/tokenization/tokenizer-factory.cc b/icing/tokenization/tokenizer-factory.cc
index 9ebbce5..d120ac8 100644
--- a/icing/tokenization/tokenizer-factory.cc
+++ b/icing/tokenization/tokenizer-factory.cc
@@ -22,7 +22,14 @@
#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/plain-tokenizer.h"
#include "icing/tokenization/raw-query-tokenizer.h"
+#include "icing/tokenization/rfc822-tokenizer.h"
#include "icing/tokenization/tokenizer.h"
+
+#ifdef ENABLE_URL_TOKENIZER
+#include "icing/tokenization/url-tokenizer.h"
+#endif // ENABLE_URL_TOKENIZER
+
+#include "icing/tokenization/verbatim-tokenizer.h"
#include "icing/util/status-macros.h"
namespace icing {
@@ -31,14 +38,24 @@ namespace lib {
namespace tokenizer_factory {
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer>>
-CreateIndexingTokenizer(IndexingConfig::TokenizerType::Code type,
+CreateIndexingTokenizer(StringIndexingConfig::TokenizerType::Code type,
const LanguageSegmenter* lang_segmenter) {
ICING_RETURN_ERROR_IF_NULL(lang_segmenter);
switch (type) {
- case IndexingConfig::TokenizerType::PLAIN:
+ case StringIndexingConfig::TokenizerType::PLAIN:
return std::make_unique<PlainTokenizer>(lang_segmenter);
- case IndexingConfig::TokenizerType::NONE:
+ case StringIndexingConfig::TokenizerType::VERBATIM:
+ return std::make_unique<VerbatimTokenizer>();
+ case StringIndexingConfig::TokenizerType::RFC822:
+ return std::make_unique<Rfc822Tokenizer>();
+// TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export
+// to Android.
+#ifdef ENABLE_URL_TOKENIZER
+ case StringIndexingConfig::TokenizerType::URL:
+ return std::make_unique<UrlTokenizer>();
+#endif // ENABLE_URL_TOKENIZER
+ case StringIndexingConfig::TokenizerType::NONE:
[[fallthrough]];
default:
// This should never happen.
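With the new cases wired in, selecting the RFC822 tokenizer through the factory looks roughly like this: a sketch that assumes a lang_segmenter already created via language_segmenter_factory, using the ICING_ASSIGN_OR_RETURN macro this file already relies on.

// Sketch only: pick the RFC822 tokenizer for an indexed string property.
ICING_ASSIGN_OR_RETURN(
    std::unique_ptr<Tokenizer> rfc822_tokenizer,
    tokenizer_factory::CreateIndexingTokenizer(
        StringIndexingConfig::TokenizerType::RFC822, lang_segmenter.get()));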
diff --git a/icing/tokenization/tokenizer-factory.h b/icing/tokenization/tokenizer-factory.h
index f81fd96..8b9226d 100644
--- a/icing/tokenization/tokenizer-factory.h
+++ b/icing/tokenization/tokenizer-factory.h
@@ -37,7 +37,7 @@ namespace tokenizer_factory {
// FAILED_PRECONDITION on any null pointer input
// INVALID_ARGUMENT if tokenizer type is invalid
libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer>>
-CreateIndexingTokenizer(IndexingConfig::TokenizerType::Code type,
+CreateIndexingTokenizer(StringIndexingConfig::TokenizerType::Code type,
const LanguageSegmenter* lang_segmenter);
// All the supported query tokenizer types
diff --git a/icing/tokenization/tokenizer.h b/icing/tokenization/tokenizer.h
index 38c4745..fb7613f 100644
--- a/icing/tokenization/tokenizer.h
+++ b/icing/tokenization/tokenizer.h
@@ -18,9 +18,12 @@
#include <cstdint>
#include <memory>
#include <string_view>
+#include <vector>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/absl_ports/canonical_errors.h"
#include "icing/tokenization/token.h"
+#include "icing/util/character-iterator.h"
namespace icing {
namespace lib {
@@ -38,14 +41,6 @@ class Tokenizer {
public:
virtual ~Tokenizer() = default;
- enum Type {
- // Index tokenizers
- PLAIN, // Used to tokenize plain text input
-
- // Query tokenizers
- RAW_QUERY, // Used to tokenize raw queries
- };
-
// An iterator helping to get tokens.
// Example usage:
//
@@ -60,31 +55,48 @@ class Tokenizer {
// Advances to the next token. Returns false if it has reached the end.
virtual bool Advance() = 0;
- // Returns the current token. It can be called only when Advance() returns
- // true, otherwise an invalid token could be returned.
- virtual Token GetToken() const = 0;
+ // Returns the current token, possibly along with compound tokens. It should
+ // be called only when Advance() returns true; otherwise an empty Token
+ // vector may be returned.
+ virtual std::vector<Token> GetTokens() const = 0;
+
+ virtual libtextclassifier3::StatusOr<CharacterIterator>
+ CalculateTokenStart() {
+ return absl_ports::UnimplementedError(
+ "CalculateTokenStart is not implemented!");
+ }
+
+ virtual libtextclassifier3::StatusOr<CharacterIterator>
+ CalculateTokenEndExclusive() {
+ return absl_ports::UnimplementedError(
+ "CalculateTokenEndExclusive is not implemented!");
+ }
// Sets the tokenizer to point at the first token that *starts* *after*
// offset. Returns false if there are no valid tokens starting after
// offset.
// Ex.
// auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
- // iterator.ResetToTokenAfter(4);
+ // iterator.ResetToTokenStartingAfter(4);
// // The first full token starting after position 4 (the 'b' in "bar") is
// // "baz".
// PrintToken(iterator.GetToken()); // prints "baz"
- virtual bool ResetToTokenAfter(int32_t offset) { return false; }
+ virtual bool ResetToTokenStartingAfter(int32_t utf32_offset) {
+ return false;
+ }
// Sets the tokenizer to point at the first token that *ends* *before*
// offset. Returns false if there are no valid tokens ending
// before offset.
// Ex.
// auto iterator = tokenizer.Tokenize("foo bar baz").ValueOrDie();
- // iterator.ResetToTokenBefore(4);
+ // iterator.ResetToTokenEndingBefore(4);
// // The first full token ending before position 4 (the 'b' in "bar") is
// // "foo".
// PrintToken(iterator.GetToken()); // prints "foo"
- virtual bool ResetToTokenBefore(int32_t offset) { return false; }
+ virtual bool ResetToTokenEndingBefore(int32_t utf32_offset) {
+ return false;
+ }
virtual bool ResetToStart() { return false; }
};
diff --git a/icing/tokenization/verbatim-tokenizer.cc b/icing/tokenization/verbatim-tokenizer.cc
new file mode 100644
index 0000000..9ca611d
--- /dev/null
+++ b/icing/tokenization/verbatim-tokenizer.cc
@@ -0,0 +1,144 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/tokenization/verbatim-tokenizer.h"
+
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/util/character-iterator.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+class VerbatimTokenIterator : public Tokenizer::Iterator {
+ public:
+ explicit VerbatimTokenIterator(std::string_view text)
+ : term_(std::move(text)) {}
+
+ bool Advance() override {
+ if (term_.empty() || has_advanced_to_end_) {
+ return false;
+ }
+
+ has_advanced_to_end_ = true;
+ return true;
+ }
+
+ std::vector<Token> GetTokens() const override {
+ std::vector<Token> result;
+
+ if (!term_.empty() && has_advanced_to_end_) {
+ result.push_back(Token(Token::Type::VERBATIM, term_));
+ }
+
+ return result;
+ }
+
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenStart()
+ override {
+ if (term_.empty()) {
+ return absl_ports::AbortedError(
+ "Could not calculate start of empty token.");
+ }
+
+ return CharacterIterator(term_, 0, 0, 0);
+ }
+
+ libtextclassifier3::StatusOr<CharacterIterator> CalculateTokenEndExclusive()
+ override {
+ if (term_.empty()) {
+ return absl_ports::AbortedError(
+ "Could not calculate end of empty token.");
+ }
+
+ if (token_end_iterator_.utf8_index() >= 0) {
+ return token_end_iterator_;
+ }
+
+ bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length());
+ if (moved_to_token_end) {
+ return token_end_iterator_;
+ } else {
+ return absl_ports::AbortedError("Could not move to end of token.");
+ }
+ }
+
+ bool ResetToTokenStartingAfter(int32_t utf32_offset) override {
+    // We can only reset to the sole verbatim token, so the offset must be
+    // negative for that token to be considered as starting after it.
+ if (utf32_offset < 0) {
+ // Because we are now at the sole verbatim token, we should ensure we can
+ // no longer advance past it.
+ has_advanced_to_end_ = true;
+ return true;
+ }
+ return false;
+ }
+
+ bool ResetToTokenEndingBefore(int32_t utf32_offset) override {
+ // We can only reset to the sole verbatim token, so we must have an offset
+ // after the end of the token for the reset to be valid. This means the
+ // provided utf-32 offset must be equal to or greater than the utf-32 length
+ // of the token.
+ if (token_end_iterator_.utf8_index() < 0) {
+ // Moves one index past the end of the term.
+ bool moved_to_token_end = token_end_iterator_.MoveToUtf8(term_.length());
+ if (!moved_to_token_end) {
+ // We're unable to reset as we failed to move to the end of the term.
+ return false;
+ }
+ }
+
+ if (utf32_offset >= token_end_iterator_.utf32_index()) {
+ // Because we are now at the sole verbatim token, we should ensure we can
+ // no longer advance past it.
+ has_advanced_to_end_ = true;
+ return true;
+ }
+ return false;
+ }
+
+  bool ResetToStart() override {
+    // Position the iterator on the sole verbatim token so that GetTokens()
+    // is immediately valid, mirroring the other Reset methods.
+    has_advanced_to_end_ = true;
+    return true;
+  }
+
+ private:
+ std::string_view term_;
+ CharacterIterator token_end_iterator_ = CharacterIterator(term_, -1, -1, -1);
+  // Whether we have already advanced to the sole verbatim token.
+ bool has_advanced_to_end_ = false;
+};
+
+libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>>
+VerbatimTokenizer::Tokenize(std::string_view text) const {
+ return std::make_unique<VerbatimTokenIterator>(text);
+}
+
+libtextclassifier3::StatusOr<std::vector<Token>> VerbatimTokenizer::TokenizeAll(
+ std::string_view text) const {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> iterator,
+ Tokenize(text));
+ std::vector<Token> tokens;
+ while (iterator->Advance()) {
+ std::vector<Token> batch = iterator->GetTokens();
+ tokens.insert(tokens.end(), batch.begin(), batch.end());
+ }
+ return tokens;
+}
+
+} // namespace lib
+} // namespace icing
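// A short usage sketch: the verbatim tokenizer emits at most one token
// spanning the entire input. Construction through tokenizer_factory is shown
// in verbatim-tokenizer_test.cc below; `tokenizer` here is assumed to be a
// VerbatimTokenizer.
//
//   ICING_ASSIGN_OR_RETURN(std::vector<Token> tokens,
//                          tokenizer.TokenizeAll("Hello, world!"));
//   // tokens holds a single Token(Token::Type::VERBATIM, "Hello, world!").
//   ICING_ASSIGN_OR_RETURN(tokens, tokenizer.TokenizeAll(""));
//   // tokens is empty: Advance() returns false for empty input.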
diff --git a/icing/tokenization/verbatim-tokenizer.h b/icing/tokenization/verbatim-tokenizer.h
new file mode 100644
index 0000000..8404cf1
--- /dev/null
+++ b/icing/tokenization/verbatim-tokenizer.h
@@ -0,0 +1,41 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TOKENIZATION_VERBATIM_H_
+#define ICING_TOKENIZATION_VERBATIM_H_
+
+#include <memory>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/tokenization/tokenizer.h"
+
+namespace icing {
+namespace lib {
+
+// Provides verbatim tokenization on input text
+class VerbatimTokenizer : public Tokenizer {
+ public:
+ libtextclassifier3::StatusOr<std::unique_ptr<Tokenizer::Iterator>> Tokenize(
+ std::string_view text) const override;
+
+ libtextclassifier3::StatusOr<std::vector<Token>> TokenizeAll(
+ std::string_view text) const override;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TOKENIZATION_VERBATIM_H_
diff --git a/icing/tokenization/verbatim-tokenizer_test.cc b/icing/tokenization/verbatim-tokenizer_test.cc
new file mode 100644
index 0000000..bae69ff
--- /dev/null
+++ b/icing/tokenization/verbatim-tokenizer_test.cc
@@ -0,0 +1,210 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string_view>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/portable/platform.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/jni-test-helpers.h"
+#include "icing/testing/test-data.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/token.h"
+#include "icing/tokenization/tokenizer-factory.h"
+#include "icing/util/character-iterator.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+namespace {
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+
+class VerbatimTokenizerTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ jni_cache_ = GetTestJniCache();
+ language_segmenter_factory::SegmenterOptions options(ULOC_US,
+ jni_cache_.get());
+ ICING_ASSERT_OK_AND_ASSIGN(
+ language_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+ }
+
+ std::unique_ptr<const JniCache> jni_cache_;
+ std::unique_ptr<LanguageSegmenter> language_segmenter_;
+};
+
+TEST_F(VerbatimTokenizerTest, Empty) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ EXPECT_THAT(verbatim_tokenizer->TokenizeAll(""), IsOkAndHolds(IsEmpty()));
+}
+
+TEST_F(VerbatimTokenizerTest, Simple) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ EXPECT_THAT(
+ verbatim_tokenizer->TokenizeAll("foo bar"),
+ IsOkAndHolds(ElementsAre(EqualsToken(Token::Type::VERBATIM, "foo bar"))));
+}
+
+TEST_F(VerbatimTokenizerTest, Punctuation) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ EXPECT_THAT(verbatim_tokenizer->TokenizeAll("Hello, world!"),
+ IsOkAndHolds(ElementsAre(
+ EqualsToken(Token::Type::VERBATIM, "Hello, world!"))));
+}
+
+TEST_F(VerbatimTokenizerTest, NoTokensBeforeAdvancing) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+  // We should get no tokens if we call GetTokens() before advancing.
+ EXPECT_THAT(token_iterator->GetTokens(), IsEmpty());
+}
+
+TEST_F(VerbatimTokenizerTest, ResetToTokenEndingBefore) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+  // Reset to the verbatim token. We provide an offset of 13 as it is larger
+  // than the final index (12) of the verbatim token.
+ EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
+ EXPECT_THAT(token_iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::VERBATIM, "Hello, world!")));
+
+  // Ensure our cached character iterator properly maintains the end of the
+ // verbatim token.
+ EXPECT_TRUE(token_iterator->ResetToTokenEndingBefore(13));
+ EXPECT_THAT(token_iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::VERBATIM, "Hello, world!")));
+
+ // We should not be able to reset with an offset before or within
+ // the verbatim token's utf-32 length.
+ EXPECT_FALSE(token_iterator->ResetToTokenEndingBefore(0));
+ EXPECT_FALSE(token_iterator->ResetToTokenEndingBefore(12));
+}
+
+TEST_F(VerbatimTokenizerTest, ResetToTokenStartingAfter) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+ // Get token without resetting
+ EXPECT_TRUE(token_iterator->Advance());
+ EXPECT_THAT(token_iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::VERBATIM, "Hello, world!")));
+
+ // We expect a sole verbatim token, so it's not possible to reset after the
+ // start of the token.
+ EXPECT_FALSE(token_iterator->ResetToTokenStartingAfter(1));
+
+ // We expect to be reset to the sole verbatim token when the offset is
+ // negative.
+ EXPECT_TRUE(token_iterator->ResetToTokenStartingAfter(-1));
+ EXPECT_THAT(token_iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::VERBATIM, "Hello, world!")));
+}
+
+TEST_F(VerbatimTokenizerTest, ResetToStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+ // Get token without resetting
+ EXPECT_TRUE(token_iterator->Advance());
+ EXPECT_THAT(token_iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::VERBATIM, "Hello, world!")));
+
+ // Retrieve token again after resetting to start
+ EXPECT_TRUE(token_iterator->ResetToStart());
+ EXPECT_THAT(token_iterator->GetTokens(),
+ ElementsAre(EqualsToken(Token::Type::VERBATIM, "Hello, world!")));
+}
+
+TEST_F(VerbatimTokenizerTest, CalculateTokenStart) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+ ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator start_character_iterator,
+ token_iterator->CalculateTokenStart());
+
+ // We should retrieve the character 'H', the first character of the token.
+ EXPECT_THAT(start_character_iterator.GetCurrentChar(), Eq('H'));
+}
+
+TEST_F(VerbatimTokenizerTest, CalculateTokenEnd) {
+ ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Tokenizer> verbatim_tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ StringIndexingConfig::TokenizerType::VERBATIM,
+ language_segmenter_.get()));
+
+ constexpr std::string_view kText = "Hello, world!";
+ auto token_iterator = verbatim_tokenizer->Tokenize(kText).ValueOrDie();
+
+ ICING_ASSERT_OK_AND_ASSIGN(CharacterIterator end_character_iterator,
+ token_iterator->CalculateTokenEndExclusive());
+
+  // We should retrieve the null character, as the returned character iterator
+  // will be set one past the end of the token.
+ EXPECT_THAT(end_character_iterator.GetCurrentChar(), Eq('\0'));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/tools/document-store-dump.cc b/icing/tools/document-store-dump.cc
deleted file mode 100644
index 45c9bf5..0000000
--- a/icing/tools/document-store-dump.cc
+++ /dev/null
@@ -1,119 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "icing/tools/document-store-dump.h"
-
-#include <cinttypes>
-
-#include "icing/absl_ports/str_cat.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/util/logging.h"
-
-namespace icing {
-namespace lib {
-namespace {
-
-void AppendDocumentProto(DocId document_id, const Document& doc,
- std::string* output) {
- absl_ports::StrAppend(
- output, IcingStringUtil::StringPrintf(
- "Document {\n document_id: %d\n corpus_id: %d\n uri: "
- "'%s'\n score: %d\n created_timestamp_ms: %" PRIu64 "\n",
- static_cast<int>(document_id), doc.corpus_id(),
- doc.uri().c_str(), static_cast<int>(doc.score()),
- static_cast<int64_t>(doc.created_timestamp_ms())));
- for (const auto& section : doc.sections()) {
- absl_ports::StrAppend(
- output, IcingStringUtil::StringPrintf(
- " section {\n id: %d\n indexed_length: "
- "%d\n content: '%s'\n snippet: '%s'\n",
- static_cast<int>(section.id()),
- static_cast<int>(section.indexed_length()),
- section.content().c_str(), section.snippet().c_str()));
- for (int64_t extracted_number : section.extracted_numbers()) {
- absl_ports::StrAppend(output, IcingStringUtil::StringPrintf(
- " extracted_numbers: %" PRId64 "\n",
- extracted_number));
- }
- for (const std::string& annotation_token : section.annotation_tokens()) {
- absl_ports::StrAppend(
- output, IcingStringUtil::StringPrintf(" annotation_tokens: '%s'\n",
- annotation_token.c_str()));
- }
- std::string indexed = (section.config().indexed()) ? "true" : "false";
- std::string index_prefixes =
- (section.config().index_prefixes()) ? "true" : "false";
- absl_ports::StrAppend(
- output,
- IcingStringUtil::StringPrintf(
- " config {\n name: '%s'\n indexed: %s\n "
- "tokenizer: %d\n weight: %d\n index_prefixes: %s\n "
- "subsection_separator: '%s'\n",
- section.config().name().c_str(), indexed.c_str(),
- section.config().tokenizer(),
- static_cast<int>(section.config().weight()), index_prefixes.c_str(),
- section.config().subsection_separator().c_str()));
- for (const auto& variant_generator :
- section.config().variant_generators()) {
- absl_ports::StrAppend(
- output, IcingStringUtil::StringPrintf(
- " variant_generators: %d\n", variant_generator));
- }
- absl_ports::StrAppend(
- output,
- IcingStringUtil::StringPrintf(
- " common_term_legacy_hit_score: %d\n "
- "rfc822_host_name_term_legacy_hit_score: %d\n "
- "semantic_property: '%s'\n universal_section_id: %d\n "
- "omnibox_section_type: %d\n st_section_type: %d\n }\n }\n",
- section.config().common_term_legacy_hit_score(),
- section.config().rfc822_host_name_term_legacy_hit_score(),
- section.config().semantic_property().c_str(),
- section.config().universal_section_id(),
- section.config().omnibox_section_type(),
- section.config().st_section_type()));
- }
- for (const auto& language : doc.languages()) {
- std::string used_classifier =
- (language.used_classifier()) ? "true" : "false";
- absl_ports::StrAppend(
- output, IcingStringUtil::StringPrintf(
- " languages {\n language: %d\n score: %d\n "
- "used_classifier: %s\n }\n",
- language.language(), static_cast<int>(language.score()),
- used_classifier.c_str()));
- }
- absl_ports::StrAppend(
- output, IcingStringUtil::StringPrintf(
- " ANNOTATIONS PRINTING NOT IMPLEMENTED YET IN ICING-TOOL\n"));
-}
-
-} // namespace
-
-std::string GetDocumentStoreDump(const DocumentStore& document_store) {
- std::string output;
- for (DocId document_id = 0; document_id < document_store.num_documents();
- document_id++) {
- Document doc;
- if (!document_store.ReadDocument(document_id, &doc)) {
- ICING_LOG(FATAL) << "Failed to read document";
- }
-
- AppendDocumentProto(document_id, doc, &output);
- }
- return output;
-}
-
-} // namespace lib
-} // namespace icing
diff --git a/icing/tools/document-store-dump.h b/icing/tools/document-store-dump.h
deleted file mode 100644
index 023b301..0000000
--- a/icing/tools/document-store-dump.h
+++ /dev/null
@@ -1,35 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TOOLS_DOCUMENT_STORE_DUMP_H_
-#define ICING_TOOLS_DOCUMENT_STORE_DUMP_H_
-
-#include <string>
-
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/document-store.h"
-
-namespace icing {
-namespace lib {
-
-// Utility function for dumping the complete document store content.
-// This provides a human-readable representation of the document store, mainly
-// provided for easier understandability for developers.
-// The output of this class should only be available on cmdline-tool-level
-// (with root access), or unit tests. In other words it should not be possible
-// to trigger this on a release key device, for data protection reasons.
-std::string GetDocumentStoreDump(const DocumentStore& document_store);
-
-} // namespace lib
-} // namespace icing
-#endif // ICING_TOOLS_DOCUMENT_STORE_DUMP_H_
diff --git a/icing/tools/icing-tool.cc b/icing/tools/icing-tool.cc
deleted file mode 100644
index 72a11e9..0000000
--- a/icing/tools/icing-tool.cc
+++ /dev/null
@@ -1,306 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-// Copyright 2012 Google Inc. All Rights Reserved.
-// Author: ulas@google.com (Ulas Kirazci)
-//
-// A tool to debug the native index.
-
-#include <getopt.h>
-#include <unistd.h>
-
-#include <string>
-
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/core/string-util.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/doc-property-filter.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/document-store.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/dynamic-trie.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/filesystem.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/mobstore.h"
-#include "java/com/google/android/gmscore/integ/modules/icing/jni/index/native-index-impl.h"
-#include "icing/absl_ports/str_cat.h"
-#include "icing/legacy/core/icing-string-util.h"
-#include "icing/tools/document-store-dump.h"
-#include "icing/util/logging.h"
-
-using std::vector;
-using ::wireless_android_play_playlog::icing::IndexRestorationStats;
-
-namespace icing {
-namespace lib {
-
-// 256KB for debugging.
-const size_t kMaxDocumentSizeForDebugging = 1u << 18;
-// Dump dynamic trie stats and contents.
-void ProcessDynamicTrie(const char* filename) {
- Filesystem filesystem;
- DynamicTrie trie(filename, DynamicTrie::RuntimeOptions(), &filesystem);
- if (!trie.Init()) {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Opening trie %s failed",
- filename);
- return;
- }
-
- std::string out;
- trie.GetDebugInfo(true, &out);
- printf("Stats:\n%s", out.c_str());
-
- std::ostringstream contents;
- vector<std::string> keys;
- trie.DumpTrie(&contents, &keys);
- printf("Contents:\n%s", contents.str().c_str());
-}
-
-NativeIndexImpl* MakeIndex(const char* root_dir) {
- NativeConfig native_config;
- native_config.set_max_document_size(kMaxDocumentSizeForDebugging);
- FlashIndexOptions flash_index_options(
- NativeIndexImpl::GetNativeIndexDir(root_dir));
- NativeIndexImpl* ni =
- new NativeIndexImpl(root_dir, native_config, flash_index_options);
- InitStatus init_status;
- if (!ni->Init(&init_status)) {
- ICING_LOG(FATAL) << "Failed to initialize legacy native index impl";
- }
-
- IndexRestorationStats unused;
- ni->RestoreIndex(IndexRequestSpec::default_instance(), &unused);
- return ni;
-}
-
-void RunQuery(NativeIndexImpl* ni, const std::string& query, int start,
- int num_results) {
- // Pull out corpusids and uris.
- QueryRequestSpec spec;
- spec.set_no_corpus_filter(true);
- spec.set_want_uris(true);
- spec.set_scoring_verbosity_level(1);
- spec.set_prefix_match(true);
-
- QueryResponse response;
- ni->ExecuteQuery(query, spec, 10000, start, num_results, &response);
-
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Query [%s] num results %u", query.c_str(), response.num_results());
-
- for (int i = 0, uri_offset = 0; i < response.num_results(); i++) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "%d: (cid=%u) uri %.*s", i, response.corpus_ids(i),
- response.uri_lengths(i), response.uri_buffer().data() + uri_offset);
- uri_offset += response.uri_lengths(i);
- }
-}
-
-void RunSuggest(NativeIndexImpl* ni, const std::string& prefix,
- int num_results) {
- SuggestionResponse results;
- ni->Suggest(prefix, num_results, vector<CorpusId>(), &results);
-
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Query [%s] num results %zu", prefix.c_str(),
- static_cast<size_t>(results.suggestions_size()));
-
- for (size_t i = 0; i < results.suggestions_size(); i++) {
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Sugg: [%s] display text [%s]", results.suggestions(i).query().c_str(),
- results.suggestions(i).display_text().c_str());
- }
-}
-
-int IcingTool(int argc, char** argv) {
- auto file_storage = CreatePosixFileStorage();
- enum Options {
- OPT_FILENAME,
- OPT_OP,
- OPT_QUERY,
- NUM_OPT,
- };
- static const option kOptions[NUM_OPT + 1] = {
- {"filename", 1, nullptr, 0},
- {"op", 1, nullptr, 0},
- {"query", 1, nullptr, 0},
- {nullptr, 0, nullptr, 0},
- };
- const char* opt_values[NUM_OPT];
- memset(opt_values, 0, sizeof(opt_values));
-
- while (true) {
- int opt_idx = -1;
- int ret = getopt_long(argc, argv, "", kOptions, &opt_idx);
- if (ret != 0) break;
-
- if (opt_idx >= 0 && opt_idx < NUM_OPT) {
- opt_values[opt_idx] = optarg;
- }
- }
-
- if (!opt_values[OPT_OP]) {
- ICING_LOG(ERROR) << "No op specified";
- return -1;
- }
-
- if (!opt_values[OPT_FILENAME]) {
- ICING_LOG(ERROR) << "No filename specified";
- return -1;
- }
- if (!strncmp(
- opt_values[OPT_FILENAME],
- "/data/data/com.google.android.gms/files/AppDataSearch",
- strlen("/data/data/com.google.android.gms/files/AppDataSearch"))) {
- ICING_LOG(ERROR)
- << "Should not read directly from the file in gmscore - "
- "icing-tool also commits writes as side-effects which corrupts "
- "the index on concurrent modification";
- return -1;
- }
-
- const char* op = opt_values[OPT_OP];
- DocumentStore::Options options(file_storage.get(),
- kMaxDocumentSizeForDebugging);
- if (!strcmp(op, "dyntrie")) {
- std::string full_file_path =
- absl_ports::StrCat(opt_values[OPT_FILENAME], "/idx.lexicon");
- ProcessDynamicTrie(full_file_path.c_str());
- } else if (!strcmp(op, "verify")) {
- std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME]));
- ni->CheckVerify();
- } else if (!strcmp(op, "query")) {
- if (opt_values[OPT_QUERY] == nullptr) {
- ICING_LOG(FATAL) << "Opt value is null";
- }
-
- std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME]));
- RunQuery(ni.get(), opt_values[OPT_QUERY], 0, 100);
- } else if (!strcmp(op, "suggest")) {
- if (opt_values[OPT_QUERY] == nullptr) {
- ICING_LOG(FATAL) << "Opt value is null";
- }
-
- std::unique_ptr<NativeIndexImpl> ni(MakeIndex(opt_values[OPT_FILENAME]));
- RunSuggest(ni.get(), opt_values[OPT_QUERY], 100);
- } else if (!strcmp(op, "dump-all-docs")) {
- DocumentStore ds(opt_values[OPT_FILENAME], options);
- if (!ds.Init()) {
- ICING_LOG(FATAL) << "Legacy document store failed to initialize";
- }
-
- printf(
- "------ Document Store Dump Start ------\n"
- "%s\n"
- "------ Document Store Dump End ------\n",
- GetDocumentStoreDump(ds).c_str());
- } else if (!strcmp(op, "dump-uris")) {
- CorpusId corpus_id = kInvalidCorpusId;
- if (opt_values[OPT_QUERY]) {
- // Query is corpus id.
- corpus_id = atoi(opt_values[OPT_QUERY]); // NOLINT
- }
- DocumentStore ds(opt_values[OPT_FILENAME], options);
- if (!ds.Init()) {
- ICING_LOG(FATAL) << "Legacy document store failed to initialize";
- }
-
- DocPropertyFilter dpf;
- ds.AddDeletedTagFilter(&dpf);
-
- // Dump with format "<corpusid> <uri> <tagname>*".
- int filtered = 0;
- vector<std::string> tagnames;
- for (DocId document_id = 0; document_id < ds.num_documents();
- document_id++) {
- Document doc;
- if (!ds.ReadDocument(document_id, &doc)) {
- ICING_LOG(FATAL) << "Failed to read document.";
- }
-
- if (corpus_id != kInvalidCorpusId && corpus_id != doc.corpus_id()) {
- filtered++;
- continue;
- }
- if (dpf.Match(0, document_id)) {
- filtered++;
- continue;
- }
-
- tagnames.clear();
- ds.GetAllSetUserTagNames(document_id, &tagnames);
-
- printf("%d %s %s\n", doc.corpus_id(), doc.uri().c_str(),
- StringUtil::JoinStrings("/", tagnames).c_str());
- }
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Processed %u filtered %d", ds.num_documents(), filtered);
- } else if (!strcmp(op, "dump-docs")) {
- std::string out_filename = opt_values[OPT_FILENAME];
- out_filename.append("/docs-dump");
- CorpusId corpus_id = kInvalidCorpusId;
- if (opt_values[OPT_QUERY]) {
- // Query is corpus id.
- corpus_id = atoi(opt_values[OPT_QUERY]); // NOLINT
- out_filename.push_back('.');
- out_filename.append(opt_values[OPT_QUERY]);
- }
- DocumentStore ds(opt_values[OPT_FILENAME], options);
- if (!ds.Init()) {
- ICING_LOG(FATAL) << "Legacy document store failed to initialize";
- }
-
- DocPropertyFilter dpf;
- ds.AddDeletedTagFilter(&dpf);
-
- // Dump with format (<32-bit length><serialized content>)*.
- FILE* fp = fopen(out_filename.c_str(), "w");
- int filtered = 0;
- for (DocId document_id = 0; document_id < ds.num_documents();
- document_id++) {
- Document doc;
- if (!ds.ReadDocument(document_id, &doc)) {
- ICING_LOG(FATAL) << "Failed to read document.";
- }
-
- if (corpus_id != kInvalidCorpusId && corpus_id != doc.corpus_id()) {
- filtered++;
- continue;
- }
- if (dpf.Match(0, document_id)) {
- filtered++;
- continue;
- }
-
- std::string serialized = doc.SerializeAsString();
- uint32_t length = serialized.size();
- if (fwrite(&length, 1, sizeof(length), fp) != sizeof(length)) {
- ICING_LOG(FATAL) << "Failed to write length information to file";
- }
-
- if (fwrite(serialized.data(), 1, serialized.size(), fp) !=
- serialized.size()) {
- ICING_LOG(FATAL) << "Failed to write document to file";
- }
- }
- ICING_VLOG(1) << IcingStringUtil::StringPrintf(
- "Processed %u filtered %d", ds.num_documents(), filtered);
- fclose(fp);
- } else {
- ICING_LOG(ERROR) << IcingStringUtil::StringPrintf("Unknown op %s", op);
- return -1;
- }
-
- return 0;
-}
-
-} // namespace lib
-} // namespace icing
-
-int main(int argc, char** argv) { return icing::lib::IcingTool(argc, argv); }
diff --git a/icing/transform/icu/icu-normalizer.cc b/icing/transform/icu/icu-normalizer.cc
index 0bb8326..58d4956 100644
--- a/icing/transform/icu/icu-normalizer.cc
+++ b/icing/transform/icu/icu-normalizer.cc
@@ -29,6 +29,7 @@
#include "icing/util/status-macros.h"
#include "unicode/umachine.h"
#include "unicode/unorm2.h"
+#include "unicode/ustring.h"
#include "unicode/utrans.h"
namespace icing {
@@ -41,13 +42,15 @@ namespace {
// form decomposition) and NFKC (compatible normalization form composition)
// are applied as well as some other rules we need. More information at
// http://www.unicode.org/reports/tr15/
-// TODO(samzheng) Figure out if we need to support small hiragana to katakana
+//
+// Please note that the following rules don't support small hiragana to katakana
// transformation.
constexpr UChar kTransformRulesUtf16[] =
u"Lower; " // Lowercase
"Latin-ASCII; " // Map Latin characters to ASCII characters
"Hiragana-Katakana; " // Map hiragana to katakana
"[:Latin:] NFD; " // Decompose Latin letters
+ "[:Greek:] NFD; " // Decompose Greek letters
"[:Nonspacing Mark:] Remove; " // Remove accent / diacritic marks
"NFKC"; // Decompose and compose everything
@@ -74,7 +77,7 @@ bool DiacriticCharToAscii(const UNormalizer2* normalizer2, UChar32 uchar32_in,
}
// Maximum number of pieces a Unicode character can be decomposed into.
- // TODO(samzheng) figure out if this number is proper.
+ // TODO(tjbarron) figure out if this number is proper.
constexpr int kDecompositionBufferCapacity = 5;
// A buffer used to store Unicode decomposition mappings of only one
@@ -132,17 +135,16 @@ std::string IcuNormalizer::NormalizeTerm(const std::string_view term) const {
ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance";
}
- // Checks if the first character is within ASCII range or can be transformed
- // into an ASCII char. Since the term is tokenized, we know that the whole
- // term can be transformed into ASCII if the first character can.
- UChar32 first_uchar32 =
- i18n_utils::GetUChar32At(term.data(), term.length(), 0);
- if (normalizer2 != nullptr && first_uchar32 != i18n_utils::kInvalidUChar32 &&
- DiacriticCharToAscii(normalizer2, first_uchar32, nullptr)) {
- // This is a faster method to normalize Latin terms.
- normalized_text = NormalizeLatin(normalizer2, term);
- } else {
- normalized_text = term_transformer_->Transform(term);
+ // Normalize the prefix that can be transformed into ASCII.
+ // This is a faster method to normalize Latin terms.
+ NormalizeLatinResult result = NormalizeLatin(normalizer2, term);
+ normalized_text = std::move(result.text);
+ if (result.end_pos < term.length()) {
+ // Some portion of term couldn't be normalized via NormalizeLatin. Use
+ // term_transformer to handle this portion.
+ std::string_view rest_term = term.substr(result.end_pos);
+ absl_ports::StrAppend(&normalized_text,
+ term_transformer_->Transform(rest_term));
}
if (normalized_text.length() > max_term_byte_size_) {
@@ -152,35 +154,32 @@ std::string IcuNormalizer::NormalizeTerm(const std::string_view term) const {
return normalized_text;
}
-std::string IcuNormalizer::NormalizeLatin(const UNormalizer2* normalizer2,
- const std::string_view term) const {
- std::string result;
- result.reserve(term.length());
- for (int i = 0; i < term.length(); i++) {
- if (i18n_utils::IsAscii(term[i])) {
- result.push_back(std::tolower(term[i]));
- } else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
- UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
- if (uchar32 == i18n_utils::kInvalidUChar32) {
- ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
- << " at position" << i;
- continue;
- }
- char ascii_char;
- if (DiacriticCharToAscii(normalizer2, uchar32, &ascii_char)) {
- result.push_back(std::tolower(ascii_char));
- } else {
- // We don't know how to transform / decompose this Unicode character, it
- // probably means that some other Unicode characters are mixed with
- // Latin characters. This shouldn't happen if input term is properly
- // tokenized. We handle it here in case there're something wrong with
- // the tokenizers.
- int utf8_length = i18n_utils::GetUtf8Length(uchar32);
- absl_ports::StrAppend(&result, term.substr(i, utf8_length));
- }
+IcuNormalizer::NormalizeLatinResult IcuNormalizer::NormalizeLatin(
+ const UNormalizer2* normalizer2, const std::string_view term) const {
+ NormalizeLatinResult result = {};
+ if (normalizer2 == nullptr) {
+ return result;
+ }
+ CharacterIterator char_itr(term);
+ result.text.reserve(term.length());
+ char ascii_char;
+ while (char_itr.utf8_index() < term.length()) {
+ UChar32 c = char_itr.GetCurrentChar();
+ if (i18n_utils::IsAscii(c)) {
+ result.text.push_back(std::tolower(c));
+ } else if (DiacriticCharToAscii(normalizer2, c, &ascii_char)) {
+ result.text.push_back(std::tolower(ascii_char));
+ } else {
+      // We don't know how to transform / decompose this Unicode character;
+      // this probably means that some other Unicode characters are mixed with
+      // Latin characters. We return the partial result here and let the
+      // caller handle the rest.
+ result.end_pos = char_itr.utf8_index();
+ return result;
}
+ char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
}
-
+ result.end_pos = term.length();
return result;
}
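// A sketch (under the definitions above) of the NormalizeLatinResult
// contract: NormalizeLatin normalizes the maximal prefix of term that can be
// transformed into ASCII and reports where it stopped, so the caller can fall
// back to the transliterator for the remainder. The term below is an
// illustrative assumption, not a case from this change.
//
//   // "Zürich" normalizes to "zurich"; "で" cannot be transformed into
//   // ASCII, so end_pos is the byte offset of "で" within the term.
//   NormalizeLatinResult r = NormalizeLatin(normalizer2, "Zürichです");
//   // r.text == "zurich", r.end_pos == 7 ("Zürich" is 7 bytes in UTF-8)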
@@ -260,5 +259,114 @@ std::string IcuNormalizer::TermTransformer::Transform(
return std::move(utf8_term_or).ValueOrDie();
}
+bool IcuNormalizer::FindNormalizedLatinMatchEndPosition(
+ const UNormalizer2* normalizer2, std::string_view term,
+ CharacterIterator& char_itr, std::string_view normalized_term,
+ CharacterIterator& normalized_char_itr) const {
+ if (normalizer2 == nullptr) {
+ return false;
+ }
+ char ascii_char;
+ while (char_itr.utf8_index() < term.length() &&
+ normalized_char_itr.utf8_index() < normalized_term.length()) {
+ UChar32 c = char_itr.GetCurrentChar();
+ if (i18n_utils::IsAscii(c)) {
+ c = std::tolower(c);
+ } else if (DiacriticCharToAscii(normalizer2, c, &ascii_char)) {
+ c = std::tolower(ascii_char);
+ } else {
+ return false;
+ }
+ UChar32 normalized_c = normalized_char_itr.GetCurrentChar();
+ if (c != normalized_c) {
+ return true;
+ }
+ char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
+ normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1);
+ }
+ return true;
+}
+
+CharacterIterator
+IcuNormalizer::TermTransformer::FindNormalizedNonLatinMatchEndPosition(
+ std::string_view term, CharacterIterator char_itr,
+ std::string_view normalized_term) const {
+ CharacterIterator normalized_char_itr(normalized_term);
+ UErrorCode status = U_ZERO_ERROR;
+
+ constexpr int kUtf16CharBufferLength = 6;
+ UChar c16[kUtf16CharBufferLength];
+ int32_t c16_length;
+ int32_t limit;
+
+ constexpr int kCharBufferLength = 3 * 4;
+ char normalized_buffer[kCharBufferLength];
+ int32_t c8_length;
+ while (char_itr.utf8_index() < term.length() &&
+ normalized_char_itr.utf8_index() < normalized_term.length()) {
+ UChar32 c = char_itr.GetCurrentChar();
+    int c_length = i18n_utils::GetUtf8Length(c);
+    u_strFromUTF8(c16, kUtf16CharBufferLength, &c16_length,
+                  term.data() + char_itr.utf8_index(),
+                  /*srcLength=*/c_length, &status);
+ if (U_FAILURE(status)) {
+ break;
+ }
+
+ limit = c16_length;
+ utrans_transUChars(u_transliterator_, c16, &c16_length,
+ kUtf16CharBufferLength,
+ /*start=*/0, &limit, &status);
+ if (U_FAILURE(status)) {
+ break;
+ }
+
+ u_strToUTF8(normalized_buffer, kCharBufferLength, &c8_length, c16,
+ c16_length, &status);
+ if (U_FAILURE(status)) {
+ break;
+ }
+
+ for (int i = 0; i < c8_length; ++i) {
+ if (normalized_buffer[i] !=
+ normalized_term[normalized_char_itr.utf8_index() + i]) {
+ return char_itr;
+ }
+ }
+ normalized_char_itr.AdvanceToUtf8(normalized_char_itr.utf8_index() +
+ c8_length);
+ char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
+ }
+  if (U_FAILURE(status)) {
+    // Failed to transform; return the position reached so far.
+    ICING_LOG(WARNING) << "Failed to normalize UTF8 term: " << term;
+  }
+ }
+ return char_itr;
+}
+
+CharacterIterator IcuNormalizer::FindNormalizedMatchEndPosition(
+ std::string_view term, std::string_view normalized_term) const {
+ UErrorCode status = U_ZERO_ERROR;
+ // ICU manages the singleton instance
+ const UNormalizer2* normalizer2 = unorm2_getNFCInstance(&status);
+ if (U_FAILURE(status)) {
+ ICING_LOG(WARNING) << "Failed to create a UNormalizer2 instance";
+ }
+
+ CharacterIterator char_itr(term);
+ CharacterIterator normalized_char_itr(normalized_term);
+ if (FindNormalizedLatinMatchEndPosition(
+ normalizer2, term, char_itr, normalized_term, normalized_char_itr)) {
+ return char_itr;
+ }
+ // Some portion of term couldn't be normalized via
+ // FindNormalizedLatinMatchEndPosition. Use term_transformer to handle this
+ // portion.
+ std::string_view rest_normalized_term =
+ normalized_term.substr(normalized_char_itr.utf8_index());
+ return term_transformer_->FindNormalizedNonLatinMatchEndPosition(
+ term, char_itr, rest_normalized_term);
+}
+
} // namespace lib
} // namespace icing
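// Usage sketch for the new FindNormalizedMatchEndPosition entry point,
// mirroring the unit tests added below; `normalizer` is assumed to be an
// IcuNormalizer obtained from normalizer_factory::Create:
//
//   std::string term = "Zürich";
//   CharacterIterator end =
//       normalizer->FindNormalizedMatchEndPosition(term, "zur");
//   // The matching non-normalized prefix is term.substr(0, end.utf8_index()),
//   // i.e. "Zür" (4 bytes: 'Z' + 2-byte 'ü' + 'r').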
diff --git a/icing/transform/icu/icu-normalizer.h b/icing/transform/icu/icu-normalizer.h
index f20a9fb..f6f2b78 100644
--- a/icing/transform/icu/icu-normalizer.h
+++ b/icing/transform/icu/icu-normalizer.h
@@ -21,6 +21,7 @@
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/transform/normalizer.h"
+#include "icing/util/character-iterator.h"
#include "unicode/unorm2.h"
#include "unicode/utrans.h"
@@ -32,7 +33,8 @@ namespace lib {
// 2. Transforms full-width Latin characters to ASCII characters if possible.
// 3. Transforms hiragana to katakana.
// 4. Removes accent / diacritic marks on Latin characters
-// 5. Normalized text must be less than or equal to max_term_byte_size,
+// 5. Removes accent / diacritic marks on Greek characters
+// 6. Normalized text must be less than or equal to max_term_byte_size,
// otherwise it will be truncated.
//
// There're some other rules from ICU not listed here, please see .cc file for
@@ -56,6 +58,17 @@ class IcuNormalizer : public Normalizer {
// result in the non-Latin characters not properly being normalized
std::string NormalizeTerm(std::string_view term) const override;
+ // Returns a CharacterIterator pointing to one past the end of the segment of
+  // term that (once normalized) matches normalized_term.
+ //
+ // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
+ // CharacterIterator(u8:4, u16:4, u32:4).
+ //
+ // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
+ // CharacterIterator(u8:0, u16:0, u32:0).
+ CharacterIterator FindNormalizedMatchEndPosition(
+ std::string_view term, std::string_view normalized_term) const override;
+
private:
// A handler class that helps manage the lifecycle of UTransliterator. It's
// used in IcuNormalizer to transform terms into the formats we need.
@@ -75,6 +88,12 @@ class IcuNormalizer : public Normalizer {
// Transforms the text based on our rules described at top of this file
std::string Transform(std::string_view term) const;
+  // Returns a CharacterIterator pointing to one past the end of the segment
+  // of a non-Latin term that (once normalized) matches normalized_term.
+ CharacterIterator FindNormalizedNonLatinMatchEndPosition(
+ std::string_view term, CharacterIterator char_itr,
+ std::string_view normalized_term) const;
+
private:
explicit TermTransformer(UTransliterator* u_transliterator);
@@ -83,14 +102,36 @@ class IcuNormalizer : public Normalizer {
UTransliterator* u_transliterator_;
};
+ struct NormalizeLatinResult {
+    // The normalized form of the longest prefix of term (possibly empty, or
+    // all of term) whose characters can be normalized into ASCII.
+    std::string text;
+    // The position of the first char within term that could not be normalized
+    // into an ASCII char, or term.length() if all chars were transformed.
+    size_t end_pos;
+ };
+
explicit IcuNormalizer(std::unique_ptr<TermTransformer> term_transformer,
int max_term_byte_size);
// Helper method to normalize Latin terms only. Rules applied:
// 1. Uppercase to lowercase
// 2. Remove diacritic (accent) marks
- std::string NormalizeLatin(const UNormalizer2* normalizer2,
- std::string_view term) const;
+ NormalizeLatinResult NormalizeLatin(const UNormalizer2* normalizer2,
+ std::string_view term) const;
+
+  // Sets char_itr and normalized_char_itr to point to one past the end of
+  // the matching segments of term and normalized_term that can be normalized
+  // into ASCII, and returns true.
+  //
+  // The method stops and returns false at the first position where the
+  // character under char_itr cannot be normalized into ASCII, so that
+  // term_transformer can handle the remaining portion.
+ bool FindNormalizedLatinMatchEndPosition(
+ const UNormalizer2* normalizer2, std::string_view term,
+ CharacterIterator& char_itr, std::string_view normalized_term,
+ CharacterIterator& normalized_char_itr) const;
// Used to transform terms into their normalized forms.
std::unique_ptr<TermTransformer> term_transformer_;
diff --git a/icing/transform/icu/icu-normalizer_benchmark.cc b/icing/transform/icu/icu-normalizer_benchmark.cc
index b037538..89d5f1e 100644
--- a/icing/transform/icu/icu-normalizer_benchmark.cc
+++ b/icing/transform/icu/icu-normalizer_benchmark.cc
@@ -14,8 +14,8 @@
#include "testing/base/public/benchmark.h"
#include "gmock/gmock.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/test-data.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
@@ -25,7 +25,7 @@
// //icing/transform/icu:icu-normalizer_benchmark
//
// $ blaze-bin/icing/transform/icu/icu-normalizer_benchmark
-// --benchmarks=all
+// --benchmark_filter=all
//
// Run on an Android device:
// Make target //icing/transform:normalizer depend on
@@ -39,8 +39,8 @@
// blaze-bin/icing/transform/icu/icu-normalizer_benchmark
// /data/local/tmp/
//
-// $ adb shell /data/local/tmp/icu-normalizer_benchmark --benchmarks=all
-// --adb
+// $ adb shell /data/local/tmp/icu-normalizer_benchmark
+// --benchmark_filter=all --adb
// Flag to tell the benchmark that it'll be run on an Android device via adb,
// the benchmark will set up data files accordingly.
@@ -61,7 +61,6 @@ void BM_NormalizeUppercase(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
-
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string(state.range(0), 'A');
@@ -95,7 +94,6 @@ void BM_NormalizeAccent(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
-
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string;
@@ -123,7 +121,7 @@ BENCHMARK(BM_NormalizeAccent)
->Arg(2048000)
->Arg(4096000);
-void BM_NormalizeHiragana(benchmark::State& state) {
+void BM_NormalizeGreekAccent(benchmark::State& state) {
bool run_via_adb = absl::GetFlag(FLAGS_adb);
if (!run_via_adb) {
ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
@@ -133,7 +131,43 @@ void BM_NormalizeHiragana(benchmark::State& state) {
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<Normalizer> normalizer,
normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("άὰᾶἀἄ");
+ }
+
+ for (auto _ : state) {
+ normalizer->NormalizeTerm(input_string);
+ }
+}
+BENCHMARK(BM_NormalizeGreekAccent)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_NormalizeHiragana(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
/*max_term_byte_size=*/std::numeric_limits<int>::max()));
std::string input_string;
@@ -161,6 +195,121 @@ BENCHMARK(BM_NormalizeHiragana)
->Arg(2048000)
->Arg(4096000);
+void BM_UppercaseSubTokenLength(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string(state.range(0), 'A');
+ std::string normalized_input_string(state.range(0), 'a');
+ for (auto _ : state) {
+ normalizer->FindNormalizedMatchEndPosition(input_string,
+ normalized_input_string);
+ }
+}
+BENCHMARK(BM_UppercaseSubTokenLength)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_AccentSubTokenLength(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ std::string normalized_input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("àáâãā");
+ normalized_input_string.append("aaaaa");
+ }
+
+ for (auto _ : state) {
+ normalizer->FindNormalizedMatchEndPosition(input_string,
+ normalized_input_string);
+ }
+}
+BENCHMARK(BM_AccentSubTokenLength)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_HiraganaSubTokenLength(benchmark::State& state) {
+ bool run_via_adb = absl::GetFlag(FLAGS_adb);
+ if (!run_via_adb) {
+ ICING_ASSERT_OK(icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ std::string normalized_input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("あいうえお");
+ normalized_input_string.append("アイウエオ");
+ }
+
+ for (auto _ : state) {
+ normalizer->FindNormalizedMatchEndPosition(input_string,
+ normalized_input_string);
+ }
+}
+BENCHMARK(BM_HiraganaSubTokenLength)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
} // namespace
} // namespace lib
diff --git a/icing/transform/icu/icu-normalizer_test.cc b/icing/transform/icu/icu-normalizer_test.cc
index 83fa972..0df23fc 100644
--- a/icing/transform/icu/icu-normalizer_test.cc
+++ b/icing/transform/icu/icu-normalizer_test.cc
@@ -16,8 +16,8 @@
#include "gmock/gmock.h"
#include "gtest/gtest.h"
-#include "icing/helpers/icu/icu-data-file-helper.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/icu-data-file-helper.h"
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/testing/test-data.h"
#include "icing/transform/normalizer-factory.h"
@@ -83,14 +83,12 @@ TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) {
Eq("eeeeeeeeeeeeeeeeeeeeeeeeeee"));
EXPECT_THAT(normalizer_->NormalizeTerm("Ḟḟ"), Eq("ff"));
EXPECT_THAT(normalizer_->NormalizeTerm("ĜĞĠĢḠḡĝğġģ"), Eq("gggggggggg"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"),
- Eq("hhhhhhhhhhhhh"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĤḢḤḦḨḪḣḥḧḩḫĥẖ"), Eq("hhhhhhhhhhhhh"));
EXPECT_THAT(normalizer_->NormalizeTerm("ÌÍÎÏĨĪĬḬḭḯìíîïĩīĭ"),
Eq("iiiiiiiiiiiiiiiii"));
EXPECT_THAT(normalizer_->NormalizeTerm("Ĵĵ"), Eq("jj"));
EXPECT_THAT(normalizer_->NormalizeTerm("ĶḰḲḴḵḱḳķ"), Eq("kkkkkkkk"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"),
- Eq("lllllllllllll"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ĹĻĽḶḸḼḷḹḻḽĺļľ"), Eq("lllllllllllll"));
EXPECT_THAT(normalizer_->NormalizeTerm("ḾṀṂḿṁṃ"), Eq("mmmmmm"));
EXPECT_THAT(normalizer_->NormalizeTerm("ÑŃŅŇṄṆṈṊṅṇṉṋñńņň"),
Eq("nnnnnnnnnnnnnnnn"));
@@ -109,23 +107,42 @@ TEST_F(IcuNormalizerTest, LatinLetterRemoveAccent) {
EXPECT_THAT(normalizer_->NormalizeTerm("ŴẀẂẄẆẈẁẃẅẇẉŵ"), Eq("wwwwwwwwwwww"));
EXPECT_THAT(normalizer_->NormalizeTerm("ẊẌẋẍ"), Eq("xxxx"));
EXPECT_THAT(normalizer_->NormalizeTerm("ÝŶŸẎẏŷýÿ"), Eq("yyyyyyyy"));
- EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"),
- Eq("zzzzzzzzzzzz"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ŹŻŽẐẒẔẑẓẕźżž"), Eq("zzzzzzzzzzzz"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Barış"), Eq("baris"));
+}
+
+TEST_F(IcuNormalizerTest, GreekLetterRemoveAccent) {
+ EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημερα"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφη"));
+ EXPECT_THAT(normalizer_->NormalizeTerm(
+ "ἈἉἊἋἌἍἎἏᾈᾉᾊᾋᾌᾍᾎᾏᾸᾹᾺΆᾼἀἁἂἃἄἅἆἇὰάᾀᾁᾂᾃᾄᾅᾆᾇᾰᾱᾲᾳᾴᾶᾷ"),
+ Eq("αααααααααααααααααααααααααααααααααααααααααααααα"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ἘἙἚἛἜἝῈΈἐἑἒἓἔἕὲέ"),
+ Eq("εεεεεεεεεεεεεεεε"));
+ EXPECT_THAT(
+ normalizer_->NormalizeTerm("ἨἩἪἫἬἭἮἯᾘᾙᾚᾛᾜᾝᾞᾟῊΉῌἠἡἢἣἤἥἦἧὴήᾐᾑᾒᾓᾔᾕᾖᾗῂῃῄῆῇ"),
+ Eq("ηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηηη"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ἸἹἺἻἼἽἾἿῘῙῚΊἰἱἲἳἴἵἶἷὶίῐῑῒΐῖῗ"),
+ Eq("ιιιιιιιιιιιιιιιιιιιιιιιιιιιι"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ὈὉὊὋὌὍῸΌὀὁὂὃὄὅὸό"),
+ Eq("οοοοοοοοοοοοοοοο"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("ὙὛὝὟῨῩῪΎὐὑὒὓὔὕὖὗὺύῠῡῢΰῦῧ"),
+ Eq("υυυυυυυυυυυυυυυυυυυυυυυυ"));
+ EXPECT_THAT(
+ normalizer_->NormalizeTerm("ὨὩὪὫὬὭὮὯᾨᾩᾪᾫᾬᾭᾮᾯῺΏῼὠὡὢὣὤὥὦὧὼώᾠᾡᾢᾣᾤᾥᾦᾧῲῳῴῶῷ"),
+ Eq("ωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωωω"));
+ EXPECT_THAT(normalizer_->NormalizeTerm("Ῥῤῥ"), Eq("ρρρ"));
}
// Accent / diacritic marks won't be removed in non-latin chars, e.g. in
-// Japanese and Greek
+// Japanese
TEST_F(IcuNormalizerTest, NonLatinLetterNotRemoveAccent) {
// Katakana
EXPECT_THAT(normalizer_->NormalizeTerm("ダヂヅデド"), Eq("ダヂヅデド"));
- // Greek
- EXPECT_THAT(normalizer_->NormalizeTerm("kαλημέρα"), Eq("kαλημέρα"));
- EXPECT_THAT(normalizer_->NormalizeTerm("εγγραφή"), Eq("εγγραφή"));
// Our current ICU rules can't handle Hebrew properly, e.g. the accents in
// "אָלֶף־בֵּית עִבְרִי"
// will be removed.
- // TODO (samzheng): figure out how we should handle Hebrew.
}
TEST_F(IcuNormalizerTest, FullWidthCharsToASCII) {
@@ -232,6 +249,158 @@ TEST_F(IcuNormalizerTest, Truncate) {
}
}
+TEST_F(IcuNormalizerTest, PrefixMatchLength) {
+  // Verify that FindNormalizedMatchEndPosition will properly find the length
+  // of the prefix match when given a non-normalized term and a normalized
+  // term that is a prefix of the non-normalized one.
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ // Upper to lower
+ std::string term = "MDI";
+ CharacterIterator match_end =
+ normalizer->FindNormalizedMatchEndPosition(term, "md");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD"));
+
+ term = "Icing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "icin");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin"));
+
+ // Full-width
+ term = "525600";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "525");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525"));
+
+ term = "FULLWIDTH";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "full");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL"));
+
+ // Hiragana to Katakana
+ term = "あいうえお";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
+
+ term = "かきくけこ";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "カ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
+
+ // Latin accents
+ term = "Zürich";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "zur");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
+
+ term = "après-midi";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "apre");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
+
+ term = "Buenos días";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "buenos di");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí"));
+
+ term = "BarışIcing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "baris");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Barış"));
+
+ term = "ÀĄḁáIcing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "aaaa");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ÀĄḁá"));
+
+ // Greek accents
+ term = "άνθρωπος";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ανθ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("άνθ"));
+
+ term = "καλημέρα";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "καλημε");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("καλημέ"));
+
+ term = "όχι";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "οχ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("όχ"));
+
+ term = "πότε";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ποτ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("πότ"));
+
+ term = "ἈἉἊἋIcing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "αααα");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ἈἉἊἋ"));
+}
+
+TEST_F(IcuNormalizerTest, SharedPrefixMatchLength) {
+ // Verify that FindNormalizedMatchEndPosition will properly find the length of
+ // the prefix match when given a non-normalized term and a normalized term
+ // that share a common prefix.
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ // Upper to lower
+ std::string term = "MDI";
+ CharacterIterator match_end =
+ normalizer->FindNormalizedMatchEndPosition(term, "mgm");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("M"));
+
+ term = "Icing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "icky");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Ic"));
+
+ // Full-width
+ term = "525600";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "525788");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525"));
+
+ term = "FULLWIDTH";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "fully");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL"));
+
+ // Hiragana to Katakana
+ term = "あいうえお";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイエオ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
+
+ term = "かきくけこ";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "カケコ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
+
+ // Latin accents
+ term = "Zürich";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "zurg");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
+
+ term = "après-midi";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "apreciate");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
+
+ term = "días";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "diamond");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("día"));
+
+ term = "BarışIcing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "barismdi");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Barış"));
+
+ // Greek accents
+ term = "άνθρωπος";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ανθν");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("άνθ"));
+
+ term = "καλημέρα";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "καλημεος");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("καλημέ"));
+
+ term = "όχι";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "οχκα");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("όχ"));
+
+ term = "πότε";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ποτρα");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("πότ"));
+
+ term = "ἈἉἊἋIcing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "ααααmdi");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("ἈἉἊἋ"));
+}
+
} // namespace
} // namespace lib
} // namespace icing
diff --git a/icing/transform/map/map-normalizer.cc b/icing/transform/map/map-normalizer.cc
index c888551..61fce65 100644
--- a/icing/transform/map/map-normalizer.cc
+++ b/icing/transform/map/map-normalizer.cc
@@ -14,8 +14,7 @@
#include "icing/transform/map/map-normalizer.h"
-#include <ctype.h>
-
+#include <cctype>
#include <string>
#include <string_view>
#include <unordered_map>
@@ -23,6 +22,7 @@
#include "icing/absl_ports/str_cat.h"
#include "icing/transform/map/normalization-map.h"
+#include "icing/util/character-iterator.h"
#include "icing/util/i18n-utils.h"
#include "icing/util/logging.h"
#include "unicode/utypes.h"
@@ -30,48 +30,70 @@
namespace icing {
namespace lib {
+namespace {
+
+UChar32 NormalizeChar(UChar32 c) {
+ if (i18n_utils::GetUtf16Length(c) > 1) {
+ // All the characters we need to normalize can be encoded into a
+ // single char16_t. If this character needs more than 1 char16_t code
+ // unit, we can skip normalization and return it unchanged.
+ return c;
+ }
+
+ // The original character can be encoded into a single char16_t.
+ const std::unordered_map<char16_t, char16_t>* normalization_map =
+ GetNormalizationMap();
+ if (normalization_map == nullptr) {
+ // Normalization map couldn't be properly initialized; return the
+ // original character unchanged.
+ ICING_LOG(WARNING) << "Unable to get a valid pointer to normalization map!";
+ return c;
+ }
+ auto iterator = normalization_map->find(static_cast<char16_t>(c));
+ if (iterator == normalization_map->end()) {
+ // Normalization mapping not found; return the original character.
+ return c;
+ }
+
+ // Found a normalization mapping. The normalized character (stored in a
+ // char16_t) can have 1 or 2 bytes.
+ if (i18n_utils::IsAscii(iterator->second)) {
+ // The normalized character has 1 byte. It may be an upper-case char.
+ // Lower-case it before returning it.
+ return std::tolower(static_cast<char>(iterator->second));
+ } else {
+ return iterator->second;
+ }
+}
+
+} // namespace
+
std::string MapNormalizer::NormalizeTerm(std::string_view term) const {
std::string normalized_text;
normalized_text.reserve(term.length());
- for (int i = 0; i < term.length(); ++i) {
- if (i18n_utils::IsAscii(term[i])) {
- // The original character has 1 byte.
- normalized_text.push_back(std::tolower(term[i]));
- } else if (i18n_utils::IsLeadUtf8Byte(term[i])) {
- UChar32 uchar32 = i18n_utils::GetUChar32At(term.data(), term.length(), i);
+ int current_pos = 0;
+ while (current_pos < term.length()) {
+ if (i18n_utils::IsAscii(term[current_pos])) {
+ normalized_text.push_back(std::tolower(term[current_pos]));
+ ++current_pos;
+ } else {
+ UChar32 uchar32 =
+ i18n_utils::GetUChar32At(term.data(), term.length(), current_pos);
if (uchar32 == i18n_utils::kInvalidUChar32) {
ICING_LOG(WARNING) << "Unable to get uchar32 from " << term
- << " at position" << i;
+ << " at position" << current_pos;
+ ++current_pos;
continue;
}
- int utf8_length = i18n_utils::GetUtf8Length(uchar32);
- if (i18n_utils::GetUtf16Length(uchar32) > 1) {
- // All the characters we need to normalize can be encoded into a
- // single char16_t. If this character needs more than 1 char16_t code
- // unit, we can skip normalization and append it directly.
- absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length));
- continue;
- }
- // The original character can be encoded into a single char16_t.
- const std::unordered_map<char16_t, char16_t>& normalization_map =
- GetNormalizationMap();
- auto iterator = normalization_map.find(static_cast<char16_t>(uchar32));
- if (iterator != normalization_map.end()) {
- // Found a normalization mapping. The normalized character (stored in a
- // char16_t) can have 1 or 2 bytes.
- if (i18n_utils::IsAscii(iterator->second)) {
- // The normalized character has 1 byte.
- normalized_text.push_back(
- std::tolower(static_cast<char>(iterator->second)));
- } else {
- // The normalized character has 2 bytes.
- i18n_utils::AppendUchar32ToUtf8(&normalized_text, iterator->second);
- }
+ UChar32 normalized_char32 = NormalizeChar(uchar32);
+ if (i18n_utils::IsAscii(normalized_char32)) {
+ normalized_text.push_back(normalized_char32);
} else {
- // Normalization mapping not found, append the original character.
- absl_ports::StrAppend(&normalized_text, term.substr(i, utf8_length));
+ // The normalized character is non-ASCII; append its UTF-8 encoding.
+ i18n_utils::AppendUchar32ToUtf8(&normalized_text, normalized_char32);
}
+ current_pos += i18n_utils::GetUtf8Length(uchar32);
}
}
@@ -82,5 +104,27 @@ std::string MapNormalizer::NormalizeTerm(std::string_view term) const {
return normalized_text;
}
+CharacterIterator MapNormalizer::FindNormalizedMatchEndPosition(
+ std::string_view term, std::string_view normalized_term) const {
+ CharacterIterator char_itr(term);
+ CharacterIterator normalized_char_itr(normalized_term);
+ while (char_itr.utf8_index() < term.length() &&
+ normalized_char_itr.utf8_index() < normalized_term.length()) {
+ UChar32 c = char_itr.GetCurrentChar();
+ if (i18n_utils::IsAscii(c)) {
+ c = std::tolower(c);
+ } else {
+ c = NormalizeChar(c);
+ }
+ UChar32 normalized_c = normalized_char_itr.GetCurrentChar();
+ if (c != normalized_c) {
+ return char_itr;
+ }
+ char_itr.AdvanceToUtf32(char_itr.utf32_index() + 1);
+ normalized_char_itr.AdvanceToUtf32(normalized_char_itr.utf32_index() + 1);
+ }
+ return char_itr;
+}
+
} // namespace lib
} // namespace icing
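The matching loop above compares term against normalized_term one code point
at a time and stops at the first divergence. A hedged walk-through of a case
taken from the tests in this patch (creation through normalizer_factory is
assumed):

  // term "días" vs. normalized "diamond": 'd', 'í'->'i', and 'a' match;
  // 's' vs. 'm' stops the loop, leaving the iterator on 's'.
  ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
                                 /*max_term_byte_size=*/1000));
  CharacterIterator end =
      normalizer->FindNormalizedMatchEndPosition("días", "diamond");
  // 'í' occupies two UTF-8 bytes but one UTF-32 unit, so the indices differ:
  assert(end.utf8_index() == 4);   // matched prefix "día"
  assert(end.utf32_index() == 3);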
diff --git a/icing/transform/map/map-normalizer.h b/icing/transform/map/map-normalizer.h
index f9c0e42..ed996ae 100644
--- a/icing/transform/map/map-normalizer.h
+++ b/icing/transform/map/map-normalizer.h
@@ -19,6 +19,7 @@
#include <string_view>
#include "icing/transform/normalizer.h"
+#include "icing/util/character-iterator.h"
namespace icing {
namespace lib {
@@ -39,6 +40,17 @@ class MapNormalizer : public Normalizer {
// Read more mapping details in normalization-map.cc
std::string NormalizeTerm(std::string_view term) const override;
+ // Returns a CharacterIterator pointing to one past the end of the segment of
+ // term that (once normalized) matches with normalized_term.
+ //
+ // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
+ // CharacterIterator(u8:4, u16:4, u32:4).
+ //
+ // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
+ // CharacterIterator(u8:0, u16:0, u32:0).
+ CharacterIterator FindNormalizedMatchEndPosition(
+ std::string_view term, std::string_view normalized_term) const override;
+
private:
// The maximum term length allowed after normalization.
int max_term_byte_size_;
diff --git a/icing/transform/map/map-normalizer_benchmark.cc b/icing/transform/map/map-normalizer_benchmark.cc
index 691afc6..4560329 100644
--- a/icing/transform/map/map-normalizer_benchmark.cc
+++ b/icing/transform/map/map-normalizer_benchmark.cc
@@ -24,7 +24,7 @@
// //icing/transform/map:map-normalizer_benchmark
//
// $ blaze-bin/icing/transform/map/map-normalizer_benchmark
-// --benchmarks=all
+// --benchmark_filter=all
//
// Run on an Android device:
// $ blaze build --copt="-DGOOGLE_COMMANDLINEFLAGS_FULL_API=1"
@@ -35,7 +35,7 @@
// blaze-bin/icing/transform/map/map-normalizer_benchmark
// /data/local/tmp/
//
-// $ adb shell /data/local/tmp/map-normalizer_benchmark --benchmarks=all
+// $ adb shell /data/local/tmp/map-normalizer_benchmark --benchmark_filter=all
namespace icing {
namespace lib {
@@ -143,6 +143,104 @@ BENCHMARK(BM_NormalizeHiragana)
->Arg(2048000)
->Arg(4096000);
+void BM_UppercaseSubTokenLength(benchmark::State& state) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string(state.range(0), 'A');
+ std::string normalized_input_string(state.range(0), 'a');
+ for (auto _ : state) {
+ normalizer->FindNormalizedMatchEndPosition(input_string,
+ normalized_input_string);
+ }
+}
+BENCHMARK(BM_UppercaseSubTokenLength)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_AccentSubTokenLength(benchmark::State& state) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ std::string normalized_input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("àáâãā");
+ normalized_input_string.append("aaaaa");
+ }
+
+ for (auto _ : state) {
+ normalizer->FindNormalizedMatchEndPosition(input_string,
+ normalized_input_string);
+ }
+}
+BENCHMARK(BM_AccentSubTokenLength)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
+void BM_HiraganaSubTokenLength(benchmark::State& state) {
+ ICING_ASSERT_OK_AND_ASSIGN(
+ std::unique_ptr<Normalizer> normalizer,
+ normalizer_factory::Create(
+ /*max_term_byte_size=*/std::numeric_limits<int>::max()));
+
+ std::string input_string;
+ std::string normalized_input_string;
+ while (input_string.length() < state.range(0)) {
+ input_string.append("あいうえお");
+ normalized_input_string.append("アイウエオ");
+ }
+
+ for (auto _ : state) {
+ normalizer->FindNormalizedMatchEndPosition(input_string,
+ normalized_input_string);
+ }
+}
+BENCHMARK(BM_HiraganaSubTokenLength)
+ ->Arg(1000)
+ ->Arg(2000)
+ ->Arg(4000)
+ ->Arg(8000)
+ ->Arg(16000)
+ ->Arg(32000)
+ ->Arg(64000)
+ ->Arg(128000)
+ ->Arg(256000)
+ ->Arg(384000)
+ ->Arg(512000)
+ ->Arg(1024000)
+ ->Arg(2048000)
+ ->Arg(4096000);
+
} // namespace
} // namespace lib
diff --git a/icing/transform/map/map-normalizer_test.cc b/icing/transform/map/map-normalizer_test.cc
index b62ae0e..adc5623 100644
--- a/icing/transform/map/map-normalizer_test.cc
+++ b/icing/transform/map/map-normalizer_test.cc
@@ -23,6 +23,7 @@
#include "icing/testing/icu-i18n-test-utils.h"
#include "icing/transform/normalizer-factory.h"
#include "icing/transform/normalizer.h"
+#include "icing/util/character-iterator.h"
namespace icing {
namespace lib {
@@ -199,6 +200,104 @@ TEST(MapNormalizerTest, Truncate) {
}
}
+TEST(MapNormalizerTest, PrefixMatchLength) {
+ // Verify that FindNormalizedMatchEndPosition will properly find the length of
+ // the prefix match when given a non-normalized term and a normalized term
+ // that is a prefix of the non-normalized one.
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ // Upper to lower
+ std::string term = "MDI";
+ CharacterIterator match_end =
+ normalizer->FindNormalizedMatchEndPosition(term, "md");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("MD"));
+
+ term = "Icing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "icin");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Icin"));
+
+ // Full-width
+ term = "525600";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "525");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525"));
+
+ term = "FULLWIDTH";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "full");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL"));
+
+ // Hiragana to Katakana
+ term = "あいうえお";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
+
+ term = "かきくけこ";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "カ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
+
+ // Latin accents
+ term = "Zürich";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "zur");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
+
+ term = "après-midi";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "apre");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
+
+ term = "Buenos días";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "buenos di");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Buenos dí"));
+}
+
+TEST(MapNormalizerTest, SharedPrefixMatchLength) {
+ // Verify that FindNormalizedMatchEndPosition will properly find the length of
+ // the prefix match when given a non-normalized term and a normalized term
+ // that share a common prefix.
+ ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
+ /*max_term_byte_size=*/1000));
+
+ // Upper to lower
+ std::string term = "MDI";
+ CharacterIterator match_end =
+ normalizer->FindNormalizedMatchEndPosition(term, "mgm");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("M"));
+
+ term = "Icing";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "icky");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Ic"));
+
+ // Full-width
+ term = "525600";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "525788");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("525"));
+
+ term = "FULLWIDTH";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "fully");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("FULL"));
+
+ // Hiragana to Katakana
+ term = "あいうえお";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "アイエオ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("あい"));
+
+ term = "かきくけこ";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "カケコ");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("か"));
+
+ // Latin accents
+ term = "Zürich";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "zurg");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("Zür"));
+
+ term = "après-midi";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "apreciate");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("aprè"));
+
+ term = "días";
+ match_end = normalizer->FindNormalizedMatchEndPosition(term, "diamond");
+ EXPECT_THAT(term.substr(0, match_end.utf8_index()), Eq("día"));
+}
+
} // namespace
} // namespace lib
diff --git a/icing/transform/map/normalization-map.cc b/icing/transform/map/normalization-map.cc
index c318036..0994ab8 100644
--- a/icing/transform/map/normalization-map.cc
+++ b/icing/transform/map/normalization-map.cc
@@ -691,19 +691,21 @@ constexpr NormalizationPair kNormalizationMappings[] = {
} // namespace
-const std::unordered_map<char16_t, char16_t>& GetNormalizationMap() {
+const std::unordered_map<char16_t, char16_t> *GetNormalizationMap() {
// The map is allocated dynamically the first time this function is executed.
- static const std::unordered_map<char16_t, char16_t> normalization_map = [] {
- std::unordered_map<char16_t, char16_t> map;
- // Size of all the mappings is about 2.5 KiB.
- constexpr int numMappings =
- sizeof(kNormalizationMappings) / sizeof(NormalizationPair);
- map.reserve(numMappings);
- for (size_t i = 0; i < numMappings; ++i) {
- map.emplace(kNormalizationMappings[i].from, kNormalizationMappings[i].to);
- }
- return map;
- }();
+ static const std::unordered_map<char16_t, char16_t> *const normalization_map =
+ [] {
+ auto *map = new std::unordered_map<char16_t, char16_t>();
+ // Size of all the mappings is about 2.5 KiB.
+ constexpr int numMappings =
+ sizeof(kNormalizationMappings) / sizeof(NormalizationPair);
+ map->reserve(numMappings);
+ for (size_t i = 0; i < numMappings; ++i) {
+ map->emplace(kNormalizationMappings[i].from,
+ kNormalizationMappings[i].to);
+ }
+ return map;
+ }();
return normalization_map;
}
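The rewritten accessor is the leak-on-purpose flavor of a function-local
static: the map is built on first use and intentionally never destroyed,
which sidesteps static-destruction-order hazards at process exit and gives
callers a pointer they can null-check. The core of the pattern, reduced to a
sketch (the two entries are hypothetical placeholders, not taken from
kNormalizationMappings):

  const std::unordered_map<char16_t, char16_t>* LazySingletonMap() {
    static const auto* const map =
        new std::unordered_map<char16_t, char16_t>{{u'À', u'A'},
                                                   {u'Á', u'A'}};
    return map;  // initialized exactly once; intentionally never deleted
  }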
diff --git a/icing/transform/map/normalization-map.h b/icing/transform/map/normalization-map.h
index aea85bd..ac7872b 100644
--- a/icing/transform/map/normalization-map.h
+++ b/icing/transform/map/normalization-map.h
@@ -23,7 +23,7 @@ namespace lib {
// Returns a map containing normalization mappings. A mapping (A -> B) means
// that we'll transform every character 'A' into 'B'. See normalization-map.cc
// for mapping details.
-const std::unordered_map<char16_t, char16_t>& GetNormalizationMap();
+const std::unordered_map<char16_t, char16_t>* GetNormalizationMap();
} // namespace lib
} // namespace icing
diff --git a/icing/transform/normalizer.h b/icing/transform/normalizer.h
index 4cbfa63..2110f0f 100644
--- a/icing/transform/normalizer.h
+++ b/icing/transform/normalizer.h
@@ -20,6 +20,7 @@
#include <string_view>
#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/util/character-iterator.h"
namespace icing {
namespace lib {
@@ -39,6 +40,17 @@ class Normalizer {
// Normalizes the input term based on rules. See implementation classes for
// specific transformation rules.
virtual std::string NormalizeTerm(std::string_view term) const = 0;
+
+ // Returns a CharacterIterator pointing to one past the end of the segment of
+ // term that (once normalized) matches with normalized_term.
+ //
+ // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
+ // CharacterIterator(u8:4, u16:4, u32:4).
+ //
+ // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
+ // CharacterIterator(u8:0, u16:0, u32:0).
+ virtual CharacterIterator FindNormalizedMatchEndPosition(
+ std::string_view term, std::string_view normalized_term) const = 0;
};
} // namespace lib
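A sketch of how a snippeting caller might use the new virtual to map a
normalized query term back onto the raw stored term; it mirrors the tests in
this patch, with error handling elided:

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Normalizer> normalizer,
      normalizer_factory::Create(/*max_term_byte_size=*/1000));
  std::string_view raw_term = "Zürich";
  CharacterIterator end =
      normalizer->FindNormalizedMatchEndPosition(raw_term, "zur");
  // 'ü' takes two UTF-8 bytes, so utf8_index() is 4 and the window to
  // highlight is raw_term.substr(0, 4), i.e. "Zür".
  std::string_view window = raw_term.substr(0, end.utf8_index());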
diff --git a/icing/transform/simple/none-normalizer-factory.cc b/icing/transform/simple/none-normalizer-factory.cc
deleted file mode 100644
index 6b35270..0000000
--- a/icing/transform/simple/none-normalizer-factory.cc
+++ /dev/null
@@ -1,53 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_
-#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_
-
-#include <memory>
-#include <string_view>
-
-#include "icing/text_classifier/lib3/utils/base/statusor.h"
-#include "icing/absl_ports/canonical_errors.h"
-#include "icing/transform/normalizer.h"
-#include "icing/transform/simple/none-normalizer.h"
-
-namespace icing {
-namespace lib {
-
-namespace normalizer_factory {
-
-// Creates a dummy normalizer. The term is not normalized, but
-// the text will be truncated to max_term_byte_size if it exceeds the max size.
-//
-// Returns:
-// A normalizer on success
-// INVALID_ARGUMENT if max_term_byte_size <= 0
-// INTERNAL_ERROR on errors
-libtextclassifier3::StatusOr<std::unique_ptr<Normalizer>> Create(
- int max_term_byte_size) {
- if (max_term_byte_size <= 0) {
- return absl_ports::InvalidArgumentError(
- "max_term_byte_size must be greater than zero.");
- }
-
- return std::make_unique<NoneNormalizer>(max_term_byte_size);
-}
-
-} // namespace normalizer_factory
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_FACTORY_H_
diff --git a/icing/transform/simple/none-normalizer.h b/icing/transform/simple/none-normalizer.h
deleted file mode 100644
index 47085e1..0000000
--- a/icing/transform/simple/none-normalizer.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#ifndef ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_
-#define ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_
-
-#include <string>
-#include <string_view>
-
-#include "icing/transform/normalizer.h"
-
-namespace icing {
-namespace lib {
-
-// This normalizer is not meant for production use. Currently only used to get
-// the Icing library to compile in Jetpack.
-//
-// No normalization is done, but the term is truncated if it exceeds
-// max_term_byte_size.
-class NoneNormalizer : public Normalizer {
- public:
- explicit NoneNormalizer(int max_term_byte_size)
- : max_term_byte_size_(max_term_byte_size){};
-
- std::string NormalizeTerm(std::string_view term) const override {
- if (term.length() > max_term_byte_size_) {
- return std::string(term.substr(0, max_term_byte_size_));
- }
- return std::string(term);
- }
-
- private:
- // The maximum term length allowed after normalization.
- int max_term_byte_size_;
-};
-
-} // namespace lib
-} // namespace icing
-
-#endif // ICING_TRANSFORM_SIMPLE_NONE_NORMALIZER_H_
diff --git a/icing/transform/simple/none-normalizer_test.cc b/icing/transform/simple/none-normalizer_test.cc
deleted file mode 100644
index e074828..0000000
--- a/icing/transform/simple/none-normalizer_test.cc
+++ /dev/null
@@ -1,74 +0,0 @@
-// Copyright (C) 2019 Google LLC
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <memory>
-
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-#include "icing/testing/common-matchers.h"
-#include "icing/transform/normalizer-factory.h"
-#include "icing/transform/normalizer.h"
-
-namespace icing {
-namespace lib {
-namespace {
-
-using ::testing::Eq;
-
-TEST(NoneNormalizerTest, Creation) {
- EXPECT_THAT(normalizer_factory::Create(
- /*max_term_byte_size=*/5),
- IsOk());
- EXPECT_THAT(normalizer_factory::Create(
- /*max_term_byte_size=*/0),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
- EXPECT_THAT(normalizer_factory::Create(
- /*max_term_byte_size=*/-1),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT));
-}
-
-TEST(IcuNormalizerTest, NoNormalizationDone) {
- ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
- /*max_term_byte_size=*/1000));
- EXPECT_THAT(normalizer->NormalizeTerm(""), Eq(""));
- EXPECT_THAT(normalizer->NormalizeTerm("hello world"), Eq("hello world"));
-
- // Capitalization
- EXPECT_THAT(normalizer->NormalizeTerm("MDI"), Eq("MDI"));
-
- // Accents
- EXPECT_THAT(normalizer->NormalizeTerm("Zürich"), Eq("Zürich"));
-
- // Full-width punctuation to ASCII punctuation
- EXPECT_THAT(normalizer->NormalizeTerm("。,!?:”"), Eq("。,!?:”"));
-
- // Half-width katakana
- EXPECT_THAT(normalizer->NormalizeTerm("カ"), Eq("カ"));
-}
-
-TEST(NoneNormalizerTest, Truncate) {
- ICING_ASSERT_OK_AND_ASSIGN(auto normalizer, normalizer_factory::Create(
- /*max_term_byte_size=*/5));
-
- // Won't be truncated
- EXPECT_THAT(normalizer->NormalizeTerm("hi"), Eq("hi"));
- EXPECT_THAT(normalizer->NormalizeTerm("hello"), Eq("hello"));
-
- // Truncated to length 5.
- EXPECT_THAT(normalizer->NormalizeTerm("hello!"), Eq("hello"));
-}
-
-} // namespace
-} // namespace lib
-} // namespace icing
diff --git a/icing/util/bit-util.h b/icing/util/bit-util.h
index e2bb817..7ca20b4 100644
--- a/icing/util/bit-util.h
+++ b/icing/util/bit-util.h
@@ -24,19 +24,18 @@ namespace bit_util {
// Manipulating bit fields.
//
-// x value containing the bit field(s)
-// offset offset of bit field in x
-// len len of bit field in x
+// value value containing the bit field(s)
+// lsb_offset offset of bit field in value, starting from the least significant
+// bit. for example, the '1' in '0100' has a lsb_offset of 2
+// len len of bit field in value
//
// REQUIREMENTS
//
-// - x an unsigned integer <= 64 bits
-// - offset + len <= sizeof(x) * 8
+// - value is an unsigned integer <= 64 bits
+// - lsb_offset + len <= sizeof(value) * 8
//
// There is no error checking so you will get garbage if you don't
// ensure the above.
-//
-// To set a value, use BITFIELD_CLEAR then BITFIELD_OR.
// Shifting by more than the word length is undefined (on ARM it has the
// intended effect, but on Intel it shifts by % word length), so check the
@@ -44,20 +43,65 @@ namespace bit_util {
inline uint64_t BitfieldMask(uint32_t len) {
return ((len == 0) ? 0U : ((~uint64_t{0}) >> (64 - (len))));
}
-inline uint64_t BitfieldGet(uint64_t mask, uint32_t lsb_offset, uint32_t len) {
- return ((mask) >> (lsb_offset)) & BitfieldMask(len);
+
+inline void BitfieldClear(uint32_t lsb_offset, uint32_t len,
+ uint8_t* value_out) {
+ *value_out &= ~(BitfieldMask(len) << lsb_offset);
+}
+
+inline void BitfieldClear(uint32_t lsb_offset, uint32_t len,
+ uint16_t* value_out) {
+ *value_out &= ~(BitfieldMask(len) << lsb_offset);
+}
+
+inline void BitfieldClear(uint32_t lsb_offset, uint32_t len,
+ uint32_t* value_out) {
+ *value_out &= ~(BitfieldMask(len) << lsb_offset);
+}
+
+inline void BitfieldClear(uint32_t lsb_offset, uint32_t len,
+ uint64_t* value_out) {
+ *value_out &= ~(BitfieldMask(len) << lsb_offset);
+}
+
+inline uint64_t BitfieldGet(uint64_t value, uint32_t lsb_offset, uint32_t len) {
+ return ((value) >> (lsb_offset)) & BitfieldMask(len);
+}
+
+inline void BitfieldSet(uint8_t new_value, uint32_t lsb_offset, uint32_t len,
+ uint8_t* value_out) {
+ BitfieldClear(lsb_offset, len, value_out);
+
+ // We conservatively mask new_value at len so value won't be corrupted if
+ // new_value >= (1 << len).
+ *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset);
+}
+
+inline void BitfieldSet(uint16_t new_value, uint32_t lsb_offset, uint32_t len,
+ uint16_t* value_out) {
+ BitfieldClear(lsb_offset, len, value_out);
+
+ // We conservatively mask new_value at len so value won't be corrupted if
+ // new_value >= (1 << len).
+ *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset);
}
-inline void BitfieldSet(uint32_t value, uint32_t lsb_offset, uint32_t len,
- uint32_t* mask) {
- // We conservatively mask val at len so x won't be corrupted if val >=
- // 1 << len.
- *mask |= (uint64_t{value} & BitfieldMask(len)) << (lsb_offset);
+
+inline void BitfieldSet(uint32_t new_value, uint32_t lsb_offset, uint32_t len,
+ uint32_t* value_out) {
+ BitfieldClear(lsb_offset, len, value_out);
+
+ // We conservatively mask new_value at len so value won't be corrupted if
+ // new_value >= (1 << len).
+ *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset);
}
-inline void BitfieldSet(uint64_t value, uint32_t lsb_offset, uint32_t len,
- uint64_t* mask) {
- // We conservatively mask val at len so x won't be corrupted if val >=
- // 1 << len.
- *mask |= (value & BitfieldMask(len)) << (lsb_offset);
+
+inline void BitfieldSet(uint64_t new_value, uint32_t lsb_offset, uint32_t len,
+ uint64_t* value_out) {
+ BitfieldClear(lsb_offset, len, value_out);
+
+ // We conservatively mask new_value at len so value won't be corrupted if
+ // new_value >= (1 << len).
+ *value_out |= (new_value & BitfieldMask(len)) << (lsb_offset);
}
} // namespace bit_util
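A worked example of the clear-then-set behavior added above (illustrative
only): overwriting a 3-bit field that currently holds 0b111.

  uint32_t flags = 0b01110000;  // 3-bit field at lsb_offset 4 holds 0b111
  bit_util::BitfieldSet(/*new_value=*/0b010u, /*lsb_offset=*/4, /*len=*/3,
                        &flags);
  // BitfieldClear ran first, so no stale bits survive; an OR-only set would
  // have left the field still reading 0b111.
  assert(flags == 0b00100000);
  assert(bit_util::BitfieldGet(flags, /*lsb_offset=*/4, /*len=*/3) == 0b010);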
diff --git a/icing/util/bit-util_test.cc b/icing/util/bit-util_test.cc
new file mode 100644
index 0000000..3b86a21
--- /dev/null
+++ b/icing/util/bit-util_test.cc
@@ -0,0 +1,145 @@
+// Copyright (C) 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/bit-util.h"
+
+#include <memory>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+namespace {
+
+using ::testing::Eq;
+
+TEST(BitUtilTest, BitfieldMask) {
+ // Check that we can handle up to uint8_t's
+ EXPECT_THAT(bit_util::BitfieldMask(/*len=*/0), Eq(0b0));
+ EXPECT_THAT(bit_util::BitfieldMask(/*len=*/1), Eq(0b01));
+
+ // Check that we can handle up to uint32_t's
+ EXPECT_THAT(bit_util::BitfieldMask(/*len=*/16), Eq(0b01111111111111111));
+
+ // Check that we can handle up to uint64_t's
+ EXPECT_THAT(
+ bit_util::BitfieldMask(/*len=*/63),
+ Eq(0b0111111111111111111111111111111111111111111111111111111111111111));
+}
+
+TEST(BitUtilTest, BitfieldClear) {
+ // Check that we can handle up to uint8_t's
+ uint8_t value_8 = 0b0;
+ bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b0));
+
+ value_8 = 0b01;
+ bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b00));
+
+ value_8 = 0b011;
+ bit_util::BitfieldClear(/*lsb_offset=*/1, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b001));
+
+ value_8 = 0b011;
+ bit_util::BitfieldClear(/*lsb_offset=*/0, /*len=*/2, &value_8);
+ EXPECT_THAT(value_8, Eq(0b000));
+
+ value_8 = 0b0110;
+ bit_util::BitfieldClear(/*lsb_offset=*/1, /*len=*/2, &value_8);
+ EXPECT_THAT(value_8, Eq(0b0000));
+
+ // Check that we can handle up to uint32_t's
+ uint32_t value_32 = 0b010000000000000000000000;
+ bit_util::BitfieldClear(/*lsb_offset=*/22, /*len=*/1, &value_32);
+ EXPECT_THAT(value_32, Eq(0b0));
+
+ // Check that we can handle up to uint64_t's
+ uint64_t value_64 = 0b0100000000000000000000000000000000000;
+ bit_util::BitfieldClear(/*lsb_offset=*/35, /*len=*/1, &value_64);
+ EXPECT_THAT(value_64, Eq(0b0));
+}
+
+TEST(BitUtilTest, BitfieldGet) {
+ // Get something in the uint8_t range
+ EXPECT_THAT(bit_util::BitfieldGet(0b0, /*lsb_offset=*/0, /*len=*/1), Eq(0b0));
+ EXPECT_THAT(bit_util::BitfieldGet(0b01, /*lsb_offset=*/0, /*len=*/1),
+ Eq(0b01));
+ EXPECT_THAT(bit_util::BitfieldGet(0b010, /*lsb_offset=*/1, /*len=*/1),
+ Eq(0b01));
+ EXPECT_THAT(bit_util::BitfieldGet(0b001, /*lsb_offset=*/1, /*len=*/1),
+ Eq(0b0));
+ EXPECT_THAT(bit_util::BitfieldGet(0b011, /*lsb_offset=*/0, /*len=*/2),
+ Eq(0b011));
+ EXPECT_THAT(bit_util::BitfieldGet(0b0110, /*lsb_offset=*/1, /*len=*/2),
+ Eq(0b011));
+ EXPECT_THAT(bit_util::BitfieldGet(0b0101, /*lsb_offset=*/0, /*len=*/3),
+ Eq(0b0101));
+
+ // Get something in the uint32_t range
+ EXPECT_THAT(
+ bit_util::BitfieldGet(0b01000000000000, /*lsb_offset=*/12, /*len=*/1),
+ Eq(0b01));
+
+ // Get something in the uint64_t range
+ EXPECT_THAT(bit_util::BitfieldGet(0b010000000000000000000000000000000000,
+ /*lsb_offset=*/34, /*len=*/1),
+ Eq(0b01));
+}
+
+TEST(BitUtilTest, BitfieldSet) {
+ // Set something in the uint8_t range
+ uint8_t value_8 = 0b0;
+ bit_util::BitfieldSet(0b0, /*lsb_offset=*/0, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b0));
+
+ value_8 = 0b01;
+ bit_util::BitfieldSet(0b01, /*lsb_offset=*/0, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b01));
+
+ value_8 = 0b00;
+ bit_util::BitfieldSet(0b01, /*lsb_offset=*/0, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b01));
+
+ value_8 = 0b00;
+ bit_util::BitfieldSet(0b011, /*lsb_offset=*/0, /*len=*/2, &value_8);
+ EXPECT_THAT(value_8, Eq(0b011));
+
+ value_8 = 0b01;
+ bit_util::BitfieldSet(0b011, /*lsb_offset=*/0, /*len=*/2, &value_8);
+ EXPECT_THAT(value_8, Eq(0b011));
+
+ value_8 = 0b01;
+ bit_util::BitfieldSet(0b01, /*lsb_offset=*/1, /*len=*/1, &value_8);
+ EXPECT_THAT(value_8, Eq(0b011));
+
+ value_8 = 0b0001;
+ bit_util::BitfieldSet(0b011, /*lsb_offset=*/1, /*len=*/2, &value_8);
+ EXPECT_THAT(value_8, Eq(0b0111));
+
+ // Set something in the uint32_t range
+ uint32_t value_32 = 0b0;
+ bit_util::BitfieldSet(0b01, /*lsb_offset=*/16, /*len=*/1, &value_32);
+ EXPECT_THAT(value_32, Eq(0b010000000000000000));
+
+ // Set something in the uint64_t range
+ uint64_t value_64 = 0b0;
+ bit_util::BitfieldSet(0b01, /*lsb_offset=*/34, /*len=*/1, &value_64);
+ EXPECT_THAT(value_64, Eq(0b010000000000000000000000000000000000));
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/character-iterator.cc b/icing/util/character-iterator.cc
new file mode 100644
index 0000000..0ab1e50
--- /dev/null
+++ b/icing/util/character-iterator.cc
@@ -0,0 +1,269 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/character-iterator.h"
+
+#include "icing/util/i18n-utils.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+// Returns the lead byte of the UTF-8 character that includes the byte at
+// current_byte_index within it.
+int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
+ while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
+ --current_byte_index;
+ }
+ return current_byte_index;
+}
+
+} // namespace
+
+UChar32 CharacterIterator::GetCurrentChar() {
+ if (cached_current_char_ == i18n_utils::kInvalidUChar32) {
+ // Our indices point to the right character, we just need to read that
+ // character. No need to worry about an error. If GetUChar32At fails, then
+ // current_char will be i18n_utils::kInvalidUChar32.
+ cached_current_char_ =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ }
+ return cached_current_char_;
+}
+
+bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
+ return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index)
+ : RewindToUtf8(desired_utf8_index);
+}
+
+bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
+ ResetToStartIfNecessary();
+
+ if (desired_utf8_index > text_.length()) {
+ // Enforce the requirement.
+ return false;
+ }
+ // Need to work forwards.
+ UChar32 uchar32 = cached_current_char_;
+ while (utf8_index_ < desired_utf8_index) {
+ uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the current position.
+ cached_current_char_ = i18n_utils::kInvalidUChar32;
+ return false;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > desired_utf8_index) {
+ // Ah! Don't go too far!
+ break;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
+ ++utf32_index_;
+ }
+ cached_current_char_ =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ return true;
+}
+
+bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
+ if (desired_utf8_index < 0) {
+ // Enforce the requirement.
+ return false;
+ }
+ // Need to work backwards.
+ UChar32 uchar32 = cached_current_char_;
+ while (utf8_index_ > desired_utf8_index) {
+ int utf8_index = utf8_index_ - 1;
+ utf8_index = GetUTF8StartPosition(text_, utf8_index);
+ if (utf8_index < 0) {
+ // Somehow, there wasn't a single UTF-8 lead byte at the requested
+ // index or an earlier byte.
+ cached_current_char_ = i18n_utils::kInvalidUChar32;
+ return false;
+ }
+ // We've found the start of a unicode char!
+ uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
+ int expected_length = utf8_index_ - utf8_index;
+ if (uchar32 == i18n_utils::kInvalidUChar32 ||
+ expected_length != i18n_utils::GetUtf8Length(uchar32)) {
+ // Either unable to retrieve a valid UTF-32 character at the previous
+ // position or we skipped past an invalid sequence while seeking the
+ // previous start position.
+ cached_current_char_ = i18n_utils::kInvalidUChar32;
+ return false;
+ }
+ cached_current_char_ = uchar32;
+ utf8_index_ = utf8_index;
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ --utf32_index_;
+ }
+ return true;
+}
+
+bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
+ return (desired_utf16_index > utf16_index_)
+ ? AdvanceToUtf16(desired_utf16_index)
+ : RewindToUtf16(desired_utf16_index);
+}
+
+bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
+ ResetToStartIfNecessary();
+
+ UChar32 uchar32 = cached_current_char_;
+ while (utf16_index_ < desired_utf16_index) {
+ uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the current position.
+ cached_current_char_ = i18n_utils::kInvalidUChar32;
+ return false;
+ }
+ int utf16_length = i18n_utils::GetUtf16Length(uchar32);
+ if (utf16_index_ + utf16_length > desired_utf16_index) {
+ // Ah! Don't go too far!
+ break;
+ }
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > text_.length()) {
+ // Enforce the requirement.
+ cached_current_char_ = i18n_utils::kInvalidUChar32;
+ return false;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += utf16_length;
+ ++utf32_index_;
+ }
+ cached_current_char_ =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ return true;
+}
+
+bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
+ if (desired_utf16_index < 0) {
+ return false;
+ }
+ UChar32 uchar32 = cached_current_char_;
+ while (utf16_index_ > desired_utf16_index) {
+ int utf8_index = utf8_index_ - 1;
+ utf8_index = GetUTF8StartPosition(text_, utf8_index);
+ if (utf8_index < 0) {
+ // Somehow, there wasn't a single UTF-8 lead byte at the requested
+ // index or an earlier byte.
+ cached_current_char_ = i18n_utils::kInvalidUChar32;
+ return false;
+ }
+ // We've found the start of a unicode char!
+ uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
+ int expected_length = utf8_index_ - utf8_index;
+ if (uchar32 == i18n_utils::kInvalidUChar32 ||
+ expected_length != i18n_utils::GetUtf8Length(uchar32)) {
+ // Either unable to retrieve a valid UTF-32 character at the previous
+ // position or we skipped past an invalid sequence while seeking the
+ // previous start position.
+ cached_current_char_ = i18n_utils::kInvalidUChar32;
+ return false;
+ }
+ cached_current_char_ = uchar32;
+ utf8_index_ = utf8_index;
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ --utf32_index_;
+ }
+ return true;
+}
+
+bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
+ return (desired_utf32_index > utf32_index_)
+ ? AdvanceToUtf32(desired_utf32_index)
+ : RewindToUtf32(desired_utf32_index);
+}
+
+bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
+ ResetToStartIfNecessary();
+
+ UChar32 uchar32 = cached_current_char_;
+ while (utf32_index_ < desired_utf32_index) {
+ uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ if (uchar32 == i18n_utils::kInvalidUChar32) {
+ // Unable to retrieve a valid UTF-32 character at the current position.
+ cached_current_char_ = i18n_utils::kInvalidUChar32;
+ return false;
+ }
+ int utf16_length = i18n_utils::GetUtf16Length(uchar32);
+ int utf8_length = i18n_utils::GetUtf8Length(uchar32);
+ if (utf8_index_ + utf8_length > text_.length()) {
+ // Enforce the requirement.
+ cached_current_char_ = i18n_utils::kInvalidUChar32;
+ return false;
+ }
+ utf8_index_ += utf8_length;
+ utf16_index_ += utf16_length;
+ ++utf32_index_;
+ }
+ cached_current_char_ =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
+ return true;
+}
+
+bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
+ if (desired_utf32_index < 0) {
+ return false;
+ }
+ UChar32 uchar32 = cached_current_char_;
+ while (utf32_index_ > desired_utf32_index) {
+ int utf8_index = utf8_index_ - 1;
+ utf8_index = GetUTF8StartPosition(text_, utf8_index);
+ if (utf8_index < 0) {
+ // Somehow, there wasn't a single UTF-8 lead byte at the requested
+ // index or an earlier byte.
+ cached_current_char_ = i18n_utils::kInvalidUChar32;
+ return false;
+ }
+ // We've found the start of a unicode char!
+ uchar32 =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
+ int expected_length = utf8_index_ - utf8_index;
+ if (uchar32 == i18n_utils::kInvalidUChar32 ||
+ expected_length != i18n_utils::GetUtf8Length(uchar32)) {
+ // Either unable to retrieve a valid UTF-32 character at the previous
+ // position or we skipped past an invalid sequence while seeking the
+ // previous start position.
+ cached_current_char_ = i18n_utils::kInvalidUChar32;
+ return false;
+ }
+ cached_current_char_ = uchar32;
+ utf8_index_ = utf8_index;
+ utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
+ --utf32_index_;
+ }
+ return true;
+}
+
+void CharacterIterator::ResetToStartIfNecessary() {
+ if (utf8_index_ < 0 || utf16_index_ < 0 || utf32_index_ < 0) {
+ utf8_index_ = 0;
+ utf16_index_ = 0;
+ utf32_index_ = 0;
+ cached_current_char_ =
+ i18n_utils::GetUChar32At(text_.data(), text_.length(), 0);
+ }
+}
+
+} // namespace lib
+} // namespace icing
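A hedged example of the rewind path implemented above: stepping back over a
multi-byte character keeps all three indices in lock step, with
GetUTF8StartPosition walking backwards to the lead byte.

  // "¿D": '¿' (U+00BF) encodes as two UTF-8 bytes, 'D' as one.
  CharacterIterator it("¿D", /*utf8_index=*/3, /*utf16_index=*/2,
                       /*utf32_index=*/2);  // positioned one past 'D'
  it.RewindToUtf8(0);
  // The iterator stepped back over 'D' and then over both bytes of '¿';
  // it now reads (u8:0, u16:0, u32:0) and GetCurrentChar() == U'¿'.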
diff --git a/icing/util/character-iterator.h b/icing/util/character-iterator.h
new file mode 100644
index 0000000..893718a
--- /dev/null
+++ b/icing/util/character-iterator.h
@@ -0,0 +1,116 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_CHARACTER_ITERATOR_H_
+#define ICING_UTIL_CHARACTER_ITERATOR_H_
+
+#include "icing/legacy/core/icing-string-util.h"
+#include "icing/util/i18n-utils.h"
+
+namespace icing {
+namespace lib {
+
+class CharacterIterator {
+ public:
+ explicit CharacterIterator(std::string_view text)
+ : CharacterIterator(text, 0, 0, 0) {}
+
+ CharacterIterator(std::string_view text, int utf8_index, int utf16_index,
+ int utf32_index)
+ : text_(text),
+ cached_current_char_(i18n_utils::kInvalidUChar32),
+ utf8_index_(utf8_index),
+ utf16_index_(utf16_index),
+ utf32_index_(utf32_index) {}
+
+ // Returns the character that the iterator currently points to.
+ // i18n_utils::kInvalidUChar32 if unable to read that character.
+ UChar32 GetCurrentChar();
+
+ // Moves current position to desired_utf8_index.
+ // REQUIRES: 0 <= desired_utf8_index <= text_.length()
+ bool MoveToUtf8(int desired_utf8_index);
+
+ // Advances from current position to the character that includes the specified
+ // UTF-8 index.
+ // REQUIRES: desired_utf8_index <= text_.length()
+ // desired_utf8_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf8(int desired_utf8_index);
+
+ // Rewinds from current position to the character that includes the specified
+ // UTF-8 index.
+ // REQUIRES: 0 <= desired_utf8_index
+ bool RewindToUtf8(int desired_utf8_index);
+
+ // Moves current position to desired_utf16_index.
+ // REQUIRES: 0 <= desired_utf16_index <= text_.utf16_length()
+ bool MoveToUtf16(int desired_utf16_index);
+
+ // Advances current position to desired_utf16_index.
+ // REQUIRES: desired_utf16_index <= text_.utf16_length()
+ // desired_utf16_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf16(int desired_utf16_index);
+
+ // Rewinds current position to desired_utf16_index.
+ // REQUIRES: 0 <= desired_utf16_index
+ bool RewindToUtf16(int desired_utf16_index);
+
+ // Moves current position to desired_utf32_index.
+ // REQUIRES: 0 <= desired_utf32_index <= text_.utf32_length()
+ bool MoveToUtf32(int desired_utf32_index);
+
+ // Advances current position to desired_utf32_index.
+ // REQUIRES: desired_utf32_index <= text_.utf32_length()
+ // desired_utf32_index is allowed to point one index past the end, but no
+ // further.
+ bool AdvanceToUtf32(int desired_utf32_index);
+
+ // Rewinds current position to desired_utf32_index.
+ // REQUIRES: 0 <= desired_utf32_index
+ bool RewindToUtf32(int desired_utf32_index);
+
+ int utf8_index() const { return utf8_index_; }
+ int utf16_index() const { return utf16_index_; }
+ int utf32_index() const { return utf32_index_; }
+
+ bool operator==(const CharacterIterator& rhs) const {
+ // cached_current_char_ is just that: a cached value. As such, it's not
+ // considered for equality.
+ return text_ == rhs.text_ && utf8_index_ == rhs.utf8_index_ &&
+ utf16_index_ == rhs.utf16_index_ && utf32_index_ == rhs.utf32_index_;
+ }
+
+ std::string DebugString() const {
+ return IcingStringUtil::StringPrintf("(u8:%d,u16:%d,u32:%d)", utf8_index_,
+ utf16_index_, utf32_index_);
+ }
+
+ private:
+ // Resets the character iterator to the start of the text if any of the
+ // indices are negative.
+ void ResetToStartIfNecessary();
+
+ std::string_view text_;
+ UChar32 cached_current_char_;
+ int utf8_index_;
+ int utf16_index_;
+ int utf32_index_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_CHARACTER_ITERATOR_H_
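Usage sketch for the three-index bookkeeping documented above; the
arithmetic matches the tests that follow:

  CharacterIterator it("año");  // 'ñ' (U+00F1) is two UTF-8 bytes
  it.AdvanceToUtf32(2);         // advance past "añ" by character count
  // All three views of the same position stay consistent:
  // utf8_index() == 3, utf16_index() == 2, utf32_index() == 2, and
  // GetCurrentChar() == 'o'.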
diff --git a/icing/util/character-iterator_test.cc b/icing/util/character-iterator_test.cc
new file mode 100644
index 0000000..195a47b
--- /dev/null
+++ b/icing/util/character-iterator_test.cc
@@ -0,0 +1,266 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/character-iterator.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/testing/icu-i18n-test-utils.h"
+
+namespace icing {
+namespace lib {
+
+using ::testing::Eq;
+using ::testing::IsFalse;
+using ::testing::IsTrue;
+
+TEST(CharacterIteratorTest, BasicUtf8) {
+ constexpr std::string_view kText = "¿Dónde está la biblioteca?";
+ CharacterIterator iterator(kText);
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
+
+ EXPECT_THAT(iterator.AdvanceToUtf8(4), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
+ /*utf32_index=*/2)));
+
+ EXPECT_THAT(iterator.AdvanceToUtf8(18), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
+ /*utf32_index=*/15)));
+
+ EXPECT_THAT(iterator.AdvanceToUtf8(28), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
+ /*utf32_index=*/25)));
+
+ EXPECT_THAT(iterator.AdvanceToUtf8(29), IsTrue());
+ EXPECT_THAT(iterator.GetCurrentChar(), Eq(0));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/29, /*utf16_index=*/26,
+ /*utf32_index=*/26)));
+
+ EXPECT_THAT(iterator.RewindToUtf8(28), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
+ /*utf32_index=*/25)));
+
+ EXPECT_THAT(iterator.RewindToUtf8(18), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
+ /*utf32_index=*/15)));
+
+ EXPECT_THAT(iterator.RewindToUtf8(4), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
+ /*utf32_index=*/2)));
+
+ EXPECT_THAT(iterator.RewindToUtf8(0), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/0, /*utf16_index=*/0,
+ /*utf32_index=*/0)));
+}
+
+TEST(CharacterIteratorTest, BasicUtf16) {
+ constexpr std::string_view kText = "¿Dónde está la biblioteca?";
+ CharacterIterator iterator(kText);
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
+
+ EXPECT_THAT(iterator.AdvanceToUtf16(2), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
+ /*utf32_index=*/2)));
+
+ EXPECT_THAT(iterator.AdvanceToUtf16(15), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
+ /*utf32_index=*/15)));
+
+ EXPECT_THAT(iterator.AdvanceToUtf16(25), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
+ /*utf32_index=*/25)));
+
+ EXPECT_THAT(iterator.AdvanceToUtf16(26), IsTrue());
+ EXPECT_THAT(iterator.GetCurrentChar(), Eq(0));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/29, /*utf16_index=*/26,
+ /*utf32_index=*/26)));
+
+ EXPECT_THAT(iterator.RewindToUtf16(25), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
+ /*utf32_index=*/25)));
+
+ EXPECT_THAT(iterator.RewindToUtf16(15), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
+ /*utf32_index=*/15)));
+
+ EXPECT_THAT(iterator.RewindToUtf16(2), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
+ /*utf32_index=*/2)));
+
+ EXPECT_THAT(iterator.RewindToUtf8(0), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/0, /*utf16_index=*/0,
+ /*utf32_index=*/0)));
+}
+
+TEST(CharacterIteratorTest, BasicUtf32) {
+ constexpr std::string_view kText = "¿Dónde está la biblioteca?";
+ CharacterIterator iterator(kText);
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
+
+ EXPECT_THAT(iterator.AdvanceToUtf32(2), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
+ /*utf32_index=*/2)));
+
+ EXPECT_THAT(iterator.AdvanceToUtf32(15), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
+ /*utf32_index=*/15)));
+
+ EXPECT_THAT(iterator.AdvanceToUtf32(25), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
+ /*utf32_index=*/25)));
+
+ EXPECT_THAT(iterator.AdvanceToUtf32(26), IsTrue());
+ EXPECT_THAT(iterator.GetCurrentChar(), Eq(0));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/29, /*utf16_index=*/26,
+ /*utf32_index=*/26)));
+
+ EXPECT_THAT(iterator.RewindToUtf32(25), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/28, /*utf16_index=*/25,
+ /*utf32_index=*/25)));
+
+ EXPECT_THAT(iterator.RewindToUtf32(15), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/18, /*utf16_index=*/15,
+ /*utf32_index=*/15)));
+
+ EXPECT_THAT(iterator.RewindToUtf32(2), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/3, /*utf16_index=*/2,
+ /*utf32_index=*/2)));
+
+ EXPECT_THAT(iterator.RewindToUtf32(0), IsTrue());
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
+ EXPECT_THAT(iterator,
+ Eq(CharacterIterator(kText, /*utf8_index=*/0, /*utf16_index=*/0,
+ /*utf32_index=*/0)));
+}
+
+TEST(CharacterIteratorTest, InvalidUtf) {
+ // "\255" is an invalid sequence.
+ constexpr std::string_view kText = "foo \255 bar";
+ CharacterIterator iterator(kText);
+
+ // Try to advance to the 'b' in 'bar'. This will fail and leave us pointed at
+ // the invalid sequence '\255'. GetCurrentChar() should return an invalid
+ // character.
+ EXPECT_THAT(iterator.AdvanceToUtf8(6), IsFalse());
+ EXPECT_THAT(iterator.GetCurrentChar(), Eq(i18n_utils::kInvalidUChar32));
+ CharacterIterator exp_iterator(kText, /*utf8_index=*/4, /*utf16_index=*/4,
+ /*utf32_index=*/4);
+ EXPECT_THAT(iterator, Eq(exp_iterator));
+
+ EXPECT_THAT(iterator.AdvanceToUtf16(6), IsFalse());
+ EXPECT_THAT(iterator.GetCurrentChar(), Eq(i18n_utils::kInvalidUChar32));
+ EXPECT_THAT(iterator, Eq(exp_iterator));
+
+ EXPECT_THAT(iterator.AdvanceToUtf32(6), IsFalse());
+ EXPECT_THAT(iterator.GetCurrentChar(), Eq(i18n_utils::kInvalidUChar32));
+ EXPECT_THAT(iterator, Eq(exp_iterator));
+
+ // Create the iterator with it pointing at the 'b' in 'bar'.
+ iterator = CharacterIterator(kText, /*utf8_index=*/6, /*utf16_index=*/6,
+ /*utf32_index=*/6);
+ EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
+
+ // Try to advance to the last 'o' in 'foo'. This will fail and leave us
+ // pointed at the ' ' before the invalid sequence '\255'.
+ exp_iterator = CharacterIterator(kText, /*utf8_index=*/5, /*utf16_index=*/5,
+ /*utf32_index=*/5);
+ EXPECT_THAT(iterator.RewindToUtf8(2), IsFalse());
+ EXPECT_THAT(iterator.GetCurrentChar(), Eq(' '));
+ EXPECT_THAT(iterator, Eq(exp_iterator));
+
+ EXPECT_THAT(iterator.RewindToUtf16(2), IsFalse());
+ EXPECT_THAT(iterator.GetCurrentChar(), Eq(' '));
+ EXPECT_THAT(iterator, Eq(exp_iterator));
+
+ EXPECT_THAT(iterator.RewindToUtf32(2), IsFalse());
+ EXPECT_THAT(iterator.GetCurrentChar(), Eq(' '));
+ EXPECT_THAT(iterator, Eq(exp_iterator));
+}
+
+TEST(CharacterIteratorTest, MoveToUtfNegativeIndex) {
+ constexpr std::string_view kText = "¿Dónde está la biblioteca?";
+
+ CharacterIterator iterator_utf8(kText, /*utf8_index=*/-1, /*utf16_index=*/0,
+ /*utf32_index=*/0);
+ // We should be able to successfully move when the index is negative.
+ EXPECT_THAT(iterator_utf8.MoveToUtf8(0), IsTrue());
+ // The character cache should be reset and contain the first character when
+ // resetting to index 0.
+ EXPECT_THAT(UCharToString(iterator_utf8.GetCurrentChar()), Eq("¿"));
+ EXPECT_THAT(iterator_utf8.utf8_index(), Eq(0));
+ EXPECT_THAT(iterator_utf8.utf16_index(), Eq(0));
+ EXPECT_THAT(iterator_utf8.utf32_index(), Eq(0));
+
+ CharacterIterator iterator_utf16(kText, /*utf8_index=*/0, /*utf16_index=*/-1,
+ /*utf32_index=*/0);
+ EXPECT_THAT(iterator_utf16.MoveToUtf16(1), IsTrue());
+ EXPECT_THAT(iterator_utf16.GetCurrentChar(), Eq('D'));
+ EXPECT_THAT(iterator_utf16.utf8_index(), Eq(2));
+ EXPECT_THAT(iterator_utf16.utf16_index(), Eq(1));
+ EXPECT_THAT(iterator_utf16.utf32_index(), Eq(1));
+
+ CharacterIterator iterator_utf32(kText, /*utf8_index=*/0, /*utf16_index=*/0,
+ /*utf32_index=*/-1);
+ EXPECT_THAT(iterator_utf32.MoveToUtf32(2), IsTrue());
+ EXPECT_THAT(UCharToString(iterator_utf32.GetCurrentChar()), Eq("ó"));
+ EXPECT_THAT(iterator_utf32.utf8_index(), Eq(3));
+ EXPECT_THAT(iterator_utf32.utf16_index(), Eq(2));
+ EXPECT_THAT(iterator_utf32.utf32_index(), Eq(2));
+}
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/clock.cc b/icing/util/clock.cc
index 3593f13..270b5f0 100644
--- a/icing/util/clock.cc
+++ b/icing/util/clock.cc
@@ -16,20 +16,31 @@
#include <chrono> // NOLINT. Abseil library is not available in AOSP so we have
// to use chrono to get current time in milliseconds.
+#include <memory>
namespace icing {
namespace lib {
+int64_t GetSteadyTimeNanoseconds() {
+ return std::chrono::duration_cast<std::chrono::nanoseconds>(
+ std::chrono::steady_clock::now().time_since_epoch())
+ .count();
+}
+
+int64_t GetSteadyTimeMilliseconds() {
+ return std::chrono::duration_cast<std::chrono::milliseconds>(
+ std::chrono::steady_clock::now().time_since_epoch())
+ .count();
+}
+
int64_t Clock::GetSystemTimeMilliseconds() const {
return std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
}
-uint64_t GetSteadyTimeNanoseconds() {
- return std::chrono::duration_cast<std::chrono::nanoseconds>(
- std::chrono::steady_clock::now().time_since_epoch())
- .count();
+std::unique_ptr<Timer> Clock::GetNewTimer() const {
+ return std::make_unique<Timer>();
}
} // namespace lib
diff --git a/icing/util/clock.h b/icing/util/clock.h
index 58628f3..d987a4c 100644
--- a/icing/util/clock.h
+++ b/icing/util/clock.h
@@ -16,10 +16,45 @@
#define ICING_UTIL_CLOCK_H_
#include <cstdint>
+#include <functional>
+#include <memory>
namespace icing {
namespace lib {
+// Returns the current steady time in nanoseconds. The steady clock is different
+// from the system clock. It's monotonic and never returns a lower value than a
+// previous call, while a system clock can be occasionally adjusted.
+int64_t GetSteadyTimeNanoseconds();
+
+// Returns the current steady time in milliseconds. The steady clock is
+// different from the system clock. It's monotonic and never returns a lower
+// value than a previous call, while a system clock can be occasionally
+// adjusted.
+int64_t GetSteadyTimeMilliseconds();
+
+// Used to calculate the elapsed time.
+class Timer {
+ public:
+ // Creates and starts the timer.
+ Timer() : start_timestamp_nanoseconds_(GetSteadyTimeNanoseconds()) {}
+
+ virtual ~Timer() = default;
+
+ // Returns the elapsed time in milliseconds since the timer started.
+ virtual int64_t GetElapsedMilliseconds() const {
+ return GetElapsedNanoseconds() / 1000000;
+ }
+
+ // Returns the elapsed time in nanoseconds since the timer started.
+ virtual int64_t GetElapsedNanoseconds() const {
+ return GetSteadyTimeNanoseconds() - start_timestamp_nanoseconds_;
+ }
+
+ private:
+ int64_t start_timestamp_nanoseconds_;
+};
+
// Wrapper around real-time clock functions. This is separated primarily so
// tests can override this clock and inject it into the class under test.
class Clock {
@@ -29,12 +64,39 @@ class Clock {
// Returns the current time in milliseconds; it's guaranteed that the return
// value is non-negative.
virtual int64_t GetSystemTimeMilliseconds() const;
+
+ // Returns a timer used to calculate the elapsed time. The timer starts when
+ // the method returns.
+ virtual std::unique_ptr<Timer> GetNewTimer() const;
};
-// Returns the current steady time in nanoseconds. The steady clock is different
-// from the system clock. It's monotonic and never returns a lower value than a
-// previous call, while a system clock can be occasionally adjusted.
-uint64_t GetSteadyTimeNanoseconds();
+// A convenient RAII timer class that receives a callback. Upon destruction, the
+// callback will be called with the elapsed milliseconds or nanoseconds passed
+// as a parameter, depending on which Unit was passed in the constructor.
+class ScopedTimer {
+ public:
+ enum class Unit { kMillisecond, kNanosecond };
+
+ ScopedTimer(std::unique_ptr<Timer> timer,
+ std::function<void(int64_t)> callback,
+ Unit unit = Unit::kMillisecond)
+ : timer_(std::move(timer)), callback_(std::move(callback)), unit_(unit) {}
+
+ ~ScopedTimer() {
+ if (unit_ == Unit::kMillisecond) {
+ callback_(timer_->GetElapsedMilliseconds());
+ } else {
+ callback_(timer_->GetElapsedNanoseconds());
+ }
+ }
+
+ const Timer& timer() const { return *timer_; }
+
+ private:
+ std::unique_ptr<Timer> timer_;
+ std::function<void(int64_t)> callback_;
+ Unit unit_;
+};
} // namespace lib
} // namespace icing
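For reference, a minimal sketch of how the new Timer and ScopedTimer pieces compose (the latency_ms variable and the surrounding scope are illustrative, not part of this change):

    Clock clock;
    int64_t latency_ms = 0;
    {
      // The timer starts when GetNewTimer() returns; the callback fires when
      // scoped_timer is destroyed at the end of this block.
      ScopedTimer scoped_timer(
          clock.GetNewTimer(),
          [&latency_ms](int64_t elapsed_ms) { latency_ms = elapsed_ms; });
      // ... work being measured ...
    }  // latency_ms now holds the elapsed milliseconds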
diff --git a/icing/util/crc32.h b/icing/util/crc32.h
index e8c7c8f..207a80a 100644
--- a/icing/util/crc32.h
+++ b/icing/util/crc32.h
@@ -28,10 +28,6 @@ namespace lib {
// implementation.
//
// See https://www.zlib.net/manual.html#Checksum for more details.
-//
-// TODO (samzheng): investigate/benchmark swapping zlib crc32 with
-// util/hash/crc32c.h. Regarding util/hash/crc32c.h, CRC32C::Extend crashes as
-// described in b/145837799.
class Crc32 {
public:
// Default to the checksum of an empty string, that is "0".
@@ -39,6 +35,8 @@ class Crc32 {
explicit Crc32(uint32_t init_crc) : crc_(init_crc) {}
+ explicit Crc32(std::string_view str) : crc_(0) { Append(str); }
+
inline bool operator==(const Crc32& other) const {
return crc_ == other.Get();
}
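The new string_view constructor is shorthand for appending to an empty checksum; a minimal usage sketch (assuming the Append() and Get() members already on the class):

    Crc32 crc("foo bar");           // same as: Crc32 crc; crc.Append("foo bar");
    uint32_t checksum = crc.Get();  // checksum of the whole string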
diff --git a/icing/util/data-loss.h b/icing/util/data-loss.h
new file mode 100644
index 0000000..cb19ce2
--- /dev/null
+++ b/icing/util/data-loss.h
@@ -0,0 +1,36 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_DATA_LOSS_H_
+#define ICING_UTIL_DATA_LOSS_H_
+
+namespace icing {
+namespace lib {
+
+enum DataLoss {
+ // No data loss happened. Everything initialized correctly.
+ NONE,
+
+ // Any changes made after the last persist-to-disk call were lost. This includes
+ // adding new data, removing old data, and modifying existing data.
+ PARTIAL,
+
+ // All data is lost. IcingSearchEngine has completely reset.
+ COMPLETE
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_DATA_LOSS_H_
diff --git a/icing/util/document-validator.cc b/icing/util/document-validator.cc
index 36b84f8..e0880ea 100644
--- a/icing/util/document-validator.cc
+++ b/icing/util/document-validator.cc
@@ -19,6 +19,8 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/absl_ports/canonical_errors.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
#include "icing/schema/schema-util.h"
#include "icing/util/status-macros.h"
@@ -32,12 +34,13 @@ DocumentValidator::DocumentValidator(const SchemaStore* schema_store)
: schema_store_(schema_store) {}
libtextclassifier3::Status DocumentValidator::Validate(
- const DocumentProto& document) {
+ const DocumentProto& document, int depth) {
if (document.namespace_().empty()) {
return absl_ports::InvalidArgumentError("Field 'namespace' is empty.");
}
- if (document.uri().empty()) {
+ // Only require a non-empty uri on top-level documents.
+ if (depth == 0 && document.uri().empty()) {
return absl_ports::InvalidArgumentError("Field 'uri' is empty.");
}
@@ -96,12 +99,12 @@ libtextclassifier3::Status DocumentValidator::Validate(
if (property_iter == parsed_property_configs.property_config_map.end()) {
return absl_ports::NotFoundError(absl_ports::StrCat(
"Property config '", property.name(), "' not found for key: (",
- document.namespace_(), ", ", document.uri(), ")."));
+ document.namespace_(), ", ", document.uri(),
+ ") of type: ", document.schema(), "."));
}
const PropertyConfigProto& property_config = *property_iter->second;
// Get the property value size according to data type.
- // TODO (samzheng): make sure values of other data types are empty.
int value_size = 0;
if (property_config.data_type() == PropertyConfigProto::DataType::STRING) {
value_size = property.string_values_size();
@@ -148,24 +151,28 @@ libtextclassifier3::Status DocumentValidator::Validate(
// fail, we don't need to validate the extra documents.
if (property_config.data_type() ==
PropertyConfigProto::DataType::DOCUMENT) {
- const std::string_view nested_type_expected =
- property_config.schema_type();
+ ICING_ASSIGN_OR_RETURN(
+ const std::unordered_set<SchemaTypeId>* nested_type_ids_expected,
+ schema_store_->GetSchemaTypeIdsWithChildren(
+ property_config.schema_type()));
for (const DocumentProto& nested_document : property.document_values()) {
- if (nested_type_expected.compare(nested_document.schema()) != 0) {
+ libtextclassifier3::StatusOr<SchemaTypeId> nested_document_type_id_or =
+ schema_store_->GetSchemaTypeId(nested_document.schema());
+ if (!nested_document_type_id_or.ok() ||
+ nested_type_ids_expected->count(
+ nested_document_type_id_or.ValueOrDie()) == 0) {
return absl_ports::InvalidArgumentError(absl_ports::StrCat(
- "Property '", property.name(), "' should have type '",
- nested_type_expected,
- "' but actual "
- "value has type '",
+ "Property '", property.name(), "' should be type or subtype of '",
+ property_config.schema_type(), "' but actual value has type '",
nested_document.schema(), "' for key: (", document.namespace_(),
", ", document.uri(), ")."));
}
- ICING_RETURN_IF_ERROR(Validate(nested_document));
+ ICING_RETURN_IF_ERROR(Validate(nested_document, depth + 1));
}
}
}
if (num_required_properties_actual <
- parsed_property_configs.num_required_properties) {
+ parsed_property_configs.required_properties.size()) {
return absl_ports::InvalidArgumentError(
absl_ports::StrCat("One or more required fields missing for key: (",
document.namespace_(), ", ", document.uri(), ")."));
diff --git a/icing/util/document-validator.h b/icing/util/document-validator.h
index 34a3217..28dd940 100644
--- a/icing/util/document-validator.h
+++ b/icing/util/document-validator.h
@@ -17,7 +17,6 @@
#include "icing/text_classifier/lib3/utils/base/status.h"
#include "icing/proto/document.pb.h"
-#include "icing/proto/schema.pb.h"
#include "icing/schema/schema-store.h"
namespace icing {
@@ -32,7 +31,8 @@ class DocumentValidator {
// This function validates:
// 1. DocumentProto.namespace is not empty
- // 2. DocumentProto.uri is not empty
+ // 2. DocumentProto.uri is not empty in top-level documents. Nested documents
+ // may have empty uris.
// 3. DocumentProto.schema is not empty
// 4. DocumentProto.schema matches one of SchemaTypeConfigProto.schema_type
// in the given SchemaProto in constructor
@@ -56,7 +56,8 @@ class DocumentValidator {
// In addition, all nested DocumentProto will also be validated towards the
// requirements above.
//
- // DocumentProto.custom_properties are not validated.
+ // 'depth' indicates the nesting level of the document. A top-level
+ // document has a nesting depth of 0.
//
// Returns:
// OK on success
@@ -65,7 +66,8 @@ class DocumentValidator {
// NOT_FOUND if case 4 or 7 fails
// ALREADY_EXISTS if case 6 fails
// INTERNAL on any I/O error
- libtextclassifier3::Status Validate(const DocumentProto& document);
+ libtextclassifier3::Status Validate(const DocumentProto& document,
+ int depth = 0);
void UpdateSchemaStore(const SchemaStore* schema_store) {
schema_store_ = schema_store;
diff --git a/icing/util/document-validator_test.cc b/icing/util/document-validator_test.cc
index 16bdf78..9d10b36 100644
--- a/icing/util/document-validator_test.cc
+++ b/icing/util/document-validator_test.cc
@@ -20,24 +20,31 @@
#include "gtest/gtest.h"
#include "icing/document-builder.h"
#include "icing/file/filesystem.h"
+#include "icing/proto/document.pb.h"
#include "icing/proto/schema.pb.h"
+#include "icing/schema-builder.h"
#include "icing/schema/schema-store.h"
#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
#include "icing/testing/tmp-directory.h"
namespace icing {
namespace lib {
namespace {
+
using ::testing::HasSubstr;
-// type and property names of EmailMessage
+// type and property names of EmailMessage and EmailMessageWithNote
constexpr char kTypeEmail[] = "EmailMessage";
+constexpr char kTypeEmailWithNote[] = "EmailMessageWithNote";
constexpr char kPropertySubject[] = "subject";
constexpr char kPropertyText[] = "text";
constexpr char kPropertyRecipients[] = "recipients";
+constexpr char kPropertyNote[] = "note";
// type and property names of Conversation
constexpr char kTypeConversation[] = "Conversation";
+constexpr char kTypeConversationWithEmailNote[] = "ConversationWithEmailNote";
constexpr char kPropertyName[] = "name";
constexpr char kPropertyEmails[] = "emails";
// Other values
@@ -49,41 +56,86 @@ class DocumentValidatorTest : public ::testing::Test {
DocumentValidatorTest() {}
void SetUp() override {
- SchemaProto schema;
- auto type_config = schema.add_types();
- CreateEmailTypeConfig(type_config);
-
- type_config = schema.add_types();
- CreateConversationTypeConfig(type_config);
-
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kTypeEmail)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertySubject)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyText)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyRecipients)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kTypeEmailWithNote)
+ .AddParentType(kTypeEmail)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertySubject)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyText)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyRecipients)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyNote)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kTypeConversation)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyName)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyEmails)
+ .SetDataTypeDocument(
+ kTypeEmail, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kTypeConversationWithEmailNote)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyName)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyEmails)
+ .SetDataTypeDocument(
+ kTypeEmailWithNote,
+ /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
+
+ schema_dir_ = GetTestTempDir() + "/schema_store";
+ ASSERT_TRUE(filesystem_.CreateDirectory(schema_dir_.c_str()));
ICING_ASSERT_OK_AND_ASSIGN(
- schema_store_, SchemaStore::Create(&filesystem_, GetTestTempDir()));
- ASSERT_THAT(schema_store_->SetSchema(schema), IsOk());
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_dir_, &fake_clock_));
+ ASSERT_THAT(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
document_validator_ =
std::make_unique<DocumentValidator>(schema_store_.get());
}
- static void CreateEmailTypeConfig(SchemaTypeConfigProto* type_config) {
- type_config->set_schema_type(kTypeEmail);
-
- auto subject = type_config->add_properties();
- subject->set_property_name(kPropertySubject);
- subject->set_data_type(PropertyConfigProto::DataType::STRING);
- subject->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
-
- auto text = type_config->add_properties();
- text->set_property_name(kPropertyText);
- text->set_data_type(PropertyConfigProto::DataType::STRING);
- text->set_cardinality(PropertyConfigProto::Cardinality::OPTIONAL);
-
- auto recipients = type_config->add_properties();
- recipients->set_property_name(kPropertyRecipients);
- recipients->set_data_type(PropertyConfigProto::DataType::STRING);
- recipients->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
- }
-
- static DocumentBuilder SimpleEmailBuilder() {
+ DocumentBuilder SimpleEmailBuilder() {
return DocumentBuilder()
.SetKey(kDefaultNamespace, "email/1")
.SetSchema(kTypeEmail)
@@ -93,22 +145,18 @@ class DocumentValidatorTest : public ::testing::Test {
kDefaultString);
}
- static void CreateConversationTypeConfig(SchemaTypeConfigProto* type_config) {
- type_config->set_schema_type(kTypeConversation);
-
- auto name = type_config->add_properties();
- name->set_property_name(kPropertyName);
- name->set_data_type(PropertyConfigProto::DataType::STRING);
- name->set_cardinality(PropertyConfigProto::Cardinality::REQUIRED);
-
- auto emails = type_config->add_properties();
- emails->set_property_name(kPropertyEmails);
- emails->set_data_type(PropertyConfigProto::DataType::DOCUMENT);
- emails->set_cardinality(PropertyConfigProto::Cardinality::REPEATED);
- emails->set_schema_type(kTypeEmail);
+ DocumentBuilder SimpleEmailWithNoteBuilder() {
+ return DocumentBuilder()
+ .SetKey(kDefaultNamespace, "email_with_note/1")
+ .SetSchema(kTypeEmailWithNote)
+ .AddStringProperty(kPropertySubject, kDefaultString)
+ .AddStringProperty(kPropertyText, kDefaultString)
+ .AddStringProperty(kPropertyRecipients, kDefaultString, kDefaultString,
+ kDefaultString)
+ .AddStringProperty(kPropertyNote, kDefaultString);
}
- static DocumentBuilder SimpleConversationBuilder() {
+ DocumentBuilder SimpleConversationBuilder() {
return DocumentBuilder()
.SetKey(kDefaultNamespace, "conversation/1")
.SetSchema(kTypeConversation)
@@ -118,9 +166,11 @@ class DocumentValidatorTest : public ::testing::Test {
SimpleEmailBuilder().Build());
}
- std::unique_ptr<DocumentValidator> document_validator_;
- std::unique_ptr<SchemaStore> schema_store_;
+ std::string schema_dir_;
Filesystem filesystem_;
+ FakeClock fake_clock_;
+ std::unique_ptr<SchemaStore> schema_store_;
+ std::unique_ptr<DocumentValidator> document_validator_;
};
TEST_F(DocumentValidatorTest, ValidateSimpleSchemasOk) {
@@ -138,13 +188,27 @@ TEST_F(DocumentValidatorTest, ValidateEmptyNamespaceInvalid) {
HasSubstr("'namespace' is empty")));
}
-TEST_F(DocumentValidatorTest, ValidateEmptyUriInvalid) {
+TEST_F(DocumentValidatorTest, ValidateTopLevelEmptyUriInvalid) {
DocumentProto email = SimpleEmailBuilder().SetUri("").Build();
EXPECT_THAT(document_validator_->Validate(email),
StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
HasSubstr("'uri' is empty")));
}
+TEST_F(DocumentValidatorTest, ValidateNestedEmptyUriValid) {
+ DocumentProto conversation =
+ SimpleConversationBuilder()
+ .ClearProperties()
+ .AddStringProperty(kPropertyName, kDefaultString)
+ .AddDocumentProperty(kPropertyEmails,
+ SimpleEmailBuilder()
+ .SetUri("") // Empty nested uri
+ .Build())
+ .Build();
+
+ EXPECT_THAT(document_validator_->Validate(conversation), IsOk());
+}
+
TEST_F(DocumentValidatorTest, ValidateEmptySchemaInvalid) {
DocumentProto email = SimpleEmailBuilder().SetSchema("").Build();
EXPECT_THAT(document_validator_->Validate(email),
@@ -192,18 +256,6 @@ TEST_F(DocumentValidatorTest, ValidateNonexistentPropertyNotFound) {
HasSubstr("'WrongPropertyName' not found")));
}
-TEST_F(DocumentValidatorTest, ValidateAllCustomPropertyOk) {
- DocumentProto email =
- SimpleEmailBuilder()
- // A nonexistent property, would've triggered a NotFound message
- .AddCustomStringProperty("WrongPropertyName", kDefaultString)
- // 'subject' property should've been a string according to the schema
- .AddCustomBooleanProperty(kPropertySubject, false, true)
- .Build();
-
- EXPECT_THAT(document_validator_->Validate(email), IsOk());
-}
-
TEST_F(DocumentValidatorTest, ValidateExactlyOneRequiredValueOk) {
// Required property should have exactly 1 value
DocumentProto email =
@@ -297,10 +349,82 @@ TEST_F(DocumentValidatorTest,
SimpleEmailBuilder().Build())
.Build();
- EXPECT_THAT(document_validator_->Validate(conversation),
- StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
- HasSubstr("'emails' should have type 'EmailMessage' but "
- "actual value has type 'Conversation'")));
+ EXPECT_THAT(
+ document_validator_->Validate(conversation),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("'emails' should be type or subtype of 'EmailMessage' "
+ "but actual value has type 'Conversation'")));
+}
+
+TEST_F(DocumentValidatorTest, ValidateNestedPropertyMatchSubtypeOk) {
+ DocumentProto conversation =
+ DocumentBuilder()
+ .SetKey(kDefaultNamespace, "conversation/1")
+ .SetSchema(kTypeConversation)
+ .AddStringProperty(kPropertyName, kDefaultString)
+ .AddDocumentProperty(kPropertyEmails, SimpleEmailBuilder().Build(),
+ // This is a subtype, which is ok.
+ SimpleEmailWithNoteBuilder().Build(),
+ SimpleEmailBuilder().Build())
+ .Build();
+
+ EXPECT_THAT(document_validator_->Validate(conversation), IsOk());
+}
+
+TEST_F(DocumentValidatorTest, ValidateNestedPropertyNonexistentTypeInvalid) {
+ DocumentProto conversation =
+ DocumentBuilder()
+ .SetKey(kDefaultNamespace, "conversation/1")
+ .SetSchema(kTypeConversation)
+ .AddStringProperty(kPropertyName, kDefaultString)
+ .AddDocumentProperty(
+ kPropertyEmails, SimpleEmailBuilder().Build(),
+ // Nonexistent type is not allowed
+ DocumentBuilder()
+ .SetKey(kDefaultNamespace, "email_with_note/1")
+ .SetSchema("Nonexistent")
+ .Build(),
+ SimpleEmailBuilder().Build())
+ .Build();
+
+ EXPECT_THAT(
+ document_validator_->Validate(conversation),
+ StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr("'emails' should be type or subtype of 'EmailMessage' "
+ "but actual value has type 'Nonexistent'")));
+}
+
+TEST_F(DocumentValidatorTest, ValidateNestedPropertyMatchSuperTypeInvalid) {
+ DocumentProto conversation1 =
+ DocumentBuilder()
+ .SetKey(kDefaultNamespace, "conversation_with_email_note/1")
+ .SetSchema(kTypeConversationWithEmailNote)
+ .AddStringProperty(kPropertyName, kDefaultString)
+ .AddDocumentProperty(kPropertyEmails,
+ SimpleEmailWithNoteBuilder().Build(),
+ SimpleEmailWithNoteBuilder().Build(),
+ SimpleEmailWithNoteBuilder().Build())
+ .Build();
+ EXPECT_THAT(document_validator_->Validate(conversation1), IsOk());
+
+ DocumentProto conversation2 =
+ DocumentBuilder()
+ .SetKey(kDefaultNamespace, "conversation_with_email_note/2")
+ .SetSchema(kTypeConversationWithEmailNote)
+ .AddStringProperty(kPropertyName, kDefaultString)
+ .AddDocumentProperty(kPropertyEmails,
+ SimpleEmailWithNoteBuilder().Build(),
+ // This is a super type, which is not ok.
+ SimpleEmailBuilder().Build(),
+ SimpleEmailWithNoteBuilder().Build())
+ .Build();
+ EXPECT_THAT(
+ document_validator_->Validate(conversation2),
+ StatusIs(
+ libtextclassifier3::StatusCode::INVALID_ARGUMENT,
+ HasSubstr(
+ "'emails' should be type or subtype of 'EmailMessageWithNote' "
+ "but actual value has type 'EmailMessage'")));
}
TEST_F(DocumentValidatorTest, ValidateNestedPropertyInvalid) {
@@ -321,12 +445,26 @@ TEST_F(DocumentValidatorTest, ValidateNestedPropertyInvalid) {
}
TEST_F(DocumentValidatorTest, HandleTypeConfigMapChangesOk) {
- SchemaProto email_schema;
- auto type_config = email_schema.add_types();
- CreateEmailTypeConfig(type_config);
+ SchemaProto email_schema =
+ SchemaBuilder()
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kTypeEmail)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertySubject)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyText)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyRecipients)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
- // Create a custom directory so we don't collide with the test's preset schema
- // in SetUp
+ // Create a custom directory so we don't collide
+ // with the test's preset schema in SetUp
const std::string custom_schema_dir = GetTestTempDir() + "/custom_schema";
filesystem_.DeleteDirectoryRecursively(custom_schema_dir.c_str());
filesystem_.CreateDirectoryRecursively(custom_schema_dir.c_str());
@@ -334,8 +472,11 @@ TEST_F(DocumentValidatorTest, HandleTypeConfigMapChangesOk) {
// Set a schema with only the 'Email' type
ICING_ASSERT_OK_AND_ASSIGN(
std::unique_ptr<SchemaStore> schema_store,
- SchemaStore::Create(&filesystem_, custom_schema_dir));
- ASSERT_THAT(schema_store->SetSchema(email_schema), IsOk());
+ SchemaStore::Create(&filesystem_, custom_schema_dir, &fake_clock_));
+ ASSERT_THAT(schema_store->SetSchema(
+ email_schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
DocumentValidator document_validator(schema_store.get());
@@ -347,13 +488,29 @@ TEST_F(DocumentValidatorTest, HandleTypeConfigMapChangesOk) {
HasSubstr("'Conversation' not found")));
// Add the 'Conversation' type
- SchemaProto email_and_conversation_schema = email_schema;
- type_config = email_and_conversation_schema.add_types();
- CreateConversationTypeConfig(type_config);
+ SchemaProto email_and_conversation_schema =
+ SchemaBuilder(email_schema)
+ .AddType(SchemaTypeConfigBuilder()
+ .SetType(kTypeConversation)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kPropertyName)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_REQUIRED))
+ .AddProperty(
+ PropertyConfigBuilder()
+ .SetName(kPropertyEmails)
+ .SetDataTypeDocument(
+ kTypeEmail, /*index_nested_properties=*/true)
+ .SetCardinality(CARDINALITY_REPEATED)))
+ .Build();
// DocumentValidator should be able to handle the SchemaStore getting updated
// separately
- ASSERT_THAT(schema_store->SetSchema(email_and_conversation_schema), IsOk());
+ ASSERT_THAT(
+ schema_store->SetSchema(email_and_conversation_schema,
+ /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false),
+ IsOk());
ICING_EXPECT_OK(document_validator.Validate(conversation));
}
diff --git a/icing/util/encode-util.cc b/icing/util/encode-util.cc
new file mode 100644
index 0000000..2642da7
--- /dev/null
+++ b/icing/util/encode-util.cc
@@ -0,0 +1,50 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/encode-util.h"
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+namespace icing {
+namespace lib {
+
+namespace encode_util {
+
+std::string EncodeIntToCString(uint64_t value) {
+ std::string encoded_str;
+ // Encode it in base128 and add 1 to make sure that there is no 0-byte. This
+ // increases the size of the encoded_str from 8 bytes to 10 bytes at worst.
+ do {
+ encoded_str.push_back((value & 0x7F) + 1);
+ value >>= 7;
+ } while (value);
+ return encoded_str;
+}
+
+uint64_t DecodeIntFromCString(std::string_view encoded_str) {
+ uint64_t value = 0;
+ for (int i = encoded_str.length() - 1; i >= 0; --i) {
+ value <<= 7;
+ char c = encoded_str[i] - 1;
+ value |= (c & 0x7F);
+ }
+ return value;
+}
+
+} // namespace encode_util
+
+} // namespace lib
+} // namespace icing
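A worked example of the base128-plus-one encoding above, with a value small enough to check by hand:

    // value = 300 (binary 10 0101100, i.e. two 7-bit groups: 44 and 2)
    // iteration 1: (300 & 0x7F) + 1 = 45 (0x2D); value >>= 7 leaves 2
    // iteration 2: (2 & 0x7F) + 1 = 3 (0x03); value becomes 0, loop ends
    // => EncodeIntToCString(300) == "\x2D\x03" (least significant group first)
    // DecodeIntFromCString walks the bytes in reverse and undoes the +1.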
diff --git a/icing/util/encode-util.h b/icing/util/encode-util.h
new file mode 100644
index 0000000..5a31acb
--- /dev/null
+++ b/icing/util/encode-util.h
@@ -0,0 +1,45 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_ENCODE_UTIL_H_
+#define ICING_UTIL_ENCODE_UTIL_H_
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+namespace icing {
+namespace lib {
+
+namespace encode_util {
+
+// Converts an unsigned 64-bit integer to a C string that contains no 0-byte,
+// since C strings use the 0-byte as a terminator. This increases the size of
+// the encoded string from 8 bytes to 10 bytes at worst.
+//
+// Note that it is compatible with unsigned 32-bit integers, i.e. casting a
+// uint32_t to uint64_t with the same value and encoding it with this method
+// yields the same string.
+std::string EncodeIntToCString(uint64_t value);
+
+// Converts a C string (encoded from EncodeIntToCString()) to an unsigned 64-bit
+// integer.
+uint64_t DecodeIntFromCString(std::string_view encoded_str);
+
+} // namespace encode_util
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_ENCODE_UTIL_H_
diff --git a/icing/util/encode-util_test.cc b/icing/util/encode-util_test.cc
new file mode 100644
index 0000000..c6cb984
--- /dev/null
+++ b/icing/util/encode-util_test.cc
@@ -0,0 +1,91 @@
+// Copyright (C) 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/encode-util.h"
+
+#include <cstdint>
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+namespace encode_util {
+
+namespace {
+
+using ::testing::Eq;
+using ::testing::Gt;
+using ::testing::SizeIs;
+
+TEST(EncodeUtilTest, IntCStringZeroConversion) {
+ uint64_t value = 0;
+ std::string encoded_str = EncodeIntToCString(value);
+
+ EXPECT_THAT(encoded_str, SizeIs(Gt(0)));
+ EXPECT_THAT(DecodeIntFromCString(encoded_str), Eq(value));
+}
+
+TEST(EncodeUtilTest, IntCStringConversionIsReversible) {
+ uint64_t value = 123456;
+ std::string encoded_str = EncodeIntToCString(value);
+ EXPECT_THAT(DecodeIntFromCString(encoded_str), Eq(value));
+}
+
+TEST(EncodeUtilTest, MultipleIntCStringConversionsAreReversible) {
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(25)), Eq(25));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(766)), Eq(766));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(2305)), Eq(2305));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(6922)), Eq(6922));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(62326)), Eq(62326));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(186985)), Eq(186985));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(560962)), Eq(560962));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(1682893)), Eq(1682893));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(15146065)), Eq(15146065));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(136314613)),
+ Eq(136314613));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(1226831545)),
+ Eq(1226831545));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(11041483933)),
+ Eq(11041483933));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(2683080596566)),
+ Eq(2683080596566));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(72443176107373)),
+ Eq(72443176107373));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(1955965754899162)),
+ Eq(1955965754899162));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(52811075382277465)),
+ Eq(52811075382277465));
+ EXPECT_THAT(DecodeIntFromCString(EncodeIntToCString(4277697105964474945)),
+ Eq(4277697105964474945));
+}
+
+TEST(EncodeUtilTest, MultipleValidEncodedCStringIntConversionsAreReversible) {
+ // Only valid encoded C strings (no zero bytes, length between 1 and 10) are
+ // reversible.
+ EXPECT_THAT(EncodeIntToCString(DecodeIntFromCString("foo")), Eq("foo"));
+ EXPECT_THAT(EncodeIntToCString(DecodeIntFromCString("bar")), Eq("bar"));
+ EXPECT_THAT(EncodeIntToCString(DecodeIntFromCString("baz")), Eq("baz"));
+ EXPECT_THAT(EncodeIntToCString(DecodeIntFromCString("Icing")), Eq("Icing"));
+ EXPECT_THAT(EncodeIntToCString(DecodeIntFromCString("Google")), Eq("Google"));
+ EXPECT_THAT(EncodeIntToCString(DecodeIntFromCString("Youtube")),
+ Eq("Youtube"));
+}
+
+} // namespace
+
+} // namespace encode_util
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/fingerprint-util.cc b/icing/util/fingerprint-util.cc
new file mode 100644
index 0000000..0ea843f
--- /dev/null
+++ b/icing/util/fingerprint-util.cc
@@ -0,0 +1,48 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/fingerprint-util.h"
+
+namespace icing {
+namespace lib {
+
+namespace fingerprint_util {
+
+// Converts from a fingerprint to a fingerprint string.
+std::string GetFingerprintString(uint64_t fingerprint) {
+ std::string encoded_fprint;
+ // DynamicTrie cannot handle keys with '0' as bytes. So, we encode it in
+ // base128 and add 1 to make sure that no byte is '0'. This increases the
+ // size of the encoded_fprint from 8 bytes to 10 bytes at worst.
+ while (fingerprint) {
+ encoded_fprint.push_back((fingerprint & 0x7F) + 1);
+ fingerprint >>= 7;
+ }
+ return encoded_fprint;
+}
+
+uint64_t GetFingerprint(std::string_view fingerprint_string) {
+ uint64_t fprint = 0;
+ for (int i = fingerprint_string.length() - 1; i >= 0; --i) {
+ fprint <<= 7;
+ char c = fingerprint_string[i] - 1;
+ fprint |= (c & 0x7F);
+ }
+ return fprint;
+}
+
+} // namespace fingerprint_util
+
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/fingerprint-util.h b/icing/util/fingerprint-util.h
new file mode 100644
index 0000000..9e98617
--- /dev/null
+++ b/icing/util/fingerprint-util.h
@@ -0,0 +1,47 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_FINGERPRINT_UTIL_H_
+#define ICING_UTIL_FINGERPRINT_UTIL_H_
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+
+namespace icing {
+namespace lib {
+
+namespace fingerprint_util {
+
+// Converts from a fingerprint to a fingerprint string.
+std::string GetFingerprintString(uint64_t fingerprint);
+
+// Converts from a fingerprint string to a fingerprint.
+uint64_t GetFingerprint(std::string_view fingerprint_string);
+
+// A formatter to properly handle a string that is actually just a hash value.
+class FingerprintStringFormatter {
+ public:
+ std::string operator()(std::string_view fingerprint_string) {
+ uint64_t fingerprint = GetFingerprint(fingerprint_string);
+ return std::to_string(fingerprint);
+ }
+};
+
+} // namespace fingerprint_util
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_FINGERPRINT_UTIL_H_
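A minimal round-trip sketch of these helpers (the key string is illustrative; tc3farmhash::Fingerprint64 is the hash used by the tests below):

    uint64_t fprint = tc3farmhash::Fingerprint64("namespace#uri");
    std::string trie_key = GetFingerprintString(fprint);  // contains no 0-bytes
    assert(GetFingerprint(trie_key) == fprint);           // reversible
    // Decimal rendering for debug output:
    std::string readable = FingerprintStringFormatter()(trie_key);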
diff --git a/icing/util/fingerprint-util_test.cc b/icing/util/fingerprint-util_test.cc
new file mode 100644
index 0000000..948c75a
--- /dev/null
+++ b/icing/util/fingerprint-util_test.cc
@@ -0,0 +1,75 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/fingerprint-util.h"
+
+#include <cstdint>
+#include <limits>
+
+#include "icing/text_classifier/lib3/utils/hash/farmhash.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace icing {
+namespace lib {
+namespace fingerprint_util {
+
+namespace {
+
+using ::testing::Eq;
+
+TEST(FingerprintUtilTest, ConversionIsReversible) {
+ std::string str = "foo-bar-baz";
+ uint64_t fprint = tc3farmhash::Fingerprint64(str);
+ std::string fprint_string = GetFingerprintString(fprint);
+ EXPECT_THAT(GetFingerprint(fprint_string), Eq(fprint));
+}
+
+TEST(FingerprintUtilTest, ZeroConversionIsReversible) {
+ uint64_t fprint = 0;
+ std::string fprint_string = GetFingerprintString(fprint);
+ EXPECT_THAT(GetFingerprint(fprint_string), Eq(fprint));
+}
+
+TEST(FingerprintUtilTest, MultipleConversionsAreReversible) {
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(25)), Eq(25));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(766)), Eq(766));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(2305)), Eq(2305));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(6922)), Eq(6922));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(62326)), Eq(62326));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(186985)), Eq(186985));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(560962)), Eq(560962));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(1682893)), Eq(1682893));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(15146065)), Eq(15146065));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(136314613)), Eq(136314613));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(1226831545)), Eq(1226831545));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(11041483933)),
+ Eq(11041483933));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(2683080596566)),
+ Eq(2683080596566));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(72443176107373)),
+ Eq(72443176107373));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(1955965754899162)),
+ Eq(1955965754899162));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(52811075382277465)),
+ Eq(52811075382277465));
+ EXPECT_THAT(GetFingerprint(GetFingerprintString(4277697105964474945)),
+ Eq(4277697105964474945));
+}
+
+} // namespace
+
+} // namespace fingerprint_util
+} // namespace lib
+} // namespace icing
diff --git a/icing/util/i18n-utils.cc b/icing/util/i18n-utils.cc
index 9cf992f..ada9ef2 100644
--- a/icing/util/i18n-utils.cc
+++ b/icing/util/i18n-utils.cc
@@ -38,7 +38,7 @@ namespace {
// (https://www.fileformat.info/info/unicode/category/index.htm). The set of
// characters that are regarded as punctuation is not the same for std::ispunct
// and u_ispunct.
-const std::string ascii_icu_punctuation = "!\"#%&'*,./:;?@\\_-([{}])";
+constexpr std::string_view kAsciiIcuPunctuation = "!\"#%&'*,./:;?@\\_-([{}])";
} // namespace
@@ -99,22 +99,25 @@ void SafeTruncateUtf8(std::string* str, int truncate_to_length) {
return;
}
- while (truncate_to_length > 0) {
- if (IsLeadUtf8Byte(str->at(truncate_to_length))) {
- str->resize(truncate_to_length);
- return;
+ str->resize(SafeTruncateUtf8Length(str->c_str(), truncate_to_length));
+}
+
+int SafeTruncateUtf8Length(const char* str, int desired_length) {
+ while (desired_length > 0) {
+ if (IsLeadUtf8Byte(str[desired_length])) {
+ break;
}
- truncate_to_length--;
+ --desired_length;
}
-
- // Truncates to an empty string
- str->resize(0);
+ return desired_length;
}
bool IsAscii(char c) { return U8_IS_SINGLE((uint8_t)c); }
bool IsAscii(UChar32 c) { return U8_LENGTH(c) == 1; }
+bool IsAlphaNumeric(UChar32 c) { return u_isalnum(c); }
+
int GetUtf8Length(UChar32 c) { return U8_LENGTH(c); }
int GetUtf16Length(UChar32 c) { return U16_LENGTH(c); }
@@ -126,7 +129,7 @@ bool IsPunctuationAt(std::string_view input, int position, int* char_len_out) {
if (char_len_out != nullptr) {
*char_len_out = 1;
}
- return ascii_icu_punctuation.find(input[position]) != std::string::npos;
+ return kAsciiIcuPunctuation.find(input[position]) != std::string_view::npos;
}
UChar32 c = GetUChar32At(input.data(), input.length(), position);
if (char_len_out != nullptr) {
@@ -155,7 +158,7 @@ void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar) {
uint8_t utf8_buffer[4]; // U8_APPEND writes 0 to 4 bytes
int utf8_index = 0;
- UBool has_error = FALSE;
+ UBool has_error = false;
// utf8_index is advanced to the end of the contents if successful
U8_APPEND(utf8_buffer, utf8_index, sizeof(utf8_buffer), uchar, has_error);
diff --git a/icing/util/i18n-utils.h b/icing/util/i18n-utils.h
index e103bab..491df6b 100644
--- a/icing/util/i18n-utils.h
+++ b/icing/util/i18n-utils.h
@@ -50,6 +50,13 @@ libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16(
// Returns the char at the given position.
UChar32 GetUChar32At(const char* data, int length, int position);
+// Returns the safe position to truncate a UTF8 string at so that multi-byte
+// UTF8 characters are not cut in the middle. The returned value will always be
+// 0 <= val <= desired_length.
+//
+// REQUIRES: 0 <= desired_length < strlen(str)
+int SafeTruncateUtf8Length(const char* str, int desired_length);
+
// Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut
// in the middle. The string will be truncated in place.
void SafeTruncateUtf8(std::string* str, int truncate_to_length);
@@ -60,6 +67,9 @@ bool IsAscii(char c);
// Checks if the Unicode char is within ASCII range.
bool IsAscii(UChar32 c);
+// Checks if the Unicode char is alphanumeric.
+bool IsAlphaNumeric(UChar32 c);
+
// Returns how many code units (char) are used for the UTF-8 encoding of this
// Unicode character. Returns 0 if not valid.
int GetUtf8Length(UChar32 c);
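A short sketch of the truncation contract on a multi-byte string ('é' is the two UTF-8 bytes 0xC3 0xA9):

    const char* str = "héllo";  // bytes: 'h', 0xC3, 0xA9, 'l', 'l', 'o'
    // Cutting at byte 2 would split 'é' (byte 2 is a continuation byte), so
    // the function backs up until the first dropped byte starts a character:
    int len = i18n_utils::SafeTruncateUtf8Length(str, /*desired_length=*/2);  // returns 1, keeping "h"
    len = i18n_utils::SafeTruncateUtf8Length(str, /*desired_length=*/3);      // returns 3, keeping "hé"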
diff --git a/icing/util/logging.cc b/icing/util/logging.cc
new file mode 100644
index 0000000..f60526b
--- /dev/null
+++ b/icing/util/logging.cc
@@ -0,0 +1,125 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/logging.h"
+
+#include <atomic>
+#include <exception>
+#include <string_view>
+
+#include "icing/proto/debug.pb.h"
+#include "icing/util/logging_raw.h"
+
+namespace icing {
+namespace lib {
+namespace {
+// Returns pointer to beginning of last /-separated token from file_name.
+// file_name should be a pointer to a zero-terminated array of chars.
+// E.g., "foo/bar.cc" -> "bar.cc", "foo/" -> "", "foo" -> "foo".
+const char *JumpToBasename(const char *file_name) {
+ if (file_name == nullptr) {
+ return nullptr;
+ }
+
+ // Index of the last '/' separator, or npos if there is none.
+ size_t last_token_start = std::string_view(file_name).find_last_of('/');
+ if (last_token_start == std::string_view::npos) {
+ return file_name;
+ }
+ return file_name + last_token_start + 1;
+}
+
+// Calculate the logging level value based on severity and verbosity.
+constexpr uint32_t CalculateLoggingLevel(LogSeverity::Code severity,
+ uint16_t verbosity) {
+ uint32_t logging_level = static_cast<uint16_t>(severity);
+ logging_level = (logging_level << 16) | verbosity;
+ return logging_level;
+}
+
+#if defined(ICING_DEBUG_LOGGING)
+#define DEFAULT_LOGGING_LEVEL CalculateLoggingLevel(LogSeverity::VERBOSE, 1)
+#else
+#define DEFAULT_LOGGING_LEVEL CalculateLoggingLevel(LogSeverity::INFO, 0)
+#endif
+
+// The current global logging level for Icing, which controls which logs are
+// printed based on severity and verbosity.
+//
+// This needs to be global so that it can be easily accessed from ICING_LOG and
+// ICING_VLOG macros spread throughout the entire code base.
+//
+// The first 16 bits represent the minimal log severity.
+// The last 16 bits represent the current verbosity.
+std::atomic<uint32_t> global_logging_level = DEFAULT_LOGGING_LEVEL;
+
+} // namespace
+
+// Whether we should log according to the current logging level.
+bool ShouldLog(LogSeverity::Code severity, int16_t verbosity) {
+ if (verbosity < 0) {
+ return false;
+ }
+ // Using the relaxed order for better performance because we only need to
+ // guarantee the atomicity for this specific statement, without the need to
+ // worry about reordering.
+ uint32_t curr_logging_level =
+ global_logging_level.load(std::memory_order_relaxed);
+ // If severity is below the threshold that was set, don't log.
+ if (static_cast<uint16_t>(severity) < (curr_logging_level >> 16)) {
+ return false;
+ }
+ if (severity == LogSeverity::VERBOSE) {
+ // Return whether the verbosity is within the currently set verbose level.
+ return verbosity <= (curr_logging_level & 0xffff);
+ }
+ return true;
+}
+
+bool SetLoggingLevel(LogSeverity::Code severity, int16_t verbosity) {
+ if (verbosity < 0) {
+ return false;
+ }
+ if (severity > LogSeverity::VERBOSE && verbosity > 0) {
+ return false;
+ }
+ // Using the relaxed order for better performance because we only need to
+ // guarantee the atomicity for this specific statement, without the need to
+ // worry about reordering.
+ global_logging_level.store(CalculateLoggingLevel(severity, verbosity),
+ std::memory_order_relaxed);
+ return true;
+}
+
+LogMessage::LogMessage(LogSeverity::Code severity, uint16_t verbosity,
+ const char *file_name, int line_number)
+ : severity_(severity),
+ verbosity_(verbosity),
+ should_log_(ShouldLog(severity_, verbosity_)),
+ stream_(should_log_) {
+ if (should_log_) {
+ stream_ << JumpToBasename(file_name) << ":" << line_number << ": ";
+ }
+}
+
+LogMessage::~LogMessage() {
+ if (should_log_) {
+ LowLevelLogging(severity_, kIcingLoggingTag, stream_.message);
+ }
+ if (severity_ == LogSeverity::FATAL) {
+ std::terminate(); // Will print a stacktrace (stdout or logcat).
+ }
+}
+} // namespace lib
+} // namespace icing
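To make the packing concrete, a worked sketch (assuming, for illustration, that debug.proto declares the severities in order with VERBOSE = 0 and INFO = 2; the authoritative values live in the proto):

    // CalculateLoggingLevel(INFO, 0)    == (2 << 16) | 0 == 0x00020000
    // CalculateLoggingLevel(VERBOSE, 1) == (0 << 16) | 1 == 0x00000001
    // ShouldLog() then unpacks:
    //   severity threshold = level >> 16, max verbosity = level & 0xffff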
diff --git a/icing/util/logging.h b/icing/util/logging.h
index 9d598fe..23280dc 100644
--- a/icing/util/logging.h
+++ b/icing/util/logging.h
@@ -15,14 +15,146 @@
#ifndef ICING_UTIL_LOGGING_H_
#define ICING_UTIL_LOGGING_H_
-#include "icing/text_classifier/lib3/utils/base/logging.h"
+#include <atomic>
+#include <cstdint>
+#include <string>
+#include "icing/proto/debug.pb.h"
+
+// This header provides base/logging.h style macros, ICING_LOG and ICING_VLOG,
+// for logging in various platforms. The macros use __android_log_write on
+// Android, and log to stdout/stderr on others. It also provides a function
+// SetLoggingLevel to control the log severity level for ICING_LOG and verbosity
+// for ICING_VLOG.
namespace icing {
namespace lib {
-// TODO(b/146903474) Add verbose level control
-#define ICING_VLOG(verbose_level) TC3_VLOG(verbose_level)
-#define ICING_LOG(severity) TC3_LOG(severity)
+// Whether we should log according to the current logging level.
+// The function will always return false when verbosity is negative.
+bool ShouldLog(LogSeverity::Code severity, int16_t verbosity = 0);
+
+// Set the minimal logging severity to be enabled, and the verbose level to see
+// from the logs.
+// Return false if severity is set higher than VERBOSE but verbosity is not 0.
+// The function will always return false when verbosity is negative.
+bool SetLoggingLevel(LogSeverity::Code severity, int16_t verbosity = 0);
+
+// A tiny code footprint string stream for assembling log messages.
+struct LoggingStringStream {
+ explicit LoggingStringStream(bool should_log) : should_log_(should_log) {}
+ LoggingStringStream& stream() { return *this; }
+
+ std::string message;
+ const bool should_log_;
+};
+
+template <typename T>
+inline LoggingStringStream& operator<<(LoggingStringStream& stream,
+ const T& entry) {
+ if (stream.should_log_) {
+ stream.message.append(std::to_string(entry));
+ }
+ return stream;
+}
+
+template <typename T>
+inline LoggingStringStream& operator<<(LoggingStringStream& stream,
+ T* const entry) {
+ if (stream.should_log_) {
+ stream.message.append(
+ std::to_string(reinterpret_cast<const uint64_t>(entry)));
+ }
+ return stream;
+}
+
+inline LoggingStringStream& operator<<(LoggingStringStream& stream,
+ const char* message) {
+ if (stream.should_log_) {
+ stream.message.append(message);
+ }
+ return stream;
+}
+
+inline LoggingStringStream& operator<<(LoggingStringStream& stream,
+ const std::string& message) {
+ if (stream.should_log_) {
+ stream.message.append(message);
+ }
+ return stream;
+}
+
+inline LoggingStringStream& operator<<(LoggingStringStream& stream,
+ std::string_view message) {
+ if (stream.should_log_) {
+ stream.message.append(message);
+ }
+ return stream;
+}
+
+template <typename T1, typename T2>
+inline LoggingStringStream& operator<<(LoggingStringStream& stream,
+ const std::pair<T1, T2>& entry) {
+ if (stream.should_log_) {
+ stream << "(" << entry.first << ", " << entry.second << ")";
+ }
+ return stream;
+}
+
+// The class that does all the work behind our ICING_LOG(severity) macros. Each
+// ICING_LOG(severity) << obj1 << obj2 << ...; logging statement creates a
+// LogMessage temporary object containing a stringstream. Each operator<< adds
+// info to that stringstream and the LogMessage destructor performs the actual
+// logging. The reason this works is that in C++, "all temporary objects are
+// destroyed as the last step in evaluating the full-expression that (lexically)
+// contains the point where they were created." For more info, see
+// http://en.cppreference.com/w/cpp/language/lifetime. Hence, the destructor is
+// invoked after the last << from that logging statement.
+class LogMessage {
+ public:
+ LogMessage(LogSeverity::Code severity, uint16_t verbosity,
+ const char* file_name, int line_number) __attribute__((noinline));
+
+ ~LogMessage() __attribute__((noinline));
+
+ // Returns the stream associated with the logger object.
+ LoggingStringStream& stream() { return stream_; }
+
+ private:
+ const LogSeverity::Code severity_;
+ const uint16_t verbosity_;
+ const bool should_log_;
+
+ // Stream that "prints" all info into a string (not to a file). We construct
+ // here the entire logging message and next print it in one operation.
+ LoggingStringStream stream_;
+};
+
+inline constexpr char kIcingLoggingTag[] = "AppSearchIcing";
+
+// Define consts to make it easier to refer to log severities in code.
+constexpr ::icing::lib::LogSeverity::Code VERBOSE =
+ ::icing::lib::LogSeverity::VERBOSE;
+
+constexpr ::icing::lib::LogSeverity::Code DBG = ::icing::lib::LogSeverity::DBG;
+
+constexpr ::icing::lib::LogSeverity::Code INFO =
+ ::icing::lib::LogSeverity::INFO;
+
+constexpr ::icing::lib::LogSeverity::Code WARNING =
+ ::icing::lib::LogSeverity::WARNING;
+
+constexpr ::icing::lib::LogSeverity::Code ERROR =
+ ::icing::lib::LogSeverity::ERROR;
+
+constexpr ::icing::lib::LogSeverity::Code FATAL =
+ ::icing::lib::LogSeverity::FATAL;
+
+#define ICING_VLOG(verbose_level) \
+ ::icing::lib::LogMessage(VERBOSE, verbose_level, __FILE__, __LINE__).stream()
+
+#define ICING_LOG(severity) \
+ ::icing::lib::LogMessage(severity, /*verbosity=*/0, __FILE__, __LINE__) \
+ .stream()
} // namespace lib
} // namespace icing
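A small usage sketch of the new macros and level control (messages are illustrative):

    // Only WARNING and above are printed after this call.
    SetLoggingLevel(LogSeverity::WARNING);
    ICING_LOG(INFO) << "dropped: below the WARNING threshold";
    ICING_LOG(ERROR) << "printed: code=" << 42;

    // Enable VERBOSE logs with verbosity up to 2; ICING_VLOG(3) stays silent.
    SetLoggingLevel(LogSeverity::VERBOSE, /*verbosity=*/2);
    ICING_VLOG(1) << "printed";
    ICING_VLOG(3) << "dropped";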
diff --git a/icing/util/logging_raw.cc b/icing/util/logging_raw.cc
new file mode 100644
index 0000000..44dd000
--- /dev/null
+++ b/icing/util/logging_raw.cc
@@ -0,0 +1,104 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/logging_raw.h"
+
+#include <cstdio>
+#include <string>
+
+#include "icing/proto/debug.pb.h"
+
+// NOTE: this file contains two implementations: one for Android, one for all
+// other cases. We always build exactly one implementation.
+#if defined(__ANDROID__)
+
+// Compiled as part of Android.
+#include <android/log.h>
+
+namespace icing {
+namespace lib {
+
+namespace {
+// Converts LogSeverity to level for __android_log_write.
+int GetAndroidLogLevel(LogSeverity::Code severity) {
+ switch (severity) {
+ case LogSeverity::VERBOSE:
+ return ANDROID_LOG_VERBOSE;
+ case LogSeverity::DBG:
+ return ANDROID_LOG_DEBUG;
+ case LogSeverity::INFO:
+ return ANDROID_LOG_INFO;
+ case LogSeverity::WARNING:
+ return ANDROID_LOG_WARN;
+ case LogSeverity::ERROR:
+ return ANDROID_LOG_ERROR;
+ case LogSeverity::FATAL:
+ return ANDROID_LOG_FATAL;
+ }
+}
+} // namespace
+
+void LowLevelLogging(LogSeverity::Code severity, const std::string& tag,
+ const std::string& message) {
+ const int android_log_level = GetAndroidLogLevel(severity);
+#if __ANDROID_API__ >= 30
+ if (!__android_log_is_loggable(android_log_level, tag.c_str(),
+ /*default_prio=*/ANDROID_LOG_INFO)) {
+ return;
+ }
+#endif // __ANDROID_API__ >= 30
+ __android_log_write(android_log_level, tag.c_str(), message.c_str());
+}
+
+} // namespace lib
+} // namespace icing
+
+#else // if defined(__ANDROID__)
+
+// Not on Android: implement LowLevelLogging to print to stderr (see below).
+namespace icing {
+namespace lib {
+
+namespace {
+// Converts LogSeverity to human-readable text.
+const char *LogSeverityToString(LogSeverity::Code severity) {
+ switch (severity) {
+ case LogSeverity::VERBOSE:
+ return "VERBOSE";
+ case LogSeverity::DBG:
+ return "DEBUG";
+ case LogSeverity::INFO:
+ return "INFO";
+ case LogSeverity::WARNING:
+ return "WARNING";
+ case LogSeverity::ERROR:
+ return "ERROR";
+ case LogSeverity::FATAL:
+ return "FATAL";
+ }
+}
+} // namespace
+
+void LowLevelLogging(LogSeverity::Code severity, const std::string& tag,
+ const std::string& message) {
+ // TODO(b/146903474): Do not log to stderr for logs other than FATAL and ERROR.
+ fprintf(stderr, "[%s] %s : %s\n", LogSeverityToString(severity), tag.c_str(),
+ message.c_str());
+ fflush(stderr);
+}
+
+} // namespace lib
+} // namespace icing
+
+#endif // if defined(__ANDROID__)
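
LowLevelLogging is the single sink shared by both builds: on Android it forwards to __android_log_write (consulting __android_log_is_loggable first on API 30+), and everywhere else it prints to stderr and flushes. A hedged sketch of a direct call that bypasses the ICING_LOG filtering (tag and message are illustrative):

    #include "icing/util/logging_raw.h"

    void EmitDirectly() {
      // Goes straight to logcat on Android, to stderr elsewhere.
      icing::lib::LowLevelLogging(icing::lib::LogSeverity::WARNING,
                                  /*tag=*/"IcingExample",
                                  /*message=*/"raw warning message");
    }
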
diff --git a/icing/util/logging_raw.h b/icing/util/logging_raw.h
new file mode 100644
index 0000000..99dddb6
--- /dev/null
+++ b/icing/util/logging_raw.h
@@ -0,0 +1,34 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_UTIL_LOGGING_RAW_H_
+#define ICING_UTIL_LOGGING_RAW_H_
+
+#include <string>
+
+#include "icing/proto/debug.pb.h"
+
+namespace icing {
+namespace lib {
+
+// Low-level logging primitive. Logs a message, with the indicated log
+// severity. From android/log.h: "the tag normally corresponds to the component
+// that emits the log message, and should be reasonably small".
+void LowLevelLogging(LogSeverity::Code severity, const std::string& tag,
+ const std::string& message);
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_UTIL_LOGGING_RAW_H_
diff --git a/icing/util/logging_test.cc b/icing/util/logging_test.cc
new file mode 100644
index 0000000..eac018e
--- /dev/null
+++ b/icing/util/logging_test.cc
@@ -0,0 +1,158 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/logging.h"
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/proto/debug.pb.h"
+#include "icing/util/logging_raw.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+using ::testing::EndsWith;
+using ::testing::IsEmpty;
+
+TEST(LoggingTest, SetLoggingLevelWithInvalidArguments) {
+ EXPECT_FALSE(SetLoggingLevel(LogSeverity::DBG, 1));
+ EXPECT_FALSE(SetLoggingLevel(LogSeverity::INFO, 1));
+ EXPECT_FALSE(SetLoggingLevel(LogSeverity::WARNING, 1));
+ EXPECT_FALSE(SetLoggingLevel(LogSeverity::ERROR, 1));
+ EXPECT_FALSE(SetLoggingLevel(LogSeverity::FATAL, 1));
+
+ EXPECT_FALSE(SetLoggingLevel(LogSeverity::DBG, 2));
+ EXPECT_FALSE(SetLoggingLevel(LogSeverity::INFO, 2));
+ EXPECT_FALSE(SetLoggingLevel(LogSeverity::WARNING, 2));
+ EXPECT_FALSE(SetLoggingLevel(LogSeverity::ERROR, 2));
+ EXPECT_FALSE(SetLoggingLevel(LogSeverity::FATAL, 2));
+
+ EXPECT_FALSE(SetLoggingLevel(LogSeverity::VERBOSE, -1));
+}
+
+TEST(LoggingTest, SetLoggingLevelTest) {
+ // Set to INFO
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::INFO));
+ EXPECT_FALSE(ShouldLog(LogSeverity::DBG));
+ EXPECT_TRUE(ShouldLog(LogSeverity::INFO));
+ EXPECT_TRUE(ShouldLog(LogSeverity::WARNING));
+
+ // Set to WARNING
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::WARNING));
+ EXPECT_FALSE(ShouldLog(LogSeverity::DBG));
+ EXPECT_FALSE(ShouldLog(LogSeverity::INFO));
+ EXPECT_TRUE(ShouldLog(LogSeverity::WARNING));
+
+ // Set to DEBUG
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::DBG));
+ EXPECT_TRUE(ShouldLog(LogSeverity::DBG));
+ EXPECT_TRUE(ShouldLog(LogSeverity::INFO));
+ EXPECT_TRUE(ShouldLog(LogSeverity::WARNING));
+}
+
+TEST(LoggingTest, VerboseLoggingTest) {
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::VERBOSE, 1));
+ EXPECT_TRUE(ShouldLog(LogSeverity::VERBOSE, 1));
+ EXPECT_TRUE(ShouldLog(LogSeverity::DBG));
+ EXPECT_TRUE(ShouldLog(LogSeverity::INFO));
+ EXPECT_TRUE(ShouldLog(LogSeverity::WARNING));
+ EXPECT_TRUE(ShouldLog(LogSeverity::ERROR));
+ EXPECT_TRUE(ShouldLog(LogSeverity::FATAL));
+}
+
+TEST(LoggingTest, VerboseLoggingIsControlledByVerbosity) {
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::VERBOSE, 2));
+ EXPECT_FALSE(ShouldLog(LogSeverity::VERBOSE, 3));
+ EXPECT_TRUE(ShouldLog(LogSeverity::VERBOSE, 2));
+ EXPECT_TRUE(ShouldLog(LogSeverity::VERBOSE, 1));
+
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::VERBOSE, 1));
+ EXPECT_FALSE(ShouldLog(LogSeverity::VERBOSE, 2));
+ EXPECT_TRUE(ShouldLog(LogSeverity::VERBOSE, 1));
+
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::VERBOSE, 0));
+ EXPECT_FALSE(ShouldLog(LogSeverity::VERBOSE, 1));
+ EXPECT_TRUE(ShouldLog(LogSeverity::VERBOSE, 0));
+
+ // Negative verbosity is invalid.
+ EXPECT_FALSE(ShouldLog(LogSeverity::VERBOSE, -1));
+}
+
+TEST(LoggingTest, DebugLoggingTest) {
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::DBG));
+ EXPECT_FALSE(ShouldLog(LogSeverity::VERBOSE, 1));
+ EXPECT_TRUE(ShouldLog(LogSeverity::DBG));
+ EXPECT_TRUE(ShouldLog(LogSeverity::INFO));
+ EXPECT_TRUE(ShouldLog(LogSeverity::WARNING));
+ EXPECT_TRUE(ShouldLog(LogSeverity::ERROR));
+ EXPECT_TRUE(ShouldLog(LogSeverity::FATAL));
+}
+
+TEST(LoggingTest, InfoLoggingTest) {
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::INFO));
+ EXPECT_FALSE(ShouldLog(LogSeverity::VERBOSE, 1));
+ EXPECT_FALSE(ShouldLog(LogSeverity::DBG));
+ EXPECT_TRUE(ShouldLog(LogSeverity::INFO));
+ EXPECT_TRUE(ShouldLog(LogSeverity::WARNING));
+ EXPECT_TRUE(ShouldLog(LogSeverity::ERROR));
+ EXPECT_TRUE(ShouldLog(LogSeverity::FATAL));
+}
+
+TEST(LoggingTest, WarningLoggingTest) {
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::WARNING));
+ EXPECT_FALSE(ShouldLog(LogSeverity::VERBOSE, 1));
+ EXPECT_FALSE(ShouldLog(LogSeverity::DBG));
+ EXPECT_FALSE(ShouldLog(LogSeverity::INFO));
+ EXPECT_TRUE(ShouldLog(LogSeverity::WARNING));
+ EXPECT_TRUE(ShouldLog(LogSeverity::ERROR));
+ EXPECT_TRUE(ShouldLog(LogSeverity::FATAL));
+}
+
+TEST(LoggingTest, ErrorLoggingTest) {
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::ERROR));
+ EXPECT_FALSE(ShouldLog(LogSeverity::VERBOSE, 1));
+ EXPECT_FALSE(ShouldLog(LogSeverity::DBG));
+ EXPECT_FALSE(ShouldLog(LogSeverity::INFO));
+ EXPECT_FALSE(ShouldLog(LogSeverity::WARNING));
+ EXPECT_TRUE(ShouldLog(LogSeverity::ERROR));
+ EXPECT_TRUE(ShouldLog(LogSeverity::FATAL));
+}
+
+TEST(LoggingTest, FatalLoggingTest) {
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::FATAL));
+ EXPECT_FALSE(ShouldLog(LogSeverity::VERBOSE, 1));
+ EXPECT_FALSE(ShouldLog(LogSeverity::DBG));
+ EXPECT_FALSE(ShouldLog(LogSeverity::INFO));
+ EXPECT_FALSE(ShouldLog(LogSeverity::WARNING));
+ EXPECT_FALSE(ShouldLog(LogSeverity::ERROR));
+ EXPECT_TRUE(ShouldLog(LogSeverity::FATAL));
+}
+
+TEST(LoggingTest, LoggingStreamTest) {
+ ASSERT_TRUE(SetLoggingLevel(LogSeverity::INFO));
+ // This one should be logged.
+ LoggingStringStream stream1 = (ICING_LOG(INFO) << "Hello"
+ << "World!");
+ EXPECT_THAT(stream1.message, EndsWith("HelloWorld!"));
+
+ // This one should not be logged, thus empty.
+ LoggingStringStream stream2 = (ICING_LOG(DBG) << "Hello"
+ << "World!");
+ EXPECT_THAT(stream2.message, IsEmpty());
+}
+
+} // namespace
+} // namespace lib
+} // namespace icing
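
Taken together, these tests pin down the filtering rule: a message passes when its severity is at least the configured level, and a VERBOSE message must additionally carry a verbosity no greater than the configured one. A compact restatement of a few expectations exercised above:

    // After SetLoggingLevel(LogSeverity::VERBOSE, /*verbosity=*/1):
    //   ShouldLog(LogSeverity::VERBOSE, /*verbosity=*/1) -> true
    //   ShouldLog(LogSeverity::VERBOSE, /*verbosity=*/2) -> false (too verbose)
    //   ShouldLog(LogSeverity::DBG)                      -> true  (higher severity)
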
diff --git a/icing/util/math-util.h b/icing/util/math-util.h
index fc11a09..3f2a69d 100644
--- a/icing/util/math-util.h
+++ b/icing/util/math-util.h
@@ -37,7 +37,7 @@ inline double SafeDivide(double first, double second) {
template <typename IntType>
static IntType RoundDownTo(IntType input_value, IntType rounding_value) {
static_assert(std::numeric_limits<IntType>::is_integer,
- "RoundUpTo() operation type is not integer");
+ "RoundDownTo() operation type is not integer");
if (input_value <= 0) {
return 0;
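
For reference, RoundDownTo rounds an integer down to a multiple of rounding_value, clamping non-positive inputs to 0 per the guard visible above. Illustrative values, assuming the conventional truncating (input / rounding) * rounding implementation that the assert message names:

    //   RoundDownTo(37, 8) -> 32
    //   RoundDownTo(40, 8) -> 40
    //   RoundDownTo(-5, 8) -> 0   (non-positive input clamps to 0)
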
diff --git a/icing/util/snippet-helpers.cc b/icing/util/snippet-helpers.cc
new file mode 100644
index 0000000..ca6f423
--- /dev/null
+++ b/icing/util/snippet-helpers.cc
@@ -0,0 +1,94 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/snippet-helpers.h"
+
+#include <algorithm>
+#include <string_view>
+#include <vector>
+
+#include "icing/proto/document.pb.h"
+#include "icing/proto/search.pb.h"
+#include "icing/schema/property-util.h"
+
+namespace icing {
+namespace lib {
+
+std::vector<std::string_view> GetWindows(
+ std::string_view content, const SnippetProto::EntryProto& snippet_proto) {
+ std::vector<std::string_view> windows;
+ for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) {
+ windows.push_back(content.substr(match.window_byte_position(),
+ match.window_byte_length()));
+ }
+ return windows;
+}
+
+std::vector<std::string_view> GetMatches(
+ std::string_view content, const SnippetProto::EntryProto& snippet_proto) {
+ std::vector<std::string_view> matches;
+ for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) {
+ matches.push_back(content.substr(match.exact_match_byte_position(),
+ match.exact_match_byte_length()));
+ }
+ return matches;
+}
+
+std::vector<std::string_view> GetSubMatches(
+ std::string_view content, const SnippetProto::EntryProto& snippet_proto) {
+ std::vector<std::string_view> matches;
+ for (const SnippetMatchProto& match : snippet_proto.snippet_matches()) {
+ matches.push_back(content.substr(match.exact_match_byte_position(),
+ match.submatch_byte_length()));
+ }
+ return matches;
+}
+
+std::string_view GetString(const DocumentProto* document,
+ std::string_view property_path_expr) {
+ std::vector<std::string_view> properties =
+ property_util::SplitPropertyPathExpr(property_path_expr);
+ for (int i = 0; i < properties.size(); ++i) {
+ property_util::PropertyInfo property_info =
+ property_util::ParsePropertyNameExpr(properties.at(i));
+ if (property_info.index == property_util::kWildcardPropertyIndex) {
+ // Use index = 0 by default.
+ property_info.index = 0;
+ }
+
+ const PropertyProto* prop =
+ property_util::GetPropertyProto(*document, property_info.name);
+ if (prop == nullptr) {
+ // The requested property doesn't exist in the document. Return an empty string.
+ return "";
+ }
+ if (i == properties.size() - 1) {
+ // The last property. Get the requested string value.
+ if (prop->string_values_size() - 1 < property_info.index) {
+ // The requested string doesn't exist. Return an empty string.
+ return "";
+ }
+ return prop->string_values(property_info.index);
+ } else if (prop->document_values_size() - 1 < property_info.index) {
+ // The requested subproperty doesn't exist. Return an empty string.
+ return "";
+ } else {
+ // Go to the next subproperty.
+ document = &prop->document_values(property_info.index);
+ }
+ }
+ return "";
+}
+
+} // namespace lib
+} // namespace icing
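
Each helper slices content with byte offsets carried by SnippetMatchProto, so the returned string_views alias the caller's buffer rather than copying it. A minimal sketch with illustrative field values (the setters are assumed to mirror the getters used above):

    std::string content = "hello world, hello icing";
    SnippetProto::EntryProto entry;
    SnippetMatchProto* match = entry.add_snippet_matches();
    match->set_window_byte_position(0);
    match->set_window_byte_length(12);      // "hello world,"
    match->set_exact_match_byte_position(6);
    match->set_exact_match_byte_length(5);  // "world"
    // GetWindows(content, entry) -> {"hello world,"}
    // GetMatches(content, entry) -> {"world"}
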
diff --git a/icing/util/snippet-helpers.h b/icing/util/snippet-helpers.h
new file mode 100644
index 0000000..d7349ba
--- /dev/null
+++ b/icing/util/snippet-helpers.h
@@ -0,0 +1,60 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_TESTING_SNIPPET_HELPERS_H_
+#define ICING_TESTING_SNIPPET_HELPERS_H_
+
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/proto/document.pb.h"
+#include "icing/proto/search.pb.h"
+
+namespace icing {
+namespace lib {
+
+// Retrieve pointer to the PropertyProto identified by property_name.
+// Returns nullptr if no such property exists.
+//
+// NOTE: This function does not handle nesting or indexes. "foo.bar" will return
+// a nullptr even if the document contains a property called "foo" that contains a
+// subproperty called "bar".
+const PropertyProto* GetProperty(const DocumentProto& document,
+ const std::string& property_name);
+
+// Retrieves all windows defined by the snippet_proto for the content.
+std::vector<std::string_view> GetWindows(
+ std::string_view content, const SnippetProto::EntryProto& snippet_proto);
+
+// Retrieves all matches defined by the snippet_proto for the content.
+std::vector<std::string_view> GetMatches(
+ std::string_view content, const SnippetProto::EntryProto& snippet_proto);
+
+// Retrieves all submatches defined by the snippet_proto for the content.
+std::vector<std::string_view> GetSubMatches(
+ std::string_view content, const SnippetProto::EntryProto& snippet_proto);
+
+// Retrieves the string value held in the document corresponding to the
+// property_path_expr.
+// Example:
+// - GetString(doc, "foo") will retrieve the first string value in the
+// property "foo" in document or an empty string if it doesn't exist.
+// - GetString(doc, "foo[1].bar[2]") will retrieve the third string value in
+// the subproperty "bar" of the second document value in the property "foo".
+std::string_view GetString(const DocumentProto* document,
+ std::string_view property_path_expr);
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_TESTING_SNIPPET_HELPERS_H_
diff --git a/icing/util/tokenized-document.cc b/icing/util/tokenized-document.cc
new file mode 100644
index 0000000..19aaddf
--- /dev/null
+++ b/icing/util/tokenized-document.cc
@@ -0,0 +1,92 @@
+// Copyright (C) 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/tokenized-document.h"
+
+#include <memory>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/status.h"
+#include "icing/proto/document.pb.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "icing/tokenization/tokenizer-factory.h"
+#include "icing/tokenization/tokenizer.h"
+#include "icing/util/document-validator.h"
+#include "icing/util/status-macros.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+libtextclassifier3::StatusOr<std::vector<TokenizedSection>> Tokenize(
+ const SchemaStore* schema_store,
+ const LanguageSegmenter* language_segmenter,
+ const std::vector<Section<std::string_view>>& string_sections) {
+ std::vector<TokenizedSection> tokenized_string_sections;
+ for (const Section<std::string_view>& section : string_sections) {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
+ tokenizer_factory::CreateIndexingTokenizer(
+ section.metadata.tokenizer, language_segmenter));
+ std::vector<std::string_view> token_sequence;
+ for (std::string_view subcontent : section.content) {
+ ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr,
+ tokenizer->Tokenize(subcontent));
+ while (itr->Advance()) {
+ std::vector<Token> batch_tokens = itr->GetTokens();
+ for (const Token& token : batch_tokens) {
+ token_sequence.push_back(token.text);
+ }
+ }
+ }
+ tokenized_string_sections.emplace_back(SectionMetadata(section.metadata),
+ std::move(token_sequence));
+ }
+
+ return tokenized_string_sections;
+}
+
+} // namespace
+
+/* static */ libtextclassifier3::StatusOr<TokenizedDocument>
+TokenizedDocument::Create(const SchemaStore* schema_store,
+ const LanguageSegmenter* language_segmenter,
+ DocumentProto document) {
+ DocumentValidator validator(schema_store);
+ ICING_RETURN_IF_ERROR(validator.Validate(document));
+
+ ICING_ASSIGN_OR_RETURN(SectionGroup section_group,
+ schema_store->ExtractSections(document));
+
+ ICING_ASSIGN_OR_RETURN(JoinablePropertyGroup joinable_property_group,
+ schema_store->ExtractJoinableProperties(document));
+
+ // Tokenize string sections.
+ ICING_ASSIGN_OR_RETURN(
+ std::vector<TokenizedSection> tokenized_string_sections,
+ Tokenize(schema_store, language_segmenter,
+ section_group.string_sections));
+
+ return TokenizedDocument(std::move(document),
+ std::move(tokenized_string_sections),
+ std::move(section_group.integer_sections),
+ std::move(joinable_property_group));
+}
+
+} // namespace lib
+} // namespace icing
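
Create() validates the document, asks the schema store for its sections and joinable properties, then flattens each string section into one token_sequence using that section's configured tokenizer. For example, with the plain tokenizer (mirroring the test expectations further below):

    // A repeated string property {"test foo", "test bar"} tokenizes into the
    // single flat sequence {"test", "foo", "test", "bar"}, and those four
    // tokens count toward num_string_tokens().
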
diff --git a/icing/util/tokenized-document.h b/icing/util/tokenized-document.h
new file mode 100644
index 0000000..7cc34e3
--- /dev/null
+++ b/icing/util/tokenized-document.h
@@ -0,0 +1,92 @@
+// Copyright (C) 2020 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_STORE_TOKENIZED_DOCUMENT_H_
+#define ICING_STORE_TOKENIZED_DOCUMENT_H_
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+#include "icing/proto/document.pb.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/tokenization/language-segmenter.h"
+
+namespace icing {
+namespace lib {
+
+struct TokenizedSection {
+ SectionMetadata metadata;
+ std::vector<std::string_view> token_sequence;
+
+ TokenizedSection(SectionMetadata&& metadata_in,
+ std::vector<std::string_view>&& token_sequence_in)
+ : metadata(std::move(metadata_in)),
+ token_sequence(std::move(token_sequence_in)) {}
+};
+
+class TokenizedDocument {
+ public:
+ static libtextclassifier3::StatusOr<TokenizedDocument> Create(
+ const SchemaStore* schema_store,
+ const LanguageSegmenter* language_segmenter, DocumentProto document);
+
+ const DocumentProto& document() const { return document_; }
+
+ int32_t num_string_tokens() const {
+ int32_t num_string_tokens = 0;
+ for (const TokenizedSection& section : tokenized_string_sections_) {
+ num_string_tokens += section.token_sequence.size();
+ }
+ return num_string_tokens;
+ }
+
+ const std::vector<TokenizedSection>& tokenized_string_sections() const {
+ return tokenized_string_sections_;
+ }
+
+ const std::vector<Section<int64_t>>& integer_sections() const {
+ return integer_sections_;
+ }
+
+ const std::vector<JoinableProperty<std::string_view>>&
+ qualified_id_join_properties() const {
+ return joinable_property_group_.qualified_id_properties;
+ }
+
+ private:
+ // Use TokenizedDocument::Create() to instantiate.
+ explicit TokenizedDocument(
+ DocumentProto&& document,
+ std::vector<TokenizedSection>&& tokenized_string_sections,
+ std::vector<Section<int64_t>>&& integer_sections,
+ JoinablePropertyGroup&& joinable_property_group)
+ : document_(std::move(document)),
+ tokenized_string_sections_(std::move(tokenized_string_sections)),
+ integer_sections_(std::move(integer_sections)),
+ joinable_property_group_(std::move(joinable_property_group)) {}
+
+ DocumentProto document_;
+ std::vector<TokenizedSection> tokenized_string_sections_;
+ std::vector<Section<int64_t>> integer_sections_;
+ JoinablePropertyGroup joinable_property_group_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_STORE_TOKENIZED_DOCUMENT_H_
diff --git a/icing/util/tokenized-document_test.cc b/icing/util/tokenized-document_test.cc
new file mode 100644
index 0000000..7c97776
--- /dev/null
+++ b/icing/util/tokenized-document_test.cc
@@ -0,0 +1,455 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "icing/util/tokenized-document.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "icing/document-builder.h"
+#include "icing/file/filesystem.h"
+#include "icing/portable/platform.h"
+#include "icing/proto/document.pb.h"
+#include "icing/proto/schema.pb.h"
+#include "icing/proto/term.pb.h"
+#include "icing/schema-builder.h"
+#include "icing/schema/joinable-property.h"
+#include "icing/schema/schema-store.h"
+#include "icing/schema/section.h"
+#include "icing/testing/common-matchers.h"
+#include "icing/testing/fake-clock.h"
+#include "icing/testing/icu-data-file-helper.h"
+#include "icing/testing/test-data.h"
+#include "icing/testing/tmp-directory.h"
+#include "icing/tokenization/language-segmenter-factory.h"
+#include "icing/tokenization/language-segmenter.h"
+#include "unicode/uloc.h"
+
+namespace icing {
+namespace lib {
+
+namespace {
+
+using ::icing::lib::portable_equals_proto::EqualsProto;
+using ::testing::ElementsAre;
+using ::testing::Eq;
+using ::testing::IsEmpty;
+using ::testing::SizeIs;
+
+// Schema types.
+static constexpr std::string_view kFakeType = "FakeType";
+
+// Indexable properties and section ids. A section id is determined by the
+// lexicographical order of its indexable property path.
+static constexpr std::string_view kIndexableIntegerProperty1 =
+ "indexableInteger1";
+static constexpr std::string_view kIndexableIntegerProperty2 =
+ "indexableInteger2";
+static constexpr std::string_view kStringExactProperty = "stringExact";
+static constexpr std::string_view kStringPrefixProperty = "stringPrefix";
+
+static constexpr SectionId kIndexableInteger1SectionId = 0;
+static constexpr SectionId kIndexableInteger2SectionId = 1;
+static constexpr SectionId kStringExactSectionId = 2;
+static constexpr SectionId kStringPrefixSectionId = 3;
+
+// Joinable properties and joinable property ids. A joinable property id is
+// determined by the lexicographical order of its joinable property path.
+static constexpr std::string_view kQualifiedId1 = "qualifiedId1";
+static constexpr std::string_view kQualifiedId2 = "qualifiedId2";
+
+static constexpr JoinablePropertyId kQualifiedId1JoinablePropertyId = 0;
+static constexpr JoinablePropertyId kQualifiedId2JoinablePropertyId = 1;
+
+const SectionMetadata kIndexableInteger1SectionMetadata(
+ kIndexableInteger1SectionId, TYPE_INT64, TOKENIZER_NONE, TERM_MATCH_UNKNOWN,
+ NUMERIC_MATCH_RANGE, std::string(kIndexableIntegerProperty1));
+
+const SectionMetadata kIndexableInteger2SectionMetadata(
+ kIndexableInteger2SectionId, TYPE_INT64, TOKENIZER_NONE, TERM_MATCH_UNKNOWN,
+ NUMERIC_MATCH_RANGE, std::string(kIndexableIntegerProperty2));
+
+const SectionMetadata kStringExactSectionMetadata(
+ kStringExactSectionId, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_EXACT,
+ NUMERIC_MATCH_UNKNOWN, std::string(kStringExactProperty));
+
+const SectionMetadata kStringPrefixSectionMetadata(
+ kStringPrefixSectionId, TYPE_STRING, TOKENIZER_PLAIN, TERM_MATCH_PREFIX,
+ NUMERIC_MATCH_UNKNOWN, std::string(kStringPrefixProperty));
+
+const JoinablePropertyMetadata kQualifiedId1JoinablePropertyMetadata(
+ kQualifiedId1JoinablePropertyId, TYPE_STRING,
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID, std::string(kQualifiedId1));
+
+const JoinablePropertyMetadata kQualifiedId2JoinablePropertyMetadata(
+ kQualifiedId2JoinablePropertyId, TYPE_STRING,
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID, std::string(kQualifiedId2));
+
+// Other non-indexable/joinable properties.
+constexpr std::string_view kUnindexedStringProperty = "unindexedString";
+constexpr std::string_view kUnindexedIntegerProperty = "unindexedInteger";
+
+class TokenizedDocumentTest : public ::testing::Test {
+ protected:
+ void SetUp() override {
+ test_dir_ = GetTestTempDir() + "/icing";
+ schema_store_dir_ = test_dir_ + "/schema_store";
+ filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str());
+
+ if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
+ ICING_ASSERT_OK(
+ // File generated via icu_data_file rule in //icing/BUILD.
+ icu_data_file_helper::SetUpICUDataFile(
+ GetTestFilePath("icing/icu.dat")));
+ }
+
+ language_segmenter_factory::SegmenterOptions options(ULOC_US);
+ ICING_ASSERT_OK_AND_ASSIGN(
+ lang_segmenter_,
+ language_segmenter_factory::Create(std::move(options)));
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ schema_store_,
+ SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
+
+ SchemaProto schema =
+ SchemaBuilder()
+ .AddType(
+ SchemaTypeConfigBuilder()
+ .SetType(kFakeType)
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kUnindexedStringProperty)
+ .SetDataType(TYPE_STRING)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kUnindexedIntegerProperty)
+ .SetDataType(TYPE_INT64)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kIndexableIntegerProperty1)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kIndexableIntegerProperty2)
+ .SetDataTypeInt64(NUMERIC_MATCH_RANGE)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kStringExactProperty)
+ .SetDataTypeString(TERM_MATCH_EXACT,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_REPEATED))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kStringPrefixProperty)
+ .SetDataTypeString(TERM_MATCH_PREFIX,
+ TOKENIZER_PLAIN)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kQualifiedId1)
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL))
+ .AddProperty(PropertyConfigBuilder()
+ .SetName(kQualifiedId2)
+ .SetDataTypeJoinableString(
+ JOINABLE_VALUE_TYPE_QUALIFIED_ID)
+ .SetCardinality(CARDINALITY_OPTIONAL)))
+ .Build();
+ ICING_ASSERT_OK(schema_store_->SetSchema(
+ schema, /*ignore_errors_and_delete_documents=*/false,
+ /*allow_circular_schema_definitions=*/false));
+ }
+
+ void TearDown() override {
+ schema_store_.reset();
+
+ // Check that the schema store directory is the *only* directory in
+ // test_dir_. In other words, ensure that all temporary directories have
+ // been properly cleaned up.
+ std::vector<std::string> sub_dirs;
+ ASSERT_TRUE(filesystem_.ListDirectory(test_dir_.c_str(), &sub_dirs));
+ ASSERT_THAT(sub_dirs, ElementsAre("schema_store"));
+
+ // Finally, clean everything up.
+ ASSERT_TRUE(filesystem_.DeleteDirectoryRecursively(test_dir_.c_str()));
+ }
+
+ Filesystem filesystem_;
+ FakeClock fake_clock_;
+ std::string test_dir_;
+ std::string schema_store_dir_;
+ std::unique_ptr<LanguageSegmenter> lang_segmenter_;
+ std::unique_ptr<SchemaStore> schema_store_;
+};
+
+TEST_F(TokenizedDocumentTest, CreateAll) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kUnindexedStringProperty),
+ "hello world unindexed")
+ .AddStringProperty(std::string(kStringExactProperty), "test foo",
+ "test bar", "test baz")
+ .AddStringProperty(std::string(kStringPrefixProperty), "foo bar baz")
+ .AddInt64Property(std::string(kUnindexedIntegerProperty), 789)
+ .AddInt64Property(std::string(kIndexableIntegerProperty1), 1, 2, 3)
+ .AddInt64Property(std::string(kIndexableIntegerProperty2), 456)
+ .AddStringProperty(std::string(kQualifiedId1), "pkg$db/ns#uri1")
+ .AddStringProperty(std::string(kQualifiedId2), "pkg$db/ns#uri2")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ EXPECT_THAT(tokenized_document.document(), EqualsProto(document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(9));
+
+ // string sections
+ EXPECT_THAT(tokenized_document.tokenized_string_sections(), SizeIs(2));
+ EXPECT_THAT(tokenized_document.tokenized_string_sections().at(0).metadata,
+ Eq(kStringExactSectionMetadata));
+ EXPECT_THAT(
+ tokenized_document.tokenized_string_sections().at(0).token_sequence,
+ ElementsAre("test", "foo", "test", "bar", "test", "baz"));
+ EXPECT_THAT(tokenized_document.tokenized_string_sections().at(1).metadata,
+ Eq(kStringPrefixSectionMetadata));
+ EXPECT_THAT(
+ tokenized_document.tokenized_string_sections().at(1).token_sequence,
+ ElementsAre("foo", "bar", "baz"));
+
+ // integer sections
+ EXPECT_THAT(tokenized_document.integer_sections(), SizeIs(2));
+ EXPECT_THAT(tokenized_document.integer_sections().at(0).metadata,
+ Eq(kIndexableInteger1SectionMetadata));
+ EXPECT_THAT(tokenized_document.integer_sections().at(0).content,
+ ElementsAre(1, 2, 3));
+ EXPECT_THAT(tokenized_document.integer_sections().at(1).metadata,
+ Eq(kIndexableInteger2SectionMetadata));
+ EXPECT_THAT(tokenized_document.integer_sections().at(1).content,
+ ElementsAre(456));
+
+ // Qualified id join properties
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties(), SizeIs(2));
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(0).metadata,
+ Eq(kQualifiedId1JoinablePropertyMetadata));
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(0).values,
+ ElementsAre("pkg$db/ns#uri1"));
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(1).metadata,
+ Eq(kQualifiedId2JoinablePropertyMetadata));
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(1).values,
+ ElementsAre("pkg$db/ns#uri2"));
+}
+
+TEST_F(TokenizedDocumentTest, CreateNoIndexableIntegerProperties) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddInt64Property(std::string(kUnindexedIntegerProperty), 789)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ EXPECT_THAT(tokenized_document.document(), EqualsProto(document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(0));
+
+ // string sections
+ EXPECT_THAT(tokenized_document.tokenized_string_sections(), IsEmpty());
+
+ // integer sections
+ EXPECT_THAT(tokenized_document.integer_sections(), IsEmpty());
+
+ // Qualified id join properties
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty());
+}
+
+TEST_F(TokenizedDocumentTest, CreateMultipleIndexableIntegerProperties) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddInt64Property(std::string(kUnindexedIntegerProperty), 789)
+ .AddInt64Property(std::string(kIndexableIntegerProperty1), 1, 2, 3)
+ .AddInt64Property(std::string(kIndexableIntegerProperty2), 456)
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ EXPECT_THAT(tokenized_document.document(), EqualsProto(document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(0));
+
+ // string sections
+ EXPECT_THAT(tokenized_document.tokenized_string_sections(), IsEmpty());
+
+ // integer sections
+ EXPECT_THAT(tokenized_document.integer_sections(), SizeIs(2));
+ EXPECT_THAT(tokenized_document.integer_sections().at(0).metadata,
+ Eq(kIndexableInteger1SectionMetadata));
+ EXPECT_THAT(tokenized_document.integer_sections().at(0).content,
+ ElementsAre(1, 2, 3));
+ EXPECT_THAT(tokenized_document.integer_sections().at(1).metadata,
+ Eq(kIndexableInteger2SectionMetadata));
+ EXPECT_THAT(tokenized_document.integer_sections().at(1).content,
+ ElementsAre(456));
+
+ // Qualified id join properties
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty());
+}
+
+TEST_F(TokenizedDocumentTest, CreateNoIndexableStringProperties) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kUnindexedStringProperty),
+ "hello world unindexed")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ EXPECT_THAT(tokenized_document.document(), EqualsProto(document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(0));
+
+ // string sections
+ EXPECT_THAT(tokenized_document.tokenized_string_sections(), IsEmpty());
+
+ // integer sections
+ EXPECT_THAT(tokenized_document.integer_sections(), IsEmpty());
+
+ // Qualified id join properties
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty());
+}
+
+TEST_F(TokenizedDocumentTest, CreateMultipleIndexableStringProperties) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kUnindexedStringProperty),
+ "hello world unindexed")
+ .AddStringProperty(std::string(kStringExactProperty), "test foo",
+ "test bar", "test baz")
+ .AddStringProperty(std::string(kStringPrefixProperty), "foo bar baz")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ EXPECT_THAT(tokenized_document.document(), EqualsProto(document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(9));
+
+ // string sections
+ EXPECT_THAT(tokenized_document.tokenized_string_sections(), SizeIs(2));
+ EXPECT_THAT(tokenized_document.tokenized_string_sections().at(0).metadata,
+ Eq(kStringExactSectionMetadata));
+ EXPECT_THAT(
+ tokenized_document.tokenized_string_sections().at(0).token_sequence,
+ ElementsAre("test", "foo", "test", "bar", "test", "baz"));
+ EXPECT_THAT(tokenized_document.tokenized_string_sections().at(1).metadata,
+ Eq(kStringPrefixSectionMetadata));
+ EXPECT_THAT(
+ tokenized_document.tokenized_string_sections().at(1).token_sequence,
+ ElementsAre("foo", "bar", "baz"));
+
+ // integer sections
+ EXPECT_THAT(tokenized_document.integer_sections(), IsEmpty());
+
+ // Qualified id join properties
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty());
+}
+
+TEST_F(TokenizedDocumentTest, CreateNoJoinQualifiedIdProperties) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kUnindexedStringProperty),
+ "hello world unindexed")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ EXPECT_THAT(tokenized_document.document(), EqualsProto(document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(0));
+
+ // string sections
+ EXPECT_THAT(tokenized_document.tokenized_string_sections(), IsEmpty());
+
+ // integer sections
+ EXPECT_THAT(tokenized_document.integer_sections(), IsEmpty());
+
+ // Qualified id join properties
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties(), IsEmpty());
+}
+
+TEST_F(TokenizedDocumentTest, CreateMultipleJoinQualifiedIdProperties) {
+ DocumentProto document =
+ DocumentBuilder()
+ .SetKey("icing", "fake_type/1")
+ .SetSchema(std::string(kFakeType))
+ .AddStringProperty(std::string(kUnindexedStringProperty),
+ "hello world unindexed")
+ .AddStringProperty(std::string(kQualifiedId1), "pkg$db/ns#uri1")
+ .AddStringProperty(std::string(kQualifiedId2), "pkg$db/ns#uri2")
+ .Build();
+
+ ICING_ASSERT_OK_AND_ASSIGN(
+ TokenizedDocument tokenized_document,
+ TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
+ document));
+
+ EXPECT_THAT(tokenized_document.document(), EqualsProto(document));
+ EXPECT_THAT(tokenized_document.num_string_tokens(), Eq(0));
+
+ // string sections
+ EXPECT_THAT(tokenized_document.tokenized_string_sections(), IsEmpty());
+
+ // integer sections
+ EXPECT_THAT(tokenized_document.integer_sections(), IsEmpty());
+
+ // Qualified id join properties
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties(), SizeIs(2));
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(0).metadata,
+ Eq(kQualifiedId1JoinablePropertyMetadata));
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(0).values,
+ ElementsAre("pkg$db/ns#uri1"));
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(1).metadata,
+ Eq(kQualifiedId2JoinablePropertyMetadata));
+ EXPECT_THAT(tokenized_document.qualified_id_join_properties().at(1).values,
+ ElementsAre("pkg$db/ns#uri2"));
+}
+
+} // namespace
+
+} // namespace lib
+} // namespace icing
diff --git a/java/build.gradle b/java/build.gradle
deleted file mode 100644
index 206c74f..0000000
--- a/java/build.gradle
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (C) 2020 The Android Open Source Project
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-buildscript {
- boolean unbundleBuild = (new File('unbundled-build')).exists()
- repositories {
- maven { url '../../../prebuilts/androidx/external' }
- if (unbundleBuild) {
- jcenter()
- }
- }
- dependencies {
- classpath('gradle.plugin.com.google.protobuf:protobuf-gradle-plugin:0.8.8')
- classpath('org.anarres.jarjar:jarjar-gradle:1.0.1')
- }
-}
-
-apply plugin: 'java-library'
-apply plugin: 'com.google.protobuf'
-apply plugin: 'org.anarres.jarjar'
-apply plugin: 'idea'
-
-sourceSets {
- main {
- proto {
- srcDir '../proto'
- include '**/*.proto'
- }
- }
-}
-
-compileJava {
- sourceCompatibility = JavaVersion.VERSION_1_7
- targetCompatibility = JavaVersion.VERSION_1_7
-}
-
-dependencies {
- implementation('com.google.protobuf:protobuf-javalite:3.10.0')
-}
-
-protobuf {
- protoc {
- artifact = 'com.google.protobuf:protoc:3.10.0'
- }
-
- generateProtoTasks {
- all().each { task ->
- task.builtins {
- java {
- option 'lite'
- }
- }
- }
- }
-}
-
-jarjar.repackage('jarjarTask') {
- destinationName "icing-java-jarjar.jar"
- from 'com.google.protobuf:protobuf-javalite:3.10.0'
- from files(sourceSets.main.output.classesDirs)
- dependsOn sourceSets.main.output
- classRename 'com.google.protobuf.**', 'com.google.android.icing.protobuf.@1'
-}
-
-configurations {
- jarjarConf
-}
-
-artifacts {
- jarjarConf(jarjarTask.destinationPath) {
- name 'icing-java-jarjar'
- type 'jar'
- builtBy jarjarTask
- }
-}
diff --git a/java/src/com/google/android/icing/IcingSearchEngine.java b/java/src/com/google/android/icing/IcingSearchEngine.java
index 3ac5eef..e73f16b 100644
--- a/java/src/com/google/android/icing/IcingSearchEngine.java
+++ b/java/src/com/google/android/icing/IcingSearchEngine.java
@@ -14,21 +14,29 @@
package com.google.android.icing;
-import android.util.Log;
import androidx.annotation.NonNull;
+import androidx.annotation.Nullable;
+import com.google.android.icing.proto.DebugInfoResultProto;
+import com.google.android.icing.proto.DebugInfoVerbosity;
import com.google.android.icing.proto.DeleteByNamespaceResultProto;
+import com.google.android.icing.proto.DeleteByQueryResultProto;
import com.google.android.icing.proto.DeleteBySchemaTypeResultProto;
import com.google.android.icing.proto.DeleteResultProto;
import com.google.android.icing.proto.DocumentProto;
+import com.google.android.icing.proto.GetAllNamespacesResultProto;
import com.google.android.icing.proto.GetOptimizeInfoResultProto;
import com.google.android.icing.proto.GetResultProto;
+import com.google.android.icing.proto.GetResultSpecProto;
import com.google.android.icing.proto.GetSchemaResultProto;
import com.google.android.icing.proto.GetSchemaTypeResultProto;
import com.google.android.icing.proto.IcingSearchEngineOptions;
import com.google.android.icing.proto.InitializeResultProto;
+import com.google.android.icing.proto.LogSeverity;
import com.google.android.icing.proto.OptimizeResultProto;
import com.google.android.icing.proto.PersistToDiskResultProto;
+import com.google.android.icing.proto.PersistType;
import com.google.android.icing.proto.PutResultProto;
+import com.google.android.icing.proto.ReportUsageResultProto;
import com.google.android.icing.proto.ResetResultProto;
import com.google.android.icing.proto.ResultSpecProto;
import com.google.android.icing.proto.SchemaProto;
@@ -36,354 +44,234 @@ import com.google.android.icing.proto.ScoringSpecProto;
import com.google.android.icing.proto.SearchResultProto;
import com.google.android.icing.proto.SearchSpecProto;
import com.google.android.icing.proto.SetSchemaResultProto;
-import com.google.android.icing.proto.StatusProto;
-import com.google.android.icing.protobuf.InvalidProtocolBufferException;
-
-/** Java wrapper to access native APIs in external/icing/icing/icing-search-engine.h */
-public final class IcingSearchEngine {
+import com.google.android.icing.proto.StorageInfoResultProto;
+import com.google.android.icing.proto.SuggestionResponse;
+import com.google.android.icing.proto.SuggestionSpecProto;
+import com.google.android.icing.proto.UsageReport;
+
+/**
+ * Java wrapper to access {@link IcingSearchEngineImpl}.
+ *
+ * <p>It converts byte arrays from {@link IcingSearchEngineImpl} into the corresponding protos.
+ *
+ * <p>If this instance has been closed, the instance is no longer usable.
+ *
+ * <p>Keep this class non-final so that it can be mocked in AppSearch.
+ *
+ * <p>NOTE: This class is NOT thread-safe.
+ */
+public class IcingSearchEngine implements IcingSearchEngineInterface {
private static final String TAG = "IcingSearchEngine";
+ private final IcingSearchEngineImpl icingSearchEngineImpl;
- private final long nativePointer;
+ /**
+ * @throws IllegalStateException if IcingSearchEngine fails to be created
+ */
+ public IcingSearchEngine(@NonNull IcingSearchEngineOptions options) {
+ icingSearchEngineImpl = new IcingSearchEngineImpl(options.toByteArray());
+ }
- static {
- // NOTE: This can fail with an UnsatisfiedLinkError
- System.loadLibrary("icing");
+ @Override
+ public void close() {
+ icingSearchEngineImpl.close();
}
- /** @throws IllegalStateException if IcingSearchEngine fails to be created */
- public IcingSearchEngine(@NonNull IcingSearchEngineOptions options) {
- nativePointer = nativeCreate(options.toByteArray());
- if (nativePointer == 0) {
- Log.e(TAG, "Failed to create IcingSearchEngine.");
- throw new IllegalStateException("Failed to create IcingSearchEngine.");
- }
+ @SuppressWarnings({"deprecation", "removal"}) // b/316643605
+ @Override
+ protected void finalize() throws Throwable {
+ icingSearchEngineImpl.close();
+ super.finalize();
}
@NonNull
+ @Override
public InitializeResultProto initialize() {
- byte[] initializeResultBytes = nativeInitialize(nativePointer);
- if (initializeResultBytes == null) {
- Log.e(TAG, "Received null InitializeResult from native.");
- return InitializeResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return InitializeResultProto.parseFrom(initializeResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing InitializeResultProto.", e);
- return InitializeResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ return IcingSearchEngineUtils.byteArrayToInitializeResultProto(
+ icingSearchEngineImpl.initialize());
}
@NonNull
+ @Override
public SetSchemaResultProto setSchema(@NonNull SchemaProto schema) {
return setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false);
}
@NonNull
+ @Override
public SetSchemaResultProto setSchema(
@NonNull SchemaProto schema, boolean ignoreErrorsAndDeleteDocuments) {
- byte[] setSchemaResultBytes =
- nativeSetSchema(nativePointer, schema.toByteArray(), ignoreErrorsAndDeleteDocuments);
- if (setSchemaResultBytes == null) {
- Log.e(TAG, "Received null SetSchemaResultProto from native.");
- return SetSchemaResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return SetSchemaResultProto.parseFrom(setSchemaResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing SetSchemaResultProto.", e);
- return SetSchemaResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ return IcingSearchEngineUtils.byteArrayToSetSchemaResultProto(
+ icingSearchEngineImpl.setSchema(schema.toByteArray(), ignoreErrorsAndDeleteDocuments));
}
@NonNull
+ @Override
public GetSchemaResultProto getSchema() {
- byte[] getSchemaResultBytes = nativeGetSchema(nativePointer);
- if (getSchemaResultBytes == null) {
- Log.e(TAG, "Received null GetSchemaResultProto from native.");
- return GetSchemaResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return GetSchemaResultProto.parseFrom(getSchemaResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing GetSchemaResultProto.", e);
- return GetSchemaResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ return IcingSearchEngineUtils.byteArrayToGetSchemaResultProto(
+ icingSearchEngineImpl.getSchema());
}
@NonNull
+ @Override
public GetSchemaTypeResultProto getSchemaType(@NonNull String schemaType) {
- byte[] getSchemaTypeResultBytes = nativeGetSchemaType(nativePointer, schemaType);
- if (getSchemaTypeResultBytes == null) {
- Log.e(TAG, "Received null GetSchemaTypeResultProto from native.");
- return GetSchemaTypeResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return GetSchemaTypeResultProto.parseFrom(getSchemaTypeResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing GetSchemaTypeResultProto.", e);
- return GetSchemaTypeResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ return IcingSearchEngineUtils.byteArrayToGetSchemaTypeResultProto(
+ icingSearchEngineImpl.getSchemaType(schemaType));
}
@NonNull
+ @Override
public PutResultProto put(@NonNull DocumentProto document) {
- byte[] putResultBytes = nativePut(nativePointer, document.toByteArray());
- if (putResultBytes == null) {
- Log.e(TAG, "Received null PutResultProto from native.");
- return PutResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return PutResultProto.parseFrom(putResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing PutResultProto.", e);
- return PutResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ return IcingSearchEngineUtils.byteArrayToPutResultProto(
+ icingSearchEngineImpl.put(document.toByteArray()));
+ }
+
+ @NonNull
+ @Override
+ public GetResultProto get(
+ @NonNull String namespace, @NonNull String uri, @NonNull GetResultSpecProto getResultSpec) {
+ return IcingSearchEngineUtils.byteArrayToGetResultProto(
+ icingSearchEngineImpl.get(namespace, uri, getResultSpec.toByteArray()));
+ }
+
+ @NonNull
+ @Override
+ public ReportUsageResultProto reportUsage(@NonNull UsageReport usageReport) {
+ return IcingSearchEngineUtils.byteArrayToReportUsageResultProto(
+ icingSearchEngineImpl.reportUsage(usageReport.toByteArray()));
}
@NonNull
- public GetResultProto get(@NonNull String namespace, @NonNull String uri) {
- byte[] getResultBytes = nativeGet(nativePointer, namespace, uri);
- if (getResultBytes == null) {
- Log.e(TAG, "Received null GetResultProto from native.");
- return GetResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return GetResultProto.parseFrom(getResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing GetResultProto.", e);
- return GetResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ @Override
+ public GetAllNamespacesResultProto getAllNamespaces() {
+ return IcingSearchEngineUtils.byteArrayToGetAllNamespacesResultProto(
+ icingSearchEngineImpl.getAllNamespaces());
}
@NonNull
+ @Override
public SearchResultProto search(
@NonNull SearchSpecProto searchSpec,
@NonNull ScoringSpecProto scoringSpec,
@NonNull ResultSpecProto resultSpec) {
- byte[] searchResultBytes =
- nativeSearch(
- nativePointer,
- searchSpec.toByteArray(),
- scoringSpec.toByteArray(),
- resultSpec.toByteArray());
- if (searchResultBytes == null) {
- Log.e(TAG, "Received null SearchResultProto from native.");
- return SearchResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return SearchResultProto.parseFrom(searchResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing SearchResultProto.", e);
- return SearchResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ return IcingSearchEngineUtils.byteArrayToSearchResultProto(
+ icingSearchEngineImpl.search(
+ searchSpec.toByteArray(), scoringSpec.toByteArray(), resultSpec.toByteArray()));
}
@NonNull
- public DeleteResultProto delete(@NonNull String namespace, @NonNull String uri) {
- byte[] deleteResultBytes = nativeDelete(nativePointer, namespace, uri);
- if (deleteResultBytes == null) {
- Log.e(TAG, "Received null DeleteResultProto from native.");
- return DeleteResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return DeleteResultProto.parseFrom(deleteResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing DeleteResultProto.", e);
- return DeleteResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ @Override
+ public SearchResultProto getNextPage(long nextPageToken) {
+ return IcingSearchEngineUtils.byteArrayToSearchResultProto(
+ icingSearchEngineImpl.getNextPage(nextPageToken));
}
@NonNull
- public DeleteByNamespaceResultProto deleteByNamespace(@NonNull String namespace) {
- byte[] deleteByNamespaceResultBytes = nativeDeleteByNamespace(nativePointer, namespace);
- if (deleteByNamespaceResultBytes == null) {
- Log.e(TAG, "Received null DeleteByNamespaceResultProto from native.");
- return DeleteByNamespaceResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return DeleteByNamespaceResultProto.parseFrom(deleteByNamespaceResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing DeleteByNamespaceResultProto.", e);
- return DeleteByNamespaceResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ @Override
+ public void invalidateNextPageToken(long nextPageToken) {
+ icingSearchEngineImpl.invalidateNextPageToken(nextPageToken);
}
@NonNull
- public DeleteBySchemaTypeResultProto deleteBySchemaType(@NonNull String schemaType) {
- byte[] deleteBySchemaTypeResultBytes = nativeDeleteBySchemaType(nativePointer, schemaType);
- if (deleteBySchemaTypeResultBytes == null) {
- Log.e(TAG, "Received null DeleteBySchemaTypeResultProto from native.");
- return DeleteBySchemaTypeResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return DeleteBySchemaTypeResultProto.parseFrom(deleteBySchemaTypeResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing DeleteBySchemaTypeResultProto.", e);
- return DeleteBySchemaTypeResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ @Override
+ public DeleteResultProto delete(@NonNull String namespace, @NonNull String uri) {
+ return IcingSearchEngineUtils.byteArrayToDeleteResultProto(
+ icingSearchEngineImpl.delete(namespace, uri));
}
@NonNull
- public PersistToDiskResultProto persistToDisk() {
- byte[] persistToDiskResultBytes = nativePersistToDisk(nativePointer);
- if (persistToDiskResultBytes == null) {
- Log.e(TAG, "Received null PersistToDiskResultProto from native.");
- return PersistToDiskResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return PersistToDiskResultProto.parseFrom(persistToDiskResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing PersistToDiskResultProto.", e);
- return PersistToDiskResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ @Override
+ public SuggestionResponse searchSuggestions(@NonNull SuggestionSpecProto suggestionSpec) {
+ return IcingSearchEngineUtils.byteArrayToSuggestionResponse(
+ icingSearchEngineImpl.searchSuggestions(suggestionSpec.toByteArray()));
}
@NonNull
- public OptimizeResultProto optimize() {
- byte[] optimizeResultBytes = nativeOptimize(nativePointer);
- if (optimizeResultBytes == null) {
- Log.e(TAG, "Received null OptimizeResultProto from native.");
- return OptimizeResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return OptimizeResultProto.parseFrom(optimizeResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing OptimizeResultProto.", e);
- return OptimizeResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ @Override
+ public DeleteByNamespaceResultProto deleteByNamespace(@NonNull String namespace) {
+ return IcingSearchEngineUtils.byteArrayToDeleteByNamespaceResultProto(
+ icingSearchEngineImpl.deleteByNamespace(namespace));
}
@NonNull
- public GetOptimizeInfoResultProto getOptimizeInfo() {
- byte[] getOptimizeInfoResultBytes = nativeGetOptimizeInfo(nativePointer);
- if (getOptimizeInfoResultBytes == null) {
- Log.e(TAG, "Received null GetOptimizeInfoResultProto from native.");
- return GetOptimizeInfoResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return GetOptimizeInfoResultProto.parseFrom(getOptimizeInfoResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing GetOptimizeInfoResultProto.", e);
- return GetOptimizeInfoResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ @Override
+ public DeleteBySchemaTypeResultProto deleteBySchemaType(@NonNull String schemaType) {
+ return IcingSearchEngineUtils.byteArrayToDeleteBySchemaTypeResultProto(
+ icingSearchEngineImpl.deleteBySchemaType(schemaType));
}
@NonNull
- public ResetResultProto reset() {
- byte[] resetResultBytes = nativeReset(nativePointer);
- if (resetResultBytes == null) {
- Log.e(TAG, "Received null ResetResultProto from native.");
- return ResetResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
-
- try {
- return ResetResultProto.parseFrom(resetResultBytes);
- } catch (InvalidProtocolBufferException e) {
- Log.e(TAG, "Error parsing ResetResultProto.", e);
- return ResetResultProto.newBuilder()
- .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
- .build();
- }
+ @Override
+ public DeleteByQueryResultProto deleteByQuery(@NonNull SearchSpecProto searchSpec) {
+ return deleteByQuery(searchSpec, /*returnDeletedDocumentInfo=*/ false);
}
- private static native long nativeCreate(byte[] icingSearchEngineOptionsBytes);
-
- private static native byte[] nativeInitialize(long nativePointer);
-
- private static native byte[] nativeSetSchema(
- long nativePointer, byte[] schemaBytes, boolean ignoreErrorsAndDeleteDocuments);
-
- private static native byte[] nativeGetSchema(long nativePointer);
+ @NonNull
+ @Override
+ public DeleteByQueryResultProto deleteByQuery(
+ @NonNull SearchSpecProto searchSpec, boolean returnDeletedDocumentInfo) {
+ return IcingSearchEngineUtils.byteArrayToDeleteByQueryResultProto(
+ icingSearchEngineImpl.deleteByQuery(searchSpec.toByteArray(), returnDeletedDocumentInfo));
+ }
- private static native byte[] nativeGetSchemaType(long nativePointer, String schemaType);
+ @NonNull
+ @Override
+ public PersistToDiskResultProto persistToDisk(@NonNull PersistType.Code persistTypeCode) {
+ return IcingSearchEngineUtils.byteArrayToPersistToDiskResultProto(
+ icingSearchEngineImpl.persistToDisk(persistTypeCode.getNumber()));
+ }
- private static native byte[] nativePut(long nativePointer, byte[] documentBytes);
+ @NonNull
+ @Override
+ public OptimizeResultProto optimize() {
+ return IcingSearchEngineUtils.byteArrayToOptimizeResultProto(icingSearchEngineImpl.optimize());
+ }
- private static native byte[] nativeGet(long nativePointer, String namespace, String uri);
+ @NonNull
+ @Override
+ public GetOptimizeInfoResultProto getOptimizeInfo() {
+ return IcingSearchEngineUtils.byteArrayToGetOptimizeInfoResultProto(
+ icingSearchEngineImpl.getOptimizeInfo());
+ }
- private static native byte[] nativeSearch(
- long nativePointer, byte[] searchSpecBytes, byte[] scoringSpecBytes, byte[] resultSpecBytes);
+ @NonNull
+ @Override
+ public StorageInfoResultProto getStorageInfo() {
+ return IcingSearchEngineUtils.byteArrayToStorageInfoResultProto(
+ icingSearchEngineImpl.getStorageInfo());
+ }
- private static native byte[] nativeDelete(long nativePointer, String namespace, String uri);
+ @NonNull
+ @Override
+ public DebugInfoResultProto getDebugInfo(DebugInfoVerbosity.Code verbosity) {
+ return IcingSearchEngineUtils.byteArrayToDebugInfoResultProto(
+ icingSearchEngineImpl.getDebugInfo(verbosity.getNumber()));
+ }
- private static native byte[] nativeDeleteByNamespace(long nativePointer, String namespace);
+ @NonNull
+ @Override
+ public ResetResultProto reset() {
+ return IcingSearchEngineUtils.byteArrayToResetResultProto(icingSearchEngineImpl.reset());
+ }
- private static native byte[] nativeDeleteBySchemaType(long nativePointer, String schemaType);
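+  /** Returns whether a log line of the given severity would be logged, assuming verbosity 0. */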
+ public static boolean shouldLog(LogSeverity.Code severity) {
+ return shouldLog(severity, (short) 0);
+ }
- private static native byte[] nativePersistToDisk(long nativePointer);
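+  /** Returns whether a log line of the given severity and verbosity would be logged. */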
+ public static boolean shouldLog(LogSeverity.Code severity, short verbosity) {
+ return IcingSearchEngineImpl.shouldLog((short) severity.getNumber(), verbosity);
+ }
- private static native byte[] nativeOptimize(long nativePointer);
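+  /** Sets the logging level to the given severity, at verbosity 0; returns true on success. */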
+ public static boolean setLoggingLevel(LogSeverity.Code severity) {
+ return setLoggingLevel(severity, (short) 0);
+ }
- private static native byte[] nativeGetOptimizeInfo(long nativePointer);
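+  /** Sets the logging level to the given severity and verbosity; returns true on success. */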
+ public static boolean setLoggingLevel(LogSeverity.Code severity, short verbosity) {
+ return IcingSearchEngineImpl.setLoggingLevel((short) severity.getNumber(), verbosity);
+ }
- private static native byte[] nativeReset(long nativePointer);
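+  /** Returns the tag the native library uses for logging, or null if it could not be retrieved. */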
+ @Nullable
+ public static String getLoggingTag() {
+ return IcingSearchEngineImpl.getLoggingTag();
+ }
}
diff --git a/java/src/com/google/android/icing/IcingSearchEngineImpl.java b/java/src/com/google/android/icing/IcingSearchEngineImpl.java
new file mode 100644
index 0000000..3a00a5a
--- /dev/null
+++ b/java/src/com/google/android/icing/IcingSearchEngineImpl.java
@@ -0,0 +1,331 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.android.icing;
+
+import android.util.Log;
+import androidx.annotation.NonNull;
+import androidx.annotation.Nullable;
+import java.io.Closeable;
+
+/**
+ * Java wrapper to access native APIs in external/icing/icing/icing-search-engine.h
+ *
+ * <p>If this instance has been closed, the instance is no longer usable.
+ *
+ * <p>Keep this class non-final so that it can be mocked in AppSearch.
+ *
+ * <p>NOTE: This class is NOT thread-safe.
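+ *
+ * <p>A minimal usage sketch, assuming {@code optionsBytes} holds a serialized
+ * IcingSearchEngineOptions (each call returns a serialized result proto, or null on failure):
+ *
+ * <pre>{@code
+ * try (IcingSearchEngineImpl impl = new IcingSearchEngineImpl(optionsBytes)) {
+ *   byte[] initializeResultBytes = impl.initialize();
+ *   // Parse with the matching result proto, e.g. InitializeResultProto.parseFrom(...).
+ * }
+ * }</pre>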
+ */
+public class IcingSearchEngineImpl implements Closeable {
+
+ private static final String TAG = "IcingSearchEngineImpl";
+
+ private long nativePointer;
+
+ private boolean closed = false;
+
+ static {
+ // NOTE: This can fail with an UnsatisfiedLinkError
+ System.loadLibrary("icing");
+ }
+
+ /**
+ * @throws IllegalStateException if IcingSearchEngineImpl fails to be created
+ */
+ public IcingSearchEngineImpl(@NonNull byte[] optionsBytes) {
+ nativePointer = nativeCreate(optionsBytes);
+ if (nativePointer == 0) {
+ Log.e(TAG, "Failed to create IcingSearchEngineImpl.");
+ throw new IllegalStateException("Failed to create IcingSearchEngineImpl.");
+ }
+ }
+
+ private void throwIfClosed() {
+ if (closed) {
+ throw new IllegalStateException("Trying to use a closed IcingSearchEngineImpl instance.");
+ }
+ }
+
+ @Override
+ public void close() {
+ if (closed) {
+ return;
+ }
+
+ if (nativePointer != 0) {
+ nativeDestroy(this);
+ }
+ nativePointer = 0;
+ closed = true;
+ }
+
+ @SuppressWarnings({"deprecation", "removal"}) // b/316643605
+ @Override
+ protected void finalize() throws Throwable {
+ close();
+ super.finalize();
+ }
+
+ @Nullable
+ public byte[] initialize() {
+ throwIfClosed();
+ return nativeInitialize(this);
+ }
+
+ @Nullable
+ public byte[] setSchema(@NonNull byte[] schemaBytes) {
+ return setSchema(schemaBytes, /* ignoreErrorsAndDeleteDocuments= */ false);
+ }
+
+ @Nullable
+ public byte[] setSchema(@NonNull byte[] schemaBytes, boolean ignoreErrorsAndDeleteDocuments) {
+ throwIfClosed();
+ return nativeSetSchema(this, schemaBytes, ignoreErrorsAndDeleteDocuments);
+ }
+
+ @Nullable
+ public byte[] getSchema() {
+ throwIfClosed();
+ return nativeGetSchema(this);
+ }
+
+ @Nullable
+ public byte[] getSchemaType(@NonNull String schemaType) {
+ throwIfClosed();
+ return nativeGetSchemaType(this, schemaType);
+ }
+
+ @Nullable
+ public byte[] put(@NonNull byte[] documentBytes) {
+ throwIfClosed();
+ return nativePut(this, documentBytes);
+ }
+
+ @Nullable
+ public byte[] get(
+ @NonNull String namespace, @NonNull String uri, @NonNull byte[] getResultSpecBytes) {
+ throwIfClosed();
+ return nativeGet(this, namespace, uri, getResultSpecBytes);
+ }
+
+ @Nullable
+ public byte[] reportUsage(@NonNull byte[] usageReportBytes) {
+ throwIfClosed();
+ return nativeReportUsage(this, usageReportBytes);
+ }
+
+ @Nullable
+ public byte[] getAllNamespaces() {
+ throwIfClosed();
+ return nativeGetAllNamespaces(this);
+ }
+
+ @Nullable
+ public byte[] search(
+ @NonNull byte[] searchSpecBytes,
+ @NonNull byte[] scoringSpecBytes,
+ @NonNull byte[] resultSpecBytes) {
+ throwIfClosed();
+
+ // Note that on Android System.currentTimeMillis() is the standard "wall" clock and can be set
+ // by the user or the phone network so the time may jump backwards or forwards unpredictably.
+ // This could lead to inaccurate final JNI latency calculations or unexpected negative numbers
+ // in the case where the phone time is changed while sending data across JNI layers.
+ // However these occurrences should be very rare, so we will keep usage of
+ // System.currentTimeMillis() due to the lack of better time functions that can provide a
+ // consistent timestamp across all platforms.
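+    // (A monotonic clock such as android.os.SystemClock.elapsedRealtime() would avoid such
+    // jumps, but it is boot-relative and has no portable equivalent on the native side.)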
+ long javaToNativeStartTimestampMs = System.currentTimeMillis();
+ return nativeSearch(
+ this, searchSpecBytes, scoringSpecBytes, resultSpecBytes, javaToNativeStartTimestampMs);
+ }
+
+ @Nullable
+ public byte[] getNextPage(long nextPageToken) {
+ throwIfClosed();
+ return nativeGetNextPage(this, nextPageToken, System.currentTimeMillis());
+ }
+
+  public void invalidateNextPageToken(long nextPageToken) {
+ throwIfClosed();
+ nativeInvalidateNextPageToken(this, nextPageToken);
+ }
+
+ @Nullable
+ public byte[] delete(@NonNull String namespace, @NonNull String uri) {
+ throwIfClosed();
+ return nativeDelete(this, namespace, uri);
+ }
+
+ @Nullable
+ public byte[] searchSuggestions(@NonNull byte[] suggestionSpecBytes) {
+ throwIfClosed();
+ return nativeSearchSuggestions(this, suggestionSpecBytes);
+ }
+
+ @Nullable
+ public byte[] deleteByNamespace(@NonNull String namespace) {
+ throwIfClosed();
+ return nativeDeleteByNamespace(this, namespace);
+ }
+
+ @Nullable
+ public byte[] deleteBySchemaType(@NonNull String schemaType) {
+ throwIfClosed();
+ return nativeDeleteBySchemaType(this, schemaType);
+ }
+
+ @Nullable
+ public byte[] deleteByQuery(@NonNull byte[] searchSpecBytes) {
+ return deleteByQuery(searchSpecBytes, /* returnDeletedDocumentInfo= */ false);
+ }
+
+ @Nullable
+ public byte[] deleteByQuery(@NonNull byte[] searchSpecBytes, boolean returnDeletedDocumentInfo) {
+ throwIfClosed();
+ return nativeDeleteByQuery(this, searchSpecBytes, returnDeletedDocumentInfo);
+ }
+
+ @Nullable
+ public byte[] persistToDisk(int persistTypeCode) {
+ throwIfClosed();
+ return nativePersistToDisk(this, persistTypeCode);
+ }
+
+ @Nullable
+ public byte[] optimize() {
+ throwIfClosed();
+ return nativeOptimize(this);
+ }
+
+ @Nullable
+ public byte[] getOptimizeInfo() {
+ throwIfClosed();
+ return nativeGetOptimizeInfo(this);
+ }
+
+ @Nullable
+ public byte[] getStorageInfo() {
+ throwIfClosed();
+ return nativeGetStorageInfo(this);
+ }
+
+ @Nullable
+ public byte[] getDebugInfo(int verbosityCode) {
+ throwIfClosed();
+ return nativeGetDebugInfo(this, verbosityCode);
+ }
+
+ @Nullable
+ public byte[] reset() {
+ throwIfClosed();
+ return nativeReset(this);
+ }
+
+ public static boolean shouldLog(short severity) {
+ return shouldLog(severity, (short) 0);
+ }
+
+ public static boolean shouldLog(short severity, short verbosity) {
+ return nativeShouldLog(severity, verbosity);
+ }
+
+ public static boolean setLoggingLevel(short severity) {
+ return setLoggingLevel(severity, (short) 0);
+ }
+
+ public static boolean setLoggingLevel(short severity, short verbosity) {
+ return nativeSetLoggingLevel(severity, verbosity);
+ }
+
+ @Nullable
+ public static String getLoggingTag() {
+ String tag = nativeGetLoggingTag();
+ if (tag == null) {
+ Log.e(TAG, "Received null logging tag from native.");
+ }
+ return tag;
+ }
+
+ private static native long nativeCreate(byte[] icingSearchEngineOptionsBytes);
+
+ private static native void nativeDestroy(IcingSearchEngineImpl instance);
+
+ private static native byte[] nativeInitialize(IcingSearchEngineImpl instance);
+
+ private static native byte[] nativeSetSchema(
+ IcingSearchEngineImpl instance, byte[] schemaBytes, boolean ignoreErrorsAndDeleteDocuments);
+
+ private static native byte[] nativeGetSchema(IcingSearchEngineImpl instance);
+
+ private static native byte[] nativeGetSchemaType(
+ IcingSearchEngineImpl instance, String schemaType);
+
+ private static native byte[] nativePut(IcingSearchEngineImpl instance, byte[] documentBytes);
+
+ private static native byte[] nativeGet(
+ IcingSearchEngineImpl instance, String namespace, String uri, byte[] getResultSpecBytes);
+
+ private static native byte[] nativeReportUsage(
+ IcingSearchEngineImpl instance, byte[] usageReportBytes);
+
+ private static native byte[] nativeGetAllNamespaces(IcingSearchEngineImpl instance);
+
+ private static native byte[] nativeSearch(
+ IcingSearchEngineImpl instance,
+ byte[] searchSpecBytes,
+ byte[] scoringSpecBytes,
+ byte[] resultSpecBytes,
+ long javaToNativeStartTimestampMs);
+
+ private static native byte[] nativeGetNextPage(
+ IcingSearchEngineImpl instance, long nextPageToken, long javaToNativeStartTimestampMs);
+
+ private static native void nativeInvalidateNextPageToken(
+ IcingSearchEngineImpl instance, long nextPageToken);
+
+ private static native byte[] nativeDelete(
+ IcingSearchEngineImpl instance, String namespace, String uri);
+
+ private static native byte[] nativeDeleteByNamespace(
+ IcingSearchEngineImpl instance, String namespace);
+
+ private static native byte[] nativeDeleteBySchemaType(
+ IcingSearchEngineImpl instance, String schemaType);
+
+ private static native byte[] nativeDeleteByQuery(
+ IcingSearchEngineImpl instance, byte[] searchSpecBytes, boolean returnDeletedDocumentInfo);
+
+ private static native byte[] nativePersistToDisk(IcingSearchEngineImpl instance, int persistType);
+
+ private static native byte[] nativeOptimize(IcingSearchEngineImpl instance);
+
+ private static native byte[] nativeGetOptimizeInfo(IcingSearchEngineImpl instance);
+
+ private static native byte[] nativeGetStorageInfo(IcingSearchEngineImpl instance);
+
+ private static native byte[] nativeReset(IcingSearchEngineImpl instance);
+
+ private static native byte[] nativeSearchSuggestions(
+ IcingSearchEngineImpl instance, byte[] suggestionSpecBytes);
+
+ private static native byte[] nativeGetDebugInfo(IcingSearchEngineImpl instance, int verbosity);
+
+ private static native boolean nativeShouldLog(short severity, short verbosity);
+
+ private static native boolean nativeSetLoggingLevel(short severity, short verbosity);
+
+ private static native String nativeGetLoggingTag();
+}
diff --git a/java/src/com/google/android/icing/IcingSearchEngineInterface.java b/java/src/com/google/android/icing/IcingSearchEngineInterface.java
new file mode 100644
index 0000000..0bc58f1
--- /dev/null
+++ b/java/src/com/google/android/icing/IcingSearchEngineInterface.java
@@ -0,0 +1,148 @@
+package com.google.android.icing;
+
+import com.google.android.icing.proto.DebugInfoResultProto;
+import com.google.android.icing.proto.DebugInfoVerbosity;
+import com.google.android.icing.proto.DeleteByNamespaceResultProto;
+import com.google.android.icing.proto.DeleteByQueryResultProto;
+import com.google.android.icing.proto.DeleteBySchemaTypeResultProto;
+import com.google.android.icing.proto.DeleteResultProto;
+import com.google.android.icing.proto.DocumentProto;
+import com.google.android.icing.proto.GetAllNamespacesResultProto;
+import com.google.android.icing.proto.GetOptimizeInfoResultProto;
+import com.google.android.icing.proto.GetResultProto;
+import com.google.android.icing.proto.GetResultSpecProto;
+import com.google.android.icing.proto.GetSchemaResultProto;
+import com.google.android.icing.proto.GetSchemaTypeResultProto;
+import com.google.android.icing.proto.InitializeResultProto;
+import com.google.android.icing.proto.OptimizeResultProto;
+import com.google.android.icing.proto.PersistToDiskResultProto;
+import com.google.android.icing.proto.PersistType;
+import com.google.android.icing.proto.PutResultProto;
+import com.google.android.icing.proto.ReportUsageResultProto;
+import com.google.android.icing.proto.ResetResultProto;
+import com.google.android.icing.proto.ResultSpecProto;
+import com.google.android.icing.proto.SchemaProto;
+import com.google.android.icing.proto.ScoringSpecProto;
+import com.google.android.icing.proto.SearchResultProto;
+import com.google.android.icing.proto.SearchSpecProto;
+import com.google.android.icing.proto.SetSchemaResultProto;
+import com.google.android.icing.proto.StorageInfoResultProto;
+import com.google.android.icing.proto.SuggestionResponse;
+import com.google.android.icing.proto.SuggestionSpecProto;
+import com.google.android.icing.proto.UsageReport;
+import java.io.Closeable;
+
+/** A common user-facing interface to expose the functionality provided by the Icing Library. */
+public interface IcingSearchEngineInterface extends Closeable {
+ /**
+ * Initializes the current IcingSearchEngine implementation.
+ *
+ * <p>Internally the icing instance will be initialized.
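+   *
+   * <p>A typical call sequence against this interface, sketched with placeholder arguments:
+   *
+   * <pre>{@code
+   * engine.initialize();
+   * engine.setSchema(schema);
+   * engine.put(document);
+   * SearchResultProto results = engine.search(searchSpec, scoringSpec, resultSpec);
+   * engine.close();
+   * }</pre>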
+ */
+ InitializeResultProto initialize();
+
+ /** Sets the schema for the icing instance. */
+ SetSchemaResultProto setSchema(SchemaProto schema);
+
+ /**
+ * Sets the schema for the icing instance.
+ *
+   * @param ignoreErrorsAndDeleteDocuments if true, forces the schema to be set even when it is
+   *     incompatible, deleting any documents that the change invalidates.
+ */
+ SetSchemaResultProto setSchema(SchemaProto schema, boolean ignoreErrorsAndDeleteDocuments);
+
+ /** Gets the schema for the icing instance. */
+ GetSchemaResultProto getSchema();
+
+ /**
+ * Gets the schema for the icing instance.
+ *
+ * @param schemaType type of the schema.
+ */
+ GetSchemaTypeResultProto getSchemaType(String schemaType);
+
+ /** Puts the document. */
+ PutResultProto put(DocumentProto document);
+
+ /**
+ * Gets the document.
+ *
+ * @param namespace namespace of the document.
+ * @param uri uri of the document.
+ * @param getResultSpec the spec for getting the document.
+ */
+ GetResultProto get(String namespace, String uri, GetResultSpecProto getResultSpec);
+
+ /** Reports usage. */
+ ReportUsageResultProto reportUsage(UsageReport usageReport);
+
+ /** Gets all namespaces. */
+ GetAllNamespacesResultProto getAllNamespaces();
+
+ /**
+ * Searches over the documents.
+ *
+   * <p>Further pages of documents are retrieved through subsequent {@link #getNextPage} calls,
+   * using the next-page token carried in the returned {@link SearchResultProto}.
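+   *
+   * <p>A pagination sketch ({@code engine}, {@code searchSpec}, {@code scoringSpec} and
+   * {@code resultSpec} are placeholders):
+   *
+   * <pre>{@code
+   * SearchResultProto page = engine.search(searchSpec, scoringSpec, resultSpec);
+   * while (page.getResultsCount() > 0) {
+   *   // ... consume page.getResultsList() ...
+   *   page = engine.getNextPage(page.getNextPageToken());
+   * }
+   * }</pre>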
+ */
+ SearchResultProto search(
+ SearchSpecProto searchSpec, ScoringSpecProto scoringSpec, ResultSpecProto resultSpec);
+
+ /** Gets the next page. */
+ SearchResultProto getNextPage(long nextPageToken);
+
+ /** Invalidates the next page token. */
+ void invalidateNextPageToken(long nextPageToken);
+
+ /**
+ * Deletes the document.
+ *
+   * @param namespace the namespace the document to be deleted belongs to.
+ * @param uri the uri for the document to be deleted.
+ */
+ DeleteResultProto delete(String namespace, String uri);
+
+ /** Returns the suggestions for the search query. */
+ SuggestionResponse searchSuggestions(SuggestionSpecProto suggestionSpec);
+
+ /** Deletes documents by the namespace. */
+ DeleteByNamespaceResultProto deleteByNamespace(String namespace);
+
+ /** Deletes documents by the schema type. */
+ DeleteBySchemaTypeResultProto deleteBySchemaType(String schemaType);
+
+ /** Deletes documents by the search query. */
+ DeleteByQueryResultProto deleteByQuery(SearchSpecProto searchSpec);
+
+ /**
+   * Deletes documents by the search query.
+ *
+ * @param returnDeletedDocumentInfo whether additional information about deleted documents will be
+ * included in {@link DeleteByQueryResultProto}.
+ */
+ DeleteByQueryResultProto deleteByQuery(
+ SearchSpecProto searchSpec, boolean returnDeletedDocumentInfo);
+
+  /** Makes sure every update/delete received up to this point is flushed to disk. */
+ PersistToDiskResultProto persistToDisk(PersistType.Code persistTypeCode);
+
+ /** Makes the icing instance run tasks that are too expensive to be run in real-time. */
+ OptimizeResultProto optimize();
+
+ /** Gets information about the optimization. */
+ GetOptimizeInfoResultProto getOptimizeInfo();
+
+ /** Gets information about the storage. */
+ StorageInfoResultProto getStorageInfo();
+
+ /** Gets the debug information for the current icing instance. */
+ DebugInfoResultProto getDebugInfo(DebugInfoVerbosity.Code verbosity);
+
+ /** Clears all data from the current icing instance, and reinitializes it. */
+ ResetResultProto reset();
+
+ /** Closes the current icing instance. */
+ @Override
+ void close();
+}
diff --git a/java/src/com/google/android/icing/IcingSearchEngineUtils.java b/java/src/com/google/android/icing/IcingSearchEngineUtils.java
new file mode 100644
index 0000000..0913216
--- /dev/null
+++ b/java/src/com/google/android/icing/IcingSearchEngineUtils.java
@@ -0,0 +1,471 @@
+// Copyright (C) 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package com.google.android.icing;
+
+import android.util.Log;
+import androidx.annotation.NonNull;
+import androidx.annotation.Nullable;
+import com.google.android.icing.proto.DebugInfoResultProto;
+import com.google.android.icing.proto.DeleteByNamespaceResultProto;
+import com.google.android.icing.proto.DeleteByQueryResultProto;
+import com.google.android.icing.proto.DeleteBySchemaTypeResultProto;
+import com.google.android.icing.proto.DeleteResultProto;
+import com.google.android.icing.proto.GetAllNamespacesResultProto;
+import com.google.android.icing.proto.GetOptimizeInfoResultProto;
+import com.google.android.icing.proto.GetResultProto;
+import com.google.android.icing.proto.GetSchemaResultProto;
+import com.google.android.icing.proto.GetSchemaTypeResultProto;
+import com.google.android.icing.proto.InitializeResultProto;
+import com.google.android.icing.proto.OptimizeResultProto;
+import com.google.android.icing.proto.PersistToDiskResultProto;
+import com.google.android.icing.proto.PutResultProto;
+import com.google.android.icing.proto.ReportUsageResultProto;
+import com.google.android.icing.proto.ResetResultProto;
+import com.google.android.icing.proto.SearchResultProto;
+import com.google.android.icing.proto.SetSchemaResultProto;
+import com.google.android.icing.proto.StatusProto;
+import com.google.android.icing.proto.StorageInfoResultProto;
+import com.google.android.icing.proto.SuggestionResponse;
+import com.google.protobuf.ExtensionRegistryLite;
+import com.google.protobuf.InvalidProtocolBufferException;
+
+/**
+ * Contains utility methods for IcingSearchEngine to convert byte arrays to the corresponding
+ * protos.
+ *
+ * <p>It is also used by the AppSearch dynamite 0p client APIs to convert byte arrays to the
+ * protos.
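+ *
+ * <p>A usage sketch; every converter returns a proto whose status code is INTERNAL when handed
+ * null or unparseable bytes:
+ *
+ * <pre>{@code
+ * byte[] bytes = icingSearchEngineImpl.initialize();
+ * InitializeResultProto result = IcingSearchEngineUtils.byteArrayToInitializeResultProto(bytes);
+ * }</pre>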
+ */
+public final class IcingSearchEngineUtils {
+ private static final String TAG = "IcingSearchEngineUtils";
+ private static final ExtensionRegistryLite EXTENSION_REGISTRY_LITE =
+ ExtensionRegistryLite.getEmptyRegistry();
+
+ private IcingSearchEngineUtils() {}
+
+ // TODO(b/240333360) Check to see if we can use one template function to replace those
+ @NonNull
+ public static InitializeResultProto byteArrayToInitializeResultProto(
+ @Nullable byte[] initializeResultBytes) {
+ if (initializeResultBytes == null) {
+ Log.e(TAG, "Received null InitializeResult from native.");
+ return InitializeResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return InitializeResultProto.parseFrom(initializeResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing InitializeResultProto.", e);
+ return InitializeResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static SetSchemaResultProto byteArrayToSetSchemaResultProto(
+ @Nullable byte[] setSchemaResultBytes) {
+ if (setSchemaResultBytes == null) {
+ Log.e(TAG, "Received null SetSchemaResultProto from native.");
+ return SetSchemaResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return SetSchemaResultProto.parseFrom(setSchemaResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing SetSchemaResultProto.", e);
+ return SetSchemaResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static GetSchemaResultProto byteArrayToGetSchemaResultProto(
+ @Nullable byte[] getSchemaResultBytes) {
+ if (getSchemaResultBytes == null) {
+ Log.e(TAG, "Received null GetSchemaResultProto from native.");
+ return GetSchemaResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return GetSchemaResultProto.parseFrom(getSchemaResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing GetSchemaResultProto.", e);
+ return GetSchemaResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static GetSchemaTypeResultProto byteArrayToGetSchemaTypeResultProto(
+ @Nullable byte[] getSchemaTypeResultBytes) {
+ if (getSchemaTypeResultBytes == null) {
+ Log.e(TAG, "Received null GetSchemaTypeResultProto from native.");
+ return GetSchemaTypeResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return GetSchemaTypeResultProto.parseFrom(getSchemaTypeResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing GetSchemaTypeResultProto.", e);
+ return GetSchemaTypeResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static PutResultProto byteArrayToPutResultProto(@Nullable byte[] putResultBytes) {
+ if (putResultBytes == null) {
+ Log.e(TAG, "Received null PutResultProto from native.");
+ return PutResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return PutResultProto.parseFrom(putResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing PutResultProto.", e);
+ return PutResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static GetResultProto byteArrayToGetResultProto(@Nullable byte[] getResultBytes) {
+ if (getResultBytes == null) {
+ Log.e(TAG, "Received null GetResultProto from native.");
+ return GetResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return GetResultProto.parseFrom(getResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing GetResultProto.", e);
+ return GetResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static ReportUsageResultProto byteArrayToReportUsageResultProto(
+ @Nullable byte[] reportUsageResultBytes) {
+ if (reportUsageResultBytes == null) {
+ Log.e(TAG, "Received null ReportUsageResultProto from native.");
+ return ReportUsageResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return ReportUsageResultProto.parseFrom(reportUsageResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing ReportUsageResultProto.", e);
+ return ReportUsageResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static GetAllNamespacesResultProto byteArrayToGetAllNamespacesResultProto(
+ @Nullable byte[] getAllNamespacesResultBytes) {
+ if (getAllNamespacesResultBytes == null) {
+ Log.e(TAG, "Received null GetAllNamespacesResultProto from native.");
+ return GetAllNamespacesResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return GetAllNamespacesResultProto.parseFrom(
+ getAllNamespacesResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing GetAllNamespacesResultProto.", e);
+ return GetAllNamespacesResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static SearchResultProto byteArrayToSearchResultProto(@Nullable byte[] searchResultBytes) {
+ if (searchResultBytes == null) {
+ Log.e(TAG, "Received null SearchResultProto from native.");
+ return SearchResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ SearchResultProto.Builder searchResultProtoBuilder =
+ SearchResultProto.newBuilder().mergeFrom(searchResultBytes, EXTENSION_REGISTRY_LITE);
+ setNativeToJavaJniLatency(searchResultProtoBuilder);
+ return searchResultProtoBuilder.build();
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing SearchResultProto.", e);
+ return SearchResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ private static void setNativeToJavaJniLatency(
+ SearchResultProto.Builder searchResultProtoBuilder) {
+ int nativeToJavaLatencyMs =
+ (int)
+ (System.currentTimeMillis()
+ - searchResultProtoBuilder.getQueryStats().getNativeToJavaStartTimestampMs());
+ searchResultProtoBuilder.setQueryStats(
+ searchResultProtoBuilder.getQueryStats().toBuilder()
+ .setNativeToJavaJniLatencyMs(nativeToJavaLatencyMs));
+ }
+
+ @NonNull
+ public static DeleteResultProto byteArrayToDeleteResultProto(@Nullable byte[] deleteResultBytes) {
+ if (deleteResultBytes == null) {
+ Log.e(TAG, "Received null DeleteResultProto from native.");
+ return DeleteResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return DeleteResultProto.parseFrom(deleteResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing DeleteResultProto.", e);
+ return DeleteResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static SuggestionResponse byteArrayToSuggestionResponse(
+ @Nullable byte[] suggestionResponseBytes) {
+ if (suggestionResponseBytes == null) {
+ Log.e(TAG, "Received null suggestionResponseBytes from native.");
+ return SuggestionResponse.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return SuggestionResponse.parseFrom(suggestionResponseBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing suggestionResponseBytes.", e);
+ return SuggestionResponse.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static DeleteByNamespaceResultProto byteArrayToDeleteByNamespaceResultProto(
+ @Nullable byte[] deleteByNamespaceResultBytes) {
+ if (deleteByNamespaceResultBytes == null) {
+ Log.e(TAG, "Received null DeleteByNamespaceResultProto from native.");
+ return DeleteByNamespaceResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return DeleteByNamespaceResultProto.parseFrom(
+ deleteByNamespaceResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing DeleteByNamespaceResultProto.", e);
+ return DeleteByNamespaceResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static DeleteBySchemaTypeResultProto byteArrayToDeleteBySchemaTypeResultProto(
+ @Nullable byte[] deleteBySchemaTypeResultBytes) {
+ if (deleteBySchemaTypeResultBytes == null) {
+ Log.e(TAG, "Received null DeleteBySchemaTypeResultProto from native.");
+ return DeleteBySchemaTypeResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return DeleteBySchemaTypeResultProto.parseFrom(
+ deleteBySchemaTypeResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing DeleteBySchemaTypeResultProto.", e);
+ return DeleteBySchemaTypeResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static DeleteByQueryResultProto byteArrayToDeleteByQueryResultProto(
+ @Nullable byte[] deleteResultBytes) {
+ if (deleteResultBytes == null) {
+ Log.e(TAG, "Received null DeleteResultProto from native.");
+ return DeleteByQueryResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return DeleteByQueryResultProto.parseFrom(deleteResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing DeleteResultProto.", e);
+ return DeleteByQueryResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static PersistToDiskResultProto byteArrayToPersistToDiskResultProto(
+ @Nullable byte[] persistToDiskResultBytes) {
+ if (persistToDiskResultBytes == null) {
+ Log.e(TAG, "Received null PersistToDiskResultProto from native.");
+ return PersistToDiskResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return PersistToDiskResultProto.parseFrom(persistToDiskResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing PersistToDiskResultProto.", e);
+ return PersistToDiskResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static OptimizeResultProto byteArrayToOptimizeResultProto(
+ @Nullable byte[] optimizeResultBytes) {
+ if (optimizeResultBytes == null) {
+ Log.e(TAG, "Received null OptimizeResultProto from native.");
+ return OptimizeResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return OptimizeResultProto.parseFrom(optimizeResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing OptimizeResultProto.", e);
+ return OptimizeResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static GetOptimizeInfoResultProto byteArrayToGetOptimizeInfoResultProto(
+ @Nullable byte[] getOptimizeInfoResultBytes) {
+ if (getOptimizeInfoResultBytes == null) {
+ Log.e(TAG, "Received null GetOptimizeInfoResultProto from native.");
+ return GetOptimizeInfoResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return GetOptimizeInfoResultProto.parseFrom(
+ getOptimizeInfoResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing GetOptimizeInfoResultProto.", e);
+ return GetOptimizeInfoResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static StorageInfoResultProto byteArrayToStorageInfoResultProto(
+ @Nullable byte[] storageInfoResultProtoBytes) {
+ if (storageInfoResultProtoBytes == null) {
+ Log.e(TAG, "Received null StorageInfoResultProto from native.");
+ return StorageInfoResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return StorageInfoResultProto.parseFrom(storageInfoResultProtoBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing GetOptimizeInfoResultProto.", e);
+ return StorageInfoResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static DebugInfoResultProto byteArrayToDebugInfoResultProto(
+ @Nullable byte[] debugInfoResultProtoBytes) {
+ if (debugInfoResultProtoBytes == null) {
+ Log.e(TAG, "Received null DebugInfoResultProto from native.");
+ return DebugInfoResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return DebugInfoResultProto.parseFrom(debugInfoResultProtoBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing DebugInfoResultProto.", e);
+ return DebugInfoResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+
+ @NonNull
+ public static ResetResultProto byteArrayToResetResultProto(@Nullable byte[] resetResultBytes) {
+ if (resetResultBytes == null) {
+ Log.e(TAG, "Received null ResetResultProto from native.");
+ return ResetResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+
+ try {
+ return ResetResultProto.parseFrom(resetResultBytes, EXTENSION_REGISTRY_LITE);
+ } catch (InvalidProtocolBufferException e) {
+ Log.e(TAG, "Error parsing ResetResultProto.", e);
+ return ResetResultProto.newBuilder()
+ .setStatus(StatusProto.newBuilder().setCode(StatusProto.Code.INTERNAL))
+ .build();
+ }
+ }
+}
diff --git a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
index 01a6050..1ed2d9a 100644
--- a/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
+++ b/java/tests/instrumentation/src/com/google/android/icing/IcingSearchEngineTest.java
@@ -15,25 +15,32 @@
package com.google.android.icing;
import static com.google.common.truth.Truth.assertThat;
+import static com.google.common.truth.Truth.assertWithMessage;
-import androidx.test.core.app.ApplicationProvider;
+import com.google.android.icing.IcingSearchEngine;
+import com.google.android.icing.proto.DebugInfoResultProto;
+import com.google.android.icing.proto.DebugInfoVerbosity;
import com.google.android.icing.proto.DeleteByNamespaceResultProto;
+import com.google.android.icing.proto.DeleteByQueryResultProto;
import com.google.android.icing.proto.DeleteBySchemaTypeResultProto;
import com.google.android.icing.proto.DeleteResultProto;
import com.google.android.icing.proto.DocumentProto;
+import com.google.android.icing.proto.GetAllNamespacesResultProto;
import com.google.android.icing.proto.GetOptimizeInfoResultProto;
import com.google.android.icing.proto.GetResultProto;
+import com.google.android.icing.proto.GetResultSpecProto;
import com.google.android.icing.proto.GetSchemaResultProto;
import com.google.android.icing.proto.GetSchemaTypeResultProto;
import com.google.android.icing.proto.IcingSearchEngineOptions;
-import com.google.android.icing.proto.IndexingConfig;
-import com.google.android.icing.proto.IndexingConfig.TokenizerType;
import com.google.android.icing.proto.InitializeResultProto;
+import com.google.android.icing.proto.LogSeverity;
import com.google.android.icing.proto.OptimizeResultProto;
import com.google.android.icing.proto.PersistToDiskResultProto;
+import com.google.android.icing.proto.PersistType;
import com.google.android.icing.proto.PropertyConfigProto;
import com.google.android.icing.proto.PropertyProto;
import com.google.android.icing.proto.PutResultProto;
+import com.google.android.icing.proto.ReportUsageResultProto;
import com.google.android.icing.proto.ResetResultProto;
import com.google.android.icing.proto.ResultSpecProto;
import com.google.android.icing.proto.SchemaProto;
@@ -42,11 +49,26 @@ import com.google.android.icing.proto.ScoringSpecProto;
import com.google.android.icing.proto.SearchResultProto;
import com.google.android.icing.proto.SearchSpecProto;
import com.google.android.icing.proto.SetSchemaResultProto;
+import com.google.android.icing.proto.SnippetMatchProto;
+import com.google.android.icing.proto.SnippetProto;
import com.google.android.icing.proto.StatusProto;
+import com.google.android.icing.proto.StorageInfoResultProto;
+import com.google.android.icing.proto.StringIndexingConfig;
+import com.google.android.icing.proto.StringIndexingConfig.TokenizerType;
+import com.google.android.icing.proto.SuggestionResponse;
+import com.google.android.icing.proto.SuggestionScoringSpecProto;
+import com.google.android.icing.proto.SuggestionSpecProto;
import com.google.android.icing.proto.TermMatchType;
-import com.google.android.icing.IcingSearchEngine;
+import com.google.android.icing.proto.TermMatchType.Code;
+import com.google.android.icing.proto.UsageReport;
+import java.io.File;
+import java.util.HashMap;
+import java.util.Map;
+import org.junit.After;
import org.junit.Before;
+import org.junit.Rule;
import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
@@ -56,10 +78,13 @@ import org.junit.runners.JUnit4;
*/
@RunWith(JUnit4.class)
public final class IcingSearchEngineTest {
+ @Rule public TemporaryFolder temporaryFolder = new TemporaryFolder();
private static final String EMAIL_TYPE = "Email";
- private String filesDir;
+ private File tempDir;
+
+ private IcingSearchEngine icingSearchEngine;
private static SchemaTypeConfigProto createEmailTypeConfig() {
return SchemaTypeConfigProto.newBuilder()
@@ -69,8 +94,8 @@ public final class IcingSearchEngineTest {
.setPropertyName("subject")
.setDataType(PropertyConfigProto.DataType.Code.STRING)
.setCardinality(PropertyConfigProto.Cardinality.Code.OPTIONAL)
- .setIndexingConfig(
- IndexingConfig.newBuilder()
+ .setStringIndexingConfig(
+ StringIndexingConfig.newBuilder()
.setTokenizerType(TokenizerType.Code.PLAIN)
.setTermMatchType(TermMatchType.Code.PREFIX)))
.addProperties(
@@ -78,8 +103,8 @@ public final class IcingSearchEngineTest {
.setPropertyName("body")
.setDataType(PropertyConfigProto.DataType.Code.STRING)
.setCardinality(PropertyConfigProto.Cardinality.Code.OPTIONAL)
- .setIndexingConfig(
- IndexingConfig.newBuilder()
+ .setStringIndexingConfig(
+ StringIndexingConfig.newBuilder()
.setTokenizerType(TokenizerType.Code.PLAIN)
.setTermMatchType(TermMatchType.Code.PREFIX)))
.build();
@@ -96,78 +121,74 @@ public final class IcingSearchEngineTest {
@Before
public void setUp() throws Exception {
- filesDir = ApplicationProvider.getApplicationContext().getFilesDir().getCanonicalPath();
+ tempDir = temporaryFolder.newFolder();
+ IcingSearchEngineOptions options =
+ IcingSearchEngineOptions.newBuilder().setBaseDir(tempDir.getCanonicalPath()).build();
+ icingSearchEngine = new IcingSearchEngine(options);
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ icingSearchEngine.close();
}
@Test
public void testInitialize() throws Exception {
- IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
- IcingSearchEngine icing = new IcingSearchEngine(options);
-
- InitializeResultProto initializeResultProto = icing.initialize();
- assertThat(initializeResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ InitializeResultProto initializeResultProto = icingSearchEngine.initialize();
+ assertStatusOk(initializeResultProto.getStatus());
}
@Test
public void testSetAndGetSchema() throws Exception {
- IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
- IcingSearchEngine icing = new IcingSearchEngine(options);
- assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
SetSchemaResultProto setSchemaResultProto =
- icing.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false);
- assertThat(setSchemaResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false);
+ assertStatusOk(setSchemaResultProto.getStatus());
- GetSchemaResultProto getSchemaResultProto = icing.getSchema();
- assertThat(getSchemaResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ GetSchemaResultProto getSchemaResultProto = icingSearchEngine.getSchema();
+ assertStatusOk(getSchemaResultProto.getStatus());
assertThat(getSchemaResultProto.getSchema()).isEqualTo(schema);
GetSchemaTypeResultProto getSchemaTypeResultProto =
- icing.getSchemaType(emailTypeConfig.getSchemaType());
- assertThat(getSchemaTypeResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ icingSearchEngine.getSchemaType(emailTypeConfig.getSchemaType());
+ assertStatusOk(getSchemaTypeResultProto.getStatus());
assertThat(getSchemaTypeResultProto.getSchemaTypeConfig()).isEqualTo(emailTypeConfig);
}
@Test
public void testPutAndGetDocuments() throws Exception {
- IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
- IcingSearchEngine icing = new IcingSearchEngine(options);
- assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
assertThat(
- icing
+ icingSearchEngine
.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
.getStatus()
.getCode())
.isEqualTo(StatusProto.Code.OK);
DocumentProto emailDocument = createEmailDocument("namespace", "uri");
- PutResultProto putResultProto = icing.put(emailDocument);
- assertThat(putResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ PutResultProto putResultProto = icingSearchEngine.put(emailDocument);
+ assertStatusOk(putResultProto.getStatus());
- GetResultProto getResultProto = icing.get("namespace", "uri");
- assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ GetResultProto getResultProto =
+ icingSearchEngine.get("namespace", "uri", GetResultSpecProto.getDefaultInstance());
+ assertStatusOk(getResultProto.getStatus());
assertThat(getResultProto.getDocument()).isEqualTo(emailDocument);
}
@Test
public void testSearch() throws Exception {
- IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
- IcingSearchEngine icing = new IcingSearchEngine(options);
- assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
assertThat(
- icing
+ icingSearchEngine
.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
.getStatus()
.getCode())
@@ -177,7 +198,7 @@ public final class IcingSearchEngineTest {
createEmailDocument("namespace", "uri").toBuilder()
.addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo"))
.build();
- assertThat(icing.put(emailDocument).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.put(emailDocument).getStatus());
SearchSpecProto searchSpec =
SearchSpecProto.newBuilder()
@@ -186,138 +207,595 @@ public final class IcingSearchEngineTest {
.build();
SearchResultProto searchResultProto =
- icing.search(
+ icingSearchEngine.search(
searchSpec,
ScoringSpecProto.getDefaultInstance(),
ResultSpecProto.getDefaultInstance());
- assertThat(searchResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(searchResultProto.getStatus());
assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
assertThat(searchResultProto.getResults(0).getDocument()).isEqualTo(emailDocument);
+
+ assertThat(searchResultProto.getQueryStats().hasNativeToJavaStartTimestampMs()).isTrue();
+ assertThat(searchResultProto.getQueryStats().hasNativeToJavaJniLatencyMs()).isTrue();
+ assertThat(searchResultProto.getQueryStats().hasJavaToNativeJniLatencyMs()).isTrue();
+ assertThat(searchResultProto.getQueryStats().getNativeToJavaStartTimestampMs())
+ .isGreaterThan(0);
+ assertThat(searchResultProto.getQueryStats().getNativeToJavaJniLatencyMs()).isAtLeast(0);
+ assertThat(searchResultProto.getQueryStats().getJavaToNativeJniLatencyMs()).isAtLeast(0);
+ }
+
+ @Test
+ public void testGetNextPage() throws Exception {
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
+ assertThat(
+ icingSearchEngine
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
+
+ Map<String, DocumentProto> documents = new HashMap<>();
+ for (int i = 0; i < 10; i++) {
+ DocumentProto emailDocument =
+ createEmailDocument("namespace", "uri:" + i).toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo"))
+ .build();
+ documents.put("uri:" + i, emailDocument);
+      PutResultProto putResultProto = icingSearchEngine.put(emailDocument);
+      assertWithMessage(putResultProto.getStatus().getMessage())
+          .that(putResultProto.getStatus().getCode())
+          .isEqualTo(StatusProto.Code.OK);
+ }
+
+ SearchSpecProto searchSpec =
+ SearchSpecProto.newBuilder()
+ .setQuery("foo")
+ .setTermMatchType(TermMatchType.Code.PREFIX)
+ .build();
+ ResultSpecProto resultSpecProto = ResultSpecProto.newBuilder().setNumPerPage(1).build();
+
+ SearchResultProto searchResultProto =
+ icingSearchEngine.search(
+ searchSpec, ScoringSpecProto.getDefaultInstance(), resultSpecProto);
+ assertStatusOk(searchResultProto.getStatus());
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+ DocumentProto resultDocument = searchResultProto.getResults(0).getDocument();
+ assertThat(resultDocument).isEqualTo(documents.remove(resultDocument.getUri()));
+
+ assertThat(searchResultProto.getQueryStats().hasNativeToJavaStartTimestampMs()).isTrue();
+ assertThat(searchResultProto.getQueryStats().hasNativeToJavaJniLatencyMs()).isTrue();
+ assertThat(searchResultProto.getQueryStats().hasJavaToNativeJniLatencyMs()).isTrue();
+ assertThat(searchResultProto.getQueryStats().getNativeToJavaStartTimestampMs())
+ .isGreaterThan(0);
+ assertThat(searchResultProto.getQueryStats().getNativeToJavaJniLatencyMs()).isAtLeast(0);
+ assertThat(searchResultProto.getQueryStats().getJavaToNativeJniLatencyMs()).isAtLeast(0);
+
+    // Fetch the remaining pages.
+ for (int i = 1; i < 5; i++) {
+ searchResultProto = icingSearchEngine.getNextPage(searchResultProto.getNextPageToken());
+ assertWithMessage(searchResultProto.getStatus().getMessage())
+ .that(searchResultProto.getStatus().getCode())
+ .isEqualTo(StatusProto.Code.OK);
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+ resultDocument = searchResultProto.getResults(0).getDocument();
+ assertThat(resultDocument).isEqualTo(documents.remove(resultDocument.getUri()));
+ }
+
+    // Invalidate the remaining results.
+ icingSearchEngine.invalidateNextPageToken(searchResultProto.getNextPageToken());
+
+ searchResultProto = icingSearchEngine.getNextPage(searchResultProto.getNextPageToken());
+ assertStatusOk(searchResultProto.getStatus());
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(0);
}
@Test
public void testDelete() throws Exception {
- IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
- IcingSearchEngine icing = new IcingSearchEngine(options);
- assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
assertThat(
- icing
+ icingSearchEngine
.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
.getStatus()
.getCode())
.isEqualTo(StatusProto.Code.OK);
DocumentProto emailDocument = createEmailDocument("namespace", "uri");
- assertThat(icing.put(emailDocument).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.put(emailDocument).getStatus());
- DeleteResultProto deleteResultProto = icing.delete("namespace", "uri");
- assertThat(deleteResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ DeleteResultProto deleteResultProto = icingSearchEngine.delete("namespace", "uri");
+ assertStatusOk(deleteResultProto.getStatus());
- GetResultProto getResultProto = icing.get("namespace", "uri");
+ GetResultProto getResultProto =
+ icingSearchEngine.get("namespace", "uri", GetResultSpecProto.getDefaultInstance());
assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
}
@Test
public void testDeleteByNamespace() throws Exception {
- IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
- IcingSearchEngine icing = new IcingSearchEngine(options);
- assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
assertThat(
- icing
+ icingSearchEngine
.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
.getStatus()
.getCode())
.isEqualTo(StatusProto.Code.OK);
DocumentProto emailDocument = createEmailDocument("namespace", "uri");
- assertThat(icing.put(emailDocument).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.put(emailDocument).getStatus());
DeleteByNamespaceResultProto deleteByNamespaceResultProto =
- icing.deleteByNamespace("namespace");
- assertThat(deleteByNamespaceResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ icingSearchEngine.deleteByNamespace("namespace");
+ assertStatusOk(deleteByNamespaceResultProto.getStatus());
- GetResultProto getResultProto = icing.get("namespace", "uri");
+ GetResultProto getResultProto =
+ icingSearchEngine.get("namespace", "uri", GetResultSpecProto.getDefaultInstance());
assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
}
@Test
public void testDeleteBySchemaType() throws Exception {
- IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
- IcingSearchEngine icing = new IcingSearchEngine(options);
- assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
assertThat(
- icing
+ icingSearchEngine
.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
.getStatus()
.getCode())
.isEqualTo(StatusProto.Code.OK);
DocumentProto emailDocument = createEmailDocument("namespace", "uri");
- assertThat(icing.put(emailDocument).getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.put(emailDocument).getStatus());
DeleteBySchemaTypeResultProto deleteBySchemaTypeResultProto =
- icing.deleteBySchemaType(EMAIL_TYPE);
- assertThat(deleteBySchemaTypeResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ icingSearchEngine.deleteBySchemaType(EMAIL_TYPE);
+ assertStatusOk(deleteBySchemaTypeResultProto.getStatus());
- GetResultProto getResultProto = icing.get("namespace", "uri");
+ GetResultProto getResultProto =
+ icingSearchEngine.get("namespace", "uri", GetResultSpecProto.getDefaultInstance());
assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
}
@Test
+ public void testDeleteByQuery() throws Exception {
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
+ assertThat(
+ icingSearchEngine
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
+
+ DocumentProto emailDocument1 =
+ createEmailDocument("namespace", "uri1").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo"))
+ .build();
+
+ assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus());
+ DocumentProto emailDocument2 =
+ createEmailDocument("namespace", "uri2").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("bar"))
+ .build();
+
+ assertStatusOk(icingSearchEngine.put(emailDocument2).getStatus());
+
+ SearchSpecProto searchSpec =
+ SearchSpecProto.newBuilder()
+ .setQuery("foo")
+ .setTermMatchType(TermMatchType.Code.PREFIX)
+ .build();
+
+ SearchResultProto searchResultProto =
+ icingSearchEngine.search(
+ searchSpec,
+ ScoringSpecProto.getDefaultInstance(),
+ ResultSpecProto.getDefaultInstance());
+ assertStatusOk(searchResultProto.getStatus());
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+ assertThat(searchResultProto.getResults(0).getDocument()).isEqualTo(emailDocument1);
+
+ DeleteByQueryResultProto deleteResultProto = icingSearchEngine.deleteByQuery(searchSpec);
+ assertStatusOk(deleteResultProto.getStatus());
+ // By default, the deleteByQuery API does not return a summary of the deleted documents unless
+ // the returnDeletedDocumentInfo parameter is set to true.
+ assertThat(deleteResultProto.getDeletedDocumentsList()).isEmpty();
+
+ GetResultProto getResultProto =
+ icingSearchEngine.get("namespace", "uri1", GetResultSpecProto.getDefaultInstance());
+ assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
+ getResultProto =
+ icingSearchEngine.get("namespace", "uri2", GetResultSpecProto.getDefaultInstance());
+ assertStatusOk(getResultProto.getStatus());
+ }
+
+ @Test
+ public void testDeleteByQueryWithDeletedDocumentInfo() throws Exception {
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
+ assertThat(
+ icingSearchEngine
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
+
+ DocumentProto emailDocument1 =
+ createEmailDocument("namespace", "uri1").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo"))
+ .build();
+
+ assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus());
+ DocumentProto emailDocument2 =
+ createEmailDocument("namespace", "uri2").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("bar"))
+ .build();
+
+ assertStatusOk(icingSearchEngine.put(emailDocument2).getStatus());
+
+ SearchSpecProto searchSpec =
+ SearchSpecProto.newBuilder()
+ .setQuery("foo")
+ .setTermMatchType(TermMatchType.Code.PREFIX)
+ .build();
+
+ DeleteByQueryResultProto deleteResultProto =
+ icingSearchEngine.deleteByQuery(searchSpec, /*returnDeletedDocumentInfo=*/ true);
+ assertStatusOk(deleteResultProto.getStatus());
+ DeleteByQueryResultProto.DocumentGroupInfo info =
+ DeleteByQueryResultProto.DocumentGroupInfo.newBuilder()
+ .setNamespace("namespace")
+ .setSchema("Email")
+ .addUris("uri1")
+ .build();
+ assertThat(deleteResultProto.getDeletedDocumentsList()).containsExactly(info);
+
+ GetResultProto getResultProto =
+ icingSearchEngine.get("namespace", "uri1", GetResultSpecProto.getDefaultInstance());
+ assertThat(getResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.NOT_FOUND);
+ getResultProto =
+ icingSearchEngine.get("namespace", "uri2", GetResultSpecProto.getDefaultInstance());
+ assertStatusOk(getResultProto.getStatus());
+ }
+
+ @Test
public void testPersistToDisk() throws Exception {
- IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
- IcingSearchEngine icing = new IcingSearchEngine(options);
- assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
- PersistToDiskResultProto persistToDiskResultProto = icing.persistToDisk();
- assertThat(persistToDiskResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ PersistToDiskResultProto persistToDiskResultProto =
+ icingSearchEngine.persistToDisk(PersistType.Code.LITE);
+ assertStatusOk(persistToDiskResultProto.getStatus());
}
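
Note: the test above exercises the cheaper LITE persist. As a hedged sketch of the other mode — assuming the FULL code that persist.proto defines alongside LITE, which is not shown in this diff — a caller about to shut down might flush everything durably instead:

  // Hedged sketch: PersistType.Code.FULL is assumed from persist.proto; it
  // trades higher latency for a full durable flush, e.g. before shutdown.
  PersistToDiskResultProto fullPersist =
      icingSearchEngine.persistToDisk(PersistType.Code.FULL);
  assertStatusOk(fullPersist.getStatus());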
@Test
public void testOptimize() throws Exception {
- IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
- IcingSearchEngine icing = new IcingSearchEngine(options);
- assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
- OptimizeResultProto optimizeResultProto = icing.optimize();
- assertThat(optimizeResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ OptimizeResultProto optimizeResultProto = icingSearchEngine.optimize();
+ assertStatusOk(optimizeResultProto.getStatus());
}
@Test
public void testGetOptimizeInfo() throws Exception {
- IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
- IcingSearchEngine icing = new IcingSearchEngine(options);
- assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
- GetOptimizeInfoResultProto getOptimizeInfoResultProto = icing.getOptimizeInfo();
- assertThat(getOptimizeInfoResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ GetOptimizeInfoResultProto getOptimizeInfoResultProto = icingSearchEngine.getOptimizeInfo();
+ assertStatusOk(getOptimizeInfoResultProto.getStatus());
assertThat(getOptimizeInfoResultProto.getOptimizableDocs()).isEqualTo(0);
assertThat(getOptimizeInfoResultProto.getEstimatedOptimizableBytes()).isEqualTo(0);
}
@Test
+ public void testGetStorageInfo() throws Exception {
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ StorageInfoResultProto storageInfoResultProto = icingSearchEngine.getStorageInfo();
+ assertStatusOk(storageInfoResultProto.getStatus());
+ }
+
+ @Test
+ public void testGetDebugInfo() throws Exception {
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
+ assertThat(
+ icingSearchEngine
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
+
+ DocumentProto emailDocument = createEmailDocument("namespace", "uri");
+ assertStatusOk(icingSearchEngine.put(emailDocument).getStatus());
+
+ DebugInfoResultProto debugInfoResultProtoBasic =
+ icingSearchEngine.getDebugInfo(DebugInfoVerbosity.Code.BASIC);
+ assertStatusOk(debugInfoResultProtoBasic.getStatus());
+ assertThat(debugInfoResultProtoBasic.getDebugInfo().getDocumentInfo().getCorpusInfoList())
+ .isEmpty(); // because verbosity=BASIC
+
+ DebugInfoResultProto debugInfoResultProtoDetailed =
+ icingSearchEngine.getDebugInfo(DebugInfoVerbosity.Code.DETAILED);
+ assertStatusOk(debugInfoResultProtoDetailed.getStatus());
+ assertThat(debugInfoResultProtoDetailed.getDebugInfo().getDocumentInfo().getCorpusInfoList())
+ .hasSize(1); // because verbosity=DETAILED
+ }
+
+ @Test
+ public void testGetAllNamespaces() throws Exception {
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
+ assertThat(
+ icingSearchEngine
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
+
+ DocumentProto emailDocument = createEmailDocument("namespace", "uri");
+ assertStatusOk(icingSearchEngine.put(emailDocument).getStatus());
+
+ GetAllNamespacesResultProto getAllNamespacesResultProto = icingSearchEngine.getAllNamespaces();
+ assertStatusOk(getAllNamespacesResultProto.getStatus());
+ assertThat(getAllNamespacesResultProto.getNamespacesList()).containsExactly("namespace");
+ }
+
+ @Test
public void testReset() throws Exception {
- IcingSearchEngineOptions options =
- IcingSearchEngineOptions.newBuilder().setBaseDir(filesDir).build();
- IcingSearchEngine icing = new IcingSearchEngine(options);
- assertThat(icing.initialize().getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ ResetResultProto resetResultProto = icingSearchEngine.reset();
+ assertStatusOk(resetResultProto.getStatus());
+ }
+
+ @Test
+ public void testReportUsage() throws Exception {
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ // Set schema and put a document.
+ SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
+ assertThat(
+ icingSearchEngine
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
+
+ DocumentProto emailDocument = createEmailDocument("namespace", "uri");
+ PutResultProto putResultProto = icingSearchEngine.put(emailDocument);
+ assertStatusOk(putResultProto.getStatus());
+
+ // Report usage
+ UsageReport usageReport =
+ UsageReport.newBuilder()
+ .setDocumentNamespace("namespace")
+ .setDocumentUri("uri")
+ .setUsageTimestampMs(1)
+ .setUsageType(UsageReport.UsageType.USAGE_TYPE1)
+ .build();
+ ReportUsageResultProto reportUsageResultProto = icingSearchEngine.reportUsage(usageReport);
+ assertStatusOk(reportUsageResultProto.getStatus());
+ }
+
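Note: reported usage only pays off at query time. A minimal follow-on sketch, assuming the USAGE_TYPE1_COUNT ranking strategy from scoring.proto (not part of this diff) and standard protoc-generated setter names, would rank results by how often they were reported used:

  // Hedged sketch: ScoringSpecProto.RankingStrategy.Code.USAGE_TYPE1_COUNT is
  // assumed from scoring.proto; the empty query is illustrative only.
  ScoringSpecProto scoringSpec =
      ScoringSpecProto.newBuilder()
          .setRankBy(ScoringSpecProto.RankingStrategy.Code.USAGE_TYPE1_COUNT)
          .build();
  SearchResultProto results =
      icingSearchEngine.search(
          SearchSpecProto.newBuilder()
              .setQuery("")
              .setTermMatchType(TermMatchType.Code.PREFIX)
              .build(),
          scoringSpec,
          ResultSpecProto.getDefaultInstance());
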
+ @Test
+ public void testCJKTSnippets() throws Exception {
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(createEmailTypeConfig()).build();
+ assertStatusOk(
+ icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus());
+
+ // String: "天是蓝的"
+ // ^ ^^ ^
+ // UTF16 idx: 0 1 2 3
+ // Breaks into segments: "天", "是", "蓝", "的"
+ // "The sky is blue"
+ String chinese = "天是蓝的";
+ assertThat(chinese.length()).isEqualTo(4);
+ DocumentProto emailDocument1 =
+ createEmailDocument("namespace", "uri1").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(chinese))
+ .build();
+ assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus());
+
+ // Search and request snippet matching but no windowing.
+ SearchSpecProto searchSpec =
+ SearchSpecProto.newBuilder()
+ .setQuery("是")
+ .setTermMatchType(TermMatchType.Code.PREFIX)
+ .build();
+ ResultSpecProto resultSpecProto =
+ ResultSpecProto.newBuilder()
+ .setSnippetSpec(
+ ResultSpecProto.SnippetSpecProto.newBuilder()
+ .setNumToSnippet(Integer.MAX_VALUE)
+ .setNumMatchesPerProperty(Integer.MAX_VALUE))
+ .build();
+
+ // Search and make sure that we got a single successful result
+ SearchResultProto searchResultProto =
+ icingSearchEngine.search(
+ searchSpec, ScoringSpecProto.getDefaultInstance(), resultSpecProto);
+ assertStatusOk(searchResultProto.getStatus());
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+
+ // Ensure that one and only one property was matched and it was "subject"
+ SnippetProto snippetProto = searchResultProto.getResults(0).getSnippet();
+ assertThat(snippetProto.getEntriesList()).hasSize(1);
+ SnippetProto.EntryProto entryProto = snippetProto.getEntries(0);
+ assertThat(entryProto.getPropertyName()).isEqualTo("subject");
+
+ // Get the content for "subject" and see what the match is.
+ DocumentProto resultDocument = searchResultProto.getResults(0).getDocument();
+ assertThat(resultDocument.getPropertiesList()).hasSize(1);
+ PropertyProto subjectProperty = resultDocument.getProperties(0);
+ assertThat(subjectProperty.getName()).isEqualTo("subject");
+ assertThat(subjectProperty.getStringValuesList()).hasSize(1);
+ String content = subjectProperty.getStringValues(0);
+
+ // Ensure that there is one and only one match within "subject"
+ assertThat(entryProto.getSnippetMatchesList()).hasSize(1);
+ SnippetMatchProto matchProto = entryProto.getSnippetMatches(0);
+
+ int matchStart = matchProto.getExactMatchUtf16Position();
+ int matchEnd = matchStart + matchProto.getExactMatchUtf16Length();
+ assertThat(matchStart).isEqualTo(1);
+ assertThat(matchEnd).isEqualTo(2);
+ String match = content.substring(matchStart, matchEnd);
+ assertThat(match).isEqualTo("是");
+ }
+
+ @Test
+ public void testUtf16MultiByteSnippets() throws Exception {
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(createEmailTypeConfig()).build();
+ assertStatusOk(
+ icingSearchEngine.setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false).getStatus());
+
+ // String: "𐀀𐀁 𐀂𐀃 𐀄"
+ // ^ ^ ^
+ // UTF16 idx: 0 5 10
+ // Breaks into segments: "𐀀𐀁", "𐀂𐀃", "𐀄"
+ String text = "𐀀𐀁 𐀂𐀃 𐀄";
+ assertThat(text.length()).isEqualTo(12);
+ DocumentProto emailDocument1 =
+ createEmailDocument("namespace", "uri1").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues(text))
+ .build();
+ assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus());
+
+ // Search and request snippet matching but no windowing.
+ SearchSpecProto searchSpec =
+ SearchSpecProto.newBuilder()
+ .setQuery("𐀂")
+ .setTermMatchType(TermMatchType.Code.PREFIX)
+ .build();
+ ResultSpecProto resultSpecProto =
+ ResultSpecProto.newBuilder()
+ .setSnippetSpec(
+ ResultSpecProto.SnippetSpecProto.newBuilder()
+ .setNumToSnippet(Integer.MAX_VALUE)
+ .setNumMatchesPerProperty(Integer.MAX_VALUE))
+ .build();
+
+ // Search and make sure that we got a single successful result
+ SearchResultProto searchResultProto =
+ icingSearchEngine.search(
+ searchSpec, ScoringSpecProto.getDefaultInstance(), resultSpecProto);
+ assertStatusOk(searchResultProto.getStatus());
+ assertThat(searchResultProto.getResultsCount()).isEqualTo(1);
+
+ // Ensure that one and only one property was matched and it was "subject"
+ SnippetProto snippetProto = searchResultProto.getResults(0).getSnippet();
+ assertThat(snippetProto.getEntriesList()).hasSize(1);
+ SnippetProto.EntryProto entryProto = snippetProto.getEntries(0);
+ assertThat(entryProto.getPropertyName()).isEqualTo("subject");
+
+ // Get the content for "subject" and see what the match is.
+ DocumentProto resultDocument = searchResultProto.getResults(0).getDocument();
+ assertThat(resultDocument.getPropertiesList()).hasSize(1);
+ PropertyProto subjectProperty = resultDocument.getProperties(0);
+ assertThat(subjectProperty.getName()).isEqualTo("subject");
+ assertThat(subjectProperty.getStringValuesList()).hasSize(1);
+ String content = subjectProperty.getStringValues(0);
+
+ // Ensure that there is one and only one match within "subject"
+ assertThat(entryProto.getSnippetMatchesList()).hasSize(1);
+ SnippetMatchProto matchProto = entryProto.getSnippetMatches(0);
+
+ int matchStart = matchProto.getExactMatchUtf16Position();
+ int matchEnd = matchStart + matchProto.getExactMatchUtf16Length();
+ assertThat(matchStart).isEqualTo(5);
+ assertThat(matchEnd).isEqualTo(9);
+ String match = content.substring(matchStart, matchEnd);
+ assertThat(match).isEqualTo("𐀂𐀃");
+ }
+
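Note: the arithmetic in the two snippet tests above hinges on Java strings being UTF-16: each character in the 𐀀 range is a surrogate pair, so the positions and lengths from SnippetMatchProto can be passed straight to String.substring(). A small, self-contained illustration of why length() is 12 here rather than 7:

  String text = "𐀀𐀁 𐀂𐀃 𐀄";
  // Five supplementary characters (two UTF-16 code units each) plus two spaces.
  int utf16Units = text.length();                         // 12
  int codePoints = text.codePointCount(0, text.length()); // 7
  // getExactMatchUtf16Position()/getExactMatchUtf16Length() are in code
  // units, so substring() (also code-unit based) needs no conversion.
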
+ @Test
+ public void testSearchSuggestions() {
+ assertStatusOk(icingSearchEngine.initialize().getStatus());
+
+ SchemaTypeConfigProto emailTypeConfig = createEmailTypeConfig();
+ SchemaProto schema = SchemaProto.newBuilder().addTypes(emailTypeConfig).build();
+ assertThat(
+ icingSearchEngine
+ .setSchema(schema, /*ignoreErrorsAndDeleteDocuments=*/ false)
+ .getStatus()
+ .getCode())
+ .isEqualTo(StatusProto.Code.OK);
+
+ DocumentProto emailDocument1 =
+ createEmailDocument("namespace", "uri1").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("fo"))
+ .build();
+ DocumentProto emailDocument2 =
+ createEmailDocument("namespace", "uri2").toBuilder()
+ .addProperties(PropertyProto.newBuilder().setName("subject").addStringValues("foo"))
+ .build();
+ assertStatusOk(icingSearchEngine.put(emailDocument1).getStatus());
+ assertStatusOk(icingSearchEngine.put(emailDocument2).getStatus());
+
+ SuggestionSpecProto suggestionSpec =
+ SuggestionSpecProto.newBuilder()
+ .setPrefix("f")
+ .setNumToReturn(10)
+ .setScoringSpec(
+ SuggestionScoringSpecProto.newBuilder()
+ .setScoringMatchType(Code.EXACT_ONLY)
+ .build())
+ .build();
+
+ SuggestionResponse response = icingSearchEngine.searchSuggestions(suggestionSpec);
+ assertStatusOk(response.getStatus());
+ assertThat(response.getSuggestionsList()).hasSize(2);
+ assertThat(response.getSuggestions(0).getQuery()).isEqualTo("foo");
+ assertThat(response.getSuggestions(1).getQuery()).isEqualTo("fo");
+ }
+
+ @Test
+ public void testLogging() throws Exception {
+ // Set to INFO
+ assertThat(IcingSearchEngine.setLoggingLevel(LogSeverity.Code.INFO)).isTrue();
+ assertThat(IcingSearchEngine.shouldLog(LogSeverity.Code.INFO)).isTrue();
+ assertThat(IcingSearchEngine.shouldLog(LogSeverity.Code.DBG)).isFalse();
+
+ // Set to WARNING
+ assertThat(IcingSearchEngine.setLoggingLevel(LogSeverity.Code.WARNING)).isTrue();
+ assertThat(IcingSearchEngine.shouldLog(LogSeverity.Code.WARNING)).isTrue();
+ assertThat(IcingSearchEngine.shouldLog(LogSeverity.Code.INFO)).isFalse();
+
+ // Set to DEBUG
+ assertThat(IcingSearchEngine.setLoggingLevel(LogSeverity.Code.DBG)).isTrue();
+ assertThat(IcingSearchEngine.shouldLog(LogSeverity.Code.DBG)).isTrue();
+ assertThat(IcingSearchEngine.shouldLog(LogSeverity.Code.VERBOSE)).isFalse();
+
+ // Set to VERBOSE
+ assertThat(IcingSearchEngine.setLoggingLevel(LogSeverity.Code.VERBOSE, (short) 1)).isTrue();
+ assertThat(IcingSearchEngine.shouldLog(LogSeverity.Code.VERBOSE, (short) 1)).isTrue();
+ assertThat(IcingSearchEngine.shouldLog(LogSeverity.Code.VERBOSE, (short) 2)).isFalse();
+
+ assertThat(IcingSearchEngine.getLoggingTag()).isNotEmpty();
+ }
- ResetResultProto resetResultProto = icing.reset();
- assertThat(resetResultProto.getStatus().getCode()).isEqualTo(StatusProto.Code.OK);
+ private static void assertStatusOk(StatusProto status) {
+ assertWithMessage(status.getMessage()).that(status.getCode()).isEqualTo(StatusProto.Code.OK);
}
}
diff --git a/lint-baseline.xml b/lint-baseline.xml
new file mode 100644
index 0000000..5d2b935
--- /dev/null
+++ b/lint-baseline.xml
@@ -0,0 +1,487 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<issues format="6" by="lint 8.1.0-beta02" type="baseline" client="gradle" dependencies="false" name="AGP (8.1.0-beta02)" variant="all" version="8.1.0-beta02">
+
+ <issue
+ id="KotlinPropertyAccess"
+ message="The getter return type (`GetSchemaResultProto`) and setter parameter type (`SchemaProto`) getter and setter methods for property `schema` should have exactly the same type to allow be accessed as a property from Kotlin; see https://android.github.io/kotlin-guides/interop.html#property-prefixes"
+ errorLine1=" public GetSchemaResultProto getSchema() {"
+ errorLine2=" ~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngine.java"/>
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngine.java"
+ message="Setter here"/>
+ </issue>
+
+ <issue
+ id="KotlinPropertyAccess"
+ message="The getter return type (`GetSchemaResultProto`) and setter parameter type (`SchemaProto`) getter and setter methods for property `schema` should have exactly the same type to allow be accessed as a property from Kotlin; see https://android.github.io/kotlin-guides/interop.html#property-prefixes"
+ errorLine1=" GetSchemaResultProto getSchema();"
+ errorLine2=" ~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"
+ message="Setter here"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" public BreakIteratorBatcher(Locale locale) {"
+ errorLine2=" ~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/BreakIteratorBatcher.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" public void setText(String text) {"
+ errorLine2=" ~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/BreakIteratorBatcher.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" public int[] next(int batchSize) {"
+ errorLine2=" ~~~~~">
+ <location
+ file="java/src/com/google/android/icing/BreakIteratorBatcher.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" public DebugInfoResultProto getDebugInfo(DebugInfoVerbosity.Code verbosity) {"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngine.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" public static boolean shouldLog(LogSeverity.Code severity) {"
+ errorLine2=" ~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngine.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" public static boolean shouldLog(LogSeverity.Code severity, short verbosity) {"
+ errorLine2=" ~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngine.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" public static boolean setLoggingLevel(LogSeverity.Code severity) {"
+ errorLine2=" ~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngine.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" public static boolean setLoggingLevel(LogSeverity.Code severity, short verbosity) {"
+ errorLine2=" ~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngine.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" InitializeResultProto initialize();"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" SetSchemaResultProto setSchema(SchemaProto schema);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" SetSchemaResultProto setSchema(SchemaProto schema);"
+ errorLine2=" ~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" SetSchemaResultProto setSchema(SchemaProto schema, boolean ignoreErrorsAndDeleteDocuments);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" SetSchemaResultProto setSchema(SchemaProto schema, boolean ignoreErrorsAndDeleteDocuments);"
+ errorLine2=" ~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" GetSchemaResultProto getSchema();"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" GetSchemaTypeResultProto getSchemaType(String schemaType);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" GetSchemaTypeResultProto getSchemaType(String schemaType);"
+ errorLine2=" ~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" PutResultProto put(DocumentProto document);"
+ errorLine2=" ~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" PutResultProto put(DocumentProto document);"
+ errorLine2=" ~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" GetResultProto get(String namespace, String uri, GetResultSpecProto getResultSpec);"
+ errorLine2=" ~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" GetResultProto get(String namespace, String uri, GetResultSpecProto getResultSpec);"
+ errorLine2=" ~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" GetResultProto get(String namespace, String uri, GetResultSpecProto getResultSpec);"
+ errorLine2=" ~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" GetResultProto get(String namespace, String uri, GetResultSpecProto getResultSpec);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" ReportUsageResultProto reportUsage(UsageReport usageReport);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" ReportUsageResultProto reportUsage(UsageReport usageReport);"
+ errorLine2=" ~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" GetAllNamespacesResultProto getAllNamespaces();"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" SearchResultProto search("
+ errorLine2=" ~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" SearchSpecProto searchSpec, ScoringSpecProto scoringSpec, ResultSpecProto resultSpec);"
+ errorLine2=" ~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" SearchSpecProto searchSpec, ScoringSpecProto scoringSpec, ResultSpecProto resultSpec);"
+ errorLine2=" ~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" SearchSpecProto searchSpec, ScoringSpecProto scoringSpec, ResultSpecProto resultSpec);"
+ errorLine2=" ~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" SearchResultProto getNextPage(long nextPageToken);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" DeleteResultProto delete(String namespace, String uri);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" DeleteResultProto delete(String namespace, String uri);"
+ errorLine2=" ~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" DeleteResultProto delete(String namespace, String uri);"
+ errorLine2=" ~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" SuggestionResponse searchSuggestions(SuggestionSpecProto suggestionSpec);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" SuggestionResponse searchSuggestions(SuggestionSpecProto suggestionSpec);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" DeleteByNamespaceResultProto deleteByNamespace(String namespace);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" DeleteByNamespaceResultProto deleteByNamespace(String namespace);"
+ errorLine2=" ~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" DeleteBySchemaTypeResultProto deleteBySchemaType(String schemaType);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" DeleteBySchemaTypeResultProto deleteBySchemaType(String schemaType);"
+ errorLine2=" ~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" DeleteByQueryResultProto deleteByQuery(SearchSpecProto searchSpec);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" DeleteByQueryResultProto deleteByQuery(SearchSpecProto searchSpec);"
+ errorLine2=" ~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" DeleteByQueryResultProto deleteByQuery("
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" SearchSpecProto searchSpec, boolean returnDeletedDocumentInfo);"
+ errorLine2=" ~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" PersistToDiskResultProto persistToDisk(PersistType.Code persistTypeCode);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" PersistToDiskResultProto persistToDisk(PersistType.Code persistTypeCode);"
+ errorLine2=" ~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" OptimizeResultProto optimize();"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" GetOptimizeInfoResultProto getOptimizeInfo();"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" StorageInfoResultProto getStorageInfo();"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" DebugInfoResultProto getDebugInfo(DebugInfoVerbosity.Code verbosity);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" DebugInfoResultProto getDebugInfo(DebugInfoVerbosity.Code verbosity);"
+ errorLine2=" ~~~~~~~~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+ <issue
+ id="UnknownNullness"
+ message="Unknown nullability; explicitly declare as `@Nullable` or `@NonNull` to improve Kotlin interoperability; see https://developer.android.com/kotlin/interop#nullability_annotations"
+ errorLine1=" ResetResultProto reset();"
+ errorLine2=" ~~~~~~~~~~~~~~~~">
+ <location
+ file="java/src/com/google/android/icing/IcingSearchEngineInterface.java"/>
+ </issue>
+
+</issues>
diff --git a/nativeLib/build.gradle b/nativeLib/build.gradle
index ce7dca7..6b30451 100644
--- a/nativeLib/build.gradle
+++ b/nativeLib/build.gradle
@@ -14,43 +14,6 @@
* limitations under the License.
*/
-buildscript {
- boolean unbundleBuild = (new File('unbundled-build')).exists()
- repositories {
- maven { url '../../../prebuilts/androidx/external' }
- if (unbundleBuild) {
- jcenter()
- }
- }
- dependencies {
- classpath('gradle.plugin.com.google.protobuf:protobuf-gradle-plugin:0.8.8')
- }
-}
-
-apply plugin: 'AndroidXPlugin'
-apply plugin: 'com.android.library'
-
-android {
- defaultConfig {
- externalNativeBuild {
- cmake {
- cppFlags "-std=c++17"
- arguments "-DCMAKE_VERBOSE_MAKEFILE=ON"
- targets "icing"
- }
- }
- }
-
- sourceSets {
- main {
- manifest.srcFile '../AndroidManifest.xml'
- }
- }
-
- externalNativeBuild {
- cmake {
- version '3.10.2'
- path '../CMakeLists.txt'
- }
- }
-}
+// TODO(b/161205849): We've had to move libicing.so compilation into appsearch:appsearch to get
+// it included into the exported aar. Find a proper solution for bundling libicing.so into
+// appsearch-release.aar and move compilation of libicing.so back into the external/icing tree.
diff --git a/proto/icing/index/numeric/wildcard-property-storage.proto b/proto/icing/index/numeric/wildcard-property-storage.proto
new file mode 100644
index 0000000..7f02b77
--- /dev/null
+++ b/proto/icing/index/numeric/wildcard-property-storage.proto
@@ -0,0 +1,22 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package icing.lib;
+
+// Next tag: 2
+message WildcardPropertyStorage {
+ repeated string property_entries = 1;
+}
diff --git a/proto/icing/proto/debug.proto b/proto/icing/proto/debug.proto
new file mode 100644
index 0000000..90d1981
--- /dev/null
+++ b/proto/icing/proto/debug.proto
@@ -0,0 +1,137 @@
+// Copyright 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package icing.lib;
+
+import "icing/proto/schema.proto";
+import "icing/proto/status.proto";
+import "icing/proto/storage.proto";
+
+option java_package = "com.google.android.icing.proto";
+option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
+message LogSeverity {
+ enum Code {
+ VERBOSE = 0;
+ // Unable to use DEBUG at this time because it breaks YTM's iOS tests
+ // cs/?q=%22-DDEBUG%3D1%22%20f:%2FYoutubeMusic%20f:blueprint&ssfr=1
+ DBG = 1;
+ INFO = 2;
+ WARNING = 3;
+ ERROR = 4;
+ FATAL = 5;
+ }
+}
+
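Note: a common pattern on top of this severity enum, sketched here with a hypothetical buildDump() helper, is to gate expensive log-message construction behind shouldLog() — the Java API exercised in testLogging above:

  // Hedged sketch; buildDump() is a hypothetical, expensive-to-compute
  // helper, and Log is android.util.Log.
  if (IcingSearchEngine.shouldLog(LogSeverity.Code.VERBOSE, (short) 1)) {
    Log.v(IcingSearchEngine.getLoggingTag(), "index dump: " + buildDump());
  }
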
+message DebugInfoVerbosity {
+ enum Code {
+ // Simplest debug information.
+ BASIC = 0;
+ // More detailed debug information as indicated in the field documentation
+ // below.
+ DETAILED = 1;
+ }
+}
+
+// Next tag: 4
+message IndexDebugInfoProto {
+ // Storage information of the index.
+ optional IndexStorageInfoProto index_storage_info = 1;
+
+ // A formatted string containing the following information:
+ // lexicon_info: Information about the main lexicon
+ // last_added_document_id: Last added document id
+ // flash_index_storage_info: If verbosity = DETAILED, return information about
+ // the posting list storage
+ //
+ // No direct contents from user-provided documents will ever appear in this
+ // string.
+ optional string main_index_info = 2;
+
+ // A formatted string containing the following information:
+ // curr_size: Current number of hits
+ // hit_buffer_size: The maximum possible number of hits
+ // last_added_document_id: Last added document id
+ // searchable_end: The first position in the hit buffer that is not sorted
+ // yet, or curr_size if all hits are sorted
+ // index_crc: The most recent checksum of the lite index, by calling
+ // LiteIndex::ComputeChecksum()
+ // lexicon_info: Information about the lite lexicon
+ //
+ // No direct contents from user-provided documents will ever appear in this
+ // string.
+ optional string lite_index_info = 3;
+}
+
+// Next tag: 4
+message DocumentDebugInfoProto {
+ // Storage information of the document store.
+ optional DocumentStorageInfoProto document_storage_info = 1;
+
+ // The most recent checksum of the document store, by calling
+ // DocumentStore::ComputeChecksum().
+ optional uint32 crc = 2;
+
+ message CorpusInfo {
+ optional string namespace = 1;
+ optional string schema = 2;
+ optional uint32 total_documents = 3;
+ optional uint32 total_token = 4;
+ }
+
+ // If verbosity = DETAILED, return the total number of documents and tokens in
+ // each (namespace, schema type) pair.
+ // Note that deleted and expired documents are skipped in the output.
+ repeated CorpusInfo corpus_info = 3;
+}
+
+// Next tag: 3
+message SchemaDebugInfoProto {
+ // Copy of the SchemaProto if it has been set in the schema store.
+ // Modifying this does not affect the Schema that IcingSearchEngine holds.
+ optional SchemaProto schema = 1;
+
+ // The most recent checksum of the schema store, by calling
+ // SchemaStore::ComputeChecksum().
+ optional uint32 crc = 2;
+}
+
+// Next tag: 4
+message DebugInfoProto {
+ // Debug information of the index.
+ optional IndexDebugInfoProto index_info = 1;
+
+ // Debug information of the document store.
+ optional DocumentDebugInfoProto document_info = 2;
+
+ // Debug information of the schema store.
+ optional SchemaDebugInfoProto schema_info = 3;
+}
+
+// Next tag: 3
+message DebugInfoResultProto {
+ // Status code can be one of:
+ // OK
+ // FAILED_PRECONDITION if IcingSearchEngine has not been initialized yet
+ // INTERNAL on IO errors, crc compute error.
+ //
+ // See status.proto for more details.
+ optional StatusProto status = 1;
+
+ // Debug information for Icing.
+ optional DebugInfoProto debug_info = 2;
+}
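Note: as a consumption sketch (getter names follow standard protoc Java codegen), a client could surface the per-corpus counts that only appear at DETAILED verbosity:

  DebugInfoResultProto result =
      icingSearchEngine.getDebugInfo(DebugInfoVerbosity.Code.DETAILED);
  if (result.getStatus().getCode() == StatusProto.Code.OK) {
    for (DocumentDebugInfoProto.CorpusInfo corpus :
        result.getDebugInfo().getDocumentInfo().getCorpusInfoList()) {
      // corpus_info is only populated when verbosity = DETAILED.
      System.out.println(
          corpus.getNamespace() + "/" + corpus.getSchema() + ": "
              + corpus.getTotalDocuments() + " docs, "
              + corpus.getTotalToken() + " tokens");
    }
  }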
diff --git a/proto/icing/proto/document.proto b/proto/icing/proto/document.proto
index 1caf169..1a501e7 100644
--- a/proto/icing/proto/document.proto
+++ b/proto/icing/proto/document.proto
@@ -16,6 +16,7 @@ syntax = "proto2";
package icing.lib;
+import "icing/proto/logging.proto";
import "icing/proto/status.proto";
option java_package = "com.google.android.icing.proto";
@@ -23,7 +24,7 @@ option java_multiple_files = true;
option objc_class_prefix = "ICNG";
// Defines a unit of data understood by the IcingSearchEngine.
-// Next tag: 9
+// Next tag: 10
message DocumentProto {
// REQUIRED: Namespace that this Document resides in.
// Namespaces can affect read/write permissions.
@@ -49,11 +50,6 @@ message DocumentProto {
// already defined in the schema for this Document's schema_type.
repeated PropertyProto properties = 5;
- // OPTIONAL: Properties that will not be validated against the schema,
- // indexed, or be searchable. The properties will be stored in the Documents,
- // but never looked at by Icing.
- repeated PropertyProto custom_properties = 6;
-
// OPTIONAL: Score of the document which could be used during search result
// ranking. Negative values will lead to validation errors. The default is the
// lowest score 0.
@@ -68,6 +64,17 @@ message DocumentProto {
// TODO(cassiewang): Benchmark if fixed64 or some other proto type is better
// in terms of space/time efficiency. Both for ttl_ms and timestamp fields
optional int64 ttl_ms = 8 [default = 0];
+
+ // Defines document level data that's generated internally by Icing.
+ message InternalFields {
+ // The length of the document as a count of tokens (or terms) in all indexed
+ // text properties. This field is used in the computation of BM25F relevance
+ // score.
+ optional int32 length_in_tokens = 1;
+ }
+ optional InternalFields internal_fields = 9;
+
+ reserved 6;
}
// Holds a property field of the Document.
@@ -88,7 +95,7 @@ message PropertyProto {
}
// Result of a call to IcingSearchEngine.Put
-// Next tag: 2
+// Next tag: 3
message PutResultProto {
// Status code can be one of:
// OK
@@ -102,6 +109,12 @@ message PutResultProto {
// TODO(b/147699081): Fix error codes: +ABORTED
// go/icing-library-apis.
optional StatusProto status = 1;
+
+ // Stats of the function call. Inside PutDocumentStatsProto, the function
+ // call latency 'latency_ms' will always be populated. The other fields will
+ // be accurate only when the status above is OK. See logging.proto for
+ // details.
+ optional PutDocumentStatsProto put_document_stats = 2;
}
// Result of a call to IcingSearchEngine.Get
@@ -139,7 +152,7 @@ message GetAllNamespacesResultProto {
}
// Result of a call to IcingSearchEngine.Delete
-// Next tag: 2
+// Next tag: 3
message DeleteResultProto {
// Status code can be one of:
// OK
@@ -152,10 +165,13 @@ message DeleteResultProto {
// TODO(b/147699081): Fix error codes: +ABORTED.
// go/icing-library-apis.
optional StatusProto status = 1;
+
+ // Stats for delete execution performance.
+ optional DeleteStatsProto delete_stats = 2;
}
// Result of a call to IcingSearchEngine.DeleteByNamespace
-// Next tag: 2
+// Next tag: 3
message DeleteByNamespaceResultProto {
// Status code can be one of:
// OK
@@ -168,10 +184,13 @@ message DeleteByNamespaceResultProto {
// TODO(b/147699081): Fix error codes: +ABORTED.
// go/icing-library-apis.
optional StatusProto status = 1;
+
+ // Stats for delete execution performance.
+ optional DeleteStatsProto delete_stats = 2;
}
// Result of a call to IcingSearchEngine.DeleteBySchemaType
-// Next tag: 2
+// Next tag: 3
message DeleteBySchemaTypeResultProto {
// Status code can be one of:
// OK
@@ -184,4 +203,41 @@ message DeleteBySchemaTypeResultProto {
// TODO(b/147699081): Fix error codes: +ABORTED.
// go/icing-library-apis.
optional StatusProto status = 1;
+
+ // Stats for delete execution performance.
+ optional DeleteStatsProto delete_stats = 2;
+}
+
+// Result of a call to IcingSearchEngine.DeleteByQuery
+// Next tag: 5
+message DeleteByQueryResultProto {
+ // Status code can be one of:
+ // OK
+ // FAILED_PRECONDITION
+ // NOT_FOUND
+ // INTERNAL
+ //
+ // See status.proto for more details.
+ //
+ // TODO(b/147699081): Fix error codes: +ABORTED.
+ // go/icing-library-apis.
+ optional StatusProto status = 1;
+
+ // Stats for delete execution performance.
+ optional DeleteByQueryStatsProto delete_by_query_stats = 3;
+
+ // Used by DeleteByQueryResultProto to return information about deleted
+ // documents.
+ message DocumentGroupInfo {
+ optional string namespace = 1;
+ optional string schema = 2;
+ repeated string uris = 3;
+ }
+
+ // Additional return message that shows the uris of the deleted documents, if
+ // users set return_deleted_document_info to true.
+ // The result is grouped by the corresponding namespace and type.
+ repeated DocumentGroupInfo deleted_documents = 4;
+
+ reserved 2;
}
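Note: since deleted_documents is grouped by (namespace, schema), a caller reconciling its own state would iterate groups and then URIs. A minimal sketch using the standard generated Java accessors; evictFromCache() is a hypothetical caller-side helper:

  DeleteByQueryResultProto result =
      icingSearchEngine.deleteByQuery(searchSpec, /*returnDeletedDocumentInfo=*/ true);
  for (DeleteByQueryResultProto.DocumentGroupInfo group :
      result.getDeletedDocumentsList()) {
    for (String uri : group.getUrisList()) {
      // Each deleted document is identified by (namespace, uri);
      // group.getSchema() names its type.
      evictFromCache(group.getNamespace(), uri); // hypothetical helper
    }
  }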
diff --git a/proto/icing/proto/document_wrapper.proto b/proto/icing/proto/document_wrapper.proto
index e8eb992..929ee33 100644
--- a/proto/icing/proto/document_wrapper.proto
+++ b/proto/icing/proto/document_wrapper.proto
@@ -20,7 +20,6 @@ import "icing/proto/document.proto";
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
-
option objc_class_prefix = "ICNG";
// DocumentWrapper as a wrapper of the user-facing DocumentProto is meant to
@@ -30,6 +29,5 @@ option objc_class_prefix = "ICNG";
message DocumentWrapper {
optional DocumentProto document = 1;
- // Indicates if the document is marked as deleted
- optional bool deleted = 2;
+ reserved 2;
}
diff --git a/proto/icing/proto/initialize.proto b/proto/icing/proto/initialize.proto
index eac88e6..9dd9e88 100644
--- a/proto/icing/proto/initialize.proto
+++ b/proto/icing/proto/initialize.proto
@@ -16,33 +16,20 @@ syntax = "proto2";
package icing.lib;
+import "icing/proto/logging.proto";
import "icing/proto/status.proto";
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
-
option objc_class_prefix = "ICNG";
-// Next tag: 5
+// Next tag: 16
message IcingSearchEngineOptions {
// Directory to persist files for Icing. Required.
// If Icing was previously initialized with this directory, it will reload
// the index saved by the last instance.
optional string base_dir = 1;
- // The maximum number of tokens to be allowed per document. If a document
- // exceeds this number of tokens, then only the first max_tokens_per_doc
- // will be indexed.
- //
- // Clients may use this value to prevent the possibility of a select few
- // documents from exhausting limits in the index that are shared between all
- // documents (ie max allowed index size).
- //
- // Valid values: [1, INT_MAX], Current default is 1/5 of the default of
- // max_document_size.
- // Optional.
- optional int32 max_tokens_per_doc = 2 [default = 13107];
-
// The maximum allowable token length. All tokens in excess of this size
// will be truncated to max_token_length before being indexed.
//
@@ -70,10 +57,90 @@ message IcingSearchEngineOptions {
// Valid values: [1, INT_MAX]
// Optional.
optional int32 index_merge_size = 4 [default = 1048576]; // 1 MiB
+
+ // Whether to use the namespace id or the namespace name to build up the
+ // fingerprint for document_key_mapper_ and corpus_mapper_ in the document
+ // store.
+ // TODO(b/259969017) Flip the default value of this flag to true at the time
+ // when we switch to use persistent hash map for document_key_mapper_ so that
+ // we just need one reconstruction of the internal mappers.
+ optional bool document_store_namespace_id_fingerprint = 5;
+
+ // The threshold of the percentage of invalid documents to rebuild index
+ // during optimize, i.e. we rebuild index if and only if
+ // |invalid_documents| / |all_documents| >= optimize_rebuild_index_threshold
+ //
+ // Rebuilding the index could be faster than optimizing the index if we have
+ // removed most of the documents.
+ // Based on benchmarks, 85%~95% seems to be a good threshold for most cases.
+ //
+ // Defaults to 0 for better rollout of the new index optimize.
+ optional float optimize_rebuild_index_threshold = 6 [default = 0.0];
+
+ // Level of compression, NO_COMPRESSION = 0, BEST_SPEED = 1,
+ // BEST_COMPRESSION = 9
+ // Valid values: [0, 9]
+ // Optional.
+ optional int32 compression_level = 7 [default = 3];
+
+ // OPTIONAL: Whether to allow circular references between schema types for
+ // the schema definition.
+ //
+ // Even when set to true, circular references are still not allowed in the
+ // following cases:
+ // 1. All edges of a cycle have index_nested_properties=true
+ // 2. One of the types in the cycle has a joinable property, or depends on
+ // a type with a joinable property.
+ // This is because such a cycle would lead to an infinite number of
+ // indexed/joinable properties.
+ //
+ // The default value is false.
+ optional bool allow_circular_schema_definitions = 8;
+
+ // Whether to memory-map the maximum possible file size for FileBackedVector
+ // before growing the actual file size.
+ optional bool pre_mapping_fbv = 9;
+
+ // Whether to use the persistent hash map as the key mapper (if false, then
+ // fall back to the dynamic trie key mapper).
+ optional bool use_persistent_hash_map = 10;
+
+ // Integer index bucket split threshold.
+ optional int32 integer_index_bucket_split_threshold = 11 [default = 65536];
+
+ // Whether Icing should sort and merge its lite index HitBuffer unsorted tail
+ // at indexing time.
+ //
+ // If set to true, the HitBuffer will be sorted at indexing time after
+ // exceeding the sort threshold. If false, the HitBuffer will be sorted at
+ // querying time, before the first query after inserting new elements into the
+ // HitBuffer.
+ //
+ // The default value is false.
+ optional bool lite_index_sort_at_indexing = 12;
+
+ // Size (in bytes) at which Icing's lite index should sort and merge the
+ // HitBuffer's unsorted tail into the sorted head for sorting at indexing
+ // time. Size specified here is the maximum byte size to allow for the
+ // unsorted tail section.
+ //
+ // Setting a lower sort size reduces querying latency at the expense of
+ // indexing latency.
+ optional int32 lite_index_sort_size = 13 [default = 8192]; // 8 KiB
+
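+ // Whether to use the new qualified id join index.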
+ optional bool use_new_qualified_id_join_index = 14;
+
+ // Whether to build the metadata hits used for property existence check, which
+ // is required to support the hasProperty function in advanced query.
+ //
+ // TODO(b/309826655): Implement the feature flag derived files rebuild
+ // mechanism to handle index rebuild, instead of using index's magic value.
+ optional bool build_property_existence_metadata_hits = 15;
+
+ reserved 2;
}
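A minimal configuration sketch for the options above, assuming the standard proto2 C++ generated API and a generated header at icing/proto/initialize.pb.h (neither appears in this diff):

  #include <string>

  #include "icing/proto/initialize.pb.h"  // assumed generated header path

  icing::lib::IcingSearchEngineOptions MakeOptions(const std::string& dir) {
    icing::lib::IcingSearchEngineOptions options;
    options.set_base_dir(dir);                      // required
    options.set_compression_level(3);               // valid range [0, 9]
    options.set_lite_index_sort_at_indexing(true);  // sort HitBuffer at indexing
    options.set_lite_index_sort_size(8192);         // 8 KiB unsorted tail
    return options;
  }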
// Result of a call to IcingSearchEngine.Initialize
-// Next tag: 2
+// Next tag: 3
message InitializeResultProto {
// Status code can be one of:
// OK
@@ -88,6 +155,12 @@ message InitializeResultProto {
// go/icing-library-apis.
optional StatusProto status = 1;
+ // Stats of the function call. Inside InitializeStatsProto, the function call
+ // latency 'latency_ms' will always be populated. The other fields will be
+ // accurate only when the status above is OK or WARNING_DATA_LOSS. See
+ // logging.proto for details.
+ optional InitializeStatsProto initialize_stats = 2;
+
// TODO(b/147699081): Add a field to indicate lost_schema and lost_documents.
// go/icing-library-apis.
}
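Since latency_ms is always populated but the other stats are accurate only on OK or WARNING_DATA_LOSS, a caller might consume the result like this sketch (generated header path assumed):

  #include <iostream>

  #include "icing/proto/initialize.pb.h"  // assumed generated header path

  void LogInitResult(const icing::lib::InitializeResultProto& result) {
    std::cout << "init status code: " << result.status().code() << std::endl;
    // Always populated, per the comment on initialize_stats.
    std::cout << "init latency_ms: "
              << result.initialize_stats().latency_ms() << std::endl;
  }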
diff --git a/proto/icing/proto/internal/optimize.proto b/proto/icing/proto/internal/optimize.proto
new file mode 100644
index 0000000..4ed3d73
--- /dev/null
+++ b/proto/icing/proto/internal/optimize.proto
@@ -0,0 +1,29 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package icing.lib;
+
+option java_package = "com.google.android.icing.internal.proto";
+option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
+// A status that is saved internally in Icing to track information about how
+// often Optimize runs.
+// Next tag: 2
+message OptimizeStatusProto {
+ // The epoch time at which the last successful optimize ran.
+ optional int64 last_successful_optimize_run_time_ms = 1;
+}
diff --git a/proto/icing/proto/logging.proto b/proto/icing/proto/logging.proto
new file mode 100644
index 0000000..fcedeed
--- /dev/null
+++ b/proto/icing/proto/logging.proto
@@ -0,0 +1,364 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package icing.lib;
+
+import "icing/proto/scoring.proto";
+
+option java_package = "com.google.android.icing.proto";
+option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
+// Stats of the top-level function IcingSearchEngine::Initialize().
+// Next tag: 14
+message InitializeStatsProto {
+ // Overall time used for the function call.
+ optional int32 latency_ms = 1;
+
+ // The cause of IcingSearchEngine recovering from a previous bad state during
+ // initialization.
+ enum RecoveryCause {
+ // No recovery happened.
+ NONE = 0;
+
+ // Data loss in ground truth.
+ DATA_LOSS = 1;
+
+ // Data in index is inconsistent with ground truth.
+ INCONSISTENT_WITH_GROUND_TRUTH = 2;
+
+ // Changes were made to the schema, but possibly not fully applied to the
+ // document store and the index, requiring a recovery.
+ SCHEMA_CHANGES_OUT_OF_SYNC = 3;
+
+ // Random I/O errors.
+ IO_ERROR = 4;
+
+ // The document log is using legacy format.
+ LEGACY_DOCUMENT_LOG_FORMAT = 5;
+
+ // The current code version is different from existing data version.
+ VERSION_CHANGED = 6;
+
+ // Any dependencies have changed.
+ DEPENDENCIES_CHANGED = 7;
+ }
+
+ // Possible recovery causes for document store:
+ // - DATA_LOSS
+ // - SCHEMA_CHANGES_OUT_OF_SYNC
+ // - IO_ERROR
+ optional RecoveryCause document_store_recovery_cause = 2;
+
+ // Possible recovery causes for index:
+ // - INCONSISTENT_WITH_GROUND_TRUTH
+ // - SCHEMA_CHANGES_OUT_OF_SYNC
+ // - IO_ERROR
+ optional RecoveryCause index_restoration_cause = 3;
+
+ // Possible recovery causes for schema store:
+ // - IO_ERROR
+ optional RecoveryCause schema_store_recovery_cause = 4;
+
+ // Time used to recover the document store.
+ optional int32 document_store_recovery_latency_ms = 5;
+
+ // Time used to restore the index.
+ optional int32 index_restoration_latency_ms = 6;
+
+ // Time used to restore the schema store.
+ optional int32 schema_store_recovery_latency_ms = 7;
+
+ // Status regarding how much data is lost during the initialization.
+ enum DocumentStoreDataStatus {
+ // Document store is successfully initialized or fully recovered.
+ NO_DATA_LOSS = 0;
+
+ // Ground truth data is partially lost.
+ PARTIAL_LOSS = 1;
+
+ // Ground truth data is completely lost.
+ COMPLETE_LOSS = 2;
+ }
+ optional DocumentStoreDataStatus document_store_data_status = 8;
+
+ // Number of documents currently in the document store. These may
+ // include alive, deleted, and expired documents.
+ optional int32 num_documents = 9;
+
+ // Number of schema types currently in schema store.
+ optional int32 num_schema_types = 10;
+
+ // Number of consecutive initialization failures that immediately preceded
+ // this initialization.
+ optional int32 num_previous_init_failures = 11;
+
+ // Possible recovery causes for integer index:
+ // - INCONSISTENT_WITH_GROUND_TRUTH
+ // - SCHEMA_CHANGES_OUT_OF_SYNC
+ // - IO_ERROR
+ optional RecoveryCause integer_index_restoration_cause = 12;
+
+ // Possible recovery causes for qualified id join index:
+ // - INCONSISTENT_WITH_GROUND_TRUTH
+ // - SCHEMA_CHANGES_OUT_OF_SYNC
+ // - IO_ERROR
+ optional RecoveryCause qualified_id_join_index_restoration_cause = 13;
+}
+
+// Stats of the top-level function IcingSearchEngine::Put().
+// Next tag: 12
+message PutDocumentStatsProto {
+ // Overall time used for the function call.
+ optional int32 latency_ms = 1;
+
+ // Time used to store the document.
+ optional int32 document_store_latency_ms = 2;
+
+ // Time used to index the document.
+ optional int32 index_latency_ms = 3;
+
+ // Time used to merge the indices.
+ optional int32 index_merge_latency_ms = 4;
+
+ // Document size in bytes.
+ optional int32 document_size = 5;
+
+ message TokenizationStats {
+ // Number of tokens added to the index.
+ optional int32 num_tokens_indexed = 1;
+
+ // Number of metadata tokens added to the index, which can only be added by
+ // PropertyExistenceIndexingHandler currently.
+ optional int32 num_metadata_tokens_indexed = 3;
+
+ reserved 2;
+ }
+ optional TokenizationStats tokenization_stats = 6;
+
+ // Time used to index all indexable string terms and property existence
+ // metadata terms in the document. It does not include the time to merge
+ // indices or the time to sort the lite index.
+ optional int32 term_index_latency_ms = 7;
+
+ // Time used to index all indexable integers in the document.
+ optional int32 integer_index_latency_ms = 8;
+
+ // Time used to index all qualified id join strings in the document.
+ optional int32 qualified_id_join_index_latency_ms = 9;
+
+ // Time used to sort the LiteIndex's HitBuffer.
+ optional int32 lite_index_sort_latency_ms = 10;
+
+ // Time used to index all metadata terms in the document, which can only be
+ // added by PropertyExistenceIndexingHandler currently.
+ optional int32 metadata_term_index_latency_ms = 11;
+}
+
+// Stats of the top-level function IcingSearchEngine::Search() and
+// IcingSearchEngine::GetNextPage().
+// Next tag: 26
+message QueryStatsProto {
+ // TODO(b/305098009): deprecate. Use parent_search_stats instead.
+ // The UTF-8 length of the query string
+ optional int32 query_length = 16;
+
+ // TODO(b/305098009): deprecate. Use parent_search_stats instead.
+ // Number of terms in the query string.
+ optional int32 num_terms = 1;
+
+ // TODO(b/305098009): deprecate. Use parent_search_stats instead.
+ // Number of namespaces filtered.
+ optional int32 num_namespaces_filtered = 2;
+
+ // TODO(b/305098009): deprecate. Use parent_search_stats instead.
+ // Number of schema types filtered.
+ optional int32 num_schema_types_filtered = 3;
+
+ // TODO(b/305098009): deprecate. Use parent_search_stats instead.
+ // Strategy of scoring and ranking.
+ optional ScoringSpecProto.RankingStrategy.Code ranking_strategy = 4;
+
+ // Whether the function call is querying the first page. If it’s
+ // not, Icing will fetch the results from cache so that some steps
+ // may be skipped.
+ optional bool is_first_page = 5;
+
+ // The requested number of results in one page.
+ optional int32 requested_page_size = 6;
+
+ // The actual number of results returned in the current page.
+ optional int32 num_results_returned_current_page = 7;
+
+ // TODO(b/305098009): deprecate. Use parent_search_stats instead.
+ // Number of documents scored.
+ optional int32 num_documents_scored = 8;
+
+ // How many of the results in the page returned were snippeted.
+ optional int32 num_results_with_snippets = 15;
+
+ // Overall time used for the function call.
+ optional int32 latency_ms = 10;
+
+ // TODO(b/305098009): deprecate. Use parent_search_stats instead.
+ // Time used to parse the query, including 2 parts: tokenizing and
+ // transforming tokens into an iterator tree.
+ optional int32 parse_query_latency_ms = 11;
+
+ // TODO(b/305098009): deprecate. Use parent_search_stats instead.
+ // Time used to score the raw results.
+ optional int32 scoring_latency_ms = 12;
+
+ // Time used to rank the scored results.
+ optional int32 ranking_latency_ms = 13;
+
+ // Time used to fetch the document protos. Note that it includes the
+ // time to snippet if ‘has_snippets’ is true.
+ optional int32 document_retrieval_latency_ms = 14;
+
+ // Time passed while waiting to acquire the lock before query execution.
+ optional int32 lock_acquisition_latency_ms = 17;
+
+ // Timestamp taken just before sending proto across the JNI boundary from
+ // native to java side.
+ optional int64 native_to_java_start_timestamp_ms = 18;
+
+ // Time used to send protos across the JNI boundary from java to native side.
+ optional int32 java_to_native_jni_latency_ms = 19;
+
+ // Time used to send protos across the JNI boundary from native to java side.
+ optional int32 native_to_java_jni_latency_ms = 20;
+
+ // The native latency due to the join operation.
+ optional int32 join_latency_ms = 21;
+
+ // Number of joined results returned in the current page.
+ optional int32 num_joined_results_returned_current_page = 22;
+
+ // Whether the query contains a join or not.
+ optional bool is_join_query = 23;
+
+ // Stats of the search. Only valid for first page.
+ // Next tag: 13
+ message SearchStats {
+ // The UTF-8 length of the query string
+ optional int32 query_length = 1;
+
+ // Number of terms in the query string.
+ optional int32 num_terms = 2;
+
+ // Number of namespaces filtered.
+ optional int32 num_namespaces_filtered = 3;
+
+ // Number of schema types filtered.
+ optional int32 num_schema_types_filtered = 4;
+
+ // Strategy of scoring and ranking.
+ optional ScoringSpecProto.RankingStrategy.Code ranking_strategy = 5;
+
+ // Number of documents scored.
+ optional int32 num_documents_scored = 6;
+
+ // Time used to parse the query, including 2 parts: tokenizing and
+ // transforming tokens into an iterator tree.
+ optional int32 parse_query_latency_ms = 7;
+
+ // Time used to score the raw results.
+ optional int32 scoring_latency_ms = 8;
+
+ // Whether the query contains a numeric search or not.
+ optional bool is_numeric_query = 9;
+
+ // Number of hits fetched by lite index before applying any filters.
+ optional int32 num_fetched_hits_lite_index = 10;
+
+ // Number of hits fetched by main index before applying any filters.
+ optional int32 num_fetched_hits_main_index = 11;
+
+ // Number of hits fetched by integer index before applying any filters.
+ optional int32 num_fetched_hits_integer_index = 12;
+ }
+
+ // Search stats for parent. Only valid for first page.
+ optional SearchStats parent_search_stats = 24;
+
+ // Search stats for child.
+ optional SearchStats child_search_stats = 25;
+
+ reserved 9;
+}
+
+// Stats of the top-level functions IcingSearchEngine::Delete,
+// IcingSearchEngine::DeleteByNamespace, IcingSearchEngine::DeleteBySchemaType.
+// Next tag: 4
+message DeleteStatsProto {
+ // Overall time used for the function call.
+ optional int32 latency_ms = 1;
+
+ message DeleteType {
+ enum Code {
+ // Default. Should never be used.
+ UNKNOWN = 0;
+
+ // Delete one document.
+ SINGLE = 1;
+
+ // Delete by query. This value is deprecated.
+ // IcingSearchEngine::DeleteByQuery will return a DeleteByQueryStatsProto
+ // rather than a DeleteStatsProto.
+ DEPRECATED_QUERY = 2 [deprecated = true];
+
+ // Delete by namespace.
+ NAMESPACE = 3;
+
+ // Delete by schema type.
+ SCHEMA_TYPE = 4;
+ }
+ }
+ optional DeleteType.Code delete_type = 2;
+
+ // Number of documents deleted by this call.
+ optional int32 num_documents_deleted = 3;
+}
+
+// Stats of the top-level function IcingSearchEngine::DeleteByQuery.
+// Next tag: 9
+message DeleteByQueryStatsProto {
+ // Overall time used for the function call.
+ optional int32 latency_ms = 1;
+
+ // Number of documents deleted by this call.
+ optional int32 num_documents_deleted = 2;
+
+ // The UTF-8 length of the query string
+ optional int32 query_length = 3;
+
+ // Number of terms in the query string.
+ optional int32 num_terms = 4;
+
+ // Number of namespaces filtered.
+ optional int32 num_namespaces_filtered = 5;
+
+ // Number of schema types filtered.
+ optional int32 num_schema_types_filtered = 6;
+
+ // Time used to parse the query, including 2 parts: tokenizing and
+ // transforming tokens into an iterator tree.
+ optional int32 parse_query_latency_ms = 7;
+
+ // Time used to delete each document.
+ optional int32 document_removal_latency_ms = 8;
+}
diff --git a/proto/icing/proto/optimize.proto b/proto/icing/proto/optimize.proto
index 1baa64c..675f980 100644
--- a/proto/icing/proto/optimize.proto
+++ b/proto/icing/proto/optimize.proto
@@ -23,7 +23,7 @@ option java_multiple_files = true;
option objc_class_prefix = "ICNG";
// Result of a call to IcingSearchEngine.Optimize
-// Next tag: 2
+// Next tag: 3
message OptimizeResultProto {
// Status code can be one of:
// OK
@@ -35,12 +35,13 @@ message OptimizeResultProto {
// See status.proto for more details.
optional StatusProto status = 1;
+ optional OptimizeStatsProto optimize_stats = 2;
// TODO(b/147699081): Add a field to indicate lost_schema and lost_documents.
// go/icing-library-apis.
}
// Result of a call to IcingSearchEngine.GetOptimizeInfo
-// Next tag: 4
+// Next tag: 5
message GetOptimizeInfoResultProto {
// Status code can be one of:
// OK
@@ -57,4 +58,54 @@ message GetOptimizeInfoResultProto {
// Estimated bytes that could be recovered. The exact size per document isn't
// tracked, so this is based off an average document size.
optional int64 estimated_optimizable_bytes = 3;
+
+ // The amount of time since the last optimize ran.
+ optional int64 time_since_last_optimize_ms = 4;
+}
+
+// Next tag: 13
+message OptimizeStatsProto {
+ // Overall time used for the function call.
+ optional int32 latency_ms = 1;
+
+ // Time used to optimize the document store.
+ optional int32 document_store_optimize_latency_ms = 2;
+
+ // Time used to restore the index.
+ optional int32 index_restoration_latency_ms = 3;
+
+ // Number of documents before the optimization.
+ optional int32 num_original_documents = 4;
+
+ // Number of documents deleted.
+ optional int32 num_deleted_documents = 5;
+
+ // Number of documents expired.
+ optional int32 num_expired_documents = 6;
+
+ // Size of storage before the optimize.
+ optional int64 storage_size_before = 7;
+
+ // Size of storage after the optimize.
+ optional int64 storage_size_after = 8;
+
+ // The amount of time since the last optimize ran.
+ optional int64 time_since_last_optimize_ms = 9;
+
+ enum IndexRestorationMode {
+ // The index has been translated in place to match the optimized document
+ // store.
+ INDEX_TRANSLATION = 0;
+ // The index has been rebuilt from scratch during optimization. This could
+ // happen when we received a DATA_LOSS error from OptimizeDocumentStore, when
+ // Index::Optimize failed, or when rebuilding was estimated to be faster.
+ FULL_INDEX_REBUILD = 1;
+ }
+ optional IndexRestorationMode index_restoration_mode = 10;
+
+ // Number of namespaces before the optimization.
+ optional int32 num_original_namespaces = 11;
+
+ // Number of namespaces deleted.
+ optional int32 num_deleted_namespaces = 12;
}
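As a small worked example of the storage fields above, the space reclaimed by an optimize run is simply storage_size_before - storage_size_after, clamped at zero (a sketch; generated header path assumed):

  #include <cstdint>

  #include "icing/proto/optimize.pb.h"  // assumed generated header path

  // Returns the bytes reclaimed by an optimize run, or 0 if storage grew.
  int64_t BytesReclaimed(const icing::lib::OptimizeStatsProto& stats) {
    int64_t before = stats.storage_size_before();
    int64_t after = stats.storage_size_after();
    return before > after ? before - after : 0;
  }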
diff --git a/proto/icing/proto/persist.proto b/proto/icing/proto/persist.proto
index 77cf987..8d6b372 100644
--- a/proto/icing/proto/persist.proto
+++ b/proto/icing/proto/persist.proto
@@ -22,6 +22,28 @@ option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
option objc_class_prefix = "ICNG";
+// The type of persistence guarantee that PersistToDisk should provide.
+// Next tag: 3
+message PersistType {
+ enum Code {
+ // Default. Should never be used.
+ UNKNOWN = 0;
+
+ // Only persist the ground truth. A successful PersistToDisk(LITE) should
+ // ensure that no data is lost the next time Icing initializes. This
+ // should be called after each batch of mutations.
+ LITE = 1;
+
+ // Persists all data in internal Icing components. A successful
+ // PersistToDisk(FULL) should not only ensure no data loss like
+ // PersistToDisk(LITE), but also prevent the need to recover internal data
+ // structures the next time Icing initializes. This should be called at
+ // some point before the app terminates.
+ FULL = 2;
+ }
+ optional Code code = 1;
+}
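The LITE/FULL contract above suggests call sites like the following sketch. It assumes IcingSearchEngine exposes PersistToDisk(PersistType::Code) and an icing/icing-search-engine.h header, neither of which is shown in this diff:

  #include "icing/icing-search-engine.h"  // assumed header path
  #include "icing/proto/persist.pb.h"

  void FlushAfterMutationBatch(icing::lib::IcingSearchEngine& icing) {
    // LITE after each batch of mutations: no data loss on the next init.
    icing.PersistToDisk(icing::lib::PersistType::LITE);
  }

  void FlushBeforeShutdown(icing::lib::IcingSearchEngine& icing) {
    // FULL before the app terminates: additionally avoids recovering
    // internal data structures on the next init.
    icing.PersistToDisk(icing::lib::PersistType::FULL);
  }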
+
// Result of a call to IcingSearchEngine.Persist
// Next tag: 2
message PersistToDiskResultProto {
diff --git a/proto/icing/proto/schema.proto b/proto/icing/proto/schema.proto
index 3a7ee5d..c716dba 100644
--- a/proto/icing/proto/schema.proto
+++ b/proto/icing/proto/schema.proto
@@ -34,7 +34,7 @@ option objc_class_prefix = "ICNG";
// TODO(cassiewang) Define a sample proto file that can be used by tests and for
// documentation.
//
-// Next tag: 5
+// Next tag: 7
message SchemaTypeConfigProto {
// REQUIRED: Named type that uniquely identifies the structured, logical
// schema being defined.
@@ -51,19 +51,32 @@ message SchemaTypeConfigProto {
// easier.
repeated PropertyConfigProto properties = 4;
+ // Version is an arbitrary number that the client may use to keep track of
+ // different incarnations of the schema. Icing library imposes no requirements
+ // on this field and will not validate it in any way. If a client calls
+ // SetSchema with a schema that contains one or more new version numbers, then
+ // those version numbers will be updated so long as the SetSchema call
+ // succeeds. Clients are free to leave the version number unset, in which case
+ // it will default to 0.
+ optional int32 version = 5;
+
+ // An experimental field that makes the type a subtype of parent_types, which
+ // enables parent_types to be interpreted as its subtypes in the context of
+ // the Search APIs, including schema type filters and projections specified in
+ // TypePropertyMask.
+ repeated string parent_types = 6;
+
reserved 2, 3;
}
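A sketch of how a client might set the new version and parent_types fields (standard proto2 C++ generated API assumed; "Email" and "Message" are hypothetical type names):

  #include "icing/proto/schema.pb.h"  // assumed generated header path

  icing::lib::SchemaTypeConfigProto MakeEmailType() {
    icing::lib::SchemaTypeConfigProto type;
    type.set_schema_type("Email");
    // Arbitrary client-managed number; Icing stores it but never validates it.
    type.set_version(2);
    // Experimental: queries and projections against "Message" may then be
    // interpreted to cover "Email" as well.
    type.add_parent_types("Message");
    return type;
  }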
-// Describes how a single property should be indexed.
+// Describes how a string property should be indexed.
// Next tag: 3
-message IndexingConfig {
+message StringIndexingConfig {
// Indicates how the content of this property should be matched in the index.
//
// TermMatchType.Code=UNKNOWN
// Content in this property will not be tokenized or indexed. Useful if the
- // data type is not made up of terms (e.g. DOCUMENT or BYTES type). All the
- // properties inside the nested property won't be indexed regardless of the
- // value of the term_match_type field for the nested properties.
+ // data type is not indexable. See schema-util for details.
//
// TermMatchType.Code=EXACT_ONLY
// Content in this property should only be returned for queries matching the
@@ -79,20 +92,123 @@ message IndexingConfig {
message TokenizerType {
enum Code {
// It is only valid for tokenizer_type to be 'NONE' if the data type is
- // DOCUMENT.
+ // not indexed.
NONE = 0;
// Tokenization for plain text.
PLAIN = 1;
+
+ // Tokenizes text verbatim. This means no normalization or segmentation
+ // is applied to string values that are tokenized using this type.
+ // Therefore, the output token is equivalent to the raw string text. For
+ // example, "Hello, world!" would be tokenized as "Hello, world!",
+ // preserving punctuation and capitalization and not splitting the string
+ // at the space.
+ VERBATIM = 2;
+
+ // Tokenizes text as an email address. This means it will tokenize a
+ // string into multiple emails, and further tokenize those into parts of
+ // an email address. These parts include the local address, host
+ // components, local components, as well as the name and comments. For
+ // example, "User (comment) <user@domain.com>" would be tokenized into a
+ // "User" name token, a "comment" comment token, a "user" local address, a
+ // "user" local component token, a "domain" host component token, a "com"
+ // host component token, a "user@domain.com" address token, and the entire
+ // original string as an rfc822 token.
+ // See more here: https://datatracker.ietf.org/doc/html/rfc822
+ RFC822 = 3;
+
+ // Tokenizes text as a URL. This tokenizes a url string into a
+ // token for each component in the url, as well as any significant
+ // url suffixes. For example,
+ // https://www.google.com/path/subpath?query#ref would be tokenized into a
+ // scheme token "https"; 3 host tokens "www", "google", "com"; 2 path
+ // tokens "path", "subpath"; a query token "query"; a reference token
+ // "ref"; and 3 suffix tokens
+ // "https://www.google.com/path/subpath?query#ref",
+ // "www.google.com/path/subpath?query#ref",
+ // "google.com/path/subpath?query#ref".
+ // Currently only supports tokenization of one url string at a time,
+ // i.e. the input string cannot have spaces in the middle, but can have
+ // leading or trailing spaces.
+ URL = 4;
}
}
optional TokenizerType.Code tokenizer_type = 2;
}
+// Describes how a document property should be indexed.
+// Next tag: 3
+message DocumentIndexingConfig {
+ // OPTIONAL: Whether nested properties within the document property should be
+ // indexed. If true, then all nested properties will be indexed according to
+ // the property's own indexing configurations. If false, nested documents'
+ // properties will not be indexed even if they have an indexing configuration.
+ //
+ // The default value is false.
+ optional bool index_nested_properties = 1;
+
+ // List of nested properties within the document to index. Only the
+ // provided list of properties will be indexed according to the property's
+ // indexing configurations.
+ //
+ // index_nested_properties must be false in order to use this feature.
+ repeated string indexable_nested_properties_list = 2;
+}
+
+// Describes how an int64 property should be indexed.
+// Next tag: 3
+message IntegerIndexingConfig {
+ // OPTIONAL: Indicates how the int64 contents of this property should be
+ // matched.
+ //
+ // The default value is UNKNOWN.
+ message NumericMatchType {
+ enum Code {
+ // Contents in this property will not be indexed. Useful if the int64
+ // property type is not indexable.
+ UNKNOWN = 0;
+
+ // Contents in this property should only be returned for queries matching
+ // the range.
+ RANGE = 1;
+ }
+ }
+ optional NumericMatchType.Code numeric_match_type = 1;
+}
+
+// Describes how a property can be used to join this document with another
+// document. See JoinSpecProto (in search.proto) for more details.
+// Next tag: 3
+message JoinableConfig {
+ // OPTIONAL: Indicates what joinable type the content value of this property
+ // is.
+ //
+ // The default value is NONE.
+ message ValueType {
+ enum Code {
+ // Value in this property is not joinable.
+ NONE = 0;
+
+ // Value in this property is a joinable (string) qualified id, which is
+ // composed of namespace and uri.
+ // See JoinSpecProto (in search.proto) and DocumentProto (in
+ // document.proto) for more details about qualified id, namespace and uri.
+ QUALIFIED_ID = 1;
+ }
+ }
+ optional ValueType.Code value_type = 1;
+
+ // If the parent document that a child document is joined to is deleted,
+ // delete the child document as well. This only applies to children joined
+ // through QUALIFIED_ID; other (future) joinable value types won't use it.
+ optional bool propagate_delete = 2 [default = false];
+}
+
// Describes the schema of a single property of Documents that belong to a
// specific SchemaTypeConfigProto. These can be considered as a rich, structured
// type for each property of Documents accepted by IcingSearchEngine.
-// Next tag: 6
+// Next tag: 9
message PropertyConfigProto {
// REQUIRED: Name that uniquely identifies a property within an Document of
// a specific SchemaTypeConfigProto.
@@ -106,9 +222,10 @@ message PropertyConfigProto {
// REQUIRED: Physical data-types of the contents of the property.
message DataType {
enum Code {
- // This should never purposely be set. This is used for backwards
+ // This value should never purposely be used. This is used for backwards
// compatibility reasons.
UNKNOWN = 0;
+
STRING = 1;
INT64 = 2;
DOUBLE = 3;
@@ -162,9 +279,26 @@ message PropertyConfigProto {
}
optional Cardinality.Code cardinality = 4;
- // OPTIONAL: Properties that do not set the indexing config will not be
- // indexed.
- optional IndexingConfig indexing_config = 5;
+ // OPTIONAL: Describes how string properties should be indexed. String
+ // properties that do not set the indexing config will not be indexed.
+ optional StringIndexingConfig string_indexing_config = 5;
+
+ // OPTIONAL: Describes how document properties should be indexed.
+ optional DocumentIndexingConfig document_indexing_config = 6;
+
+ // OPTIONAL: Describes how int64 properties should be indexed. Int64
+ // properties that do not set the indexing config will not be indexed.
+ optional IntegerIndexingConfig integer_indexing_config = 7;
+
+ // OPTIONAL: Describes how string properties can be used as a document join
+ // matcher.
+ //
+ // Note: currently we only support STRING single joining, so if a property is
+ // set as joinable (i.e. joinable_config.value_type is not NONE), then:
+ // - DataType should be STRING. Otherwise joinable_config will be ignored.
+ // - The property itself and any upper-level (nested doc) property should
+ // contain at most one element (i.e. Cardinality is OPTIONAL or REQUIRED).
+ optional JoinableConfig joinable_config = 8;
}
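Pulling the pieces above together, a single property definition might look like the following sketch (standard proto2 C++ generated API assumed; the term_match_type field of StringIndexingConfig sits outside the hunks shown here and is assumed from context):

  #include "icing/proto/schema.pb.h"  // assumed generated header path
  #include "icing/proto/term.pb.h"

  icing::lib::PropertyConfigProto MakeSubjectProperty() {
    icing::lib::PropertyConfigProto prop;
    prop.set_property_name("subject");
    prop.set_data_type(icing::lib::PropertyConfigProto::DataType::STRING);
    prop.set_cardinality(icing::lib::PropertyConfigProto::Cardinality::OPTIONAL);
    auto* indexing = prop.mutable_string_indexing_config();
    indexing->set_term_match_type(icing::lib::TermMatchType::PREFIX);  // assumed
    indexing->set_tokenizer_type(
        icing::lib::StringIndexingConfig::TokenizerType::PLAIN);
    return prop;
  }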
// List of all supported types constitutes the schema used by Icing.
@@ -174,7 +308,7 @@ message SchemaProto {
}
// Result of a call to IcingSearchEngine.SetSchema
-// Next tag: 4
+// Next tag: 9
message SetSchemaResultProto {
// Status code can be one of:
// OK
@@ -198,6 +332,29 @@ message SetSchemaResultProto {
// documents that fail validation against the new schema types would also be
// deleted.
repeated string incompatible_schema_types = 3;
+
+ // Schema types that did not exist in the previous schema and were added by
+ // the new schema.
+ repeated string new_schema_types = 4;
+
+ // Schema types that were changed in a way that was backwards compatible and
+ // didn't invalidate the index.
+ repeated string fully_compatible_changed_schema_types = 5;
+
+ // Schema types that were changed in a way that was backwards compatible, but
+ // invalidated the index.
+ repeated string index_incompatible_changed_schema_types = 6;
+
+ // Overall time used for the function call.
+ optional int32 latency_ms = 7;
+
+ // Schema types that were changed in a way that was backwards compatible, but
+ // invalidated the joinable cache.
+ //
+ // For example, a property was set as non-joinable in the old schema
+ // definition, but changed to joinable in the new definition. In this case,
+ // this property will be considered join incompatible when setting the new
+ // schema.
+ repeated string join_incompatible_changed_schema_types = 8;
}
// Result of a call to IcingSearchEngine.GetSchema
diff --git a/proto/icing/proto/scoring.proto b/proto/icing/proto/scoring.proto
index 667ff4f..a8040a1 100644
--- a/proto/icing/proto/scoring.proto
+++ b/proto/icing/proto/scoring.proto
@@ -16,14 +16,16 @@ syntax = "proto2";
package icing.lib;
+import "icing/proto/term.proto";
+
option java_package = "com.google.android.icing.proto";
option java_multiple_files = true;
-
option objc_class_prefix = "ICNG";
// Encapsulates the configurations on how Icing should score and rank the search
// results.
-// Next tag: 3
+// TODO(b/170347684): Change all timestamps to seconds.
+// Next tag: 12
message ScoringSpecProto {
// OPTIONAL: Indicates how the search results will be ranked.
message RankingStrategy {
@@ -37,6 +39,41 @@ message ScoringSpecProto {
// Ranked by document creation timestamps.
CREATION_TIMESTAMP = 2;
+
+ // The following ranking strategies are based on usage reporting. Please
+ // see usage.proto for more information. If one of the usage ranking
+ // strategies is used but none of the result documents has reported usage,
+ // the documents will be returned in the default reverse insertion order.
+
+ // Ranked by count of reports with usage type 1.
+ USAGE_TYPE1_COUNT = 3;
+
+ // Ranked by count of reports with usage type 2.
+ USAGE_TYPE2_COUNT = 4;
+
+ // Ranked by count of reports with usage type 3.
+ USAGE_TYPE3_COUNT = 5;
+
+ // Ranked by last used timestamp with usage type 1. The timestamps are
+ // compared in seconds.
+ USAGE_TYPE1_LAST_USED_TIMESTAMP = 6;
+
+ // Ranked by last used timestamp with usage type 2. The timestamps are
+ // compared in seconds.
+ USAGE_TYPE2_LAST_USED_TIMESTAMP = 7;
+
+ // Ranked by last used timestamp with usage type 3. The timestamps are
+ // compared in seconds.
+ USAGE_TYPE3_LAST_USED_TIMESTAMP = 8;
+
+ // Ranked by relevance score, currently computed as BM25F score.
+ RELEVANCE_SCORE = 9;
+
+ // Ranked by the aggregated score of the joined documents.
+ JOIN_AGGREGATE_SCORE = 10;
+
+ // Ranked by the advanced scoring expression provided.
+ ADVANCED_SCORING_EXPRESSION = 11;
}
}
optional RankingStrategy.Code rank_by = 1;
@@ -54,4 +91,78 @@ message ScoringSpecProto {
}
}
optional Order.Code order_by = 2;
+
+ // OPTIONAL: Specifies property weights for RELEVANCE_SCORE scoring strategy.
+ // Property weights are used for promoting or demoting query term matches in a
+ // document property. When property weights are provided, the term frequency
+ // is multiplied by the normalized property weight when computing the
+ // normalized term frequency component of BM25F. To prefer query term matches
+ // in the "subject" property over the "body" property of "Email" documents,
+ // set a higher property weight value for "subject" than "body". By default,
+ // all properties that are not specified are given a raw, pre-normalized
+ // weight of 1.0 when scoring.
+ repeated TypePropertyWeights type_property_weights = 3;
+
+ // OPTIONAL: Specifies the scoring expression for ADVANCED_SCORING_EXPRESSION
+ // RankingStrategy.
+ optional string advanced_scoring_expression = 4;
+}
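Following the "subject" over "body" example in the comment above, a RELEVANCE_SCORE spec with property weights could be built like this sketch (standard proto2 C++ generated API assumed; "Email" is a hypothetical type):

  #include "icing/proto/scoring.pb.h"  // assumed generated header path

  icing::lib::ScoringSpecProto MakeEmailScoringSpec() {
    icing::lib::ScoringSpecProto spec;
    spec.set_rank_by(
        icing::lib::ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE);
    auto* weights = spec.add_type_property_weights();
    weights->set_schema_type("Email");
    auto* subject = weights->add_property_weights();
    subject->set_path("subject");
    subject->set_weight(2.0);  // unspecified properties keep the default 1.0
    return spec;
  }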
+
+// Next tag: 3
+message SuggestionScoringSpecProto {
+ message SuggestionRankingStrategy {
+ enum Code {
+ // No ranking strategy specified; terms may be returned in an arbitrary
+ // order.
+ NONE = 0;
+
+ // Ranked by the term's hit count.
+ DOCUMENT_COUNT = 1;
+
+ // Ranked by the term's frequency.
+ TERM_FREQUENCY = 2;
+ }
+ }
+
+ // TermMatchType.Code=UNKNOWN
+ // Should never purposely be set and may lead to undefined behavior. This is
+ // used for backwards compatibility reasons.
+ //
+ // TermMatchType.Code=EXACT_ONLY
+ // Only exact hits will be counted to score a suggestion term.
+ //
+ // TermMatchType.Code=PREFIX
+ // Both exact hits and prefix hits will be counted to score a suggestion
+ // term.
+ optional TermMatchType.Code scoring_match_type = 1;
+
+ // Rank the output suggested results by the given SuggestionRankingStrategy.
+ optional SuggestionRankingStrategy.Code rank_by = 2;
+}
+
+// Next tag: 3
+message TypePropertyWeights {
+ // Schema type to apply property weights to.
+ optional string schema_type = 1;
+
+ // Property weights to apply to the schema type.
+ repeated PropertyWeight property_weights = 2;
+}
+
+// Next tag: 3
+message PropertyWeight {
+ // Property path to assign property weight to. Property paths must be composed
+ // only of property names and property separators (the '.' character).
+ // For example, if an "Email" schema type has string property "subject" and
+ // document property "sender", which has string property "name", the property
+ // path for the email's subject would just be "subject" and the property path
+ // for the sender's name would be "sender.name". If an invalid path is
+ // specified, the property weight is discarded.
+ optional string path = 1;
+
+ // Property weight, valid values are positive and zero. Setting a zero
+ // property weight will remove scoring contribution for a query term match in
+ // the property. Negative weights are invalid and will result in an error.
+ // By default, a property is given a raw, pre-normalized weight of 1.0.
+ optional double weight = 2;
}
diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto
index 8ea5036..7f4fb3e 100644
--- a/proto/icing/proto/search.proto
+++ b/proto/icing/proto/search.proto
@@ -17,6 +17,8 @@ syntax = "proto2";
package icing.lib;
import "icing/proto/document.proto";
+import "icing/proto/logging.proto";
+import "icing/proto/scoring.proto";
import "icing/proto/status.proto";
import "icing/proto/term.proto";
@@ -25,7 +27,7 @@ option java_multiple_files = true;
option objc_class_prefix = "ICNG";
// Client-supplied specifications on what documents to retrieve.
-// Next tag: 5
+// Next tag: 11
message SearchSpecProto {
// REQUIRED: The "raw" query string that users may type. For example, "cat"
// will search for documents with the term cat in them.
@@ -60,11 +62,61 @@ message SearchSpecProto {
// applies to the entire 'query'. To issue different queries for different
// schema types, separate Search()'s will need to be made.
repeated string schema_type_filters = 4;
+
+ // Timestamp taken just before sending proto across the JNI boundary from java
+ // to native side.
+ optional int64 java_to_native_start_timestamp_ms = 5;
+
+ message SearchType {
+ enum Code {
+ UNDEFINED = 0;
+ ICING_RAW_QUERY = 1;
+ EXPERIMENTAL_ICING_ADVANCED_QUERY = 2;
+ }
+ }
+ // This field determines which type of query parsing Icing will use to fulfill
+ // the query.
+ // ICING_RAW_QUERY is the current query language as released, which supports
+ // basic ands, ors and nots as well as grouping and property restricts.
+ // EXPERIMENTAL_ICING_ADVANCED_QUERY is a superset of ICING_RAW_QUERY that
+ // will also support the use of functions defined by Icing Lib.
+ // This field is only temporary. When fully complete, all queries will be
+ // parsed by EXPERIMENTAL_ICING_ADVANCED_QUERY. This field only exists to
+ // enable testing.
+ // TODO(b/208654892) Remove this field once EXPERIMENTAL_ICING_ADVANCED_QUERY
+ // is fully supported.
+ optional SearchType.Code search_type = 6
+ [default = EXPERIMENTAL_ICING_ADVANCED_QUERY];
+
+ // OPTIONAL: If this field is present, join documents based on a nested
+ // SearchSpec.
+ optional JoinSpecProto join_spec = 7;
+
+ // Features enabled in this search spec.
+ repeated string enabled_features = 8;
+
+ // OPTIONAL: Whether to use the read-only implementation of
+ // IcingSearchEngine::Search.
+ // The read-only version enables multiple queries to be performed concurrently
+ // as it only acquires the read lock at IcingSearchEngine's level.
+ // Finer-grained locks are implemented around code paths that write changes to
+ // Icing during Search.
+ optional bool use_read_only_search = 9 [default = true];
+
+ // TODO(b/294266822): Handle multiple property filter lists for same schema
+ // type.
+ // How to specify a subset of properties to be searched. If no type property
+ // filter has been specified for a schema type (no TypePropertyMask for the
+ // given schema type), then *all* properties of that schema type will be
+ // searched. If an empty property filter is specified for a given schema type
+ // (TypePropertyMask for the given schema type has empty paths field), no
+ // properties of that schema type will be searched.
+ repeated TypePropertyMask type_property_filters = 10;
}
// Client-supplied specifications on what to include/how to format the search
// results.
-// Next tag: 4
+// Next tag: 10
message ResultSpecProto {
// The results will be returned in pages, and num_per_page specifies the
// number of documents in one page.
@@ -84,46 +136,161 @@ message ResultSpecProto {
// have snippet information provided. If set to 0, snippeting is disabled.
optional int32 num_matches_per_property = 2;
- // How large of a window to provide. Windows start at max_window_bytes / 2
- // bytes before the middle of the matching token and end at max_window_bytes
- // / 2 bytes after the middle of the matching token. Windowing respects
- // token boundaries.
- // Therefore, the returned window may be smaller than requested. Setting
- // max_window_bytes to 0 will disable windowing information. If matches
- // enabled is also set to false, then snippeting is disabled.
- // Ex. max_window_bytes = 16. "foo bar baz bat rat" with a query of "baz"
+ // How large of a window to provide. Windows start at
+ // max_window_utf32_length / 2 bytes before the middle of the matching token
+ // and end at max_window_utf32_length / 2 bytes after the middle of the
+ // matching token. Windowing respects token boundaries. Therefore, the
+ // returned window may be smaller than requested. Setting
+ // max_window_utf32_length to 0 will disable windowing information. If
+ // matches enabled is also set to false, then snippeting is disabled. Ex.
+ // max_window_utf32_length = 16. "foo bar baz bat rat" with a query of "baz"
// will return a window of "bar baz bat" which is only 11 bytes long.
- optional int32 max_window_bytes = 3;
+ optional int32 max_window_utf32_length = 3;
}
optional SnippetSpecProto snippet_spec = 3;
+
+ // How to specify a subset of properties to retrieve. If no type property mask
+ // has been specified for a schema type, then *all* properties of that schema
+ // type will be retrieved.
+ repeated TypePropertyMask type_property_masks = 4;
+
+ // Groupings of namespaces and schema types whose total returned results
+ // should be limited together.
+ // Next tag: 3
+ message ResultGrouping {
+ // Grouping of namespace and schema type.
+ // Next tag: 3
+ message Entry {
+ // The namespace in this grouping that should be returned.
+ // This field should be empty if ResultGroupingType is SCHEMA_TYPE
+ optional string namespace = 1;
+
+ // The schema in this grouping that should be returned.
+ // This field should be empty if ResultGroupingType is NAMESPACE
+ optional string schema = 2;
+ }
+
+ // Identifier for namespace and schema type pairs.
+ repeated Entry entry_groupings = 1;
+
+ // The maximum number of results in this grouping that should be returned.
+ optional int32 max_results = 2;
+ }
+
+ // How to limit the number of results returned per set of namespaces or schema
+ // type. If results match for a namespace or schema type that is not present
+ // in any result groupings, then those results will be returned without limit.
+ //
+ // Non-existent namespaces and/or schema type will be ignored.
+ //
+ // Example: Suppose that there are four namespaces, each with three results
+ // matching the query for "foo". Without any result groupings, Icing would
+ // return the following results:
+ // ["ns0doc0", "ns0doc1", "ns1doc0", "ns3doc0", "ns0doc2", "ns3doc1",
+ // "ns2doc1", "ns3doc2", "ns2doc0", "ns1doc1", "ns2doc2", "ns1doc1"].
+ //
+ // The following result groupings will be returned if the
+ // ResultGroupingType is set to NAMESPACE:
+ // [ { [ {"namespace0"} ], 2 }, { [ {"namespace1"}, {"namespace2"} ], 2} ]
+ //
+ // The following results will be returned:
+ // ["ns0doc0", "ns0doc1", "ns1doc0", "ns3doc0", "ns3doc1", "ns2doc1",
+ // "ns3doc2"].
+ repeated ResultGrouping result_groupings = 5;
+
+ // The threshold of total bytes of all documents at which to cut off a page,
+ // in order to limit the number of bytes in a single page.
+ // Note that it doesn't guarantee that the result's byte count will be
+ // smaller than, equal to, or larger than the threshold. It is just a cutoff,
+ // and only guarantees that the total bytes of the search results will exceed
+ // the threshold by less than the size of the final search result.
+ optional int32 num_total_bytes_per_page_threshold = 6
+ [default = 2147483647]; // INT_MAX
+
+ // The value by which the search results will be grouped.
+ // Can get grouped by schema type, namespace (default), or by namespace and
+ // schema type.
+ enum ResultGroupingType {
+ NONE = 0;
+ SCHEMA_TYPE = 1;
+ NAMESPACE = 2;
+ NAMESPACE_AND_SCHEMA_TYPE = 3;
+ }
+ optional ResultGroupingType result_group_type = 7;
+
+ // The max number of child documents that will be attached and returned in
+ // the result for each parent. It is only used for the join API.
+ optional int32 max_joined_children_per_parent_to_return = 8;
+
+ // The max number of results to be scored and ranked.
+ // Running time of ScoringProcessor and Ranker is O(num_to_score) according to
+ // results of //icing/scoring:score-and-rank_benchmark. Note that
+ // the process includes scoring, building a heap, and popping results from the
+ // heap.
+ //
+ // 30000 results can be scored and ranked within 3 ms on a Pixel 3 XL
+ // according to results of
+ // //icing/scoring:score-and-rank_benchmark, so set it as the
+ // default value.
+ optional int32 num_to_score = 9 [default = 30000];
}
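The namespace example above maps onto the proto like this sketch (standard proto2 C++ generated API assumed; note protoc renames the 'namespace' field to set_namespace_() in C++):

  #include "icing/proto/search.pb.h"  // assumed generated header path

  // Caps "namespace0" at 2 results and "namespace1"/"namespace2" at 2 combined.
  icing::lib::ResultSpecProto MakeGroupedResultSpec() {
    icing::lib::ResultSpecProto result_spec;
    result_spec.set_result_group_type(icing::lib::ResultSpecProto::NAMESPACE);

    auto* group0 = result_spec.add_result_groupings();
    group0->add_entry_groupings()->set_namespace_("namespace0");
    group0->set_max_results(2);

    auto* group1 = result_spec.add_result_groupings();
    group1->add_entry_groupings()->set_namespace_("namespace1");
    group1->add_entry_groupings()->set_namespace_("namespace2");
    group1->set_max_results(2);
    return result_spec;
  }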
// The representation of a single match within a DocumentProto property.
-// Next tag: 6
+//
+// Example: A document whose content is "Necesito comprar comida mañana." and a
+// query for "mana" with window=15
+// Next tag: 12
message SnippetMatchProto {
- // Properties may have multiple values. values_index indicates which of these
- // multiple string values the match occurred in. For properties with only one
- // value, the values_index will always be 0.
- // Ex. "Recipients" [
- // { { "Name" : "Daffy Duck" }
- // { "EmailAddress" : "daffduck@gmail.com" } },
- // { { "Name" : "Donald Duck" }
- // { "EmailAddress" : "donduck@gmail.com" } }
- // "Daffy Duck" is the string value with a value_index of 0 for property
- // "Recipients.Name". "Donald Duck" is the string value with a value_index of
- // 1 for property "Recipients.Name".
- optional int32 values_index = 1;
-
- // The position and length within the matched string at which the exact
- // match begins.
- optional int32 exact_match_position = 2;
-
- optional int32 exact_match_bytes = 3;
-
- // The position and length of the suggested snippet window.
- optional int32 window_position = 4;
-
- optional int32 window_bytes = 5;
+ // The index of the byte in the string at which the match begins and the
+ // length in bytes of the match.
+ //
+ // For the example above, the values of these fields would be
+ // exact_match_byte_position=24, exact_match_byte_length=7 "mañana"
+ optional int32 exact_match_byte_position = 2;
+ optional int32 exact_match_byte_length = 3;
+
+ // The length in bytes of the subterm that matches the query. The beginning of
+ // the submatch is the same as exact_match_byte_position.
+ //
+ // For the example above, the value of this field would be 5. With
+ // exact_match_byte_position=24 above, it would produce the substring "maña"
+ optional int32 submatch_byte_length = 10;
+
+ // The index of the UTF-16 code unit in the string at which the match begins
+ // and the length in UTF-16 code units of the match. This is for use with
+ // UTF-16 encoded strings like Java.lang.String.
+ //
+ // For the example above, the values of these fields would be
+ // exact_match_utf16_position=24, exact_match_utf16_length=6 "mañana"
+ optional int32 exact_match_utf16_position = 6;
+ optional int32 exact_match_utf16_length = 7;
+
+ // The length in UTF-16 code units of the subterm that matches the query. The
+ // beginning of the submatch is the same as exact_match_utf16_position. This
+ // is for use with UTF-16 encoded strings like Java.lang.String.
+ //
+ // For the example above, the value of this field would be 4. With
+ // exact_match_utf16_position=24 above, it would produce the substring "maña"
+ optional int32 submatch_utf16_length = 11;
+
+ // The index of the byte in the string at which the suggested snippet window
+ // begins and the length in bytes of the window.
+ //
+ // For the example above, the values of these fields would be
+ // window_byte_position=17, window_byte_length=15 "comida mañana."
+ optional int32 window_byte_position = 4;
+ optional int32 window_byte_length = 5;
+
+ // The index of the UTF-16 code unit in the string at which the suggested
+ // snippet window begins and the length in UTF-16 code units of the window.
+ // This is for use with UTF-16 encoded strings like Java.lang.String.
+ //
+ // For the example above, the values of these fields would be
+ // window_utf16_position=17, window_utf16_length=14 "comida mañana."
+ optional int32 window_utf16_position = 8;
+ optional int32 window_utf16_length = 9;
+
+ reserved 1;
}
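Given the byte-oriented fields above, extracting the match and window from the raw UTF-8 content is a pair of substr calls. For the "mañana" example this returns {"mañana", "comida mañana."} (a sketch; generated header path assumed):

  #include <string>
  #include <utility>

  #include "icing/proto/search.pb.h"  // assumed generated header path

  std::pair<std::string, std::string> ExtractMatchAndWindow(
      const std::string& content, const icing::lib::SnippetMatchProto& match) {
    std::string exact = content.substr(match.exact_match_byte_position(),
                                       match.exact_match_byte_length());
    std::string window = content.substr(match.window_byte_position(),
                                        match.window_byte_length());
    return {exact, window};
  }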
// A Proto representing all snippets for a single DocumentProto.
@@ -133,9 +300,29 @@ message SnippetProto {
// property values in the corresponding DocumentProto.
// Next tag: 3
message EntryProto {
- // A '.'-delimited sequence of property names indicating which property in
- // the DocumentProto these snippets correspond to.
- // Example properties: 'body', 'sender.name', 'sender.emailaddress', etc.
+ // A property path indicating which property in the DocumentProto these
+ // snippets correspond to. Property paths will contain 1) property names,
+ // 2) the property separator character '.' used to represent nested property
+ // and 3) indices surrounded by brackets to represent a specific value in
+ // that property.
+ //
+ // Example properties:
+ // - 'body' : the first and only string value of a top-level
+ // property called 'body'.
+ // - 'sender.name' : the first and only string value of a property
+ // called 'name' that is a subproperty of a
+ // property called 'sender'.
+ // - 'bcc[1].emailaddress': the first and only string value of a property
+ // called 'emailaddress' that is a subproperty of
+ // the second document value of a property called
+ // 'bcc'.
+ // - 'attachments[0]' : the first (of more than one) string value of a
+ // property called 'attachments'.
+ // NOTE: If there is only a single value for a property (like
+ // 'sender.name'), then no value index will be added to the property path.
+ // An index of [0] is implied. If there is more than one value for a
+ // property, then the value index will be added to the property path (like
+ // 'attachments[0]').
optional string property_name = 1;
repeated SnippetMatchProto snippet_matches = 2;
@@ -145,7 +332,7 @@ message SnippetProto {
}
// Icing lib-supplied results from a search request.
-// Next tag: 5
+// Next tag: 6
message SearchResultProto {
// Status code can be one of:
// OK
@@ -161,7 +348,7 @@ message SearchResultProto {
optional StatusProto status = 1;
// The Results that matched the query. Empty if there was an error.
- // Next tag: 3
+ // Next tag: 5
message ResultProto {
// Document that matches the SearchSpecProto.
optional DocumentProto document = 1;
@@ -169,30 +356,203 @@ message SearchResultProto {
// Snippeting information for the document if requested in the
// ResultSpecProto. A default instance, if not requested.
optional SnippetProto snippet = 2;
+
+ // The score that the document was ranked by. The meaning of this score is
+ // determined by ScoringSpecProto.rank_by.
+ optional double score = 3;
+
+ // The child documents that were joined to a parent document.
+ repeated ResultProto joined_results = 4;
}
repeated ResultProto results = 2;
// Various debug fields. Not populated if ResultSpecProto.debug_info = false.
+ // Next tag: 4
message DebugInfoProto {
- // The number of results that actually matched the SearchSpecProto. This is
- // different from the number of `documents` returned since the user can
- // set a ResultSpecProto.limit on how many results are returned to them.
- optional uint64 num_results = 1;
-
- // Latency to parse and execute the query, in milliseconds.
- optional uint64 latency_ms = 2;
-
// The internal representation of the actual query string that was executed.
// This may be different from the SearchSpecProto.query if the original
// query was malformed.
optional string executed_query = 3;
+
+ reserved 1, 2;
}
optional DebugInfoProto debug_info = 3;
// An opaque token used internally to keep track of information needed for
// pagination. A valid pagination token is required to fetch other pages of
- // results. The default value 0 means that there're no more pages.
+ // results. A value of 0 means that there are no more pages.
// LINT.IfChange(next_page_token)
- optional uint64 next_page_token = 4 [default = 0];
+ optional uint64 next_page_token = 4;
// LINT.ThenChange(//depot/google3/icing/result/result-state-manager.h:kInvalidNextPageToken)
+
+ // Stats for query execution performance.
+ optional QueryStatsProto query_stats = 5;
+}
+
+// Next tag: 3
+message TypePropertyMask {
+ // The schema type to which these property masks should apply.
+ // If the schema type is the wildcard ("*"), then the type property masks
+ // will apply to all results of types that don't have their own, specific
+ // type property mask entry.
+ optional string schema_type = 1;
+
+ // The property masks specifying the property to be retrieved. Property
+ // masks must be composed only of property names, property separators (the
+ // '.' character). For example, "subject", "recipients.name". Specifying no
+ // property masks will result in *no* properties being retrieved.
+ repeated string paths = 2;
+}
+
+// Next tag: 2
+message GetResultSpecProto {
+ // How to specify a subset of properties to retrieve. If no type property mask
+ // has been specified for a schema type, then *all* properties of that schema
+ // type will be retrieved.
+ repeated TypePropertyMask type_property_masks = 1;
+}
+
+// Next tag: 8
+message SuggestionSpecProto {
+ // REQUIRED: The "raw" prefix string that users may type. For example, "f"
+ // will search for suggested queries that start with "f", like "foo" or
+ // "fool".
+ optional string prefix = 1;
+
+ // OPTIONAL: Only search for suggestions under the specified namespaces.
+ // If unset, the suggestion will search over all namespaces. Note that this
+ // applies to the entire 'prefix'. To issue different suggestions for
+ // different namespaces, separate RunSuggestion()'s will need to be made.
+ repeated string namespace_filters = 2;
+
+ // REQUIRED: The number of suggestions to be returned.
+ optional int32 num_to_return = 3;
+
+ // Indicates how the suggestion terms should be scored and ranked.
+ optional SuggestionScoringSpecProto scoring_spec = 4;
+
+ // OPTIONAL: Only search for suggestions under the specified
+ // DocumentUris. If unset, the suggestion will search over all Documents.
+ //
+ // Each namespace in the given NamespaceDocumentUriGroups should match the
+ // namespace_filters, i.e. it appears in namespace_filters or
+ // namespace_filters is empty.
+ //
+ // No given NamespaceDocumentUriGroup may have an empty document_uris list;
+ // use the namespace_filters to exclude a namespace instead.
+ //
+ // Note that this applies to the entire 'prefix'. To issue different
+ // suggestions for different DocumentIds, separate RunSuggestion()'s will need
+ // to be made.
+ repeated NamespaceDocumentUriGroup document_uri_filters = 5;
+
+ // OPTIONAL: Only search for suggestions under the specified schemas.
+ // If unset, the suggestion will search over all schema types. Note that this
+ // applies to the entire 'prefix'. To issue different suggestions for
+ // different schema types, separate RunSuggestion()'s will need to be made.
+ repeated string schema_type_filters = 6;
+
+ // OPTIONAL: Only search for suggestions under the specified types and
+ // properties.
+ //
+ // If unset, the suggestion will search over all types.
+ // If the TypePropertyMask.paths is unset, the suggestion will search over all
+ // properties under the TypePropertyMask.schema_type.
+ //
+ // Note that this applies to the entire 'prefix'. To issue different
+ // suggestions for different types, separate RunSuggestion()'s will need to be
+ // made.
+ repeated TypePropertyMask type_property_filters = 7;
+}
+
+// A group that holds namespace and document_uris under it.
+message NamespaceDocumentUriGroup {
+ optional string namespace_ = 1;
+ repeated string document_uris = 2;
+}
+
+// Next tag: 3
+message SuggestionResponse {
+ message Suggestion {
+ // The suggested query string for client to search for.
+ optional string query = 1;
+ }
+
+ // Status code can be one of:
+ // OK
+ // FAILED_PRECONDITION
+ // INTERNAL
+ //
+ // See status.proto for more details.
+ optional StatusProto status = 1;
+
+ repeated Suggestion suggestions = 2;
+}
+
+// Specification for a left outer join.
+//
+// Next tag: 7
+message JoinSpecProto {
+ // Collection of several specs that will be used for searching and joining
+ // child documents.
+ //
+ // Next tag: 4
+ message NestedSpecProto {
+ // A nested SearchSpec that will be used to retrieve child documents. If you
+ // are only looking to join on a specific type of document, you could set a
+ // schema filter in this SearchSpec. This includes the nested search query.
+ // See SearchSpecProto.
+ optional SearchSpecProto search_spec = 1;
+
+ // A nested ScoringSpec that will be used to score child documents.
+ // See ScoringSpecProto.
+ optional ScoringSpecProto scoring_spec = 2;
+
+ // A nested ResultSpec that will be used to format child documents in the
+ // result joined documents, e.g. snippeting, projection.
+ // See ResultSpecProto.
+ optional ResultSpecProto result_spec = 3;
+ }
+ optional NestedSpecProto nested_spec = 1;
+
+ // The equivalent of a primary key in SQL. This is an expression that will be
+ // used to match child documents from the nested search to this document. One
+ // such expression is qualifiedId(). When used, it means the contents of the
+ // child_property_expression property in the child documents must be equal to
+ // the parent document's qualified id.
+ // TODO(b/256022027) allow for parent_property_expression to be any property
+ // of the parent document.
+ optional string parent_property_expression = 2;
+
+ // The equivalent of a foreign key in SQL. This defines an equality
+ // constraint between a property in a child document and a property in the
+ // parent document. For example, if you want to join child documents which
+ // have an entityId property containing a fully qualified document id,
+ // child_property_expression can be set to "entityId".
+ // TODO(b/256022027) figure out how to allow this to refer to documents
+ // outside of same pkg+db+ns.
+ optional string child_property_expression = 3;
+
+ // The max number of child documents to join to a parent document.
+ // DEPRECATED: use ResultSpecProto.max_joined_children_per_parent_to_return to
+ // control the number of children that are returned. There is no supported
+ // control for the number of children being scored at this time.
+ optional int32 max_joined_child_count = 4 [deprecated = true];
+
+ // The strategy by which to score the aggregation of child documents. For
+ // example, you might want to know which entity document has the most actions
+ // taken on it. If JOIN_AGGREGATE_SCORE is used in the base SearchSpecProto,
+ // the COUNT value will rank entity documents based on the number of child
+ // documents.
+ message AggregationScoringStrategy {
+ enum Code {
+     NONE = 0;  // No aggregation strategy for child documents; use the
+                // parent document's score.
+ COUNT = 1;
+ MIN = 2;
+ AVG = 3;
+ MAX = 4;
+ SUM = 5;
+ }
+ }
+ optional AggregationScoringStrategy.Code aggregation_scoring_strategy = 5;
}
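+
+// Example (a minimal C++ sketch of a qualified-id join, assuming the
+// protoc-generated API; the query and property names are hypothetical):
+//
+//   JoinSpecProto join_spec;
+//   join_spec.set_parent_property_expression("qualifiedId()");
+//   join_spec.set_child_property_expression("entityId");
+//   join_spec.mutable_nested_spec()->mutable_search_spec()->set_query(
+//       "actions");
+//   join_spec.set_aggregation_scoring_strategy(
+//       JoinSpecProto::AggregationScoringStrategy::COUNT);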
diff --git a/proto/icing/proto/status.proto b/proto/icing/proto/status.proto
index 2733a15..06ec6c4 100644
--- a/proto/icing/proto/status.proto
+++ b/proto/icing/proto/status.proto
@@ -24,7 +24,7 @@ option objc_class_prefix = "ICNG";
// Canonical status to indicate the results of API calls.
// Next tag: 3
message StatusProto {
- // Next tag: 9
+ // Next tag: 10
enum Code {
// A default for all other use-cases. Should never be used in practice. This
// may happen if there are backwards-compatibility issues.
@@ -36,6 +36,9 @@ message StatusProto {
// The IcingSearchEngine instance is still usable. But the schema and/or
// documents may need to be re-added to prevent future API calls from
  // failing or returning incorrect information.
+ //
+ // TODO(b/171750324): split into WARNING_PARTIAL_LOSS and
+ // WARNING_COMPLETE_LOSS.
WARNING_DATA_LOSS = 2;
// Parameters to API call are invalid and cannot be processed.
@@ -62,6 +65,12 @@ message StatusProto {
// make some space on the underlying filesystem.
OUT_OF_SPACE = 8;
+ // An operation is invalid because the resource already exists and can't be
+ // replaced. For example, this status is used when a SchemaProto contains
+ // multiple definitions of the same type or multiple properties with the
+ // same name within a type.
+ ALREADY_EXISTS = 9;
+
// Any future status codes.
}
optional Code code = 1;
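+
+ // Example (a minimal C++ sketch of checking for ALREADY_EXISTS, assuming an
+ // IcingSearchEngine instance `icing` and a SchemaProto `schema`):
+ //
+ //   SetSchemaResultProto result = icing.SetSchema(schema);
+ //   if (result.status().code() == StatusProto::ALREADY_EXISTS) {
+ //     // `schema` defined the same type or property more than once.
+ //   }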
diff --git a/proto/icing/proto/storage.proto b/proto/icing/proto/storage.proto
new file mode 100644
index 0000000..39dab6b
--- /dev/null
+++ b/proto/icing/proto/storage.proto
@@ -0,0 +1,187 @@
+// Copyright 2021 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package icing.lib;
+
+import "icing/proto/status.proto";
+
+option java_package = "com.google.android.icing.proto";
+option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
+// Next tag: 10
+message NamespaceStorageInfoProto {
+ // Name of the namespace.
+ optional string namespace = 1;
+
+ // Number of alive documents in this namespace.
+ optional int32 num_alive_documents = 2;
+
+ // NOTE: We don't have stats on the number of deleted documents in a
+ // namespace because we completely erase all data on a document when it's
+ // deleted, so we can't tell which namespace it belonged to.
+
+ // Number of expired documents in this namespace.
+ optional int32 num_expired_documents = 3;
+
+ // LINT.IfChange(namespace_storage_info_usage_types)
+ // Number of alive documents that have a UsageReport.usage_type reported.
+ optional int32 num_alive_documents_usage_type1 = 4;
+ optional int32 num_alive_documents_usage_type2 = 5;
+ optional int32 num_alive_documents_usage_type3 = 6;
+
+ // Number of expired documents that have a UsageReport.usage_type reported.
+ optional int32 num_expired_documents_usage_type1 = 7;
+ optional int32 num_expired_documents_usage_type2 = 8;
+ optional int32 num_expired_documents_usage_type3 = 9;
+ // LINT.ThenChange()
+}
+
+// Next tag: 15
+message DocumentStorageInfoProto {
+ // Total number of alive documents.
+ optional int32 num_alive_documents = 1;
+
+ // Total number of deleted documents.
+ optional int32 num_deleted_documents = 2;
+
+ // Total number of expired documents.
+ optional int32 num_expired_documents = 3;
+
+ // Total size of the document store in bytes. Will be set to -1 if an IO error
+ // is encountered while calculating this field.
+ optional int64 document_store_size = 4;
+
+ // Total size of the ground truth in bytes. The ground truth may
+ // include deleted or expired documents. Will be set to -1 if an IO error is
+ // encountered while calculating this field.
+ optional int64 document_log_size = 5;
+
+ // Size of the key mapper in bytes. Will be set to -1 if an IO error is
+ // encountered while calculating this field.
+ optional int64 key_mapper_size = 6;
+
+ // Size of the document id mapper in bytes. Will be set to -1 if an IO error
+ // is encountered while calculating this field.
+ optional int64 document_id_mapper_size = 7;
+
+ // Size of the score cache in bytes. Will be set to -1 if an IO error is
+ // encountered while calculating this field.
+ optional int64 score_cache_size = 8;
+
+ // Size of the filter cache in bytes. Will be set to -1 if an IO error is
+ // encountered while calculating this field.
+ optional int64 filter_cache_size = 9;
+
+ // Size of the corpus mapper in bytes. Will be set to -1 if an IO error is
+ // encountered while calculating this field.
+ optional int64 corpus_mapper_size = 10;
+
+ // Size of the corpus score cache in bytes. Will be set to -1 if an IO error
+ // is encountered while calculating this field.
+ optional int64 corpus_score_cache_size = 11;
+
+ // Size of the namespace id mapper in bytes. Will be set to -1 if an IO error
+ // is encountered while calculating this field.
+ optional int64 namespace_id_mapper_size = 12;
+
+ // Number of namespaces seen from the current documents.
+ //
+ // TODO(cassiewang): This isn't technically needed anymore since clients can
+ // get this number from namespace_storage_info. Consider removing this.
+ optional int32 num_namespaces = 13;
+
+ // Storage information of each namespace.
+ repeated NamespaceStorageInfoProto namespace_storage_info = 14;
+}
+
+// Next tag: 5
+message SchemaStoreStorageInfoProto {
+ // Size of the schema store in bytes. Will be set to -1 if an IO error is
+ // encountered while calculating this field.
+ optional int64 schema_store_size = 1;
+
+ // Total number of schema types.
+ optional int32 num_schema_types = 2;
+
+ // Total number of sections across all types.
+ optional int32 num_total_sections = 3;
+
+ // Total number of schema types that have reached the section limit.
+ optional int32 num_schema_types_sections_exhausted = 4;
+}
+
+// Next tag: 9
+message IndexStorageInfoProto {
+ // Total size of the index in bytes. Will be set to -1 if an IO error is
+ // encountered while calculating this field.
+ optional int64 index_size = 1;
+
+ // Size of the lite index lexicon in bytes. Will be set to -1 if an IO error
+ // is encountered while calculating this field.
+ optional int64 lite_index_lexicon_size = 2;
+
+ // Size of the lite index hit buffer in bytes. Will be set to -1 if an IO
+ // error is encountered while calculating this field.
+ optional int64 lite_index_hit_buffer_size = 3;
+
+ // Size of the main index lexicon in bytes. Will be set to -1 if an IO error
+ // is encountered while calculating this field.
+ optional int64 main_index_lexicon_size = 4;
+
+ // Size of the main index storage in bytes. Will be set to -1 if an IO error
+ // is encountered while calculating this field.
+ optional int64 main_index_storage_size = 5;
+
+ // Size of one main index block in bytes.
+ optional int64 main_index_block_size = 6;
+
+ // Number of main index blocks.
+ optional int32 num_blocks = 7;
+
+ // Fraction of the main index blocks that are free, assuming allocated
+ // blocks are fully used.
+ optional float min_free_fraction = 8;
+}
+
+// Next tag: 5
+message StorageInfoProto {
+ // Total size of Icing's storage in bytes. Will be set to -1 if an IO error is
+ // encountered while calculating this field.
+ optional int64 total_storage_size = 1;
+
+ // Storage information of the document store.
+ optional DocumentStorageInfoProto document_storage_info = 2;
+
+ // Storage information of the schema store.
+ optional SchemaStoreStorageInfoProto schema_store_storage_info = 3;
+
+ // Storage information of the index.
+ optional IndexStorageInfoProto index_storage_info = 4;
+}
+
+// Next tag: 3
+message StorageInfoResultProto {
+ // Status code can be one of:
+ // OK
+ // FAILED_PRECONDITION
+ //
+ // See status.proto for more details.
+ optional StatusProto status = 1;
+
+ // Storage information of Icing.
+ optional StorageInfoProto storage_info = 2;
+}
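+
+// Example (a minimal C++ sketch, assuming an IcingSearchEngine instance
+// `icing` whose GetStorageInfo() returns this message):
+//
+//   StorageInfoResultProto result = icing.GetStorageInfo();
+//   if (result.status().code() == StatusProto::OK) {
+//     int64_t total_bytes = result.storage_info().total_storage_size();
+//     const DocumentStorageInfoProto& doc_info =
+//         result.storage_info().document_storage_info();
+//     // e.g. doc_info.num_alive_documents(), doc_info.document_store_size().
+//   }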
diff --git a/proto/icing/proto/usage.proto b/proto/icing/proto/usage.proto
new file mode 100644
index 0000000..eaa2671
--- /dev/null
+++ b/proto/icing/proto/usage.proto
@@ -0,0 +1,69 @@
+// Copyright 2019 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package icing.lib;
+
+import "icing/proto/status.proto";
+
+option java_package = "com.google.android.icing.proto";
+option java_multiple_files = true;
+option objc_class_prefix = "ICNG";
+
+// Representation of a usage report that is generated from the client and sent
+// to Icing.
+// Next tag: 5
+message UsageReport {
+ // Namespace of the document.
+ optional string document_namespace = 1;
+
+ // Uri of the document.
+ optional string document_uri = 2;
+
+ // Timestamp in milliseconds of when the usage occurred.
+ optional int64 usage_timestamp_ms = 3;
+
+ // LINT.IfChange
+ // Next tag: 3
+ enum UsageType {
+ // A custom usage type that clients can assign a meaning to. UsageReports of
+ // the same type are combined to provide usage counts that clients may use
+ // in scoring.
+ USAGE_TYPE1 = 0;
+
+ // Same as above.
+ USAGE_TYPE2 = 1;
+
+ // Same as above.
+ USAGE_TYPE3 = 2;
+ }
+ // LINT.ThenChange(
+ // //depot/google3/icing/store/usage-store.h:UsageScores,
+ // //depot/google3/icing/proto/\
+ // storage.proto:namespace_storage_info_usage_types)
+ optional UsageType usage_type = 4;
+}
+
+// Result of a call to IcingSearchEngine.ReportUsage
+// Next tag: 2
+message ReportUsageResultProto {
+ // Status code can be one of:
+ // OK
+ // NOT_FOUND
+ // INTERNAL
+ //
+ // See status.proto for more details.
+ optional StatusProto status = 1;
+}
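+
+// Example (a minimal C++ sketch of reporting usage, assuming an
+// IcingSearchEngine instance `icing` whose ReportUsage() returns the result
+// above; the namespace/uri/timestamp values are hypothetical):
+//
+//   UsageReport report;
+//   report.set_document_namespace("email");
+//   report.set_document_uri("uri1");
+//   report.set_usage_timestamp_ms(1620000000000);
+//   report.set_usage_type(UsageReport::USAGE_TYPE1);
+//   ReportUsageResultProto result = icing.ReportUsage(report);
+//   // result.status().code() is NOT_FOUND if the document doesn't exist.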
diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt
new file mode 100644
index 0000000..dd08fd1
--- /dev/null
+++ b/synced_AOSP_CL_number.txt
@@ -0,0 +1 @@
+set(synced_AOSP_CL_number=587883838)