aboutsummaryrefslogtreecommitdiff
path: root/research/deorummolae.cc
diff options
context:
space:
mode:
Diffstat (limited to 'research/deorummolae.cc')
-rwxr-xr-x[-rw-r--r--]research/deorummolae.cc200
1 files changed, 87 insertions, 113 deletions
diff --git a/research/deorummolae.cc b/research/deorummolae.cc
index d15b7ee..f4cc53e 100644..100755
--- a/research/deorummolae.cc
+++ b/research/deorummolae.cc
@@ -1,7 +1,7 @@
#include "./deorummolae.h"
#include <array>
-#include <cstdio>
+#include <vector>
#include "./esaxx/sais.hxx"
@@ -15,31 +15,22 @@
/* Non tunable definitions. */
#define CHUNK_MASK (CHUNK_SIZE - 1)
-#define COVERAGE_SIZE (1 << (DM_LOG_MAX_FILES - 6))
+#define COVERAGE_SIZE (1 << (LOG_MAX_FILES - 6))
/* File coverage: every bit set to 1 denotes a file covered by an isle. */
typedef std::array<uint64_t, COVERAGE_SIZE> Coverage;
-/* Symbol of text alphabet. */
-typedef int32_t TextChar;
-
-/* Pointer to position in text. */
-typedef uint32_t TextIdx;
-
-/* SAIS sarray_type; unfortunately, must be a signed type. */
-typedef int32_t TextSaIdx;
-
-static size_t popcount(uint64_t u) {
- return static_cast<size_t>(__builtin_popcountll(u));
+static int popcount(uint64_t u) {
+ return __builtin_popcountll(u);
}
/* Condense terminators and pad file entries. */
-static void rewriteText(std::vector<TextChar>* text) {
- TextChar terminator = text->back();
- TextChar prev = terminator;
- TextIdx to = 0;
- for (TextIdx from = 0; from < text->size(); ++from) {
- TextChar next = text->at(from);
+static void rewriteText(std::vector<int>* text) {
+ int terminator = text->back();
+ int prev = terminator;
+ size_t to = 0;
+ for (size_t from = 0; from < text->size(); ++from) {
+ int next = text->at(from);
if (next < 256 || prev < 256) {
text->at(to++) = next;
if (next >= 256) terminator = next;
@@ -52,12 +43,11 @@ static void rewriteText(std::vector<TextChar>* text) {
}
/* Reenumerate terminators for smaller alphabet. */
-static void remapTerminators(std::vector<TextChar>* text,
- TextChar* next_terminator) {
- TextChar prev = -1;
- TextChar x = 256;
- for (TextIdx i = 0; i < text->size(); ++i) {
- TextChar next = text->at(i);
+static void remapTerminators(std::vector<int>* text, int* next_terminator) {
+ int prev = -1;
+ int x = 256;
+ for (size_t i = 0; i < text->size(); ++i) {
+ int next = text->at(i);
if (next < 256) { // Char.
// Do nothing.
} else if (prev < 256) { // Terminator after char.
@@ -72,15 +62,15 @@ static void remapTerminators(std::vector<TextChar>* text,
}
/* Combine all file entries; create mapping position->file. */
-static void buildFullText(std::vector<std::vector<TextChar>>* data,
- std::vector<TextChar>* full_text, std::vector<TextIdx>* file_map,
- std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
+static void buildFullText(std::vector<std::vector<int>>* data,
+ std::vector<int>* full_text, std::vector<size_t>* file_map,
+ std::vector<size_t>* file_offset, int* next_terminator) {
file_map->resize(0);
file_offset->resize(0);
full_text->resize(0);
- for (TextIdx i = 0; i < data->size(); ++i) {
+ for (size_t i = 0; i < data->size(); ++i) {
file_offset->push_back(full_text->size());
- std::vector<TextChar>& file = data->at(i);
+ std::vector<int>& file = data->at(i);
rewriteText(&file);
full_text->insert(full_text->end(), file.begin(), file.end());
file_map->insert(file_map->end(), file.size() / CHUNK_SIZE, i);
@@ -90,19 +80,18 @@ static void buildFullText(std::vector<std::vector<TextChar>>* data,
/* Build longest-common-prefix based on suffix array and text.
TODO: borrowed -> unknown efficiency. */
-static void buildLcp(std::vector<TextChar>* text, std::vector<TextIdx>* sa,
- std::vector<TextIdx>* lcp, std::vector<TextIdx>* invese_sa) {
- TextIdx size = static_cast<TextIdx>(text->size());
+static void buildLcp(std::vector<int>* text, std::vector<int>* sa,
+ std::vector<int>* lcp, std::vector<int>* invese_sa) {
+ int size = static_cast<int>(text->size());
lcp->resize(size);
- TextIdx k = 0;
+ int k = 0;
lcp->at(size - 1) = 0;
- for (TextIdx i = 0; i < size; ++i) {
+ for (int i = 0; i < size; ++i) {
if (invese_sa->at(i) == size - 1) {
k = 0;
continue;
}
- // Suffix which follow i-th suffix.
- TextIdx j = sa->at(invese_sa->at(i) + 1);
+ int j = sa->at(invese_sa->at(i) + 1); // Suffix which follow i-th suffix.
while (i + k < size && j + k < size && text->at(i + k) == text->at(j + k)) {
++k;
}
@@ -115,21 +104,21 @@ static void buildLcp(std::vector<TextChar>* text, std::vector<TextIdx>* sa,
When we raise the LCP requirement, the isle sunks and smaller isles appear
instead. */
typedef struct {
- TextIdx lcp;
- TextIdx l;
- TextIdx r;
+ int lcp;
+ int l;
+ int r;
Coverage coverage;
} Isle;
/* Helper routine for `cutMatch`. */
-static void poisonData(TextIdx pos, TextIdx length,
- std::vector<std::vector<TextChar>>* data, std::vector<TextIdx>* file_map,
- std::vector<TextIdx>* file_offset, TextChar* next_terminator) {
- TextIdx f = file_map->at(pos / CHUNK_SIZE);
+static void poisonData(int pos, int length, std::vector<std::vector<int>>* data,
+ std::vector<size_t>* file_map, std::vector<size_t>* file_offset,
+ int* next_terminator) {
+ size_t f = file_map->at(pos / CHUNK_SIZE);
pos -= file_offset->at(f);
- std::vector<TextChar>& file = data->at(f);
- TextIdx l = (length == CUT_MATCH) ? CUT_MATCH : 1;
- for (TextIdx j = 0; j < l; j++, pos++) {
+ std::vector<int>& file = data->at(f);
+ int l = (length == CUT_MATCH) ? CUT_MATCH : 1;
+ for (int j = 0; j < l; j++, pos++) {
if (file[pos] >= 256) continue;
if (file[pos + 1] >= 256) {
file[pos] = file[pos + 1];
@@ -144,12 +133,12 @@ static void poisonData(TextIdx pos, TextIdx length,
/* Remove substrings of a given match from files.
Substrings are replaced with unique terminators, so next iteration SA would
not allow to cross removed areas. */
-static void cutMatch(std::vector<std::vector<TextChar>>* data, TextIdx index,
- TextIdx length, std::vector<TextIdx>* sa, std::vector<TextIdx>* lcp,
- std::vector<TextIdx>* invese_sa, TextChar* next_terminator,
- std::vector<TextIdx>* file_map, std::vector<TextIdx>* file_offset) {
+static void cutMatch(std::vector<std::vector<int>>* data, int index, int length,
+ std::vector<int>* sa, std::vector<int>* lcp, std::vector<int>* invese_sa,
+ int* next_terminator, std::vector<size_t>* file_map,
+ std::vector<size_t>* file_offset) {
while (length >= CUT_MATCH) {
- TextIdx i = index;
+ int i = index;
while (lcp->at(i) >= length) {
i++;
poisonData(
@@ -166,98 +155,81 @@ static void cutMatch(std::vector<std::vector<TextChar>>* data, TextIdx index,
}
}
-std::string DM_generate(size_t dictionary_size_limit,
- const std::vector<size_t>& sample_sizes, const uint8_t* sample_data) {
+size_t DM_generate(uint8_t* dictionary, size_t dictionary_size_limit,
+ size_t num_samples, const size_t* sample_sizes,
+ const uint8_t* sample_data) {
{
- TextIdx tmp = static_cast<TextIdx>(dictionary_size_limit);
- if ((tmp != dictionary_size_limit) || (tmp > 1u << 30)) {
- fprintf(stderr, "dictionary_size_limit is too large\n");
- return "";
+ uint64_t tmp = 0;
+ if (popcount(tmp - 1u) != 64) {
+ fprintf(stderr, "64-bit platform is required\n");
+ return 0;
}
}
/* Could use 256 + '0' for easier debugging. */
- TextChar next_terminator = 256;
+ int next_terminator = 256;
- std::string output;
- std::vector<std::vector<TextChar>> data;
+ std::vector<std::vector<int>> data;
- TextIdx offset = 0;
- size_t num_samples = sample_sizes.size();
- if (num_samples > DM_MAX_FILES) num_samples = DM_MAX_FILES;
+ size_t offset = 0;
+ if (num_samples > MAX_FILES) num_samples = MAX_FILES;
for (size_t n = 0; n < num_samples; ++n) {
- TextIdx delta = static_cast<TextIdx>(sample_sizes[n]);
- if (delta != sample_sizes[n]) {
- fprintf(stderr, "sample is too large\n");
- return "";
- }
- if (delta == 0) {
- fprintf(stderr, "0-length samples are prohibited\n");
- return "";
- }
- TextIdx next_offset = offset + delta;
- if (next_offset <= offset) {
- fprintf(stderr, "corpus is too large\n");
- return "";
- }
+ size_t next_offset = offset + sample_sizes[n];
data.push_back(
- std::vector<TextChar>(sample_data + offset, sample_data + next_offset));
+ std::vector<int>(sample_data + offset, sample_data + next_offset));
offset = next_offset;
data.back().push_back(next_terminator++);
}
/* Most arrays are allocated once, and then just resized to smaller and
smaller sizes. */
- std::vector<TextChar> full_text;
- std::vector<TextIdx> file_map;
- std::vector<TextIdx> file_offset;
- std::vector<TextIdx> sa;
- std::vector<TextIdx> invese_sa;
- std::vector<TextIdx> lcp;
+ std::vector<int> full_text;
+ std::vector<size_t> file_map;
+ std::vector<size_t> file_offset;
+ std::vector<int> sa;
+ std::vector<int> invese_sa;
+ std::vector<int> lcp;
std::vector<Isle> isles;
std::vector<char> output_data;
- TextIdx total = 0;
- TextIdx total_cost = 0;
- TextIdx best_cost;
+ size_t total = 0;
+ size_t total_cost = 0;
+ size_t best_cost;
Isle best_isle;
- size_t min_count = num_samples;
+ int min_count = num_samples;
while (true) {
- TextIdx max_match = static_cast<TextIdx>(dictionary_size_limit) - total;
+ size_t max_match = dictionary_size_limit - total;
buildFullText(&data, &full_text, &file_map, &file_offset, &next_terminator);
sa.resize(full_text.size());
- /* Hopefully, non-negative TextSaIdx is the same sa TextIdx counterpart. */
- saisxx(full_text.data(), reinterpret_cast<TextSaIdx*>(sa.data()),
- static_cast<TextChar>(full_text.size()), next_terminator);
+ saisxx(full_text.data(), sa.data(), static_cast<int>(full_text.size()),
+ next_terminator);
invese_sa.resize(full_text.size());
- for (TextIdx i = 0; i < full_text.size(); ++i) {
- invese_sa[sa[i]] = i;
- }
+ for (int i = 0; i < full_text.size(); ++i) invese_sa[sa[i]] = i;
buildLcp(&full_text, &sa, &lcp, &invese_sa);
/* Do not rebuild SA/LCP, just use different selection. */
- retry:
+retry:
best_cost = 0;
best_isle = {0, 0, 0, {{0}}};
isles.resize(0);
isles.push_back(best_isle);
- for (TextIdx i = 0; i < lcp.size(); ++i) {
- TextIdx l = i;
+ for (int i = 0; i < static_cast<int>(lcp.size()); ++i) {
+ int l = i;
Coverage cov = {{0}};
- size_t f = file_map[sa[i] / CHUNK_SIZE];
- cov[f >> 6] = (static_cast<uint64_t>(1)) << (f & 63);
+ int f = file_map[sa[i] / CHUNK_SIZE];
+ cov[f >> 6] = ((uint64_t)1) << (f & 63);
while (lcp[i] < isles.back().lcp) {
Isle& top = isles.back();
top.r = i;
l = top.l;
for (size_t x = 0; x < cov.size(); ++x) cov[x] |= top.coverage[x];
- size_t count = 0;
+ int count = 0;
for (size_t x = 0; x < cov.size(); ++x) count += popcount(cov[x]);
- TextIdx effective_lcp = top.lcp;
+ int effective_lcp = top.lcp;
/* Restrict (last) dictionary entry length. */
if (effective_lcp > max_match) effective_lcp = max_match;
- TextIdx cost = count * effective_lcp;
+ int cost = count * effective_lcp;
if (cost > best_cost && count >= min_count &&
effective_lcp >= MIN_MATCH) {
best_cost = cost;
@@ -280,23 +252,25 @@ std::string DM_generate(size_t dictionary_size_limit,
if (best_cost == 0 || best_isle.lcp < MIN_MATCH) {
if (min_count >= 8) {
min_count = (min_count * 7) / 8;
- fprintf(stderr, "Retry: min_count=%zu\n", min_count);
+ fprintf(stderr, "Retry: min_count=%d\n", min_count);
goto retry;
}
break;
}
/* Save the entry. */
- fprintf(stderr, "Savings: %d+%d, dictionary: %d+%d\n",
- total_cost, best_cost, total, best_isle.lcp);
- int* piece = &full_text[sa[best_isle.l]];
- output.insert(output.end(), piece, piece + best_isle.lcp);
+ fprintf(stderr,
+ "Savings: %zu+%zu, dictionary: %zu+%d\n",
+ total_cost, best_cost, total, best_isle.lcp);
+ for (size_t i = 0; i < best_isle.lcp; ++i) {
+ dictionary[total + i] =
+ static_cast<uint8_t>(full_text[sa[best_isle.l] + i]);
+ }
total += best_isle.lcp;
total_cost += best_cost;
- cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp, &invese_sa,
- &next_terminator, &file_map, &file_offset);
+ cutMatch(&data, best_isle.l, best_isle.lcp, &sa, &lcp,
+ &invese_sa, &next_terminator, &file_map, &file_offset);
if (total >= dictionary_size_limit) break;
}
-
- return output;
+ return total;
}