aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorElliot Gorokhovsky <embg@fb.com>2021-12-10 16:19:40 -0500
committerElliot Gorokhovsky <embg@fb.com>2021-12-14 14:29:01 -0500
commit71c0c07c190fcb354ac16feef2a5afd7616422c2 (patch)
tree1ce9fac5bc661ddd50880c86b4cdc8cc9048338f
parent57383d23172119c1bb216a3c530bc425794bd786 (diff)
downloadzstd-71c0c07c190fcb354ac16feef2a5afd7616422c2.tar.gz
Allow user to specify memory limit for dictionary training
-rw-r--r--programs/dibio.c7
-rw-r--r--programs/dibio.h2
-rw-r--r--programs/zstd.1.md6
-rw-r--r--programs/zstdcli.c6
-rwxr-xr-xtests/playTests.sh7
5 files changed, 23 insertions, 5 deletions
diff --git a/programs/dibio.c b/programs/dibio.c
index e7fb905e..04860dbb 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -309,7 +309,7 @@ static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t
int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
const char** fileNamesTable, int nbFiles, size_t chunkSize,
ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
- ZDICT_fastCover_params_t* fastCoverParams, int optimize)
+ ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
{
fileStats fs;
size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
@@ -341,6 +341,11 @@ int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
/* Limit the size of the training data to 2GB */
/* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
+ if (memLimit != 0) {
+ DISPLAYLEVEL(2, "! Warning : setting manual memory limit for dictionary training data at %u MB \n",
+ (unsigned)(memLimit / (1 MB)));
+ loadedSize = (size_t)MIN(loadedSize, memLimit);
+ }
srcBuffer = malloc(loadedSize+NOISELENGTH);
sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
}
diff --git a/programs/dibio.h b/programs/dibio.h
index 03ec80e5..666c1e66 100644
--- a/programs/dibio.h
+++ b/programs/dibio.h
@@ -34,6 +34,6 @@
int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
const char** fileNamesTable, int nbFiles, size_t chunkSize,
ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
- ZDICT_fastCover_params_t* fastCoverParams, int optimize);
+ ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit);
#endif
diff --git a/programs/zstd.1.md b/programs/zstd.1.md
index ef37fef3..e343ec04 100644
--- a/programs/zstd.1.md
+++ b/programs/zstd.1.md
@@ -190,6 +190,10 @@ the last one takes effect.
This is also used during compression when used with --patch-from=. In this case,
this parameter overrides the maximum size allowed for a dictionary (128 MB).
+
+ Additionally, this can be used to limit memory for dictionary training. This parameter
+ overrides the default limit of 2 GB. zstd will load training samples up to the memory limit
+ and ignore the rest.
* `--stream-size=#` :
Sets the pledged source size of input coming from a stream. This value must be exact, as it
will be included in the produced frame header. Incorrect stream sizes will cause an error.
@@ -329,6 +333,8 @@ Compression of small files similar to the sample set will be greatly improved.
resulting in a _small_ compression ratio improvement for this level.
* `-B#`:
Split input files into blocks of size # (default: no split)
+* `-M#`, `--memory=#`:
+ Limit the amount of sample data loaded for training (default: 2 GB). See above for details.
* `--dictID=#`:
A dictionary ID is a locally unique ID
that a decoder can use to verify it is using the right dictionary.
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index 4d1978c8..bfe18c0c 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -1327,18 +1327,18 @@ int main(int argCount, const char* argv[])
int const optimize = !coverParams.k || !coverParams.d;
coverParams.nbThreads = (unsigned)nbWorkers;
coverParams.zParams = zParams;
- operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize);
+ operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize, memLimit);
} else if (dict == fastCover) {
int const optimize = !fastCoverParams.k || !fastCoverParams.d;
fastCoverParams.nbThreads = (unsigned)nbWorkers;
fastCoverParams.zParams = zParams;
- operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize);
+ operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize, memLimit);
} else {
ZDICT_legacy_params_t dictParams;
memset(&dictParams, 0, sizeof(dictParams));
dictParams.selectivityLevel = dictSelect;
dictParams.zParams = zParams;
- operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0);
+ operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0, memLimit);
}
#else
(void)dictCLevel; (void)dictSelect; (void)dictID; (void)maxDictSize; /* not used when ZSTD_NODICT set */
diff --git a/tests/playTests.sh b/tests/playTests.sh
index f6b6ac8f..ebe7bf28 100755
--- a/tests/playTests.sh
+++ b/tests/playTests.sh
@@ -1029,6 +1029,13 @@ then
fi
rm -f tmp* dictionary
+println "- Test --memory for dictionary compression"
+datagen -g12M -P90 > tmpCorpusHighCompress
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=10K && die "Dictionary training should fail : --memory too low (10K)"
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=5MB 2> zstTrainWithMemLimitStdErr
+cat zstTrainWithMemLimitStdErr | grep "setting manual memory limit for dictionary training data at 5 MB"
+cat zstTrainWithMemLimitStdErr | grep "Training samples set too large (12 MB); training on 5 MB only..."
+rm zstTrainWithMemLimitStdErr
println "\n===> fastCover dictionary builder : advanced options "
TESTFILE="$PRGDIR"/zstdcli.c