/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "data.h"

#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h> /* free() */

#include <sys/stat.h>

#include <curl/curl.h>

#include "mem.h"
#include "util.h"
#define XXH_STATIC_LINKING_ONLY
#include "xxhash.h"

/**
 * Data objects
 */

#define REGRESSION_RELEASE(x) \
    "https://github.com/facebook/zstd/releases/download/regression-data/" x

data_t silesia = {
    .name = "silesia",
    .type = data_type_dir,
    .data =
        {
            .url = REGRESSION_RELEASE("silesia.tar.zst"),
            .xxhash64 = 0x48a199f92f93e977LL,
        },
};

data_t silesia_tar = {
    .name = "silesia.tar",
    .type = data_type_file,
    .data =
        {
            .url = REGRESSION_RELEASE("silesia.tar.zst"),
            .xxhash64 = 0x48a199f92f93e977LL,
        },
};

data_t github = {
    .name = "github",
    .type = data_type_dir,
    .data =
        {
            .url = REGRESSION_RELEASE("github.tar.zst"),
            .xxhash64 = 0xa9b1b44b020df292LL,
        },
    .dict =
        {
            .url = REGRESSION_RELEASE("github.dict.zst"),
            .xxhash64 = 0x1eddc6f737d3cb53LL,
        },
};

data_t github_tar = {
    .name = "github.tar",
    .type = data_type_file,
    .data =
        {
            .url = REGRESSION_RELEASE("github.tar.zst"),
            .xxhash64 = 0xa9b1b44b020df292LL,
        },
    .dict =
        {
            .url = REGRESSION_RELEASE("github.dict.zst"),
            .xxhash64 = 0x1eddc6f737d3cb53LL,
        },
};

static data_t* g_data[] = {
    &silesia,
    &silesia_tar,
    &github,
    &github_tar,
    NULL,
};

data_t const* const* data = (data_t const* const*)g_data;
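
/*
 * The `data` table above is NULL-terminated, so consumers can walk it without
 * a separate length. A minimal sketch of the iteration pattern (illustrative
 * only; the download helpers later in this file iterate the same way):
 *
 *     for (data_t const* const* d = data; *d != NULL; ++d)
 *         fprintf(stderr, "%s\n", (*d)->name);
 */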

/**
 * data helpers.
 */

int data_has_dict(data_t const* data) {
    return data->dict.url != NULL;
}

/**
 * data buffer helper functions (documented in header).
 */

data_buffer_t data_buffer_create(size_t const capacity) {
    data_buffer_t buffer = {};

    buffer.data = (uint8_t*)malloc(capacity);
    if (buffer.data == NULL)
        return buffer;
    buffer.capacity = capacity;
    return buffer;
}

data_buffer_t data_buffer_read(char const* filename) {
    data_buffer_t buffer = {};

    uint64_t const size = UTIL_getFileSize(filename);
    if (size == UTIL_FILESIZE_UNKNOWN) {
        fprintf(stderr, "unknown size for %s\n", filename);
        return buffer;
    }

    buffer.data = (uint8_t*)malloc(size);
    if (buffer.data == NULL) {
        fprintf(stderr, "malloc failed\n");
        return buffer;
    }
    buffer.capacity = size;

    FILE* file = fopen(filename, "rb");
    if (file == NULL) {
        fprintf(stderr, "failed to open %s\n", filename);
        goto err;
    }
    buffer.size = fread(buffer.data, 1, buffer.capacity, file);
    fclose(file);
    if (buffer.size != buffer.capacity) {
        fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity);
        goto err;
    }

    return buffer;
err:
    free(buffer.data);
    memset(&buffer, 0, sizeof(buffer));
    return buffer;
}

data_buffer_t data_buffer_get_data(data_t const* data) {
    data_buffer_t const kEmptyBuffer = {};

    if (data->type != data_type_file)
        return kEmptyBuffer;

    return data_buffer_read(data->data.path);
}

data_buffer_t data_buffer_get_dict(data_t const* data) {
    data_buffer_t const kEmptyBuffer = {};

    if (!data_has_dict(data))
        return kEmptyBuffer;

    return data_buffer_read(data->dict.path);
}

int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
    size_t const size =
        buffer1.size < buffer2.size ? buffer1.size : buffer2.size;
    int const cmp = memcmp(buffer1.data, buffer2.data, size);
    if (cmp != 0)
        return cmp;
    if (buffer1.size < buffer2.size)
        return -1;
    if (buffer1.size == buffer2.size)
        return 0;
    assert(buffer1.size > buffer2.size);
    return 1;
}

void data_buffer_free(data_buffer_t buffer) {
    free(buffer.data);
}
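
/*
 * Illustrative usage of the buffer helpers above (a sketch only; the file
 * names are hypothetical). data_buffer_read() returns a zeroed buffer on
 * failure, so checking `.data` is enough:
 *
 *     data_buffer_t a = data_buffer_read("one.bin");
 *     data_buffer_t b = data_buffer_read("two.bin");
 *     if (a.data != NULL && b.data != NULL && data_buffer_compare(a, b) == 0)
 *         fprintf(stderr, "buffers are identical\n");
 *     data_buffer_free(a);
 *     data_buffer_free(b);
 */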

/**
 * data filenames helpers.
 */

FileNamesTable* data_filenames_get(data_t const* data)
{
    char const* const path = data->data.path;
    return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ );
}

/**
 * data buffers helpers.
 */

data_buffers_t data_buffers_get(data_t const* data) {
    data_buffers_t buffers = {.size = 0};
    FileNamesTable* const filenames = data_filenames_get(data);
    if (filenames == NULL) return buffers;
    if (filenames->tableSize == 0) {
        UTIL_freeFileNamesTable(filenames);
        return buffers;
    }

    data_buffer_t* buffersPtr =
        (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr));
    if (buffersPtr == NULL) {
        UTIL_freeFileNamesTable(filenames);
        return buffers;
    }
    buffers.buffers = (data_buffer_t const*)buffersPtr;
    buffers.size = filenames->tableSize;

    for (size_t i = 0; i < filenames->tableSize; ++i) {
        buffersPtr[i] = data_buffer_read(filenames->fileNames[i]);
        if (buffersPtr[i].data == NULL) {
            data_buffers_t const kEmptyBuffer = {};
            data_buffers_free(buffers);
            UTIL_freeFileNamesTable(filenames);
            return kEmptyBuffer;
        }
    }

    UTIL_freeFileNamesTable(filenames);
    return buffers;
}

/**
 * Frees the data buffers.
 */
void data_buffers_free(data_buffers_t buffers) {
    free((data_buffer_t*)buffers.buffers);
}

/**
 * Initialization and download functions.
 */

static char* g_data_dir = NULL;

/* mkdir -p */
static int ensure_directory_exists(char const* indir) {
    char* const dir = strdup(indir);
    char* end = dir;
    int ret = 0;
    if (dir == NULL) {
        ret = EINVAL;
        goto out;
    }
    do {
        /* Find the next directory level. */
        for (++end; *end != '\0' && *end != '/'; ++end)
            ;
        /* End the string there, make the directory, and restore the string. */
        char const save = *end;
        *end = '\0';
        int const isdir = UTIL_isDirectory(dir);
        ret = mkdir(dir, S_IRWXU);
        *end = save;
        /* It's okay if the directory already exists. */
        if (ret == 0 || (errno == EEXIST && isdir))
            continue;
        ret = errno;
        fprintf(stderr, "mkdir() failed\n");
        goto out;
    } while (*end != '\0');

    ret = 0;
out:
    free(dir);
    return ret;
}

/** Concatenate 3 strings into a new buffer. */
static char* cat3(char const* str1, char const* str2, char const* str3) {
    size_t const size1 = strlen(str1);
    size_t const size2 = strlen(str2);
    size_t const size3 = str3 == NULL ? 0 : strlen(str3);
    size_t const size = size1 + size2 + size3 + 1;
    char* const dst = (char*)malloc(size);
    if (dst == NULL)
        return NULL;
    strcpy(dst, str1);
    strcpy(dst + size1, str2);
    if (str3 != NULL)
        strcpy(dst + size1 + size2, str3);
    assert(strlen(dst) == size1 + size2 + size3);
    return dst;
}

static char* cat2(char const* str1, char const* str2) {
    return cat3(str1, str2, NULL);
}

/**
 * State needed by the curl callback.
 * It takes data from curl, hashes it, and writes it to the file.
 */
typedef struct {
    FILE* file;
    XXH64_state_t xxhash64;
    int error;
} curl_data_t;
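
/*
 * Download pipeline implemented below: curl streams the compressed resource
 * into curl_write(), which hashes the raw bytes and forwards them to a
 * popen()'d command. For a single file that command decompresses to the
 * destination path (`zstd -dqfo '<path>'`); for a directory it decompresses
 * and untars into the cache directory (`zstd -dc | tar -x -C '<dir>'`), so
 * the archive never touches disk in compressed form.
 */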

/** Create the curl state. */
static curl_data_t curl_data_create(
        data_resource_t const* resource,
        data_type_t type) {
    curl_data_t cdata = {};

    XXH64_reset(&cdata.xxhash64, 0);

    assert(UTIL_isDirectory(g_data_dir));

    if (type == data_type_file) {
        /* Decompress the resource and store to the path. */
        char* cmd = cat3("zstd -dqfo '", resource->path, "'");
        if (cmd == NULL) {
            cdata.error = ENOMEM;
            return cdata;
        }
        cdata.file = popen(cmd, "w");
        free(cmd);
    } else {
        /* Decompress and extract the resource to the cache directory. */
        char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
        if (cmd == NULL) {
            cdata.error = ENOMEM;
            return cdata;
        }
        cdata.file = popen(cmd, "w");
        free(cmd);
    }
    if (cdata.file == NULL) {
        cdata.error = errno;
    }

    return cdata;
}

/** Free the curl state. */
static int curl_data_free(curl_data_t cdata) {
    return pclose(cdata.file);
}

/** curl callback. Updates the hash, and writes to the file. */
static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
    curl_data_t* cdata = (curl_data_t*)ptr;
    size_t const written = fwrite(data, size, count, cdata->file);
    XXH64_update(&cdata->xxhash64, data, written * size);
    return written;
}

static int curl_download_resource(
        CURL* curl,
        data_resource_t const* resource,
        data_type_t type) {
    curl_data_t cdata;
    /* Download the data. */
    if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0)
        return EINVAL;
    if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0)
        return EINVAL;
    cdata = curl_data_create(resource, type);
    if (cdata.error != 0)
        return cdata.error;
    int const curl_err = curl_easy_perform(curl);
    int const close_err = curl_data_free(cdata);
    if (curl_err) {
        fprintf(
            stderr,
            "downloading '%s' for '%s' failed\n",
            resource->url,
            resource->path);
        return EIO;
    }
    if (close_err) {
        fprintf(stderr, "writing data to '%s' failed\n", resource->path);
        return EIO;
    }
    /* check that the file exists. */
    if (type == data_type_file && !UTIL_isRegularFile(resource->path)) {
        fprintf(stderr, "output file '%s' does not exist\n", resource->path);
        return EIO;
    }
    if (type == data_type_dir && !UTIL_isDirectory(resource->path)) {
        fprintf(
            stderr, "output directory '%s' does not exist\n", resource->path);
        return EIO;
    }
    /* Check that the hash matches. */
    if (XXH64_digest(&cdata.xxhash64) != resource->xxhash64) {
        fprintf(
            stderr,
            "checksum does not match: 0x%llxLL != 0x%llxLL\n",
            (unsigned long long)XXH64_digest(&cdata.xxhash64),
            (unsigned long long)resource->xxhash64);
        return EINVAL;
    }

    return 0;
}

/** Download a single data object. */
static int curl_download_datum(CURL* curl, data_t const* data) {
    int ret;
    ret = curl_download_resource(curl, &data->data, data->type);
    if (ret != 0)
        return ret;
    if (data_has_dict(data)) {
        ret = curl_download_resource(curl, &data->dict, data_type_file);
        if (ret != 0)
            return ret;
    }
    return ret;
}

/** Download all the data. */
static int curl_download_data(data_t const* const* data) {
    if (curl_global_init(CURL_GLOBAL_ALL) != 0)
        return EFAULT;

    CURL* curl = curl_easy_init();
    int err = EFAULT;

    if (curl == NULL)
        goto out;

    if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
        goto out;
    if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
        goto out;
    if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
        goto out;

    assert(data != NULL);
    for (; *data != NULL; ++data) {
        if (curl_download_datum(curl, *data) != 0)
            goto out;
    }

    err = 0;
out:
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return err;
}

/** Fill the path member variable of the data objects. */
static int data_create_paths(data_t* const* data, char const* dir) {
    assert(data != NULL);
    for (; *data != NULL; ++data) {
        data_t* const datum = *data;
        datum->data.path = cat3(dir, "/", datum->name);
        if (datum->data.path == NULL)
            return ENOMEM;
        if (data_has_dict(datum)) {
            datum->dict.path = cat2(datum->data.path, ".dict");
            if (datum->dict.path == NULL)
                return ENOMEM;
        }
    }
    return 0;
}

/** Free the path member variable of the data objects. */
static void data_free_paths(data_t* const* data) {
    assert(data != NULL);
    for (; *data != NULL; ++data) {
        data_t* datum = *data;
        free((void*)datum->data.path);
        free((void*)datum->dict.path);
        datum->data.path = NULL;
        datum->dict.path = NULL;
    }
}

static char const kStampName[] = "STAMP";

static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
    if (!MEM_isLittleEndian())
        data = MEM_swap64(data);
    XXH64_update(state, &data, sizeof(data));
}

/** Hash the data to create the stamp. */
static uint64_t stamp_hash(data_t const* const* data) {
    XXH64_state_t state;

    XXH64_reset(&state, 0);
    assert(data != NULL);
    for (; *data != NULL; ++data) {
        data_t const* datum = *data;
        /* We don't care about the URL that we fetch from. */
        /* The path is derived from the name. */
        XXH64_update(&state, datum->name, strlen(datum->name));
        xxh_update_le(&state, datum->data.xxhash64);
        xxh_update_le(&state, datum->dict.xxhash64);
        xxh_update_le(&state, datum->type);
    }
    return XXH64_digest(&state);
}
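
/*
 * The STAMP file stores this hash in XXH64 canonical (big-endian) form.
 * stamp_check() compares it against the hash of the current data table and
 * stamp_write() refreshes it after a successful download, so the cache is
 * rebuilt whenever a data object's name, checksum, or type changes.
 */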

/** Check if the stamp matches the stamp in the cache directory. */
static int stamp_check(char const* dir, data_t const* const* data) {
    char* stamp = cat3(dir, "/", kStampName);
    uint64_t const expected = stamp_hash(data);
    XXH64_canonical_t actual;
    FILE* stampfile = NULL;
    int matches = 0;

    if (stamp == NULL)
        goto out;
    if (!UTIL_isRegularFile(stamp)) {
        fprintf(stderr, "stamp does not exist: recreating the data cache\n");
        goto out;
    }

    stampfile = fopen(stamp, "rb");
    if (stampfile == NULL) {
        fprintf(stderr, "could not open stamp: recreating the data cache\n");
        goto out;
    }

    if (fread(&actual, sizeof(actual), 1, stampfile) != 1) {
        fprintf(stderr, "invalid stamp: recreating the data cache\n");
        goto out;
    }

    matches = (expected == XXH64_hashFromCanonical(&actual));
    if (matches)
        fprintf(stderr, "stamp matches: reusing the cached data\n");
    else
        fprintf(stderr, "stamp does not match: recreating the data cache\n");

out:
    free(stamp);
    if (stampfile != NULL)
        fclose(stampfile);
    return matches;
}

/** On success write a new stamp, on failure delete the old stamp. */
static int
stamp_write(char const* dir, data_t const* const* data, int const data_err) {
    char* stamp = cat3(dir, "/", kStampName);
    FILE* stampfile = NULL;
    int err = EIO;

    if (stamp == NULL)
        return ENOMEM;

    if (data_err != 0) {
        err = data_err;
        goto out;
    }
    XXH64_canonical_t hash;

    XXH64_canonicalFromHash(&hash, stamp_hash(data));

    stampfile = fopen(stamp, "wb");
    if (stampfile == NULL)
        goto out;
    if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1)
        goto out;
    err = 0;
    fprintf(stderr, "stamped new data cache\n");
out:
    if (err != 0)
        /* Ignore errors. */
        unlink(stamp);
    free(stamp);
    if (stampfile != NULL)
        fclose(stampfile);
    return err;
}
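
/*
 * Typical lifecycle, as a sketch (see data.h for the authoritative contract;
 * the cache path here is hypothetical):
 *
 *     if (data_init("build/regression-cache") != 0)
 *         return 1;
 *     // ... use the data_t objects through `data` ...
 *     data_finish();
 */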

int data_init(char const* dir) {
    int err;

    if (dir == NULL)
        return EINVAL;

    /* This must be first to simplify logic. */
    err = ensure_directory_exists(dir);
    if (err != 0)
        return err;

    /* Save the cache directory. */
    g_data_dir = strdup(dir);
    if (g_data_dir == NULL)
        return ENOMEM;

    err = data_create_paths(g_data, dir);
    if (err != 0)
        return err;

    /* If the stamp matches then we are good to go.
     * This must be called before any modifications to the data cache.
     * After this point, we MUST call stamp_write() to update the STAMP,
     * since we've updated the data cache.
     */
    if (stamp_check(dir, data))
        return 0;

    err = curl_download_data(data);
    if (err != 0)
        goto out;

out:
    /* This must be last, since it must know if data_init() succeeded. */
    stamp_write(dir, data, err);
    return err;
}

void data_finish(void) {
    data_free_paths(g_data);
    free(g_data_dir);
    g_data_dir = NULL;
}