/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */

#include "data.h"

/*
 * NOTE(review): the system header names below were reconstructed from the
 * APIs this file uses (assert, errno, FILE/popen, str and mem functions,
 * malloc/free, mkdir, curl_easy_*) -- confirm against the upstream file.
 */
#include <assert.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h> /* free() */

#include <sys/stat.h>

#include <curl/curl.h>

#include "mem.h"
#include "util.h"
#define XXH_STATIC_LINKING_ONLY
#include "xxhash.h"

/**
 * Data objects
 */

/* Builds the download URL of a file attached to the regression-data release. */
#define REGRESSION_RELEASE(x) \
    "https://github.com/facebook/zstd/releases/download/regression-data/" x

/* Directory-type data object; shares its archive and checksum with
 * silesia_tar. */
data_t silesia = {
    .name = "silesia",
    .type = data_type_dir,
    .data =
        {
            .url      = REGRESSION_RELEASE("silesia.tar.zst"),
            .xxhash64 = 0x48a199f92f93e977LL,
        },
};

/* File-type data object built from the same archive as silesia. */
data_t silesia_tar = {
    .name = "silesia.tar",
    .type = data_type_file,
    .data =
        {
            .url      = REGRESSION_RELEASE("silesia.tar.zst"),
            .xxhash64 = 0x48a199f92f93e977LL,
        },
};

/* Directory-type data object that also carries a dictionary resource. */
data_t github = {
    .name = "github",
    .type = data_type_dir,
    .data =
        {
            .url      = REGRESSION_RELEASE("github.tar.zst"),
            .xxhash64 = 0xa9b1b44b020df292LL,
        },
    .dict =
        {
            .url      = REGRESSION_RELEASE("github.dict.zst"),
            .xxhash64 = 0x1eddc6f737d3cb53LL,
        },
};

/* File-type data object built from the same archive and dictionary as
 * github. */
data_t github_tar = {
    .name = "github.tar",
    .type = data_type_file,
    .data =
        {
            .url      = REGRESSION_RELEASE("github.tar.zst"),
            .xxhash64 = 0xa9b1b44b020df292LL,
        },
    .dict =
        {
            .url      = REGRESSION_RELEASE("github.dict.zst"),
            .xxhash64 = 0x1eddc6f737d3cb53LL,
        },
};

/* NULL-terminated, mutable list of every data object (the path members are
 * filled in later by this file). */
static data_t* g_data[] = {
    &silesia,
    &silesia_tar,
    &github,
    &github_tar,
    NULL,
};

/* Read-only view of g_data exported to the rest of the program. */
data_t const* const* data = (data_t const* const*)g_data;

/**
 * data helpers.
 */

/* Returns non-zero when the data object has a dictionary URL configured. */
int data_has_dict(data_t const* data)
{
    return data->dict.url != NULL;
}

/**
 * data buffer helper functions (documented in header).
*/ data_buffer_t data_buffer_create(size_t const capacity) { data_buffer_t buffer = {}; buffer.data = (uint8_t*)malloc(capacity); if (buffer.data == NULL) return buffer; buffer.capacity = capacity; return buffer; } data_buffer_t data_buffer_read(char const* filename) { data_buffer_t buffer = {}; uint64_t const size = UTIL_getFileSize(filename); if (size == UTIL_FILESIZE_UNKNOWN) { fprintf(stderr, "unknown size for %s\n", filename); return buffer; } buffer.data = (uint8_t*)malloc(size); if (buffer.data == NULL) { fprintf(stderr, "malloc failed\n"); return buffer; } buffer.capacity = size; FILE* file = fopen(filename, "rb"); if (file == NULL) { fprintf(stderr, "file null\n"); goto err; } buffer.size = fread(buffer.data, 1, buffer.capacity, file); fclose(file); if (buffer.size != buffer.capacity) { fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity); goto err; } return buffer; err: free(buffer.data); memset(&buffer, 0, sizeof(buffer)); return buffer; } data_buffer_t data_buffer_get_data(data_t const* data) { data_buffer_t const kEmptyBuffer = {}; if (data->type != data_type_file) return kEmptyBuffer; return data_buffer_read(data->data.path); } data_buffer_t data_buffer_get_dict(data_t const* data) { data_buffer_t const kEmptyBuffer = {}; if (!data_has_dict(data)) return kEmptyBuffer; return data_buffer_read(data->dict.path); } int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) { size_t const size = buffer1.size < buffer2.size ? buffer1.size : buffer2.size; int const cmp = memcmp(buffer1.data, buffer2.data, size); if (cmp != 0) return cmp; if (buffer1.size < buffer2.size) return -1; if (buffer1.size == buffer2.size) return 0; assert(buffer1.size > buffer2.size); return 1; } void data_buffer_free(data_buffer_t buffer) { free(buffer.data); } /** * data filenames helpers. 
*/ FileNamesTable* data_filenames_get(data_t const* data) { char const* const path = data->data.path; return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ ); } /** * data buffers helpers. */ data_buffers_t data_buffers_get(data_t const* data) { data_buffers_t buffers = {.size = 0}; FileNamesTable* const filenames = data_filenames_get(data); if (filenames == NULL) return buffers; if (filenames->tableSize == 0) { UTIL_freeFileNamesTable(filenames); return buffers; } data_buffer_t* buffersPtr = (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr)); if (buffersPtr == NULL) { UTIL_freeFileNamesTable(filenames); return buffers; } buffers.buffers = (data_buffer_t const*)buffersPtr; buffers.size = filenames->tableSize; for (size_t i = 0; i < filenames->tableSize; ++i) { buffersPtr[i] = data_buffer_read(filenames->fileNames[i]); if (buffersPtr[i].data == NULL) { data_buffers_t const kEmptyBuffer = {}; data_buffers_free(buffers); UTIL_freeFileNamesTable(filenames); return kEmptyBuffer; } } UTIL_freeFileNamesTable(filenames); return buffers; } /** * Frees the data buffers. */ void data_buffers_free(data_buffers_t buffers) { free((data_buffer_t*)buffers.buffers); } /** * Initialization and download functions. */ static char* g_data_dir = NULL; /* mkdir -p */ static int ensure_directory_exists(char const* indir) { char* const dir = strdup(indir); char* end = dir; int ret = 0; if (dir == NULL) { ret = EINVAL; goto out; } do { /* Find the next directory level. */ for (++end; *end != '\0' && *end != '/'; ++end) ; /* End the string there, make the directory, and restore the string. */ char const save = *end; *end = '\0'; int const isdir = UTIL_isDirectory(dir); ret = mkdir(dir, S_IRWXU); *end = save; /* Its okay if the directory already exists. 
*/ if (ret == 0 || (errno == EEXIST && isdir)) continue; ret = errno; fprintf(stderr, "mkdir() failed\n"); goto out; } while (*end != '\0'); ret = 0; out: free(dir); return ret; } /** Concatenate 3 strings into a new buffer. */ static char* cat3(char const* str1, char const* str2, char const* str3) { size_t const size1 = strlen(str1); size_t const size2 = strlen(str2); size_t const size3 = str3 == NULL ? 0 : strlen(str3); size_t const size = size1 + size2 + size3 + 1; char* const dst = (char*)malloc(size); if (dst == NULL) return NULL; strcpy(dst, str1); strcpy(dst + size1, str2); if (str3 != NULL) strcpy(dst + size1 + size2, str3); assert(strlen(dst) == size1 + size2 + size3); return dst; } static char* cat2(char const* str1, char const* str2) { return cat3(str1, str2, NULL); } /** * State needed by the curl callback. * It takes data from curl, hashes it, and writes it to the file. */ typedef struct { FILE* file; XXH64_state_t xxhash64; int error; } curl_data_t; /** Create the curl state. */ static curl_data_t curl_data_create( data_resource_t const* resource, data_type_t type) { curl_data_t cdata = {}; XXH64_reset(&cdata.xxhash64, 0); assert(UTIL_isDirectory(g_data_dir)); if (type == data_type_file) { /* Decompress the resource and store to the path. */ char* cmd = cat3("zstd -dqfo '", resource->path, "'"); if (cmd == NULL) { cdata.error = ENOMEM; return cdata; } cdata.file = popen(cmd, "w"); free(cmd); } else { /* Decompress and extract the resource to the cache directory. */ char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'"); if (cmd == NULL) { cdata.error = ENOMEM; return cdata; } cdata.file = popen(cmd, "w"); free(cmd); } if (cdata.file == NULL) { cdata.error = errno; } return cdata; } /** Free the curl state. */ static int curl_data_free(curl_data_t cdata) { return pclose(cdata.file); } /** curl callback. Updates the hash, and writes to the file. 
*/ static size_t curl_write(void* data, size_t size, size_t count, void* ptr) { curl_data_t* cdata = (curl_data_t*)ptr; size_t const written = fwrite(data, size, count, cdata->file); XXH64_update(&cdata->xxhash64, data, written * size); return written; } static int curl_download_resource( CURL* curl, data_resource_t const* resource, data_type_t type) { curl_data_t cdata; /* Download the data. */ if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0) return EINVAL; if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0) return EINVAL; cdata = curl_data_create(resource, type); if (cdata.error != 0) return cdata.error; int const curl_err = curl_easy_perform(curl); int const close_err = curl_data_free(cdata); if (curl_err) { fprintf( stderr, "downloading '%s' for '%s' failed\n", resource->url, resource->path); return EIO; } if (close_err) { fprintf(stderr, "writing data to '%s' failed\n", resource->path); return EIO; } /* check that the file exists. */ if (type == data_type_file && !UTIL_isRegularFile(resource->path)) { fprintf(stderr, "output file '%s' does not exist\n", resource->path); return EIO; } if (type == data_type_dir && !UTIL_isDirectory(resource->path)) { fprintf( stderr, "output directory '%s' does not exist\n", resource->path); return EIO; } /* Check that the hash matches. */ if (XXH64_digest(&cdata.xxhash64) != resource->xxhash64) { fprintf( stderr, "checksum does not match: 0x%llxLL != 0x%llxLL\n", (unsigned long long)XXH64_digest(&cdata.xxhash64), (unsigned long long)resource->xxhash64); return EINVAL; } return 0; } /** Download a single data object. */ static int curl_download_datum(CURL* curl, data_t const* data) { int ret; ret = curl_download_resource(curl, &data->data, data->type); if (ret != 0) return ret; if (data_has_dict(data)) { ret = curl_download_resource(curl, &data->dict, data_type_file); if (ret != 0) return ret; } return ret; } /** Download all the data. 
*/ static int curl_download_data(data_t const* const* data) { if (curl_global_init(CURL_GLOBAL_ALL) != 0) return EFAULT; curl_data_t cdata = {}; CURL* curl = curl_easy_init(); int err = EFAULT; if (curl == NULL) return EFAULT; if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0) goto out; if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0) goto out; if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0) goto out; assert(data != NULL); for (; *data != NULL; ++data) { if (curl_download_datum(curl, *data) != 0) goto out; } err = 0; out: curl_easy_cleanup(curl); curl_global_cleanup(); return err; } /** Fill the path member variable of the data objects. */ static int data_create_paths(data_t* const* data, char const* dir) { size_t const dirlen = strlen(dir); assert(data != NULL); for (; *data != NULL; ++data) { data_t* const datum = *data; datum->data.path = cat3(dir, "/", datum->name); if (datum->data.path == NULL) return ENOMEM; if (data_has_dict(datum)) { datum->dict.path = cat2(datum->data.path, ".dict"); if (datum->dict.path == NULL) return ENOMEM; } } return 0; } /** Free the path member variable of the data objects. */ static void data_free_paths(data_t* const* data) { assert(data != NULL); for (; *data != NULL; ++data) { data_t* datum = *data; free((void*)datum->data.path); free((void*)datum->dict.path); datum->data.path = NULL; datum->dict.path = NULL; } } static char const kStampName[] = "STAMP"; static void xxh_update_le(XXH64_state_t* state, uint64_t data) { if (!MEM_isLittleEndian()) data = MEM_swap64(data); XXH64_update(state, &data, sizeof(data)); } /** Hash the data to create the stamp. */ static uint64_t stamp_hash(data_t const* const* data) { XXH64_state_t state; XXH64_reset(&state, 0); assert(data != NULL); for (; *data != NULL; ++data) { data_t const* datum = *data; /* We don't care about the URL that we fetch from. */ /* The path is derived from the name. 
*/ XXH64_update(&state, datum->name, strlen(datum->name)); xxh_update_le(&state, datum->data.xxhash64); xxh_update_le(&state, datum->dict.xxhash64); xxh_update_le(&state, datum->type); } return XXH64_digest(&state); } /** Check if the stamp matches the stamp in the cache directory. */ static int stamp_check(char const* dir, data_t const* const* data) { char* stamp = cat3(dir, "/", kStampName); uint64_t const expected = stamp_hash(data); XXH64_canonical_t actual; FILE* stampfile = NULL; int matches = 0; if (stamp == NULL) goto out; if (!UTIL_isRegularFile(stamp)) { fprintf(stderr, "stamp does not exist: recreating the data cache\n"); goto out; } stampfile = fopen(stamp, "rb"); if (stampfile == NULL) { fprintf(stderr, "could not open stamp: recreating the data cache\n"); goto out; } size_t b; if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) { fprintf(stderr, "invalid stamp: recreating the data cache\n"); goto out; } matches = (expected == XXH64_hashFromCanonical(&actual)); if (matches) fprintf(stderr, "stamp matches: reusing the cached data\n"); else fprintf(stderr, "stamp does not match: recreating the data cache\n"); out: free(stamp); if (stampfile != NULL) fclose(stampfile); return matches; } /** On success write a new stamp, on failure delete the old stamp. */ static int stamp_write(char const* dir, data_t const* const* data, int const data_err) { char* stamp = cat3(dir, "/", kStampName); FILE* stampfile = NULL; int err = EIO; if (stamp == NULL) return ENOMEM; if (data_err != 0) { err = data_err; goto out; } XXH64_canonical_t hash; XXH64_canonicalFromHash(&hash, stamp_hash(data)); stampfile = fopen(stamp, "wb"); if (stampfile == NULL) goto out; if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1) goto out; err = 0; fprintf(stderr, "stamped new data cache\n"); out: if (err != 0) /* Ignore errors. 
*/ unlink(stamp); free(stamp); if (stampfile != NULL) fclose(stampfile); return err; } int data_init(char const* dir) { int err; if (dir == NULL) return EINVAL; /* This must be first to simplify logic. */ err = ensure_directory_exists(dir); if (err != 0) return err; /* Save the cache directory. */ g_data_dir = strdup(dir); if (g_data_dir == NULL) return ENOMEM; err = data_create_paths(g_data, dir); if (err != 0) return err; /* If the stamp matches then we are good to go. * This must be called before any modifications to the data cache. * After this point, we MUST call stamp_write() to update the STAMP, * since we've updated the data cache. */ if (stamp_check(dir, data)) return 0; err = curl_download_data(data); if (err != 0) goto out; out: /* This must be last, since it must know if data_init() succeeded. */ stamp_write(dir, data, err); return err; } void data_finish(void) { data_free_paths(g_data); free(g_data_dir); g_data_dir = NULL; }