From 548beaa3ecea2fbb149a6f31b5c48e3206975a60 Mon Sep 17 00:00:00 2001
From: "anthony.wen"
Date: Tue, 21 Apr 2026 13:31:17 -0400
Subject: [PATCH] Add bulk dataset generation options to test data script

Add bulk data generation controls for folder count, files per folder,
file size range, and bulk dataset size limits.

Also update the cdssync docs to describe the new options and how update
mode applies to generated bulk files.
---
 cdssync/AGENTS.md                          |   6 +
 cdssync/generate_migration_test_dataset.sh | 198 ++++++++++++++++++++-
 cdssync/migration-test-manifest.md         |   8 ++
 3 files changed, 207 insertions(+), 5 deletions(-)

diff --git a/cdssync/AGENTS.md b/cdssync/AGENTS.md
index 374fc63..fd35388 100644
--- a/cdssync/AGENTS.md
+++ b/cdssync/AGENTS.md
@@ -22,6 +22,12 @@ For migration test datasets in this workspace, follow this process by default:
 - The generator script also accepts `--update-only`:
   - use it to update an existing dataset in place without recreating files, links, or directories
   - combine it with `UPDATE_INTERVAL_SECONDS` to keep mutating an existing dataset on a fixed interval
+- The generator script can also create additional bulk test data under `bulk/`:
+  - `--folder-count N` controls how many bulk folders are created
+  - `--files-per-folder N` controls how many bulk files are created in each folder
+  - `--min-file-size-mib N` and `--max-file-size-mib N` control the random size range for bulk files
+  - `--max-dataset-size-mib N` caps the total size of generated bulk files only
+  - once bulk files exist, update mode rewrites them too as part of the mutable-content set
 - If ACL/xattr coverage matters, ensure the generation host has:
   - `acl` installed for `setfacl` and `getfacl`
   - `attr` installed for `setfattr` and `getfattr`
diff --git a/cdssync/generate_migration_test_dataset.sh b/cdssync/generate_migration_test_dataset.sh
index eb992b3..a0ed361 100755
--- a/cdssync/generate_migration_test_dataset.sh
+++ b/cdssync/generate_migration_test_dataset.sh
@@ -5,11 +5,19 @@ set -euo pipefail
 usage() {
   cat <<'EOF'
 Usage:
-  generate_migration_test_dataset.sh [--update-only] TARGET_DIR [UPDATE_INTERVAL_SECONDS]
+  generate_migration_test_dataset.sh [OPTIONS] TARGET_DIR [UPDATE_INTERVAL_SECONDS]
 
 Creates a compact filesystem migration test dataset under TARGET_DIR.
 The dataset matches the manifest in migration-test-manifest.md.
 
+Options:
+  --update-only             Update an existing dataset in place.
+  --folder-count N          Generate N bulk-data folders under bulk/.
+  --files-per-folder N      Generate N bulk files in each bulk folder.
+  --min-file-size-mib N     Minimum bulk file size in MiB. Default: 1.
+  --max-file-size-mib N     Maximum bulk file size in MiB. Default: 5.
+  --max-dataset-size-mib N  Maximum total size for generated bulk files.
+
 Notes:
 - Existing TARGET_DIR contents are left in place unless they collide.
 - ACL and xattr cases are created only if the local tools are available.
@@ -24,10 +32,68 @@ EOF
 }
 
 UPDATE_ONLY=0
-if [[ ${1:-} == "--update-only" ]]; then
-  UPDATE_ONLY=1
-  shift
-fi
+FOLDER_COUNT=0
+FILES_PER_FOLDER=0
+MIN_FILE_SIZE_MIB=1
+MAX_FILE_SIZE_MIB=5
+MAX_DATASET_SIZE_MIB=
+
+# Abort with a usage message when a value-taking option is the last argument.
+# Without this check `shift 2` fails under `set -e` and the script exits
+# silently. Arguments: $1 - option name, $2 - number of remaining arguments.
+require_option_value() {
+  if (( $2 < 2 )); then
+    echo "Option $1 requires a value" >&2
+    usage
+    exit 1
+  fi
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --update-only)
+      UPDATE_ONLY=1
+      shift
+      ;;
+    --folder-count)
+      require_option_value "$1" "$#"
+      FOLDER_COUNT=$2
+      shift 2
+      ;;
+    --files-per-folder)
+      require_option_value "$1" "$#"
+      FILES_PER_FOLDER=$2
+      shift 2
+      ;;
+    --min-file-size-mib)
+      require_option_value "$1" "$#"
+      MIN_FILE_SIZE_MIB=$2
+      shift 2
+      ;;
+    --max-file-size-mib)
+      require_option_value "$1" "$#"
+      MAX_FILE_SIZE_MIB=$2
+      shift 2
+      ;;
+    --max-dataset-size-mib)
+      require_option_value "$1" "$#"
+      MAX_DATASET_SIZE_MIB=$2
+      shift 2
+      ;;
+    --help|-h)
+      usage
+      exit 0
+      ;;
+    --*)
+      echo "Unknown option: $1" >&2
+      usage
+      exit 1
+      ;;
+    *)
+      break
+      ;;
+  esac
+done
 
 if [[ $# -lt 1 || $# -gt 2 ]]; then
   usage
@@ -43,6 +109,28 @@ if [[ -n "$UPDATE_INTERVAL" && ! "$UPDATE_INTERVAL" =~ ^[0-9]+$ ]]; then
   exit 1
 fi
 
+for value_name in FOLDER_COUNT FILES_PER_FOLDER MIN_FILE_SIZE_MIB MAX_FILE_SIZE_MIB; do
+  if ! [[ ${!value_name} =~ ^[0-9]+$ ]]; then
+    echo "$value_name must be a non-negative integer" >&2
+    exit 1
+  fi
+  # Force base 10 so inputs with leading zeros (e.g. 08) are not parsed as octal.
+  printf -v "$value_name" '%d' "$((10#${!value_name}))"
+done
+
+if [[ -n "$MAX_DATASET_SIZE_MIB" ]]; then
+  if ! [[ "$MAX_DATASET_SIZE_MIB" =~ ^[0-9]+$ ]]; then
+    echo "MAX_DATASET_SIZE_MIB must be a non-negative integer" >&2
+    exit 1
+  fi
+  MAX_DATASET_SIZE_MIB=$((10#$MAX_DATASET_SIZE_MIB))
+fi
+
+if (( MIN_FILE_SIZE_MIB > MAX_FILE_SIZE_MIB )); then
+  echo "MIN_FILE_SIZE_MIB cannot be greater than MAX_FILE_SIZE_MIB" >&2
+  exit 1
+fi
+
 mkdir -p "$ROOT"
 
 have_setfacl=0
@@ -60,6 +148,17 @@ create_dir() {
   mkdir -p "$ROOT/$1"
 }
 
+# Print a pseudo-random integer N with min <= N <= max.
+# Arguments: $1 - lower bound, $2 - upper bound (callers guarantee min <= max).
+# Uses bash's $RANDOM so generation does not depend on `shuf` being installed;
+# two 15-bit draws are combined so spans wider than 32768 are still covered.
+random_int_between() {
+  local min=$1
+  local max=$2
+  local span=$(( max - min + 1 ))
+  echo $(( min + (RANDOM * 32768 + RANDOM) % span ))
+}
+
 log() {
   printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
 }
@@ -178,6 +277,17 @@ rewrite_file_with_random_data() {
   chmod "$mode" "$path"
 }
 
+# Append every regular file under $ROOT/bulk (as a ROOT-relative path, sorted)
+# to the caller's BULK_MUTABLE_FILES array. Does nothing when bulk/ is absent.
+append_bulk_mutable_files() {
+  local rel
+  if [[ -d "$ROOT/bulk" ]]; then
+    while IFS= read -r rel; do
+      BULK_MUTABLE_FILES+=("$rel")
+    done < <(cd "$ROOT" && find bulk -type f | sort)
+  fi
+}
+
 update_mutable_files_pass() {
   local rel
   local mutable_files=(
@@ -211,11 +321,22 @@ update_mutable_files_pass() {
     "metadata/xattr_random_3mb_600.bin"
     "metadata/acl_text_1mb_644.txt"
   )
+  local BULK_MUTABLE_FILES=()
+
+  append_bulk_mutable_files
 
   for rel in "${mutable_files[@]}"; do
     rewrite_file_with_random_data "$rel"
   done
 
+  # Guard the expansion: under `set -u`, bash older than 4.4 treats an empty
+  # array expanded with [@] as an unbound variable.
+  if (( ${#BULK_MUTABLE_FILES[@]} > 0 )); then
+    for rel in "${BULK_MUTABLE_FILES[@]}"; do
+      rewrite_file_with_random_data "$rel"
+    done
+  fi
+
   set_acl_and_xattr_metadata
 }
 
@@ -355,6 +476,66 @@ create_metadata_cases() {
   set_acl_and_xattr_metadata
 }
 
+# Create FOLDER_COUNT folders under bulk/, each holding up to FILES_PER_FOLDER
+# random-content files of MIN_FILE_SIZE_MIB..MAX_FILE_SIZE_MIB MiB. When
+# MAX_DATASET_SIZE_MIB is set, stop once the remaining budget cannot fit a
+# minimum-size file. No-op when either count is 0 or the cap is 0.
+create_bulk_files() {
+  local folder_index
+  local file_index
+  local folder_rel
+  local file_label
+  local remaining_mib
+  local size_mib
+  local upper_mib
+  local created_files=0
+  local consumed_mib=0
+
+  if (( FOLDER_COUNT == 0 || FILES_PER_FOLDER == 0 )); then
+    return
+  fi
+
+  if [[ -n "$MAX_DATASET_SIZE_MIB" && "$MAX_DATASET_SIZE_MIB" == "0" ]]; then
+    log "Skipping bulk-data generation because max dataset size is 0 MiB"
+    return
+  fi
+
+  create_dir "bulk"
+
+  for (( folder_index=1; folder_index<=FOLDER_COUNT; folder_index++ )); do
+    printf -v folder_rel 'bulk/folder-%03d' "$folder_index"
+    create_dir "$folder_rel"
+
+    for (( file_index=1; file_index<=FILES_PER_FOLDER; file_index++ )); do
+      # Upper bound for this file: the configured maximum, clamped to the
+      # remaining budget when a dataset size cap is in effect.
+      upper_mib=$MAX_FILE_SIZE_MIB
+      if [[ -n "$MAX_DATASET_SIZE_MIB" ]]; then
+        remaining_mib=$((MAX_DATASET_SIZE_MIB - consumed_mib))
+        if (( remaining_mib < MIN_FILE_SIZE_MIB )); then
+          log "Reached bulk dataset size cap after creating $created_files files (${consumed_mib} MiB)"
+          return
+        fi
+        if (( remaining_mib < upper_mib )); then
+          upper_mib=$remaining_mib
+        fi
+      fi
+      size_mib=$(random_int_between "$MIN_FILE_SIZE_MIB" "$upper_mib")
+
+      printf -v file_label '%03d' "$file_index"
+      make_file \
+        "${folder_rel}/bulk_random_${file_label}_${size_mib}mib.bin" \
+        random \
+        "$size_mib" \
+        0644
+      consumed_mib=$((consumed_mib + size_mib))
+      created_files=$((created_files + 1))
+    done
+  done
+
+  log "Created $created_files bulk files across $FOLDER_COUNT folders (${consumed_mib} MiB total)"
+}
+
 write_summary() {
   cat >"$ROOT/GENERATION_SUMMARY.txt" <