diff --git a/cdssync/AGENTS.md b/cdssync/AGENTS.md index 374fc63..fd35388 100644 --- a/cdssync/AGENTS.md +++ b/cdssync/AGENTS.md @@ -22,6 +22,12 @@ For migration test datasets in this workspace, follow this process by default: - The generator script also accepts `--update-only`: - use it to update an existing dataset in place without recreating files, links, or directories - combine it with `UPDATE_INTERVAL_SECONDS` to keep mutating an existing dataset on a fixed interval +- The generator script can also create additional bulk test data under `bulk/`: + - `--folder-count N` controls how many bulk folders are created + - `--files-per-folder N` controls how many bulk files are created in each folder + - `--min-file-size-mib N` and `--max-file-size-mib N` control the random size range for bulk files + - `--max-dataset-size-mib N` caps the total size of generated bulk files only + - once bulk files exist, update mode rewrites them too as part of the mutable-content set - If ACL/xattr coverage matters, ensure the generation host has: - `acl` installed for `setfacl` and `getfacl` - `attr` installed for `setfattr` and `getfattr` diff --git a/cdssync/generate_migration_test_dataset.sh b/cdssync/generate_migration_test_dataset.sh index eb992b3..a0ed361 100755 --- a/cdssync/generate_migration_test_dataset.sh +++ b/cdssync/generate_migration_test_dataset.sh @@ -5,11 +5,19 @@ set -euo pipefail usage() { cat <<'EOF' Usage: - generate_migration_test_dataset.sh [--update-only] TARGET_DIR [UPDATE_INTERVAL_SECONDS] + generate_migration_test_dataset.sh [OPTIONS] TARGET_DIR [UPDATE_INTERVAL_SECONDS] Creates a compact filesystem migration test dataset under TARGET_DIR. The dataset matches the manifest in migration-test-manifest.md. +Options: + --update-only Update an existing dataset in place. + --folder-count N Generate N bulk-data folders under bulk/. + --files-per-folder N Generate N bulk files in each bulk folder. + --min-file-size-mib N Minimum bulk file size in MiB. Default: 1. + --max-file-size-mib N Maximum bulk file size in MiB. Default: 5. + --max-dataset-size-mib N Maximum total size for generated bulk files. + Notes: - Existing TARGET_DIR contents are left in place unless they collide. - ACL and xattr cases are created only if the local tools are available. @@ -24,10 +32,52 @@ EOF } UPDATE_ONLY=0 -if [[ ${1:-} == "--update-only" ]]; then - UPDATE_ONLY=1 - shift -fi +FOLDER_COUNT=0 +FILES_PER_FOLDER=0 +MIN_FILE_SIZE_MIB=1 +MAX_FILE_SIZE_MIB=5 +MAX_DATASET_SIZE_MIB= + +while [[ $# -gt 0 ]]; do + case "$1" in + --update-only) + UPDATE_ONLY=1 + shift + ;; + --folder-count) + FOLDER_COUNT=${2:-} + shift 2 + ;; + --files-per-folder) + FILES_PER_FOLDER=${2:-} + shift 2 + ;; + --min-file-size-mib) + MIN_FILE_SIZE_MIB=${2:-} + shift 2 + ;; + --max-file-size-mib) + MAX_FILE_SIZE_MIB=${2:-} + shift 2 + ;; + --max-dataset-size-mib) + MAX_DATASET_SIZE_MIB=${2:-} + shift 2 + ;; + --help|-h) + usage + exit 0 + ;; + --*) + echo "Unknown option: $1" >&2 + usage + exit 1 + ;; + *) + break + ;; + esac +done if [[ $# -lt 1 || $# -gt 2 ]]; then usage @@ -43,6 +93,23 @@ if [[ -n "$UPDATE_INTERVAL" && ! "$UPDATE_INTERVAL" =~ ^[0-9]+$ ]]; then exit 1 fi +for value_name in FOLDER_COUNT FILES_PER_FOLDER MIN_FILE_SIZE_MIB MAX_FILE_SIZE_MIB; do + if ! [[ ${!value_name} =~ ^[0-9]+$ ]]; then + echo "$value_name must be a non-negative integer" >&2 + exit 1 + fi +done + +if [[ -n "$MAX_DATASET_SIZE_MIB" && ! "$MAX_DATASET_SIZE_MIB" =~ ^[0-9]+$ ]]; then + echo "MAX_DATASET_SIZE_MIB must be a non-negative integer" >&2 + exit 1 +fi + +if (( MIN_FILE_SIZE_MIB > MAX_FILE_SIZE_MIB )); then + echo "MIN_FILE_SIZE_MIB cannot be greater than MAX_FILE_SIZE_MIB" >&2 + exit 1 +fi + mkdir -p "$ROOT" have_setfacl=0 @@ -60,6 +127,16 @@ create_dir() { mkdir -p "$ROOT/$1" } +random_int_between() { + local min=$1 + local max=$2 + if (( min == max )); then + echo "$min" + return + fi + shuf -i "${min}-${max}" -n 1 +} + log() { printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" } @@ -178,6 +255,14 @@ rewrite_file_with_random_data() { chmod "$mode" "$path" } +append_bulk_mutable_files() { + if [[ -d "$ROOT/bulk" ]]; then + while IFS= read -r rel; do + BULK_MUTABLE_FILES+=("$rel") + done < <(cd "$ROOT" && find bulk -type f | sort) + fi +} + update_mutable_files_pass() { local rel local mutable_files=( @@ -211,11 +296,18 @@ update_mutable_files_pass() { "metadata/xattr_random_3mb_600.bin" "metadata/acl_text_1mb_644.txt" ) + local BULK_MUTABLE_FILES=() + + append_bulk_mutable_files for rel in "${mutable_files[@]}"; do rewrite_file_with_random_data "$rel" done + for rel in "${BULK_MUTABLE_FILES[@]}"; do + rewrite_file_with_random_data "$rel" + done + set_acl_and_xattr_metadata } @@ -355,6 +447,57 @@ create_metadata_cases() { set_acl_and_xattr_metadata } +create_bulk_files() { + local folder_index + local file_index + local remaining_mib + local size_mib + local created_files=0 + local consumed_mib=0 + + if (( FOLDER_COUNT == 0 || FILES_PER_FOLDER == 0 )); then + return + fi + + if [[ -n "$MAX_DATASET_SIZE_MIB" && "$MAX_DATASET_SIZE_MIB" == "0" ]]; then + log "Skipping bulk-data generation because max dataset size is 0 MiB" + return + fi + + create_dir "bulk" + + for (( folder_index=1; folder_index<=FOLDER_COUNT; folder_index++ )); do + create_dir "bulk/folder-$(printf '%03d' "$folder_index")" + + for (( file_index=1; file_index<=FILES_PER_FOLDER; file_index++ )); do + if [[ -n "$MAX_DATASET_SIZE_MIB" ]]; then + remaining_mib=$((MAX_DATASET_SIZE_MIB - consumed_mib)) + if (( remaining_mib < MIN_FILE_SIZE_MIB )); then + log "Reached bulk dataset size cap after creating $created_files files (${consumed_mib} MiB)" + return + fi + if (( remaining_mib < MAX_FILE_SIZE_MIB )); then + size_mib=$(random_int_between "$MIN_FILE_SIZE_MIB" "$remaining_mib") + else + size_mib=$(random_int_between "$MIN_FILE_SIZE_MIB" "$MAX_FILE_SIZE_MIB") + fi + else + size_mib=$(random_int_between "$MIN_FILE_SIZE_MIB" "$MAX_FILE_SIZE_MIB") + fi + + make_file \ + "bulk/folder-$(printf '%03d' "$folder_index")/bulk_random_$(printf '%03d' "$file_index")_${size_mib}mib.bin" \ + random \ + "$size_mib" \ + 0644 + consumed_mib=$((consumed_mib + size_mib)) + created_files=$((created_files + 1)) + done + done + + log "Created $created_files bulk files across $FOLDER_COUNT folders (${consumed_mib} MiB total)" +} + write_summary() { cat >"$ROOT/GENERATION_SUMMARY.txt" <