Add bulk dataset generation options to test data script

Add bulk data generation controls for folder count, files per folder,
file size range, and bulk dataset size limits.

Also update the cdssync docs to describe the new options and how
update mode applies to generated bulk files.
This commit is contained in:
2026-04-21 13:31:17 -04:00
parent 7c27535e2a
commit 548beaa3ec
3 changed files with 169 additions and 5 deletions

View File

@@ -5,11 +5,19 @@ set -euo pipefail
usage() {
cat <<'EOF'
Usage:
generate_migration_test_dataset.sh [--update-only] TARGET_DIR [UPDATE_INTERVAL_SECONDS]
generate_migration_test_dataset.sh [OPTIONS] TARGET_DIR [UPDATE_INTERVAL_SECONDS]
Creates a compact filesystem migration test dataset under TARGET_DIR.
The dataset matches the manifest in migration-test-manifest.md.
Options:
--update-only Update an existing dataset in place.
--folder-count N Generate N bulk-data folders under bulk/.
--files-per-folder N Generate N bulk files in each bulk folder.
--min-file-size-mib N Minimum bulk file size in MiB. Default: 1.
--max-file-size-mib N Maximum bulk file size in MiB. Default: 5.
--max-dataset-size-mib N Maximum total size for generated bulk files.
Notes:
- Existing TARGET_DIR contents are left in place unless they collide.
- ACL and xattr cases are created only if the local tools are available.
@@ -24,10 +32,52 @@ EOF
}
# Defaults for the command-line options; the parser below overwrites them.
UPDATE_ONLY=0
FOLDER_COUNT=0
FILES_PER_FOLDER=0
MIN_FILE_SIZE_MIB=1
MAX_FILE_SIZE_MIB=5
# Empty means no cap on the total size of the generated bulk files.
MAX_DATASET_SIZE_MIB=

#######################################
# Abort with a diagnostic when a value-taking option has no value after it.
# Without this check `shift 2` fails on a trailing option and `set -e`
# terminates the script without printing anything.
# Arguments: $1 - option name as typed by the user
#            $2 - number of arguments left, counting the option itself
# Outputs:   error message on stderr, then the usage text
# Returns:   0 when a value is present; exits 1 otherwise
#######################################
require_option_value() {
  if (( $2 < 2 )); then
    echo "Option $1 requires a value" >&2
    usage
    exit 1
  fi
}

# Consume leading options; the first non-option argument ends parsing and is
# left in "$@" for the positional-argument handling that follows.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --update-only)
      UPDATE_ONLY=1
      shift
      ;;
    --folder-count)
      require_option_value "$1" "$#"
      FOLDER_COUNT=$2
      shift 2
      ;;
    --files-per-folder)
      require_option_value "$1" "$#"
      FILES_PER_FOLDER=$2
      shift 2
      ;;
    --min-file-size-mib)
      require_option_value "$1" "$#"
      MIN_FILE_SIZE_MIB=$2
      shift 2
      ;;
    --max-file-size-mib)
      require_option_value "$1" "$#"
      MAX_FILE_SIZE_MIB=$2
      shift 2
      ;;
    --max-dataset-size-mib)
      require_option_value "$1" "$#"
      MAX_DATASET_SIZE_MIB=$2
      shift 2
      ;;
    --help|-h)
      usage
      exit 0
      ;;
    --*)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
    *)
      break
      ;;
  esac
done
if [[ $# -lt 1 || $# -gt 2 ]]; then
usage
@@ -43,6 +93,23 @@ if [[ -n "$UPDATE_INTERVAL" && ! "$UPDATE_INTERVAL" =~ ^[0-9]+$ ]]; then
exit 1
fi
# Validate the numeric bulk options, then rewrite each one as a canonical
# base-10 number. The regex accepts leading zeros, but bash arithmetic treats
# a leading 0 as octal: "010" would silently mean 8 and "08" would abort the
# script with an arithmetic error in the comparisons further down.
for value_name in FOLDER_COUNT FILES_PER_FOLDER MIN_FILE_SIZE_MIB MAX_FILE_SIZE_MIB; do
  if ! [[ ${!value_name} =~ ^[0-9]+$ ]]; then
    echo "$value_name must be a non-negative integer" >&2
    exit 1
  fi
  printf -v "$value_name" '%d' "$((10#${!value_name}))"
done

# The dataset cap is optional; an empty value means "no cap" and is left as is.
if [[ -n "$MAX_DATASET_SIZE_MIB" ]]; then
  if ! [[ "$MAX_DATASET_SIZE_MIB" =~ ^[0-9]+$ ]]; then
    echo "MAX_DATASET_SIZE_MIB must be a non-negative integer" >&2
    exit 1
  fi
  MAX_DATASET_SIZE_MIB=$((10#$MAX_DATASET_SIZE_MIB))
fi

if (( MIN_FILE_SIZE_MIB > MAX_FILE_SIZE_MIB )); then
  echo "MIN_FILE_SIZE_MIB cannot be greater than MAX_FILE_SIZE_MIB" >&2
  exit 1
fi
mkdir -p "$ROOT"
have_setfacl=0
@@ -60,6 +127,16 @@ create_dir() {
mkdir -p "$ROOT/$1"
}
#######################################
# Print a randomly chosen integer from the inclusive range [min, max].
# Arguments: $1 - lower bound
#            $2 - upper bound (must not be smaller than $1)
# Outputs:   the chosen integer on stdout
# Returns:   0 on success, 1 when the range is inverted
#######################################
random_int_between() {
  local min=$1
  local max=$2
  local span

  if (( min == max )); then
    echo "$min"
    return
  fi

  if (( min > max )); then
    echo "random_int_between: invalid range ${min}-${max}" >&2
    return 1
  fi

  if command -v shuf >/dev/null 2>&1; then
    shuf -i "${min}-${max}" -n 1
    return
  fi

  # shuf ships with GNU coreutils and is missing on some platforms. Fall back
  # to bash's RANDOM; two 15-bit draws are combined so ranges wider than 32768
  # still work. The slight modulo bias is acceptable for test-data sizes.
  span=$(( max - min + 1 ))
  echo $(( min + ( (RANDOM << 15) | RANDOM ) % span ))
}
# Print one timestamped line to stdout: "[YYYY-MM-DD HH:MM:SS] <message>".
# All arguments are joined into a single message via "$*".
log() {
  local message="$*"
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message"
}
@@ -178,6 +255,14 @@ rewrite_file_with_random_data() {
chmod "$mode" "$path"
}
#######################################
# Append every regular file under $ROOT/bulk to BULK_MUTABLE_FILES, as paths
# relative to $ROOT in sorted order. Does nothing when bulk/ does not exist.
# Globals:   ROOT (read), BULK_MUTABLE_FILES (appended to; the caller is
#            expected to have declared the array)
# Returns:   0
#######################################
append_bulk_mutable_files() {
  # Keep the loop variable local so it does not leak into the caller's scope.
  local rel

  [[ -d "$ROOT/bulk" ]] || return 0

  # NUL-delimited so file names containing newlines stay in one piece.
  while IFS= read -r -d '' rel; do
    BULK_MUTABLE_FILES+=("$rel")
  done < <(cd "$ROOT" && find bulk -type f -print0 | sort -z)
}
update_mutable_files_pass() {
local rel
local mutable_files=(
@@ -211,11 +296,18 @@ update_mutable_files_pass() {
"metadata/xattr_random_3mb_600.bin"
"metadata/acl_text_1mb_644.txt"
)
local BULK_MUTABLE_FILES=()
append_bulk_mutable_files
for rel in "${mutable_files[@]}"; do
rewrite_file_with_random_data "$rel"
done
for rel in "${BULK_MUTABLE_FILES[@]}"; do
rewrite_file_with_random_data "$rel"
done
set_acl_and_xattr_metadata
}
@@ -355,6 +447,57 @@ create_metadata_cases() {
set_acl_and_xattr_metadata
}
#######################################
# Generate the optional bulk dataset: FOLDER_COUNT folders under bulk/, each
# holding up to FILES_PER_FOLDER random-content files whose size is drawn
# between MIN_FILE_SIZE_MIB and MAX_FILE_SIZE_MIB. When MAX_DATASET_SIZE_MIB
# is set, generation stops once the remaining budget cannot fit a file of the
# minimum size, and the last file is shrunk to fit the budget if needed.
# Globals:   FOLDER_COUNT, FILES_PER_FOLDER, MIN_FILE_SIZE_MIB,
#            MAX_FILE_SIZE_MIB, MAX_DATASET_SIZE_MIB (all read)
# Outputs:   progress messages via log
#######################################
create_bulk_files() {
  local folder_no
  local file_no
  local folder_rel
  local file_rel
  local budget_left_mib
  local upper_mib
  local size_mib
  local file_total=0
  local mib_total=0

  # Bulk generation is opt-in: both counts must be non-zero.
  if (( FOLDER_COUNT == 0 )) || (( FILES_PER_FOLDER == 0 )); then
    return
  fi

  if [[ "$MAX_DATASET_SIZE_MIB" == "0" ]]; then
    log "Skipping bulk-data generation because max dataset size is 0 MiB"
    return
  fi

  create_dir "bulk"

  for (( folder_no = 1; folder_no <= FOLDER_COUNT; folder_no++ )); do
    # The folder path is the same for every file in it; format it once.
    printf -v folder_rel 'bulk/folder-%03d' "$folder_no"
    create_dir "$folder_rel"

    for (( file_no = 1; file_no <= FILES_PER_FOLDER; file_no++ )); do
      # Upper bound for this file: the configured maximum, lowered to the
      # remaining budget when a dataset cap is in effect.
      upper_mib=$MAX_FILE_SIZE_MIB
      if [[ -n "$MAX_DATASET_SIZE_MIB" ]]; then
        budget_left_mib=$(( MAX_DATASET_SIZE_MIB - mib_total ))
        if (( budget_left_mib < MIN_FILE_SIZE_MIB )); then
          log "Reached bulk dataset size cap after creating $file_total files (${mib_total} MiB)"
          return
        fi
        if (( budget_left_mib < MAX_FILE_SIZE_MIB )); then
          upper_mib=$budget_left_mib
        fi
      fi

      size_mib=$(random_int_between "$MIN_FILE_SIZE_MIB" "$upper_mib")
      printf -v file_rel '%s/bulk_random_%03d_%smib.bin' \
        "$folder_rel" "$file_no" "$size_mib"

      make_file "$file_rel" random "$size_mib" 0644

      mib_total=$(( mib_total + size_mib ))
      file_total=$(( file_total + 1 ))
    done
  done

  log "Created $file_total bulk files across $FOLDER_COUNT folders (${mib_total} MiB total)"
}
write_summary() {
cat >"$ROOT/GENERATION_SUMMARY.txt" <<EOF
Dataset root: $ROOT
@@ -368,6 +511,12 @@ Notes:
- Sparse files have logical size with low physical allocation.
- Hard links share inode data with their source file.
- Read-only files and directories may require elevated privileges to modify later.
- Bulk generation settings:
- folder count: $FOLDER_COUNT
- files per folder: $FILES_PER_FOLDER
- min file size mib: $MIN_FILE_SIZE_MIB
- max file size mib: $MAX_FILE_SIZE_MIB
- max bulk dataset size mib: ${MAX_DATASET_SIZE_MIB:-unbounded}
EOF
}
@@ -383,6 +532,7 @@ else
create_time_and_readonly_cases
create_links
create_metadata_cases
create_bulk_files
write_summary
echo "Created migration test dataset at: $ROOT"
fi