Add bulk dataset generation options to test data script
Add bulk data generation controls for folder count, files per folder, file size range, and bulk dataset size limits. Also update the cdssync docs to describe the new options and how update mode applies to generated bulk files.
This commit is contained in:
@@ -22,6 +22,12 @@ For migration test datasets in this workspace, follow this process by default:
|
|||||||
- The generator script also accepts `--update-only`:
|
- The generator script also accepts `--update-only`:
|
||||||
- use it to update an existing dataset in place without recreating files, links, or directories
|
- use it to update an existing dataset in place without recreating files, links, or directories
|
||||||
- combine it with `UPDATE_INTERVAL_SECONDS` to keep mutating an existing dataset on a fixed interval
|
- combine it with `UPDATE_INTERVAL_SECONDS` to keep mutating an existing dataset on a fixed interval
|
||||||
|
- The generator script can also create additional bulk test data under `bulk/`:
|
||||||
|
- `--folder-count N` controls how many bulk folders are created
|
||||||
|
- `--files-per-folder N` controls how many bulk files are created in each folder
|
||||||
|
- `--min-file-size-mib N` and `--max-file-size-mib N` control the random size range for bulk files
|
||||||
|
- `--max-dataset-size-mib N` caps the total size of generated bulk files only
|
||||||
|
- once bulk files exist, update mode rewrites them too as part of the mutable-content set
|
||||||
- If ACL/xattr coverage matters, ensure the generation host has:
|
- If ACL/xattr coverage matters, ensure the generation host has:
|
||||||
- `acl` installed for `setfacl` and `getfacl`
|
- `acl` installed for `setfacl` and `getfacl`
|
||||||
- `attr` installed for `setfattr` and `getfattr`
|
- `attr` installed for `setfattr` and `getfattr`
|
||||||
|
|||||||
@@ -5,11 +5,19 @@ set -euo pipefail
|
|||||||
usage() {
|
usage() {
|
||||||
cat <<'EOF'
|
cat <<'EOF'
|
||||||
Usage:
|
Usage:
|
||||||
generate_migration_test_dataset.sh [--update-only] TARGET_DIR [UPDATE_INTERVAL_SECONDS]
|
generate_migration_test_dataset.sh [OPTIONS] TARGET_DIR [UPDATE_INTERVAL_SECONDS]
|
||||||
|
|
||||||
Creates a compact filesystem migration test dataset under TARGET_DIR.
|
Creates a compact filesystem migration test dataset under TARGET_DIR.
|
||||||
The dataset matches the manifest in migration-test-manifest.md.
|
The dataset matches the manifest in migration-test-manifest.md.
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--update-only Update an existing dataset in place.
|
||||||
|
--folder-count N Generate N bulk-data folders under bulk/.
|
||||||
|
--files-per-folder N Generate N bulk files in each bulk folder.
|
||||||
|
--min-file-size-mib N Minimum bulk file size in MiB. Default: 1.
|
||||||
|
--max-file-size-mib N Maximum bulk file size in MiB. Default: 5.
|
||||||
|
--max-dataset-size-mib N Maximum total size for generated bulk files.
|
||||||
|
|
||||||
Notes:
|
Notes:
|
||||||
- Existing TARGET_DIR contents are left in place unless they collide.
|
- Existing TARGET_DIR contents are left in place unless they collide.
|
||||||
- ACL and xattr cases are created only if the local tools are available.
|
- ACL and xattr cases are created only if the local tools are available.
|
||||||
@@ -24,10 +32,52 @@ EOF
|
|||||||
}
|
}
|
||||||
|
|
||||||
UPDATE_ONLY=0
|
UPDATE_ONLY=0
|
||||||
if [[ ${1:-} == "--update-only" ]]; then
|
FOLDER_COUNT=0
|
||||||
|
FILES_PER_FOLDER=0
|
||||||
|
MIN_FILE_SIZE_MIB=1
|
||||||
|
MAX_FILE_SIZE_MIB=5
|
||||||
|
MAX_DATASET_SIZE_MIB=
|
||||||
|
|
||||||
|
# Parse leading options. Parsing stops at the first non-option argument (or at
# an explicit `--`), leaving TARGET_DIR and the optional UPDATE_INTERVAL_SECONDS
# in the positional parameters for the argument-count check that follows.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --update-only)
      UPDATE_ONLY=1
      shift
      ;;
    --folder-count | --files-per-folder | --min-file-size-mib | --max-file-size-mib | --max-dataset-size-mib)
      # Every option in this arm takes a value. Without this guard a trailing
      # option with no value makes `shift 2` fail, and `set -e` then terminates
      # the script without printing any explanation.
      if [[ $# -lt 2 ]]; then
        echo "Option $1 requires a value" >&2
        usage
        exit 1
      fi
      case "$1" in
        --folder-count) FOLDER_COUNT=$2 ;;
        --files-per-folder) FILES_PER_FOLDER=$2 ;;
        --min-file-size-mib) MIN_FILE_SIZE_MIB=$2 ;;
        --max-file-size-mib) MAX_FILE_SIZE_MIB=$2 ;;
        --max-dataset-size-mib) MAX_DATASET_SIZE_MIB=$2 ;;
      esac
      shift 2
      ;;
    --help | -h)
      usage
      exit 0
      ;;
    --)
      # Conventional end-of-options marker; must be matched before `--*`.
      shift
      break
      ;;
    --*)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
    *)
      break
      ;;
  esac
done
|
||||||
|
|
||||||
if [[ $# -lt 1 || $# -gt 2 ]]; then
|
if [[ $# -lt 1 || $# -gt 2 ]]; then
|
||||||
usage
|
usage
|
||||||
@@ -43,6 +93,23 @@ if [[ -n "$UPDATE_INTERVAL" && ! "$UPDATE_INTERVAL" =~ ^[0-9]+$ ]]; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Validate the numeric bulk-generation settings. Each value must consist of
# decimal digits only; once validated it is rewritten in canonical base-10 form
# so that zero-padded input such as "08" is not later interpreted as an
# (invalid) octal literal inside (( ... )) arithmetic.
for value_name in FOLDER_COUNT FILES_PER_FOLDER MIN_FILE_SIZE_MIB MAX_FILE_SIZE_MIB; do
  if ! [[ ${!value_name} =~ ^[0-9]+$ ]]; then
    echo "$value_name must be a non-negative integer" >&2
    exit 1
  fi
  printf -v "$value_name" '%d' "$((10#${!value_name}))"
done

# The dataset size cap is optional: an empty value means "no cap".
if [[ -n "$MAX_DATASET_SIZE_MIB" ]]; then
  if ! [[ "$MAX_DATASET_SIZE_MIB" =~ ^[0-9]+$ ]]; then
    echo "MAX_DATASET_SIZE_MIB must be a non-negative integer" >&2
    exit 1
  fi
  MAX_DATASET_SIZE_MIB=$((10#$MAX_DATASET_SIZE_MIB))
fi

if (( MIN_FILE_SIZE_MIB > MAX_FILE_SIZE_MIB )); then
  echo "MIN_FILE_SIZE_MIB cannot be greater than MAX_FILE_SIZE_MIB" >&2
  exit 1
fi
|
||||||
|
|
||||||
mkdir -p "$ROOT"
|
mkdir -p "$ROOT"
|
||||||
|
|
||||||
have_setfacl=0
|
have_setfacl=0
|
||||||
@@ -60,6 +127,16 @@ create_dir() {
|
|||||||
mkdir -p "$ROOT/$1"
|
mkdir -p "$ROOT/$1"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Print a random integer N with min <= N <= max.
# Arguments:
#   $1 - inclusive lower bound (non-negative integer)
#   $2 - inclusive upper bound (non-negative integer)
# Outputs:
#   The chosen integer on stdout; a diagnostic on stderr for an invalid range.
# Returns:
#   0 on success, 1 when min is greater than max.
#######################################
random_int_between() {
  local min=$1
  local max=$2
  local span

  if (( min > max )); then
    echo "random_int_between: min ($min) is greater than max ($max)" >&2
    return 1
  fi

  if (( min == max )); then
    echo "$min"
    return
  fi

  if command -v shuf >/dev/null 2>&1; then
    shuf -i "${min}-${max}" -n 1
    return
  fi

  # Fallback for hosts without shuf: two 15-bit $RANDOM draws are combined
  # into one 30-bit value, which is ample for MiB-sized ranges.
  span=$(( max - min + 1 ))
  echo $(( min + ( ((RANDOM << 15) | RANDOM) % span ) ))
}
|
||||||
|
|
||||||
log() {
|
log() {
|
||||||
printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
|
printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*"
|
||||||
}
|
}
|
||||||
@@ -178,6 +255,14 @@ rewrite_file_with_random_data() {
|
|||||||
chmod "$mode" "$path"
|
chmod "$mode" "$path"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Append every regular file found under "$ROOT/bulk" to BULK_MUTABLE_FILES.
# Paths are stored relative to ROOT (for example "bulk/folder-001/x.bin").
# Does nothing when "$ROOT/bulk" is not a directory.
# Globals:
#   ROOT               (read)    dataset root directory
#   BULK_MUTABLE_FILES (written) array extended in place; the caller is
#                                expected to have declared it beforehand
#######################################
append_bulk_mutable_files() {
  local rel

  if [[ ! -d "$ROOT/bulk" ]]; then
    return
  fi

  # NUL-delimited so that unusual file names (including ones containing
  # newlines) are kept intact; LC_ALL=C makes the order locale-independent.
  while IFS= read -r -d '' rel; do
    BULK_MUTABLE_FILES+=("$rel")
  done < <(cd "$ROOT" && find bulk -type f -print0 | LC_ALL=C sort -z)
}
|
||||||
|
|
||||||
update_mutable_files_pass() {
|
update_mutable_files_pass() {
|
||||||
local rel
|
local rel
|
||||||
local mutable_files=(
|
local mutable_files=(
|
||||||
@@ -211,11 +296,18 @@ update_mutable_files_pass() {
|
|||||||
"metadata/xattr_random_3mb_600.bin"
|
"metadata/xattr_random_3mb_600.bin"
|
||||||
"metadata/acl_text_1mb_644.txt"
|
"metadata/acl_text_1mb_644.txt"
|
||||||
)
|
)
|
||||||
|
local BULK_MUTABLE_FILES=()
|
||||||
|
|
||||||
|
append_bulk_mutable_files
|
||||||
|
|
||||||
for rel in "${mutable_files[@]}"; do
|
for rel in "${mutable_files[@]}"; do
|
||||||
rewrite_file_with_random_data "$rel"
|
rewrite_file_with_random_data "$rel"
|
||||||
done
|
done
|
||||||
|
|
||||||
|
for rel in "${BULK_MUTABLE_FILES[@]}"; do
|
||||||
|
rewrite_file_with_random_data "$rel"
|
||||||
|
done
|
||||||
|
|
||||||
set_acl_and_xattr_metadata
|
set_acl_and_xattr_metadata
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -355,6 +447,57 @@ create_metadata_cases() {
|
|||||||
set_acl_and_xattr_metadata
|
set_acl_and_xattr_metadata
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#######################################
# Create the optional bulk dataset under bulk/folder-NNN/.
# Each file gets a random size between the configured minimum and maximum
# (in MiB). When a dataset size cap is set, the upper bound of each draw is
# clamped to the remaining budget, and generation stops as soon as the
# remaining budget is smaller than the minimum file size.
# Globals (read):
#   FOLDER_COUNT, FILES_PER_FOLDER, MIN_FILE_SIZE_MIB, MAX_FILE_SIZE_MIB,
#   MAX_DATASET_SIZE_MIB (empty means no cap)
# Calls:
#   create_dir, make_file, random_int_between, log
#######################################
create_bulk_files() {
  local folder_index
  local file_index
  local folder_rel
  local file_rel
  local remaining_mib
  local upper_mib
  local size_mib
  local cap_mib=
  local created_files=0
  local consumed_mib=0

  # Work on base-10 copies so zero-padded settings ("08") cannot be taken
  # for octal literals by the arithmetic below.
  local folder_count=$((10#$FOLDER_COUNT))
  local files_per_folder=$((10#$FILES_PER_FOLDER))
  local min_mib=$((10#$MIN_FILE_SIZE_MIB))
  local max_mib=$((10#$MAX_FILE_SIZE_MIB))

  if (( folder_count == 0 || files_per_folder == 0 )); then
    return
  fi

  if [[ -n "$MAX_DATASET_SIZE_MIB" ]]; then
    cap_mib=$((10#$MAX_DATASET_SIZE_MIB))
    # Numeric comparison, so "0", "00", ... all mean "generate nothing".
    if (( cap_mib == 0 )); then
      log "Skipping bulk-data generation because max dataset size is 0 MiB"
      return
    fi
  fi

  create_dir "bulk"

  for (( folder_index = 1; folder_index <= folder_count; folder_index++ )); do
    # Build the folder name once per folder instead of once per file.
    printf -v folder_rel 'bulk/folder-%03d' "$folder_index"
    create_dir "$folder_rel"

    for (( file_index = 1; file_index <= files_per_folder; file_index++ )); do
      upper_mib=$max_mib

      if [[ -n "$cap_mib" ]]; then
        remaining_mib=$((cap_mib - consumed_mib))
        if (( remaining_mib < min_mib )); then
          log "Reached bulk dataset size cap after creating $created_files files (${consumed_mib} MiB)"
          return
        fi
        # Never draw a size that would overshoot the remaining budget.
        if (( remaining_mib < upper_mib )); then
          upper_mib=$remaining_mib
        fi
      fi

      size_mib=$(random_int_between "$min_mib" "$upper_mib")

      printf -v file_rel '%s/bulk_random_%03d_%smib.bin' \
        "$folder_rel" "$file_index" "$size_mib"
      make_file "$file_rel" random "$size_mib" 0644

      consumed_mib=$((consumed_mib + size_mib))
      created_files=$((created_files + 1))
    done
  done

  log "Created $created_files bulk files across $FOLDER_COUNT folders (${consumed_mib} MiB total)"
}
|
||||||
|
|
||||||
write_summary() {
|
write_summary() {
|
||||||
cat >"$ROOT/GENERATION_SUMMARY.txt" <<EOF
|
cat >"$ROOT/GENERATION_SUMMARY.txt" <<EOF
|
||||||
Dataset root: $ROOT
|
Dataset root: $ROOT
|
||||||
@@ -368,6 +511,12 @@ Notes:
|
|||||||
- Sparse files have logical size with low physical allocation.
|
- Sparse files have logical size with low physical allocation.
|
||||||
- Hard links share inode data with their source file.
|
- Hard links share inode data with their source file.
|
||||||
- Read-only files and directories may require elevated privileges to modify later.
|
- Read-only files and directories may require elevated privileges to modify later.
|
||||||
|
- Bulk generation settings:
|
||||||
|
- folder count: $FOLDER_COUNT
|
||||||
|
- files per folder: $FILES_PER_FOLDER
|
||||||
|
- min file size mib: $MIN_FILE_SIZE_MIB
|
||||||
|
- max file size mib: $MAX_FILE_SIZE_MIB
|
||||||
|
- max bulk dataset size mib: ${MAX_DATASET_SIZE_MIB:-unbounded}
|
||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -383,6 +532,7 @@ else
|
|||||||
create_time_and_readonly_cases
|
create_time_and_readonly_cases
|
||||||
create_links
|
create_links
|
||||||
create_metadata_cases
|
create_metadata_cases
|
||||||
|
create_bulk_files
|
||||||
write_summary
|
write_summary
|
||||||
echo "Created migration test dataset at: $ROOT"
|
echo "Created migration test dataset at: $ROOT"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -9,9 +9,17 @@ The generator script can also run in continuous update mode after initial creati
|
|||||||
- use any integer greater than `0` to rewrite mutable files every `N` seconds
|
- use any integer greater than `0` to rewrite mutable files every `N` seconds
|
||||||
- use `--update-only` to run updates against an already-existing dataset without recreating the special-case filesystem objects first
|
- use `--update-only` to run updates against an already-existing dataset without recreating the special-case filesystem objects first
|
||||||
|
|
||||||
|
The generator script can also create additional bulk test data under `bulk/`:
|
||||||
|
|
||||||
|
- `--folder-count N` creates `N` numbered bulk folders
|
||||||
|
- `--files-per-folder N` creates `N` bulk files in each bulk folder
|
||||||
|
- `--min-file-size-mib N` and `--max-file-size-mib N` control the random bulk file size range
|
||||||
|
- `--max-dataset-size-mib N` caps the total size of generated bulk files only and stops creation when the cap is reached
|
||||||
|
|
||||||
Important implementation detail for update mode:
|
Important implementation detail for update mode:
|
||||||
|
|
||||||
- the update loop rewrites content-bearing regular files that are intended to simulate active data churn
|
- the update loop rewrites content-bearing regular files that are intended to simulate active data churn
|
||||||
|
- if bulk files exist under `bulk/`, the update loop rewrites those bulk files too
|
||||||
- it does not rewrite script files, sparse files, symlinks, hard links, or empty files
|
- it does not rewrite script files, sparse files, symlinks, hard links, or empty files
|
||||||
- this preserves the special-case filesystem structure while still generating ongoing content changes
|
- this preserves the special-case filesystem structure while still generating ongoing content changes
|
||||||
- if ACL/xattr assignment is unsupported on the target filesystem, the script logs that condition and continues
|
- if ACL/xattr assignment is unsupported on the target filesystem, the script logs that condition and continues
|
||||||
|
|||||||
Reference in New Issue
Block a user