Files
cds-ai/cdssync/generate_migration_test_dataset.sh
anthony.wen 7c27535e2a Add update-only mode for test dataset generator
Add support for running content updates against an existing migration
test dataset without recreating the filesystem structure.

Also make ACL/xattr updates non-fatal on filesystems that do not
support those operations.
2026-04-21 13:21:22 -04:00

393 lines
12 KiB
Bash
Executable File

#!/usr/bin/env bash
# Builds a filesystem migration test dataset and can optionally keep rewriting
# its mutable files on an interval.
#
# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   the usage text on stdout, one line per printf argument
#######################################
usage() {
  printf '%s\n' \
    'Usage:' \
    'generate_migration_test_dataset.sh [--update-only] TARGET_DIR [UPDATE_INTERVAL_SECONDS]' \
    'Creates a compact filesystem migration test dataset under TARGET_DIR.' \
    'The dataset matches the manifest in migration-test-manifest.md.' \
    'Notes:' \
    '- Existing TARGET_DIR contents are left in place unless they collide.' \
    '- ACL and xattr cases are created only if the local tools are available.' \
    '- Sparse files are created with logical size but low physical allocation.' \
    '- If UPDATE_INTERVAL_SECONDS is provided, the script keeps rewriting' \
    'mutable files with random content after the initial dataset creation.' \
    '- An interval of 0 means continuous updates with no sleep between passes.' \
    '- Update mode rewrites content-bearing regular files only.' \
    '- Update mode does not rewrite script files, sparse files, symlinks,' \
    'hard links, or empty files.'
}
# --- Command-line handling ---------------------------------------------------
# UPDATE_ONLY=1 skips dataset creation; only the update pass(es) run.
UPDATE_ONLY=0
if [[ ${1:-} == "--update-only" ]]; then
  UPDATE_ONLY=1
  shift
fi

# An explicit help request is not an error: show the usage text and exit 0
# rather than treating "-h"/"--help" as a TARGET_DIR.
if [[ ${1:-} == "-h" || ${1:-} == "--help" ]]; then
  usage
  exit 0
fi

# Exactly one or two positional arguments are accepted.
if [[ $# -lt 1 || $# -gt 2 ]]; then
  usage >&2
  exit 1
fi

TARGET_DIR=$1
# "--" ends option parsing so a TARGET_DIR that starts with "-" is resolved as
# a path instead of being interpreted as a realpath option.
ROOT=$(realpath -m -- "$TARGET_DIR")

# Optional second argument; empty means "no update loop".
UPDATE_INTERVAL=${2:-}
if [[ -n "$UPDATE_INTERVAL" && ! "$UPDATE_INTERVAL" =~ ^[0-9]+$ ]]; then
  echo "UPDATE_INTERVAL_SECONDS must be a non-negative integer" >&2
  exit 1
fi

mkdir -p -- "$ROOT"

# --- Optional tool detection -------------------------------------------------
# 1 when the corresponding command is found on PATH, 0 otherwise.
have_setfacl=0
have_setfattr=0
if command -v setfacl >/dev/null 2>&1; then
  have_setfacl=1
fi
if command -v setfattr >/dev/null 2>&1; then
  have_setfattr=1
fi
#######################################
# Create a directory under the dataset root, including missing parents.
# Globals:   ROOT (read)
# Arguments: $1 - directory path relative to ROOT
#######################################
create_dir() {
  local rel_dir=$1
  mkdir -p "${ROOT}/${rel_dir}"
}
#######################################
# Print a timestamped message.
# Arguments: all arguments are joined into one message
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout
#######################################
log() {
  local message="$*"
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message"
}
#######################################
# Set both the access and modification time of a dataset file.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#            $2 - timestamp in the format accepted by `touch -t`
#######################################
set_times() {
  local target="$ROOT/$1"
  local when=$2
  touch -am -t "$when" "$target"
}
write_text() {
local path=$1
local mib=$2
local bytes=$((mib * 1024 * 1024))
perl -e '
my ($target, $label) = @ARGV;
my $chunk = "Migration text payload for $label\n";
while (length($chunk) < 8192) { $chunk .= $chunk; }
while ($target > 0) {
my $part = substr($chunk, 0, $target > length($chunk) ? length($chunk) : $target);
print $part;
$target -= length($part);
}
' "$bytes" "$path" >"$ROOT/$path"
}
write_compressible() {
local path=$1
local mib=$2
local bytes=$((mib * 1024 * 1024))
perl -e '
my ($target) = @ARGV;
my $chunk = "A" x 8192;
while ($target > 0) {
my $part = substr($chunk, 0, $target > length($chunk) ? length($chunk) : $target);
print $part;
$target -= length($part);
}
' "$bytes" >"$ROOT/$path"
}
#######################################
# Fill a file with random bytes from /dev/urandom.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#            $2 - size in MiB
# Outputs:   writes $2 MiB to $ROOT/$1
#######################################
write_random() {
  local path=$1
  local mib=$2
  # iflag=fullblock makes dd keep reading until each 1 MiB block is full.
  # Without it a short read is counted as a whole block and the file comes
  # out smaller than requested.
  dd if=/dev/urandom of="$ROOT/$path" bs=1M count="$mib" \
    iflag=fullblock status=none
}
#######################################
# Write a small bash script and pad it with '#' characters up to a size.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#            $2 - target size in MiB
# Outputs:   writes $ROOT/$1; the file is left at its natural size when it
#            is already at least the target size
#######################################
write_script() {
  local path=$1
  local mib=$2
  local file="$ROOT/$path"
  local target_size=$((mib * 1024 * 1024))
  local current_size

  cat >"$file" <<'EOF'
#!/usr/bin/env bash
echo "migration test script"
EOF

  current_size=$(wc -c <"$file")
  if (( current_size < target_size )); then
    # head -c streams the padding in large reads. The earlier
    # `dd bs=1 count=N` form issued one read and one write per byte, which
    # is millions of syscalls for each multi-MiB script.
    head -c "$((target_size - current_size))" /dev/zero \
      | tr '\0' '#' >>"$file"
  fi
}
#######################################
# Create a zero-length file, truncating it if it already exists.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#######################################
write_empty() {
  local target="$ROOT/$1"
  : >"$target"
}
#######################################
# Set a file's logical size with truncate, without writing any data.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#            $2 - logical size, passed to truncate as "<N>M"
#######################################
write_sparse() {
  local rel=$1
  local size_mib=$2
  local target="$ROOT/$rel"
  truncate -s "${size_mib}M" "$target"
}
#######################################
# Set the permission bits of a dataset file.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#            $2 - mode passed to chmod
#######################################
apply_mode() {
  local rel=$1
  local perms=$2
  chmod "$perms" "${ROOT}/${rel}"
}
#######################################
# Apply the xattr and ACL fixtures to the files under metadata/.
# A failing setfattr/setfacl call is logged and skipped rather than aborting
# the script; a missing tool is logged once per tool.
# Globals:   ROOT, have_setfattr, have_setfacl (read)
# Arguments: none
# Outputs:   skip messages via log
#######################################
set_acl_and_xattr_metadata() {
  local meta_dir="$ROOT/metadata"
  local target

  if (( have_setfattr )); then
    target="$meta_dir/xattr_text_1mb_644.txt"
    setfattr -n user.migration_case -v "xattr-text" "$target" \
      || log "Skipping xattr assignment on $target: operation not supported"

    target="$meta_dir/xattr_random_3mb_600.bin"
    setfattr -n user.migration_case -v "xattr-random" "$target" \
      || log "Skipping xattr assignment on $target: operation not supported"
  else
    log "Skipping xattr assignment: setfattr not available"
  fi

  if (( have_setfacl )); then
    target="$meta_dir/acl_text_1mb_644.txt"
    setfacl -m u:nobody:r-- "$target" \
      || log "Skipping ACL assignment on $target: operation not supported"

    target="$meta_dir/acl_script_1mb_755.sh"
    setfacl -m u:nobody:r-x "$target" \
      || log "Skipping ACL assignment on $target: operation not supported"
  else
    log "Skipping ACL assignment: setfacl not available"
  fi
}
#######################################
# Replace a file's content with the same number of random bytes, keeping its
# permission bits.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
# Returns:   0 on success; non-zero if stat, chmod or the write fails
#######################################
rewrite_file_with_random_data() {
  local rel=$1
  local path="$ROOT/$rel"
  local size
  local mode
  local rc=0

  size=$(stat -c '%s' "$path") || return
  mode=$(stat -c '%a' "$path") || return

  # Read-only fixtures need owner write permission for the rewrite.
  chmod u+w "$path" || return

  # The failure status is captured instead of letting `set -e` abort here,
  # so the original mode is always put back below. A size of 0 writes
  # nothing and leaves the file empty.
  head -c "$size" /dev/urandom >"$path" || rc=$?

  chmod "$mode" "$path" || rc=$?
  return "$rc"
}
#######################################
# Run one update pass: rewrite every file in the mutable list with random
# data, then re-apply the xattr/ACL fixtures.
# Arguments: none
#######################################
update_mutable_files_pass() {
  local -a targets=(
    "regular/text_1mb_644.txt"
    "regular/text_3mb_600.txt"
    "regular/text_5mb_755.txt"
    "regular/random_1mb_600.bin"
    "regular/random_3mb_644.bin"
    "regular/random_5mb_755.bin"
    "regular/compressible_1mb_644.log"
    "regular/compressible_3mb_600.log"
    "regular/compressible_5mb_755.log"
    "hidden/.hidden_text_1mb_644.txt"
    "hidden/.hidden_random_3mb_600.bin"
    "spaces in name/file with spaces text 1mb 644.txt"
    "spaces in name/file with spaces random 3mb 600.bin"
    "regular/longname_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_text_1mb_644.txt"
    "regular/longname_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb_random_3mb_600.bin"
    "regular/longname_cccccccccccccccccccccccccccccccc_compressible_5mb_755.log"
    "deep/tree/level1/level2/level3/deep_text_1mb_644.txt"
    "deep/tree/level1/level2/level3/deep_random_3mb_600.bin"
    "regular/dup_source_text_3mb_644.txt"
    "regular/dup_copy_a_text_3mb_600.txt"
    "deep/tree/level1/level2/dup_copy_b_text_3mb_755.txt"
    "regular/old_text_1mb_644.txt"
    "regular/recent_text_1mb_644.txt"
    "regular/futureish_text_1mb_644.txt"
    "readonly-dir/locked_text_1mb_444.txt"
    "readonly-dir/locked_random_3mb_400.bin"
    "metadata/xattr_text_1mb_644.txt"
    "metadata/xattr_random_3mb_600.bin"
    "metadata/acl_text_1mb_644.txt"
  )
  local idx

  # Files are rewritten in list order.
  for (( idx = 0; idx < ${#targets[@]}; idx++ )); do
    rewrite_file_with_random_data "${targets[idx]}"
  done

  set_acl_and_xattr_metadata
}
#######################################
# Run update passes forever, sleeping between passes when the interval is
# greater than zero. This function does not return.
# Globals:   ROOT, UPDATE_INTERVAL (read)
# Arguments: none
# Outputs:   progress messages via log
#######################################
run_update_loop() {
  local pass_number=1
  # Force base 10. Bash arithmetic reads a leading zero as octal, so an
  # interval such as "08" would make the comparison below fail and be treated
  # as false, turning the requested 8-second interval into a loop with no
  # sleep at all.
  local interval_seconds=$((10#${UPDATE_INTERVAL:-0}))

  log "Starting update loop for $ROOT with interval ${UPDATE_INTERVAL}s"
  while true; do
    update_mutable_files_pass
    log "Completed random update pass $pass_number"
    pass_number=$((pass_number + 1))
    if (( interval_seconds > 0 )); then
      sleep "$interval_seconds"
    fi
  done
}
#######################################
# Create one dataset file: ensure its parent directory exists, write the
# content with the writer for the requested type, then set its mode.
# Arguments: $1 - file path relative to ROOT
#            $2 - content type: text, random, compressible, script, empty
#                 or sparse
#            $3 - size in MiB (not passed to the "empty" writer)
#            $4 - mode passed to apply_mode
# Outputs:   an error on stderr and exit status 1 for an unknown type
#######################################
make_file() {
  local path=$1
  local kind=$2
  local mib=$3
  local mode=$4

  create_dir "$(dirname "$path")"

  case "$kind" in
    text | random | compressible | script | sparse)
      # These writers are all named write_<type> and take (path, mib).
      "write_${kind}" "$path" "$mib"
      ;;
    empty)
      write_empty "$path"
      ;;
    *)
      echo "Unknown type: $kind" >&2
      exit 1
      ;;
  esac

  apply_mode "$path" "$mode"
}
#######################################
# Create the directory skeleton of the dataset.
# Arguments: none
#######################################
create_base_dirs() {
  local dir
  local -a base_dirs=(
    "regular"
    "hidden"
    "spaces in name"
    "deep/tree/level1/level2/level3"
    "readonly-dir"
    "links"
    "metadata"
    "empty-dirs/empty_a"
    "empty-dirs/empty_b"
    "empty-dirs/.hidden_empty_dir"
    "readonly-dir/no_write_subdir"
  )

  for dir in "${base_dirs[@]}"; do
    create_dir "$dir"
  done
}
#######################################
# Create the files under regular/: every content type in several sizes and
# modes.
# Arguments: none
#######################################
create_regular_files() {
  local spec path kind mib mode
  # One entry per file: path|type|MiB|mode
  local -a specs=(
    "regular/text_1mb_644.txt|text|1|0644"
    "regular/text_3mb_600.txt|text|3|0600"
    "regular/text_5mb_755.txt|text|5|0755"
    "regular/random_1mb_600.bin|random|1|0600"
    "regular/random_3mb_644.bin|random|3|0644"
    "regular/random_5mb_755.bin|random|5|0755"
    "regular/compressible_1mb_644.log|compressible|1|0644"
    "regular/compressible_3mb_600.log|compressible|3|0600"
    "regular/compressible_5mb_755.log|compressible|5|0755"
    "regular/script_1mb_755.sh|script|1|0755"
    "regular/script_3mb_700.sh|script|3|0700"
    "regular/script_5mb_755.sh|script|5|0755"
    "regular/sparse_1mb_600.img|sparse|1|0600"
    "regular/sparse_3mb_600.img|sparse|3|0600"
    "regular/sparse_5mb_600.img|sparse|5|0600"
    "regular/empty_000_644.txt|empty|0|0644"
    "regular/empty_001_600.txt|empty|0|0600"
    "regular/empty_002_755.txt|empty|0|0755"
  )

  for spec in "${specs[@]}"; do
    IFS='|' read -r path kind mib mode <<<"$spec"
    make_file "$path" "$kind" "$mib" "$mode"
  done
}
#######################################
# Create the naming edge cases: dot-files, names containing spaces, and long
# file names.
# Arguments: none
#######################################
create_named_variants() {
  local spec path kind mib mode
  # One entry per file: path|type|MiB|mode. Only "|" separates fields, so
  # spaces inside a path are preserved.
  local -a specs=(
    "hidden/.hidden_text_1mb_644.txt|text|1|0644"
    "hidden/.hidden_random_3mb_600.bin|random|3|0600"
    "hidden/.hidden_script_1mb_755.sh|script|1|0755"
    "hidden/.hidden_empty_644|empty|0|0644"
    "hidden/.hidden_sparse_5mb_600.img|sparse|5|0600"
    "spaces in name/file with spaces text 1mb 644.txt|text|1|0644"
    "spaces in name/file with spaces random 3mb 600.bin|random|3|0600"
    "spaces in name/file with spaces script 1mb 755.sh|script|1|0755"
    "spaces in name/file with spaces empty 644|empty|0|0644"
    "spaces in name/file with spaces sparse 5mb 600.img|sparse|5|0600"
    "regular/longname_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_text_1mb_644.txt|text|1|0644"
    "regular/longname_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb_random_3mb_600.bin|random|3|0600"
    "regular/longname_cccccccccccccccccccccccccccccccc_compressible_5mb_755.log|compressible|5|0755"
  )

  for spec in "${specs[@]}"; do
    IFS='|' read -r path kind mib mode <<<"$spec"
    make_file "$path" "$kind" "$mib" "$mode"
  done
}
#######################################
# Create the deeply nested files and a source file with two byte-identical
# copies that carry different modes.
# Globals:   ROOT (read)
# Arguments: none
#######################################
create_deep_and_duplicate_cases() {
  local deep_dir="deep/tree/level1/level2/level3"
  make_file "$deep_dir/deep_text_1mb_644.txt" text 1 0644
  make_file "$deep_dir/deep_random_3mb_600.bin" random 3 0600
  make_file "$deep_dir/deep_script_1mb_755.sh" script 1 0755
  make_file "$deep_dir/deep_sparse_5mb_600.img" sparse 5 0600

  local dup_source="regular/dup_source_text_3mb_644.txt"
  make_file "$dup_source" text 3 0644

  # Copy first, then set the mode each copy's name advertises.
  local copy_a="$ROOT/regular/dup_copy_a_text_3mb_600.txt"
  local copy_b="$ROOT/deep/tree/level1/level2/dup_copy_b_text_3mb_755.txt"
  cp "$ROOT/$dup_source" "$copy_a"
  cp "$ROOT/$dup_source" "$copy_b"
  chmod 0600 "$copy_a"
  chmod 0755 "$copy_b"
}
#######################################
# Create files with fixed timestamps, read-only files, and a directory
# without write permission.
# Globals:   ROOT (read)
# Arguments: none
#######################################
create_time_and_readonly_cases() {
  local entry
  # One entry per timestamped file: path|touch timestamp
  local -a stamped=(
    "regular/old_text_1mb_644.txt|201801020304"
    "regular/recent_text_1mb_644.txt|202604191530"
    "regular/futureish_text_1mb_644.txt|203001020304"
  )

  # All three files are written before any timestamp is applied.
  for entry in "${stamped[@]}"; do
    make_file "${entry%%|*}" text 1 0644
  done
  for entry in "${stamped[@]}"; do
    set_times "${entry%%|*}" "${entry#*|}"
  done

  make_file "readonly-dir/locked_text_1mb_444.txt" text 1 0444
  make_file "readonly-dir/locked_random_3mb_400.bin" random 3 0400
  make_file "readonly-dir/locked_script_1mb_500.sh" script 1 0500

  chmod 0555 "$ROOT/readonly-dir/no_write_subdir"
}
#######################################
# Create the symbolic links (relative targets) and hard links under links/.
# Globals:   ROOT (read)
# Arguments: none
#######################################
create_links() {
  local links_dir="$ROOT/links"
  local entry name
  # One entry per symlink: target|link name. Targets are relative to links/.
  local -a symlinks=(
    "../regular/text_1mb_644.txt|symlink_to_text_1mb_644.txt"
    "../deep/tree/level1/level2/level3/deep_random_3mb_600.bin|symlink_to_deep_random_3mb_600.bin"
    "../hidden/.hidden_text_1mb_644.txt|symlink_to_hidden_file"
  )

  for entry in "${symlinks[@]}"; do
    ln -s "${entry%%|*}" "$links_dir/${entry#*|}"
  done

  # Each hard link is named hardlink_to_<source file name>.
  for name in random_3mb_644.bin compressible_5mb_755.log; do
    ln "$ROOT/regular/$name" "$links_dir/hardlink_to_$name"
  done
}
#######################################
# Create the files under metadata/ and then apply the xattr/ACL fixtures.
# Arguments: none
#######################################
create_metadata_cases() {
  local spec path kind mib mode
  # One entry per file: path|type|MiB|mode
  local -a specs=(
    "metadata/xattr_text_1mb_644.txt|text|1|0644"
    "metadata/xattr_random_3mb_600.bin|random|3|0600"
    "metadata/acl_text_1mb_644.txt|text|1|0644"
    "metadata/acl_script_1mb_755.sh|script|1|0755"
  )

  for spec in "${specs[@]}"; do
    IFS='|' read -r path kind mib mode <<<"$spec"
    make_file "$path" "$kind" "$mib" "$mode"
  done

  set_acl_and_xattr_metadata
}
#######################################
# Write GENERATION_SUMMARY.txt into the dataset root.
# Globals:   ROOT, have_setfacl, have_setfattr (read)
# Arguments: none
# Outputs:   overwrites $ROOT/GENERATION_SUMMARY.txt
#######################################
write_summary() {
  local summary_file="$ROOT/GENERATION_SUMMARY.txt"
  printf '%s\n' \
    "Dataset root: $ROOT" \
    "Manifest: migration-test-manifest.md" \
    "Optional metadata support:" \
    "- setfacl available: $have_setfacl" \
    "- setfattr available: $have_setfattr" \
    "Notes:" \
    "- Sparse files have logical size with low physical allocation." \
    "- Hard links share inode data with their source file." \
    "- Read-only files and directories may require elevated privileges to modify later." \
    >"$summary_file"
}
# --- Entry point -------------------------------------------------------------
# Default mode builds the whole dataset; --update-only runs a single update
# pass over an existing dataset instead.
if (( ! UPDATE_ONLY )); then
  create_base_dirs
  create_regular_files
  create_named_variants
  create_deep_and_duplicate_cases
  create_time_and_readonly_cases
  create_links
  create_metadata_cases
  write_summary
  printf '%s\n' "Created migration test dataset at: $ROOT"
else
  log "Running in update-only mode for $ROOT"
  update_mutable_files_pass
  log "Completed initial update-only pass"
fi

# With an interval given (including 0), keep running update passes.
[[ -z "$UPDATE_INTERVAL" ]] || run_update_loop