Files
cds-ai/cdssync/generate_migration_test_dataset.sh
anthony.wen 4275956259 Add interval-based update mode for test dataset generation
Add optional interval-based random content updates to the cdssync
migration test dataset generator and document the new behavior.

This allows the dataset to be created once and then updated either
continuously or every N seconds while preserving the intended
special-case file structure.
2026-04-21 11:12:37 -04:00

374 lines
11 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Generates a compact filesystem migration test dataset under a target
# directory and, when an update interval is supplied, keeps rewriting the
# mutable files with random content afterwards (see usage below).
#
# Strict mode: abort on a failing command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
#######################################
# Print the command-line help text.
# Arguments: none
# Outputs:   synopsis and behavioural notes on stdout, one line per entry
#######################################
usage() {
  printf '%s\n' \
    'Usage:' \
    'generate_migration_test_dataset.sh TARGET_DIR [UPDATE_INTERVAL_SECONDS]' \
    'Creates a compact filesystem migration test dataset under TARGET_DIR.' \
    'The dataset matches the manifest in migration-test-manifest.md.' \
    'Notes:' \
    '- Existing TARGET_DIR contents are left in place unless they collide.' \
    '- ACL and xattr cases are created only if the local tools are available.' \
    '- Sparse files are created with logical size but low physical allocation.' \
    '- If UPDATE_INTERVAL_SECONDS is provided, the script keeps rewriting' \
    'mutable files with random content after the initial dataset creation.' \
    '- An interval of 0 means continuous updates with no sleep between passes.' \
    '- Update mode rewrites content-bearing regular files only.' \
    '- Update mode does not rewrite script files, sparse files, symlinks,' \
    'hard links, or empty files.'
}
# --- Argument handling -------------------------------------------------
# Exactly one or two positional arguments are accepted:
#   $1 - TARGET_DIR (required)
#   $2 - UPDATE_INTERVAL_SECONDS (optional, non-negative integer)
if [[ $# -lt 1 || $# -gt 2 ]]; then
  usage
  exit 1
fi

TARGET_DIR=$1
# -m lets realpath canonicalise a path whose components do not exist yet;
# the directory itself is created below.
ROOT=$(realpath -m "$TARGET_DIR")

# An empty UPDATE_INTERVAL means "no update mode".
UPDATE_INTERVAL=${2:-}
if [[ -n "$UPDATE_INTERVAL" ]]; then
  if [[ ! "$UPDATE_INTERVAL" =~ ^[0-9]+$ ]]; then
    echo "UPDATE_INTERVAL_SECONDS must be a non-negative integer" >&2
    exit 1
  fi
  # Force base 10. Without this, a value with a leading zero is parsed as
  # octal by later arithmetic: "08" is an arithmetic error and "010" would
  # compare as 8 while sleep(1) would treat it as 10.
  UPDATE_INTERVAL=$((10#$UPDATE_INTERVAL))
fi

mkdir -p "$ROOT"
# Probe once for the optional metadata tools. Each flag is 1 when the tool
# is found on PATH and 0 otherwise; later steps consult the flags instead
# of re-probing.
if command -v setfacl >/dev/null 2>&1; then
  have_setfacl=1
else
  have_setfacl=0
fi
if command -v setfattr >/dev/null 2>&1; then
  have_setfattr=1
else
  have_setfattr=0
fi
#######################################
# Create a directory, including missing parents, below the dataset root.
# Globals:   ROOT (read)
# Arguments: $1 - directory path relative to ROOT
#######################################
create_dir() {
  local rel_dir=$1
  mkdir -p "${ROOT}/${rel_dir}"
}
#######################################
# Print a timestamped message.
# Arguments: $@ - message words, joined into one string
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout
#######################################
log() {
  local now
  now=$(date '+%Y-%m-%d %H:%M:%S')
  printf '[%s] %s\n' "$now" "$*"
}
#######################################
# Set both the access and modification time of a dataset file.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#            $2 - timestamp in the format accepted by `touch -t`
#######################################
set_times() {
  local target="${ROOT}/$1"
  local timestamp=$2
  touch -am -t "$timestamp" "$target"
}
write_text() {
local path=$1
local mib=$2
local bytes=$((mib * 1024 * 1024))
perl -e '
my ($target, $label) = @ARGV;
my $chunk = "Migration text payload for $label\n";
while (length($chunk) < 8192) { $chunk .= $chunk; }
while ($target > 0) {
my $part = substr($chunk, 0, $target > length($chunk) ? length($chunk) : $target);
print $part;
$target -= length($part);
}
' "$bytes" "$path" >"$ROOT/$path"
}
write_compressible() {
local path=$1
local mib=$2
local bytes=$((mib * 1024 * 1024))
perl -e '
my ($target) = @ARGV;
my $chunk = "A" x 8192;
while ($target > 0) {
my $part = substr($chunk, 0, $target > length($chunk) ? length($chunk) : $target);
print $part;
$target -= length($part);
}
' "$bytes" >"$ROOT/$path"
}
#######################################
# Fill a file with bytes read from /dev/urandom.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#            $2 - size in MiB (number of 1 MiB blocks copied by dd)
#######################################
write_random() {
  local rel_path=$1
  local size_mib=$2
  local -a dd_args=(
    if=/dev/urandom
    "of=${ROOT}/${rel_path}"
    bs=1M
    "count=${size_mib}"
    status=none
  )
  dd "${dd_args[@]}"
}
#######################################
# Write a small runnable Bash script and pad it with '#' characters up to
# the requested size. The padding follows the last newline, so it forms a
# single trailing comment line and the script stays valid.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#            $2 - target size in MiB; no padding is added if the header is
#                 already at least that large
#######################################
write_script() {
  local path=$1
  local mib=$2
  local target_size=$((mib * 1024 * 1024))
  local current_size
  cat >"$ROOT/$path" <<'EOF'
#!/usr/bin/env bash
echo "migration test script"
EOF
  current_size=$(wc -c <"$ROOT/$path")
  if (( current_size < target_size )); then
    # head -c reads the padding in large blocks. The previous `dd bs=1`
    # issued one read and one write per byte, which took seconds per
    # multi-MiB file while producing the same bytes.
    head -c "$((target_size - current_size))" /dev/zero \
      | tr '\0' '#' >>"$ROOT/$path"
  fi
}
#######################################
# Create a zero-length file, truncating it if it already exists.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#######################################
write_empty() {
  local rel_path=$1
  true >"${ROOT}/${rel_path}"
}
#######################################
# Set a file's logical size with truncate, without writing any data.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#            $2 - logical size in MiB
#######################################
write_sparse() {
  local rel_path=$1
  local size_mib=$2
  local size_spec="${size_mib}M"
  truncate -s "$size_spec" "${ROOT}/${rel_path}"
}
#######################################
# Apply a permission mode to a dataset entry.
# Globals:   ROOT (read)
# Arguments: $1 - path relative to ROOT
#            $2 - mode as accepted by chmod (e.g. 0644)
#######################################
apply_mode() {
  local rel_path=$1
  local mode=$2
  chmod "$mode" "${ROOT}/${rel_path}"
}
#######################################
# Attach extended attributes and ACL entries to the metadata test files,
# skipping either step (with a notice) when its tool is unavailable.
# Globals:   ROOT, have_setfattr, have_setfacl (read)
# Arguments: none
# Outputs:   a "Skipping ..." line on stdout for each unavailable tool
#######################################
set_acl_and_xattr_metadata() {
  local meta_dir="${ROOT}/metadata"

  if (( ! have_setfattr )); then
    echo "Skipping xattr assignment: setfattr not available"
  else
    setfattr -n user.migration_case -v "xattr-text" "${meta_dir}/xattr_text_1mb_644.txt"
    setfattr -n user.migration_case -v "xattr-random" "${meta_dir}/xattr_random_3mb_600.bin"
  fi

  if (( ! have_setfacl )); then
    echo "Skipping ACL assignment: setfacl not available"
  else
    setfacl -m u:nobody:r-- "${meta_dir}/acl_text_1mb_644.txt"
    setfacl -m u:nobody:r-x "${meta_dir}/acl_script_1mb_755.sh"
  fi
}
#######################################
# Overwrite a file in place with random bytes, keeping its size and
# restoring the permission bits it had before the rewrite.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#######################################
rewrite_file_with_random_data() {
  local rel=$1
  local target="${ROOT}/${rel}"
  local byte_count
  local saved_mode

  byte_count=$(stat -c '%s' "$target")
  saved_mode=$(stat -c '%a' "$target")

  # Read-only files need owner write permission for the redirect below;
  # the saved mode is put back once the content has been replaced.
  chmod u+w "$target"
  if (( byte_count == 0 )); then
    : >"$target"
  else
    head -c "$byte_count" /dev/urandom >"$target"
  fi
  chmod "$saved_mode" "$target"
}
#######################################
# Run one update pass: rewrite every file in the mutable list with random
# data, then re-apply the xattr/ACL metadata.
# Arguments: none
#######################################
update_mutable_files_pass() {
  # Paths are relative to the dataset root. Script, sparse and empty files
  # and the entries under links/ are not listed here.
  local -a targets=(
    "regular/text_1mb_644.txt"
    "regular/text_3mb_600.txt"
    "regular/text_5mb_755.txt"
    "regular/random_1mb_600.bin"
    "regular/random_3mb_644.bin"
    "regular/random_5mb_755.bin"
    "regular/compressible_1mb_644.log"
    "regular/compressible_3mb_600.log"
    "regular/compressible_5mb_755.log"
    "hidden/.hidden_text_1mb_644.txt"
    "hidden/.hidden_random_3mb_600.bin"
    "spaces in name/file with spaces text 1mb 644.txt"
    "spaces in name/file with spaces random 3mb 600.bin"
    "regular/longname_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_text_1mb_644.txt"
    "regular/longname_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb_random_3mb_600.bin"
    "regular/longname_cccccccccccccccccccccccccccccccc_compressible_5mb_755.log"
    "deep/tree/level1/level2/level3/deep_text_1mb_644.txt"
    "deep/tree/level1/level2/level3/deep_random_3mb_600.bin"
    "regular/dup_source_text_3mb_644.txt"
    "regular/dup_copy_a_text_3mb_600.txt"
    "deep/tree/level1/level2/dup_copy_b_text_3mb_755.txt"
    "regular/old_text_1mb_644.txt"
    "regular/recent_text_1mb_644.txt"
    "regular/futureish_text_1mb_644.txt"
    "readonly-dir/locked_text_1mb_444.txt"
    "readonly-dir/locked_random_3mb_400.bin"
    "metadata/xattr_text_1mb_644.txt"
    "metadata/xattr_random_3mb_600.bin"
    "metadata/acl_text_1mb_644.txt"
  )
  local target

  for target in "${targets[@]}"; do
    rewrite_file_with_random_data "$target"
  done

  set_acl_and_xattr_metadata
}
#######################################
# Repeat update passes forever, logging each one and sleeping between
# passes when the interval is greater than zero. Never returns.
# Globals:   ROOT, UPDATE_INTERVAL (read)
# Arguments: none
#######################################
run_update_loop() {
  local pass_number

  log "Starting update loop for $ROOT with interval ${UPDATE_INTERVAL}s"
  # No loop condition: the loop only ends when the script is terminated.
  for (( pass_number = 1; ; pass_number++ )); do
    update_mutable_files_pass
    log "Completed random update pass $pass_number"
    if (( UPDATE_INTERVAL > 0 )); then
      sleep "$UPDATE_INTERVAL"
    fi
  done
}
#######################################
# Create one dataset file of the given kind and size, then apply its mode.
# Globals:   ROOT (read)
# Arguments: $1 - file path relative to ROOT
#            $2 - content type: text | random | compressible | script |
#                 empty | sparse
#            $3 - size in MiB (not passed on for type "empty")
#            $4 - permission mode, applied after the content is written
# Exits:     with status 1 on an unknown content type
#######################################
make_file() {
  local path=$1
  local type=$2
  local mib=$3
  local mode=$4

  create_dir "$(dirname "$path")"

  # When the script is re-run against an existing dataset, files left
  # behind with read-only modes (0444, 0400, 0500) would make the writers
  # below fail with "Permission denied" for a non-root user. Grant owner
  # write access first; apply_mode restores the intended mode afterwards.
  if [[ -f "$ROOT/$path" && ! -w "$ROOT/$path" ]]; then
    chmod u+w "$ROOT/$path"
  fi

  case "$type" in
    text) write_text "$path" "$mib" ;;
    random) write_random "$path" "$mib" ;;
    compressible) write_compressible "$path" "$mib" ;;
    script) write_script "$path" "$mib" ;;
    empty) write_empty "$path" ;;
    sparse) write_sparse "$path" "$mib" ;;
    *)
      echo "Unknown type: $type" >&2
      exit 1
      ;;
  esac

  apply_mode "$path" "$mode"
}
#######################################
# Create the directory skeleton of the dataset.
# Arguments: none
#######################################
create_base_dirs() {
  local -a skeleton=(
    "regular"
    "hidden"
    "spaces in name"
    "deep/tree/level1/level2/level3"
    "readonly-dir"
    "links"
    "metadata"
    "empty-dirs/empty_a"
    "empty-dirs/empty_b"
    "empty-dirs/.hidden_empty_dir"
    "readonly-dir/no_write_subdir"
  )
  local dir

  for dir in "${skeleton[@]}"; do
    create_dir "$dir"
  done
}
#######################################
# Create the baseline files under regular/: three sizes/modes for each of
# the text, random, compressible, script and sparse types, plus three
# empty files.
# Arguments: none
#######################################
create_regular_files() {
  local dir=regular

  make_file "${dir}/text_1mb_644.txt" text 1 0644
  make_file "${dir}/text_3mb_600.txt" text 3 0600
  make_file "${dir}/text_5mb_755.txt" text 5 0755

  make_file "${dir}/random_1mb_600.bin" random 1 0600
  make_file "${dir}/random_3mb_644.bin" random 3 0644
  make_file "${dir}/random_5mb_755.bin" random 5 0755

  make_file "${dir}/compressible_1mb_644.log" compressible 1 0644
  make_file "${dir}/compressible_3mb_600.log" compressible 3 0600
  make_file "${dir}/compressible_5mb_755.log" compressible 5 0755

  make_file "${dir}/script_1mb_755.sh" script 1 0755
  make_file "${dir}/script_3mb_700.sh" script 3 0700
  make_file "${dir}/script_5mb_755.sh" script 5 0755

  make_file "${dir}/sparse_1mb_600.img" sparse 1 0600
  make_file "${dir}/sparse_3mb_600.img" sparse 3 0600
  make_file "${dir}/sparse_5mb_600.img" sparse 5 0600

  make_file "${dir}/empty_000_644.txt" empty 0 0644
  make_file "${dir}/empty_001_600.txt" empty 0 0600
  make_file "${dir}/empty_002_755.txt" empty 0 0755
}
#######################################
# Create the naming edge cases: dot-prefixed names, names containing
# spaces, and long file names.
# Arguments: none
#######################################
create_named_variants() {
  local hidden_dir=hidden
  local spaced_dir='spaces in name'
  local regular_dir=regular

  make_file "${hidden_dir}/.hidden_text_1mb_644.txt" text 1 0644
  make_file "${hidden_dir}/.hidden_random_3mb_600.bin" random 3 0600
  make_file "${hidden_dir}/.hidden_script_1mb_755.sh" script 1 0755
  make_file "${hidden_dir}/.hidden_empty_644" empty 0 0644
  make_file "${hidden_dir}/.hidden_sparse_5mb_600.img" sparse 5 0600

  make_file "${spaced_dir}/file with spaces text 1mb 644.txt" text 1 0644
  make_file "${spaced_dir}/file with spaces random 3mb 600.bin" random 3 0600
  make_file "${spaced_dir}/file with spaces script 1mb 755.sh" script 1 0755
  make_file "${spaced_dir}/file with spaces empty 644" empty 0 0644
  make_file "${spaced_dir}/file with spaces sparse 5mb 600.img" sparse 5 0600

  make_file "${regular_dir}/longname_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa_text_1mb_644.txt" text 1 0644
  make_file "${regular_dir}/longname_bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb_random_3mb_600.bin" random 3 0600
  make_file "${regular_dir}/longname_cccccccccccccccccccccccccccccccc_compressible_5mb_755.log" compressible 5 0755
}
#######################################
# Create the deeply nested files and a source file with two byte-identical
# copies that carry different modes.
# Globals:   ROOT (read)
# Arguments: none
#######################################
create_deep_and_duplicate_cases() {
  local deep_dir=deep/tree/level1/level2/level3
  local dup_source=regular/dup_source_text_3mb_644.txt
  local dup_copy_a=regular/dup_copy_a_text_3mb_600.txt
  local dup_copy_b=deep/tree/level1/level2/dup_copy_b_text_3mb_755.txt

  make_file "${deep_dir}/deep_text_1mb_644.txt" text 1 0644
  make_file "${deep_dir}/deep_random_3mb_600.bin" random 3 0600
  make_file "${deep_dir}/deep_script_1mb_755.sh" script 1 0755
  make_file "${deep_dir}/deep_sparse_5mb_600.img" sparse 5 0600

  # The copies are made with cp so their content matches the source
  # exactly; only the permission bits are changed afterwards.
  make_file "$dup_source" text 3 0644
  cp "${ROOT}/${dup_source}" "${ROOT}/${dup_copy_a}"
  cp "${ROOT}/${dup_source}" "${ROOT}/${dup_copy_b}"
  chmod 0600 "${ROOT}/${dup_copy_a}"
  chmod 0755 "${ROOT}/${dup_copy_b}"
}
#######################################
# Create files with fixed timestamps, files with read-only modes, and
# mark readonly-dir/no_write_subdir as non-writable (0555).
# Globals:   ROOT (read)
# Arguments: none
#######################################
create_time_and_readonly_cases() {
  # Parallel arrays: names[i] gets timestamp stamps[i].
  local -a names=(old recent futureish)
  local -a stamps=(201801020304 202604191530 203001020304)
  local i

  for i in "${!names[@]}"; do
    make_file "regular/${names[i]}_text_1mb_644.txt" text 1 0644
  done
  for i in "${!names[@]}"; do
    set_times "regular/${names[i]}_text_1mb_644.txt" "${stamps[i]}"
  done

  local locked_dir=readonly-dir
  make_file "${locked_dir}/locked_text_1mb_444.txt" text 1 0444
  make_file "${locked_dir}/locked_random_3mb_400.bin" random 3 0400
  make_file "${locked_dir}/locked_script_1mb_500.sh" script 1 0500
  chmod 0555 "${ROOT}/${locked_dir}/no_write_subdir"
}
#######################################
# Create the symlink and hard-link cases under links/.
# Safe to call again on an existing dataset: symlinks are replaced and
# hard links that already point at the right inode are left alone, so a
# re-run no longer aborts with "File exists".
# Globals:   ROOT (read)
# Arguments: none
#######################################
create_links() {
  local links_dir="$ROOT/links"
  local name source_path link_path

  # Symlink targets are relative to links/, so the links stay valid after
  # the tree is copied elsewhere. -f replaces an existing entry; -n keeps
  # ln from following an existing symlink at the destination.
  ln -sfn ../regular/text_1mb_644.txt "$links_dir/symlink_to_text_1mb_644.txt"
  ln -sfn ../deep/tree/level1/level2/level3/deep_random_3mb_600.bin \
    "$links_dir/symlink_to_deep_random_3mb_600.bin"
  ln -sfn ../hidden/.hidden_text_1mb_644.txt "$links_dir/symlink_to_hidden_file"

  for name in random_3mb_644.bin compressible_5mb_755.log; do
    source_path="$ROOT/regular/$name"
    link_path="$links_dir/hardlink_to_$name"
    # -ef is true when both names already share device and inode; skip
    # those, since ln can refuse to link a file onto itself.
    if [[ ! "$source_path" -ef "$link_path" ]]; then
      ln -f "$source_path" "$link_path"
    fi
  done
}
#######################################
# Create the files under metadata/ that carry xattr and ACL cases, then
# apply that metadata.
# Arguments: none
#######################################
create_metadata_cases() {
  local dir=metadata

  make_file "${dir}/xattr_text_1mb_644.txt" text 1 0644
  make_file "${dir}/xattr_random_3mb_600.bin" random 3 0600
  make_file "${dir}/acl_text_1mb_644.txt" text 1 0644
  make_file "${dir}/acl_script_1mb_755.sh" script 1 0755

  set_acl_and_xattr_metadata
}
#######################################
# Write GENERATION_SUMMARY.txt at the dataset root, recording the root
# path, the manifest name and which optional tools were available.
# Globals:   ROOT, have_setfacl, have_setfattr (read)
# Arguments: none
#######################################
write_summary() {
  local summary_path="${ROOT}/GENERATION_SUMMARY.txt"
  local -a summary_lines=(
    "Dataset root: $ROOT"
    "Manifest: migration-test-manifest.md"
    "Optional metadata support:"
    "- setfacl available: $have_setfacl"
    "- setfattr available: $have_setfattr"
    "Notes:"
    "- Sparse files have logical size with low physical allocation."
    "- Hard links share inode data with their source file."
    "- Read-only files and directories may require elevated privileges to modify later."
  )

  printf '%s\n' "${summary_lines[@]}" >"$summary_path"
}
#######################################
# Build the whole dataset, report where it was created, and enter the
# update loop when an update interval was supplied.
# Globals:   ROOT, UPDATE_INTERVAL (read)
# Arguments: none
#######################################
main() {
  # Directory skeleton first, then the file cases. Links come after the
  # regular, hidden and deep files because they refer to those files.
  create_base_dirs
  create_regular_files
  create_named_variants
  create_deep_and_duplicate_cases
  create_time_and_readonly_cases
  create_links
  create_metadata_cases
  write_summary

  printf '%s\n' "Created migration test dataset at: $ROOT"

  # An empty interval means one-shot generation; any value, including 0,
  # starts the update loop.
  if [[ -n "$UPDATE_INTERVAL" ]]; then
    run_update_loop
  fi
}

main