diff --git a/README.md b/README.md index 533ce67..6fe5856 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ Simple bash script to rebalance pool data between all mirrors when adding vdevs ## How it works -This script recursively traverses all the files in a given directory. Each file is copied with a `.rebalance` suffix, retaining all file attributes. The original is then deleted and the *copy* is renamed back to the name of the original file. When copying a file ZFS will spread the data blocks across all vdevs, effectively distributing/rebalancing the data of the original file (more or less) evenly. This allows the pool data to be rebalanced without the need for a separate backup pool/drive. +This script recursively traverses all the files in a given directory. Each file is copied with a `.balance` suffix, retaining all file attributes. The original is then deleted and the *copy* is renamed back to the name of the original file. When copying a file ZFS will spread the data blocks across all vdevs, effectively distributing/rebalancing the data of the original file (more or less) evenly. This allows the pool data to be rebalanced without the need for a separate backup pool/drive. The way ZFS distributes writes is not trivial, which makes it hard to predict how effective the redistribution will be. See: - https://jrs-s.net/2018/04/11/zfs-allocates-writes-according-to-free-space-per-vdev-not-latency-per-vdev/ @@ -100,6 +100,7 @@ You can print a help message by running the script without any parameters: |-----------|-------------|---------| | `-c`
`--checksum` | Whether to compare attributes and content of the copied file using an **MD5** checksum. Technically this is a redundant check and consumes a lot of resources, so think twice. | `true` | | `-p`
`--passes` | The maximum number of rebalance passes per file. Setting this to infinity by using a value `<= 0` might improve performance when rebalancing a lot of small files. | `1` | +| `--skip-hardlinks` | Skip hardlinked files, since rebalancing them would only create duplicate data. | `false` | ### Example @@ -133,7 +134,7 @@ tail -F ./stdout.log Although this script **does** have a progress output (files as well as percentage) it might be a good idea to try a small subfolder first, or process your pool folder layout in manually selected batches. This can also limit the damage done, if anything bad happens. -When aborting the script midway through, be sure to check the last lines of its output. When cancelling before or during the renaming process a ".rebalance" file might be left and you have to rename (or delete) it manually. +When aborting the script midway through, be sure to check the last lines of its output. When cancelling before or during the renaming process a ".balance" file might be left and you have to rename (or delete) it manually. Although the `--passes` parameter can be used to limit the maximum amount of rebalance passes per file, it is only meant to speed up aborted runs. Individual files will **not be processed multiple times automatically**. To reach multiple passes you have to run the script on the same target directory multiple times. diff --git a/testing.sh b/testing.sh index 3c92f2b..b46f74f 100755 --- a/testing.sh +++ b/testing.sh @@ -30,6 +30,20 @@ function assertions() { fi } +function assert_matching_file_copied() { + if ! grep "Copying" $log_std_file | grep -q "$1"; then + echo "File matching '$1' was not copied when it should have been!" + exit 1 + fi +} + +function assert_matching_file_not_copied() { + if grep "Copying" $log_std_file | grep -q "$1"; then + echo "File matching '$1' was copied when it should have been skipped!" 
+ exit 1 + fi +} + prepare ./zfs-inplace-rebalancing.sh $test_pool_data_path >> $log_std_file 2>> $log_error_file cat $log_std_file @@ -44,3 +58,21 @@ prepare ./zfs-inplace-rebalancing.sh --checksum false $test_pool_data_path >> $log_std_file 2>> $log_error_file cat $log_std_file assertions + +prepare +ln "$test_pool_data_path/projects/[2020] some project/mp4.txt" "$test_pool_data_path/projects/[2020] some project/mp4.txt.link" +./zfs-inplace-rebalancing.sh --skip-hardlinks false $test_pool_data_path >> $log_std_file 2>> $log_error_file +cat $log_std_file +# Both link files should be copied +assert_matching_file_copied "mp4.txt" +assert_matching_file_copied "mp4.txt.link" +assertions + +prepare +ln "$test_pool_data_path/projects/[2020] some project/mp4.txt" "$test_pool_data_path/projects/[2020] some project/mp4.txt.link" +./zfs-inplace-rebalancing.sh --skip-hardlinks true $test_pool_data_path >> $log_std_file 2>> $log_error_file +cat $log_std_file +# Neither file should be copied now, since they are each a hardlink +assert_matching_file_not_copied "mp4.txt.link" +assert_matching_file_not_copied "mp4.txt" +assertions diff --git a/zfs-inplace-rebalancing.sh b/zfs-inplace-rebalancing.sh index 05679db..66b8224 100755 --- a/zfs-inplace-rebalancing.sh +++ b/zfs-inplace-rebalancing.sh @@ -26,7 +26,7 @@ Cyan='\033[0;36m' # Cyan # print a help message function print_usage() { - echo "Usage: zfs-inplace-rebalancing --checksum true --passes 1 /my/pool" + echo "Usage: zfs-inplace-rebalancing --checksum true --skip-hardlinks false --passes 1 /my/pool" } # print a given text entirely in a given color @@ -56,6 +56,18 @@ function get_rebalance_count () { function rebalance () { file_path=$1 + # check if file has >=2 links in the case of --skip-hardlinks + # this shouldn't be needed in the typical case of `find` only finding files with links == 1 + # but this can run for a long time, so it's good to double check if something changed + if [[ "${skip_hardlinks_flag,,}" == "true"* ]]; 
then + hardlink_count=$(stat -c "%h" "${file_path}") + + if [ "${hardlink_count}" -ge 2 ]; then + echo "Skipping hard-linked file: ${file_path}" + return + fi + fi + current_index="$((current_index + 1))" progress_percent=$(echo "scale=2; ${current_index}*100/${file_count}" | bc) color_echo "${Cyan}" "Progress -- Files: ${current_index}/${file_count} (${progress_percent}%)" @@ -175,15 +187,20 @@ function rebalance () { } checksum_flag='true' +skip_hardlinks_flag='false' passes_flag='1' -if [ "$#" -eq 0 ]; then +if [[ "$#" -eq 0 ]]; then print_usage exit 0 fi while true ; do case "$1" in + -h | --help ) + print_usage + exit 0 + ;; -c | --checksum ) if [[ "$2" == 1 || "$2" =~ (on|true|yes) ]]; then checksum_flag="true" @@ -192,6 +209,14 @@ while true ; do fi shift 2 ;; + --skip-hardlinks ) + if [[ "$2" == 1 || "$2" =~ (on|true|yes) ]]; then + skip_hardlinks_flag="true" + else + skip_hardlinks_flag="false" + fi + shift 2 + ;; -p | --passes ) passes_flag=$2 shift 2 @@ -208,9 +233,15 @@ color_echo "$Cyan" "Start rebalancing:" color_echo "$Cyan" " Path: ${root_path}" color_echo "$Cyan" " Rebalancing Passes: ${passes_flag}" color_echo "$Cyan" " Use Checksum: ${checksum_flag}" +color_echo "$Cyan" " Skip Hardlinks: ${skip_hardlinks_flag}" # count files -file_count=$(find "${root_path}" -type f | wc -l) +if [[ "${skip_hardlinks_flag,,}" == "true"* ]]; then + file_count=$(find "${root_path}" -type f -links 1 | wc -l) +else + file_count=$(find "${root_path}" -type f | wc -l) +fi + color_echo "$Cyan" " File count: ${file_count}" # create db file @@ -219,7 +250,13 @@ if [ "${passes_flag}" -ge 1 ]; then fi # recursively scan through files and execute "rebalance" procedure -find "$root_path" -type f -print0 | while IFS= read -r -d '' file; do rebalance "$file"; done +# in the case of --skip-hardlinks, only find files with links == 1 +if [[ "${skip_hardlinks_flag,,}" == "true"* ]]; then + find "$root_path" -type f -links 1 -print0 | while IFS= read -r -d '' file; do rebalance 
"$file"; done +else + find "$root_path" -type f -print0 | while IFS= read -r -d '' file; do rebalance "$file"; done +fi + echo "" echo "" color_echo "$Green" "Done!"