Compare commits

..

1 Commit

Author: Markus Ressel
SHA1: 4ad46ae83a
Message: -F, --fixed-strings PATTERNS are strings; -x, --line-regexp match only whole lines
Date: 2022-03-29 03:40:06 +02:00
14 changed files with 17 additions and 306 deletions


@@ -1,9 +0,0 @@
version: 2
updates:
- package-ecosystem: github-actions
directory: "/"
schedule:
# Check for updates to GitHub Actions every week
interval: "weekly"

.github/stale.yml

@@ -1,59 +0,0 @@
# Configuration for probot-stale - https://github.com/probot/stale
# Number of days of inactivity before an Issue or Pull Request becomes stale
daysUntilStale: 60
# Number of days of inactivity before an Issue or Pull Request with the stale label is closed.
# Set to false to disable. If disabled, issues still need to be closed manually, but will remain marked as stale.
daysUntilClose: 14
# Issues or Pull Requests with these labels will never be considered stale. Set to `[]` to disable
exemptLabels:
- pinned
- security
- bug
- enhancement
# Set to true to ignore issues in a project (defaults to false)
exemptProjects: false
# Set to true to ignore issues in a milestone (defaults to false)
exemptMilestones: false
# Set to true to ignore issues with an assignee (defaults to false)
exemptAssignees: true
# Label to use when marking as stale
staleLabel: wontfix
# Comment to post when marking as stale. Set to `false` to disable
markComment: >
This issue has been automatically marked as stale because it has not had
recent activity. It will be closed if no further activity occurs. Thank you
for your contributions.
# Comment to post when removing the stale label.
# unmarkComment: >
# Your comment here.
# Comment to post when closing a stale Issue or Pull Request.
closeComment: >
There has been no incentive by contributors or maintainers to revive this stale issue and it will now be closed.
# Limit the number of actions per hour, from 1-30. Default is 30
limitPerRun: 30
# Limit to only `issues` or `pulls`
only: issues
# Optionally, specify configuration settings that are specific to just 'issues' or 'pulls':
# pulls:
# daysUntilStale: 30
# markComment: >
# This pull request has been automatically marked as stale because it has not had
# recent activity. It will be closed if no further activity occurs. Thank you
# for your contributions.
# issues:
# exemptLabels:
# - confirmed


@@ -1,44 +0,0 @@
name: Docker latest
on:
push:
branches: [ master ]
env:
REGISTRY: ghcr.io
jobs:
build:
runs-on: ubuntu-latest
permissions:
contents: read
packages: write
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Log into registry ${{ env.REGISTRY }}
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
- name: Extract Docker metadata
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ github.repository }}
tags: |
type=raw,value=latest
- name: Build and push Docker image
id: build-and-push
uses: docker/build-push-action@v5
with:
context: .
file: Dockerfile
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}


@@ -15,6 +15,6 @@ jobs:
name: Shellcheck
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v2
- name: Run ShellCheck
uses: ludeeus/action-shellcheck@master
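For reference, the same check this workflow runs can be reproduced locally with the ShellCheck CLI (assuming it is installed):
```shell
# lint the rebalancing script locally, mirroring the CI step above
shellcheck zfs-inplace-rebalancing.sh
```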


@@ -1,19 +0,0 @@
# Test
name: Test
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
shellcheck:
name: Test
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Run testing script
run: ./testing.sh

.gitignore

@@ -1,4 +1 @@
test.log
error.log
rebalance_db.txt
testing_data
rebalance_db.txt


@@ -1,10 +0,0 @@
FROM phusion/baseimage:jammy-1.0.1
MAINTAINER markusressel
RUN apt-get update \
&& apt-get -y install bc \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
COPY zfs-inplace-rebalancing.sh ./
ENTRYPOINT ["./zfs-inplace-rebalancing.sh"]


@@ -5,11 +5,7 @@ Simple bash script to rebalance pool data between all mirrors when adding vdevs
## How it works
This script recursively traverses all the files in a given directory. Each file is copied with a `.balance` suffix, retaining all file attributes. The original is then deleted and the *copy* is renamed back to the name of the original file. When copying a file ZFS will spread the data blocks across all vdevs, effectively distributing/rebalancing the data of the original file (more or less) evenly. This allows the pool data to be rebalanced without the need for a separate backup pool/drive.
The way ZFS distributes writes is not trivial, which makes it hard to predict how effective the redistribution will be. See:
- https://jrs-s.net/2018/04/11/zfs-allocates-writes-according-to-free-space-per-vdev-not-latency-per-vdev/
- https://jrs-s.net/2018/08/24/zfs-write-allocation-in-0-7-x/
This script recursively traverses all the files in a given directory. Each file is copied with a `.rebalance` suffix, retaining all file attributes. The original is then deleted and the *copy* is renamed back to the name of the original file. When copying a file ZFS will spread the data blocks across all vdevs, effectively distributing/rebalancing the data of the original file (more or less) evenly. This allows the pool data to be rebalanced without the need for a separate backup pool/drive.
Note that this process is not entirely "in-place", since a file has to be fully copied before the original is deleted. The term is used to make it clear that no additional pool (and therefore hardware) is necessary to use this script. However, this also means that you have to have enough space to create a copy of the biggest file in your target directory for it to work.
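As a rough sketch, the per-file operation described above amounts to something like the following (paths are illustrative; the real script adds existence checks, optional checksumming, a pass counter and OS-specific copy flags):
```shell
# copy to a temporary name, preserving attributes (--reflink=never forces a real copy on Linux)
cp --reflink=never -adxp "/pool/data/somefile" "/pool/data/somefile.balance"
# remove the original and rename the copy back to the original name
rm "/pool/data/somefile"
mv "/pool/data/somefile.balance" "/pool/data/somefile"
```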
@@ -70,7 +66,7 @@ Due to the working principle of this script, it is crucial that you **only run i
### Snapshots
If you do a snapshot of the data you want to balance before starting the rebalancing script, keep in mind that ZFS now has to keep track of all of the data in the target directory twice. Once in the snapshot you made, and once for the new copy. This means that you will effectively use double the file size of all files within the target directory. Therefore it is a good idea to process the pool data in batches and remove old snapshots along the way, since you probably will be hitting the capacity limits of your pool at some point during the rebalancing process.
If you do a snapshot of the data you want to balance before starting the rebalancing script, keep in mind that ZFS now has to keep track of all of the data in the target directory twice. Once in the snapshot you made, and once for the new copy. This means that you will effectively use double the file size of all files within the target directory. Therefore it is a good idea to process the pool data in badges and remove old snapshots along the way, since you probably will be hitting the capacity limits of your pool at some point during the rebalancing process.
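For example, after finishing one batch you could remove the snapshot you took for it before moving on to the next batch (dataset and snapshot names are placeholders):
```shell
# free the space held by the pre-rebalance snapshot of the finished batch
zfs destroy tank/data@before-rebalance-batch1
```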
## Installation
@@ -82,7 +78,7 @@ chmod +x ./zfs-inplace-rebalancing.sh
```
Dependencies:
* `perl` - it should be available on most systems by default
* `pacman -S bc` - used for percentage calculation
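The `bc` dependency is only used to compute the progress percentage, roughly along these lines (numbers are illustrative):
```shell
# 42 of 1337 files processed -> prints 3.14
echo "scale=2; 42*100/1337" | bc
```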
## Usage
@@ -90,7 +86,7 @@ Dependencies:
You can print a help message by running the script without any parameters:
```shell
```
./zfs-inplace-rebalancing.sh
```
@@ -100,52 +96,31 @@ You can print a help message by running the script without any parameters:
|-----------|-------------|---------|
| `-c`<br>`--checksum` | Whether to compare attributes and content of the copied file using an **MD5** checksum. Technically this is a redundant check and consumes a lot of resources, so think twice. | `true` |
| `-p`<br>`--passes` | The maximum number of rebalance passes per file. Setting this to infinity by using a value `<= 0` might improve performance when rebalancing a lot of small files. | `1` |
| `--skip-hardlinks` | Skip rebalancing hardlinked files, since it will only create duplicate data. | `false` |
### Example
Make sure to run this script with a user that has rw permission to all of the files in the target directory.
The easiest way to achieve this is by **running the script as root**.
```shell
```
sudo su
./zfs-inplace-rebalancing.sh --checksum true --passes 1 /pool/path/to/rebalance
```
To keep track of the balancing progress, you can open another terminal and run:
```shell
watch zpool list -v
```
### Log to File
To write the output to a file, simply redirect stdout and stderr to a file (or separate files).
Since this redirects all output, you will have to follow the contents of the log files to get realtime info:
```shell
# one shell window:
tail -F ./stdout.log
# another shell window:
./zfs-inplace-rebalancing.sh /pool/path/to/rebalance >> ./stdout.log 2>> ./stderr.log
watch zpool list -v
```
### Things to consider
Although this script **does** have a progress output (files as well as percentage), it might be a good idea to try a small subfolder first, or process your pool folder layout in manually selected batches. This can also limit the damage done, if anything bad happens.
When aborting the script midway through, be sure to check the last lines of its output. When cancelling before or during the renaming process a ".balance" file might be left and you have to rename (or delete) it manually.
When aborting the script midway through, be sure to check the last lines of its output. When cancelling before or during the renaming process a ".rebalance" file might be left and you have to rename (or delete) it manually.
Although the `--passes` parameter can be used to limit the maximum number of rebalance passes per file, it is only meant to speed up aborted runs. Individual files will **not be processed multiple times automatically**. To reach multiple passes you have to run the script on the same target directory multiple times.
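For example, assuming the pass counter in `rebalance_db.txt` works as described, two effective passes could be achieved by invoking the script twice with the same limit (path is illustrative):
```shell
./zfs-inplace-rebalancing.sh --passes 2 /pool/path/to/rebalance  # first run: pass 1
./zfs-inplace-rebalancing.sh --passes 2 /pool/path/to/rebalance  # second run: pass 2
```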
### Dockerfile
To increase portability, this script can also be run using docker:
```shell
sudo docker run --rm -it -v /your/data:/data ghcr.io/markusressel/zfs-inplace-rebalancing:latest ./data
```
# Contributing
GitHub is for social coding: if you want to write code, I encourage contributions through pull requests from forks


@@ -1 +0,0 @@
test


@@ -1 +0,0 @@
test


@@ -1 +0,0 @@
test


@@ -1,78 +0,0 @@
#!/usr/bin/env bash
# exit script on error
set -e
# exit on undeclared variable
set -u
log_std_file=./test.log
log_error_file=./error.log
test_data_src=./test/pool
test_pool_data_path=./testing_data
function prepare() {
# cleanup
rm -f $log_std_file
rm -f $log_error_file
rm -f rebalance_db.txt
rm -rf $test_pool_data_path
# setup
cp -rf $test_data_src $test_pool_data_path
}
function assertions() {
# check error log is empty
if grep -q '[^[:space:]]' $log_error_file; then
echo "error log is not empty!"
cat $log_error_file
exit 1
fi
}
function assert_matching_file_copied() {
if ! grep "Copying" $log_std_file | grep -q "$1"; then
echo "File matching '$1' was not copied when it should have been!"
exit 1
fi
}
function assert_matching_file_not_copied() {
if grep "Copying" $log_std_file | grep -q "$1"; then
echo "File matching '$1' was copied when it should have been skipped!"
exit 1
fi
}
prepare
./zfs-inplace-rebalancing.sh $test_pool_data_path >> $log_std_file 2>> $log_error_file
cat $log_std_file
assertions
prepare
./zfs-inplace-rebalancing.sh --checksum true --passes 1 $test_pool_data_path >> $log_std_file 2>> $log_error_file
cat $log_std_file
assertions
prepare
./zfs-inplace-rebalancing.sh --checksum false $test_pool_data_path >> $log_std_file 2>> $log_error_file
cat $log_std_file
assertions
prepare
ln "$test_pool_data_path/projects/[2020] some project/mp4.txt" "$test_pool_data_path/projects/[2020] some project/mp4.txt.link"
./zfs-inplace-rebalancing.sh --skip-hardlinks false $test_pool_data_path >> $log_std_file 2>> $log_error_file
cat $log_std_file
# Both link files should be copied
assert_matching_file_copied "mp4.txt"
assert_matching_file_copied "mp4.txt.link"
assertions
prepare
ln "$test_pool_data_path/projects/[2020] some project/mp4.txt" "$test_pool_data_path/projects/[2020] some project/mp4.txt.link"
./zfs-inplace-rebalancing.sh --skip-hardlinks true $test_pool_data_path >> $log_std_file 2>> $log_error_file
cat $log_std_file
# Neither file should be copied now, since they are each a hardlink
assert_matching_file_not_copied "mp4.txt.link"
assert_matching_file_not_copied "mp4.txt"
assertions


@@ -26,7 +26,7 @@ Cyan='\033[0;36m' # Cyan
# print a help message
function print_usage() {
echo "Usage: zfs-inplace-rebalancing --checksum true --skip-hardlinks false --passes 1 /my/pool"
echo "Usage: zfs-inplace-rebalancing -checksum true -passes 1 /my/pool"
}
# print a given text entirely in a given color
@@ -56,20 +56,8 @@ function get_rebalance_count () {
function rebalance () {
file_path=$1
# check if file has >=2 links in the case of --skip-hardlinks
# this shouldn't be needed in the typical case of `find` only finding files with links == 1
# but this can run for a long time, so it's good to double check if something changed
if [[ "${skip_hardlinks_flag,,}" == "true"* ]]; then
hardlink_count=$(stat -c "%h" "${file_path}")
if [ "${hardlink_count}" -ge 2 ]; then
echo "Skipping hard-linked file: ${file_path}"
return
fi
fi
current_index="$((current_index + 1))"
progress_percent=$(perl -e "printf('%0.2f', ${current_index}*100/${file_count})")
progress_percent=$(echo "scale=2; ${current_index}*100/${file_count}" | bc)
color_echo "${Cyan}" "Progress -- Files: ${current_index}/${file_count} (${progress_percent}%)"
if [[ ! -f "${file_path}" ]]; then
@@ -92,12 +80,11 @@ function rebalance () {
if [[ "${OSTYPE,,}" == "linux-gnu"* ]]; then
# Linux
# --reflink=never -- force standard copy (see ZFS Block Cloning)
# -a -- keep attributes
# -d -- keep symlinks (dont copy target)
# -x -- stay on one system
# -p -- preserve ACLs too
cp --reflink=never -adxp "${file_path}" "${tmp_file_path}"
cp -adxp "${file_path}" "${tmp_file_path}"
elif [[ "${OSTYPE,,}" == "darwin"* ]] || [[ "${OSTYPE,,}" == "freebsd"* ]]; then
# Mac OS
# FreeBSD
@@ -188,36 +175,23 @@ function rebalance () {
}
checksum_flag='true'
skip_hardlinks_flag='false'
passes_flag='1'
if [[ "$#" -eq 0 ]]; then
if [ "$#" -eq 0 ]; then
print_usage
exit 0
fi
while true ; do
case "$1" in
-h | --help )
print_usage
exit 0
;;
-c | --checksum )
if [[ "$2" == 1 || "$2" =~ (on|true|yes) ]]; then
if [ "$2" -eq 1 ] || [[ "$2" =~ (on|true|yes) ]]; then
checksum_flag="true"
else
checksum_flag="false"
fi
shift 2
;;
--skip-hardlinks )
if [[ "$2" == 1 || "$2" =~ (on|true|yes) ]]; then
skip_hardlinks_flag="true"
else
skip_hardlinks_flag="false"
fi
shift 2
;;
-p | --passes )
passes_flag=$2
shift 2
@@ -230,19 +204,13 @@ done;
root_path=$1
color_echo "$Cyan" "Start rebalancing $(date):"
color_echo "$Cyan" "Start rebalancing:"
color_echo "$Cyan" " Path: ${root_path}"
color_echo "$Cyan" " Rebalancing Passes: ${passes_flag}"
color_echo "$Cyan" " Use Checksum: ${checksum_flag}"
color_echo "$Cyan" " Skip Hardlinks: ${skip_hardlinks_flag}"
# count files
if [[ "${skip_hardlinks_flag,,}" == "true"* ]]; then
file_count=$(find "${root_path}" -type f -links 1 | wc -l)
else
file_count=$(find "${root_path}" -type f | wc -l)
fi
file_count=$(find "${root_path}" -type f | wc -l)
color_echo "$Cyan" " File count: ${file_count}"
# create db file
@@ -251,13 +219,7 @@ if [ "${passes_flag}" -ge 1 ]; then
fi
# recursively scan through files and execute "rebalance" procedure
# in the case of --skip-hardlinks, only find files with links == 1
if [[ "${skip_hardlinks_flag,,}" == "true"* ]]; then
find "$root_path" -type f -links 1 -print0 | while IFS= read -r -d '' file; do rebalance "$file"; done
else
find "$root_path" -type f -print0 | while IFS= read -r -d '' file; do rebalance "$file"; done
fi
find "$root_path" -type f -print0 | while IFS= read -r -d '' file; do rebalance "$file"; done
echo ""
echo ""
color_echo "$Green" "Done!"