Differences
This shows you the differences between two versions of the page.
Both sides previous revision Previous revision Next revision | Previous revision Next revision Both sides next revision | ||
parallel_rsync [20.02.2017 17:40] – Pascal Suter | parallel_rsync [20.05.2020 19:32] – [the code] Pascal Suter | ||
---|---|---|---|
Line 25: | Line 25: | ||
==== the code ==== | ==== the code ==== | ||
- | < | + | < |
# | # | ||
- | # Parallel Rsync function | + | # Parallel Rsync function |
+ | # documentation and explanation at http:// | ||
+ | # | ||
+ | # version 1: initial release in 2017 | ||
+ | # version 2: removed the need to escape filenames by using null delimiter + xargs to run commands such as mkdir and rsync, | ||
+ | # added ability to resume without rescanning (argument $5) and to skip already synced directories (argument $6) | ||
# | # | ||
Line 34: | Line 39: | ||
# $2 = destination | # $2 = destination | ||
# $3 = dirdepth | # $3 = dirdepth | ||
- | # $4 = numjobs | + | # $4 = numjobs |
+ | # $5 = dirlist file (optional) --> will allow to resume without re-scanning the entire directory structure | ||
+ | # $6 = progress log file (optional) --> will allow to skip previously synced directory when resuming with a dirlist file | ||
source=$1 | source=$1 | ||
destination=$2 | destination=$2 | ||
depth=$3 | depth=$3 | ||
threads=$4 | threads=$4 | ||
+ | dirlistfile=$5 | ||
+ | progressfile=$6 | ||
- | # gets directory listing from remote or local using ssh and file | + | # gets directory listing from remote or local using ssh and find | ||
dirlist(){ | dirlist(){ | ||
#$1 = path, $2 = maxdepth | #$1 = path, $2 = maxdepth | ||
Line 68: | Line 76: | ||
fi | fi | ||
} | } | ||
- | # escape wrapper function. will do a double escape if the source is remote, will do a single escape if source is local | ||
- | source_escape() { | ||
- | echo " | ||
- | if [ $? -eq 0 ];then | ||
- | escape | escape | ||
- | else | ||
- | escape | ||
- | fi | ||
- | } | ||
- | |||
- | #magic escape function. it is probably not yet complete but it can be expanded based on the last "final sync to double check" | ||
- | #file names that where not or wrongly escaped end up there. | ||
- | escape() { | ||
- | sed -e ' | ||
- | } | ||
- | |||
# generate a list of directories to sync | # generate a list of directories to sync | ||
- | rawfilelist=`dirlist $source $depth` | + | if [ -z " |
+ | rawfilelist=$(dirlist $source $depth) | ||
+ | else | ||
+ | # dirlist filename was passed check if it exists and load dirlist from there, otherwise create it and save the dirlist to the file | ||
+ | if [ -f $dirlistfile ]; then | ||
+ | rawfilelist=$(< | ||
+ | else | ||
+ | rawfilelist=$(dirlist $source $depth | tee $dirlistfile) | ||
+ | fi | ||
+ | fi | ||
# separate paths less than DIRDEPTH deep from the others, so that only the " | # separate paths less than DIRDEPTH deep from the others, so that only the " | ||
Line 99: | Line 100: | ||
remote=`echo " | remote=`echo " | ||
remotepath=${path: | remotepath=${path: | ||
- | remotepath=`echo " | + | echo |
- | ssh $remote "mkdir -p $remotepath" | + | |
else | else | ||
- | path=`echo " | + | echo |
- | mkdir -p $path | + | |
fi | fi | ||
Line 110: | Line 109: | ||
echo "Sync parents" | echo "Sync parents" | ||
echo " | echo " | ||
- | echo "$parentlist" | + | function PRS_syncParents(){ |
+ | source=$2 | ||
+ | destination=$3 | ||
+ | progressfile=$4 | ||
+ | if [ -n " | ||
+ | echo "skipping | ||
+ | else | ||
+ | echo -n -e "$1\0" | xargs -0 -I PPP rsync -aHvx --numeric-ids --relative -f '- PPP/ | ||
+ | status=$? | ||
+ | if [ -n " | ||
+ | echo " | ||
+ | fi | ||
+ | return $status | ||
+ | fi | ||
+ | } | ||
+ | export -f PRS_syncParents | ||
+ | echo " | ||
status=$? | status=$? | ||
if [ $status -gt 0 ]; then | if [ $status -gt 0 ]; then | ||
Line 118: | Line 133: | ||
return 1 | return 1 | ||
fi | fi | ||
- | |||
#sync leafs recursively | #sync leafs recursively | ||
echo " | echo " | ||
echo "Sync leafs recursively" | echo "Sync leafs recursively" | ||
echo " | echo " | ||
- | echo "$filelist" | + | function PRS_syncLeafs(){ |
+ | source=$2 | ||
+ | destination=$3 | ||
+ | progressfile=$4 | ||
+ | if [ -n " | ||
+ | echo "skipping | ||
+ | else | ||
+ | echo -n -e "$1\0" | xargs -0 -I PPP rsync -aHvx --relative --numeric-ids $source/ | ||
+ | status=$? | ||
+ | if [ -n " | ||
+ | echo " | ||
+ | fi | ||
+ | return $status | ||
+ | fi | ||
+ | } | ||
+ | export -f PRS_syncLeafs | ||
+ | echo " | ||
+ | status=$? | ||
if [ $? -gt 0 ]; then | if [ $? -gt 0 ]; then | ||
cat /tmp/debug | cat /tmp/debug | ||
rm /tmp/debug | rm /tmp/debug | ||
- | echo " | + | echo " |
return 1 | return 1 | ||
fi | fi | ||
+ | #exit # uncomment for debugging what happens before the final rsync | ||
#run a single thread rsync across the entire project directory | #run a single thread rsync across the entire project directory | ||
Line 141: | Line 173: | ||
return 1 | return 1 | ||
fi | fi | ||
+ | | ||
+ | exit # comment out if you want to really do the md5 sums, this may take very long! | ||
#create an md5 sum of the md5sums of all files of the entire project directory to compare it to the archive copy | #create an md5 sum of the md5sums of all files of the entire project directory to compare it to the archive copy | ||
Line 158: | Line 192: | ||
**Usage** | **Usage** | ||
you can run this function like so: | you can run this function like so: | ||
- | psync sourceHost:/ | + | |
- | this will copy the / | + | |
+ | this will copy the / | ||
**caution** this is a work in progress.. I am writing down my notes as I go! | **caution** this is a work in progress.. I am writing down my notes as I go! | ||
Line 166: | Line 201: | ||
here is how I did it when I needed to copy 40 TB of data from one raidset to another while the server was still online, serving files to everybody in the company:
+ | |||
+ | ==== testing ==== | ||
+ | to test this script when modifying, I use a simple test-dataset which I extract to ''/ | ||
+ | prsync.sh /tmp/source /tmp/target 3 1 / | ||
+ | to compare the resulting structure I use diff: | ||
+ | diff <(find source/|sed -e ' | ||
+ | and to delete the temporary files and target folder in order to re-run a fresh sync I run | ||
+ | rm -rf / | ||
===== Before we get started ===== | ||