parallel_rsync

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revisionBoth sides next revision
parallel_rsync [24.04.2018 18:33] – [the code] Pascal Suterparallel_rsync [20.05.2020 19:32] – [the code] Pascal Suter
Line 27: Line 27:
 <code bash prsync.sh> <code bash prsync.sh>
 # #
-# Parallel Rsync function 2017 by Pascal Suter @ DALCO AG, Switzerland+# Parallel Rsync function 2020 by Pascal Suter @ DALCO AG, Switzerland 
 +# documentation and explanation at http://wiki.psuter.ch/doku.php?id=parallel_rsync 
 +
 +# version 1: initial release in 2017 
 +# version 2: removed the need to escape filenames by using null delimiter + xargs to run commands such as mkdir and rsync,  
 +#            added ability to resume without rescanning (argument $5) and to skip already synced directories (argument $6)
 # #
  
Line 34: Line 39:
  # $2 = destination  # $2 = destination
  # $3 = dirdepth  # $3 = dirdepth
- # $4 = numjobs  + # $4 = numjobs 
 + # $5 = dirlist file (optional) --> will allow to resume without re-scanning the entire directory structure 
 +    # $6 = progress log file (optional) --> will allow to skip previously synced directory when resuming with a dirlist file
  source=$1  source=$1
  destination=$2  destination=$2
  depth=$3  depth=$3
  threads=$4  threads=$4
 + dirlistfile=$5
 + progressfile=$6
   
- # gets directory listing from remote or local using ssh and file+ # gets directory listing from remote or local using ssh and find
  dirlist(){  dirlist(){
  #$1 = path, $2 = maxdepth  #$1 = path, $2 = maxdepth
Line 68: Line 76:
  fi  fi
  }  }
- # escape wrapper function. will do a double escape if the source is remote, will do a single escape if source is local 
- source_escape() { 
- echo "$source" | grep -P "^[^@]*@[^:]*:" > /dev/null 
- if [ $? -eq 0 ];then 
- escape | escape  
- else  
- escape 
- fi 
- } 
-  
- #magic escape function. it is probably not yet complete but it can be expanded based on the last "final sync to double check" 
-#file names that were not or wrongly escaped end up there. 
- escape() { 
- sed -e 's/\\/\\\\/g' -e 's/ /\\ /g' -e 's/\$/\\\$/g' -e 's/:/\\:/g' -e 's/(/\\(/g' -e 's/)/\\)/g' -e 's/"/\\"/g' -e "s/'/\\\\'/g" -e 's/|/\\|/g' 
- } 
- 
  
  # generate a list of directories to sync   # generate a list of directories to sync 
- rawfilelist=`dirlist $source $depth`+ if [ -z "$dirlistfile" ]; then 
 + rawfilelist=$(dirlist $source $depth
 + else  
 + # dirlist filename was passed check if it exists and load dirlist from there, otherwise create it and save the dirlist to the file 
 + if [ -f $dirlistfile ]; then  
 + rawfilelist=$(<$dirlistfile) 
 + else  
 + rawfilelist=$(dirlist $source $depth | tee $dirlistfile) 
 + fi  
 + fi
  
  # separate paths less than DIRDEPTH deep from the others, so that only the "leafs" get rsynced recursively, the rest is synced without recursion  # separate paths less than DIRDEPTH deep from the others, so that only the "leafs" get rsynced recursively, the rest is synced without recursion
Line 99: Line 100:
  remote=`echo "$path" | awk -F : '{print $1}'`  remote=`echo "$path" | awk -F : '{print $1}'`
  remotepath=${path:$((${#remote}+1))}  remotepath=${path:$((${#remote}+1))}
- remotepath=`echo "$remotepath"escape | escape` + echo -n -e "$remotepath\0" | ssh $remote "xargs -0 mkdir -p"
- ssh $remote "mkdir -p $remotepath"+
  else   else 
- path=`echo "$path"escape` + echo -n -e "$path\0" | xargs -0 mkdir -p
- mkdir -p $path+
  fi  fi
   
Line 110: Line 109:
  echo "Sync parents"  echo "Sync parents"
  echo "==========================================================================="  echo "==========================================================================="
- echo "$parentlist| source_escape | xargs -P $threads -I PPP rsync -aHvx --numeric-ids --relative -f '- PPP/*/' $source/./'PPP'/ $destination/ 2>/tmp/debug+ function PRS_syncParents(){ 
 + source=$2 
 + destination=$3 
 + progressfile=$4 
 + if [ -n "$progressfile" ] && grep -q -x -F "$1" $progressfile ; then 
 + echo "skipping $1 because it was synced before according to $progressfile" 
 + else 
 + echo -n -e "$1\0" | xargs --I PPP rsync -aHvx --numeric-ids --relative -f '- PPP/*/' $source/./'PPP'/ $destination/ 2>/tmp/debug 
 + status=$? 
 + if [ -n "$progressfile" ]; then  
 + echo "$1" >> "$progressfile" 
 + fi 
 + return $status 
 + fi 
 +
 + export -f PRS_syncParents 
 + echo "$parentlist" | tr \\n \\0 | xargs -0 -P $threads -I PPP /bin/bash -c 'PRS_syncParents "$@"' _ PPP "$source" "$destination" "$progressfile"
  status=$?  status=$?
  if [ $status -gt 0 ]; then   if [ $status -gt 0 ]; then 
Line 118: Line 133:
  return 1  return 1
  fi  fi
- 
  #sync leafs recursively  #sync leafs recursively
  echo "==========================================================================="  echo "==========================================================================="
  echo "Sync leafs recursively"  echo "Sync leafs recursively"
  echo "==========================================================================="  echo "==========================================================================="
- echo "$filelist| source_escape | xargs -P $threads -I PPP rsync -aHvx --relative --numeric-ids $source/./'PPP' $destination/ 2>/tmp/debug+ function PRS_syncLeafs(){ 
 + source=$2 
 + destination=$3 
 + progressfile=$4 
 + if [ -n "$progressfile" ] && grep -q -x -F "$1" $progressfile ; then 
 + echo "skipping $1 because it was synced before according to $progressfile" 
 + else 
 + echo -n -e "$1\0" | xargs --I PPP rsync -aHvx --relative --numeric-ids $source/./'PPP' $destination/ 2>/tmp/debug 
 + status=$? 
 + if [ -n "$progressfile" ]; then  
 + echo "$1" >> "$progressfile" 
 + fi 
 + return $status 
 + fi 
 +
 + export -f PRS_syncLeafs 
 + echo "$filelist" | tr \\n \\0 | xargs -0 -P $threads -I PPP /bin/bash -c 'PRS_syncLeafs "$@"' _ PPP "$source" "$destination" "$progressfile" 
 + status=$?
  if [ $? -gt 0 ]; then   if [ $? -gt 0 ]; then 
  cat /tmp/debug  cat /tmp/debug
  rm /tmp/debug  rm /tmp/debug
- echo "ERROR: there was an error when syncing the leaf directories recursively, check messages and try again"+ echo "ERROR: there was an error while syncing the leaf directories recursively, check messages and try again"
  return 1  return 1
  fi  fi
 +    #exit # uncomment for debugging what happens before the final rsync
  
  #run a single thread rsync across the entire project directory  #run a single thread rsync across the entire project directory
Line 141: Line 173:
  return 1  return 1
  fi  fi
 +    
 + exit # comment out if you want to really do the md5 sums, this may take very long! 
  
  #create an md5 sum of the md5sums of all files of the entire project directory to compare it to the archive copy  #create an md5 sum of the md5sums of all files of the entire project directory to compare it to the archive copy
Line 158: Line 192:
 **Usage**  **Usage** 
 you can run this function like so:  you can run this function like so: 
-  psync sourceHost:/source/directory target/destination 5 8 +  source prsync.sh 
-this will copy the /source/directory to /target/destination and it will dive 5 directory levels deep to parallelize rsyncs. it will run 8 rsync processes in parallel. +  psync sourceHost:/source/directory target/destination 5 8 /tmp/dirlist /tmp/progressfile 
 +this will copy the /source/directory to /target/destination and it will dive 5 directory levels deep to parallelize rsyncs. it will run 8 rsync processes in parallel. with the optional ''dirlist'' and ''progressfile'' files, it will track its progress and skip all directories it has already rsynced when re-running it in case of an interrupted previous run.
  
 **caution** this is a work in progress.. I am writing down my notes as I go!  **caution** this is a work in progress.. I am writing down my notes as I go! 
Line 166: Line 201:
  
 here is, how i did it when i needed to copy 40 TB of data from one raidset to another while the server was still online serving files to everybody in the company:  here is, how i did it when i needed to copy 40 TB of data from one raidset to another while the server was still online serving files to everybody in the company: 
 +
 +==== testing ====
 +to test this script when modifying, I use a simple test-dataset which I extract to ''/tmp/''. I then uncomment the "exit" statement before the "final sync to doublecheck" and run the script like so: 
 +  prsync.sh /tmp/source /tmp/target 3 1 /tmp/testdirlist /tmp/progressfile
 +to compare the resulting structure I use diff: 
 +  diff <(find source/|sed -e 's/source//' | sort ) <(find target/ | sed -e 's/target//' | sort)
 +and to delete the temporary files and target folder in order to re-run a fresh sync I run 
 +  rm -rf /tmp/target/* /tmp/testdirlist /tmp/progressfile
  
 ===== Before we get started ===== ===== Before we get started =====
  • parallel_rsync.txt
  • Last modified: 20.05.2020 19:44
  • by Pascal Suter