parallel_rsync

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
Next revision
Previous revision
Next revisionBoth sides next revision
parallel_rsync [20.02.2017 17:40] Pascal Suterparallel_rsync [20.05.2020 19:32] – [the code] Pascal Suter
Line 25: Line 25:
 ==== the code ==== ==== the code ====
  
-<code>+<code bash prsync.sh>
 # #
-# Parallel Rsync function 2017 by Pascal Suter @ DALCO AG, Switzerland+# Parallel Rsync function 2020 by Pascal Suter @ DALCO AG, Switzerland 
 +# documentation and explanation at http://wiki.psuter.ch/doku.php?id=parallel_rsync 
 +
 +# version 1: initial release in 2017 
 +# version 2: removed the need to escape filenames by using null delimiter + xargs to run commands such as mkdir and rsync,  
 +#            added ability to resume without rescanning (argument $5) and to skip already synced directories (argument $6)
 # #
  
Line 34: Line 39:
  # $2 = destination  # $2 = destination
  # $3 = dirdepth  # $3 = dirdepth
- # $4 = numjobs  + # $4 = numjobs 
 + # $5 = dirlist file (optional) --> will allow to resume without re-scanning the entire directory structure 
 +    # $6 = progress log file (optional) --> will allow to skip previously synced directory when resuming with a dirlist file
  source=$1  source=$1
  destination=$2  destination=$2
  depth=$3  depth=$3
  threads=$4  threads=$4
 + dirlistfile=$5
 + progressfile=$6
   
- # gets directory listing from remote or local using ssh and file+ # gets directory listing from remote or local using ssh and find
  dirlist(){  dirlist(){
  #$1 = path, $2 = maxdepth  #$1 = path, $2 = maxdepth
Line 68: Line 76:
  fi  fi
  }  }
- # escape wrapper function. will do a double escape if the source is remote, will do a single escape if source is local 
- source_escape() { 
- echo "$source" | grep -P "^[^@]*@[^:]*:" > /dev/null 
- if [ $? -eq 0 ];then 
- escape | escape  
- else  
- escape 
- fi 
- } 
-  
- #magic escape function. it is probably not yet complete but it can be expanded based on the last "final sync to double check" 
- #file names that where not or wrongly escaped end up there.  
- escape() { 
- sed -e 's/\\/\\\\/g' -e 's/ /\\ /g' -e 's/\$/\\\$/g' -e 's/:/\\:/g' -e 's/(/\\(/g' -e 's/)/\\)/g' -e 's/"/\\"/g' -e "s/'/\\\\'/g" -e 's/|/\\|/g' 
- } 
- 
  
  # generate a list of directories to sync   # generate a list of directories to sync 
- rawfilelist=`dirlist $source $depth`+ if [ -z "$dirlistfile" ]; then 
 + rawfilelist=$(dirlist $source $depth
 + else  
 + # dirlist filename was passed check if it exists and load dirlist from there, otherwise create it and save the dirlist to the file 
 + if [ -f $dirlistfile ]; then  
 + rawfilelist=$(<$dirlistfile) 
 + else  
 + rawfilelist=$(dirlist $source $depth | tee $dirlistfile) 
 + fi  
 + fi
  
  # separate paths less than DIRDEPTH deep from the others, so that only the "leafs" get rsynced recursively, the rest is synced without recursion  # separate paths less than DIRDEPTH deep from the others, so that only the "leafs" get rsynced recursively, the rest is synced without recursion
Line 99: Line 100:
  remote=`echo "$path" | awk -F : '{print $1}'`  remote=`echo "$path" | awk -F : '{print $1}'`
  remotepath=${path:$((${#remote}+1))}  remotepath=${path:$((${#remote}+1))}
- remotepath=`echo "$remotepath"escape | escape` + echo -n -e "$remotepath\0" | ssh $remote "xargs -0 mkdir -p"
- ssh $remote "mkdir -p $remotepath"+
  else   else 
- path=`echo "$path"escape` + echo -n -e "$path\0" | xargs -0 mkdir -p
- mkdir -p $path+
  fi  fi
   
Line 110: Line 109:
  echo "Sync parents"  echo "Sync parents"
  echo "==========================================================================="  echo "==========================================================================="
- echo "$parentlist| source_escape | xargs -P $threads -I PPP rsync -aHvx --numeric-ids --relative -f '- PPP/*/' $source/./'PPP'/ $destination/ 2>/tmp/debug+ function PRS_syncParents(){ 
 + source=$2 
 + destination=$3 
 + progressfile=$4 
 + if [ -n "$progressfile" ] && grep -q -x -F "$1" $progressfile ; then 
 + echo "skipping $1 because it was synced before according to $progressfile" 
 + else 
 + echo -n -e "$1\0" | xargs --I PPP rsync -aHvx --numeric-ids --relative -f '- PPP/*/' $source/./'PPP'/ $destination/ 2>/tmp/debug 
 + status=$? 
 + if [ -n "$progressfile" ]; then  
 + echo "$1" >> "$progressfile" 
 + fi 
 + return $status 
 + fi 
 +
 + export -f PRS_syncParents 
 + echo "$parentlist" | tr \\n \\0 | xargs -0 -P $threads -I PPP /bin/bash -c 'PRS_syncParents "$@"' _ PPP "$source" "$destination" "$progressfile"
  status=$?  status=$?
  if [ $status -gt 0 ]; then   if [ $status -gt 0 ]; then 
Line 118: Line 133:
  return 1  return 1
  fi  fi
- 
  #sync leafs recursively  #sync leafs recursively
  echo "==========================================================================="  echo "==========================================================================="
  echo "Sync leafs recursively"  echo "Sync leafs recursively"
  echo "==========================================================================="  echo "==========================================================================="
- echo "$filelist| source_escape | xargs -P $threads -I PPP rsync -aHvx --relative --numeric-ids $source/./'PPP' $destination/ 2>/tmp/debug+ function PRS_syncLeafs(){ 
 + source=$2 
 + destination=$3 
 + progressfile=$4 
 + if [ -n "$progressfile" ] && grep -q -x -F "$1" $progressfile ; then 
 + echo "skipping $1 because it was synced before according to $progressfile" 
 + else 
 + echo -n -e "$1\0" | xargs --I PPP rsync -aHvx --relative --numeric-ids $source/./'PPP' $destination/ 2>/tmp/debug 
 + status=$? 
 + if [ -n "$progressfile" ]; then  
 + echo "$1" >> "$progressfile" 
 + fi 
 + return $status 
 + fi 
 +
 + export -f PRS_syncLeafs 
 + echo "$filelist" | tr \\n \\0 | xargs -0 -P $threads -I PPP /bin/bash -c 'PRS_syncLeafs "$@"' _ PPP "$source" "$destination" "$progressfile" 
 + status=$?
  if [ $? -gt 0 ]; then   if [ $? -gt 0 ]; then 
  cat /tmp/debug  cat /tmp/debug
  rm /tmp/debug  rm /tmp/debug
- echo "ERROR: there was an error when syncing the leaf directories recursively, check messages and try again"+ echo "ERROR: there was an error while syncing the leaf directories recursively, check messages and try again"
  return 1  return 1
  fi  fi
 +    #exit # uncomment for debugging what happens before the final rsync
  
  #run a single thread rsync across the entire project directory  #run a single thread rsync across the entire project directory
Line 141: Line 173:
  return 1  return 1
  fi  fi
 +    
 + exit # comment out if you want to really do the md5 sums, this may take very long! 
  
  #create an md5 sum of the md5sums of all files of the entire project directory to compare it to the archive copy  #create an md5 sum of the md5sums of all files of the entire project directory to compare it to the archive copy
Line 158: Line 192:
 **Usage**  **Usage** 
 you can run this function like so:  you can run this function like so: 
-  psync sourceHost:/source/directory target/destination 5 8 +  source prsync.sh 
-this will copy the /source/directory to /target/destination and it will dive 5 directory levels deep to parallelize rsyncs. it will run 8 rsync processes in parallel. +  psync sourceHost:/source/directory target/destination 5 8 /tmp/dirlist /tmp/progressfile 
 +this will copy the /source/directory to /target/destination and it will dive 5 directory levels deep to parallelize rsyncs. it will run 8 rsync processes in parallel. with the optional ''dirlist'' and ''progressfile'' files, it will track its progress and skip all directories it has already rsynced when re-running it in case of an interrupted previous run.
  
 **caution** this is a work in progress.. I am writing down my notes as I go!  **caution** this is a work in progress.. I am writing down my notes as I go! 
Line 166: Line 201:
  
 here is, how i did it when i needed to copy 40 TB of data from one raidset to another while the server was still online serving files to everybody in the company:  here is, how i did it when i needed to copy 40 TB of data from one raidset to another while the server was still online serving files to everybody in the company: 
 +
 +==== testing ====
 +to test this script when modifying, I use a simple test-dataset which I extract to ''/tmp/''. I then uncomment the "exit" statement before the "final sync to doublecheck" and run the script like so: 
 +  prsync.sh /tmp/source /tmp/target 3 1 /tmp/testdirlist /tmp/progressfile
 +to compare the resulting structure i use diff: 
 +  diff <(find source/|sed -e 's/source//' | sort ) <(find target/ | sed -e 's/target//' | sort)
 +and to delete the temporary files and target folder in order to re-run a fresh sync i run 
 +  rm -rf /tmp/target/* /tmp/testdirlist /tmp/progressfile
  
 ===== Before we get started ===== ===== Before we get started =====
  • parallel_rsync.txt
  • Last modified: 20.05.2020 19:44
  • by Pascal Suter