Recently I quickly needed a tool that compares two directories file by file and deletes duplicate copies on one side. It checks the source directory against the copy directory file by file. Each file is compared by inode, size and md5sum. If the source file and the copy file have the same MD5 hash value (but not the same inode), the copy can be deleted with the -d parameter or moved with the -m parameter.
At the beginning I found fslint, which was too fancy for this task. And at the end of this blog post I found fdupes, which seems to do exactly what I needed. I love YAGNI programming.
#!/bin/sh
#
# dupremover - Recursive Duplicate Remover
#
# (c) 2011 by Sebastian Felis
#
# You are free to use, modify, redistribute without warranty while keeping it
# free of charge. It would be nice to get a feedback note if you are using it.
#
# Usage: dupremover [OPTIONS] SRC COPY
#
# Walks SRC recursively and, for every entry that also exists under COPY,
# compares the two files by identity (device + inode), size and MD5 hash.
# A COPY file with identical content is deleted (-d), moved to
# SRC/.dupremover (-m) or only reported (default / -t).
#
# Requirements: GNU stat (-c format option) and md5sum. The script uses
# `local`, which is not strictly POSIX but is supported by dash, busybox ash
# and bash.
#
# History
# 2011-01-14 Initial Version
#

SCRIPT="$0"
SRC=
COPY=
LOGLEVEL=0 # 2 DEBUG, 1 VERBOSE, 0 INFO, -1 WARN, -2 ERR
DELETE=0
DELETE_HARDLINK=0
MOVE=0
MOVEDST=".dupremover"
TRY=0

# Print the usage text to stdout.
# A here-document is used instead of echo with "\t" escapes because echo only
# interprets backslash escapes in some shells.
help() {
    cat <<EOF
${SCRIPT##*/} [OPTIONS] SRC COPY

Compares files of SRC directory with COPY directory recursivly via md5sum and
deletes or moves duplicated files of the COPY directory.

    -d|--delete
        Delete the copy file
    -m|--move
        Moves the copy file to SRC/$MOVEDST
    -t|--try
        Try run. Do not perform any action
    -H|--delete-hardlink
        Delete multiple hardlink file
    -L|--log-level LEVEL
        Set log level: 2: DEBUG, 1: VERBOSE, 0: INFO, -1: WARN, -2: ERR
    -h|--help
        Show this help

EOF
}

# Return 0 if $1 is an optionally negative decimal integer, 1 otherwise.
is_integer() {
    case "${1#-}" in
        '' | *[!0-9]*) return 1 ;;
    esac
    return 0
}

# Print a message if its level is enabled.
#   $1 - numeric level (2 DEBUG, 1 VERBOSE, 0 INFO, -1 WARN, -2 ERR)
#   $2 - message text
# Reads the global LOGLEVEL. The message is printed verbatim via printf so
# that file names containing blanks or glob characters are not altered.
log() {
    local LEVEL
    case "$1" in
        2) LEVEL=DEBUG ;;
        1) LEVEL=VERBOSE ;;
        0) LEVEL=INFO ;;
        -1) LEVEL=WARN ;;
        -2) LEVEL=ERR ;;
        *) LEVEL=UNKNOWN ;;
    esac
    if [ "$1" -le "$LOGLEVEL" ]; then
        printf '%s: %s\n' "$LEVEL" "$2"
    fi
}

# Remove a single path: rmdir for real directories, rm -f for everything else.
#   $1 - path to remove
# rmdir is used for directories so that a directory which is unexpectedly not
# empty is left alone instead of being removed recursively. A symlink to a
# directory is removed as a link.
remove_path() {
    if [ -d "$1" ] && [ ! -L "$1" ]; then
        rmdir -- "$1"
    else
        rm -f -- "$1"
    fi
}

# Perform the configured action (move, delete or nothing) on a duplicate.
#   $1 - path of the duplicate inside COPY
#   $2 - directory of the duplicate relative to the roots (may be empty)
# Reads the globals MOVE, DELETE, TRY, SRC and MOVEDST. MOVE takes precedence
# over DELETE when both are set.
# Returns 0 on success or when nothing had to be done, 1 if the move or
# delete failed.
action() {
    local FILE="$1"
    local DIR="$2"
    local DST

    if [ "$MOVE" -ne 0 ]; then
        if [ ! -f "$FILE" ]; then
            log 1 "Skip moving of directory $FILE"
            return 0
        fi
        DST="$SRC/$MOVEDST$DIR"
        log 0 "Move duplicate $FILE to $DST"
        if [ "$TRY" -eq 0 ]; then
            # Do not attempt the move when the target directory is missing.
            if ! mkdir -p -- "$DST" || ! mv -- "$FILE" "$DST"; then
                log -2 "Could not move $FILE to $DST"
                return 1
            fi
        fi
    elif [ "$DELETE" -ne 0 ]; then
        log 0 "Delete duplicate $FILE"
        if [ "$TRY" -eq 0 ]; then
            if ! remove_path "$FILE"; then
                log -2 "Could not delete $FILE"
                return 1
            fi
        fi
    else
        log 0 "Do nothing for duplicate $FILE"
    fi
    return 0
}

# Apply the configured action to a directory if it contains no entries.
#   $1 - directory to check
# Returns 1 if $1 is not a directory or cannot be listed. It returns instead
# of exiting so the caller's directory walk continues.
checkemptydir() {
    local DIR="$1"
    local ENTRIES

    if [ ! -d "$DIR" ]; then
        log -1 "$DIR is not a directory"
        return 1
    fi
    # ls -A lists everything except "." and "..": empty output = empty dir.
    if ! ENTRIES=$(ls -A -- "$DIR" 2>/dev/null); then
        log -1 "Could not read directory $DIR"
        return 1
    fi
    if [ -z "$ENTRIES" ]; then
        action "$DIR"
    fi
}

# Recursively compare SRC/$1 with COPY/$1 and act on duplicates in COPY.
#   $1 - directory relative to the roots, empty for the roots themselves,
#        otherwise starting with "/"
# Reads the globals SRC, COPY and DELETE_HARDLINK.
# Returns 1 if SRC/$1 is not a directory, 0 otherwise.
compare() {
    local DIR="$1"
    local ENTRY NAME SRCFILE COPYFILE
    local SRCID COPYID LINKS SRCSIZE COPYSIZE SRCMD5 COPYMD5

    if [ ! -d "$SRC$DIR" ]; then
        log -2 "Skip non directory $SRC$DIR"
        return 1
    fi

    # Iterate with globs instead of parsing ls so that any file name is
    # handled. "*" does not match dot files, hence the second pattern.
    for ENTRY in "$SRC$DIR"/* "$SRC$DIR"/.*; do
        NAME=${ENTRY##*/}
        case "$NAME" in
            . | ..) continue ;;
        esac
        SRCFILE="$SRC$DIR/$NAME"
        COPYFILE="$COPY$DIR/$NAME"

        # A pattern without matches stays literal; -L keeps broken symlinks.
        if [ ! -e "$SRCFILE" ] && [ ! -L "$SRCFILE" ]; then
            continue
        fi

        if [ ! -e "$COPYFILE" ]; then
            log 1 "File $NAME only in source $SRC$DIR"
            continue
        fi

        if [ -d "$SRCFILE" ]; then
            log 1 "Read directory $SRCFILE"
            compare "$DIR/$NAME"
            checkemptydir "$COPYFILE"
            continue
        fi

        # FIFOs, devices and the like must not be fed to md5sum.
        if [ ! -f "$SRCFILE" ] || [ ! -f "$COPYFILE" ]; then
            log 1 "Skip non regular file $DIR/$NAME"
            continue
        fi

        # Identity is device + inode; an inode number alone is only unique
        # within one filesystem.
        if ! SRCID=$(stat -c '%d:%i' -- "$SRCFILE") ||
            ! COPYID=$(stat -c '%d:%i' -- "$COPYFILE"); then
            log -2 "Could not stat {$SRC|$COPY}$DIR/$NAME"
            continue
        fi
        if [ "$SRCID" = "$COPYID" ]; then
            LINKS=$(stat -c '%h' -- "$SRCFILE")
            if [ "${LINKS:-0}" -gt 1 ] && [ "$DELETE_HARDLINK" -ne 0 ]; then
                log -1 "File $COPYFILE is duplicated hardlink"
            else
                log 0 "Skip file $DIR/$NAME with same inode. Use -H to delete hardlinks"
                continue
            fi
        fi

        SRCSIZE=$(stat -c '%s' -- "$SRCFILE")
        COPYSIZE=$(stat -c '%s' -- "$COPYFILE")
        if [ "$SRCSIZE" != "$COPYSIZE" ]; then
            log 2 "File size mismatch of {$SRC|$COPY}$DIR/$NAME $SRCSIZE != $COPYSIZE"
            continue
        fi

        # Hash from stdin: md5sum prefixes the hash with "\" when it has to
        # escape characters of a file name given as argument.
        SRCMD5=$(md5sum <"$SRCFILE" | awk '{print $1}')
        COPYMD5=$(md5sum <"$COPYFILE" | awk '{print $1}')
        # Two failed hashes would compare equal and delete the copy.
        if [ -z "$SRCMD5" ] || [ -z "$COPYMD5" ]; then
            log -2 "Could not compute MD5 of {$SRC|$COPY}$DIR/$NAME"
            continue
        fi
        if [ "$SRCMD5" != "$COPYMD5" ]; then
            log 2 "MD5 mismatch of {$SRC|$COPY}$DIR/$NAME"
            continue
        fi

        log 1 "{$SRC|$COPY}$DIR/$NAME:$SRCSIZE are equal"
        action "$COPYFILE" "$DIR"
    done

    # Get files only in the copy directory
    if [ -d "$COPY$DIR" ]; then
        for ENTRY in "$COPY$DIR"/* "$COPY$DIR"/.*; do
            NAME=${ENTRY##*/}
            case "$NAME" in
                . | ..) continue ;;
            esac
            if [ ! -e "$ENTRY" ] && [ ! -L "$ENTRY" ]; then
                continue
            fi
            if [ ! -e "$SRC$DIR/$NAME" ]; then
                log 1 "File $NAME only in copy $COPY$DIR"
            fi
        done
    fi
    return 0
}

# Parse the command line into the global settings and start the comparison.
# The first two non-option arguments become SRC and COPY; further ones are
# ignored. An empty argument is a usage error rather than the end of parsing,
# so that options following it (e.g. -t) are never dropped silently.
main() {
    while [ $# -gt 0 ]; do
        case "$1" in
            -d | --delete)
                DELETE=1
                ;;
            -m | --move)
                MOVE=1
                ;;
            -t | --try)
                TRY=1
                ;;
            -H | --delete-hardlink)
                DELETE_HARDLINK=1
                ;;
            -L | --log-level)
                if [ $# -lt 2 ] || ! is_integer "$2"; then
                    echo "Option $1 requires an integer LEVEL" >&2
                    exit 1
                fi
                LOGLEVEL=$2
                shift
                ;;
            -h | --help)
                help
                exit 0
                ;;
            '')
                help
                exit 1
                ;;
            *)
                if [ -z "$SRC" ]; then
                    SRC=$1
                elif [ -z "$COPY" ]; then
                    COPY=$1
                fi
                ;;
        esac
        shift
    done

    if [ -z "$COPY" ]; then
        help
        exit 1
    fi

    compare ""
}

main "$@"
