Skip to content

Categories:

Recursive Duplicate Finder

Recently I quickly needed a tool that compares two directories file by file and deletes duplicate copies on one side. It checks the source directory against the copy directory file by file. Each file is compared by inode, size and md5sum. If the source file and the copy have the same MD5 hash value (but not the same inode), the copy can be deleted with the -d parameter or moved with the -m parameter.
At the beginning I found fslint, which was too fancy for this task. And at the end of this blog post I found fdupes, which seems to do exactly what I needed. I love YAGN programming :-P

#!/bin/sh
#
# dupremover - Recursive Duplicate Remover
#
# (c) 2011 by Sebastian Felis
#
# You are free to use, modify, redistribute without warranty while keeping it
# free of charge. It would be nice to get a feedback note if you are using it.
#
# History
# 2011-01-14 Initial Version
#
 
# Path this script was invoked as; only its basename is shown in the usage text.
SCRIPT=$0

# Positional arguments: the first fills SRC, the second fills COPY.
SRC=''
COPY=''

# Verbosity threshold: 2 DEBUG, 1 VERBOSE, 0 INFO, -1 WARN, -2 ERR
LOGLEVEL=0

# Mode switches, 0 = off. They are turned on by -d, -H, -m and -t.
DELETE=0
DELETE_HARDLINK=0
MOVE=0
TRY=0

# Directory below SRC that receives duplicates when moving instead of deleting.
MOVEDST='.dupremover'
 
# Print the usage text to stdout.
#
# Globals: SCRIPT (read) - invocation path, reduced to its basename
#          MOVEDST (read) - name of the move target directory below SRC
#
# printf is used instead of echo because echo's handling of "\t" differs
# between shells: dash expands it to a tab, bash prints it literally.
help() {
  printf '%s [OPTIONS] SRC COPY\n' "$(basename "$SCRIPT")"
  printf ' \n'
  printf 'Compares files of SRC directory with COPY directory recursivly via md5sum and\n'
  printf 'deletes or moves duplicated files of the COPY directory.\n'
  printf ' \n'
  printf '\t-d|--delete\n'
  printf '\t\tDelete the copy file\n'
  printf '\t-m|--move\n'
  printf '\t\tMoves the copy file to SRC/%s\n' "$MOVEDST"
  printf '\t-t|--try\n'
  printf '\t\tTry run. Do not perform any action\n'
  printf '\t-H|--delete-hardlink\n'
  printf '\t\tDelete multiple hardlink file\n'
  printf '\t-L|--log-level LEVEL\n'
  printf '\t\tSet log level: 2: DEBUG, 1: VERBOSE, 0: INFO, -1: WARN, -2: ERR\n'
  printf '\n'
}
 
# Parse the command line into the mode switches, LOGLEVEL, SRC and COPY.
#
# The loop is driven by the argument count rather than by "$1" being
# non-empty, so an empty argument cannot silently end option parsing.
# Anything that cannot be interpreted is rejected on stderr with exit status 1
# instead of being guessed at, because the script deletes or moves files.
while [ $# -gt 0 ]; do
  case "$1" in
    -d|--delete)
      DELETE=1
      ;;
    -m|--move)
      MOVE=1
      ;;
    -t|--try)
      TRY=1
      ;;
    -H|--delete-hardlink)
      DELETE_HARDLINK=1
      ;;
    -L|--log-level)
      # Without this check the shift at the bottom of the loop would run with
      # no arguments left, which is a fatal error in some shells.
      if [ $# -lt 2 ]; then
        printf 'ERR: Option %s requires a LEVEL argument\n' "$1" >&2
        exit 1
      fi
      shift
      # LOGLEVEL is used in numeric tests later, so accept only an optionally
      # negative integer.
      case "${1#-}" in
        ''|*[!0-9]*)
          printf 'ERR: Log level must be an integer, got "%s"\n' "$1" >&2
          exit 1
          ;;
      esac
      LOGLEVEL=$1
      ;;
    -h|--help)
      help
      exit 0
      ;;
    '')
      printf 'ERR: Empty argument\n' >&2
      exit 1
      ;;
    -?*)
      # A misspelled option must not be mistaken for the SRC directory.
      # Directories whose name starts with "-" can be given as ./-name.
      printf 'ERR: Unknown option %s\n' "$1" >&2
      exit 1
      ;;
    *)
      if [ -z "$SRC" ]; then
        SRC=$1
      elif [ -z "$COPY" ]; then
        COPY=$1
      else
        printf 'ERR: Unexpected argument %s\n' "$1" >&2
        exit 1
      fi
      ;;
  esac
  shift
done

# Both directories are mandatory; COPY is only set once SRC has been set.
if [ -z "$COPY" ]; then
  help
  exit 1
fi
 
# Print a message to stdout if its level passes the configured threshold.
#
# Arguments: $1 - numeric level: 2 DEBUG, 1 VERBOSE, 0 INFO, -1 WARN, -2 ERR
#            $2 - message text
# Globals:   LOGLEVEL (read) - messages with $1 <= LOGLEVEL are printed
# Outputs:   "<LEVELNAME>: <message>" on stdout
log() {
  local LEVEL
  case "$1" in
    2) LEVEL=DEBUG ;;
    1) LEVEL=VERBOSE ;;
    0) LEVEL=INFO ;;
    -1) LEVEL=WARN ;;
    -2) LEVEL=ERR ;;
    *) LEVEL=UNKNOWN ;;
  esac
  if [ "$1" -le "$LOGLEVEL" ]; then
    # printf with quoted arguments prints the message verbatim. An unquoted
    # echo would collapse whitespace and expand glob characters such as "*",
    # which occur in file names.
    printf '%s: %s\n' "$LEVEL" "$2"
  fi
}
 
# Apply the configured action (move, delete or nothing) to one duplicate.
#
# Arguments: $1 - path of the duplicate in the copy tree
#            $2 - directory of the duplicate relative to the tree root; only
#                 used in move mode to rebuild the path below SRC/MOVEDST
# Globals:   MOVE, DELETE, TRY, SRC, MOVEDST (read)
# Returns:   0 on success or when nothing had to be done, 1 if the move or
#            delete failed
action() {
  local FILE="$1"
  local DIR="$2"
  local DST
  local RC=0
  if [ "$MOVE" -ne 0 ]; then
    # Move mode takes precedence over delete mode. Only regular files are
    # moved; directories stay in place.
    if [ ! -f "$FILE" ]; then
      log 1 "Skip moving of directory $FILE"
      return 0
    fi
    DST="$SRC/$MOVEDST$DIR"
    log 0 "Move duplicate $FILE to $DST"
    if [ "$TRY" -eq 0 ]; then
      if ! mkdir -p -- "$DST"; then
        log -2 "Could not create directory $DST"
        return 1
      fi
      if ! mv -- "$FILE" "$DST"; then
        log -2 "Could not move $FILE to $DST"
        return 1
      fi
    fi
  elif [ "$DELETE" -ne 0 ]; then
    log 0 "Delete duplicate $FILE"
    if [ "$TRY" -eq 0 ]; then
      if [ -d "$FILE" ] && [ ! -L "$FILE" ]; then
        # Directories are only meant to be removed when empty. rmdir refuses
        # a non-empty directory, so files that appeared after the emptiness
        # check cannot be deleted recursively.
        rmdir -- "$FILE" || RC=$?
      else
        rm -f -- "$FILE" || RC=$?
      fi
      if [ "$RC" -ne 0 ]; then
        log -2 "Could not delete $FILE"
        return 1
      fi
    fi
  else
    log 0 "Do nothing for duplicate $FILE"
  fi
}
 
# Hand a directory to action() if it contains no entries.
#
# Arguments: $1 - directory to inspect
# Returns:   1 if $1 is not a directory, otherwise the status of the
#            emptiness check / action call
checkemptydir() {
  local DIR="$1"
  local COUNT
  if [ ! -d "$DIR" ]; then
    log -1 "$DIR is not a directory"
    # return, not exit: the caller invokes this from inside a pipeline loop,
    # where exit would end that loop's shell and skip the remaining entries.
    return 1
  fi

  # "ls -a" always lists "." and "..", so exactly two lines means empty.
  # A failing ls prints nothing (count 0) and is therefore never mistaken
  # for an empty directory.
  COUNT=$(ls -a -- "$DIR" | wc -l)
  if [ "$COUNT" -eq 2 ]; then
    action "$DIR"
  fi
}
 
# Recursively compare one directory of the source tree with the same
# directory of the copy tree and pass every duplicate found to action().
#
# A copy file counts as a duplicate when it has the same size and MD5 hash as
# the source file. Files that are the very same inode on the same device are
# skipped unless -H was given and the file has more than one hard link.
#
# Arguments: $1 - directory relative to SRC and COPY, with a leading "/";
#                 empty for the tree roots
# Globals:   SRC, COPY, DELETE_HARDLINK (read)
# Returns:   1 if either directory does not exist, otherwise the status of
#            the last listing loop
#
# Note: entries are read line by line from ls, so file names containing a
# newline are not handled.
compare() {
  local DIR="$1"
  # Declared local so a recursive call cannot clobber the caller's values in
  # shells that run the last pipeline stage in the current process.
  local NAME SRCFILE COPYFILE SRCSIZE COPYSIZE SRCMD5 COPYMD5
  if [ ! -d "$SRC$DIR" ]; then
    log -2 "Skip non directory $SRC$DIR"
    return 1
  fi
  if [ ! -d "$COPY$DIR" ]; then
    log -2 "Skip non directory $COPY$DIR"
    return 1
  fi
  # IFS= and -r keep leading/trailing blanks and backslashes in file names.
  ls -a -- "$SRC$DIR" | while IFS= read -r NAME; do
    if [ "$NAME" = "." ] || [ "$NAME" = ".." ]; then
      continue
    fi
    SRCFILE="$SRC$DIR/$NAME"
    COPYFILE="$COPY$DIR/$NAME"
    if [ ! -e "$COPYFILE" ]; then
      log 1 "File $NAME only in source $SRC$DIR"
      continue
    fi
    if [ -d "$SRCFILE" ]; then
      if [ ! -d "$COPYFILE" ]; then
        log -1 "Skip $COPYFILE: directory in source but not in copy"
        continue
      fi
      log 1 "Read directory $SRCFILE"
      compare "$DIR/$NAME"
      # The recursion may have emptied the copy directory.
      checkemptydir "$COPYFILE"
      continue
    fi
    # Inode numbers are only unique per file system, so the device number is
    # part of the identity check.
    if [ "$(stat -c '%d:%i' "$SRCFILE")" = "$(stat -c '%d:%i' "$COPYFILE")" ]; then
      if [ "$(stat -c '%h' "$SRCFILE")" -gt 1 ] && [ "$DELETE_HARDLINK" -ne 0 ]; then
        log -1 "File $COPYFILE is duplicated hardlink"
      else
        log 0 "Skip file $DIR/$NAME with same inode. Use -H to delete hardlinks"
        continue
      fi
    fi
    SRCSIZE=$(stat -c '%s' "$SRCFILE")
    COPYSIZE=$(stat -c '%s' "$COPYFILE")
    if [ "$SRCSIZE" != "$COPYSIZE" ]; then
      log 2 "File size mismatch of {$SRC|$COPY}$DIR/$NAME $SRCSIZE != $COPYSIZE"
      continue
    fi
    # Hash from stdin so md5sum never has to escape an unusual file name in
    # its output line.
    SRCMD5=$(md5sum < "$SRCFILE" | awk '{print $1}')
    COPYMD5=$(md5sum < "$COPYFILE" | awk '{print $1}')
    # Two failed hash runs would both yield an empty string and compare as
    # equal; never treat that as a duplicate.
    if [ -z "$SRCMD5" ] || [ -z "$COPYMD5" ]; then
      log -2 "Could not compute MD5 of {$SRC|$COPY}$DIR/$NAME"
      continue
    fi
    if [ "$SRCMD5" != "$COPYMD5" ]; then
      log 2 "MD5 mismatch of {$SRC|$COPY}$DIR/$NAME"
      continue
    fi
    log 1 "{$SRC|$COPY}$DIR/$NAME:$SRCSIZE are equal"
    action "$COPYFILE" "$DIR"
  done
  # Get files only in the copy directory
  ls -a -- "$COPY$DIR" | while IFS= read -r NAME; do
    if [ "$NAME" = "." ] || [ "$NAME" = ".." ]; then
      continue
    fi
    if [ ! -e "$SRC$DIR/$NAME" ]; then
      log 1 "File $NAME only in copy $COPY$DIR"
    fi
  done
}
 
# Entry point: called without an argument, so compare's relative directory is
# empty and the walk starts at SRC and COPY themselves.
compare

Posted in ubuntu.

Tagged with , , , , .


0 Responses

Stay in touch with the conversation, subscribe to the RSS feed for comments on this post.



Some HTML is OK

or, reply to this post via trackback.

*
To prove you're a person (not a spam script), type the security word shown in the picture. Click on the picture to hear an audio file of the word.
Click to hear an audio file of the anti-spam word