Finding duplicate files and removing them:
#!/usr/bin/env python
# if running in py3, change the shebang, drop the next import for readability (it does no harm in py3)
from __future__ import print_function   # py2 compatibility
from collections import defaultdict
import hashlib
import os
import sys


def chunk_reader(fobj, chunk_size=1024):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash=hashlib.sha1):
    hashobj = hash()
    file_object = open(filename, 'rb')

    if first_chunk_only:
        hashobj.update(file_object.read(1024))
    else:
        for chunk in chunk_reader(file_object):
            hashobj.update(chunk)
    hashed = hashobj.digest()

    file_object.close()
    return hashed


def check_for_duplicates(paths, hash=hashlib.sha1):
    hashes_by_size = defaultdict(list)  # dict of size_in_bytes: [full_path_to_file1, full_path_to_file2, ]
    hashes_on_1k = defaultdict(list)    # dict of (hash1k, size_in_bytes): [full_path_to_file1, full_path_to_file2, ]
    hashes_full = {}                    # dict of full_file_hash: full_path_to_file_string

    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            # get all files that have the same size - they are the collision candidates
            for filename in filenames:
                full_path = os.path.join(dirpath, filename)
                try:
                    # if the target is a symlink (soft one), this will
                    # dereference it - change the value to the actual target file
                    full_path = os.path.realpath(full_path)
                    file_size = os.path.getsize(full_path)
                    hashes_by_size[file_size].append(full_path)
                except (OSError,):
                    # not accessible (permissions, etc) - pass on
                    continue

    # For all files with the same file size, get their hash on the 1st 1024 bytes only
    for size_in_bytes, files in hashes_by_size.items():
        if len(files) < 2:
            continue    # this file size is unique, no need to spend CPU cycles on it

        for filename in files:
            try:
                small_hash = get_hash(filename, first_chunk_only=True)
                # the key is the hash on the first 1024 bytes plus the size - to
                # avoid collisions on equal hashes in the first part of the file
                # credits to @Futal for the optimization
                hashes_on_1k[(small_hash, size_in_bytes)].append(filename)
            except (OSError,):
                # the file might have become inaccessible since it was listed
                continue

    # For all files with the same hash on the 1st 1024 bytes, get their hash on the full file - collisions will be duplicates
    for __, files_list in hashes_on_1k.items():
        if len(files_list) < 2:
            continue    # this hash of the first 1k bytes is unique, no need to spend CPU cycles on it

        for filename in files_list:
            try:
                full_hash = get_hash(filename, first_chunk_only=False)
                duplicate = hashes_full.get(full_hash)
                if duplicate:
                    print("Duplicate found: {} and {}".format(filename, duplicate))
                else:
                    hashes_full[full_hash] = filename
            except (OSError,):
                # the file might have become inaccessible since it was listed
                continue


if __name__ == "__main__":
    if sys.argv[1:]:
        check_for_duplicates(sys.argv[1:])
    else:
        print("Please pass the paths to check as parameters to the script")
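The script above only prints the duplicates it finds; it never deletes anything. If removal is actually wanted, as the heading suggests, a minimal sketch along the following lines could reuse get_hash() from the script. The remove_duplicates name and the dry_run flag are assumptions introduced here for illustration, not part of the original script.

# A minimal sketch (not part of the original script): reuses get_hash() from
# above to group files by full-content hash and delete every copy after the
# first one seen. dry_run=True is the assumed default, so nothing is deleted
# until the caller opts in.
def remove_duplicates(paths, dry_run=True):
    seen = {}  # full_hash -> first path seen with that content
    for path in paths:
        for dirpath, dirnames, filenames in os.walk(path):
            for filename in filenames:
                full_path = os.path.realpath(os.path.join(dirpath, filename))
                try:
                    full_hash = get_hash(full_path, first_chunk_only=False)
                except OSError:
                    continue  # unreadable file - skip it
                if full_hash in seen:
                    print("Removing {} (duplicate of {})".format(full_path, seen[full_hash]))
                    if not dry_run:
                        os.remove(full_path)
                else:
                    seen[full_hash] = full_path


# Example: report what would be removed, without deleting anything yet.
# remove_duplicates(sys.argv[1:], dry_run=True)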
Recursive folders version:
This version uses the file size and a hash of the contents to find duplicates. You can pass it multiple paths; it will scan all of them recursively and report any duplicates found.
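In outline: collect file sizes first, and only hash files that share a size, since hashing is the expensive step. The compressed sketch below illustrates just that two-stage filter; the duplicate_groups name is introduced here for illustration and is separate from the full examples that follow.

import os
import hashlib
from collections import defaultdict


def duplicate_groups(root):
    by_size = defaultdict(list)          # size in bytes -> candidate paths
    for dirpath, _, filenames in os.walk(root):
        for name in filenames:
            path = os.path.join(dirpath, name)
            if os.path.isfile(path):
                by_size[os.path.getsize(path)].append(path)
    by_hash = defaultdict(list)          # content hash -> paths
    for size, paths in by_size.items():
        if len(paths) < 2:
            continue                     # unique size, nothing to compare against
        for path in paths:
            with open(path, 'rb') as f:
                by_hash[hashlib.sha1(f.read()).hexdigest()].append(path)
    return [group for group in by_hash.values() if len(group) > 1]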
Example #1:
import sys
import os
import hashlib

# Condensed, lambda-only style: hash each file with SHA-1 and report any path
# whose content hash has already been seen.
check_path = (lambda filepath, hashes, p=sys.stdout.write:
    (lambda hash=hashlib.sha1(open(filepath, 'rb').read()).hexdigest():
        ((hash in hashes) and
         (p('DUPLICATE FILE\n'
            '   %s\n'
            'of %s\n' % (filepath, hashes[hash]))) or
         hashes.setdefault(hash, filepath)))())

# list() forces the lazy map() objects to run for their side effects.
scan = (lambda dirpath, hashes={}:
    list(map(lambda entry:
        list(map(lambda filename:
            check_path(os.path.join(entry[0], filename), hashes),
            entry[2])),
        os.walk(dirpath))))

(len(sys.argv) > 1) and scan(sys.argv[1])
Example #2:
# Originally taken from:
# http://www.pythoncentral.io/finding-duplicate-files-with-python/
# Original Author: Andres Torres
# Adapted to only compute the md5sum of files with the same size

import argparse
import os
import sys
import hashlib


def find_duplicates(folders):
    """
    Takes in an iterable of folders and prints & returns the duplicate files
    """
    dup_size = {}
    for i in folders:
        # Iterate the folders given
        if os.path.exists(i):
            # Find the duplicated files and append them to dup_size
            join_dicts(dup_size, find_duplicate_size(i))
        else:
            print('%s is not a valid path, please verify' % i)
            return {}

    print('Comparing files with the same size...')
    dups = {}
    for dup_list in dup_size.values():
        if len(dup_list) > 1:
            join_dicts(dups, find_duplicate_hash(dup_list))
    print_results(dups)
    return dups


def find_duplicate_size(parent_dir):
    # Dups in format {size_in_bytes: [paths]}
    dups = {}
    for dirName, subdirs, fileList in os.walk(parent_dir):
        print('Scanning %s...' % dirName)
        for filename in fileList:
            # Get the path to the file
            path = os.path.join(dirName, filename)
            # Check to make sure the path is valid.
            if not os.path.exists(path):
                continue
            # Calculate sizes
            file_size = os.path.getsize(path)
            # Add or append the file path
            if file_size in dups:
                dups[file_size].append(path)
            else:
                dups[file_size] = [path]
    return dups


def find_duplicate_hash(file_list):
    print('Comparing: ')
    for filename in file_list:
        print('    {}'.format(filename))
    dups = {}
    for path in file_list:
        file_hash = hashfile(path)
        if file_hash in dups:
            dups[file_hash].append(path)
        else:
            dups[file_hash] = [path]
    return dups


# Joins two dictionaries
def join_dicts(dict1, dict2):
    for key in dict2.keys():
        if key in dict1:
            dict1[key] = dict1[key] + dict2[key]
        else:
            dict1[key] = dict2[key]


def hashfile(path, blocksize=65536):
    afile = open(path, 'rb')
    hasher = hashlib.md5()
    buf = afile.read(blocksize)
    while len(buf) > 0:
        hasher.update(buf)
        buf = afile.read(blocksize)
    afile.close()
    return hasher.hexdigest()


def print_results(dict1):
    results = list(filter(lambda x: len(x) > 1, dict1.values()))
    if len(results) > 0:
        print('Duplicates Found:')
        print(
            'The following files are identical. The name could differ, but the'
            ' content is identical'
        )
        print('___________________')
        for result in results:
            for subresult in result:
                print('\t\t%s' % subresult)
            print('___________________')
    else:
        print('No duplicate files found.')


def main():
    parser = argparse.ArgumentParser(description='Find duplicate files')
    parser.add_argument(
        'folders', metavar='dir', type=str, nargs='+',
        help='A directory to parse for duplicates',
    )
    args = parser.parse_args()

    find_duplicates(args.folders)


if __name__ == '__main__':
    sys.exit(main())
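find_duplicates() both prints and returns the duplicate groups as a dict keyed by md5 digest, so a caller can decide what to do with each group. Below is a small hedged sketch of consuming that return value to delete the extra copies, keeping the first path in each group; remove_extras is a helper name introduced here for illustration only.

import os


# Sketch: delete all but the first file in each duplicate group returned by
# find_duplicates() above. Groups of size 1 (same size, different content)
# are skipped.
def remove_extras(dups):
    for paths in dups.values():
        if len(paths) < 2:
            continue
        keep, extras = paths[0], paths[1:]
        for extra in extras:
            print('Removing %s (kept %s)' % (extra, keep))
            os.remove(extra)


# Example usage:
# dups = find_duplicates(['/path/a', '/path/b'])
# remove_extras(dups)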
How to find duplicate files with the same content but different names using Python?
Example #1:
import hashlib
import os
import sys


def read_chunk(fobj, chunk_size=2048):
    """ Files can be huge so read them in chunks of bytes. """
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def remove_duplicates(dir, hashfun=hashlib.sha512):
    unique = set()
    for filename in os.listdir(dir):
        filepath = os.path.join(dir, filename)
        if os.path.isfile(filepath):
            hashobj = hashfun()
            for chunk in read_chunk(open(filepath, 'rb')):
                hashobj.update(chunk)
            # the size of the hashobj is constant
            # print("hashfun: ", hashfun.__sizeof__())
            hashfile = hashobj.hexdigest()
            if hashfile not in unique:
                unique.add(hashfile)
            else:
                os.remove(filepath)


try:
    hashfun = hashlib.sha256
    remove_duplicates(sys.argv[1], hashfun)
except IndexError:
    print("Please pass a path to a directory with duplicate files "
          "as a parameter to the script.")
Example #2:
import hashlib
import os

unique = dict()

for filename in os.listdir('.'):
    if os.path.isfile(filename):
        filehash = hashlib.md5(open(filename, 'rb').read()).hexdigest()
        if filehash not in unique:
            unique[filehash] = filename
        else:
            print(filename + ' is a duplicate of ' + unique[filehash])
Deleting duplicate files if a file exists in certain directories – Python:
import time
import os
import shutil
from hashlib import sha256


class Duplython:
    def __init__(self):
        self.home_dir = os.getcwd()
        self.File_hashes = []
        self.Cleaned_dirs = []
        self.Total_bytes_saved = 0
        self.block_size = 65536
        self.count_cleaned = 0

    def welcome(self) -> None:
        print('******************************************************************')
        print('****************        DUPLYTHON        ****************************')
        print('********************************************************************\n\n')
        print('----------------        WELCOME        ----------------------------')
        time.sleep(3)
        print('\nCleaning .................')
        return None

    def generate_hash(self, Filename: str) -> str:
        Filehash = sha256()
        try:
            with open(Filename, 'rb') as File:
                fileblock = File.read(self.block_size)
                while len(fileblock) > 0:
                    Filehash.update(fileblock)
                    fileblock = File.read(self.block_size)
                Filehash = Filehash.hexdigest()
            return Filehash
        except:
            return False

    def clean(self) -> None:
        all_dirs = [path[0] for path in os.walk('E:\\songs')]
        for path in all_dirs:
            os.chdir(path)
            All_Files = [file for file in os.listdir() if os.path.isfile(file)]
            for file in All_Files:
                filehash = self.generate_hash(file)
                if not filehash in self.File_hashes:
                    if filehash:
                        self.File_hashes.append(filehash)
                        # print(file)
                else:
                    byte_saved = os.path.getsize(file)
                    self.count_cleaned += 1
                    self.Total_bytes_saved += byte_saved
                    os.remove(file)
                    filename = file.split('/')[-1]
                    print(filename, '.. cleaned ')
            os.chdir(self.home_dir)

    def cleaning_summary(self) -> None:
        mb_saved = self.Total_bytes_saved / 1048576
        mb_saved = round(mb_saved, 2)
        print('\n\n--------------FINISHED CLEANING ------------')
        print('File cleaned : ', self.count_cleaned)
        print('Total Space saved : ', mb_saved, 'MB')
        print('-----------------------------------------------')

    def main(self) -> None:
        self.welcome()
        self.clean()
        self.cleaning_summary()


# if __name__ == '__main__':
#     App = Duplython()
#     App.main()


def dedupe_bing_images():
    App = Duplython()
    App.main()
    return True


dedupe_bing_images()
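The clean() method above hard-codes 'E:\\songs'. Since the heading asks about certain directories, here is a hedged sketch of the same cleaning loop with the target directories passed in as a parameter; clean_dirs is a hypothetical helper that assumes the Duplython class defined above, and the example paths are placeholders.

import os


# Sketch: the same logic as Duplython.clean(), but over caller-supplied
# directories instead of a hard-coded path. Expects an already-constructed
# Duplython instance so its hash list and counters are reused.
def clean_dirs(app, target_dirs):
    for target in target_dirs:
        all_dirs = [entry[0] for entry in os.walk(target)]
        for path in all_dirs:
            os.chdir(path)
            for file in [f for f in os.listdir() if os.path.isfile(f)]:
                filehash = app.generate_hash(file)
                if filehash and filehash not in app.File_hashes:
                    app.File_hashes.append(filehash)
                elif filehash:
                    app.Total_bytes_saved += os.path.getsize(file)
                    app.count_cleaned += 1
                    os.remove(file)
                    print(file, '.. cleaned ')
            os.chdir(app.home_dir)


# Example usage (placeholder paths):
# App = Duplython()
# clean_dirs(App, ['E:\\songs', 'E:\\podcasts'])
# App.cleaning_summary()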
Find and remove duplicate files using Python:
import os

file_path = r"C:\Data\temp"
file_list = os.listdir(file_path)
print(file_list)

for file_name in file_list:
    if "(1)" not in file_name:
        continue
    original_file_name = file_name.replace('(1)', '')
    if not os.path.exists(os.path.join(file_path, original_file_name)):
        continue  # do not remove files which have no original
    os.remove(os.path.join(file_path, file_name))
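The snippet above trusts the file name alone: a "(1)" suffix does not guarantee the content really matches the original. A cautious hedged variant could compare the two files' md5 digests before deleting; contents_match is a helper introduced here for illustration, and file_path is taken from the snippet above.

import os
import hashlib


# Sketch: hash both files block by block and only delete the "(1)" copy if
# the digests agree.
def contents_match(path_a, path_b, blocksize=65536):
    hash_a, hash_b = hashlib.md5(), hashlib.md5()
    with open(path_a, 'rb') as a, open(path_b, 'rb') as b:
        for f, h in ((a, hash_a), (b, hash_b)):
            block = f.read(blocksize)
            while block:
                h.update(block)
                block = f.read(blocksize)
    return hash_a.hexdigest() == hash_b.hexdigest()


for file_name in os.listdir(file_path):
    if "(1)" not in file_name:
        continue
    original = os.path.join(file_path, file_name.replace('(1)', ''))
    duplicate = os.path.join(file_path, file_name)
    if os.path.exists(original) and contents_match(original, duplicate):
        os.remove(duplicate)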
Using Python code to remove duplicate files from a directory and its subdirectories:
Example #1:
#!/usr/bin/python
import os
import hashlib
import sys

dirname = sys.argv[1]
os.chdir(dirname)


def dup_fileremove(dir):
    duplicate = set()
    os.chdir(dir)
    path = os.getcwd()
    print("The dir is: ", path)
    for filename in os.listdir(dir):
        filehash = None
        filepath = os.path.join(dir, filename)
        print("Current file path is: ", filepath)
        if os.path.isdir(filepath):
            dup_fileremove(filepath)
        elif os.path.isfile(filepath):
            filehash = hashlib.md5(open(filepath, 'rb').read()).hexdigest()
        if filehash is not None and filehash not in duplicate:
            duplicate.add(filehash)
        elif filehash is not None:
            os.remove(filepath)
            print("removed : ", filepath)


dup_fileremove(dirname)
Example #2:
#!/usr/bin/python
import os
import hashlib
import sys


def dup_fileremove(dirname):
    duplicate = set()
    os.chdir(dirname)
    path = os.getcwd()
    print("The dirname is: ", path)
    for filename in os.listdir(dirname):
        filehash = None
        filepath = os.path.join(dirname, filename)
        print("Current file path is: ", filepath)
        if os.path.isdir(filepath):
            dup_fileremove(filepath)
        elif os.path.isfile(filepath):
            filehash = hashlib.md5(open(filepath, 'rb').read()).hexdigest()
            if filehash not in duplicate:
                duplicate.add(filehash)
            else:
                os.remove(filepath)
                print("removed : ", filepath)


dirname = sys.argv[1]
os.chdir(dirname)
dup_fileremove(dirname)
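Both examples above create a fresh duplicate set in every recursive call, so two identical files sitting in different subdirectories are never compared with each other, and each file is read into memory in one go before hashing. Below is a hedged sketch of an os.walk-based alternative that keeps a single hash set for the whole tree and hashes in blocks; dedupe_tree and md5_of are names introduced here for illustration.

import os
import hashlib
import sys


# Hash a file in fixed-size blocks to avoid loading it fully into memory.
def md5_of(filepath, blocksize=65536):
    hasher = hashlib.md5()
    with open(filepath, 'rb') as f:
        block = f.read(blocksize)
        while block:
            hasher.update(block)
            block = f.read(blocksize)
    return hasher.hexdigest()


def dedupe_tree(root):
    seen = set()  # one set for the whole tree, not one per directory
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            filehash = md5_of(filepath)
            if filehash in seen:
                os.remove(filepath)
                print("removed : ", filepath)
            else:
                seen.add(filehash)


if __name__ == '__main__':
    dedupe_tree(sys.argv[1])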
How to Remove all Duplicate Files on your Drive via Python:
Example #1:
import time
import os
from hashlib import sha256


class Duplython:
    def __init__(self):
        self.home_dir = os.getcwd()
        self.File_hashes = []
        self.Cleaned_dirs = []
        self.Total_bytes_saved = 0
        self.block_size = 65536
        self.count_cleaned = 0

    def welcome(self) -> None:
        print('******************************************************************')
        print('****************        DUPLYTHON        ****************************')
        print('********************************************************************\n\n')
        print('----------------        WELCOME        ----------------------------')
        time.sleep(3)
        print('\nCleaning .................')

    def main(self) -> None:
        self.welcome()


if __name__ == '__main__':
    App = Duplython()
    App.main()
Example #2:
import time
import os
import shutil
from hashlib import sha256


class Duplython:
    def __init__(self):
        self.home_dir = os.getcwd()
        self.File_hashes = []
        self.Cleaned_dirs = []
        self.Total_bytes_saved = 0
        self.block_size = 65536
        self.count_cleaned = 0

    def welcome(self) -> None:
        print('******************************************************************')
        print('****************        DUPLYTHON        ****************************')
        print('********************************************************************\n\n')
        print('----------------        WELCOME        ----------------------------')
        time.sleep(3)
        print('\nCleaning .................')

    def generate_hash(self, Filename: str) -> str:
        Filehash = sha256()
        try:
            with open(Filename, 'rb') as File:
                fileblock = File.read(self.block_size)
                while len(fileblock) > 0:
                    Filehash.update(fileblock)
                    fileblock = File.read(self.block_size)
                Filehash = Filehash.hexdigest()
            return Filehash
        except:
            return False

    def clean(self) -> None:
        all_dirs = [path[0] for path in os.walk('.')]
        for path in all_dirs:
            os.chdir(path)
            All_Files = [file for file in os.listdir() if os.path.isfile(file)]
            for file in All_Files:
                filehash = self.generate_hash(file)
                if not filehash in self.File_hashes:
                    if filehash:
                        self.File_hashes.append(filehash)
                        # print(file)
                else:
                    byte_saved = os.path.getsize(file)
                    self.count_cleaned += 1
                    self.Total_bytes_saved += byte_saved
                    os.remove(file)
                    filename = file.split('/')[-1]
                    print(filename, '.. cleaned ')
            os.chdir(self.home_dir)

    def cleaning_summary(self) -> None:
        mb_saved = self.Total_bytes_saved / 1048576
        mb_saved = round(mb_saved, 2)
        print('\n\n--------------FINISHED CLEANING ------------')
        print('File cleaned : ', self.count_cleaned)
        print('Total Space saved : ', mb_saved, 'MB')
        print('-----------------------------------------------')

    def main(self) -> None:
        self.welcome()
        self.clean()
        self.cleaning_summary()


if __name__ == '__main__':
    App = Duplython()
    App.main()
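Because clean() walks '.' (the current working directory), this version cleans whatever directory the script is started from. A small hedged usage sketch for pointing it at a specific folder first; it assumes the Duplython class defined above, and the target path shown is only a placeholder.

import os

# Switch to the folder to clean before constructing Duplython, so that both
# home_dir and the os.walk('.') traversal refer to that folder.
target = '/path/to/photos'   # example only - substitute the folder to clean
os.chdir(target)

App = Duplython()
App.main()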