I have a hard disk with several thousand files that I have backed up from an old computer. These include csv, docx, xlsx, zip, 7z, pst and eml files. I am trying to find some email addresses of old contacts and I don't want to go through all of these files one by one.
I tried to write a Python script to extract email addresses from inside these files and then save them in a separate txt file called outputs.txt. However, even though the script runs without any errors, my outputs.txt file is empty.
Here is what I tried (with ChatGPT's help in places):
import osimport reimport zipfileimport openpyxlimport py7zrimport csvfrom docx import Documentfrom email.parser import BytesParserfrom email.policy import defaultdirectory = 'media/user/Backups'output_file = 'outputs.txt'email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')def extract_emails_from_text(text): return email_pattern.findall(text)def process_txt_file(file_path): print(f"Processing TXT file: {file_path}") with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: text = file.read() emails = extract_emails_from_text(text) return emailsdef process_xlsx_file(file_path): print(f"Processing XLSX file: {file_path}") emails = [] workbook = openpyxl.load_workbook(file_path) for sheet in workbook.worksheets: for row in sheet.iter_rows(values_only=True): for cell in row: if isinstance(cell, str): emails.extend(extract_emails_from_text(cell)) return emailsdef process_csv_file(file_path): print(f"Processing CSV file: {file_path}") emails = [] with open(file_path, 'r', encoding='utf-8', errors='ignore') as file: reader = csv.reader(file) for row in reader: for cell in row: emails.extend(extract_emails_from_text(cell)) return emailsdef process_docx_file(file_path): print(f"Processing DOCX file: {file_path}") emails = [] doc = Document(file_path) for para in doc.paragraphs: emails.extend(extract_emails_from_text(para.text)) for table in doc.tables: for row in table.rows: for cell in row.cells: emails.extend(extract_emails_from_text(cell.text)) return emailsdef process_eml_file(file_content): emails = [] msg = BytesParser(policy=default).parsebytes(file_content) for part in msg.walk(): if part.get_content_type() == 'text/plain': emails.extend(extract_emails_from_text(part.get_payload(decode=True).decode('utf-8', errors='ignore'))) return emailsdef process_zip_file(file_path): print(f"Processing ZIP file: {file_path}") emails = [] with zipfile.ZipFile(file_path, 'r') as zip_ref: for file in zip_ref.namelist(): with zip_ref.open(file) as f: try: if file.endswith('.eml'): emails.extend(process_eml_file(f.read())) else: content = f.read().decode('utf-8', errors='ignore') emails.extend(extract_emails_from_text(content)) except: pass # Skip files that cannot be read return emailsdef process_7z_file(file_path): print(f"Processing 7z file: {file_path}") emails = [] with py7zr.SevenZipFile(file_path, mode='r') as z: for file in z.getnames(): with z.extractfile(file) as f: try: if file.endswith('.eml'): emails.extend(process_eml_file(f.read())) else: content = f.read().decode('utf-8', errors='ignore') emails.extend(extract_emails_from_text(content)) except: pass # Skip files that cannot be read return emailsdef process_pst_file(file_path): print(f"Processing PST file: {file_path}") # Processing PST files requires the use of a specific library like pypff (which is not included here) # Here is a placeholder for the logic to process PST files emails = [] # Implement logic to process PST files and extract emails return emailsdef process_files(directory): results = [] for root, _, files in os.walk(directory): print(f"Checking directory: {root}") for file in files: file_path = os.path.join(root, file) print(f"Found file: {file_path}") emails = [] if file.endswith('.txt'): emails = process_txt_file(file_path) elif file.endswith('.xlsx'): emails = process_xlsx_file(file_path) elif file.endswith('.csv'): emails = process_csv_file(file_path) elif file.endswith('.docx'): emails = process_docx_file(file_path) elif file.endswith('.zip'): emails = process_zip_file(file_path) elif file.endswith('.7z'): emails = process_7z_file(file_path) elif file.endswith('.pst'): emails = process_pst_file(file_path) if emails: results.append((file_path, emails)) print(f"Emails found in {file_path}: {emails}") return resultsdef save_results(results, output_file): with open(output_file, 'w') as file: for file_path, emails in results: for email in emails: file.write(f"{email} found in {file_path}\n")if __name__ == '__main__': results = process_files(directory) save_results(results, output_file) print(f"Email addresses have been extracted and saved to {output_file}")