Quantcast
Channel: Recent Questions - Stack Overflow
Viewing all articles
Browse latest Browse all 12201

Extracting email addresses from different files in a directory in Python

$
0
0

I have a hard disk with several thousand files that I have backed up from an old computer. These include csv, docx, xlsx, zip, 7z, pst and eml files. I am trying to find some email addresses of old contacts and I don't want to go through all of these files one by one.

I tried to write a Python script to extract email addresses from inside these files and then save them in a separate txt file called outputs.txt. However, even though the script runs without any errors, my outputs.txt file is empty.

Here is what I tried (with ChatGPT's help in places):

import osimport reimport zipfileimport openpyxlimport py7zrimport csvfrom docx import Documentfrom email.parser import BytesParserfrom email.policy import defaultdirectory = 'media/user/Backups'output_file = 'outputs.txt'email_pattern = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')def extract_emails_from_text(text):    return email_pattern.findall(text)def process_txt_file(file_path):    print(f"Processing TXT file: {file_path}")    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:        text = file.read()        emails = extract_emails_from_text(text)    return emailsdef process_xlsx_file(file_path):    print(f"Processing XLSX file: {file_path}")    emails = []    workbook = openpyxl.load_workbook(file_path)    for sheet in workbook.worksheets:        for row in sheet.iter_rows(values_only=True):            for cell in row:                if isinstance(cell, str):                    emails.extend(extract_emails_from_text(cell))    return emailsdef process_csv_file(file_path):    print(f"Processing CSV file: {file_path}")    emails = []    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:        reader = csv.reader(file)        for row in reader:            for cell in row:                emails.extend(extract_emails_from_text(cell))    return emailsdef process_docx_file(file_path):    print(f"Processing DOCX file: {file_path}")    emails = []    doc = Document(file_path)    for para in doc.paragraphs:        emails.extend(extract_emails_from_text(para.text))    for table in doc.tables:        for row in table.rows:            for cell in row.cells:                emails.extend(extract_emails_from_text(cell.text))    return emailsdef process_eml_file(file_content):    emails = []    msg = BytesParser(policy=default).parsebytes(file_content)    for part in msg.walk():        if part.get_content_type() == 'text/plain':            emails.extend(extract_emails_from_text(part.get_payload(decode=True).decode('utf-8', errors='ignore')))    return emailsdef process_zip_file(file_path):    print(f"Processing ZIP file: {file_path}")    emails = []    with zipfile.ZipFile(file_path, 'r') as zip_ref:        for file in zip_ref.namelist():            with zip_ref.open(file) as f:                try:                    if file.endswith('.eml'):                        emails.extend(process_eml_file(f.read()))                    else:                        content = f.read().decode('utf-8', errors='ignore')                        emails.extend(extract_emails_from_text(content))                except:                    pass  # Skip files that cannot be read    return emailsdef process_7z_file(file_path):    print(f"Processing 7z file: {file_path}")    emails = []    with py7zr.SevenZipFile(file_path, mode='r') as z:        for file in z.getnames():            with z.extractfile(file) as f:                try:                    if file.endswith('.eml'):                        emails.extend(process_eml_file(f.read()))                    else:                        content = f.read().decode('utf-8', errors='ignore')                        emails.extend(extract_emails_from_text(content))                except:                    pass  # Skip files that cannot be read    return emailsdef process_pst_file(file_path):    print(f"Processing PST file: {file_path}")    # Processing PST files requires the use of a specific library like pypff (which is not included here)    # Here is a placeholder for the logic to process PST files    emails = []    # Implement logic to process PST files and extract emails    return emailsdef process_files(directory):    results = []    for root, _, files in os.walk(directory):        print(f"Checking directory: {root}")        for file in files:            file_path = os.path.join(root, file)            print(f"Found file: {file_path}")            emails = []            if file.endswith('.txt'):                emails = process_txt_file(file_path)            elif file.endswith('.xlsx'):                emails = process_xlsx_file(file_path)            elif file.endswith('.csv'):                emails = process_csv_file(file_path)            elif file.endswith('.docx'):                emails = process_docx_file(file_path)            elif file.endswith('.zip'):                emails = process_zip_file(file_path)            elif file.endswith('.7z'):                emails = process_7z_file(file_path)            elif file.endswith('.pst'):                emails = process_pst_file(file_path)            if emails:                results.append((file_path, emails))                print(f"Emails found in {file_path}: {emails}")    return resultsdef save_results(results, output_file):    with open(output_file, 'w') as file:        for file_path, emails in results:            for email in emails:                file.write(f"{email} found in {file_path}\n")if __name__ == '__main__':    results = process_files(directory)    save_results(results, output_file)    print(f"Email addresses have been extracted and saved to {output_file}")

Viewing all articles
Browse latest Browse all 12201

Trending Articles



<script src="https://jsc.adskeeper.com/r/s/rssing.com.1596347.js" async> </script>