I am trying to concatenate n Dataframes where: each has a column that contains names of companies (may not be exactly the same) and other columns (some columns are consistent across tables e.g. Value, Currency, etc. and others are not).
Tables need to be joined on the company name only, any with duplicate columns have their values stored in the cell as a tuple.
e.g.
Company Name | Value |
---|---|
A corp | Cell 2 |
B corp | Cell 4 |
Company Name | Date | Leadership | Net Profit |
---|---|---|---|
A corp | Cell 2 | Cell 1 | Cell 2 |
C corp | Cell 4 | Cell 1 | Cell 2 |
Company Name | Value |
---|---|
A corp | Cell 2 |
B corp | Cell 4 |
C corp | Cell 4 |
Company Name | Percentage | Comment | HQ | Website |
---|---|---|---|---|
D corp | Cell 2 | Cell 2 | Cell 2 | Cell 2 |
D corp | Cell 4 | Cell 4 | Cell 4 | Cell 4 |
Here is my code:
def concatenate_tables(self, tables): def normalize_name(name): if pd.isna(name) or not isinstance(name, str) or name.strip() == '': return None return name.lower().strip() def fuzzymatch(name, existing_entities): # Uses fuzzywuzzy if not name or not existing_entities: return None matches = process.extract(name, existing_entities, scorer=fuzz.token_set_ratio, limit=1) return matches[0][0] if matches and matches[0][1] > 80 else None # Ensure all inputs are DataFrames if not all(isinstance(df, pd.DataFrame) for df in tables): raise ValueError("All elements in 'tables' must be pandas DataFrames.") # Collect all unique columns and company names all_columns = set(['Company Name']) # Initialize with 'Company Name' to ensure inclusion for df in tables: all_columns.update(df.columns) all_company_names = set() for df in tables: if 'Company Name' in df.columns: all_company_names.update(df['Company Name'].astype(str).apply(normalize_name).dropna()) consolidated_data = defaultdict(lambda: {col: [] for col in all_columns}) # Pre-fill with all columns for df in tables: for _, row in df.iterrows(): raw_name = row['Company Name'] norm_name = normalize_name(raw_name) matched_name = fuzzymatch(norm_name, all_company_names) if norm_name else None effective_name = matched_name if matched_name else norm_name # Use normalized name if no match for col in all_columns: # Append data if column exists in current df, else append None if col in df.columns: consolidated_data[effective_name][col].append(row[col]) else: consolidated_data[effective_name][col].append(None) # Prepare consolidated rows consolidated_rows = [] for company_name, cols in consolidated_data.items(): row_data = {"Company Name": company_name} for col, values in cols.items(): # Filter out None values non_null_values = list(filter(None.__ne__, values)) if len(set(non_null_values)) > 1: row_data[col] = tuple(set(non_null_values)) elif non_null_values: row_data[col] = non_null_values[0] else: row_data[col] = None consolidated_rows.append(row_data) master_df = pd.DataFrame(consolidated_rows) return master_df
But I can't seem to figure out why this code doesn't return a full table that contains all the columns from each individual table.
Please let me know if anything is unclear.
Thanks in advance.