"""Interactively tidy an Excel sheet and save the result as a new workbook.

The script asks for an input workbook, keeps a fixed set of columns,
normalises the ``time`` column to ``YYYY-MM-DD``, strips line breaks and
``#`` characters from the ``text`` column, and writes the result to a
user-chosen output file.
"""

import sys

import pandas as pd

# Columns retained in the output, in output order.
COLUMNS_TO_KEEP = ["text", "time", "url", "topReactionsCount", "viewsCount", "likes"]


def clean_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df*; the input frame is left untouched.

    Args:
        df: Frame containing at least every column in ``COLUMNS_TO_KEEP``.

    Returns:
        A new frame restricted to ``COLUMNS_TO_KEEP`` where ``time`` is
        formatted as ``YYYY-MM-DD`` and ``text`` has runs of line breaks
        collapsed to a single space and all ``#`` characters removed.

    Raises:
        KeyError: If any required column is absent from *df*.
        ValueError: Propagated from ``pd.to_datetime`` when a ``time`` value
            cannot be parsed.
    """
    missing = [name for name in COLUMNS_TO_KEEP if name not in df.columns]
    if missing:
        raise KeyError(
            f"Input is missing required column(s): {', '.join(missing)}"
        )

    # Explicit copy: avoids SettingWithCopyWarning on the assignments below
    # and guarantees the caller's frame is not modified.
    cleaned = df[COLUMNS_TO_KEEP].copy()

    cleaned["time"] = pd.to_datetime(cleaned["time"]).dt.strftime("%Y-%m-%d")

    # Series.replace(regex=True) only rewrites string cells, so missing
    # values (NaN) pass through unchanged.
    cleaned["text"] = (
        cleaned["text"]
        .replace(r"[\r\n]+", " ", regex=True)
        .replace(r"#", "", regex=True)
    )
    return cleaned


def main() -> None:
    """Run the interactive select -> clean -> save workflow."""
    # Imported lazily so clean_dataframe() stays importable on machines
    # without a Tk installation (e.g. headless servers).
    import tkinter as tk
    from tkinter import filedialog

    # Hide the root window; only the file dialogs should be visible.
    root = tk.Tk()
    root.withdraw()
    try:
        input_file_path = filedialog.askopenfilename(
            title="Select the Excel file",
            filetypes=[("Excel files", "*.xlsx *.xls"), ("All files", "*.*")],
        )
        if not input_file_path:
            print("No file selected. Exiting.")
            sys.exit()

        # Read every cell as a string so values are preserved verbatim.
        df = pd.read_excel(input_file_path, dtype=str)

        try:
            df = clean_dataframe(df)
        except KeyError as err:
            # args[0] is the bare message; str(KeyError) would add quotes.
            sys.exit(err.args[0])

        output_file_path = filedialog.asksaveasfilename(
            title="Save Cleaned Excel File",
            defaultextension=".xlsx",
            filetypes=[("Excel files", "*.xlsx"), ("All files", "*.*")],
        )
        if not output_file_path:
            print("No output file specified. Exiting.")
            sys.exit()

        df.to_excel(output_file_path, index=False)
        print(f"Cleaned Excel file saved to: {output_file_path}")
    finally:
        # Release the hidden Tk root on every exit path, including sys.exit().
        root.destroy()


if __name__ == "__main__":
    main()