"""Batch-clean Excel exports found in the current working directory.

For every ``.xls`` / ``.xlsx`` file the script reads the first sheet, drops
the leading rows, removes total rows, appends a date column and writes the
result next to the input as ``<name>_cleaned.xlsx``.
"""
import glob
import os
from datetime import datetime

import pandas as pd

# Number of leading rows removed from every sheet before further processing.
ROWS_TO_SKIP = 9
# Rows whose first column equals this label are dropped.
TOTAL_LABEL = "TỔNG"
# The prompt advertises YYYY/MM/DD; the hyphenated form is still accepted
# because it is the only form earlier versions of this script understood.
ACCEPTED_DATE_FORMATS = ("%Y/%m/%d", "%Y-%m-%d")
# Canonical form written into the output column, whichever separator was typed.
OUTPUT_DATE_FORMAT = "%Y-%m-%d"
# Suffix appended to the input's base name to build the output file name.
OUTPUT_SUFFIX = "_cleaned"


def parse_date(date_str):
    """Parse *date_str* using the first accepted format that matches.

    Args:
        date_str: Text in ``YYYY/MM/DD`` or ``YYYY-MM-DD`` form; surrounding
            whitespace is ignored.

    Returns:
        The parsed ``datetime``.

    Raises:
        ValueError: If *date_str* matches none of the accepted formats.
    """
    text = date_str.strip()
    for fmt in ACCEPTED_DATE_FORMATS:
        try:
            return datetime.strptime(text, fmt)
        except ValueError:
            continue
    raise ValueError(f"Unrecognised date: {date_str!r}")


def prompt_for_date():
    """Ask on stdin until a valid date is entered.

    Returns:
        The date formatted as ``YYYY-MM-DD``.
    """
    while True:
        date_str = input("Please input the date in YYYY/MM/DD format: ")
        try:
            return parse_date(date_str).strftime(OUTPUT_DATE_FORMAT)
        except ValueError:
            print("Invalid date format. Please try again (YYYY/MM/DD).")


def is_input_file(file_path):
    """Return True when *file_path* should be processed.

    Files this script produced on an earlier run (base name ending in
    ``_cleaned``) are skipped so a re-run does not strip rows from them a
    second time. Excel's ``~$`` lock files are skipped because they are not
    real workbooks.
    """
    file_name = os.path.basename(file_path)
    base_name = os.path.splitext(file_name)[0]
    return not (file_name.startswith("~$") or base_name.endswith(OUTPUT_SUFFIX))


def clean_frame(df, date_str):
    """Return a cleaned copy of a raw sheet; *df* itself is not modified.

    Steps:
        1. Remove the first ``ROWS_TO_SKIP`` rows.
        2. Remove rows whose first column equals ``TOTAL_LABEL``.
        3. Append a column whose first cell is ``"getDate"`` and whose
           remaining cells are *date_str*.

    Args:
        df: Sheet contents as read with ``header=None``.
        date_str: Value written below the ``"getDate"`` cell.

    Returns:
        A new ``DataFrame`` holding the cleaned rows.
    """
    body = df.iloc[ROWS_TO_SKIP:, :]
    # A sheet with no columns has no "column A" to test; indexing it would
    # raise IndexError.
    if body.shape[1] > 0:
        body = body[body.iloc[:, 0] != TOTAL_LABEL]
    # Work on an explicit copy so adding a column never writes through to
    # (or warns about) the frame this was sliced from.
    body = body.copy()

    # The column name is irrelevant: the result is saved with header=False.
    body["temp_col"] = ""
    if not body.empty:
        col = body.columns.get_loc("temp_col")
        body.iloc[0, col] = "getDate"
        # A no-op when there is only one row.
        body.iloc[1:, col] = date_str
    return body


def main():
    """Prompt for a date, then clean every Excel file in the current directory."""
    date_str = prompt_for_date()

    candidates = glob.glob("*.xls") + glob.glob("*.xlsx")
    excel_files = [path for path in candidates if is_input_file(path)]

    for file_path in excel_files:
        print(f"Processing: {file_path}")

        # Read the first sheet with no header; treat all rows as data.
        df = pd.read_excel(file_path, sheet_name=0, header=None)
        cleaned = clean_frame(df, date_str)

        base_name = os.path.splitext(file_path)[0]
        output_file = f"{base_name}{OUTPUT_SUFFIX}.xlsx"

        # Save WITHOUT a pandas header row so the first data row stays the
        # first row in Excel.
        cleaned.to_excel(output_file, index=False, header=False)
        print(f" --> Cleaned file saved as: {output_file}")


if __name__ == "__main__":
    main()