From 97b54fa1d8ec537b7db6bae038d272ecc53af0c0 Mon Sep 17 00:00:00 2001
From: thanhtl
Date: Mon, 24 Feb 2025 10:23:51 +0700
Subject: [PATCH] Update clean_mshr19_NV.py

---
Review notes (not part of the commit message; git am ignores this area):
 * The script now prompts for a date in DD/MM/YYYY form and re-prompts until
   datetime.strptime accepts it. The parsed date_obj is not used afterwards;
   the raw input string is what gets written to the sheets.
 * Each cleaned sheet gains one extra trailing column: "getDate" in the first
   remaining row and the entered date string in every following row.
 * NOTE(review): outputs are written as <name>_cleaned.xlsx into the same
   directory that glob("*.xlsx") scans, so a second run will also pick up
   earlier outputs -- confirm whether that is intended.

 clean_mshr19_NV.py | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/clean_mshr19_NV.py b/clean_mshr19_NV.py
index ba44058..d8d6ac7 100644
--- a/clean_mshr19_NV.py
+++ b/clean_mshr19_NV.py
@@ -1,25 +1,51 @@
 import pandas as pd
 import glob
 import os
+from datetime import datetime
+
+# Prompt the user for a valid date in DD/MM/YYYY format
+while True:
+    date_str = input("Please input the date in DD/MM/YYYY format: ")
+    try:
+        # Validate format
+        date_obj = datetime.strptime(date_str, "%d/%m/%Y")
+        break
+    except ValueError:
+        print("Invalid date format. Please try again (DD/MM/YYYY).")
 
 # Collect all .xls and .xlsx files in the current directory
 excel_files = glob.glob("*.xls") + glob.glob("*.xlsx")
 
 for file_path in excel_files:
     print(f"Processing: {file_path}")
 
-    # Read the first sheet
+    # Read the first sheet with no header; treat all rows as data
     df = pd.read_excel(file_path, sheet_name=0, header=None)
 
-    # Remove the first 9 rows
+    # 1. Remove the first 9 rows
     df = df.iloc[9:, :]
 
-    # Remove rows where column A has "TỔNG"
+    # 2. Remove rows where column A has "TỔNG"
+    # (Here, column A = df.iloc[:,0])
     df = df[df.iloc[:, 0] != "TỔNG"]
 
+    # 3. Add a new column (the DataFrame column name here won't matter
+    # because we'll save with header=False)
+    df["temp_col"] = ""
+
+    # If there is at least one row, set the *top* row of that new column to "getDate"
+    if not df.empty:
+        df.iloc[0, df.columns.get_loc("temp_col")] = "getDate"
+
+    # If there's more than one row, fill the rest of that column with the user-entered date
+    if df.shape[0] > 1:
+        df.iloc[1:, df.columns.get_loc("temp_col")] = date_str
+
     # Construct output file name
-    base_name = os.path.splitext(file_path)[0]  # e.g. "data"
+    base_name = os.path.splitext(file_path)[0]
     output_file = f"{base_name}_cleaned.xlsx"
 
-    # Save the result
+    # 4. Save the result WITHOUT a pandas header row
+    # So the first row of df remains the first row in Excel
     df.to_excel(output_file, index=False, header=False)
+    print(f" --> Cleaned file saved as: {output_file}")