From 97b54fa1d8ec537b7db6bae038d272ecc53af0c0 Mon Sep 17 00:00:00 2001
From: thanhtl <le.thanh1305@gmail.com>
Date: Mon, 24 Feb 2025 10:23:51 +0700
Subject: [PATCH] Prompt for a date and append a getDate column in clean_mshr19_NV.py

---
 clean_mshr19_NV.py | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/clean_mshr19_NV.py b/clean_mshr19_NV.py
index ba44058..d8d6ac7 100644
--- a/clean_mshr19_NV.py
+++ b/clean_mshr19_NV.py
@@ -1,25 +1,51 @@
 import pandas as pd
 import glob
 import os
+from datetime import datetime
+
+# Keep asking until the input parses as a real calendar date (DD/MM/YYYY)
+while True:
+    date_str = input("Please input the date in DD/MM/YYYY format: ").strip()
+    try:
+        # Store the re-formatted date so "1/2/2025" is written as "01/02/2025"
+        date_str = datetime.strptime(date_str, "%d/%m/%Y").strftime("%d/%m/%Y")
+        break
+    except ValueError:
+        print("Invalid date format. Please try again (DD/MM/YYYY).")
 
 # Collect all .xls and .xlsx files in the current directory
 excel_files = glob.glob("*.xls") + glob.glob("*.xlsx")
 
 for file_path in excel_files:
+    # Skip workbooks this script produced earlier; otherwise a second run would
+    # strip nine more rows from them and write "*_cleaned_cleaned.xlsx"
+    if file_path.endswith("_cleaned.xlsx"):
+        continue
+
     print(f"Processing: {file_path}")
-    # Read the first sheet
+    # Read the first sheet with no header; treat all rows as data
     df = pd.read_excel(file_path, sheet_name=0, header=None)
     
-    # Remove the first 9 rows
+    # 1. Remove the first 9 rows
     df = df.iloc[9:, :]
     
-    # Remove rows where column A has "TỔNG"
+    # 2. Remove rows where column A has "TỔNG"
+    #    (Here, column A = df.iloc[:,0])
     df = df[df.iloc[:, 0] != "TỔNG"]
     
+    # 3. Append a helper column holding the user-entered date in every row
+    #    except the top one, which carries the label "getDate". The column's
+    #    DataFrame name is irrelevant because the file is saved with header=False
+    df["temp_col"] = date_str
+    if not df.empty:
+        df.iloc[0, df.columns.get_loc("temp_col")] = "getDate"
+
     # Construct output file name
-    base_name = os.path.splitext(file_path)[0]  # e.g. "data"
+    base_name = os.path.splitext(file_path)[0]
     output_file = f"{base_name}_cleaned.xlsx"
     
-    # Save the result
+    # 4. Save the result WITHOUT a pandas header row
+    #    So the first row of df remains the first row in Excel
     df.to_excel(output_file, index=False, header=False)
+
     print(f" --> Cleaned file saved as: {output_file}")