Update clean_mshr19_NV.py

2025-02-24 10:23:51 +07:00 · 2025-02-24 10:23:51 +07:00 · 97b54fa1d8
commit 97b54fa1d8
parent 7bed8d7515
1 changed files with 31 additions and 5 deletions
--- a/clean_mshr19_NV.py
+++ b/clean_mshr19_NV.py
@@ -1,25 +1,51 @@
 import pandas as pd
 import glob
 import os
+from datetime import datetime
+
+# Keep prompting until the user enters a date that parses as DD/MM/YYYY
+while True:
+    date_str = input("Please input the date in DD/MM/YYYY format: ")
+    try:
+        # Validation only: strptime raises ValueError on bad input; date_obj is not used again in this hunk
+        date_obj = datetime.strptime(date_str, "%d/%m/%Y")
+        break
+    except ValueError:
+        print("Invalid date format. Please try again (DD/MM/YYYY).")

 # Collect all .xls and .xlsx files in the current directory
 excel_files = glob.glob("*.xls") + glob.glob("*.xlsx")

 for file_path in excel_files:
    print(f"Processing: {file_path}")
-    # Read the first sheet
+    # Read the first sheet with no header; treat all rows as data
    df = pd.read_excel(file_path, sheet_name=0, header=None)
    
-    # Remove the first 9 rows
+    # 1. Remove the first 9 rows
    df = df.iloc[9:, :]
    
-    # Remove rows where column A has "TỔNG"
+    # 2. Drop rows whose column A is exactly "TỔNG" (equality test, not a substring match)
+    #    (column A is the first column, df.iloc[:, 0])
    df = df[df.iloc[:, 0] != "TỔNG"]
    
+    # 3. Append an empty column; its label "temp_col" is never written out
+    #    because the file is saved with header=False
+    df["temp_col"] = ""
+    
+    # The first remaining row gets the literal "getDate" (presumably the sheet's own header row - TODO confirm)
+    if not df.empty:
+        df.iloc[0, df.columns.get_loc("temp_col")] = "getDate"
+    
+    # Every later row gets the date string exactly as the user typed it (not reformatted)
+    if df.shape[0] > 1:
+        df.iloc[1:, df.columns.get_loc("temp_col")] = date_str
+    
    # Construct output file name
-    base_name = os.path.splitext(file_path)[0]  # e.g. "data"
+    base_name = os.path.splitext(file_path)[0]
    output_file = f"{base_name}_cleaned.xlsx"
    
-    # Save the result
+    # 4. Write without index or header so the first row of df becomes the first row in Excel
+    #    NOTE(review): output lands beside the inputs, so a rerun's "*.xlsx" glob also matches *_cleaned.xlsx
    df.to_excel(output_file, index=False, header=False)
+    
    print(f" --> Cleaned file saved as: {output_file}")