#!/usr/bin/env python3
r"""
SMART PDF Compressor - Only Saves When Actually Smaller
Automatically keeps the smaller version (original vs compressed).

Features:
- Dynamic timeout based on file size
- Only keeps the compressed version if it is actually smaller
- Shows compression success rate
- Moves processed files to a 'processed' folder

Usage: python batch_compress_smart_v2.py [workers] [--timeout-per-mb SECONDS]

Examples:
    python batch_compress_smart_v2.py D:\C D:\D 60
    python batch_compress_smart_v2.py D:\C D:\D 60 --timeout-per-mb 10

Notes:
    Moves PDFs from a source folder (originals) to a destination folder
    (output/result). Ghostscript must be installed for compression to work.
    Tuned for high-core-count workstations.
"""

import os
import sys
import subprocess
import platform
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count
import time
from datetime import timedelta
import threading
import shutil

# Windows caps ProcessPoolExecutor workers at 61 (WaitForMultipleObjects limit).
MAX_WINDOWS_WORKERS = 61


def get_file_size_mb(filepath):
    """Return the size of *filepath* in megabytes, or 0 if it cannot be read."""
    try:
        return os.path.getsize(filepath) / (1024 * 1024)
    except OSError:
        return 0


def ensure_folder_exists(folder_path):
    """Create *folder_path* (and parents) if it does not already exist."""
    Path(folder_path).mkdir(parents=True, exist_ok=True)


def find_pdf_files_with_sizes(folder_path, processed_folder):
    """Return a list of (filename, size_mb) for unprocessed PDFs in *folder_path*.

    A file counts as processed if a same-named PDF (case-insensitive) exists in
    *processed_folder*. The result is sorted smallest-first so quick wins finish
    early.
    """
    pdf_files = []
    if not os.path.exists(folder_path):
        return pdf_files

    processed_files = set()
    if os.path.exists(processed_folder):
        processed_files = {f.lower() for f in os.listdir(processed_folder) if f.lower().endswith('.pdf')}

    for file in os.listdir(folder_path):
        if file.lower().endswith('.pdf'):
            full_path = os.path.join(folder_path, file)
            if os.path.isfile(full_path) and file.lower() not in processed_files:
                size = get_file_size_mb(full_path)
                pdf_files.append((file, size))

    # Sort by size (SMALLEST first)
    return sorted(pdf_files, key=lambda x: x[1])


def find_ghostscript_command():
    """Return the first working Ghostscript executable name, or None."""
    gs_commands = ['gswin64c', 'gswin32c', 'gs']  # Windows 64/32-bit, then POSIX
    for cmd in gs_commands:
        try:
            subprocess.run([cmd, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                           check=True, timeout=5)
            return cmd
        except (OSError, subprocess.CalledProcessError, subprocess.TimeoutExpired):
            continue
    return None


def calculate_timeout(file_size_mb, seconds_per_mb=6):
    """Return a Ghostscript timeout (seconds) scaled by file size.

    Default budget is 6 seconds per MB, clamped to [60, 1800] seconds.
    """
    timeout = int(file_size_mb * seconds_per_mb)
    timeout = max(60, timeout)    # At least 1 minute
    timeout = min(1800, timeout)  # At most 30 minutes
    return timeout


def compress_pdf_smart(args):
    """Compress one PDF and keep whichever version (original/compressed) is smaller.

    *args* is a tuple:
        (input_path, output_path, processed_path, gs_command, thread_count,
         seconds_per_mb)

    Returns a 9-tuple:
        (filename, success, original_size, final_size, compressed_smaller,
         error_msg, duration, moved, timeout_used)

    On timeout or Ghostscript failure the original file is copied to the
    destination instead (best-effort), so the batch still makes progress.
    """
    input_path, output_path, processed_path, gs_command, thread_count, seconds_per_mb = args
    filename = os.path.basename(input_path)
    start_time = time.time()
    moved = False
    compressed_smaller = False
    temp_output = output_path + ".tmp"

    # Computed before the try so the exception handlers can always report it.
    original_size = get_file_size_mb(input_path)
    timeout_seconds = calculate_timeout(original_size, seconds_per_mb)

    try:
        # Ghostscript compression to a temp file; /ebook ~= 150 dpi images.
        gs_cmd = [
            gs_command,
            '-sDEVICE=pdfwrite',
            '-dCompatibilityLevel=1.4',
            '-dPDFSETTINGS=/ebook',
            '-dNOPAUSE',
            '-dQUIET',
            '-dBATCH',
            '-dDetectDuplicateImages=true',
            '-dCompressFonts=true',
            '-dCompressPages=true',
            '-dColorImageResolution=150',
            '-dGrayImageResolution=150',
            '-dMonoImageResolution=300',
            f'-dNumRenderingThreads={thread_count}',
            '-dOptimize=true',
            '-dDownsampleColorImages=true',
            '-dDownsampleGrayImages=true',
            '-dColorImageDownsampleType=/Bicubic',
            '-dGrayImageDownsampleType=/Bicubic',
            f'-sOutputFile={temp_output}',
            input_path
        ]

        # Boost process priority on Windows; 0 is the no-op default elsewhere.
        creation_flags = subprocess.HIGH_PRIORITY_CLASS if platform.system() == "Windows" else 0

        subprocess.run(
            gs_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            text=True,
            timeout=timeout_seconds,
            creationflags=creation_flags
        )

        if os.path.exists(temp_output):
            compressed_size = get_file_size_mb(temp_output)

            # Compare sizes - only use compressed if smaller
            if compressed_size < original_size:
                # Compressed is smaller - use it!
                shutil.move(temp_output, output_path)
                final_size = compressed_size
                compressed_smaller = True
            else:
                # Original is smaller - copy it instead
                shutil.copy2(input_path, output_path)
                final_size = original_size
                compressed_smaller = False
                # Clean up temp file
                if os.path.exists(temp_output):
                    os.remove(temp_output)

            # Move source file to processed folder (best-effort; a locked file
            # should not fail the whole task).
            try:
                shutil.move(input_path, processed_path)
                moved = True
            except OSError:
                pass

            duration = time.time() - start_time
            return (filename, True, original_size, final_size, compressed_smaller,
                    None, duration, moved, timeout_seconds)
        else:
            duration = time.time() - start_time
            return (filename, False, original_size, 0, False,
                    "Output not created", duration, False, timeout_seconds)

    except subprocess.TimeoutExpired:
        # Timeout - just copy the original through.
        if os.path.exists(temp_output):
            os.remove(temp_output)

        try:
            shutil.copy2(input_path, output_path)
            shutil.move(input_path, processed_path)
            moved = True
            duration = time.time() - start_time
            return (filename, True, original_size, original_size, False,
                    f"Timeout - used original", duration, moved, timeout_seconds)
        except OSError:
            duration = time.time() - start_time
            return (filename, False, original_size, 0, False,
                    f"Timeout after {timeout_seconds}s", duration, False, timeout_seconds)

    except subprocess.CalledProcessError as e:
        # Compression failed - copy the original through.
        if os.path.exists(temp_output):
            os.remove(temp_output)

        try:
            shutil.copy2(input_path, output_path)
            shutil.move(input_path, processed_path)
            moved = True
            duration = time.time() - start_time
            error_msg = e.stderr[:50] if e.stderr else "Compression failed"
            return (filename, True, original_size, original_size, False,
                    f"Failed - used original: {error_msg}", duration, moved, timeout_seconds)
        except OSError:
            duration = time.time() - start_time
            error_msg = e.stderr[:100] if e.stderr else "Ghostscript error"
            return (filename, False, original_size, 0, False,
                    error_msg, duration, False, timeout_seconds)

    except Exception as e:
        # Catch-all so one bad file never kills a worker process.
        if os.path.exists(temp_output):
            os.remove(temp_output)
        duration = time.time() - start_time
        return (filename, False, original_size, 0, False,
                str(e)[:100], duration, False, timeout_seconds)


class ProgressMonitor:
    """Thread-safe accumulator for batch progress statistics."""

    def __init__(self, total_files):
        self.total_files = total_files
        self.completed = 0          # tasks finished (success or failure)
        self.successful = 0
        self.failed = 0
        self.moved = 0              # sources moved to the processed folder
        self.compressed_better = 0  # compressed output was kept
        self.original_better = 0    # original was kept (same size reported back)
        self.total_original = 0     # MB, all inputs seen
        self.total_final = 0        # MB, successful outputs only
        self.start_time = time.time()
        self.lock = threading.Lock()

    def update(self, success, original_size, final_size, compressed_smaller, was_moved):
        """Record one finished task's result."""
        with self.lock:
            self.completed += 1
            self.total_original += original_size
            if success:
                self.successful += 1
                self.total_final += final_size
                if was_moved:
                    self.moved += 1
                if compressed_smaller:
                    self.compressed_better += 1
                elif original_size == final_size:
                    self.original_better += 1
            else:
                self.failed += 1

    def get_stats(self):
        """Return a snapshot dict of counters plus derived rate/ETA."""
        with self.lock:
            elapsed = time.time() - self.start_time
            rate = self.completed / elapsed if elapsed > 0 else 0
            eta = (self.total_files - self.completed) / rate if rate > 0 else 0

            return {
                'completed': self.completed,
                'successful': self.successful,
                'failed': self.failed,
                'moved': self.moved,
                'compressed_better': self.compressed_better,
                'original_better': self.original_better,
                'elapsed': elapsed,
                'rate': rate,
                'eta': eta,
                'total_original': self.total_original,
                'total_final': self.total_final
            }


def batch_compress_smart_v2(source_folder, dest_folder, max_workers=None, seconds_per_mb=6):
    """Run the smart batch compression over *source_folder* into *dest_folder*.

    Prompts for confirmation, fans tasks out over a ProcessPoolExecutor, and
    prints per-file and periodic aggregate progress. Returns a shell-style exit
    code (0 on full success, 1 otherwise).
    """
    source_folder = os.path.abspath(os.path.expanduser(source_folder))
    dest_folder = os.path.abspath(os.path.expanduser(dest_folder))
    processed_folder = os.path.join(source_folder, "processed")
    failed_log = os.path.join(dest_folder, "_failed_files.txt")

    if not os.path.exists(source_folder):
        print(f"❌ Error: Source folder does not exist: {source_folder}")
        return 1

    ensure_folder_exists(dest_folder)
    ensure_folder_exists(processed_folder)

    print("🔍 Scanning for PDF files...")
    pdf_files = find_pdf_files_with_sizes(source_folder, processed_folder)

    if not pdf_files:
        print("✓ All files already processed!")
        return 0

    gs_command = find_ghostscript_command()
    if not gs_command:
        print("❌ Error: Ghostscript not found!")
        print("\nInstall Ghostscript from: https://ghostscript.com/releases/gsdnld.html")
        return 1

    # Count previously processed files (for the banner only).
    processed_count = len([f for f in os.listdir(processed_folder) if f.lower().endswith('.pdf')]) if os.path.exists(processed_folder) else 0

    # Determine worker count; Windows caps the pool at MAX_WINDOWS_WORKERS.
    cpu_cores = cpu_count()
    if max_workers is None:
        max_workers = min(MAX_WINDOWS_WORKERS, int(cpu_cores * 0.75)) if platform.system() == "Windows" else int(cpu_cores * 0.85)
    else:
        if platform.system() == "Windows" and max_workers > MAX_WINDOWS_WORKERS:
            print(f"⚠️  Adjusting workers from {max_workers} to {MAX_WINDOWS_WORKERS}")
            max_workers = MAX_WINDOWS_WORKERS

    # Guard against max_workers == 1: the original `max_workers // 2` divisor
    # would be 0 and raise ZeroDivisionError.
    threads_per_pdf = max(2, min(8, cpu_cores // max(1, max_workers // 2)))

    # Analyze file sizes
    total_size = sum(s for _, s in pdf_files)
    max_file_size = max(s for _, s in pdf_files)
    max_timeout = calculate_timeout(max_file_size, seconds_per_mb)

    # Print header
    print("\n" + "=" * 80)
    print(" 🎯 SMART PDF COMPRESSOR V2 - BEST VERSION WINS")
    print("    Only keeps compressed file if it's actually smaller!")
    print("=" * 80)
    print(f"\n🖥️  CPU: {cpu_cores} logical cores")
    print(f"🔧 Workers: {max_workers} parallel processes")
    print(f"🧵 Threads/PDF: {threads_per_pdf} per file")
    print(f"⏱️  Timeout: {seconds_per_mb}s per MB (min: 60s, max: 1800s)")
    print(f"📦 Ghostscript: {gs_command}")

    print(f"\n📁 Folders:")
    print(f"   Source:      {source_folder}")
    print(f"   Destination: {dest_folder}")
    print(f"   Processed:   {processed_folder}")

    if processed_count > 0:
        print(f"\n✓ Already processed: {processed_count} files")

    print(f"\n📄 Files to process: {len(pdf_files)} PDFs")
    print(f"   Total size: {total_size:.1f} MB ({total_size/1024:.1f} GB)")
    print(f"   Largest file: {max_file_size:.1f} MB (timeout: {max_timeout}s = {max_timeout/60:.1f} min)")

    print(f"\n📋 Smart Strategy:")
    print(f"   1. Try to compress each PDF")
    print(f"   2. Compare: compressed vs original")
    print(f"   3. Keep whichever is SMALLER")
    print(f"   4. Move source to 'processed' folder")
    print(f"   5. Result: Always get the best version!")

    print("\n" + "=" * 80)
    response = input("👉 Continue? (yes/no): ").lower().strip()
    if response not in ['yes', 'y']:
        print("❌ Cancelled")
        return 1

    print("\n🚀 Starting smart compression...\n")

    # Prepare tasks
    tasks = []
    for pdf_file, _ in pdf_files:
        input_path = os.path.join(source_folder, pdf_file)
        output_path = os.path.join(dest_folder, pdf_file)
        processed_path = os.path.join(processed_folder, pdf_file)
        tasks.append((input_path, output_path, processed_path, gs_command, threads_per_pdf, seconds_per_mb))

    monitor = ProgressMonitor(len(pdf_files))
    failed_files = []

    start_time = time.time()

    try:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            future_to_file = {executor.submit(compress_pdf_smart, task): task for task in tasks}

            for future in as_completed(future_to_file):
                filename, success, original_size, final_size, compressed_smaller, error_msg, duration, was_moved, timeout_used = future.result()

                monitor.update(success, original_size, final_size, compressed_smaller, was_moved)
                stats = monitor.get_stats()

                if success:
                    if compressed_smaller:
                        reduction = ((original_size - final_size) / original_size) * 100
                        indicator = "✓ COMPRESSED"
                        print(f"✓ [{stats['completed']}/{len(pdf_files)}] {filename[:45]}")
                        print(f"  {original_size:.1f}MB → {final_size:.1f}MB ({reduction:.1f}%↓) [{duration:.1f}s] {indicator}")
                    else:
                        if original_size == final_size:
                            indicator = "= ORIGINAL (better)"
                        else:
                            indicator = "= ORIGINAL (compression failed)"
                        print(f"✓ [{stats['completed']}/{len(pdf_files)}] {filename[:45]}")
                        print(f"  {original_size:.1f}MB (kept original) [{duration:.1f}s] {indicator}")
                        if error_msg:
                            print(f"  Note: {error_msg}")
                else:
                    print(f"✗ [{stats['completed']}/{len(pdf_files)}] {filename[:45]}")
                    print(f"  FAILED ({original_size:.1f}MB): {error_msg} [{duration:.1f}s]")
                    # Log the actual filename (the original wrote the literal
                    # "(unknown)" here, making the failure report useless).
                    failed_files.append(f"{filename} ({original_size:.1f}MB) - {error_msg}")

                # Stats every 50 files
                if stats['completed'] % 50 == 0:
                    compression_rate = (stats['compressed_better'] / stats['successful'] * 100) if stats['successful'] > 0 else 0
                    print(f"\n  📊 Progress:")
                    print(f"     Rate: {stats['rate']:.2f} files/sec | ETA: {timedelta(seconds=int(stats['eta']))}")
                    print(f"     Compressed better: {stats['compressed_better']} ({compression_rate:.1f}%)")
                    print(f"     Original better: {stats['original_better']}")
                    print(f"     Failed: {stats['failed']}")
                    if stats['total_original'] > 0:
                        saved_gb = (stats['total_original'] - stats['total_final']) / 1024
                        reduction_pct = ((stats['total_original'] - stats['total_final']) / stats['total_original']) * 100
                        print(f"     Total saved: {saved_gb:.2f} GB ({reduction_pct:.1f}% reduction)")
                    print()

    except KeyboardInterrupt:
        print("\n\n⚠️  Interrupted by user")
        stats = monitor.get_stats()

    # Save failed files log
    if failed_files:
        with open(failed_log, 'w', encoding='utf-8') as f:
            f.write(f"Failed Files Report - {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Total Failed: {len(failed_files)}\n")
            f.write("=" * 80 + "\n\n")
            for line in failed_files:
                f.write(line + "\n")
        print(f"\n📝 Failed files logged to: {failed_log}")

    # Final Summary
    elapsed_time = time.time() - start_time
    stats = monitor.get_stats()

    print("\n" + "=" * 80)
    print("🏁 FINAL SUMMARY")
    print("=" * 80)
    print(f"⏱️  Total Time: {timedelta(seconds=int(elapsed_time))}")
    print(f"⚡ Average Rate: {stats['completed'] / elapsed_time:.2f} files/sec")
    print(f"\n📊 Results:")
    print(f"   Processed: {stats['completed']} files")
    print(f"   ✓ Successful: {stats['successful']}")
    print(f"   ✗ Failed: {stats['failed']}")

    if stats['successful'] > 0:
        compression_success_rate = (stats['compressed_better'] / stats['successful'] * 100)
        print(f"\n🎯 Compression Results:")
        print(f"   Compressed better: {stats['compressed_better']} files ({compression_success_rate:.1f}%)")
        print(f"   Original better: {stats['original_better']} files ({100-compression_success_rate:.1f}%)")

    if stats['total_original'] > 0:
        print(f"\n💾 Storage:")
        print(f"   Original size: {stats['total_original']/1024:.2f} GB")
        print(f"   Final size: {stats['total_final']/1024:.2f} GB")
        overall = ((stats['total_original'] - stats['total_final']) / stats['total_original']) * 100
        saved = (stats['total_original'] - stats['total_final']) / 1024
        print(f"   Reduction: {overall:.1f}%")
        print(f"   💰 Saved: {saved:.2f} GB")

    print(f"\n📁 Locations:")
    print(f"   Final files: {dest_folder}")
    print(f"   Processed: {processed_folder}")
    print(f"   Remaining: {source_folder}")

    if stats['failed'] > 0:
        print(f"\n⚠️  {stats['failed']} files failed - see {failed_log}")

    print("=" * 80)

    return 0 if stats['failed'] == 0 else 1


def main():
    """Parse CLI arguments and run the batch compressor. Returns an exit code."""
    if len(sys.argv) < 3:
        cpu_cores = cpu_count()
        recommended = min(MAX_WINDOWS_WORKERS, max(1, int(cpu_cores * 0.75))) if platform.system() == "Windows" else int(cpu_cores * 0.85)

        print("🎯 SMART PDF Compressor V2 - Best Version Wins!")
        print(f"   Detected: {cpu_cores} cores | Recommended: {recommended} workers")
        print("\nUsage:")
        print(f"  python {sys.argv[0]} <source_folder> <dest_folder> [workers] [--timeout-per-mb SECONDS]")
        print("\nExamples:")
        print(r"  python batch_compress_smart_v2.py D:\C D:\D 60")
        print(r"  python batch_compress_smart_v2.py D:\C D:\D 60 --timeout-per-mb 10")
        print("\nKey Feature:")
        print("  • Tries to compress each PDF")
        print("  • Compares compressed vs original")
        print("  • Always keeps the SMALLER version")
        print("  • No more files getting bigger!")
        return 1

    source = sys.argv[1]
    destination = sys.argv[2]

    # Parse optional arguments: a bare integer is the worker count;
    # --timeout-per-mb takes the per-MB timeout budget.
    max_workers = None
    seconds_per_mb = 6

    i = 3
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg == '--timeout-per-mb' and i + 1 < len(sys.argv):
            try:
                seconds_per_mb = int(sys.argv[i + 1])
            except ValueError:
                print("Warning: Invalid timeout-per-mb, using default 6")
            i += 2
        else:
            try:
                max_workers = int(arg)
            except ValueError:
                print(f"Warning: Unknown argument '{arg}', ignoring")
            i += 1

    return batch_compress_smart_v2(source, destination, max_workers, seconds_per_mb)


if __name__ == '__main__':
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        print("\n\n❌ Cancelled by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)