#!/usr/bin/env python3
r"""
SMART PDF Compressor - Only Saves When Actually Smaller

Automatically keeps the smaller version (original vs compressed).

Features:
- Dynamic timeout based on file size
- Only keeps the compressed version if it is actually smaller
- Shows the compression success rate
- Moves processed files to a 'processed' folder

Usage:
    python batch_compress_smart_v2.py <source_folder> <dest_folder> [workers] [--timeout-per-mb SECONDS]

Examples:
    python batch_compress_smart_v2.py D:\C D:\D 60
    python batch_compress_smart_v2.py D:\C D:\D 60 --timeout-per-mb 10

Notes:
- Files are moved from folder A (source) to folder B (output/result).
- Ghostscript must be installed for this script to run.
- Tuned for high-end machines (e.g. Threadripper 3990X, RTX 5090).
"""

import os
import sys
import subprocess
import platform
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from multiprocessing import cpu_count
import time
from datetime import timedelta
import threading
import shutil

# Windows caps ProcessPoolExecutor at 61 workers
MAX_WINDOWS_WORKERS = 61


def get_file_size_mb(filepath):
    """Get file size in MB."""
    try:
        return os.path.getsize(filepath) / (1024 * 1024)
    except OSError:
        return 0


def ensure_folder_exists(folder_path):
    """Create folder if it doesn't exist."""
    Path(folder_path).mkdir(parents=True, exist_ok=True)


def find_pdf_files_with_sizes(folder_path, processed_folder):
    """Find all unprocessed PDF files as (filename, size_mb) tuples."""
    pdf_files = []
    if not os.path.exists(folder_path):
        return pdf_files

    # Skip files that already have a copy in the processed folder
    processed_files = set()
    if os.path.exists(processed_folder):
        processed_files = {f.lower() for f in os.listdir(processed_folder)
                           if f.lower().endswith('.pdf')}

    for file in os.listdir(folder_path):
        if file.lower().endswith('.pdf'):
            full_path = os.path.join(folder_path, file)
            if os.path.isfile(full_path) and file.lower() not in processed_files:
                size = get_file_size_mb(full_path)
                pdf_files.append((file, size))

    # Sort by size (smallest first)
    return sorted(pdf_files, key=lambda x: x[1])


def find_ghostscript_command():
    """Find the Ghostscript executable."""
    gs_commands = ['gswin64c', 'gswin32c', 'gs']
    for cmd in gs_commands:
        try:
            subprocess.run([cmd, '--version'],
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                           check=True, timeout=5)
            return cmd
        except (OSError, subprocess.SubprocessError):
            continue
    return None


def calculate_timeout(file_size_mb, seconds_per_mb=6):
    """
    Calculate the Ghostscript timeout based on file size.

    Default: 6 seconds per MB
    Minimum: 60 seconds
    Maximum: 30 minutes
    """
    timeout = int(file_size_mb * seconds_per_mb)
    timeout = max(60, timeout)    # At least 1 minute
    timeout = min(1800, timeout)  # At most 30 minutes
    return timeout

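# Illustrative values for the timeout formula above (not from the original
# source; plain arithmetic on the defaults of 6 s/MB, 60 s floor, 1800 s cap):
#   calculate_timeout(5)    -> 60    (5 * 6 = 30, raised to the 60 s minimum)
#   calculate_timeout(200)  -> 1200  (200 * 6, within both limits)
#   calculate_timeout(1000) -> 1800  (6000 capped at the 30-minute maximum)
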
def compress_pdf_smart(args):
    """
    Compress a PDF and only keep the result if it is smaller.

    Returns:
        (filename, success, original_size, final_size, compressed_smaller,
         error_msg, duration, moved, timeout_used)
    """
    input_path, output_path, processed_path, gs_command, thread_count, seconds_per_mb = args
    filename = os.path.basename(input_path)
    start_time = time.time()
    moved = False
    compressed_smaller = False
    temp_output = output_path + ".tmp"
    original_size = get_file_size_mb(input_path)

    try:
        # Calculate dynamic timeout
        timeout_seconds = calculate_timeout(original_size, seconds_per_mb)

        # Ghostscript compression to a temporary file
        gs_cmd = [
            gs_command,
            '-sDEVICE=pdfwrite',
            '-dCompatibilityLevel=1.4',
            '-dPDFSETTINGS=/ebook',
            '-dNOPAUSE',
            '-dQUIET',
            '-dBATCH',
            '-dDetectDuplicateImages=true',
            '-dCompressFonts=true',
            '-dCompressPages=true',
            '-dColorImageResolution=150',
            '-dGrayImageResolution=150',
            '-dMonoImageResolution=300',
            f'-dNumRenderingThreads={thread_count}',
            '-dOptimize=true',
            '-dDownsampleColorImages=true',
            '-dDownsampleGrayImages=true',
            '-dColorImageDownsampleType=/Bicubic',
            '-dGrayImageDownsampleType=/Bicubic',
            f'-sOutputFile={temp_output}',
            input_path
        ]

        creation_flags = subprocess.HIGH_PRIORITY_CLASS if platform.system() == "Windows" else 0
        result = subprocess.run(
            gs_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            text=True,
            timeout=timeout_seconds,
            creationflags=creation_flags
        )

        if os.path.exists(temp_output):
            compressed_size = get_file_size_mb(temp_output)

            # Compare sizes - only use the compressed file if it is smaller
            if compressed_size < original_size:
                # Compressed is smaller - use it!
                shutil.move(temp_output, output_path)
                final_size = compressed_size
                compressed_smaller = True
            else:
                # Original is smaller - copy it instead
                shutil.copy2(input_path, output_path)
                final_size = original_size
                compressed_smaller = False

            # Clean up the temp file if it is still around
            if os.path.exists(temp_output):
                os.remove(temp_output)

            # Move the source file to the processed folder (best effort)
            try:
                shutil.move(input_path, processed_path)
                moved = True
            except Exception:
                pass

            duration = time.time() - start_time
            return (filename, True, original_size, final_size, compressed_smaller,
                    None, duration, moved, timeout_seconds)
        else:
            duration = time.time() - start_time
            return (filename, False, original_size, 0, False,
                    "Output not created", duration, False, timeout_seconds)

    except subprocess.TimeoutExpired:
        # Timeout - just copy the original
        if os.path.exists(temp_output):
            os.remove(temp_output)
        try:
            shutil.copy2(input_path, output_path)
            shutil.move(input_path, processed_path)
            moved = True
            duration = time.time() - start_time
            return (filename, True, original_size, original_size, False,
                    "Timeout - used original", duration, moved, timeout_seconds)
        except Exception:
            duration = time.time() - start_time
            return (filename, False, original_size, 0, False,
                    f"Timeout after {timeout_seconds}s", duration, False, timeout_seconds)

    except subprocess.CalledProcessError as e:
        # Compression failed - copy the original
        if os.path.exists(temp_output):
            os.remove(temp_output)
        try:
            shutil.copy2(input_path, output_path)
            shutil.move(input_path, processed_path)
            moved = True
            duration = time.time() - start_time
            error_msg = e.stderr[:50] if e.stderr else "Compression failed"
            return (filename, True, original_size, original_size, False,
                    f"Failed - used original: {error_msg}", duration, moved, timeout_seconds)
        except Exception:
            duration = time.time() - start_time
            error_msg = e.stderr[:100] if e.stderr else "Ghostscript error"
            return (filename, False, original_size, 0, False,
                    error_msg, duration, False, timeout_seconds)

    except Exception as e:
        if os.path.exists(temp_output):
            os.remove(temp_output)
        duration = time.time() - start_time
        timeout_seconds = calculate_timeout(original_size, seconds_per_mb)
        return (filename, False, original_size, 0, False,
                str(e)[:100], duration, False, timeout_seconds)

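# Sketch of the per-file task tuple consumed by compress_pdf_smart(); the paths
# and settings below are hypothetical examples only - the real tuples are built
# in batch_compress_smart_v2():
#   task = (r"D:\C\report.pdf",            # input_path
#           r"D:\D\report.pdf",            # output_path
#           r"D:\C\processed\report.pdf",  # processed_path
#           "gswin64c",                    # gs_command
#           4,                             # thread_count (Ghostscript rendering threads)
#           6)                             # seconds_per_mb (timeout budget)
#   result = compress_pdf_smart(task)      # -> 9-element result tuple
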
class ProgressMonitor:
    """Real-time progress monitoring (thread-safe counters)."""

    def __init__(self, total_files):
        self.total_files = total_files
        self.completed = 0
        self.successful = 0
        self.failed = 0
        self.moved = 0
        self.compressed_better = 0
        self.original_better = 0
        self.total_original = 0
        self.total_final = 0
        self.start_time = time.time()
        self.lock = threading.Lock()

    def update(self, success, original_size, final_size, compressed_smaller, was_moved):
        with self.lock:
            self.completed += 1
            self.total_original += original_size
            if success:
                self.successful += 1
                self.total_final += final_size
                if was_moved:
                    self.moved += 1
                if compressed_smaller:
                    self.compressed_better += 1
                elif original_size == final_size:
                    self.original_better += 1
            else:
                self.failed += 1

    def get_stats(self):
        with self.lock:
            elapsed = time.time() - self.start_time
            rate = self.completed / elapsed if elapsed > 0 else 0
            eta = (self.total_files - self.completed) / rate if rate > 0 else 0
            return {
                'completed': self.completed,
                'successful': self.successful,
                'failed': self.failed,
                'moved': self.moved,
                'compressed_better': self.compressed_better,
                'original_better': self.original_better,
                'elapsed': elapsed,
                'rate': rate,
                'eta': eta,
                'total_original': self.total_original,
                'total_final': self.total_final
            }

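# Minimal usage sketch for ProgressMonitor (hypothetical numbers, shown only to
# document the update()/get_stats() contract used in the worker loop below):
#   monitor = ProgressMonitor(total_files=100)
#   monitor.update(success=True, original_size=12.0, final_size=4.5,
#                  compressed_smaller=True, was_moved=True)
#   stats = monitor.get_stats()  # stats['compressed_better'] == 1, stats['failed'] == 0
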
def batch_compress_smart_v2(source_folder, dest_folder, max_workers=None, seconds_per_mb=6):
    """Smart batch compression - only keeps the smaller files."""
    source_folder = os.path.abspath(os.path.expanduser(source_folder))
    dest_folder = os.path.abspath(os.path.expanduser(dest_folder))
    processed_folder = os.path.join(source_folder, "processed")
    failed_log = os.path.join(dest_folder, "_failed_files.txt")

    if not os.path.exists(source_folder):
        print(f"❌ Error: Source folder does not exist: {source_folder}")
        return 1

    ensure_folder_exists(dest_folder)
    ensure_folder_exists(processed_folder)

    print("🔍 Scanning for PDF files...")
    pdf_files = find_pdf_files_with_sizes(source_folder, processed_folder)
    if not pdf_files:
        print("✓ All files already processed!")
        return 0

    gs_command = find_ghostscript_command()
    if not gs_command:
        print("❌ Error: Ghostscript not found!")
        print("\nInstall Ghostscript from: https://ghostscript.com/releases/gsdnld.html")
        return 1

    # Count already-processed files
    processed_count = len([f for f in os.listdir(processed_folder)
                           if f.lower().endswith('.pdf')]) if os.path.exists(processed_folder) else 0

    # Determine worker count
    cpu_cores = cpu_count()
    if max_workers is None:
        if platform.system() == "Windows":
            max_workers = min(MAX_WINDOWS_WORKERS, int(cpu_cores * 0.75))
        else:
            max_workers = int(cpu_cores * 0.85)
    else:
        if platform.system() == "Windows" and max_workers > MAX_WINDOWS_WORKERS:
            print(f"⚠️ Adjusting workers from {max_workers} to {MAX_WINDOWS_WORKERS}")
            max_workers = MAX_WINDOWS_WORKERS

    # Ghostscript rendering threads per PDF; max(1, ...) guards against a
    # ZeroDivisionError when max_workers is 1
    threads_per_pdf = max(2, min(8, cpu_cores // max(1, max_workers // 2)))

    # Analyze file sizes
    total_size = sum(s for _, s in pdf_files)
    max_file_size = max(s for _, s in pdf_files)
    max_timeout = calculate_timeout(max_file_size, seconds_per_mb)

    # Print header
    print("\n" + "=" * 80)
    print("  🎯 SMART PDF COMPRESSOR V2 - BEST VERSION WINS")
    print("  Only keeps the compressed file if it's actually smaller!")
    print("=" * 80)
    print(f"\n🖥️ CPU: {cpu_cores} logical cores")
    print(f"🔧 Workers: {max_workers} parallel processes")
    print(f"🧵 Threads/PDF: {threads_per_pdf} per file")
    print(f"⏱️ Timeout: {seconds_per_mb}s per MB (min: 60s, max: 1800s)")
    print(f"📦 Ghostscript: {gs_command}")
    print("\n📁 Folders:")
    print(f"   Source: {source_folder}")
    print(f"   Destination: {dest_folder}")
    print(f"   Processed: {processed_folder}")
    if processed_count > 0:
        print(f"\n✓ Already processed: {processed_count} files")
    print(f"\n📄 Files to process: {len(pdf_files)} PDFs")
    print(f"   Total size: {total_size:.1f} MB ({total_size/1024:.1f} GB)")
    print(f"   Largest file: {max_file_size:.1f} MB (timeout: {max_timeout}s = {max_timeout/60:.1f} min)")
    print("\n📋 Smart Strategy:")
    print("   1. Try to compress each PDF")
    print("   2. Compare: compressed vs original")
    print("   3. Keep whichever is SMALLER")
    print("   4. Move source to 'processed' folder")
    print("   5. Result: Always get the best version!")
    print("\n" + "=" * 80)

    response = input("👉 Continue? (yes/no): ").lower().strip()
    if response not in ['yes', 'y']:
        print("❌ Cancelled")
        return 1

    print("\n🚀 Starting smart compression...\n")

    # Prepare tasks
    tasks = []
    for pdf_file, _ in pdf_files:
        input_path = os.path.join(source_folder, pdf_file)
        output_path = os.path.join(dest_folder, pdf_file)
        processed_path = os.path.join(processed_folder, pdf_file)
        tasks.append((input_path, output_path, processed_path,
                      gs_command, threads_per_pdf, seconds_per_mb))

    monitor = ProgressMonitor(len(pdf_files))
    failed_files = []
    start_time = time.time()

    try:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            future_to_file = {executor.submit(compress_pdf_smart, task): task for task in tasks}

            for future in as_completed(future_to_file):
                (filename, success, original_size, final_size, compressed_smaller,
                 error_msg, duration, was_moved, timeout_used) = future.result()

                monitor.update(success, original_size, final_size, compressed_smaller, was_moved)
                stats = monitor.get_stats()

                if success:
                    if compressed_smaller:
                        reduction = ((original_size - final_size) / original_size) * 100
                        indicator = "✓ COMPRESSED"
                        print(f"✓ [{stats['completed']}/{len(pdf_files)}] {filename[:45]}")
                        print(f"   {original_size:.1f}MB → {final_size:.1f}MB ({reduction:.1f}%↓) [{duration:.1f}s] {indicator}")
                    else:
                        if original_size == final_size:
                            indicator = "= ORIGINAL (better)"
                        else:
                            indicator = "= ORIGINAL (compression failed)"
                        print(f"✓ [{stats['completed']}/{len(pdf_files)}] {filename[:45]}")
                        print(f"   {original_size:.1f}MB (kept original) [{duration:.1f}s] {indicator}")
                        if error_msg:
                            print(f"   Note: {error_msg}")
                else:
                    print(f"✗ [{stats['completed']}/{len(pdf_files)}] {filename[:45]}")
                    print(f"   FAILED ({original_size:.1f}MB): {error_msg} [{duration:.1f}s]")
                    failed_files.append(f"{filename} ({original_size:.1f}MB) - {error_msg}")

                # Stats every 50 files
                if stats['completed'] % 50 == 0:
                    compression_rate = ((stats['compressed_better'] / stats['successful'] * 100)
                                        if stats['successful'] > 0 else 0)
                    print("\n   📊 Progress:")
                    print(f"   Rate: {stats['rate']:.2f} files/sec | ETA: {timedelta(seconds=int(stats['eta']))}")
                    print(f"   Compressed better: {stats['compressed_better']} ({compression_rate:.1f}%)")
                    print(f"   Original better: {stats['original_better']}")
                    print(f"   Failed: {stats['failed']}")
                    if stats['total_original'] > 0:
                        saved_gb = (stats['total_original'] - stats['total_final']) / 1024
                        reduction_pct = ((stats['total_original'] - stats['total_final'])
                                         / stats['total_original']) * 100
                        print(f"   Total saved: {saved_gb:.2f} GB ({reduction_pct:.1f}% reduction)")
                    print()

    except KeyboardInterrupt:
        print("\n\n⚠️ Interrupted by user")
        stats = monitor.get_stats()

    # Save the failed-files log
    if failed_files:
        with open(failed_log, 'w', encoding='utf-8') as f:
            f.write(f"Failed Files Report - {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Total Failed: {len(failed_files)}\n")
            f.write("=" * 80 + "\n\n")
            for line in failed_files:
                f.write(line + "\n")
        print(f"\n📝 Failed files logged to: {failed_log}")

    # Final summary
    elapsed_time = time.time() - start_time
    stats = monitor.get_stats()

    print("\n" + "=" * 80)
    print("🏁 FINAL SUMMARY")
    print("=" * 80)
    print(f"⏱️ Total Time: {timedelta(seconds=int(elapsed_time))}")
    print(f"⚡ Average Rate: {stats['completed'] / elapsed_time:.2f} files/sec")
    print("\n📊 Results:")
    print(f"   Processed: {stats['completed']} files")
    print(f"   ✓ Successful: {stats['successful']}")
    print(f"   ✗ Failed: {stats['failed']}")

    if stats['successful'] > 0:
        compression_success_rate = stats['compressed_better'] / stats['successful'] * 100
        print("\n🎯 Compression Results:")
        print(f"   Compressed better: {stats['compressed_better']} files ({compression_success_rate:.1f}%)")
        print(f"   Original better: {stats['original_better']} files ({100 - compression_success_rate:.1f}%)")

    if stats['total_original'] > 0:
        print("\n💾 Storage:")
        print(f"   Original size: {stats['total_original']/1024:.2f} GB")
        print(f"   Final size: {stats['total_final']/1024:.2f} GB")
        overall = ((stats['total_original'] - stats['total_final']) / stats['total_original']) * 100
        saved = (stats['total_original'] - stats['total_final']) / 1024
        print(f"   Reduction: {overall:.1f}%")
        print(f"   💰 Saved: {saved:.2f} GB")

    print("\n📁 Locations:")
    print(f"   Final files: {dest_folder}")
    print(f"   Processed: {processed_folder}")
    print(f"   Remaining: {source_folder}")

    if stats['failed'] > 0:
        print(f"\n⚠️ {stats['failed']} files failed - see {failed_log}")

    print("=" * 80)
    return 0 if stats['failed'] == 0 else 1

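# Programmatic invocation sketch (hypothetical folders; the CLI in main() below
# is the normal entry point):
#   batch_compress_smart_v2(r"D:\C", r"D:\D", max_workers=60, seconds_per_mb=6)
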
def main():
    """Main entry point."""
    if len(sys.argv) < 3:
        cpu_cores = cpu_count()
        if platform.system() == "Windows":
            recommended = min(MAX_WINDOWS_WORKERS, max(1, int(cpu_cores * 0.75)))
        else:
            recommended = int(cpu_cores * 0.85)
        print("🎯 SMART PDF Compressor V2 - Best Version Wins!")
        print(f"   Detected: {cpu_cores} cores | Recommended: {recommended} workers")
        print("\nUsage:")
        print(f"   python {sys.argv[0]} <source_folder> <dest_folder> [workers] [--timeout-per-mb SECONDS]")
        print("\nExamples:")
        print(r"   python batch_compress_smart_v2.py D:\C D:\D 60")
        print(r"   python batch_compress_smart_v2.py D:\C D:\D 60 --timeout-per-mb 10")
        print("\nKey Feature:")
        print("   • Tries to compress each PDF")
        print("   • Compares compressed vs original")
        print("   • Always keeps the SMALLER version")
        print("   • No more files getting bigger!")
        return 1

    source = sys.argv[1]
    destination = sys.argv[2]

    # Parse optional arguments
    max_workers = None
    seconds_per_mb = 6
    i = 3
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg == '--timeout-per-mb' and i + 1 < len(sys.argv):
            try:
                seconds_per_mb = int(sys.argv[i + 1])
            except ValueError:
                print("Warning: Invalid timeout-per-mb, using default 6")
            i += 2
        else:
            try:
                max_workers = int(arg)
            except ValueError:
                print(f"Warning: Unknown argument '{arg}', ignoring")
            i += 1

    return batch_compress_smart_v2(source, destination, max_workers, seconds_per_mb)


if __name__ == '__main__':
    try:
        sys.exit(main())
    except KeyboardInterrupt:
        print("\n\n❌ Cancelled by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)