#!/usr/bin/env python3 import requests import json import time from datetime import datetime from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.firefox.options import Options from ebooklib import epub # <-- Add this import class HybridAPIScraper: def __init__(self): self.frontend_url = "https://metruyencv.biz" self.backend_url = "https://backend.metruyencv.com" self.email = "le.thanh1305@gmail.com" self.password = "Lethanh1710" self.session = requests.Session() # Set up session headers self.session.headers.update({ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Accept': 'application/json, text/plain, */*', 'Accept-Language': 'en-US,en;q=0.9', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive', 'Origin': self.frontend_url, 'Referer': self.frontend_url + '/' }) def login_via_api(self): """Login via API and establish authenticated session""" print("πŸ” Logging in via API...") try: # Get CSRF token response = self.session.get(self.frontend_url) csrf_token = self.session.cookies.get('XSRF-TOKEN') if csrf_token: import urllib.parse csrf_token = urllib.parse.unquote(csrf_token) print(f"βœ… CSRF token obtained: {csrf_token[:50]}...") # Prepare login data login_data = { 'email': self.email, 'password': self.password, 'device_name': 'Web Browser' } # Set headers headers = { 'Content-Type': 'application/json', 'Accept': 'application/json', 'X-CSRF-TOKEN': csrf_token, 'X-Requested-With': 'XMLHttpRequest', 'Origin': self.frontend_url, 'Referer': self.frontend_url + '/' } # Make login request login_url = f"{self.backend_url}/api/auth/login" print(f"Making login request to: {login_url}") response = self.session.post(login_url, json=login_data, headers=headers, timeout=10) print(f"Response status: {response.status_code}") if response.status_code == 200: print("βœ… API login successful!") # Try to extract authentication token try: json_data = response.json() if 'data' in json_data and 'token' in json_data['data']: token = json_data['data']['token'] print(f"βœ… Authentication token found: {token[:50]}...") self.session.headers.update({'Authorization': f'Bearer {token}'}) except: print("βœ… Login successful (no token in response)") return True else: print(f"❌ API login failed: {response.status_code}") return False except Exception as e: print(f"❌ Error during API login: {e}") return False def get_authenticated_cookies(self): """Get authenticated cookies from API session""" print("πŸͺ Getting authenticated cookies...") try: # Get cookies from the authenticated session cookies = {} for cookie in self.session.cookies: cookies[cookie.name] = cookie.value print(f"βœ… Retrieved {len(cookies)} authenticated cookies") return cookies except Exception as e: print(f"❌ Error getting cookies: {e}") return {} def scrape_with_authenticated_session(self): """Scrape content using authenticated session with minimal Selenium""" print("🌐 Starting authenticated scraping...") # First, login via API if not self.login_via_api(): print("❌ Cannot proceed without successful login") return False # Get authenticated cookies cookies = self.get_authenticated_cookies() # Set up Selenium with authenticated cookies options = Options() options.headless = False driver = webdriver.Firefox(options=options) try: # Navigate to homepage first print("Step 1: Navigating to homepage...") driver.get(self.frontend_url) time.sleep(2) # Add authenticated cookies to Selenium print("Step 2: Adding authenticated cookies...") for name, value in cookies.items(): driver.add_cookie({ 'name': name, 'value': value, 'domain': '.metruyencv.biz' # Use domain for both frontend and backend }) # Navigate to chapter page print("Step 3: Navigating to chapter page...") chapter_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than/chuong-1" driver.get(chapter_url) time.sleep(3) # Check if we need to login via UI (minimal interaction) print("Step 4: Checking login status...") login_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Đăng nhαΊ­p') or contains(text(), 'Login')]") if login_elements: print("⚠️ Need minimal UI login...") # Perform minimal UI login (only if absolutely necessary) self.perform_minimal_ui_login(driver) time.sleep(3) # Now try to access chapter content print("Step 5: Accessing chapter content...") content_div = driver.find_elements(By.CSS_SELECTOR, 'div.break-words') if content_div: content_text = content_div[0].text print(f"βœ… Chapter content found!") print(f"Content length: {len(content_text)} characters") print(f"Content preview: {content_text[:200]}...") # Save content to file timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") filename = f"chapter_1_content_{timestamp}.txt" with open(filename, 'w', encoding='utf-8') as f: f.write(content_text) print(f"βœ… Content saved to: {filename}") return True else: print("❌ Chapter content not found") return False except Exception as e: print(f"❌ Error during scraping: {e}") return False finally: driver.quit() def perform_minimal_ui_login(self, driver): """Perform minimal UI login only if absolutely necessary""" print("πŸ” Performing minimal UI login...") try: # Click hamburger menu hamburger = driver.find_element(By.CSS_SELECTOR, 'svg.w-7.h-7') hamburger.click() time.sleep(1) # Click login button login_btn = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/button') login_btn.click() time.sleep(1) # Fill email email_field = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[1]/div[2]/input') email_field.clear() email_field.send_keys(self.email) time.sleep(0.5) # Fill password password_field = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[2]/div[2]/input') password_field.clear() password_field.send_keys(self.password) time.sleep(0.5) # Click submit submit_btn = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[3]/div[1]/button') submit_btn.click() time.sleep(3) print("βœ… Minimal UI login completed!") except Exception as e: print(f"❌ Error during minimal UI login: {e}") def scrape_multiple_chapters(self, start_chapter=1, end_chapter=5): """Scrape multiple chapters with minimal UI interaction and create an EPUB file""" print(f"\U0001F4DA Scraping chapters {start_chapter} to {end_chapter}...") # First, login via API if not self.login_via_api(): print("❌ Cannot proceed without successful login") return False # Get authenticated cookies cookies = self.get_authenticated_cookies() # Set up Selenium options = Options() options.headless = False driver = webdriver.Firefox(options=options) chapters = [] # Collect chapters for EPUB try: # Navigate to homepage and add cookies driver.get(self.frontend_url) time.sleep(2) # Add authenticated cookies for name, value in cookies.items(): driver.add_cookie({ 'name': name, 'value': value, 'domain': '.metruyencv.biz' }) # --- Scrape novel metadata from main page --- # Example novel URL (should be parameterized in real use) novel_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than" driver.get(novel_url) time.sleep(2) # Title try: title = driver.find_element(By.CSS_SELECTOR, 'h1.mb-2').text except Exception: title = "Unknown Title" # Author try: author = driver.find_element(By.CSS_SELECTOR, 'a.text-gray-500').text except Exception: author = "Unknown Author" # Status try: status = driver.find_element(By.CSS_SELECTOR, 'a.inline-flex.border.border-primary.rounded.px-2.py-1.text-primary span').text except Exception: status = "Unknown Status" # Attribute try: attribute = driver.find_element(By.CSS_SELECTOR, 'a.inline-flex.border.border-rose-700').text except Exception: attribute = "Unknown Attribute" # Cover image try: image_url = driver.find_element(By.CSS_SELECTOR, 'img.w-44.h-60.shadow-lg.rounded.mx-auto').get_attribute('src') except Exception: image_url = None # Download cover image image = None if image_url: try: # Use session cookies for requests s = requests.Session() for c in driver.get_cookies(): s.cookies.set(c['name'], c['value']) resp = s.get(image_url, timeout=10) if resp.status_code == 200: image = resp.content except Exception: image = None # --- End metadata scraping --- # Check if UI login is needed login_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Đăng nhαΊ­p') or contains(text(), 'Login')]") if login_elements: print("⚠️ Performing minimal UI login...") self.perform_minimal_ui_login(driver) time.sleep(3) # Scrape chapters successful_chapters = 0 failed_chapters = [] for chapter_num in range(start_chapter, end_chapter + 1): print(f"\n--- Scraping chapter {chapter_num} ---") try: chapter_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than/chuong-{chapter_num}" driver.get(chapter_url) time.sleep(2) # Get chapter title title_element = driver.find_elements(By.CSS_SELECTOR, 'h2.text-center') chap_title = title_element[0].text if title_element else f"Chapter {chapter_num}" # Get chapter content content_div = driver.find_elements(By.CSS_SELECTOR, 'div.break-words') if content_div: content_text = content_div[0].text print(f"βœ… Chapter {chapter_num} content found!") print(f"Title: {chap_title}") print(f"Content length: {len(content_text)} characters") # Save chapter as txt (optional, keep old behavior) filename = f"chapter_{chapter_num:04d}_{chap_title.replace(' ', '_').replace(':', '_')}.txt" with open(filename, 'w', encoding='utf-8') as f: f.write(f"Title: {chap_title}\n\n") f.write(content_text) print(f"βœ… Saved to: {filename}") # Collect for EPUB chapters.append({ 'title': chap_title, 'content': content_text, 'number': chapter_num }) successful_chapters += 1 else: print(f"❌ Chapter {chapter_num} content not found") failed_chapters.append(chapter_num) except Exception as e: print(f"❌ Error scraping chapter {chapter_num}: {e}") failed_chapters.append(chapter_num) print(f"\n\U0001F4CA SCRAPING SUMMARY:") print(f" Successful chapters: {successful_chapters}") print(f" Failed chapters: {len(failed_chapters)}") if failed_chapters: print(f" Failed chapters: {failed_chapters}") # Create EPUB if any chapters were scraped if chapters: self.create_epub(title, author, status, attribute, image, chapters, start_chapter, end_chapter) return successful_chapters > 0 except Exception as e: print(f"❌ Error during chapter scraping: {e}") return False finally: driver.quit() def create_epub(self, title, author, status, attribute, image, chapters, start_chapter, end_chapter): """Create an EPUB file from scraped chapters""" print("\U0001F4D6 Creating EPUB file...") book = epub.EpubBook() # Set metadata book.set_identifier(f"ta-chi-muon-huy-diet-tong-mon-{start_chapter}-{end_chapter}") book.set_title(title) book.set_language("vi") book.add_author(author) book.add_metadata(None, 'meta', '', {'name': 'status', 'content': status}) book.add_metadata(None, 'meta', '', {'name': 'chapter', 'content': str(len(chapters))}) book.add_metadata(None, 'meta', '', {'name': 'attribute', 'content': attribute}) if image: book.set_cover(content=image, file_name='cover.jpg') # Add custom CSS style = ''' body { font-family: Cambria, Liberation Serif, Bitstream Vera Serif, Georgia, Times, Times New Roman, serif; } h1 { text-align: left; text-transform: uppercase; font-weight: 400; } h2 { text-align: left; text-transform: uppercase; font-weight: 300; } ''' nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style) book.add_item(nav_css) epub_chapters = [] p = 1 for chap in chapters: chap_title = chap['title'] chap_content = f'

{chap_title}

' + chap['content'].replace("\n", "
") if p == 1: chap_content = f"

{title}

" + chap_content p += 1 file_name = f'chapter{chap["number"]}-{chap_title}.html' c = epub.EpubHtml(lang='vi', title=chap_title, file_name=file_name, uid=f'chapter{chap["number"]}') c.content = chap_content book.add_item(c) epub_chapters.append(c) book.spine = [f'chapter{chap["number"]}' for chap in chapters] book.toc = tuple(epub_chapters) book.add_item(epub.EpubNcx()) book.add_item(epub.EpubNav()) epub_filename = f"novel_{start_chapter:04d}_{end_chapter:04d}.epub" epub.write_epub(epub_filename, book, {}) print(f"βœ… EPUB created: {epub_filename}") def main(): """Main function to run hybrid API scraper""" scraper = HybridAPIScraper() print("πŸš€ Starting hybrid API scraper...") print("=" * 60) # Test single chapter first print("Testing single chapter access...") success = scraper.scrape_with_authenticated_session() if success: print("\nβœ… Single chapter test successful! Testing multiple chapters...") # Test multiple chapters scraper.scrape_multiple_chapters(1, 3) print("=" * 60) print("βœ… Hybrid API scraper completed!") if __name__ == "__main__": main()