commit 02d9145707afff72e281c57eaa7a85127b2d7c09
Author: thanhtl
Date:   Fri Jul 25 15:02:32 2025 +0700

    Add crawl.py

diff --git a/crawl.py b/crawl.py
new file mode 100644
index 0000000..49e6600
--- /dev/null
+++ b/crawl.py
@@ -0,0 +1,443 @@
+#!/usr/bin/env python3
+import requests
+import json
+import time
+from datetime import datetime
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox.options import Options
+from ebooklib import epub  # <-- Add this import
+
+class HybridAPIScraper:
+    def __init__(self):
+        self.frontend_url = "https://metruyencv.biz"
+        self.backend_url = "https://backend.metruyencv.com"
+        self.email = "le.thanh1305@gmail.com"
+        self.password = "Lethanh1710"
+        self.session = requests.Session()
+
+        # Set up session headers
+        self.session.headers.update({
+            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+            'Accept': 'application/json, text/plain, */*',
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Connection': 'keep-alive',
+            'Origin': self.frontend_url,
+            'Referer': self.frontend_url + '/'
+        })
+
+    def login_via_api(self):
+        """Login via API and establish an authenticated session"""
+        print("πŸ” Logging in via API...")
+
+        try:
+            # Get CSRF token
+            response = self.session.get(self.frontend_url)
+            csrf_token = self.session.cookies.get('XSRF-TOKEN')
+            if csrf_token:
+                import urllib.parse
+                csrf_token = urllib.parse.unquote(csrf_token)
+                print(f"βœ… CSRF token obtained: {csrf_token[:50]}...")
+
+            # Prepare login data
+            login_data = {
+                'email': self.email,
+                'password': self.password,
+                'device_name': 'Web Browser'
+            }
+
+            # Set headers
+            headers = {
+                'Content-Type': 'application/json',
+                'Accept': 'application/json',
+                'X-CSRF-TOKEN': csrf_token,
+                'X-Requested-With': 'XMLHttpRequest',
+                'Origin': self.frontend_url,
+                'Referer': self.frontend_url + '/'
+            }
+
+            # Make login request
+            login_url = f"{self.backend_url}/api/auth/login"
+            print(f"Making login request to: {login_url}")
+
+            response = self.session.post(login_url, json=login_data, headers=headers, timeout=10)
+
+            print(f"Response status: {response.status_code}")
+
+            if response.status_code == 200:
+                print("βœ… API login successful!")
+
+                # Try to extract authentication token
+                try:
+                    json_data = response.json()
+                    if 'data' in json_data and 'token' in json_data['data']:
+                        token = json_data['data']['token']
+                        print(f"βœ… Authentication token found: {token[:50]}...")
+                        self.session.headers.update({'Authorization': f'Bearer {token}'})
+                except Exception:
+                    print("βœ… Login successful (no token in response)")
+
+                return True
+            else:
+                print(f"❌ API login failed: {response.status_code}")
+                return False
+
+        except Exception as e:
+            print(f"❌ Error during API login: {e}")
+            return False
+
+    def get_authenticated_cookies(self):
+        """Get authenticated cookies from the API session"""
+        print("πŸͺ Getting authenticated cookies...")
+
+        try:
+            # Get cookies from the authenticated session
+            cookies = {}
+            for cookie in self.session.cookies:
+                cookies[cookie.name] = cookie.value
+
+            print(f"βœ… Retrieved {len(cookies)} authenticated cookies")
+            return cookies
+
+        except Exception as e:
+            print(f"❌ Error getting cookies: {e}")
+            return {}
+
+    def scrape_with_authenticated_session(self):
+        """Scrape content using the authenticated session with minimal Selenium"""
+        print("🌐 Starting authenticated scraping...")
+
+        # First, login via API
+        if not self.login_via_api():
print("❌ Cannot proceed without successful login") + return False + + # Get authenticated cookies + cookies = self.get_authenticated_cookies() + + # Set up Selenium with authenticated cookies + options = Options() + options.headless = False + driver = webdriver.Firefox(options=options) + + try: + # Navigate to homepage first + print("Step 1: Navigating to homepage...") + driver.get(self.frontend_url) + time.sleep(2) + + # Add authenticated cookies to Selenium + print("Step 2: Adding authenticated cookies...") + for name, value in cookies.items(): + driver.add_cookie({ + 'name': name, + 'value': value, + 'domain': '.metruyencv.biz' # Use domain for both frontend and backend + }) + + # Navigate to chapter page + print("Step 3: Navigating to chapter page...") + chapter_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than/chuong-1" + driver.get(chapter_url) + time.sleep(3) + + # Check if we need to login via UI (minimal interaction) + print("Step 4: Checking login status...") + login_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Đăng nhαΊ­p') or contains(text(), 'Login')]") + + if login_elements: + print("⚠️ Need minimal UI login...") + # Perform minimal UI login (only if absolutely necessary) + self.perform_minimal_ui_login(driver) + time.sleep(3) + + # Now try to access chapter content + print("Step 5: Accessing chapter content...") + content_div = driver.find_elements(By.CSS_SELECTOR, 'div.break-words') + + if content_div: + content_text = content_div[0].text + print(f"βœ… Chapter content found!") + print(f"Content length: {len(content_text)} characters") + print(f"Content preview: {content_text[:200]}...") + + # Save content to file + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename = f"chapter_1_content_{timestamp}.txt" + + with open(filename, 'w', encoding='utf-8') as f: + f.write(content_text) + + print(f"βœ… Content saved to: {filename}") + return True + else: + print("❌ Chapter content not found") + return False + + except Exception as e: + print(f"❌ Error during scraping: {e}") + return False + finally: + driver.quit() + + def perform_minimal_ui_login(self, driver): + """Perform minimal UI login only if absolutely necessary""" + print("πŸ” Performing minimal UI login...") + + try: + # Click hamburger menu + hamburger = driver.find_element(By.CSS_SELECTOR, 'svg.w-7.h-7') + hamburger.click() + time.sleep(1) + + # Click login button + login_btn = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/button') + login_btn.click() + time.sleep(1) + + # Fill email + email_field = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[1]/div[2]/input') + email_field.clear() + email_field.send_keys(self.email) + time.sleep(0.5) + + # Fill password + password_field = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[2]/div[2]/input') + password_field.clear() + password_field.send_keys(self.password) + time.sleep(0.5) + + # Click submit + submit_btn = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[3]/div[1]/button') + submit_btn.click() + time.sleep(3) + + print("βœ… Minimal UI login completed!") + + except Exception as e: + print(f"❌ Error during minimal UI login: {e}") + + def scrape_multiple_chapters(self, start_chapter=1, end_chapter=5): + """Scrape multiple chapters with minimal UI interaction and create an EPUB file""" + print(f"\U0001F4DA Scraping chapters 
+        print(f"\U0001F4DA Scraping chapters {start_chapter} to {end_chapter}...")
+
+        # First, login via API
+        if not self.login_via_api():
+            print("❌ Cannot proceed without successful login")
+            return False
+
+        # Get authenticated cookies
+        cookies = self.get_authenticated_cookies()
+
+        # Set up Selenium
+        options = Options()
+        options.headless = False
+        driver = webdriver.Firefox(options=options)
+
+        chapters = []  # Collect chapters for EPUB
+        try:
+            # Navigate to homepage and add cookies
+            driver.get(self.frontend_url)
+            time.sleep(2)
+
+            # Add authenticated cookies
+            for name, value in cookies.items():
+                driver.add_cookie({
+                    'name': name,
+                    'value': value,
+                    'domain': '.metruyencv.biz'
+                })
+
+            # --- Scrape novel metadata from main page ---
+            # Example novel URL (should be parameterized in real use)
+            novel_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than"
+            driver.get(novel_url)
+            time.sleep(2)
+            # Title
+            try:
+                title = driver.find_element(By.CSS_SELECTOR, 'h1.mb-2').text
+            except Exception:
+                title = "Unknown Title"
+            # Author
+            try:
+                author = driver.find_element(By.CSS_SELECTOR, 'a.text-gray-500').text
+            except Exception:
+                author = "Unknown Author"
+            # Status
+            try:
+                status = driver.find_element(By.CSS_SELECTOR, 'a.inline-flex.border.border-primary.rounded.px-2.py-1.text-primary span').text
+            except Exception:
+                status = "Unknown Status"
+            # Attribute
+            try:
+                attribute = driver.find_element(By.CSS_SELECTOR, 'a.inline-flex.border.border-rose-700').text
+            except Exception:
+                attribute = "Unknown Attribute"
+            # Cover image
+            try:
+                image_url = driver.find_element(By.CSS_SELECTOR, 'img.w-44.h-60.shadow-lg.rounded.mx-auto').get_attribute('src')
+            except Exception:
+                image_url = None
+            # Download cover image
+            image = None
+            if image_url:
+                try:
+                    # Use session cookies for requests
+                    s = requests.Session()
+                    for c in driver.get_cookies():
+                        s.cookies.set(c['name'], c['value'])
+                    resp = s.get(image_url, timeout=10)
+                    if resp.status_code == 200:
+                        image = resp.content
+                except Exception:
+                    image = None
+            # --- End metadata scraping ---
+
+            # Check if UI login is needed
+            login_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Đăng nhαΊ­p') or contains(text(), 'Login')]")
+            if login_elements:
+                print("⚠️ Performing minimal UI login...")
+                self.perform_minimal_ui_login(driver)
+                time.sleep(3)
+
+            # Scrape chapters
+            successful_chapters = 0
+            failed_chapters = []
+
+            for chapter_num in range(start_chapter, end_chapter + 1):
+                print(f"\n--- Scraping chapter {chapter_num} ---")
+
+                try:
+                    chapter_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than/chuong-{chapter_num}"
+                    driver.get(chapter_url)
+                    time.sleep(2)
+
+                    # Get chapter title
+                    title_element = driver.find_elements(By.CSS_SELECTOR, 'h2.text-center')
+                    chap_title = title_element[0].text if title_element else f"Chapter {chapter_num}"
+
+                    # Get chapter content
+                    content_div = driver.find_elements(By.CSS_SELECTOR, 'div.break-words')
+
+                    if content_div:
+                        content_text = content_div[0].text
+                        print(f"βœ… Chapter {chapter_num} content found!")
+                        print(f"Title: {chap_title}")
+                        print(f"Content length: {len(content_text)} characters")
+
+                        # Save chapter as txt (optional, keep old behavior)
+                        filename = f"chapter_{chapter_num:04d}_{chap_title.replace(' ', '_').replace(':', '_')}.txt"
+                        with open(filename, 'w', encoding='utf-8') as f:
+                            f.write(f"Title: {chap_title}\n\n")
+                            f.write(content_text)
+                        print(f"βœ… Saved to: {filename}")
+
+                        # Collect for EPUB
+                        chapters.append({
+                            'title': chap_title,
+                            'content': content_text,
+                            'number': chapter_num
+                        })
+                        successful_chapters += 1
+                    else:
+                        print(f"❌ Chapter {chapter_num} content not found")
+                        failed_chapters.append(chapter_num)
+
+                except Exception as e:
+                    print(f"❌ Error scraping chapter {chapter_num}: {e}")
+                    failed_chapters.append(chapter_num)
+
+            print(f"\n\U0001F4CA SCRAPING SUMMARY:")
+            print(f"   Successful chapters: {successful_chapters}")
+            print(f"   Failed chapters: {len(failed_chapters)}")
+            if failed_chapters:
+                print(f"   Failed chapters: {failed_chapters}")
+
+            # Create EPUB if any chapters were scraped
+            if chapters:
+                self.create_epub(title, author, status, attribute, image, chapters, start_chapter, end_chapter)
+
+            return successful_chapters > 0
+
+        except Exception as e:
+            print(f"❌ Error during chapter scraping: {e}")
+            return False
+        finally:
+            driver.quit()
+
+    def create_epub(self, title, author, status, attribute, image, chapters, start_chapter, end_chapter):
+        """Create an EPUB file from scraped chapters"""
+        print("\U0001F4D6 Creating EPUB file...")
+        book = epub.EpubBook()
+        # Set metadata
+        book.set_identifier(f"ta-chi-muon-huy-diet-tong-mon-{start_chapter}-{end_chapter}")
+        book.set_title(title)
+        book.set_language("vi")
+        book.add_author(author)
+        book.add_metadata(None, 'meta', '', {'name': 'status', 'content': status})
+        book.add_metadata(None, 'meta', '', {'name': 'chapter', 'content': str(len(chapters))})
+        book.add_metadata(None, 'meta', '', {'name': 'attribute', 'content': attribute})
+        if image:
+            book.set_cover(content=image, file_name='cover.jpg')
+        # Add custom CSS
+        style = '''
+        body {
+            font-family: Cambria, Liberation Serif, Bitstream Vera Serif, Georgia, Times, Times New Roman, serif;
+        }
+        h1 {
+            text-align: left;
+            text-transform: uppercase;
+            font-weight: 400;
+        }
+        h2 {
+            text-align: left;
+            text-transform: uppercase;
+            font-weight: 300;
+        }
+        '''
+        nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
+        book.add_item(nav_css)
+        epub_chapters = []
+        p = 1
+        for chap in chapters:
+            chap_title = chap['title']
+            # Build simple HTML for the chapter body
+            chap_content = f'<h2>{chap_title}</h2>' + chap['content'].replace("\n", "<br/>")
+            if p == 1:
+                chap_content = f"<h1>{title}</h1>" + chap_content
" + chap_content + p += 1 + file_name = f'chapter{chap["number"]}-{chap_title}.html' + c = epub.EpubHtml(lang='vi', title=chap_title, file_name=file_name, uid=f'chapter{chap["number"]}') + c.content = chap_content + book.add_item(c) + epub_chapters.append(c) + book.spine = [f'chapter{chap["number"]}' for chap in chapters] + book.toc = tuple(epub_chapters) + book.add_item(epub.EpubNcx()) + book.add_item(epub.EpubNav()) + epub_filename = f"novel_{start_chapter:04d}_{end_chapter:04d}.epub" + epub.write_epub(epub_filename, book, {}) + print(f"βœ… EPUB created: {epub_filename}") + +def main(): + """Main function to run hybrid API scraper""" + scraper = HybridAPIScraper() + + print("πŸš€ Starting hybrid API scraper...") + print("=" * 60) + + # Test single chapter first + print("Testing single chapter access...") + success = scraper.scrape_with_authenticated_session() + + if success: + print("\nβœ… Single chapter test successful! Testing multiple chapters...") + + # Test multiple chapters + scraper.scrape_multiple_chapters(1, 3) + + print("=" * 60) + print("βœ… Hybrid API scraper completed!") + +if __name__ == "__main__": + main() \ No newline at end of file