Add crawl.py
commit 02d9145707

crawl.py (new file, 443 lines added)

@@ -0,0 +1,443 @@
#!/usr/bin/env python3
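"""
Hybrid scraper for metruyencv: signs in through the backend API with requests,
copies the authenticated cookies into a Selenium-driven Firefox session to read
chapter pages, and packages the scraped chapters into an EPUB with ebooklib.
"""
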
import json
import time
import urllib.parse
from datetime import datetime

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

from ebooklib import epub  # used to assemble the scraped chapters into an EPUB


class HybridAPIScraper:
    def __init__(self):
        self.frontend_url = "https://metruyencv.biz"
        self.backend_url = "https://backend.metruyencv.com"
        self.email = "le.thanh1305@gmail.com"
        self.password = "Lethanh1710"
        self.session = requests.Session()

        # Set up session headers
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Origin': self.frontend_url,
            'Referer': self.frontend_url + '/'
        })

    def login_via_api(self):
        """Login via API and establish authenticated session"""
        print("🔐 Logging in via API...")

        try:
            # Get CSRF token from the frontend's XSRF-TOKEN cookie
            response = self.session.get(self.frontend_url)
            csrf_token = self.session.cookies.get('XSRF-TOKEN')
            if csrf_token:
                csrf_token = urllib.parse.unquote(csrf_token)
                print(f"✅ CSRF token obtained: {csrf_token[:50]}...")

            # Prepare login data
            login_data = {
                'email': self.email,
                'password': self.password,
                'device_name': 'Web Browser'
            }

            # Set headers
            headers = {
                'Content-Type': 'application/json',
                'Accept': 'application/json',
                'X-CSRF-TOKEN': csrf_token or '',  # fall back to an empty header if the cookie was not set
                'X-Requested-With': 'XMLHttpRequest',
                'Origin': self.frontend_url,
                'Referer': self.frontend_url + '/'
            }

            # Make login request
            login_url = f"{self.backend_url}/api/auth/login"
            print(f"Making login request to: {login_url}")

            response = self.session.post(login_url, json=login_data, headers=headers, timeout=10)

            print(f"Response status: {response.status_code}")

            if response.status_code == 200:
                print("✅ API login successful!")

                # Try to extract authentication token
                try:
                    json_data = response.json()
                    if 'data' in json_data and 'token' in json_data['data']:
                        token = json_data['data']['token']
                        print(f"✅ Authentication token found: {token[:50]}...")
                        self.session.headers.update({'Authorization': f'Bearer {token}'})
                except Exception:
                    print("✅ Login successful (no token in response)")

                return True
            else:
                print(f"❌ API login failed: {response.status_code}")
                return False

        except Exception as e:
            print(f"❌ Error during API login: {e}")
            return False

    def get_authenticated_cookies(self):
        """Get authenticated cookies from API session"""
        print("🍪 Getting authenticated cookies...")

        try:
            # Get cookies from the authenticated session
            cookies = {}
            for cookie in self.session.cookies:
                cookies[cookie.name] = cookie.value

            print(f"✅ Retrieved {len(cookies)} authenticated cookies")
            return cookies

        except Exception as e:
            print(f"❌ Error getting cookies: {e}")
            return {}
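
    # Design note (inferred from the methods below, not from external docs): the
    # requests.Session above holds the cookies issued by the API login; the
    # Selenium-based methods copy those cookies into the browser so the site
    # treats the page session as logged in, and they fall back to a minimal UI
    # login only when a "Đăng nhập"/"Login" element is still visible.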

    def scrape_with_authenticated_session(self):
        """Scrape content using authenticated session with minimal Selenium"""
        print("🌐 Starting authenticated scraping...")

        # First, login via API
        if not self.login_via_api():
            print("❌ Cannot proceed without successful login")
            return False

        # Get authenticated cookies
        cookies = self.get_authenticated_cookies()

        # Set up Selenium with authenticated cookies
        options = Options()
        options.headless = False
        driver = webdriver.Firefox(options=options)

        try:
            # Navigate to homepage first
            print("Step 1: Navigating to homepage...")
            driver.get(self.frontend_url)
            time.sleep(2)

            # Add authenticated cookies to Selenium
            print("Step 2: Adding authenticated cookies...")
            for name, value in cookies.items():
                driver.add_cookie({
                    'name': name,
                    'value': value,
                    'domain': '.metruyencv.biz'  # Use domain for both frontend and backend
                })

            # Navigate to chapter page
            print("Step 3: Navigating to chapter page...")
            chapter_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than/chuong-1"
            driver.get(chapter_url)
            time.sleep(3)

            # Check if we need to login via UI (minimal interaction)
            print("Step 4: Checking login status...")
            login_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Đăng nhập') or contains(text(), 'Login')]")

            if login_elements:
                print("⚠️ Need minimal UI login...")
                # Perform minimal UI login (only if absolutely necessary)
                self.perform_minimal_ui_login(driver)
                time.sleep(3)

            # Now try to access chapter content
            print("Step 5: Accessing chapter content...")
            content_div = driver.find_elements(By.CSS_SELECTOR, 'div.break-words')

            if content_div:
                content_text = content_div[0].text
                print("✅ Chapter content found!")
                print(f"Content length: {len(content_text)} characters")
                print(f"Content preview: {content_text[:200]}...")

                # Save content to file
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                filename = f"chapter_1_content_{timestamp}.txt"

                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(content_text)

                print(f"✅ Content saved to: {filename}")
                return True
            else:
                print("❌ Chapter content not found")
                return False

        except Exception as e:
            print(f"❌ Error during scraping: {e}")
            return False
        finally:
            driver.quit()

    def perform_minimal_ui_login(self, driver):
        """Perform minimal UI login only if absolutely necessary"""
        print("🔐 Performing minimal UI login...")

        try:
            # Click hamburger menu
            hamburger = driver.find_element(By.CSS_SELECTOR, 'svg.w-7.h-7')
            hamburger.click()
            time.sleep(1)

            # Click login button
            login_btn = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/button')
            login_btn.click()
            time.sleep(1)

            # Fill email
            email_field = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[1]/div[2]/input')
            email_field.clear()
            email_field.send_keys(self.email)
            time.sleep(0.5)

            # Fill password
            password_field = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[2]/div[2]/input')
            password_field.clear()
            password_field.send_keys(self.password)
            time.sleep(0.5)

            # Click submit
            submit_btn = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[3]/div[1]/button')
            submit_btn.click()
            time.sleep(3)

            print("✅ Minimal UI login completed!")

        except Exception as e:
            print(f"❌ Error during minimal UI login: {e}")

    def scrape_multiple_chapters(self, start_chapter=1, end_chapter=5):
        """Scrape multiple chapters with minimal UI interaction and create an EPUB file"""
        print(f"📚 Scraping chapters {start_chapter} to {end_chapter}...")

        # First, login via API
        if not self.login_via_api():
            print("❌ Cannot proceed without successful login")
            return False

        # Get authenticated cookies
        cookies = self.get_authenticated_cookies()

        # Set up Selenium
        options = Options()
        options.headless = False
        driver = webdriver.Firefox(options=options)

        chapters = []  # Collect chapters for EPUB
        try:
            # Navigate to homepage and add cookies
            driver.get(self.frontend_url)
            time.sleep(2)

            # Add authenticated cookies
            for name, value in cookies.items():
                driver.add_cookie({
                    'name': name,
                    'value': value,
                    'domain': '.metruyencv.biz'
                })

            # --- Scrape novel metadata from main page ---
            # Example novel URL (should be parameterized in real use)
            novel_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than"
            driver.get(novel_url)
            time.sleep(2)

            # Title
            try:
                title = driver.find_element(By.CSS_SELECTOR, 'h1.mb-2').text
            except Exception:
                title = "Unknown Title"

            # Author
            try:
                author = driver.find_element(By.CSS_SELECTOR, 'a.text-gray-500').text
            except Exception:
                author = "Unknown Author"

            # Status
            try:
                status = driver.find_element(By.CSS_SELECTOR, 'a.inline-flex.border.border-primary.rounded.px-2.py-1.text-primary span').text
            except Exception:
                status = "Unknown Status"

            # Attribute
            try:
                attribute = driver.find_element(By.CSS_SELECTOR, 'a.inline-flex.border.border-rose-700').text
            except Exception:
                attribute = "Unknown Attribute"

            # Cover image
            try:
                image_url = driver.find_element(By.CSS_SELECTOR, 'img.w-44.h-60.shadow-lg.rounded.mx-auto').get_attribute('src')
            except Exception:
                image_url = None

            # Download cover image
            image = None
            if image_url:
                try:
                    # Use session cookies for requests
                    s = requests.Session()
                    for c in driver.get_cookies():
                        s.cookies.set(c['name'], c['value'])
                    resp = s.get(image_url, timeout=10)
                    if resp.status_code == 200:
                        image = resp.content
                except Exception:
                    image = None
            # --- End metadata scraping ---
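
            # Note: the CSS selectors and XPaths used throughout this file are
            # tied to the current metruyencv page layout (an assumption, not a
            # stable API) and will need updating if the site markup changes.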

            # Check if UI login is needed
            login_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Đăng nhập') or contains(text(), 'Login')]")
            if login_elements:
                print("⚠️ Performing minimal UI login...")
                self.perform_minimal_ui_login(driver)
                time.sleep(3)

            # Scrape chapters
            successful_chapters = 0
            failed_chapters = []

            for chapter_num in range(start_chapter, end_chapter + 1):
                print(f"\n--- Scraping chapter {chapter_num} ---")

                try:
                    chapter_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than/chuong-{chapter_num}"
                    driver.get(chapter_url)
                    time.sleep(2)

                    # Get chapter title
                    title_element = driver.find_elements(By.CSS_SELECTOR, 'h2.text-center')
                    chap_title = title_element[0].text if title_element else f"Chapter {chapter_num}"

                    # Get chapter content
                    content_div = driver.find_elements(By.CSS_SELECTOR, 'div.break-words')

                    if content_div:
                        content_text = content_div[0].text
                        print(f"✅ Chapter {chapter_num} content found!")
                        print(f"Title: {chap_title}")
                        print(f"Content length: {len(content_text)} characters")

                        # Save chapter as txt (optional, keeps old behavior)
                        filename = f"chapter_{chapter_num:04d}_{chap_title.replace(' ', '_').replace(':', '_')}.txt"
                        with open(filename, 'w', encoding='utf-8') as f:
                            f.write(f"Title: {chap_title}\n\n")
                            f.write(content_text)
                        print(f"✅ Saved to: {filename}")

                        # Collect for EPUB
                        chapters.append({
                            'title': chap_title,
                            'content': content_text,
                            'number': chapter_num
                        })
                        successful_chapters += 1
                    else:
                        print(f"❌ Chapter {chapter_num} content not found")
                        failed_chapters.append(chapter_num)

                except Exception as e:
                    print(f"❌ Error scraping chapter {chapter_num}: {e}")
                    failed_chapters.append(chapter_num)

            print("\n📊 SCRAPING SUMMARY:")
            print(f"  Successful chapters: {successful_chapters}")
            print(f"  Failed chapters: {len(failed_chapters)}")
            if failed_chapters:
                print(f"  Failed chapter numbers: {failed_chapters}")

            # Create EPUB if any chapters were scraped
            if chapters:
                self.create_epub(title, author, status, attribute, image, chapters, start_chapter, end_chapter)

            return successful_chapters > 0

        except Exception as e:
            print(f"❌ Error during chapter scraping: {e}")
            return False
        finally:
            driver.quit()

    def create_epub(self, title, author, status, attribute, image, chapters, start_chapter, end_chapter):
        """Create an EPUB file from scraped chapters"""
        print("📖 Creating EPUB file...")
        book = epub.EpubBook()

        # Set metadata
        book.set_identifier(f"ta-chi-muon-huy-diet-tong-mon-{start_chapter}-{end_chapter}")
        book.set_title(title)
        book.set_language("vi")
        book.add_author(author)
        book.add_metadata(None, 'meta', '', {'name': 'status', 'content': status})
        book.add_metadata(None, 'meta', '', {'name': 'chapter', 'content': str(len(chapters))})
        book.add_metadata(None, 'meta', '', {'name': 'attribute', 'content': attribute})
        if image:
            book.set_cover(content=image, file_name='cover.jpg')

        # Add custom CSS
        style = '''
        body {
            font-family: Cambria, Liberation Serif, Bitstream Vera Serif, Georgia, Times, Times New Roman, serif;
        }
        h1 {
            text-align: left;
            text-transform: uppercase;
            font-weight: 400;
        }
        h2 {
            text-align: left;
            text-transform: uppercase;
            font-weight: 300;
        }
        '''
        nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
        book.add_item(nav_css)

        epub_chapters = []
        p = 1
        for chap in chapters:
            chap_title = chap['title']
            chap_content = f'<h2>{chap_title}</h2>' + chap['content'].replace("\n", "<br/>")
            if p == 1:
                # Prepend the book title to the first chapter only
                chap_content = f"<h1>{title}</h1>" + chap_content
                p += 1
            file_name = f'chapter{chap["number"]}-{chap_title}.html'
            c = epub.EpubHtml(lang='vi', title=chap_title, file_name=file_name, uid=f'chapter{chap["number"]}')
            c.content = chap_content
            c.add_item(nav_css)  # link the stylesheet so the CSS above actually applies to the chapter
            book.add_item(c)
            epub_chapters.append(c)

        book.spine = [f'chapter{chap["number"]}' for chap in chapters]
        book.toc = tuple(epub_chapters)
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        epub_filename = f"novel_{start_chapter:04d}_{end_chapter:04d}.epub"
        epub.write_epub(epub_filename, book, {})
        print(f"✅ EPUB created: {epub_filename}")
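
# Usage sketch (the chapter range below is illustrative, not from the original):
#
#   scraper = HybridAPIScraper()
#   scraper.scrape_multiple_chapters(start_chapter=1, end_chapter=50)
#
# main() below runs a single-chapter smoke test first, then chapters 1-3.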


def main():
    """Main function to run hybrid API scraper"""
    scraper = HybridAPIScraper()

    print("🚀 Starting hybrid API scraper...")
    print("=" * 60)

    # Test single chapter first
    print("Testing single chapter access...")
    success = scraper.scrape_with_authenticated_session()

    if success:
        print("\n✅ Single chapter test successful! Testing multiple chapters...")

        # Test multiple chapters
        scraper.scrape_multiple_chapters(1, 3)

    print("=" * 60)
    print("✅ Hybrid API scraper completed!")


if __name__ == "__main__":
    main()