Add crawl.py

commit 02d9145707
Author: thanhtl
Date:   2025-07-25 15:02:32 +07:00

crawl.py Normal file

@@ -0,0 +1,443 @@
#!/usr/bin/env python3
import requests
import html  # used to escape scraped text when building the EPUB pages
import json
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from ebooklib import epub  # EPUB generation
class HybridAPIScraper:
def __init__(self):
self.frontend_url = "https://metruyencv.biz"
self.backend_url = "https://backend.metruyencv.com"
self.email = "le.thanh1305@gmail.com"
self.password = "Lethanh1710"
self.session = requests.Session()
# Set up session headers
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Origin': self.frontend_url,
'Referer': self.frontend_url + '/'
})
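# Overall approach: authenticate against the backend API with a requests session,
# copy the resulting cookies into a Selenium-driven Firefox instance, scrape the
# rendered chapter pages there, and optionally bundle the chapters into an EPUB.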
def login_via_api(self):
"""Login via API and establish authenticated session"""
print("🔐 Logging in via API...")
try:
# Get CSRF token
response = self.session.get(self.frontend_url)
csrf_token = self.session.cookies.get('XSRF-TOKEN')
if csrf_token:
import urllib.parse
csrf_token = urllib.parse.unquote(csrf_token)
print(f"✅ CSRF token obtained: {csrf_token[:50]}...")
# Prepare login data
login_data = {
'email': self.email,
'password': self.password,
'device_name': 'Web Browser'
}
# Set headers
headers = {
'Content-Type': 'application/json',
'Accept': 'application/json',
'X-CSRF-TOKEN': csrf_token,
'X-Requested-With': 'XMLHttpRequest',
'Origin': self.frontend_url,
'Referer': self.frontend_url + '/'
}
# Make login request
login_url = f"{self.backend_url}/api/auth/login"
print(f"Making login request to: {login_url}")
response = self.session.post(login_url, json=login_data, headers=headers, timeout=10)
print(f"Response status: {response.status_code}")
if response.status_code == 200:
print("✅ API login successful!")
# Try to extract authentication token
try:
json_data = response.json()
if 'data' in json_data and 'token' in json_data['data']:
token = json_data['data']['token']
print(f"✅ Authentication token found: {token[:50]}...")
self.session.headers.update({'Authorization': f'Bearer {token}'})
except Exception:
print("✅ Login successful (no token in response)")
return True
else:
print(f"❌ API login failed: {response.status_code}")
return False
except Exception as e:
print(f"❌ Error during API login: {e}")
return False
def get_authenticated_cookies(self):
"""Get authenticated cookies from API session"""
print("🍪 Getting authenticated cookies...")
try:
# Get cookies from the authenticated session
cookies = {}
for cookie in self.session.cookies:
cookies[cookie.name] = cookie.value
print(f"✅ Retrieved {len(cookies)} authenticated cookies")
return cookies
except Exception as e:
print(f"❌ Error getting cookies: {e}")
return {}
def scrape_with_authenticated_session(self):
"""Scrape content using authenticated session with minimal Selenium"""
print("🌐 Starting authenticated scraping...")
# First, login via API
if not self.login_via_api():
print("❌ Cannot proceed without successful login")
return False
# Get authenticated cookies
cookies = self.get_authenticated_cookies()
# Set up Selenium with authenticated cookies
options = Options()
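# The browser stays visible here (headless left off); note that newer Selenium 4
# releases removed the Options.headless property, so headless mode would instead be
# requested with options.add_argument("-headless") if ever wanted.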
options.headless = False
driver = webdriver.Firefox(options=options)
try:
# Navigate to homepage first
print("Step 1: Navigating to homepage...")
driver.get(self.frontend_url)
time.sleep(2)
# Add authenticated cookies to Selenium
print("Step 2: Adding authenticated cookies...")
for name, value in cookies.items():
driver.add_cookie({
'name': name,
'value': value,
'domain': '.metruyencv.biz'  # frontend cookie domain
})
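# Selenium only accepts cookies matching the domain of the page currently loaded,
# so only frontend (metruyencv.biz) cookies are transferable here; cookies scoped to
# backend.metruyencv.com stay behind in the requests session.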
# Navigate to chapter page
print("Step 3: Navigating to chapter page...")
chapter_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than/chuong-1"
driver.get(chapter_url)
time.sleep(3)
# Check if we need to login via UI (minimal interaction)
print("Step 4: Checking login status...")
login_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Đăng nhập') or contains(text(), 'Login')]")
if login_elements:
print("⚠️ Need minimal UI login...")
# Perform minimal UI login (only if absolutely necessary)
self.perform_minimal_ui_login(driver)
time.sleep(3)
# Now try to access chapter content
print("Step 5: Accessing chapter content...")
content_div = driver.find_elements(By.CSS_SELECTOR, 'div.break-words')
if content_div:
content_text = content_div[0].text
print(f"✅ Chapter content found!")
print(f"Content length: {len(content_text)} characters")
print(f"Content preview: {content_text[:200]}...")
# Save content to file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"chapter_1_content_{timestamp}.txt"
with open(filename, 'w', encoding='utf-8') as f:
f.write(content_text)
print(f"✅ Content saved to: {filename}")
return True
else:
print("❌ Chapter content not found")
return False
except Exception as e:
print(f"❌ Error during scraping: {e}")
return False
finally:
driver.quit()
def perform_minimal_ui_login(self, driver):
"""Perform minimal UI login only if absolutely necessary"""
print("🔐 Performing minimal UI login...")
try:
# Click hamburger menu
hamburger = driver.find_element(By.CSS_SELECTOR, 'svg.w-7.h-7')
hamburger.click()
time.sleep(1)
# Click login button
login_btn = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/button')
login_btn.click()
time.sleep(1)
# Fill email
email_field = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[1]/div[2]/input')
email_field.clear()
email_field.send_keys(self.email)
time.sleep(0.5)
# Fill password
password_field = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[2]/div[2]/input')
password_field.clear()
password_field.send_keys(self.password)
time.sleep(0.5)
# Click submit
submit_btn = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[3]/div[1]/button')
submit_btn.click()
time.sleep(3)
print("✅ Minimal UI login completed!")
except Exception as e:
print(f"❌ Error during minimal UI login: {e}")
def scrape_multiple_chapters(self, start_chapter=1, end_chapter=5):
"""Scrape multiple chapters with minimal UI interaction and create an EPUB file"""
print(f"\U0001F4DA Scraping chapters {start_chapter} to {end_chapter}...")
# First, login via API
if not self.login_via_api():
print("❌ Cannot proceed without successful login")
return False
# Get authenticated cookies
cookies = self.get_authenticated_cookies()
# Set up Selenium
options = Options()
options.headless = False
driver = webdriver.Firefox(options=options)
chapters = [] # Collect chapters for EPUB
try:
# Navigate to homepage and add cookies
driver.get(self.frontend_url)
time.sleep(2)
# Add authenticated cookies
for name, value in cookies.items():
driver.add_cookie({
'name': name,
'value': value,
'domain': '.metruyencv.biz'
})
# --- Scrape novel metadata from main page ---
# Example novel URL (should be parameterized in real use)
novel_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than"
driver.get(novel_url)
time.sleep(2)
# Title
try:
title = driver.find_element(By.CSS_SELECTOR, 'h1.mb-2').text
except Exception:
title = "Unknown Title"
# Author
try:
author = driver.find_element(By.CSS_SELECTOR, 'a.text-gray-500').text
except Exception:
author = "Unknown Author"
# Status
try:
status = driver.find_element(By.CSS_SELECTOR, 'a.inline-flex.border.border-primary.rounded.px-2.py-1.text-primary span').text
except Exception:
status = "Unknown Status"
# Attribute
try:
attribute = driver.find_element(By.CSS_SELECTOR, 'a.inline-flex.border.border-rose-700').text
except Exception:
attribute = "Unknown Attribute"
# Cover image
try:
image_url = driver.find_element(By.CSS_SELECTOR, 'img.w-44.h-60.shadow-lg.rounded.mx-auto').get_attribute('src')
except Exception:
image_url = None
# Download cover image
image = None
if image_url:
try:
# Use session cookies for requests
s = requests.Session()
for c in driver.get_cookies():
s.cookies.set(c['name'], c['value'])
resp = s.get(image_url, timeout=10)
if resp.status_code == 200:
image = resp.content
except Exception:
image = None
# --- End metadata scraping ---
# Check if UI login is needed
login_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Đăng nhập') or contains(text(), 'Login')]")
if login_elements:
print("⚠️ Performing minimal UI login...")
self.perform_minimal_ui_login(driver)
time.sleep(3)
# Scrape chapters
successful_chapters = 0
failed_chapters = []
for chapter_num in range(start_chapter, end_chapter + 1):
print(f"\n--- Scraping chapter {chapter_num} ---")
try:
chapter_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than/chuong-{chapter_num}"
driver.get(chapter_url)
time.sleep(2)
# Get chapter title
title_element = driver.find_elements(By.CSS_SELECTOR, 'h2.text-center')
chap_title = title_element[0].text if title_element else f"Chapter {chapter_num}"
# Get chapter content
content_div = driver.find_elements(By.CSS_SELECTOR, 'div.break-words')
if content_div:
content_text = content_div[0].text
print(f"✅ Chapter {chapter_num} content found!")
print(f"Title: {chap_title}")
print(f"Content length: {len(content_text)} characters")
# Save chapter as txt (optional, keep old behavior)
filename = f"chapter_{chapter_num:04d}_{chap_title.replace(' ', '_').replace(':', '_')}.txt"
with open(filename, 'w', encoding='utf-8') as f:
f.write(f"Title: {chap_title}\n\n")
f.write(content_text)
print(f"✅ Saved to: {filename}")
# Collect for EPUB
chapters.append({
'title': chap_title,
'content': content_text,
'number': chapter_num
})
successful_chapters += 1
else:
print(f"❌ Chapter {chapter_num} content not found")
failed_chapters.append(chapter_num)
except Exception as e:
print(f"❌ Error scraping chapter {chapter_num}: {e}")
failed_chapters.append(chapter_num)
print(f"\n\U0001F4CA SCRAPING SUMMARY:")
print(f" Successful chapters: {successful_chapters}")
print(f" Failed chapters: {len(failed_chapters)}")
if failed_chapters:
print(f" Failed chapters: {failed_chapters}")
# Create EPUB if any chapters were scraped
if chapters:
self.create_epub(title, author, status, attribute, image, chapters, start_chapter, end_chapter)
return successful_chapters > 0
except Exception as e:
print(f"❌ Error during chapter scraping: {e}")
return False
finally:
driver.quit()
def create_epub(self, title, author, status, attribute, image, chapters, start_chapter, end_chapter):
"""Create an EPUB file from scraped chapters"""
print("\U0001F4D6 Creating EPUB file...")
book = epub.EpubBook()
# Set metadata
book.set_identifier(f"ta-chi-muon-huy-diet-tong-mon-{start_chapter}-{end_chapter}")
book.set_title(title)
book.set_language("vi")
book.add_author(author)
book.add_metadata(None, 'meta', '', {'name': 'status', 'content': status})
book.add_metadata(None, 'meta', '', {'name': 'chapter', 'content': str(len(chapters))})
book.add_metadata(None, 'meta', '', {'name': 'attribute', 'content': attribute})
if image:
book.set_cover(content=image, file_name='cover.jpg')
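# Assumption: the downloaded cover bytes are JPEG; set_cover() stores them verbatim,
# so a PNG/WebP cover would end up with a misleading .jpg name.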
# Add custom CSS
style = '''
body {
font-family: Cambria, "Liberation Serif", "Bitstream Vera Serif", Georgia, Times, "Times New Roman", serif;
}
h1 {
text-align: left;
text-transform: uppercase;
font-weight: 400;
}
h2 {
text-align: left;
text-transform: uppercase;
font-weight: 300;
}
'''
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
book.add_item(nav_css)
epub_chapters = []
p = 1
for chap in chapters:
chap_title = chap['title']
chap_content = f'<h2>{html.escape(chap_title)}</h2>' + html.escape(chap['content']).replace("\n", "<br/>")
if p == 1:
chap_content = f"<h1>{html.escape(title)}</h1>" + chap_content
p += 1
safe_title = chap_title.replace(' ', '_').replace(':', '_')  # keep internal EPUB paths free of spaces
file_name = f'chapter{chap["number"]}-{safe_title}.html'
c = epub.EpubHtml(lang='vi', title=chap_title, file_name=file_name, uid=f'chapter{chap["number"]}')
c.content = chap_content
book.add_item(c)
epub_chapters.append(c)
book.spine = [f'chapter{chap["number"]}' for chap in chapters]
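# ebooklib resolves these uid strings back to the chapter items added above; the
# generated nav page is left out of the linear reading order.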
book.toc = tuple(epub_chapters)
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
epub_filename = f"novel_{start_chapter:04d}_{end_chapter:04d}.epub"
epub.write_epub(epub_filename, book, {})
print(f"✅ EPUB created: {epub_filename}")
def main():
"""Main function to run hybrid API scraper"""
scraper = HybridAPIScraper()
print("🚀 Starting hybrid API scraper...")
print("=" * 60)
# Test single chapter first
print("Testing single chapter access...")
success = scraper.scrape_with_authenticated_session()
if success:
print("\n✅ Single chapter test successful! Testing multiple chapters...")
# Test multiple chapters
scraper.scrape_multiple_chapters(1, 3)
print("=" * 60)
print("✅ Hybrid API scraper completed!")
if __name__ == "__main__":
main()