Add crawl.py

commit 02d9145707
Author: thanhtl
Date:   2025-07-25 15:02:32 +07:00

crawl.py Normal file

@@ -0,0 +1,443 @@
#!/usr/bin/env python3
import requests
import html  # used to escape scraped text when building the EPUB pages
import json
import time
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from ebooklib import epub  # EPUB generation
class HybridAPIScraper:
def __init__(self):
self.frontend_url = "https://metruyencv.biz"
self.backend_url = "https://backend.metruyencv.com"
self.email = "le.thanh1305@gmail.com"
self.password = "Lethanh1710"
self.session = requests.Session()
# Set up session headers
self.session.headers.update({
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Origin': self.frontend_url,
'Referer': self.frontend_url + '/'
})
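# Overall approach: authenticate against the backend API with a requests session,
# copy the resulting cookies into a Selenium-driven Firefox instance, scrape the
# rendered chapter pages there, and optionally bundle the chapters into an EPUB.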
def login_via_api(self):
"""Login via API and establish authenticated session"""
print("🔐 Logging in via API...")
try:
# Get CSRF token
response = self.session.get(self.frontend_url)
csrf_token = self.session.cookies.get('XSRF-TOKEN')
if csrf_token:
import urllib.parse
csrf_token = urllib.parse.unquote(csrf_token)
print(f"✅ CSRF token obtained: {csrf_token[:50]}...")
# Prepare login data
login_data = {
'email': self.email,
'password': self.password,
'device_name': 'Web Browser'
}
# Set headers
headers = {
'Content-Type': 'application/json',
'Accept': 'application/json',
'X-CSRF-TOKEN': csrf_token,
'X-Requested-With': 'XMLHttpRequest',
'Origin': self.frontend_url,
'Referer': self.frontend_url + '/'
}
# Make login request
login_url = f"{self.backend_url}/api/auth/login"
print(f"Making login request to: {login_url}")
response = self.session.post(login_url, json=login_data, headers=headers, timeout=10)
print(f"Response status: {response.status_code}")
if response.status_code == 200:
print("✅ API login successful!")
# Try to extract authentication token
try:
json_data = response.json()
if 'data' in json_data and 'token' in json_data['data']:
token = json_data['data']['token']
print(f"✅ Authentication token found: {token[:50]}...")
self.session.headers.update({'Authorization': f'Bearer {token}'})
except Exception:
print("✅ Login successful (no token in response)")
return True
else:
print(f"❌ API login failed: {response.status_code}")
return False
except Exception as e:
print(f"❌ Error during API login: {e}")
return False
def get_authenticated_cookies(self):
"""Get authenticated cookies from API session"""
print("🍪 Getting authenticated cookies...")
try:
# Get cookies from the authenticated session
cookies = {}
for cookie in self.session.cookies:
cookies[cookie.name] = cookie.value
print(f"✅ Retrieved {len(cookies)} authenticated cookies")
return cookies
except Exception as e:
print(f"❌ Error getting cookies: {e}")
return {}
def scrape_with_authenticated_session(self):
"""Scrape content using authenticated session with minimal Selenium"""
print("🌐 Starting authenticated scraping...")
# First, login via API
if not self.login_via_api():
print("❌ Cannot proceed without successful login")
return False
# Get authenticated cookies
cookies = self.get_authenticated_cookies()
# Set up Selenium with authenticated cookies
options = Options()
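# The browser stays visible here (headless left off); note that newer Selenium 4
# releases removed the Options.headless property, so headless mode would instead be
# requested with options.add_argument("-headless") if ever wanted.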
options.headless = False
driver = webdriver.Firefox(options=options)
try:
# Navigate to homepage first
print("Step 1: Navigating to homepage...")
driver.get(self.frontend_url)
time.sleep(2)
# Add authenticated cookies to Selenium
print("Step 2: Adding authenticated cookies...")
for name, value in cookies.items():
driver.add_cookie({
'name': name,
'value': value,
'domain': '.metruyencv.biz'  # frontend cookie domain
})
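# Selenium only accepts cookies matching the domain of the page currently loaded,
# so only frontend (metruyencv.biz) cookies are transferable here; cookies scoped to
# backend.metruyencv.com stay behind in the requests session.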
# Navigate to chapter page
print("Step 3: Navigating to chapter page...")
chapter_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than/chuong-1"
driver.get(chapter_url)
time.sleep(3)
# Check if we need to login via UI (minimal interaction)
print("Step 4: Checking login status...")
login_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Đăng nhập') or contains(text(), 'Login')]")
if login_elements:
print("⚠️ Need minimal UI login...")
# Perform minimal UI login (only if absolutely necessary)
self.perform_minimal_ui_login(driver)
time.sleep(3)
# Now try to access chapter content
print("Step 5: Accessing chapter content...")
content_div = driver.find_elements(By.CSS_SELECTOR, 'div.break-words')
if content_div:
content_text = content_div[0].text
print(f"✅ Chapter content found!")
print(f"Content length: {len(content_text)} characters")
print(f"Content preview: {content_text[:200]}...")
# Save content to file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"chapter_1_content_{timestamp}.txt"
with open(filename, 'w', encoding='utf-8') as f:
f.write(content_text)
print(f"✅ Content saved to: {filename}")
return True
else:
print("❌ Chapter content not found")
return False
except Exception as e:
print(f"❌ Error during scraping: {e}")
return False
finally:
driver.quit()
def perform_minimal_ui_login(self, driver):
"""Perform minimal UI login only if absolutely necessary"""
print("🔐 Performing minimal UI login...")
try:
# Click hamburger menu
hamburger = driver.find_element(By.CSS_SELECTOR, 'svg.w-7.h-7')
hamburger.click()
time.sleep(1)
# Click login button
login_btn = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/button')
login_btn.click()
time.sleep(1)
# Fill email
email_field = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[1]/div[2]/input')
email_field.clear()
email_field.send_keys(self.email)
time.sleep(0.5)
# Fill password
password_field = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[2]/div[2]/input')
password_field.clear()
password_field.send_keys(self.password)
time.sleep(0.5)
# Click submit
submit_btn = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[3]/div[1]/button')
submit_btn.click()
time.sleep(3)
print("✅ Minimal UI login completed!")
except Exception as e:
print(f"❌ Error during minimal UI login: {e}")
def scrape_multiple_chapters(self, start_chapter=1, end_chapter=5):
"""Scrape multiple chapters with minimal UI interaction and create an EPUB file"""
print(f"\U0001F4DA Scraping chapters {start_chapter} to {end_chapter}...")
# First, login via API
if not self.login_via_api():
print("❌ Cannot proceed without successful login")
return False
# Get authenticated cookies
cookies = self.get_authenticated_cookies()
# Set up Selenium
options = Options()
options.headless = False
driver = webdriver.Firefox(options=options)
chapters = [] # Collect chapters for EPUB
try:
# Navigate to homepage and add cookies
driver.get(self.frontend_url)
time.sleep(2)
# Add authenticated cookies
for name, value in cookies.items():
driver.add_cookie({
'name': name,
'value': value,
'domain': '.metruyencv.biz'
})
# --- Scrape novel metadata from main page ---
# Example novel URL (should be parameterized in real use)
novel_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than"
driver.get(novel_url)
time.sleep(2)
# Title
try:
title = driver.find_element(By.CSS_SELECTOR, 'h1.mb-2').text
except Exception:
title = "Unknown Title"
# Author
try:
author = driver.find_element(By.CSS_SELECTOR, 'a.text-gray-500').text
except Exception:
author = "Unknown Author"
# Status
try:
status = driver.find_element(By.CSS_SELECTOR, 'a.inline-flex.border.border-primary.rounded.px-2.py-1.text-primary span').text
except Exception:
status = "Unknown Status"
# Attribute
try:
attribute = driver.find_element(By.CSS_SELECTOR, 'a.inline-flex.border.border-rose-700').text
except Exception:
attribute = "Unknown Attribute"
# Cover image
try:
image_url = driver.find_element(By.CSS_SELECTOR, 'img.w-44.h-60.shadow-lg.rounded.mx-auto').get_attribute('src')
except Exception:
image_url = None
# Download cover image
image = None
if image_url:
try:
# Use session cookies for requests
s = requests.Session()
for c in driver.get_cookies():
s.cookies.set(c['name'], c['value'])
resp = s.get(image_url, timeout=10)
if resp.status_code == 200:
image = resp.content
except Exception:
image = None
# --- End metadata scraping ---
# Check if UI login is needed
login_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Đăng nhập') or contains(text(), 'Login')]")
if login_elements:
print("⚠️ Performing minimal UI login...")
self.perform_minimal_ui_login(driver)
time.sleep(3)
# Scrape chapters
successful_chapters = 0
failed_chapters = []
for chapter_num in range(start_chapter, end_chapter + 1):
print(f"\n--- Scraping chapter {chapter_num} ---")
try:
chapter_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than/chuong-{chapter_num}"
driver.get(chapter_url)
time.sleep(2)
# Get chapter title
title_element = driver.find_elements(By.CSS_SELECTOR, 'h2.text-center')
chap_title = title_element[0].text if title_element else f"Chapter {chapter_num}"
# Get chapter content
content_div = driver.find_elements(By.CSS_SELECTOR, 'div.break-words')
if content_div:
content_text = content_div[0].text
print(f"✅ Chapter {chapter_num} content found!")
print(f"Title: {chap_title}")
print(f"Content length: {len(content_text)} characters")
# Save chapter as txt (optional, keep old behavior)
filename = f"chapter_{chapter_num:04d}_{chap_title.replace(' ', '_').replace(':', '_')}.txt"
with open(filename, 'w', encoding='utf-8') as f:
f.write(f"Title: {chap_title}\n\n")
f.write(content_text)
print(f"✅ Saved to: {filename}")
# Collect for EPUB
chapters.append({
'title': chap_title,
'content': content_text,
'number': chapter_num
})
successful_chapters += 1
else:
print(f"❌ Chapter {chapter_num} content not found")
failed_chapters.append(chapter_num)
except Exception as e:
print(f"❌ Error scraping chapter {chapter_num}: {e}")
failed_chapters.append(chapter_num)
print(f"\n\U0001F4CA SCRAPING SUMMARY:")
print(f" Successful chapters: {successful_chapters}")
print(f" Failed chapters: {len(failed_chapters)}")
if failed_chapters:
print(f" Failed chapters: {failed_chapters}")
# Create EPUB if any chapters were scraped
if chapters:
self.create_epub(title, author, status, attribute, image, chapters, start_chapter, end_chapter)
return successful_chapters > 0
except Exception as e:
print(f"❌ Error during chapter scraping: {e}")
return False
finally:
driver.quit()
def create_epub(self, title, author, status, attribute, image, chapters, start_chapter, end_chapter):
"""Create an EPUB file from scraped chapters"""
print("\U0001F4D6 Creating EPUB file...")
book = epub.EpubBook()
# Set metadata
book.set_identifier(f"ta-chi-muon-huy-diet-tong-mon-{start_chapter}-{end_chapter}")
book.set_title(title)
book.set_language("vi")
book.add_author(author)
book.add_metadata(None, 'meta', '', {'name': 'status', 'content': status})
book.add_metadata(None, 'meta', '', {'name': 'chapter', 'content': str(len(chapters))})
book.add_metadata(None, 'meta', '', {'name': 'attribute', 'content': attribute})
if image:
book.set_cover(content=image, file_name='cover.jpg')
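# Assumption: the downloaded cover bytes are JPEG; set_cover() stores them verbatim,
# so a PNG/WebP cover would end up with a misleading .jpg name.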
# Add custom CSS
style = '''
body {
font-family: Cambria, "Liberation Serif", "Bitstream Vera Serif", Georgia, Times, "Times New Roman", serif;
}
h1 {
text-align: left;
text-transform: uppercase;
font-weight: 400;
}
h2 {
text-align: left;
text-transform: uppercase;
font-weight: 300;
}
'''
nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
book.add_item(nav_css)
epub_chapters = []
p = 1
for chap in chapters:
chap_title = chap['title']
chap_content = f'<h2>{html.escape(chap_title)}</h2>' + html.escape(chap['content']).replace("\n", "<br/>")
if p == 1:
chap_content = f"<h1>{html.escape(title)}</h1>" + chap_content
p += 1
safe_title = chap_title.replace(' ', '_').replace(':', '_')  # keep internal EPUB paths free of spaces
file_name = f'chapter{chap["number"]}-{safe_title}.html'
c = epub.EpubHtml(lang='vi', title=chap_title, file_name=file_name, uid=f'chapter{chap["number"]}')
c.content = chap_content
book.add_item(c)
epub_chapters.append(c)
book.spine = [f'chapter{chap["number"]}' for chap in chapters]
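# ebooklib resolves these uid strings back to the chapter items added above; the
# generated nav page is left out of the linear reading order.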
book.toc = tuple(epub_chapters)
book.add_item(epub.EpubNcx())
book.add_item(epub.EpubNav())
epub_filename = f"novel_{start_chapter:04d}_{end_chapter:04d}.epub"
epub.write_epub(epub_filename, book, {})
print(f"✅ EPUB created: {epub_filename}")
def main():
"""Main function to run hybrid API scraper"""
scraper = HybridAPIScraper()
print("🚀 Starting hybrid API scraper...")
print("=" * 60)
# Test single chapter first
print("Testing single chapter access...")
success = scraper.scrape_with_authenticated_session()
if success:
print("\n✅ Single chapter test successful! Testing multiple chapters...")
# Test multiple chapters
scraper.scrape_multiple_chapters(1, 3)
print("=" * 60)
print("✅ Hybrid API scraper completed!")
if __name__ == "__main__":
main()