Add crawl.py
commit 02d9145707

crawl.py (new file, 443 lines added)

@@ -0,0 +1,443 @@
#!/usr/bin/env python3
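"""
Hybrid scraper for metruyencv: signs in through the backend API with requests,
copies the authenticated cookies into a Selenium-driven Firefox session to read
chapter pages, and packages the scraped chapters into an EPUB with ebooklib.
"""
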
import json
import time
import urllib.parse
from datetime import datetime

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

from ebooklib import epub  # used to assemble the scraped chapters into an EPUB


class HybridAPIScraper:
    def __init__(self):
        self.frontend_url = "https://metruyencv.biz"
        self.backend_url = "https://backend.metruyencv.com"
        self.email = "le.thanh1305@gmail.com"
        self.password = "Lethanh1710"
        self.session = requests.Session()

        # Set up session headers
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'application/json, text/plain, */*',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Origin': self.frontend_url,
            'Referer': self.frontend_url + '/'
        })

    def login_via_api(self):
        """Login via API and establish authenticated session"""
        print("🔐 Logging in via API...")

        try:
            # Get CSRF token from the frontend's XSRF-TOKEN cookie
            response = self.session.get(self.frontend_url)
            csrf_token = self.session.cookies.get('XSRF-TOKEN')
            if csrf_token:
                csrf_token = urllib.parse.unquote(csrf_token)
                print(f"✅ CSRF token obtained: {csrf_token[:50]}...")

            # Prepare login data
            login_data = {
                'email': self.email,
                'password': self.password,
                'device_name': 'Web Browser'
            }

            # Set headers
            headers = {
                'Content-Type': 'application/json',
                'Accept': 'application/json',
                'X-CSRF-TOKEN': csrf_token or '',  # fall back to an empty header if the cookie was not set
                'X-Requested-With': 'XMLHttpRequest',
                'Origin': self.frontend_url,
                'Referer': self.frontend_url + '/'
            }

            # Make login request
            login_url = f"{self.backend_url}/api/auth/login"
            print(f"Making login request to: {login_url}")

            response = self.session.post(login_url, json=login_data, headers=headers, timeout=10)

            print(f"Response status: {response.status_code}")

            if response.status_code == 200:
                print("✅ API login successful!")

                # Try to extract authentication token
                try:
                    json_data = response.json()
                    if 'data' in json_data and 'token' in json_data['data']:
                        token = json_data['data']['token']
                        print(f"✅ Authentication token found: {token[:50]}...")
                        self.session.headers.update({'Authorization': f'Bearer {token}'})
                except Exception:
                    print("✅ Login successful (no token in response)")

                return True
            else:
                print(f"❌ API login failed: {response.status_code}")
                return False

        except Exception as e:
            print(f"❌ Error during API login: {e}")
            return False

    def get_authenticated_cookies(self):
        """Get authenticated cookies from API session"""
        print("🍪 Getting authenticated cookies...")

        try:
            # Get cookies from the authenticated session
            cookies = {}
            for cookie in self.session.cookies:
                cookies[cookie.name] = cookie.value

            print(f"✅ Retrieved {len(cookies)} authenticated cookies")
            return cookies

        except Exception as e:
            print(f"❌ Error getting cookies: {e}")
            return {}
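
    # Design note (inferred from the methods below, not from external docs): the
    # requests.Session above holds the cookies issued by the API login; the
    # Selenium-based methods copy those cookies into the browser so the site
    # treats the page session as logged in, and they fall back to a minimal UI
    # login only when a "Đăng nhập"/"Login" element is still visible.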

    def scrape_with_authenticated_session(self):
        """Scrape content using authenticated session with minimal Selenium"""
        print("🌐 Starting authenticated scraping...")

        # First, login via API
        if not self.login_via_api():
            print("❌ Cannot proceed without successful login")
            return False

        # Get authenticated cookies
        cookies = self.get_authenticated_cookies()

        # Set up Selenium with authenticated cookies
        options = Options()
        options.headless = False
        driver = webdriver.Firefox(options=options)

        try:
            # Navigate to homepage first
            print("Step 1: Navigating to homepage...")
            driver.get(self.frontend_url)
            time.sleep(2)

            # Add authenticated cookies to Selenium
            print("Step 2: Adding authenticated cookies...")
            for name, value in cookies.items():
                driver.add_cookie({
                    'name': name,
                    'value': value,
                    'domain': '.metruyencv.biz'  # Use domain for both frontend and backend
                })

            # Navigate to chapter page
            print("Step 3: Navigating to chapter page...")
            chapter_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than/chuong-1"
            driver.get(chapter_url)
            time.sleep(3)

            # Check if we need to login via UI (minimal interaction)
            print("Step 4: Checking login status...")
            login_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Đăng nhập') or contains(text(), 'Login')]")

            if login_elements:
                print("⚠️ Need minimal UI login...")
                # Perform minimal UI login (only if absolutely necessary)
                self.perform_minimal_ui_login(driver)
                time.sleep(3)

            # Now try to access chapter content
            print("Step 5: Accessing chapter content...")
            content_div = driver.find_elements(By.CSS_SELECTOR, 'div.break-words')

            if content_div:
                content_text = content_div[0].text
                print("✅ Chapter content found!")
                print(f"Content length: {len(content_text)} characters")
                print(f"Content preview: {content_text[:200]}...")

                # Save content to file
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                filename = f"chapter_1_content_{timestamp}.txt"

                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(content_text)

                print(f"✅ Content saved to: {filename}")
                return True
            else:
                print("❌ Chapter content not found")
                return False

        except Exception as e:
            print(f"❌ Error during scraping: {e}")
            return False
        finally:
            driver.quit()

    def perform_minimal_ui_login(self, driver):
        """Perform minimal UI login only if absolutely necessary"""
        print("🔐 Performing minimal UI login...")

        try:
            # Click hamburger menu
            hamburger = driver.find_element(By.CSS_SELECTOR, 'svg.w-7.h-7')
            hamburger.click()
            time.sleep(1)

            # Click login button
            login_btn = driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div[2]/div/div/div/div/div[2]/div[1]/div/div[1]/button')
            login_btn.click()
            time.sleep(1)

            # Fill email
            email_field = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[1]/div[2]/input')
            email_field.clear()
            email_field.send_keys(self.email)
            time.sleep(0.5)

            # Fill password
            password_field = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[2]/div[2]/input')
            password_field.clear()
            password_field.send_keys(self.password)
            time.sleep(0.5)

            # Click submit
            submit_btn = driver.find_element(By.XPATH, '/html/body/div[1]/div[3]/div[2]/div/div/div[2]/div[3]/div[1]/button')
            submit_btn.click()
            time.sleep(3)

            print("✅ Minimal UI login completed!")

        except Exception as e:
            print(f"❌ Error during minimal UI login: {e}")

    def scrape_multiple_chapters(self, start_chapter=1, end_chapter=5):
        """Scrape multiple chapters with minimal UI interaction and create an EPUB file"""
        print(f"📚 Scraping chapters {start_chapter} to {end_chapter}...")

        # First, login via API
        if not self.login_via_api():
            print("❌ Cannot proceed without successful login")
            return False

        # Get authenticated cookies
        cookies = self.get_authenticated_cookies()

        # Set up Selenium
        options = Options()
        options.headless = False
        driver = webdriver.Firefox(options=options)

        chapters = []  # Collect chapters for EPUB
        try:
            # Navigate to homepage and add cookies
            driver.get(self.frontend_url)
            time.sleep(2)

            # Add authenticated cookies
            for name, value in cookies.items():
                driver.add_cookie({
                    'name': name,
                    'value': value,
                    'domain': '.metruyencv.biz'
                })

            # --- Scrape novel metadata from main page ---
            # Example novel URL (should be parameterized in real use)
            novel_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than"
            driver.get(novel_url)
            time.sleep(2)

            # Title
            try:
                title = driver.find_element(By.CSS_SELECTOR, 'h1.mb-2').text
            except Exception:
                title = "Unknown Title"

            # Author
            try:
                author = driver.find_element(By.CSS_SELECTOR, 'a.text-gray-500').text
            except Exception:
                author = "Unknown Author"

            # Status
            try:
                status = driver.find_element(By.CSS_SELECTOR, 'a.inline-flex.border.border-primary.rounded.px-2.py-1.text-primary span').text
            except Exception:
                status = "Unknown Status"

            # Attribute
            try:
                attribute = driver.find_element(By.CSS_SELECTOR, 'a.inline-flex.border.border-rose-700').text
            except Exception:
                attribute = "Unknown Attribute"

            # Cover image
            try:
                image_url = driver.find_element(By.CSS_SELECTOR, 'img.w-44.h-60.shadow-lg.rounded.mx-auto').get_attribute('src')
            except Exception:
                image_url = None

            # Download cover image
            image = None
            if image_url:
                try:
                    # Use session cookies for requests
                    s = requests.Session()
                    for c in driver.get_cookies():
                        s.cookies.set(c['name'], c['value'])
                    resp = s.get(image_url, timeout=10)
                    if resp.status_code == 200:
                        image = resp.content
                except Exception:
                    image = None
            # --- End metadata scraping ---
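
            # Note: the CSS selectors and XPaths used throughout this file are
            # tied to the current metruyencv page layout (an assumption, not a
            # stable API) and will need updating if the site markup changes.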

            # Check if UI login is needed
            login_elements = driver.find_elements(By.XPATH, "//*[contains(text(), 'Đăng nhập') or contains(text(), 'Login')]")
            if login_elements:
                print("⚠️ Performing minimal UI login...")
                self.perform_minimal_ui_login(driver)
                time.sleep(3)

            # Scrape chapters
            successful_chapters = 0
            failed_chapters = []

            for chapter_num in range(start_chapter, end_chapter + 1):
                print(f"\n--- Scraping chapter {chapter_num} ---")

                try:
                    chapter_url = f"{self.frontend_url}/truyen/ta-chi-muon-huy-diet-tong-mon-the-nao-nghich-thien-thanh-than/chuong-{chapter_num}"
                    driver.get(chapter_url)
                    time.sleep(2)

                    # Get chapter title
                    title_element = driver.find_elements(By.CSS_SELECTOR, 'h2.text-center')
                    chap_title = title_element[0].text if title_element else f"Chapter {chapter_num}"

                    # Get chapter content
                    content_div = driver.find_elements(By.CSS_SELECTOR, 'div.break-words')

                    if content_div:
                        content_text = content_div[0].text
                        print(f"✅ Chapter {chapter_num} content found!")
                        print(f"Title: {chap_title}")
                        print(f"Content length: {len(content_text)} characters")

                        # Save chapter as txt (optional, keeps old behavior)
                        filename = f"chapter_{chapter_num:04d}_{chap_title.replace(' ', '_').replace(':', '_')}.txt"
                        with open(filename, 'w', encoding='utf-8') as f:
                            f.write(f"Title: {chap_title}\n\n")
                            f.write(content_text)
                        print(f"✅ Saved to: {filename}")

                        # Collect for EPUB
                        chapters.append({
                            'title': chap_title,
                            'content': content_text,
                            'number': chapter_num
                        })
                        successful_chapters += 1
                    else:
                        print(f"❌ Chapter {chapter_num} content not found")
                        failed_chapters.append(chapter_num)

                except Exception as e:
                    print(f"❌ Error scraping chapter {chapter_num}: {e}")
                    failed_chapters.append(chapter_num)

            print("\n📊 SCRAPING SUMMARY:")
            print(f"  Successful chapters: {successful_chapters}")
            print(f"  Failed chapters: {len(failed_chapters)}")
            if failed_chapters:
                print(f"  Failed chapter numbers: {failed_chapters}")

            # Create EPUB if any chapters were scraped
            if chapters:
                self.create_epub(title, author, status, attribute, image, chapters, start_chapter, end_chapter)

            return successful_chapters > 0

        except Exception as e:
            print(f"❌ Error during chapter scraping: {e}")
            return False
        finally:
            driver.quit()

    def create_epub(self, title, author, status, attribute, image, chapters, start_chapter, end_chapter):
        """Create an EPUB file from scraped chapters"""
        print("📖 Creating EPUB file...")
        book = epub.EpubBook()

        # Set metadata
        book.set_identifier(f"ta-chi-muon-huy-diet-tong-mon-{start_chapter}-{end_chapter}")
        book.set_title(title)
        book.set_language("vi")
        book.add_author(author)
        book.add_metadata(None, 'meta', '', {'name': 'status', 'content': status})
        book.add_metadata(None, 'meta', '', {'name': 'chapter', 'content': str(len(chapters))})
        book.add_metadata(None, 'meta', '', {'name': 'attribute', 'content': attribute})
        if image:
            book.set_cover(content=image, file_name='cover.jpg')

        # Add custom CSS
        style = '''
        body {
            font-family: Cambria, Liberation Serif, Bitstream Vera Serif, Georgia, Times, Times New Roman, serif;
        }
        h1 {
            text-align: left;
            text-transform: uppercase;
            font-weight: 400;
        }
        h2 {
            text-align: left;
            text-transform: uppercase;
            font-weight: 300;
        }
        '''
        nav_css = epub.EpubItem(uid="style_nav", file_name="style/nav.css", media_type="text/css", content=style)
        book.add_item(nav_css)

        epub_chapters = []
        p = 1
        for chap in chapters:
            chap_title = chap['title']
            chap_content = f'<h2>{chap_title}</h2>' + chap['content'].replace("\n", "<br/>")
            if p == 1:
                # Prepend the book title to the first chapter only
                chap_content = f"<h1>{title}</h1>" + chap_content
                p += 1
            file_name = f'chapter{chap["number"]}-{chap_title}.html'
            c = epub.EpubHtml(lang='vi', title=chap_title, file_name=file_name, uid=f'chapter{chap["number"]}')
            c.content = chap_content
            c.add_item(nav_css)  # link the stylesheet so the CSS above actually applies to the chapter
            book.add_item(c)
            epub_chapters.append(c)

        book.spine = [f'chapter{chap["number"]}' for chap in chapters]
        book.toc = tuple(epub_chapters)
        book.add_item(epub.EpubNcx())
        book.add_item(epub.EpubNav())

        epub_filename = f"novel_{start_chapter:04d}_{end_chapter:04d}.epub"
        epub.write_epub(epub_filename, book, {})
        print(f"✅ EPUB created: {epub_filename}")
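
# Usage sketch (the chapter range below is illustrative, not from the original):
#
#   scraper = HybridAPIScraper()
#   scraper.scrape_multiple_chapters(start_chapter=1, end_chapter=50)
#
# main() below runs a single-chapter smoke test first, then chapters 1-3.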


def main():
    """Main function to run hybrid API scraper"""
    scraper = HybridAPIScraper()

    print("🚀 Starting hybrid API scraper...")
    print("=" * 60)

    # Test single chapter first
    print("Testing single chapter access...")
    success = scraper.scrape_with_authenticated_session()

    if success:
        print("\n✅ Single chapter test successful! Testing multiple chapters...")

        # Test multiple chapters
        scraper.scrape_multiple_chapters(1, 3)

    print("=" * 60)
    print("✅ Hybrid API scraper completed!")


if __name__ == "__main__":
    main()