ai_html_document_trainer/data_scraper.py
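
"""Scrape the Odoo 18.0 documentation (English and Indonesian), extract the
main text of each page, and save it to CSV/JSON for downstream training."""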
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin


class OdooDocScraper:
    def __init__(self):
        self.base_urls = {
            'en': 'https://www.odoo.com/documentation/18.0/',
            'id': 'https://www.odoo.com/documentation/18.0/id/'
        }
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
    def get_page_content(self, url, lang):
        """Scrape the main text content from a documentation page."""
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Try progressively more generic containers for the main content
            content_selectors = [
                'div.document',
                'div.content',
                'main',
                'article'
            ]
            content_text = ""
            for selector in content_selectors:
                content_div = soup.select_one(selector)
                if content_div:
                    # Remove script and style elements before extracting text
                    for tag in content_div.find_all(["script", "style"]):
                        tag.decompose()
                    text = content_div.get_text(separator=' ', strip=True)
                    if len(text) > 100:  # Only keep substantial content
                        content_text = text
                        break
            return {
                'url': url,
                'language': lang,
                # soup.title.string can be None even when a <title> tag exists
                'title': (soup.title.string or '') if soup.title else '',
                'content': content_text
            }
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            return None
    def get_main_pages(self, lang):
        """Get the main documentation pages to scrape for one language."""
        base_url = self.base_urls[lang]
        main_pages = []
        try:
            response = self.session.get(base_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Look for navigation links, from most to least specific
            nav_selectors = [
                'nav a[href]',
                '.toctree a[href]',
                'ul li a[href]'
            ]
            for selector in nav_selectors:
                for link in soup.select(selector):
                    href = link.get('href')
                    if href and not href.startswith('#') and not href.startswith('mailto:'):
                        full_url = urljoin(base_url, href)
                        # Indonesian URLs share the English prefix, so exclude
                        # them explicitly when collecting English pages
                        if lang == 'en' and full_url.startswith(self.base_urls['id']):
                            continue
                        if full_url.startswith(base_url) and full_url not in main_pages:
                            main_pages.append(full_url)

            # Limit to the first 20 pages per language to avoid overwhelming the server
            return main_pages[:20]
        except Exception as e:
            print(f"Error getting main pages for {lang}: {e}")
            return [base_url]  # Fall back to the base URL
    def scrape_documentation(self):
        """Scrape documentation pages for both languages."""
        all_data = []
        for lang in ['en', 'id']:
            print(f"Scraping {lang} documentation...")
            pages = self.get_main_pages(lang)
            for i, page_url in enumerate(pages):
                print(f"Scraping page {i+1}/{len(pages)}: {page_url}")
                page_data = self.get_page_content(page_url, lang)
                if page_data and page_data['content']:
                    all_data.append(page_data)
                time.sleep(1)  # Be respectful to the server
        return all_data
    def save_data(self, data, output_file='odoo_docs_data.csv'):
        """Save scraped data to CSV and JSON."""
        if not data:
            print("No data to save!")
            return
        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"Saved {len(data)} pages to {output_file}")

        # Also save as JSON for training
        json_file = output_file.replace('.csv', '.json')
        df.to_json(json_file, orient='records', force_ascii=False, indent=2)
        print(f"Also saved as JSON: {json_file}")


if __name__ == "__main__":
    scraper = OdooDocScraper()
    data = scraper.scrape_documentation()
    scraper.save_data(data)
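
# A minimal sketch (an illustration, not part of the original script) of how the
# JSON produced by save_data() could be loaded back for training; it assumes the
# default 'odoo_docs_data.csv' output name, which yields 'odoo_docs_data.json':
#
#     import pandas as pd
#     df = pd.read_json('odoo_docs_data.json', orient='records')
#     # each record carries 'url', 'language' ('en'/'id'), 'title', 'content'
#     english_texts = df[df['language'] == 'en']['content'].tolist()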