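"""Scrape Odoo 18.0 documentation pages (English and Indonesian) and save the
extracted text to CSV and JSON for later use (e.g. as training data)."""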

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin


class OdooDocScraper:
    def __init__(self):
        # Documentation roots for the two languages being scraped.
        self.base_urls = {
            'en': 'https://www.odoo.com/documentation/18.0/',
            'id': 'https://www.odoo.com/documentation/18.0/id/'
        }
        # Reuse a single session with a browser-like User-Agent for all requests.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def get_page_content(self, url, lang):
        """Scrape content from a documentation page"""
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract main content
            content_selectors = [
                'div.document',
                'div.content',
                'main',
                'article'
            ]

            content_text = ""
            for selector in content_selectors:
                content_div = soup.select_one(selector)
                if content_div:
                    # Remove script and style elements
                    for script in content_div(["script", "style"]):
                        script.decompose()

                    # Extract text
                    text = content_div.get_text(separator=' ', strip=True)
                    if len(text) > 100:  # Only keep substantial content
                        content_text = text
                        break

            return {
                'url': url,
                'language': lang,
                'title': soup.title.string if soup.title else '',
                'content': content_text
            }

        except Exception as e:
            print(f"Error scraping {url}: {e}")
            return None

    def get_main_pages(self, lang):
        """Get main documentation pages to scrape"""
        base_url = self.base_urls[lang]
        main_pages = []

        try:
            response = self.session.get(base_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Look for navigation links
            nav_selectors = [
                'nav a[href]',
                '.toctree a[href]',
                'ul li a[href]'
            ]

            for selector in nav_selectors:
                links = soup.select(selector)
                for link in links:
                    href = link.get('href')
                    if href and not href.startswith('#') and not href.startswith('mailto:'):
                        full_url = urljoin(base_url, href)
                        # The Indonesian base URL shares the English prefix, so skip
                        # '/id/' pages when collecting English ones.
                        if lang == 'en' and full_url.startswith(self.base_urls['id']):
                            continue
                        if full_url.startswith(base_url) and full_url not in main_pages:
                            main_pages.append(full_url)

            # Limit to the first 20 pages per language to avoid overwhelming the site
            return main_pages[:20]

        except Exception as e:
            print(f"Error getting main pages for {lang}: {e}")
            return [base_url]  # Fall back to the base URL

    def scrape_documentation(self):
        """Scrape documentation from both languages"""
        all_data = []

        for lang in ['en', 'id']:
            print(f"Scraping {lang} documentation...")
            pages = self.get_main_pages(lang)

            for i, page_url in enumerate(pages):
                print(f"Scraping page {i+1}/{len(pages)}: {page_url}")
                page_data = self.get_page_content(page_url, lang)
                if page_data and page_data['content']:
                    all_data.append(page_data)
                time.sleep(1)  # Be respectful to the server

        return all_data

    def save_data(self, data, output_file='odoo_docs_data.csv'):
        """Save scraped data to CSV and JSON"""
        if not data:
            print("No data to save!")
            return

        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"Saved {len(data)} pages to {output_file}")

        # Also save as JSON for training
        json_file = output_file.replace('.csv', '.json')
        df.to_json(json_file, orient='records', force_ascii=False, indent=2)
        print(f"Also saved as JSON: {json_file}")


if __name__ == "__main__":
    scraper = OdooDocScraper()
    data = scraper.scrape_documentation()
    scraper.save_data(data)
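
# A minimal sketch (not part of the scraper itself) of how the saved output
# could be read back for downstream use, assuming the default file names above:
#
#   import pandas as pd
#   df = pd.read_json('odoo_docs_data.json')
#   english = df[df['language'] == 'en']
#   indonesian = df[df['language'] == 'id']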