"""Scrape Odoo 18.0 documentation (English and Indonesian) and save it as CSV and JSON.

Requires: requests, beautifulsoup4, pandas.
"""

import time
from urllib.parse import urljoin

import requests
import pandas as pd
from bs4 import BeautifulSoup


class OdooDocScraper:
    def __init__(self):
        self.base_urls = {
            'en': 'https://www.odoo.com/documentation/18.0/',
            'id': 'https://www.odoo.com/documentation/18.0/id/',
        }
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def get_page_content(self, url, lang):
        """Scrape content from a documentation page."""
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Try the likeliest containers for the main content, in order.
            content_selectors = ['div.document', 'div.content', 'main', 'article']
            content_text = ""
            for selector in content_selectors:
                content_div = soup.select_one(selector)
                if content_div:
                    # Remove script and style elements before extracting text.
                    for tag in content_div(["script", "style"]):
                        tag.decompose()
                    text = content_div.get_text(separator=' ', strip=True)
                    if len(text) > 100:  # Only keep substantial content.
                        content_text = text
                        break

            return {
                'url': url,
                'language': lang,
                'title': soup.title.string.strip() if soup.title and soup.title.string else '',
                'content': content_text,
            }
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            return None

    def get_main_pages(self, lang):
        """Get main documentation pages to scrape."""
        base_url = self.base_urls[lang]
        # The 'id' tree nests under the 'en' prefix, so a plain startswith()
        # check against the 'en' base would also match Indonesian pages.
        # Exclude any other language's base that is a subtree of this one.
        other_bases = [u for key, u in self.base_urls.items()
                       if key != lang and u.startswith(base_url)]
        main_pages = []
        try:
            response = self.session.get(base_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Collect navigation links from the index page.
            nav_selectors = ['nav a[href]', '.toctree a[href]', 'ul li a[href]']
            for selector in nav_selectors:
                for link in soup.select(selector):
                    href = link.get('href')
                    if not href or href.startswith('#') or href.startswith('mailto:'):
                        continue
                    full_url = urljoin(base_url, href)
                    if (full_url.startswith(base_url)
                            and not any(full_url.startswith(other) for other in other_bases)
                            and full_url not in main_pages):
                        main_pages.append(full_url)

            # Limit to the first 20 pages per language to avoid overwhelming the server.
            return main_pages[:20]
        except Exception as e:
            print(f"Error getting main pages for {lang}: {e}")
            return [base_url]  # Fall back to the base URL.

    def scrape_documentation(self):
        """Scrape documentation in both languages."""
        all_data = []
        for lang in ['en', 'id']:
            print(f"Scraping {lang} documentation...")
            pages = self.get_main_pages(lang)
            for i, page_url in enumerate(pages):
                print(f"Scraping page {i + 1}/{len(pages)}: {page_url}")
                page_data = self.get_page_content(page_url, lang)
                if page_data and page_data['content']:
                    all_data.append(page_data)
                time.sleep(1)  # Be respectful to the server.
        return all_data

    def save_data(self, data, output_file='odoo_docs_data.csv'):
        """Save scraped data to CSV, plus a JSON copy for training."""
        if not data:
            print("No data to save!")
            return
        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"Saved {len(data)} pages to {output_file}")

        # Also save as JSON for training.
        json_file = output_file.replace('.csv', '.json')
        df.to_json(json_file, orient='records', force_ascii=False, indent=2)
        print(f"Also saved as JSON: {json_file}")


if __name__ == "__main__":
    scraper = OdooDocScraper()
    data = scraper.scrape_documentation()
    scraper.save_data(data)
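

# Optional sanity check (illustrative sketch only, not called by the pipeline
# above): save_data() writes the JSON with orient='records', i.e. a list of
# {url, language, title, content} objects, so it loads back as a list of dicts.
# The helper name preview_scraped_json is hypothetical, and the filename
# assumes the default output_file.
def preview_scraped_json(json_file='odoo_docs_data.json', limit=3):
    """Print language and title for the first few scraped records."""
    import json
    with open(json_file, encoding='utf-8') as f:
        records = json.load(f)
    for rec in records[:limit]:
        print(f"[{rec['language']}] {rec['title']}")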