ai_html_document_trainer/data_scraper.py
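
"""Scrape the Odoo 18.0 documentation (English and Indonesian), extract the
main text of each page, and save it to CSV/JSON for downstream training."""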
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from urllib.parse import urljoin


class OdooDocScraper:
    def __init__(self):
        self.base_urls = {
            'en': 'https://www.odoo.com/documentation/18.0/',
            'id': 'https://www.odoo.com/documentation/18.0/id/'
        }
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })
    def get_page_content(self, url, lang):
        """Scrape the main text content from a documentation page."""
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Try progressively more generic containers for the main content
            content_selectors = [
                'div.document',
                'div.content',
                'main',
                'article'
            ]
            content_text = ""
            for selector in content_selectors:
                content_div = soup.select_one(selector)
                if content_div:
                    # Remove script and style elements before extracting text
                    for tag in content_div.find_all(["script", "style"]):
                        tag.decompose()
                    text = content_div.get_text(separator=' ', strip=True)
                    if len(text) > 100:  # Only keep substantial content
                        content_text = text
                        break
            return {
                'url': url,
                'language': lang,
                # soup.title.string can be None even when a <title> tag exists
                'title': (soup.title.string or '') if soup.title else '',
                'content': content_text
            }
        except Exception as e:
            print(f"Error scraping {url}: {e}")
            return None
    def get_main_pages(self, lang):
        """Get the main documentation pages to scrape for one language."""
        base_url = self.base_urls[lang]
        main_pages = []
        try:
            response = self.session.get(base_url, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')

            # Look for navigation links, from most to least specific
            nav_selectors = [
                'nav a[href]',
                '.toctree a[href]',
                'ul li a[href]'
            ]
            for selector in nav_selectors:
                for link in soup.select(selector):
                    href = link.get('href')
                    if href and not href.startswith('#') and not href.startswith('mailto:'):
                        full_url = urljoin(base_url, href)
                        # Indonesian URLs share the English prefix, so exclude
                        # them explicitly when collecting English pages
                        if lang == 'en' and full_url.startswith(self.base_urls['id']):
                            continue
                        if full_url.startswith(base_url) and full_url not in main_pages:
                            main_pages.append(full_url)

            # Limit to the first 20 pages per language to avoid overwhelming the server
            return main_pages[:20]
        except Exception as e:
            print(f"Error getting main pages for {lang}: {e}")
            return [base_url]  # Fall back to the base URL
    def scrape_documentation(self):
        """Scrape documentation pages for both languages."""
        all_data = []
        for lang in ['en', 'id']:
            print(f"Scraping {lang} documentation...")
            pages = self.get_main_pages(lang)
            for i, page_url in enumerate(pages):
                print(f"Scraping page {i+1}/{len(pages)}: {page_url}")
                page_data = self.get_page_content(page_url, lang)
                if page_data and page_data['content']:
                    all_data.append(page_data)
                time.sleep(1)  # Be respectful to the server
        return all_data
    def save_data(self, data, output_file='odoo_docs_data.csv'):
        """Save scraped data to CSV and JSON."""
        if not data:
            print("No data to save!")
            return
        df = pd.DataFrame(data)
        df.to_csv(output_file, index=False, encoding='utf-8')
        print(f"Saved {len(data)} pages to {output_file}")

        # Also save as JSON for training
        json_file = output_file.replace('.csv', '.json')
        df.to_json(json_file, orient='records', force_ascii=False, indent=2)
        print(f"Also saved as JSON: {json_file}")


if __name__ == "__main__":
    scraper = OdooDocScraper()
    data = scraper.scrape_documentation()
    scraper.save_data(data)
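
# A minimal sketch (an illustration, not part of the original script) of how the
# JSON produced by save_data() could be loaded back for training; it assumes the
# default 'odoo_docs_data.csv' output name, which yields 'odoo_docs_data.json':
#
#     import pandas as pd
#     df = pd.read_json('odoo_docs_data.json', orient='records')
#     # each record carries 'url', 'language' ('en'/'id'), 'title', 'content'
#     english_texts = df[df['language'] == 'en']['content'].tolist()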