import pandas as pd
import json
import re
from typing import List, Dict
import os


class DataPreprocessor:
    def __init__(self):
        self.max_length = 2048  # Max chunk size in words (roughly matched to a Qwen-class context window)
        self.overlap = 200  # Words of overlap between consecutive chunks

    def clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        if not isinstance(text, str) or not text:  # Guard against NaN/non-string cells
            return ""
        # Collapse runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text.strip())
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s\.\,\!\?\-\:\;\(\)]', '', text)
        return text.strip()

    def chunk_text(self, text: str, title: str = "", language: str = "en") -> List[str]:
        """Split text into overlapping chunks suitable for training"""
        if not text:
            return []

        # Prepend the title as context if available
        if title:
            if language == "id":
                context = f"Judul: {title}\n\nKonten:\n\n"
            else:
                context = f"Title: {title}\n\nContent:\n\n"
            text = context + text

        words = text.split()
        chunks = []
        # Step by (max_length - overlap) so consecutive chunks share `overlap` words
        for i in range(0, len(words), self.max_length - self.overlap):
            chunk_words = words[i:i + self.max_length]
            if len(chunk_words) >= 50:  # Only keep substantial chunks
                chunks.append(' '.join(chunk_words))
        return chunks

    def create_training_format(self, chunk: str, language: str = "en") -> Dict:
        """Format a chunk as an instruction-tuning sample"""
        if language == "id":
            instruction = "Jelaskan dan berikan informasi tentang topik berikut berdasarkan dokumentasi Odoo:"
            response_format = f"Berdasarkan dokumentasi Odoo:\n\n{chunk}"
        else:
            instruction = "Explain and provide information about the following topic based on Odoo documentation:"
            response_format = f"Based on Odoo documentation:\n\n{chunk}"

        return {
            "instruction": instruction,
            # Truncate the input field; the full chunk still lives in the output
            "input": chunk[:500] + "..." if len(chunk) > 500 else chunk,
            "output": response_format,
            "language": language
        }

    def process_csv_data(self, input_file: str = 'odoo_docs_data.csv') -> List[Dict]:
        """Process CSV data and prepare it for training"""
        if not os.path.exists(input_file):
            print(f"Input file {input_file} not found!")
            return []

        print(f"Loading data from {input_file}")
        df = pd.read_csv(input_file).fillna('')  # Treat missing cells as empty strings

        training_data = []
        for _, row in df.iterrows():
            content = self.clean_text(row.get('content', ''))
            title = row.get('title', '')
            language = row.get('language', 'en')

            if not content:
                continue

            # Create chunks from the content
            chunks = self.chunk_text(content, title, language)

            # Convert each chunk to the training format
            for chunk in chunks:
                training_data.append(self.create_training_format(chunk, language))

        print(f"Processed {len(training_data)} training samples")
        return training_data

    def save_training_data(self, training_data: List[Dict], output_file: str = 'training_data.json'):
        """Save processed training data"""
        if not training_data:
            print("No training data to save!")
            return

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(training_data, f, ensure_ascii=False, indent=2)
        print(f"Saved {len(training_data)} samples to {output_file}")

        # Also create a smaller sample file for testing
        sample_size = min(100, len(training_data))
        sample_data = training_data[:sample_size]
        sample_file = output_file.replace('.json', '_sample.json')
        with open(sample_file, 'w', encoding='utf-8') as f:
            json.dump(sample_data, f, ensure_ascii=False, indent=2)
        print(f"Saved sample of {sample_size} items to {sample_file}")

    def get_statistics(self, training_data: List[Dict]) -> Dict:
        """Compute summary statistics for the training data"""
        if not training_data:
            return {}

        languages = {}
        total_length = 0
        for item in training_data:
            lang = item.get('language', 'unknown')
            languages[lang] = languages.get(lang, 0) + 1
            total_length += len(item.get('output', ''))

        return {
            'total_samples': len(training_data),
            'language_distribution': languages,
            'average_length': total_length / len(training_data),
            'max_length': max(len(item.get('output', '')) for item in training_data),
            'min_length': min(len(item.get('output', '')) for item in training_data)
        }


if __name__ == "__main__":
    preprocessor = DataPreprocessor()

    # Process the scraped data
    training_data = preprocessor.process_csv_data()

    if training_data:
        # Save the training data
        preprocessor.save_training_data(training_data)

        # Print statistics
        stats = preprocessor.get_statistics(training_data)
        print("\nTraining Data Statistics:")
        print(f"Total samples: {stats['total_samples']}")
        print(f"Language distribution: {stats['language_distribution']}")
        print(f"Average length: {stats['average_length']:.2f}")
        print(f"Max length: {stats['max_length']}")
        print(f"Min length: {stats['min_length']}")
    else:
        print("No training data was generated!")
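

# A minimal sanity-check sketch (not part of the pipeline above): it exercises
# clean_text -> chunk_text -> create_training_format on an in-memory string so
# the chunking behaviour can be verified without a scraped CSV. The sample text
# below is invented for illustration and is not real Odoo documentation.
# Call _demo_chunking() manually, e.g. from a REPL, to run it.
def _demo_chunking():
    pre = DataPreprocessor()
    raw = ("Odoo   Accounting lets you manage invoices, payments, "
           "and bank reconciliation.  " * 30)  # ~300 words with messy whitespace
    text = pre.clean_text(raw)
    chunks = pre.chunk_text(text, title="Accounting", language="en")
    for chunk in chunks:
        sample = pre.create_training_format(chunk, language="en")
        # Each sample carries the (possibly truncated) chunk as 'input' and the
        # framed full chunk as 'output'.
        print(sample["instruction"])
        print(sample["input"][:80] + " ...")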