# ai_html_document_trainer/data_preprocessor.py
# Last modified: 2025-08-22 16:30:56 +07:00
import pandas as pd
import json
import re
from typing import List, Dict
import os
class DataPreprocessor:
    """Convert scraped Odoo-documentation CSV rows into instruction-tuning samples.

    Pipeline: clean raw text -> split into word chunks -> wrap each chunk in an
    instruction/input/output record -> save as JSON (plus a small sample file).
    """

    def __init__(self):
        # Maximum words per chunk (sized for the Qwen model's context window).
        self.max_length = 2048
        # Words shared between consecutive chunks so context is not lost at cuts.
        self.overlap = 200

    def clean_text(self, text: str) -> str:
        """Clean and normalize text.

        Returns "" for empty or non-string input. The non-string guard matters
        because pandas represents missing CSV cells as float('nan'), which is
        truthy and would otherwise crash on .strip().
        """
        if not isinstance(text, str) or not text:
            return ""
        # Collapse all runs of whitespace into single spaces.
        text = re.sub(r'\s+', ' ', text.strip())
        # Drop special characters but keep word chars and basic punctuation.
        text = re.sub(r'[^\w\s\.\,\!\?\-\:\;\(\)]', '', text)
        return text.strip()

    def chunk_text(self, text: str, title: str = "", language: str = "en") -> List[str]:
        """Split text into overlapping word chunks suitable for training.

        If a title is given it is prepended as a localized context header
        ("id" = Indonesian, anything else = English). Chunks shorter than
        50 words are discarded as not substantial enough to train on.
        """
        if not text:
            return []
        if title:
            if language == "id":
                context = f"Judul: {title}\n\nKonten:\n\n"
            else:
                context = f"Title: {title}\n\nContent:\n\n"
            text = context + text
        words = text.split()
        chunks = []
        # Step by (max_length - overlap) so consecutive chunks share `overlap` words.
        for i in range(0, len(words), self.max_length - self.overlap):
            chunk_words = words[i:i + self.max_length]
            chunk = ' '.join(chunk_words)
            if len(chunk_words) >= 50:  # Only keep substantial chunks
                chunks.append(chunk)
        return chunks

    def create_training_format(self, chunk: str, language: str = "en") -> Dict:
        """Wrap a chunk in an instruction/input/output record for tuning.

        The instruction and output framing are localized ("id" vs default
        English); the input field is a <=500-char preview of the chunk.
        """
        if language == "id":
            instruction = "Jelaskan dan berikan informasi tentang topik berikut berdasarkan dokumentasi Odoo:"
            response_format = f"Berdasarkan dokumentasi Odoo:\n\n{chunk}"
        else:
            instruction = "Explain and provide information about the following topic based on Odoo documentation:"
            response_format = f"Based on Odoo documentation:\n\n{chunk}"
        return {
            "instruction": instruction,
            # Truncate the input preview so the instruction side stays short.
            "input": chunk[:500] + "..." if len(chunk) > 500 else chunk,
            "output": response_format,
            "language": language
        }

    def process_csv_data(self, input_file: str = 'odoo_docs_data.csv') -> List[Dict]:
        """Load the scraped CSV and return a list of training records.

        Missing cells arrive from pandas as NaN floats, so title and language
        are coerced to safe string defaults before use; clean_text() already
        guards the content column.
        """
        if not os.path.exists(input_file):
            print(f"Input file {input_file} not found!")
            return []
        print(f"Loading data from {input_file}")
        df = pd.read_csv(input_file)
        training_data = []
        for _, row in df.iterrows():
            content = self.clean_text(row.get('content', ''))
            title = row.get('title', '')
            if not isinstance(title, str):
                title = ''  # NaN title would otherwise render as the string "nan"
            language = row.get('language', 'en')
            if not isinstance(language, str):
                language = 'en'  # NaN language falls back to English
            if not content:
                continue
            # Create chunks from the content
            chunks = self.chunk_text(content, title, language)
            # Convert each chunk to training format
            for chunk in chunks:
                training_format = self.create_training_format(chunk, language)
                training_data.append(training_format)
        print(f"Processed {len(training_data)} training samples")
        return training_data

    def save_training_data(self, training_data: List[Dict], output_file: str = 'training_data.json'):
        """Write the full dataset to JSON plus a <=100-item *_sample.json file."""
        if not training_data:
            print("No training data to save!")
            return
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(training_data, f, ensure_ascii=False, indent=2)
        print(f"Saved {len(training_data)} samples to {output_file}")
        # Also create a smaller sample for testing
        sample_size = min(100, len(training_data))
        sample_data = training_data[:sample_size]
        sample_file = output_file.replace('.json', '_sample.json')
        with open(sample_file, 'w', encoding='utf-8') as f:
            json.dump(sample_data, f, ensure_ascii=False, indent=2)
        print(f"Saved sample of {sample_size} items to {sample_file}")

    def get_statistics(self, training_data: List[Dict]) -> Dict:
        """Summarize the dataset: counts, per-language split, output lengths.

        Returns {} for an empty dataset (callers must check before indexing).
        """
        if not training_data:
            return {}
        languages = {}
        total_length = 0
        for item in training_data:
            lang = item.get('language', 'unknown')
            languages[lang] = languages.get(lang, 0) + 1
            total_length += len(item.get('output', ''))
        return {
            'total_samples': len(training_data),
            'language_distribution': languages,
            'average_length': total_length / len(training_data),
            'max_length': max(len(item.get('output', '')) for item in training_data),
            'min_length': min(len(item.get('output', '')) for item in training_data)
        }
if __name__ == "__main__":
    preprocessor = DataPreprocessor()
    # Process the scraped data into instruction-tuning samples.
    training_data = preprocessor.process_csv_data()
    if training_data:
        # Persist the full dataset plus the *_sample.json test file.
        preprocessor.save_training_data(training_data)
        # Print summary statistics.
        stats = preprocessor.get_statistics(training_data)
        print("\nTraining Data Statistics:")
        print(f"Total samples: {stats['total_samples']}")
        print(f"Language distribution: {stats['language_distribution']}")
        # BUG FIX: original printed the literal ".2f" (a mangled f-string)
        # instead of the formatted average length.
        print(f"Average length: {stats['average_length']:.2f}")
        print(f"Max length: {stats['max_length']}")
        print(f"Min length: {stats['min_length']}")
    else:
        print("No training data was generated!")