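"""Prepare scraped Odoo documentation for instruction tuning.

Reads scraped pages from a CSV file, cleans and chunks the text, wraps each
chunk in an instruction/input/output record (English or Indonesian), and
writes the result to JSON along with a small sample file and basic statistics.
"""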
import pandas as pd
import json
import re
from typing import List, Dict
import os

class DataPreprocessor:
    def __init__(self):
        # Chunk size and overlap are measured in words; 2048 words is used here
        # as a rough proxy for the Qwen model's context length.
        self.max_length = 2048
        self.overlap = 200  # Word overlap between consecutive chunks

    def clean_text(self, text: str) -> str:
        """Clean and normalize text"""
        # Empty CSV cells come back from pandas as NaN (a float), so guard
        # against non-string values as well as empty strings.
        if not isinstance(text, str) or not text:
            return ""

        # Collapse runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text.strip())

        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s\.\,\!\?\-\:\;\(\)]', '', text)

        return text.strip()

    def chunk_text(self, text: str, title: str = "", language: str = "en") -> List[str]:
        """Split text into chunks suitable for training"""
        if not text:
            return []

        # Add title as context if available
        if title:
            if language == "id":
                context = f"Judul: {title}\n\nKonten:\n\n"
            else:
                context = f"Title: {title}\n\nContent:\n\n"
            text = context + text

        words = text.split()
        chunks = []

        for i in range(0, len(words), self.max_length - self.overlap):
            chunk_words = words[i:i + self.max_length]
            chunk = ' '.join(chunk_words)

            if len(chunk_words) >= 50:  # Only keep substantial chunks
                chunks.append(chunk)

        return chunks

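    # Illustrative stepping (not executed): with max_length=2048 and overlap=200,
    # the loop above starts successive chunks at word indices 0, 1848, 3696, ...,
    # so consecutive chunks share a 200-word overlap.
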
    def create_training_format(self, chunk: str, language: str = "en") -> Dict:
        """Format chunk for instruction tuning"""
        if language == "id":
            # Indonesian instruction: "Explain and provide information about the
            # following topic based on the Odoo documentation:"
            instruction = "Jelaskan dan berikan informasi tentang topik berikut berdasarkan dokumentasi Odoo:"
            response_format = f"Berdasarkan dokumentasi Odoo:\n\n{chunk}"
        else:
            instruction = "Explain and provide information about the following topic based on Odoo documentation:"
            response_format = f"Based on Odoo documentation:\n\n{chunk}"

        return {
            "instruction": instruction,
            # Truncate the input field so the prompt stays short; the full chunk
            # goes into the output.
            "input": (chunk[:500] + "...") if len(chunk) > 500 else chunk,
            "output": response_format,
            "language": language
        }

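    # Illustrative record for a short English chunk (hypothetical chunk text,
    # shortened for brevity):
    # {
    #     "instruction": "Explain and provide information about the following topic based on Odoo documentation:",
    #     "input": "Title: Invoicing\n\nContent:\n\nOdoo Invoicing lets you ...",
    #     "output": "Based on Odoo documentation:\n\nTitle: Invoicing\n\nContent:\n\nOdoo Invoicing lets you ...",
    #     "language": "en"
    # }
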
    def process_csv_data(self, input_file: str = 'odoo_docs_data.csv') -> List[Dict]:
        """Process CSV data and prepare for training"""
        if not os.path.exists(input_file):
            print(f"Input file {input_file} not found!")
            return []

        print(f"Loading data from {input_file}")
        df = pd.read_csv(input_file)

        training_data = []

        for _, row in df.iterrows():
            content = self.clean_text(row.get('content', ''))
            # Missing cells come back as NaN, so fall back to safe defaults
            title = row.get('title', '')
            title = title if isinstance(title, str) else ''
            language = row.get('language', 'en')
            language = language if isinstance(language, str) else 'en'

            if not content:
                continue

            # Create chunks from the content
            chunks = self.chunk_text(content, title, language)

            # Convert each chunk to training format
            for chunk in chunks:
                training_format = self.create_training_format(chunk, language)
                training_data.append(training_format)

        print(f"Processed {len(training_data)} training samples")
        return training_data

    def save_training_data(self, training_data: List[Dict], output_file: str = 'training_data.json'):
        """Save processed training data"""
        if not training_data:
            print("No training data to save!")
            return

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(training_data, f, ensure_ascii=False, indent=2)

        print(f"Saved {len(training_data)} samples to {output_file}")

        # Also create a smaller sample for testing
        sample_size = min(100, len(training_data))
        sample_data = training_data[:sample_size]

        sample_file = output_file.replace('.json', '_sample.json')
        with open(sample_file, 'w', encoding='utf-8') as f:
            json.dump(sample_data, f, ensure_ascii=False, indent=2)

        print(f"Saved sample of {sample_size} items to {sample_file}")

    def get_statistics(self, training_data: List[Dict]) -> Dict:
        """Get statistics about the training data"""
        if not training_data:
            return {}

        languages = {}
        total_length = 0

        for item in training_data:
            lang = item.get('language', 'unknown')
            languages[lang] = languages.get(lang, 0) + 1
            total_length += len(item.get('output', ''))

        return {
            'total_samples': len(training_data),
            'language_distribution': languages,
            'average_length': total_length / len(training_data),
            'max_length': max(len(item.get('output', '')) for item in training_data),
            'min_length': min(len(item.get('output', '')) for item in training_data)
        }

if __name__ == "__main__":
    preprocessor = DataPreprocessor()

    # Process the scraped data
    training_data = preprocessor.process_csv_data()

    if training_data:
        # Save the training data
        preprocessor.save_training_data(training_data)

        # Print statistics
        stats = preprocessor.get_statistics(training_data)
        print("\nTraining Data Statistics:")
        print(f"Total samples: {stats['total_samples']}")
        print(f"Language distribution: {stats['language_distribution']}")
        print(f"Average length: {stats['average_length']:.2f}")
        print(f"Max length: {stats['max_length']}")
        print(f"Min length: {stats['min_length']}")
    else:
        print("No training data was generated!")