adding dataset processor in notebook file

Suherdy Yacob 2025-08-23 07:04:45 +07:00
parent 81c8524809
commit 43d6f0e98a
3 changed files with 356 additions and 1 deletion

@@ -0,0 +1,113 @@
# Dataset Processing from GitHub Repositories
This guide explains how to get and process datasets from GitHub repositories using the provided tools.
## Prerequisites
Make sure you have installed the required dependencies:
```bash
pip install -r requirements.txt
```
## Using the DatasetProcessor Class
The `DatasetProcessor` class in `src/dataset_processor.py` provides comprehensive functionality for processing GitHub repositories into training datasets.
### Example Usage
```python
from src.dataset_processor import DatasetProcessor
from src.config import AppConfig, ModelConfig, TrainingConfig, DatasetConfig, MemoryConfig

# Initialize configuration
config = AppConfig(
    model=ModelConfig(),
    training=TrainingConfig(),
    dataset=DatasetConfig(),
    memory=MemoryConfig()
)

# Initialize dataset processor
processor = DatasetProcessor()

# Process GitHub repositories
repo_urls = [
    "https://github.com/karpathy/nanoGPT.git",
    # Add more repository URLs as needed
]

dataset = processor.process_github_repos(
    repo_urls=repo_urls,
    config=config,
    github_token=None  # Add your token for private repositories
)

print(f"Dataset processed successfully with {len(dataset)} samples")
```
## Using the Example Script
You can run the example script directly:
```bash
python example_dataset_processing.py
```
This will process the example repository and show information about the processed dataset.
## Using in Google Colab
The `ai_trainer_t4_colab.ipynb` notebook includes sections for processing GitHub repositories:
1. Simple repository processing (Section 5)
2. Advanced dataset processing (Section 5.1)
## Supported File Types
The `DatasetProcessor` supports the following file types (a short extension-lookup sketch follows the list):
- Python (.py)
- JavaScript (.js)
- TypeScript (.ts)
- Java (.java)
- C++ (.cpp, .hpp)
- C (.c, .h)
- C# (.cs)
- PHP (.php)
- Ruby (.rb)
- Go (.go)
- Rust (.rs)
- Swift (.swift)
- Kotlin (.kt)
- Scala (.scala)
- SQL (.sql)
- Bash (.sh)
- YAML (.yaml, .yml)
- JSON (.json)
- XML (.xml)
- HTML (.html)
- CSS (.css)
- Markdown (.md)
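
In the notebook's advanced processor (and presumably in `DatasetProcessor` as well), language detection is simply a lookup of the lowercased file extension in an extension-to-language mapping. The sketch below illustrates the idea; the `detect_language` helper and the reduced mapping are for illustration only, not part of the library:

```python
from pathlib import Path

# Hypothetical subset of the extension-to-language mapping; the full set
# corresponds to the file types listed above.
CODE_EXTENSIONS = {
    ".py": "python", ".js": "javascript", ".ts": "typescript",
    ".md": "markdown", ".yaml": "yaml", ".yml": "yaml",
}

def detect_language(path: str) -> str:
    """Return the language label for a file, or 'unknown' if unsupported."""
    return CODE_EXTENSIONS.get(Path(path).suffix.lower(), "unknown")

print(detect_language("src/train.py"))    # -> python
print(detect_language("docs/notes.rst"))  # -> unknown
```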
## Configuration
The dataset processing can be configured through the `DatasetConfig` class:
```python
dataset_config = DatasetConfig(
    min_file_size=10,            # Minimum file size in characters
    max_file_size=10000,         # Maximum file size in characters
    supported_languages=[...],   # List of supported programming languages
    exclude_patterns=[...]       # Patterns to exclude
)
```
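
As a rough illustration of how exclusion works, the notebook's `AdvancedDatasetProcessor` matches each file's repository-relative path against a list of regular expressions; the defaults behind `DatasetConfig.exclude_patterns` may differ, so treat the patterns and the `should_exclude` helper below as an example, not the exact implementation in `src/dataset_processor.py`:

```python
import re

# Example exclusion patterns (taken from the notebook's advanced processor);
# the defaults in src/dataset_processor.py may differ.
exclude_patterns = [
    r"\.git/", r"__pycache__/", r"node_modules/",
    r"\.venv/", r"venv/", r"package-lock\.json$",
    r"\.log$", r"\.tmp$", r"~\$.*", r"\.swp$",
    r"\.DS_Store", r"\.pyc$",
]

def should_exclude(relative_path: str) -> bool:
    """Return True if the relative path matches any exclusion pattern."""
    return any(re.search(pattern, relative_path) for pattern in exclude_patterns)

print(should_exclude("node_modules/lodash/index.js"))  # True
print(should_exclude("src/dataset_processor.py"))      # False
```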
## Output Format
The processed dataset contains the following fields for each sample (a brief inspection sketch follows the list):
- `text`: The content of the code file
- `language`: The programming language detected
- `file_path`: Relative path to the file within the repository
- `repo_name`: Name of the repository
- `file_size`: Size of the file in characters
- `line_count`: Number of lines in the file
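
The processed dataset behaves like a Hugging Face `datasets.Dataset` (the notebook builds one with `Dataset.from_list`, and the example script calls `save_to_disk`), so these fields can be inspected and filtered with the standard `datasets` API. A minimal sketch, assuming a processed `dataset` as in the example above:

```python
# Keep only Python samples and inspect a few of them.
python_only = dataset.filter(lambda sample: sample["language"] == "python")
print(f"{len(python_only)} Python samples out of {len(dataset)} total")

for sample in python_only.select(range(min(3, len(python_only)))):
    print(sample["repo_name"], sample["file_path"], sample["line_count"])
```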

ai_trainer_t4_colab.ipynb

@@ -189,6 +189,170 @@
"# dataset = process_github_repo(\"https://github.com/your-username/your-repo.git\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5.1 Advanced Dataset Processing\n",
"\n",
"For more comprehensive dataset processing with support for multiple file types, you can use this advanced processor:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class AdvancedDatasetProcessor:\n",
" \"\"\"Advanced processor for GitHub repositories with comprehensive file support\"\"\"\n",
" \n",
" # Supported file extensions\n",
" CODE_EXTENSIONS = {\n",
" '.py': 'python', '.js': 'javascript', '.ts': 'typescript',\n",
" '.java': 'java', '.cpp': 'cpp', '.c': 'c', '.cs': 'csharp',\n",
" '.php': 'php', '.rb': 'ruby', '.go': 'go', '.rs': 'rust',\n",
" '.swift': 'swift', '.kt': 'kotlin', '.scala': 'scala',\n",
" '.sql': 'sql', '.sh': 'bash', '.yaml': 'yaml', '.yml': 'yaml',\n",
" '.json': 'json', '.xml': 'xml', '.html': 'html', '.css': 'css',\n",
" '.md': 'markdown'\n",
" }\n",
" \n",
" def __init__(self):\n",
" pass\n",
" \n",
" def process_github_repos(self, repo_urls, max_files_per_repo=50):\n",
" \"\"\"Process multiple GitHub repositories into a training dataset\"\"\"\n",
" all_code_samples = []\n",
" \n",
" for repo_url in repo_urls:\n",
" try:\n",
" print(f\"Processing repository: {repo_url}\")\n",
" repo_samples = self._process_single_repo(repo_url, max_files_per_repo)\n",
" all_code_samples.extend(repo_samples)\n",
" print(f\"Extracted {len(repo_samples)} samples from {repo_url}\")\n",
" except Exception as e:\n",
" print(f\"Failed to process repository {repo_url}: {str(e)}\")\n",
" continue\n",
" \n",
" if not all_code_samples:\n",
" raise ValueError(\"No code samples extracted from any repository\")\n",
" \n",
" print(f\"Total samples collected: {len(all_code_samples)}\")\n",
" \n",
" # Create HuggingFace dataset\n",
" from datasets import Dataset\n",
" dataset = Dataset.from_list(all_code_samples)\n",
" return dataset\n",
" \n",
" def _process_single_repo(self, repo_url, max_files_per_repo):\n",
" \"\"\"Process a single GitHub repository\"\"\"\n",
" import tempfile\n",
" \n",
" with tempfile.TemporaryDirectory() as temp_dir:\n",
" try:\n",
" # Clone repository\n",
" repo_name = repo_url.split('/')[-1].replace('.git', '')\n",
" repo_path = f\"{temp_dir}/{repo_name}\"\n",
" \n",
" print(f\"Cloning {repo_url}...\")\n",
" repo = git.Repo.clone_from(repo_url, repo_path, depth=1)\n",
" \n",
" # Extract code samples\n",
" code_samples = self._extract_code_samples(repo_path, max_files_per_repo)\n",
" \n",
" return code_samples\n",
" \n",
" finally:\n",
" print(f\"Finished processing {repo_url}\")\n",
" \n",
" def _extract_code_samples(self, repo_path, max_files_per_repo):\n",
" \"\"\"Extract code samples from a repository\"\"\"\n",
" code_samples = []\n",
" repo_path_obj = Path(repo_path)\n",
" \n",
" # Find all code files\n",
" code_files = []\n",
" for ext in self.CODE_EXTENSIONS:\n",
" code_files.extend(repo_path_obj.rglob(f'*{ext}'))\n",
" \n",
" print(f\"Found {len(code_files)} code files\")\n",
" \n",
" # Limit files per repo to prevent memory issues\n",
" code_files = code_files[:max_files_per_repo]\n",
" \n",
" for code_file in code_files:\n",
" try:\n",
" if self._should_exclude_file(str(code_file.relative_to(repo_path))):\n",
" continue\n",
" \n",
" sample = self._process_code_file(code_file, repo_path_obj)\n",
" if sample:\n",
" code_samples.append(sample)\n",
" \n",
" except Exception as e:\n",
" print(f\"Failed to process {code_file}: {str(e)}\")\n",
" continue\n",
" \n",
" return code_samples\n",
" \n",
" def _should_exclude_file(self, relative_path):\n",
" \"\"\"Check if a file should be excluded based on patterns\"\"\"\n",
" import re\n",
" exclude_patterns = [\n",
" r'\\.git/', r'__pycache__/', r'node_modules/',\n",
" r'\\.venv/', r'venv/', r'package-lock\\.json$',\n",
" r'\\.log$', r'\\.tmp$', r'~\\$.*', r'\\.swp$',\n",
" r'\\.DS_Store', r'\\.pyc$'\n",
" ]\n",
" for pattern in exclude_patterns:\n",
" if re.search(pattern, relative_path):\n",
" return True\n",
" return False\n",
" \n",
" def _process_code_file(self, file_path, repo_path):\n",
" \"\"\"Process a single code file into a training sample\"\"\"\n",
" try:\n",
" # Read file content\n",
" with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
" content = f.read()\n",
" \n",
" # Skip if file is too small or too large\n",
" if len(content.strip()) < 10:\n",
" return None\n",
" if len(content) > 100000: # Rough limit\n",
" return None\n",
" \n",
" # Get relative path for context\n",
" relative_path = file_path.relative_to(repo_path)\n",
" \n",
" # Determine language\n",
" extension = file_path.suffix.lower()\n",
" language = self.CODE_EXTENSIONS.get(extension, 'unknown')\n",
" \n",
" # Create training sample\n",
" sample = {\n",
" 'text': content,\n",
" 'language': language,\n",
" 'file_path': str(relative_path),\n",
" 'repo_name': repo_path.name,\n",
" 'file_size': len(content),\n",
" 'line_count': len(content.splitlines())\n",
" }\n",
" \n",
" return sample\n",
" \n",
" except Exception as e:\n",
" print(f\"Error processing {file_path}: {str(e)}\")\n",
" return None\n",
"\n",
"# Example usage:\n",
"# processor = AdvancedDatasetProcessor()\n",
"# dataset = processor.process_github_repos([\n",
"# \"https://github.com/karpathy/nanoGPT.git\"\n",
"# ])"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -310,4 +474,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}

example_dataset_processing.py

@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""
Example script demonstrating how to get and process a dataset from GitHub repositories
using the DatasetProcessor class.
"""
import sys
from pathlib import Path
# Add src to path to import our modules
sys.path.append(str(Path(__file__).parent))
from src.dataset_processor import DatasetProcessor
from src.config import AppConfig, ModelConfig, TrainingConfig, DatasetConfig, MemoryConfig


def main():
    # Initialize configuration
    config = AppConfig(
        model=ModelConfig(
            name="unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit",
            max_seq_length=2048
        ),
        training=TrainingConfig(),
        dataset=DatasetConfig(),
        memory=MemoryConfig()
    )

    # Initialize dataset processor
    processor = DatasetProcessor()

    # Example GitHub repositories to process
    # Replace these with your own repositories
    repo_urls = [
        "https://github.com/karpathy/nanoGPT.git",
        # "https://github.com/your-username/your-repo.git"
    ]

    try:
        print("Processing GitHub repositories...")
        dataset = processor.process_github_repos(
            repo_urls=repo_urls,
            config=config,
            github_token=None  # Add your token here if processing private repositories
        )

        print("Dataset processed successfully!")
        print(f"Dataset size: {len(dataset)} samples")

        # Show some examples from the dataset
        print("\nFirst 3 samples from the dataset:")
        for i in range(min(3, len(dataset))):
            sample = dataset[i]
            print(f"\nSample {i+1}:")
            print(f"  Repository: {sample['repo_name']}")
            print(f"  File path: {sample['file_path']}")
            print(f"  Language: {sample['language']}")
            print(f"  File size: {sample['file_size']} characters")
            print(f"  Lines: {sample['line_count']}")

            # Show first 200 characters of the text
            preview_text = sample['text'][:200] + "..." if len(sample['text']) > 200 else sample['text']
            print(f"  Text preview: {preview_text}")

        # Save dataset to disk (optional)
        # dataset.save_to_disk("./processed_dataset")
        # print("\nDataset saved to ./processed_dataset")

        return dataset

    except Exception as e:
        print(f"Error processing repositories: {e}")
        import traceback
        traceback.print_exc()
        return None


if __name__ == "__main__":
    dataset = main()