adding dataset processor in notebook file
This commit is contained in:
parent
81c8524809
commit
43d6f0e98a
113
README_DATASET_PROCESSING.md
Normal file
@ -0,0 +1,113 @@
# Dataset Processing from GitHub Repositories

This guide explains how to get and process datasets from GitHub repositories using the provided tools.

## Prerequisites

Make sure you have installed the required dependencies:

```bash
pip install -r requirements.txt
```

## Using the DatasetProcessor Class

The `DatasetProcessor` class in `src/dataset_processor.py` provides comprehensive functionality for processing GitHub repositories into training datasets.

### Example Usage

```python
from src.dataset_processor import DatasetProcessor
from src.config import AppConfig, ModelConfig, TrainingConfig, DatasetConfig, MemoryConfig

# Initialize configuration
config = AppConfig(
    model=ModelConfig(),
    training=TrainingConfig(),
    dataset=DatasetConfig(),
    memory=MemoryConfig()
)

# Initialize dataset processor
processor = DatasetProcessor()

# Process GitHub repositories
repo_urls = [
    "https://github.com/karpathy/nanoGPT.git",
    # Add more repository URLs as needed
]

dataset = processor.process_github_repos(
    repo_urls=repo_urls,
    config=config,
    github_token=None  # Add your token for private repositories
)

print(f"Dataset processed successfully with {len(dataset)} samples")
```
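
To avoid re-cloning the repositories on every run, the returned dataset can be written to disk and reloaded with the standard `datasets` API (the save call also appears, commented out, in `example_dataset_processing.py`):

```python
from datasets import load_from_disk

# Persist the processed dataset for later training runs
dataset.save_to_disk("./processed_dataset")

# ...and reload it later without re-processing the repositories
dataset = load_from_disk("./processed_dataset")
```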

## Using the Example Script

You can run the example script directly:

```bash
python example_dataset_processing.py
```

This will process the example repository and show information about the processed dataset.

## Using in Google Colab

The `ai_trainer_t4_colab.ipynb` notebook includes sections for processing GitHub repositories (a minimal call for Section 5 is sketched below):

1. Simple repository processing (Section 5)
2. Advanced dataset processing (Section 5.1)
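
For the simple path (Section 5), the notebook's commented example reduces to a single call to its `process_github_repo` helper. A rough sketch, assuming the helper defined in that cell (its exact signature may differ):

```python
# Sketch only: process_github_repo is the helper defined in Section 5 of the notebook
dataset = process_github_repo("https://github.com/karpathy/nanoGPT.git")
print(f"Processed {len(dataset)} samples")
```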

## Supported File Types

The `DatasetProcessor` supports the following file types (a short language-detection sketch follows the list):

- Python (.py)
- JavaScript (.js)
- TypeScript (.ts)
- Java (.java)
- C++ (.cpp, .hpp)
- C (.c, .h)
- C# (.cs)
- PHP (.php)
- Ruby (.rb)
- Go (.go)
- Rust (.rs)
- Swift (.swift)
- Kotlin (.kt)
- Scala (.scala)
- SQL (.sql)
- Bash (.sh)
- YAML (.yaml, .yml)
- JSON (.json)
- XML (.xml)
- HTML (.html)
- CSS (.css)
- Markdown (.md)
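
Language detection is a plain extension lookup; the sketch below mirrors a subset of the `CODE_EXTENSIONS` mapping used by the notebook's `AdvancedDatasetProcessor`, with unknown extensions falling back to `"unknown"`:

```python
from pathlib import Path

# Subset of the extension-to-language mapping used in the notebook's
# AdvancedDatasetProcessor (CODE_EXTENSIONS); unknown extensions map to "unknown".
CODE_EXTENSIONS = {
    ".py": "python", ".js": "javascript", ".ts": "typescript",
    ".yaml": "yaml", ".yml": "yaml", ".md": "markdown",
}

def detect_language(path: str) -> str:
    return CODE_EXTENSIONS.get(Path(path).suffix.lower(), "unknown")

print(detect_language("src/dataset_processor.py"))  # -> python
```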

## Configuration

The dataset processing can be configured through the `DatasetConfig` class:

```python
dataset_config = DatasetConfig(
    min_file_size=10,           # Minimum file size in characters
    max_file_size=10000,        # Maximum file size in characters
    supported_languages=[...],  # List of supported programming languages
    exclude_patterns=[...]      # Patterns to exclude
)
```
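
The customized `dataset_config` then replaces the default `DatasetConfig()` shown in the Example Usage section:

```python
config = AppConfig(
    model=ModelConfig(),
    training=TrainingConfig(),
    dataset=dataset_config,  # use the customized dataset settings
    memory=MemoryConfig()
)
```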

## Output Format

The processed dataset contains the following fields for each sample:

- `text`: The content of the code file
- `language`: The programming language detected
- `file_path`: Relative path to the file within the repository
- `repo_name`: Name of the repository
- `file_size`: Size of the file in characters
- `line_count`: Number of lines in the file
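
The notebook processor builds the result with `datasets.Dataset.from_list`, so the usual Hugging Face dataset operations apply. A small sketch for inspecting a sample and keeping only one language:

```python
# Inspect the metadata fields of the first sample
sample = dataset[0]
print(sample["repo_name"], sample["file_path"], sample["language"])

# Keep only Python files using datasets.Dataset.filter
python_only = dataset.filter(lambda s: s["language"] == "python")
print(f"{len(python_only)} Python samples out of {len(dataset)}")
```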
@ -189,6 +189,170 @@
"# dataset = process_github_repo(\"https://github.com/your-username/your-repo.git\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5.1 Advanced Dataset Processing\n",
"\n",
"For more comprehensive dataset processing with support for multiple file types, you can use this advanced processor:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# git (GitPython) and Path are used below; these imports are harmless\n",
"# if an earlier cell has already imported them.\n",
"import git\n",
"from pathlib import Path\n",
"\n",
"class AdvancedDatasetProcessor:\n",
"    \"\"\"Advanced processor for GitHub repositories with comprehensive file support\"\"\"\n",
"\n",
"    # Supported file extensions\n",
"    CODE_EXTENSIONS = {\n",
"        '.py': 'python', '.js': 'javascript', '.ts': 'typescript',\n",
"        '.java': 'java', '.cpp': 'cpp', '.c': 'c', '.cs': 'csharp',\n",
"        '.php': 'php', '.rb': 'ruby', '.go': 'go', '.rs': 'rust',\n",
"        '.swift': 'swift', '.kt': 'kotlin', '.scala': 'scala',\n",
"        '.sql': 'sql', '.sh': 'bash', '.yaml': 'yaml', '.yml': 'yaml',\n",
"        '.json': 'json', '.xml': 'xml', '.html': 'html', '.css': 'css',\n",
"        '.md': 'markdown'\n",
"    }\n",
"\n",
"    def __init__(self):\n",
"        pass\n",
"\n",
"    def process_github_repos(self, repo_urls, max_files_per_repo=50):\n",
"        \"\"\"Process multiple GitHub repositories into a training dataset\"\"\"\n",
"        all_code_samples = []\n",
"\n",
"        for repo_url in repo_urls:\n",
"            try:\n",
"                print(f\"Processing repository: {repo_url}\")\n",
"                repo_samples = self._process_single_repo(repo_url, max_files_per_repo)\n",
"                all_code_samples.extend(repo_samples)\n",
"                print(f\"Extracted {len(repo_samples)} samples from {repo_url}\")\n",
"            except Exception as e:\n",
"                print(f\"Failed to process repository {repo_url}: {str(e)}\")\n",
"                continue\n",
"\n",
"        if not all_code_samples:\n",
"            raise ValueError(\"No code samples extracted from any repository\")\n",
"\n",
"        print(f\"Total samples collected: {len(all_code_samples)}\")\n",
"\n",
"        # Create HuggingFace dataset\n",
"        from datasets import Dataset\n",
"        dataset = Dataset.from_list(all_code_samples)\n",
"        return dataset\n",
"\n",
"    def _process_single_repo(self, repo_url, max_files_per_repo):\n",
"        \"\"\"Process a single GitHub repository\"\"\"\n",
"        import tempfile\n",
"\n",
"        with tempfile.TemporaryDirectory() as temp_dir:\n",
"            try:\n",
"                # Clone repository\n",
"                repo_name = repo_url.split('/')[-1].replace('.git', '')\n",
"                repo_path = f\"{temp_dir}/{repo_name}\"\n",
"\n",
"                print(f\"Cloning {repo_url}...\")\n",
"                repo = git.Repo.clone_from(repo_url, repo_path, depth=1)\n",
"\n",
"                # Extract code samples\n",
"                code_samples = self._extract_code_samples(repo_path, max_files_per_repo)\n",
"\n",
"                return code_samples\n",
"\n",
"            finally:\n",
"                print(f\"Finished processing {repo_url}\")\n",
"\n",
"    def _extract_code_samples(self, repo_path, max_files_per_repo):\n",
"        \"\"\"Extract code samples from a repository\"\"\"\n",
"        code_samples = []\n",
"        repo_path_obj = Path(repo_path)\n",
"\n",
"        # Find all code files\n",
"        code_files = []\n",
"        for ext in self.CODE_EXTENSIONS:\n",
"            code_files.extend(repo_path_obj.rglob(f'*{ext}'))\n",
"\n",
"        print(f\"Found {len(code_files)} code files\")\n",
"\n",
"        # Limit files per repo to prevent memory issues\n",
"        code_files = code_files[:max_files_per_repo]\n",
"\n",
"        for code_file in code_files:\n",
"            try:\n",
"                if self._should_exclude_file(str(code_file.relative_to(repo_path))):\n",
"                    continue\n",
"\n",
"                sample = self._process_code_file(code_file, repo_path_obj)\n",
"                if sample:\n",
"                    code_samples.append(sample)\n",
"\n",
"            except Exception as e:\n",
"                print(f\"Failed to process {code_file}: {str(e)}\")\n",
"                continue\n",
"\n",
"        return code_samples\n",
"\n",
"    def _should_exclude_file(self, relative_path):\n",
"        \"\"\"Check if a file should be excluded based on patterns\"\"\"\n",
"        import re\n",
"        exclude_patterns = [\n",
"            r'\\.git/', r'__pycache__/', r'node_modules/',\n",
"            r'\\.venv/', r'venv/', r'package-lock\\.json$',\n",
"            r'\\.log$', r'\\.tmp$', r'~\\$.*', r'\\.swp$',\n",
"            r'\\.DS_Store', r'\\.pyc$'\n",
"        ]\n",
"        for pattern in exclude_patterns:\n",
"            if re.search(pattern, relative_path):\n",
"                return True\n",
"        return False\n",
"\n",
"    def _process_code_file(self, file_path, repo_path):\n",
"        \"\"\"Process a single code file into a training sample\"\"\"\n",
"        try:\n",
"            # Read file content\n",
"            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
"                content = f.read()\n",
"\n",
"            # Skip if file is too small or too large\n",
"            if len(content.strip()) < 10:\n",
"                return None\n",
"            if len(content) > 100000:  # Rough limit\n",
"                return None\n",
"\n",
"            # Get relative path for context\n",
"            relative_path = file_path.relative_to(repo_path)\n",
"\n",
"            # Determine language\n",
"            extension = file_path.suffix.lower()\n",
"            language = self.CODE_EXTENSIONS.get(extension, 'unknown')\n",
"\n",
"            # Create training sample\n",
"            sample = {\n",
"                'text': content,\n",
"                'language': language,\n",
"                'file_path': str(relative_path),\n",
"                'repo_name': repo_path.name,\n",
"                'file_size': len(content),\n",
"                'line_count': len(content.splitlines())\n",
"            }\n",
"\n",
"            return sample\n",
"\n",
"        except Exception as e:\n",
"            print(f\"Error processing {file_path}: {str(e)}\")\n",
"            return None\n",
"\n",
"# Example usage:\n",
"# processor = AdvancedDatasetProcessor()\n",
"# dataset = processor.process_github_repos([\n",
"#     \"https://github.com/karpathy/nanoGPT.git\"\n",
"# ])"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -310,4 +474,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
78
example_dataset_processing.py
Normal file
@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""
Example script demonstrating how to get and process a dataset from GitHub repositories
using the DatasetProcessor class.
"""

import sys
from pathlib import Path

# Add src to path to import our modules
sys.path.append(str(Path(__file__).parent))

from src.dataset_processor import DatasetProcessor
from src.config import AppConfig, ModelConfig, TrainingConfig, DatasetConfig, MemoryConfig


def main():
    # Initialize configuration
    config = AppConfig(
        model=ModelConfig(
            name="unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit",
            max_seq_length=2048
        ),
        training=TrainingConfig(),
        dataset=DatasetConfig(),
        memory=MemoryConfig()
    )

    # Initialize dataset processor
    processor = DatasetProcessor()

    # Example GitHub repositories to process
    # Replace these with your own repositories
    repo_urls = [
        "https://github.com/karpathy/nanoGPT.git",
        # "https://github.com/your-username/your-repo.git"
    ]

    try:
        print("Processing GitHub repositories...")
        dataset = processor.process_github_repos(
            repo_urls=repo_urls,
            config=config,
            github_token=None  # Add your token here if processing private repositories
        )

        print("Dataset processed successfully!")
        print(f"Dataset size: {len(dataset)} samples")

        # Show some examples from the dataset
        print("\nFirst 3 samples from the dataset:")
        for i in range(min(3, len(dataset))):
            sample = dataset[i]
            print(f"\nSample {i+1}:")
            print(f"  Repository: {sample['repo_name']}")
            print(f"  File path: {sample['file_path']}")
            print(f"  Language: {sample['language']}")
            print(f"  File size: {sample['file_size']} characters")
            print(f"  Lines: {sample['line_count']}")
            # Show first 200 characters of the text
            preview_text = sample['text'][:200] + "..." if len(sample['text']) > 200 else sample['text']
            print(f"  Text preview: {preview_text}")

        # Save dataset to disk (optional)
        # dataset.save_to_disk("./processed_dataset")
        # print("\nDataset saved to ./processed_dataset")

        return dataset

    except Exception as e:
        print(f"Error processing repositories: {e}")
        import traceback
        traceback.print_exc()
        return None


if __name__ == "__main__":
    dataset = main()