diff --git a/README_DATASET_PROCESSING.md b/README_DATASET_PROCESSING.md
new file mode 100644
index 0000000..03c38b0
--- /dev/null
+++ b/README_DATASET_PROCESSING.md
@@ -0,0 +1,113 @@
+# Dataset Processing from GitHub Repositories
+
+This guide explains how to obtain and process datasets from GitHub repositories using the provided tools.
+
+## Prerequisites
+
+Make sure you have installed the required dependencies:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Using the DatasetProcessor Class
+
+The `DatasetProcessor` class in `src/dataset_processor.py` provides comprehensive functionality for processing GitHub repositories into training datasets.
+
+### Example Usage
+
+```python
+from src.dataset_processor import DatasetProcessor
+from src.config import AppConfig, ModelConfig, TrainingConfig, DatasetConfig, MemoryConfig
+
+# Initialize configuration
+config = AppConfig(
+    model=ModelConfig(),
+    training=TrainingConfig(),
+    dataset=DatasetConfig(),
+    memory=MemoryConfig()
+)
+
+# Initialize dataset processor
+processor = DatasetProcessor()
+
+# Process GitHub repositories
+repo_urls = [
+    "https://github.com/karpathy/nanoGPT.git",
+    # Add more repository URLs as needed
+]
+
+dataset = processor.process_github_repos(
+    repo_urls=repo_urls,
+    config=config,
+    github_token=None  # Add your token for private repositories
+)
+
+print(f"Dataset processed successfully with {len(dataset)} samples")
+```
+
+## Using the Example Script
+
+You can run the example script directly:
+
+```bash
+python example_dataset_processing.py
+```
+
+This will process the example repository and print information about the processed dataset.
+
+## Using in Google Colab
+
+The `ai_trainer_t4_colab.ipynb` notebook includes sections for processing GitHub repositories:
+
+1. Simple repository processing (Section 5)
+2. Advanced dataset processing (Section 5.1)
+
+## Supported File Types
+
+The DatasetProcessor supports the following file types:
+- Python (.py)
+- JavaScript (.js)
+- TypeScript (.ts)
+- Java (.java)
+- C++ (.cpp, .hpp)
+- C (.c, .h)
+- C# (.cs)
+- PHP (.php)
+- Ruby (.rb)
+- Go (.go)
+- Rust (.rs)
+- Swift (.swift)
+- Kotlin (.kt)
+- Scala (.scala)
+- SQL (.sql)
+- Bash (.sh)
+- YAML (.yaml, .yml)
+- JSON (.json)
+- XML (.xml)
+- HTML (.html)
+- CSS (.css)
+- Markdown (.md)
+
+## Configuration
+
+Dataset processing can be configured through the `DatasetConfig` class:
+
+```python
+dataset_config = DatasetConfig(
+    min_file_size=10,            # Minimum file size in characters
+    max_file_size=10000,         # Maximum file size in characters
+    supported_languages=[...],   # List of supported programming languages
+    exclude_patterns=[...]       # Patterns to exclude
+)
+```
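+
+For example, to skip near-empty files and cap very large ones, you can pass a customised `DatasetConfig` in through `AppConfig`. The sketch below is illustrative only: it reuses `processor` and `repo_urls` from the example above, sets just the two size fields, and assumes every other `DatasetConfig` field keeps its default from `src/config.py`:
+
+```python
+custom_config = AppConfig(
+    model=ModelConfig(),
+    training=TrainingConfig(),
+    dataset=DatasetConfig(
+        min_file_size=50,      # skip near-empty files such as bare __init__.py
+        max_file_size=20000    # drop very large generated or vendored files
+    ),
+    memory=MemoryConfig()
+)
+
+dataset = processor.process_github_repos(
+    repo_urls=repo_urls,
+    config=custom_config,
+    github_token=None
+)
+```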
+
+## Output Format
+
+The processed dataset contains the following fields for each sample:
+- `text`: The content of the code file
+- `language`: The programming language detected
+- `file_path`: Relative path to the file within the repository
+- `repo_name`: Name of the repository
+- `file_size`: Size of the file in characters
+- `line_count`: Number of lines in the file
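+
+## Filtering the Processed Dataset (Optional)
+
+The processor returns a Hugging Face `Dataset` (which is why `dataset.save_to_disk(...)` works in the example script), so the standard `datasets` API applies. As an illustrative sketch that is not part of the processor itself, you could keep only Python samples and hold out a validation split:
+
+```python
+# Keep only Python files that have at least a few lines of content
+python_only = dataset.filter(
+    lambda sample: sample["language"] == "python" and sample["line_count"] >= 5
+)
+
+# Reserve 10% of the samples for evaluation
+splits = python_only.train_test_split(test_size=0.1, seed=42)
+train_dataset, eval_dataset = splits["train"], splits["test"]
+```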
\ No newline at end of file
diff --git a/ai_trainer_t4_colab.ipynb b/ai_trainer_t4_colab.ipynb
index 838f56b..2190c05 100644
--- a/ai_trainer_t4_colab.ipynb
+++ b/ai_trainer_t4_colab.ipynb
@@ -189,6 +189,170 @@
     "# dataset = process_github_repo(\"https://github.com/your-username/your-repo.git\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5.1 Advanced Dataset Processing\n",
+    "\n",
+    "For more comprehensive dataset processing with support for multiple file types, you can use this advanced processor:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import git  # may already be imported in Section 5; re-imported so this cell runs on its own\n",
+    "from pathlib import Path\n",
+    "\n",
+    "class AdvancedDatasetProcessor:\n",
+    "    \"\"\"Advanced processor for GitHub repositories with comprehensive file support\"\"\"\n",
+    "\n",
+    "    # Supported file extensions\n",
+    "    CODE_EXTENSIONS = {\n",
+    "        '.py': 'python', '.js': 'javascript', '.ts': 'typescript',\n",
+    "        '.java': 'java', '.cpp': 'cpp', '.c': 'c', '.cs': 'csharp',\n",
+    "        '.php': 'php', '.rb': 'ruby', '.go': 'go', '.rs': 'rust',\n",
+    "        '.swift': 'swift', '.kt': 'kotlin', '.scala': 'scala',\n",
+    "        '.sql': 'sql', '.sh': 'bash', '.yaml': 'yaml', '.yml': 'yaml',\n",
+    "        '.json': 'json', '.xml': 'xml', '.html': 'html', '.css': 'css',\n",
+    "        '.md': 'markdown'\n",
+    "    }\n",
+    "\n",
+    "    def __init__(self):\n",
+    "        pass\n",
+    "\n",
+    "    def process_github_repos(self, repo_urls, max_files_per_repo=50):\n",
+    "        \"\"\"Process multiple GitHub repositories into a training dataset\"\"\"\n",
+    "        all_code_samples = []\n",
+    "\n",
+    "        for repo_url in repo_urls:\n",
+    "            try:\n",
+    "                print(f\"Processing repository: {repo_url}\")\n",
+    "                repo_samples = self._process_single_repo(repo_url, max_files_per_repo)\n",
+    "                all_code_samples.extend(repo_samples)\n",
+    "                print(f\"Extracted {len(repo_samples)} samples from {repo_url}\")\n",
+    "            except Exception as e:\n",
+    "                print(f\"Failed to process repository {repo_url}: {str(e)}\")\n",
+    "                continue\n",
+    "\n",
+    "        if not all_code_samples:\n",
+    "            raise ValueError(\"No code samples extracted from any repository\")\n",
+    "\n",
+    "        print(f\"Total samples collected: {len(all_code_samples)}\")\n",
+    "\n",
+    "        # Create HuggingFace dataset\n",
+    "        from datasets import Dataset\n",
+    "        dataset = Dataset.from_list(all_code_samples)\n",
+    "        return dataset\n",
+    "\n",
+    "    def _process_single_repo(self, repo_url, max_files_per_repo):\n",
+    "        \"\"\"Process a single GitHub repository\"\"\"\n",
+    "        import tempfile\n",
+    "\n",
+    "        with tempfile.TemporaryDirectory() as temp_dir:\n",
+    "            try:\n",
+    "                # Clone repository\n",
+    "                repo_name = repo_url.split('/')[-1].replace('.git', '')\n",
+    "                repo_path = f\"{temp_dir}/{repo_name}\"\n",
+    "\n",
+    "                print(f\"Cloning {repo_url}...\")\n",
+    "                repo = git.Repo.clone_from(repo_url, repo_path, depth=1)\n",
+    "\n",
+    "                # Extract code samples\n",
+    "                code_samples = self._extract_code_samples(repo_path, max_files_per_repo)\n",
+    "\n",
+    "                return code_samples\n",
+    "\n",
+    "            finally:\n",
+    "                print(f\"Finished processing {repo_url}\")\n",
+    "\n",
+    "    def _extract_code_samples(self, repo_path, max_files_per_repo):\n",
+    "        \"\"\"Extract code samples from a repository\"\"\"\n",
+    "        code_samples = []\n",
+    "        repo_path_obj = Path(repo_path)\n",
+    "\n",
+    "        # Find all code files\n",
+    "        code_files = []\n",
+    "        for ext in self.CODE_EXTENSIONS:\n",
+    "            code_files.extend(repo_path_obj.rglob(f'*{ext}'))\n",
+    "\n",
+    "        print(f\"Found {len(code_files)} code files\")\n",
+    "\n",
+    "        # Limit files per repo to prevent memory issues\n",
+    "        code_files = code_files[:max_files_per_repo]\n",
+    "\n",
+    "        for code_file in code_files:\n",
+    "            try:\n",
+    "                if self._should_exclude_file(str(code_file.relative_to(repo_path))):\n",
+    "                    continue\n",
+    "\n",
+    "                sample = self._process_code_file(code_file, repo_path_obj)\n",
+    "                if sample:\n",
+    "                    code_samples.append(sample)\n",
+    "\n",
+    "            except Exception as e:\n",
+    "                print(f\"Failed to process {code_file}: {str(e)}\")\n",
+    "                continue\n",
+    "\n",
+    "        return code_samples\n",
+    "\n",
+    "    def _should_exclude_file(self, relative_path):\n",
+    "        \"\"\"Check if a file should be excluded based on patterns\"\"\"\n",
+    "        import re\n",
+    "        exclude_patterns = [\n",
+    "            r'\\\\.git/', r'__pycache__/', r'node_modules/',\n",
+    "            r'\\\\.venv/', r'venv/', r'package-lock\\\\.json$',\n",
+    "            r'\\\\.log$', r'\\\\.tmp$', r'~\\\\$.*', r'\\\\.swp$',\n",
+    "            r'\\\\.DS_Store', r'\\\\.pyc$'\n",
+    "        ]\n",
+    "        for pattern in exclude_patterns:\n",
+    "            if re.search(pattern, relative_path):\n",
+    "                return True\n",
+    "        return False\n",
+    "\n",
+    "    def _process_code_file(self, file_path, repo_path):\n",
+    "        \"\"\"Process a single code file into a training sample\"\"\"\n",
+    "        try:\n",
+    "            # Read file content\n",
+    "            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
+    "                content = f.read()\n",
+    "\n",
+    "            # Skip if file is too small or too large\n",
+    "            if len(content.strip()) < 10:\n",
+    "                return None\n",
+    "            if len(content) > 100000:  # Rough limit\n",
+    "                return None\n",
+    "\n",
+    "            # Get relative path for context\n",
+    "            relative_path = file_path.relative_to(repo_path)\n",
+    "\n",
+    "            # Determine language\n",
+    "            extension = file_path.suffix.lower()\n",
+    "            language = self.CODE_EXTENSIONS.get(extension, 'unknown')\n",
+    "\n",
+    "            # Create training sample\n",
+    "            sample = {\n",
+    "                'text': content,\n",
+    "                'language': language,\n",
+    "                'file_path': str(relative_path),\n",
+    "                'repo_name': repo_path.name,\n",
+    "                'file_size': len(content),\n",
+    "                'line_count': len(content.splitlines())\n",
+    "            }\n",
+    "\n",
+    "            return sample\n",
+    "\n",
+    "        except Exception as e:\n",
+    "            print(f\"Error processing {file_path}: {str(e)}\")\n",
+    "            return None\n",
+    "\n",
+    "# Example usage:\n",
+    "# processor = AdvancedDatasetProcessor()\n",
+    "# dataset = processor.process_github_repos([\n",
+    "#     \"https://github.com/karpathy/nanoGPT.git\"\n",
+    "# ])"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -310,4 +474,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 4
-}
+}
\ No newline at end of file
diff --git a/example_dataset_processing.py b/example_dataset_processing.py
new file mode 100644
index 0000000..dbb4acf
--- /dev/null
+++ b/example_dataset_processing.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+"""
+Example script demonstrating how to obtain and process a dataset from GitHub repositories
+using the DatasetProcessor class.
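+
+Usage:
+    python example_dataset_processing.py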
+"""
+
+import sys
+from pathlib import Path
+
+# Add the project root (this file's directory) to sys.path so the src package is importable
+sys.path.append(str(Path(__file__).parent))
+
+from src.dataset_processor import DatasetProcessor
+from src.config import AppConfig, ModelConfig, TrainingConfig, DatasetConfig, MemoryConfig
+
+
+def main():
+    # Initialize configuration
+    config = AppConfig(
+        model=ModelConfig(
+            name="unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit",
+            max_seq_length=2048
+        ),
+        training=TrainingConfig(),
+        dataset=DatasetConfig(),
+        memory=MemoryConfig()
+    )
+
+    # Initialize dataset processor
+    processor = DatasetProcessor()
+
+    # Example GitHub repositories to process
+    # Replace these with your own repositories
+    repo_urls = [
+        "https://github.com/karpathy/nanoGPT.git",
+        # "https://github.com/your-username/your-repo.git"
+    ]
+
+    try:
+        print("Processing GitHub repositories...")
+        dataset = processor.process_github_repos(
+            repo_urls=repo_urls,
+            config=config,
+            github_token=None  # Add your token here if processing private repositories
+        )
+
+        print("Dataset processed successfully!")
+        print(f"Dataset size: {len(dataset)} samples")
+
+        # Show some examples from the dataset
+        print("\nFirst 3 samples from the dataset:")
+        for i in range(min(3, len(dataset))):
+            sample = dataset[i]
+            print(f"\nSample {i+1}:")
+            print(f"  Repository: {sample['repo_name']}")
+            print(f"  File path: {sample['file_path']}")
+            print(f"  Language: {sample['language']}")
+            print(f"  File size: {sample['file_size']} characters")
+            print(f"  Lines: {sample['line_count']}")
+            # Show the first 200 characters of the text
+            preview_text = (sample['text'][:200] + "...") if len(sample['text']) > 200 else sample['text']
+            print(f"  Text preview: {preview_text}")
+
+        # Save dataset to disk (optional)
+        # dataset.save_to_disk("./processed_dataset")
+        # print("\nDataset saved to ./processed_dataset")
+
+        return dataset
+
+    except Exception as e:
+        print(f"Error processing repositories: {e}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+
+if __name__ == "__main__":
+    dataset = main()
\ No newline at end of file