adding dataset processor in notebook file

Suherdy Yacob 2025-08-23 07:04:45 +07:00
parent 81c8524809
commit 43d6f0e98a
3 changed files with 356 additions and 1 deletion

@@ -0,0 +1,113 @@
# Dataset Processing from GitHub Repositories
This guide explains how to get and process datasets from GitHub repositories using the provided tools.
## Prerequisites
Make sure you have installed the required dependencies:
```bash
pip install -r requirements.txt
```
## Using the DatasetProcessor Class
The `DatasetProcessor` class in `src/dataset_processor.py` provides comprehensive functionality for processing GitHub repositories into training datasets.
### Example Usage
```python
from src.dataset_processor import DatasetProcessor
from src.config import AppConfig, ModelConfig, TrainingConfig, DatasetConfig, MemoryConfig

# Initialize configuration
config = AppConfig(
    model=ModelConfig(),
    training=TrainingConfig(),
    dataset=DatasetConfig(),
    memory=MemoryConfig()
)

# Initialize dataset processor
processor = DatasetProcessor()

# Process GitHub repositories
repo_urls = [
    "https://github.com/karpathy/nanoGPT.git",
    # Add more repository URLs as needed
]

dataset = processor.process_github_repos(
    repo_urls=repo_urls,
    config=config,
    github_token=None  # Add your token for private repositories
)

print(f"Dataset processed successfully with {len(dataset)} samples")
```
## Using the Example Script
You can run the example script directly:
```bash
python example_dataset_processing.py
```
This will process the example repository and show information about the processed dataset.
## Using in Google Colab
The `ai_trainer_t4_colab.ipynb` notebook includes sections for processing GitHub repositories:
1. Simple repository processing (Section 5)
2. Advanced dataset processing (Section 5.1)
## Supported File Types
The `DatasetProcessor` supports the following file types (a short extension-lookup sketch follows the list):
- Python (.py)
- JavaScript (.js)
- TypeScript (.ts)
- Java (.java)
- C++ (.cpp, .hpp)
- C (.c, .h)
- C# (.cs)
- PHP (.php)
- Ruby (.rb)
- Go (.go)
- Rust (.rs)
- Swift (.swift)
- Kotlin (.kt)
- Scala (.scala)
- SQL (.sql)
- Bash (.sh)
- YAML (.yaml, .yml)
- JSON (.json)
- XML (.xml)
- HTML (.html)
- CSS (.css)
- Markdown (.md)
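
In the notebook's advanced processor (and presumably in `DatasetProcessor` as well), language detection is simply a lookup of the lowercased file extension in an extension-to-language mapping. The sketch below illustrates the idea; the `detect_language` helper and the reduced mapping are for illustration only, not part of the library:

```python
from pathlib import Path

# Hypothetical subset of the extension-to-language mapping; the full set
# corresponds to the file types listed above.
CODE_EXTENSIONS = {
    ".py": "python", ".js": "javascript", ".ts": "typescript",
    ".md": "markdown", ".yaml": "yaml", ".yml": "yaml",
}

def detect_language(path: str) -> str:
    """Return the language label for a file, or 'unknown' if unsupported."""
    return CODE_EXTENSIONS.get(Path(path).suffix.lower(), "unknown")

print(detect_language("src/train.py"))    # -> python
print(detect_language("docs/notes.rst"))  # -> unknown
```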
## Configuration
The dataset processing can be configured through the `DatasetConfig` class:
```python
dataset_config = DatasetConfig(
    min_file_size=10,            # Minimum file size in characters
    max_file_size=10000,         # Maximum file size in characters
    supported_languages=[...],   # List of supported programming languages
    exclude_patterns=[...]       # Patterns to exclude
)
```
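
As a rough illustration of how exclusion works, the notebook's `AdvancedDatasetProcessor` matches each file's repository-relative path against a list of regular expressions; the defaults behind `DatasetConfig.exclude_patterns` may differ, so treat the patterns and the `should_exclude` helper below as an example, not the exact implementation in `src/dataset_processor.py`:

```python
import re

# Example exclusion patterns (taken from the notebook's advanced processor);
# the defaults in src/dataset_processor.py may differ.
exclude_patterns = [
    r"\.git/", r"__pycache__/", r"node_modules/",
    r"\.venv/", r"venv/", r"package-lock\.json$",
    r"\.log$", r"\.tmp$", r"~\$.*", r"\.swp$",
    r"\.DS_Store", r"\.pyc$",
]

def should_exclude(relative_path: str) -> bool:
    """Return True if the relative path matches any exclusion pattern."""
    return any(re.search(pattern, relative_path) for pattern in exclude_patterns)

print(should_exclude("node_modules/lodash/index.js"))  # True
print(should_exclude("src/dataset_processor.py"))      # False
```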
## Output Format
The processed dataset contains the following fields for each sample (a brief inspection sketch follows the list):
- `text`: The content of the code file
- `language`: The programming language detected
- `file_path`: Relative path to the file within the repository
- `repo_name`: Name of the repository
- `file_size`: Size of the file in characters
- `line_count`: Number of lines in the file
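
The processed dataset behaves like a Hugging Face `datasets.Dataset` (the notebook builds one with `Dataset.from_list`, and the example script calls `save_to_disk`), so these fields can be inspected and filtered with the standard `datasets` API. A minimal sketch, assuming a processed `dataset` as in the example above:

```python
# Keep only Python samples and inspect a few of them.
python_only = dataset.filter(lambda sample: sample["language"] == "python")
print(f"{len(python_only)} Python samples out of {len(dataset)} total")

for sample in python_only.select(range(min(3, len(python_only)))):
    print(sample["repo_name"], sample["file_path"], sample["line_count"])
```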

ai_trainer_t4_colab.ipynb

@@ -189,6 +189,170 @@
"# dataset = process_github_repo(\"https://github.com/your-username/your-repo.git\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5.1 Advanced Dataset Processing\n",
"\n",
"For more comprehensive dataset processing with support for multiple file types, you can use this advanced processor:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class AdvancedDatasetProcessor:\n",
" \"\"\"Advanced processor for GitHub repositories with comprehensive file support\"\"\"\n",
" \n",
" # Supported file extensions\n",
" CODE_EXTENSIONS = {\n",
" '.py': 'python', '.js': 'javascript', '.ts': 'typescript',\n",
" '.java': 'java', '.cpp': 'cpp', '.c': 'c', '.cs': 'csharp',\n",
" '.php': 'php', '.rb': 'ruby', '.go': 'go', '.rs': 'rust',\n",
" '.swift': 'swift', '.kt': 'kotlin', '.scala': 'scala',\n",
" '.sql': 'sql', '.sh': 'bash', '.yaml': 'yaml', '.yml': 'yaml',\n",
" '.json': 'json', '.xml': 'xml', '.html': 'html', '.css': 'css',\n",
" '.md': 'markdown'\n",
" }\n",
" \n",
" def __init__(self):\n",
" pass\n",
" \n",
" def process_github_repos(self, repo_urls, max_files_per_repo=50):\n",
" \"\"\"Process multiple GitHub repositories into a training dataset\"\"\"\n",
" all_code_samples = []\n",
" \n",
" for repo_url in repo_urls:\n",
" try:\n",
" print(f\"Processing repository: {repo_url}\")\n",
" repo_samples = self._process_single_repo(repo_url, max_files_per_repo)\n",
" all_code_samples.extend(repo_samples)\n",
" print(f\"Extracted {len(repo_samples)} samples from {repo_url}\")\n",
" except Exception as e:\n",
" print(f\"Failed to process repository {repo_url}: {str(e)}\")\n",
" continue\n",
" \n",
" if not all_code_samples:\n",
" raise ValueError(\"No code samples extracted from any repository\")\n",
" \n",
" print(f\"Total samples collected: {len(all_code_samples)}\")\n",
" \n",
" # Create HuggingFace dataset\n",
" from datasets import Dataset\n",
" dataset = Dataset.from_list(all_code_samples)\n",
" return dataset\n",
" \n",
" def _process_single_repo(self, repo_url, max_files_per_repo):\n",
" \"\"\"Process a single GitHub repository\"\"\"\n",
" import tempfile\n",
" \n",
" with tempfile.TemporaryDirectory() as temp_dir:\n",
" try:\n",
" # Clone repository\n",
" repo_name = repo_url.split('/')[-1].replace('.git', '')\n",
" repo_path = f\"{temp_dir}/{repo_name}\"\n",
" \n",
" print(f\"Cloning {repo_url}...\")\n",
" repo = git.Repo.clone_from(repo_url, repo_path, depth=1)\n",
" \n",
" # Extract code samples\n",
" code_samples = self._extract_code_samples(repo_path, max_files_per_repo)\n",
" \n",
" return code_samples\n",
" \n",
" finally:\n",
" print(f\"Finished processing {repo_url}\")\n",
" \n",
" def _extract_code_samples(self, repo_path, max_files_per_repo):\n",
" \"\"\"Extract code samples from a repository\"\"\"\n",
" code_samples = []\n",
" repo_path_obj = Path(repo_path)\n",
" \n",
" # Find all code files\n",
" code_files = []\n",
" for ext in self.CODE_EXTENSIONS:\n",
" code_files.extend(repo_path_obj.rglob(f'*{ext}'))\n",
" \n",
" print(f\"Found {len(code_files)} code files\")\n",
" \n",
" # Limit files per repo to prevent memory issues\n",
" code_files = code_files[:max_files_per_repo]\n",
" \n",
" for code_file in code_files:\n",
" try:\n",
" if self._should_exclude_file(str(code_file.relative_to(repo_path))):\n",
" continue\n",
" \n",
" sample = self._process_code_file(code_file, repo_path_obj)\n",
" if sample:\n",
" code_samples.append(sample)\n",
" \n",
" except Exception as e:\n",
" print(f\"Failed to process {code_file}: {str(e)}\")\n",
" continue\n",
" \n",
" return code_samples\n",
" \n",
" def _should_exclude_file(self, relative_path):\n",
" \"\"\"Check if a file should be excluded based on patterns\"\"\"\n",
" import re\n",
" exclude_patterns = [\n",
" r'\\.git/', r'__pycache__/', r'node_modules/',\n",
" r'\\.venv/', r'venv/', r'package-lock\\.json$',\n",
" r'\\.log$', r'\\.tmp$', r'~\\$.*', r'\\.swp$',\n",
" r'\\.DS_Store', r'\\.pyc$'\n",
" ]\n",
" for pattern in exclude_patterns:\n",
" if re.search(pattern, relative_path):\n",
" return True\n",
" return False\n",
" \n",
" def _process_code_file(self, file_path, repo_path):\n",
" \"\"\"Process a single code file into a training sample\"\"\"\n",
" try:\n",
" # Read file content\n",
" with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:\n",
" content = f.read()\n",
" \n",
" # Skip if file is too small or too large\n",
" if len(content.strip()) < 10:\n",
" return None\n",
" if len(content) > 100000: # Rough limit\n",
" return None\n",
" \n",
" # Get relative path for context\n",
" relative_path = file_path.relative_to(repo_path)\n",
" \n",
" # Determine language\n",
" extension = file_path.suffix.lower()\n",
" language = self.CODE_EXTENSIONS.get(extension, 'unknown')\n",
" \n",
" # Create training sample\n",
" sample = {\n",
" 'text': content,\n",
" 'language': language,\n",
" 'file_path': str(relative_path),\n",
" 'repo_name': repo_path.name,\n",
" 'file_size': len(content),\n",
" 'line_count': len(content.splitlines())\n",
" }\n",
" \n",
" return sample\n",
" \n",
" except Exception as e:\n",
" print(f\"Error processing {file_path}: {str(e)}\")\n",
" return None\n",
"\n",
"# Example usage:\n",
"# processor = AdvancedDatasetProcessor()\n",
"# dataset = processor.process_github_repos([\n",
"# \"https://github.com/karpathy/nanoGPT.git\"\n",
"# ])"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -310,4 +474,4 @@
},
"nbformat": 4,
"nbformat_minor": 4
}
}

example_dataset_processing.py

@@ -0,0 +1,78 @@
#!/usr/bin/env python3
"""
Example script demonstrating how to get and process a dataset from GitHub repositories
using the DatasetProcessor class.
"""
import sys
from pathlib import Path
# Add src to path to import our modules
sys.path.append(str(Path(__file__).parent))
from src.dataset_processor import DatasetProcessor
from src.config import AppConfig, ModelConfig, TrainingConfig, DatasetConfig, MemoryConfig


def main():
    # Initialize configuration
    config = AppConfig(
        model=ModelConfig(
            name="unsloth/Qwen2.5-Coder-7B-Instruct-bnb-4bit",
            max_seq_length=2048
        ),
        training=TrainingConfig(),
        dataset=DatasetConfig(),
        memory=MemoryConfig()
    )

    # Initialize dataset processor
    processor = DatasetProcessor()

    # Example GitHub repositories to process
    # Replace these with your own repositories
    repo_urls = [
        "https://github.com/karpathy/nanoGPT.git",
        # "https://github.com/your-username/your-repo.git"
    ]

    try:
        print("Processing GitHub repositories...")
        dataset = processor.process_github_repos(
            repo_urls=repo_urls,
            config=config,
            github_token=None  # Add your token here if processing private repositories
        )

        print("Dataset processed successfully!")
        print(f"Dataset size: {len(dataset)} samples")

        # Show some examples from the dataset
        print("\nFirst 3 samples from the dataset:")
        for i in range(min(3, len(dataset))):
            sample = dataset[i]
            print(f"\nSample {i+1}:")
            print(f"  Repository: {sample['repo_name']}")
            print(f"  File path: {sample['file_path']}")
            print(f"  Language: {sample['language']}")
            print(f"  File size: {sample['file_size']} characters")
            print(f"  Lines: {sample['line_count']}")

            # Show first 200 characters of the text
            preview_text = sample['text'][:200] + "..." if len(sample['text']) > 200 else sample['text']
            print(f"  Text preview: {preview_text}")

        # Save dataset to disk (optional)
        # dataset.save_to_disk("./processed_dataset")
        # print("\nDataset saved to ./processed_dataset")

        return dataset

    except Exception as e:
        print(f"Error processing repositories: {e}")
        import traceback
        traceback.print_exc()
        return None


if __name__ == "__main__":
    dataset = main()