survey_custom_certificate_t.../services/certificate_template_parser.py

# -*- coding: utf-8 -*-

import re
import logging
import hashlib
from io import BytesIO
from typing import List, Tuple, Optional

try:
    from docx import Document
    from docx.opc.exceptions import PackageNotFoundError
except ImportError:
    Document = None
    PackageNotFoundError = Exception

# Import zipfile.BadZipFile for handling corrupted files
from zipfile import BadZipFile

_logger = logging.getLogger(__name__)


class CertificateTemplateParser:
    """
    Service class for parsing DOCX certificate templates and extracting placeholders.

    Placeholders have the form ``{key.field_name}`` (see ``PLACEHOLDER_PATTERN``).
    The parser looks for them in the document body, in tables (including
    nested tables) and in the header/footer of every section.

    Caching:
    - Parsed results are memoised in a class-level dict shared by all
      instances, keyed by the SHA-256 digest of the template bytes.
    - The cache holds at most ``_placeholder_cache_max_size`` entries and
      evicts the least recently used one when full.
    - Lists are copied on the way in and on the way out, so callers may
      freely mutate what ``parse_template`` returns.
    - The cache is a plain dict with no locking around it.
    """

    # Regex pattern for matching placeholders: {key.field_name}
    PLACEHOLDER_PATTERN = r'\{key\.[a-zA-Z0-9_]+\}'

    # Class-level LRU cache of parsed placeholders. Relies on dict insertion
    # order: the first key is the least recently used one.
    _placeholder_cache = {}
    _placeholder_cache_max_size = 100

    def __init__(self):
        """
        Initialize the template parser.

        Raises:
            ImportError: If python-docx could not be imported.
        """
        if Document is None:
            raise ImportError(
                "python-docx library is required. "
                "Install it with: pip install python-docx"
            )

    def get_placeholder_pattern(self) -> str:
        """
        Return the regex pattern used for placeholder matching.

        Returns:
            str: Regex pattern string for matching placeholders
        """
        return self.PLACEHOLDER_PATTERN

    @classmethod
    def _get_cache_key(cls, docx_binary: bytes) -> str:
        """
        Generate a cache key for a template based on its content.

        Args:
            docx_binary: Binary content of the template

        Returns:
            str: SHA256 hex digest of the template content
        """
        return hashlib.sha256(docx_binary).hexdigest()

    @classmethod
    def _get_cached_placeholders(cls, cache_key: str) -> Optional[List[str]]:
        """
        Retrieve cached placeholders if available.

        A hit marks the entry as most recently used.

        Args:
            cache_key: Cache key for the template

        Returns:
            A copy of the cached placeholder list (possibly empty), or None
            if the key is not cached.
        """
        cache = cls._placeholder_cache
        cached = cache.get(cache_key)
        if cached is None:
            return None

        # Re-insert the entry so it moves to the "most recent" end of the dict.
        cache[cache_key] = cache.pop(cache_key)
        _logger.debug('Placeholder cache hit for key: %s...', cache_key[:16])
        # Hand out a copy: the stored list is shared by every caller.
        return list(cached)

    @classmethod
    def _cache_placeholders(cls, cache_key: str, placeholders: List[str]) -> None:
        """
        Cache parsed placeholders.

        Evicts least recently used entries until there is room for the new
        one. Nothing is stored when the configured maximum size is zero or
        negative.

        Args:
            cache_key: Cache key for the template
            placeholders: List of parsed placeholders (stored as a copy)
        """
        max_size = cls._placeholder_cache_max_size
        if max_size <= 0:
            return

        cache = cls._placeholder_cache
        # Drop a stale entry for the same key first, so refreshing a known
        # template never pushes out an unrelated one.
        cache.pop(cache_key, None)

        while len(cache) >= max_size:
            oldest_key = next(iter(cache))
            del cache[oldest_key]
            _logger.debug(
                'Evicted oldest placeholder cache entry: %s...', oldest_key[:16]
            )

        cache[cache_key] = list(placeholders)
        _logger.debug(
            'Cached %d placeholders with key: %s... (cache size: %d)',
            len(placeholders), cache_key[:16], len(cache)
        )

    @classmethod
    def clear_cache(cls):
        """
        Clear the placeholder cache.

        This can be called to free memory or when templates are updated.
        """
        cache_size = len(cls._placeholder_cache)
        cls._placeholder_cache.clear()
        _logger.info('Cleared placeholder cache (%d entries removed)', cache_size)

    @staticmethod
    def _input_error(docx_binary) -> str:
        """
        Run the cheap sanity checks that do not require opening the file.

        Args:
            docx_binary: Candidate template content

        Returns:
            str: Empty string if the input looks usable, otherwise the error
            message describing what is wrong with it.
        """
        if not docx_binary:
            return "Template file is empty"
        if not isinstance(docx_binary, bytes):
            return "Template must be provided as binary data"
        return ""

    def _open_document(self, docx_binary: bytes):
        """
        Open DOCX bytes with python-docx, logging any failure.

        Args:
            docx_binary: Non-empty binary content of the DOCX file

        Returns:
            tuple: (document, error_message). ``document`` is None and
            ``error_message`` is non-empty when the file cannot be opened.
        """
        try:
            return Document(BytesIO(docx_binary)), ""

        except (PackageNotFoundError, BadZipFile):
            error_msg = "The uploaded file is not a valid DOCX file or is corrupted"
            _logger.warning("Template validation failed: %s", error_msg)
            return None, error_msg

        except Exception as e:
            error_msg = f"Unable to read template structure: {str(e)}"
            _logger.error("Template validation error: %s", error_msg, exc_info=True)
            return None, error_msg

    def validate_template(self, docx_binary: bytes) -> Tuple[bool, str]:
        """
        Validate that the provided binary data is a valid DOCX file.

        Args:
            docx_binary: Binary content of the DOCX file

        Returns:
            Tuple[bool, str]: (is_valid, error_message)
                - is_valid: True if template is valid, False otherwise
                - error_message: Empty string if valid, error description if invalid
        """
        error_msg = self._input_error(docx_binary)
        if error_msg:
            return False, error_msg

        document, error_msg = self._open_document(docx_binary)
        return document is not None, error_msg

    def parse_template(self, docx_binary: bytes, use_cache: bool = True) -> List[str]:
        """
        Extract all placeholders from a DOCX template.

        Scans body paragraphs, tables (including nested tables) and the
        header and footer of every section for text matching the placeholder
        pattern {key.field_name}. The document is opened only once per
        uncached call.

        Args:
            docx_binary: Binary content of the DOCX file
            use_cache: Whether to use caching (default: True)

        Returns:
            List[str]: Sorted list of unique placeholder strings found in the
            template. The list is owned by the caller.

        Raises:
            ValueError: If the template is empty, not bytes, invalid or corrupted
        """
        # Reject unusable input before hashing it: hashlib raises TypeError
        # on None/str, which would bypass the documented ValueError contract.
        error_msg = self._input_error(docx_binary)
        if error_msg:
            raise ValueError(error_msg)

        cache_key = None
        if use_cache:
            cache_key = self._get_cache_key(docx_binary)
            cached_placeholders = self._get_cached_placeholders(cache_key)
            if cached_placeholders is not None:
                return cached_placeholders

        document, error_msg = self._open_document(docx_binary)
        if document is None:
            raise ValueError(error_msg)

        try:
            # Everything that can hold paragraphs: the body first, then the
            # header and footer of each section.
            containers = [document]
            for section in document.sections:
                containers.append(section.header)
                containers.append(section.footer)

            placeholders = set()
            for container in containers:
                for paragraph in self._iter_paragraphs(container):
                    placeholders.update(
                        self._extract_placeholders_from_text(paragraph.text)
                    )

            # Return sorted list for consistency
            result = sorted(placeholders)

            if use_cache:
                self._cache_placeholders(cache_key, result)

            return result

        except Exception as e:
            error_msg = f"Error parsing template: {str(e)}"
            _logger.error(error_msg, exc_info=True)
            raise ValueError(error_msg) from e

    def _iter_paragraphs(self, container):
        """
        Yield every paragraph held by a python-docx block container.

        Works for any object exposing ``paragraphs`` and ``tables`` (the
        document, a header/footer, or a table cell) and recurses into table
        cells so nested tables are covered as well.

        Args:
            container: Object with ``paragraphs`` and ``tables`` attributes

        Yields:
            Paragraph objects, direct paragraphs first, then table contents.
        """
        yield from container.paragraphs
        for table in container.tables:
            for row in table.rows:
                for cell in row.cells:
                    yield from self._iter_paragraphs(cell)

    def _extract_placeholders_from_text(self, text: str) -> set:
        """
        Extract placeholders from a text string using regex.

        Args:
            text: Text string to search for placeholders

        Returns:
            set: Set of placeholder strings found in the text (empty for
            empty or None input)
        """
        if not text:
            return set()

        return set(re.findall(self.PLACEHOLDER_PATTERN, text))