# -*- coding: utf-8 -*-
"""Parsing of DOCX certificate templates and extraction of their placeholders."""
import hashlib
import logging
import re
from io import BytesIO
from typing import List, Optional, Tuple
# BadZipFile is caught alongside PackageNotFoundError to report corrupted files
from zipfile import BadZipFile

try:
    from docx import Document
    from docx.opc.exceptions import PackageNotFoundError
except ImportError:
    # python-docx is optional at import time; __init__ reports its absence.
    Document = None
    PackageNotFoundError = Exception

_logger = logging.getLogger(__name__)


class CertificateTemplateParser:
    """
    Service class for parsing DOCX certificate templates and extracting placeholders.

    This parser identifies placeholders in the format {key.field_name} within
    DOCX documents and validates template structure.

    Performance Optimizations:
    - Caches parsed placeholder results (keyed by SHA256 of the template bytes)
    - The document is opened once per parse
    - Single-pass document traversal
    """

    # Regex pattern for matching placeholders: {key.field_name}
    PLACEHOLDER_PATTERN = r'\{key\.[a-zA-Z0-9_]+\}'

    # Class-level cache for parsed placeholders (LRU cache with max 100 templates).
    # A plain dict is used; its insertion order doubles as the recency order
    # (first key = least recently used).
    _placeholder_cache = {}
    _placeholder_cache_max_size = 100

    def __init__(self):
        """Initialize the template parser.

        Raises:
            ImportError: If the python-docx library could not be imported.
        """
        if Document is None:
            raise ImportError(
                "python-docx library is required. "
                "Install it with: pip install python-docx"
            )

    def get_placeholder_pattern(self) -> str:
        """
        Return the regex pattern used for placeholder matching.

        Returns:
            str: Regex pattern string for matching placeholders
        """
        return self.PLACEHOLDER_PATTERN

    @classmethod
    def _get_cache_key(cls, docx_binary: bytes) -> str:
        """
        Generate a cache key for a template based on its content.

        Args:
            docx_binary: Binary content of the template

        Returns:
            str: SHA256 hex digest of the template content
        """
        return hashlib.sha256(docx_binary).hexdigest()

    @classmethod
    def _get_cached_placeholders(cls, cache_key: str) -> Optional[List[str]]:
        """
        Retrieve cached placeholders if available.

        A hit marks the entry as most recently used. A copy of the stored list
        is returned so that callers mutating the result cannot alter the cache.

        Args:
            cache_key: Cache key for the template

        Returns:
            List[str]: Cached placeholder list (possibly empty), or None if not cached
        """
        cache = cls._placeholder_cache
        cached = cache.pop(cache_key, None)
        if cached is None:
            return None
        # Re-insert so the key moves to the end of the dict (most recent).
        cache[cache_key] = cached
        _logger.debug('Placeholder cache hit for key: %s...', cache_key[:16])
        return list(cached)

    @classmethod
    def _cache_placeholders(cls, cache_key: str, placeholders: List[str]):
        """
        Cache parsed placeholders.

        Implements LRU eviction when the cache is full. Re-caching a key that
        is already present refreshes it without evicting another entry.

        Args:
            cache_key: Cache key for the template
            placeholders: List of parsed placeholders
        """
        cache = cls._placeholder_cache
        # Drop any previous entry first so it neither counts towards the size
        # limit nor keeps its old position in the recency order.
        cache.pop(cache_key, None)

        while cache and len(cache) >= cls._placeholder_cache_max_size:
            # The first key in iteration order is the least recently used.
            oldest_key = next(iter(cache))
            del cache[oldest_key]
            _logger.debug(
                'Evicted oldest placeholder cache entry: %s...', oldest_key[:16]
            )

        # Store a private copy; the caller keeps ownership of its own list.
        cache[cache_key] = list(placeholders)
        _logger.debug(
            'Cached %d placeholders with key: %s... (cache size: %d)',
            len(placeholders), cache_key[:16], len(cache),
        )

    @classmethod
    def clear_cache(cls):
        """
        Clear the placeholder cache.

        This can be called to free memory or when templates are updated.
        """
        cache_size = len(cls._placeholder_cache)
        cls._placeholder_cache.clear()
        _logger.info('Cleared placeholder cache (%d entries removed)', cache_size)

    @staticmethod
    def _precheck_binary(docx_binary) -> str:
        """
        Run the cheap checks that do not require opening the document.

        Args:
            docx_binary: Candidate template content

        Returns:
            str: Empty string if the checks pass, otherwise the error description
        """
        if not docx_binary:
            return "Template file is empty"
        if not isinstance(docx_binary, bytes):
            return "Template must be provided as binary data"
        return ""

    @staticmethod
    def _open_document(docx_binary):
        """
        Open the binary content as a python-docx document.

        Args:
            docx_binary: Binary content of the DOCX file

        Returns:
            The object returned by ``Document(...)``

        Raises:
            ValueError: If the content is empty, is not ``bytes``, or cannot be
                opened as a DOCX document. The message is the user-facing
                error description.
        """
        error_msg = CertificateTemplateParser._precheck_binary(docx_binary)
        if error_msg:
            raise ValueError(error_msg)

        try:
            return Document(BytesIO(docx_binary))
        except (PackageNotFoundError, BadZipFile) as exc:
            error_msg = "The uploaded file is not a valid DOCX file or is corrupted"
            _logger.warning("Template validation failed: %s", error_msg)
            raise ValueError(error_msg) from exc
        except Exception as exc:
            error_msg = f"Unable to read template structure: {exc}"
            _logger.error(
                "Template validation error: %s", error_msg, exc_info=True
            )
            raise ValueError(error_msg) from exc

    def validate_template(self, docx_binary: bytes) -> Tuple[bool, str]:
        """
        Validate that the provided binary data is a valid DOCX file.

        Args:
            docx_binary: Binary content of the DOCX file

        Returns:
            Tuple[bool, str]: (is_valid, error_message)
                - is_valid: True if template is valid, False otherwise
                - error_message: Empty string if valid, error description if invalid
        """
        try:
            self._open_document(docx_binary)
        except ValueError as exc:
            return False, str(exc)
        return True, ""

    def parse_template(self, docx_binary: bytes, use_cache: bool = True) -> List[str]:
        """
        Extract all placeholders from a DOCX template.

        This method scans the paragraphs and tables (including tables nested
        inside table cells) of the document body and of every section's header
        and footer for text matching the placeholder pattern {key.field_name}.

        Performance optimization: Results are cached based on template content hash.

        Args:
            docx_binary: Binary content of the DOCX file
            use_cache: Whether to use caching (default: True)

        Returns:
            List[str]: Sorted list of unique placeholder strings found in the template

        Raises:
            ValueError: If the template is invalid or corrupted
        """
        # Reject empty / non-bytes input before hashing, so bad input always
        # surfaces as ValueError rather than a TypeError from hashlib.
        error_msg = self._precheck_binary(docx_binary)
        if error_msg:
            raise ValueError(error_msg)

        cache_key = None
        if use_cache:
            cache_key = self._get_cache_key(docx_binary)
            cached_placeholders = self._get_cached_placeholders(cache_key)
            if cached_placeholders is not None:
                return cached_placeholders

        # Opening the document is also the validation step.
        document = self._open_document(docx_binary)

        try:
            placeholders = set()
            self._collect_placeholders(document, placeholders)
            for section in document.sections:
                self._collect_placeholders(section.header, placeholders)
                self._collect_placeholders(section.footer, placeholders)
        except Exception as exc:
            error_msg = f"Error parsing template: {exc}"
            _logger.error(error_msg, exc_info=True)
            raise ValueError(error_msg) from exc

        # Return sorted list for consistency
        result = sorted(placeholders)

        if use_cache:
            self._cache_placeholders(cache_key, result)

        return result

    def _collect_placeholders(self, container, found: set) -> None:
        """
        Add every placeholder found in a block container to ``found``.

        Args:
            container: Object exposing ``paragraphs`` and ``tables`` (the
                document, a header/footer, or a table cell)
            found: Set that receives the placeholder strings
        """
        for paragraph in container.paragraphs:
            found.update(self._extract_placeholders_from_text(paragraph.text))

        for table in container.tables:
            for row in table.rows:
                for cell in row.cells:
                    # A cell is itself a container, so nested tables are
                    # handled by the recursion.
                    self._collect_placeholders(cell, found)

    def _extract_placeholders_from_text(self, text: str) -> set:
        """
        Extract placeholders from a text string using regex.

        Args:
            text: Text string to search for placeholders

        Returns:
            set: Set of placeholder strings found in the text (empty for
                empty or None text)
        """
        if not text:
            return set()
        return set(re.findall(self.PLACEHOLDER_PATTERN, text))