# survey_custom_certificate_t.../services/certificate_template_parser.py
# 2025-11-29 08:46:04 +07:00
#
# 253 lines
# 8.7 KiB
# Python

# -*- coding: utf-8 -*-
import re
import logging
import hashlib
from io import BytesIO
from typing import List, Tuple, Optional
try:
    # python-docx is imported defensively so that this module can still be
    # imported when the library is not installed.
    from docx import Document
    from docx.opc.exceptions import PackageNotFoundError
except ImportError:
    # Fallback values: CertificateTemplateParser.__init__ checks for
    # ``Document is None`` and raises ImportError with an install hint.
    Document = None
    PackageNotFoundError = Exception
# zipfile.BadZipFile is caught together with PackageNotFoundError when a
# template is opened, to report corrupted (non-ZIP) uploads.
from zipfile import BadZipFile

# Module-level logger shared by everything in this file.
_logger = logging.getLogger(__name__)
class CertificateTemplateParser:
    """
    Service class for parsing DOCX certificate templates and extracting placeholders.

    This parser identifies placeholders in the format {key.field_name} within DOCX
    documents and validates template structure.

    Performance Optimizations:
    - Caches parsed placeholder results (bounded LRU keyed by content hash)
    - Efficient regex pattern matching
    - Each template is opened once per parse and traversed in a single pass
    """

    # Regex pattern for matching placeholders: {key.field_name}
    PLACEHOLDER_PATTERN = r'\{key\.[a-zA-Z0-9_]+\}'

    # Class-level cache for parsed placeholders (LRU cache with max 100 templates).
    # A plain dict is used; its insertion order doubles as the recency order
    # (oldest entry first, most recently used entry last).
    _placeholder_cache = {}
    _placeholder_cache_max_size = 100

    def __init__(self):
        """
        Initialize the template parser.

        Raises:
            ImportError: If the python-docx library is not installed
        """
        if Document is None:
            raise ImportError(
                "python-docx library is required. "
                "Install it with: pip install python-docx"
            )

    def get_placeholder_pattern(self) -> str:
        """
        Return the regex pattern used for placeholder matching.

        Returns:
            str: Regex pattern string for matching placeholders
        """
        return self.PLACEHOLDER_PATTERN

    @classmethod
    def _get_cache_key(cls, docx_binary: bytes) -> str:
        """
        Generate a cache key for a template based on its content.

        Args:
            docx_binary: Binary content of the template

        Returns:
            str: SHA256 hash of the template content
        """
        return hashlib.sha256(docx_binary).hexdigest()

    @classmethod
    def _get_cached_placeholders(cls, cache_key: str) -> Optional[List[str]]:
        """
        Retrieve cached placeholders if available.

        A hit marks the entry as most recently used. The returned list is a
        copy, so callers may modify it without affecting the cache.

        Args:
            cache_key: Cache key for the template

        Returns:
            List[str]: Cached placeholder list, or None if not cached
        """
        cache = cls._placeholder_cache
        cached = cache.pop(cache_key, None)
        if cached is None:
            return None
        # Re-insert so the entry moves to the "most recently used" end.
        cache[cache_key] = cached
        _logger.debug('Placeholder cache hit for key: %s...', cache_key[:16])
        return list(cached)

    @classmethod
    def _cache_placeholders(cls, cache_key: str, placeholders: List[str]):
        """
        Cache parsed placeholders.

        Implements LRU eviction when cache is full. Re-caching an existing key
        only refreshes that entry; it never evicts another one.

        Args:
            cache_key: Cache key for the template
            placeholders: List of parsed placeholders
        """
        max_size = cls._placeholder_cache_max_size
        if max_size < 1:
            # Caching is effectively disabled; there is nothing to evict or store.
            return
        cache = cls._placeholder_cache
        # Drop any previous entry for this key first so it is not counted
        # against the size limit below.
        cache.pop(cache_key, None)
        while cache and len(cache) >= max_size:
            # The first key in iteration order is the least recently used one.
            oldest_key = next(iter(cache))
            del cache[oldest_key]
            _logger.debug(
                'Evicted oldest placeholder cache entry: %s...', oldest_key[:16]
            )
        # Store a copy so later changes to the caller's list cannot leak in.
        cache[cache_key] = list(placeholders)
        _logger.debug(
            'Cached %d placeholders with key: %s... (cache size: %d)',
            len(placeholders), cache_key[:16], len(cache),
        )

    @classmethod
    def clear_cache(cls):
        """
        Clear the placeholder cache.

        This can be called to free memory or when templates are updated.
        """
        cache_size = len(cls._placeholder_cache)
        cls._placeholder_cache.clear()
        _logger.info('Cleared placeholder cache (%d entries removed)', cache_size)

    @staticmethod
    def _check_binary(docx_binary) -> str:
        """
        Run the cheap sanity checks that do not require opening the document.

        Args:
            docx_binary: Candidate binary content of the DOCX file

        Returns:
            str: Empty string if the input looks usable, error description otherwise
        """
        if not docx_binary:
            return "Template file is empty"
        if not isinstance(docx_binary, bytes):
            return "Template must be provided as binary data"
        return ""

    def _load_document(self, docx_binary) -> Tuple[Optional[object], str]:
        """
        Open the binary as a DOCX document, reporting failures as a message.

        Args:
            docx_binary: Binary content of the DOCX file

        Returns:
            Tuple[Optional[object], str]: (document, error_message)
                - document: The opened document, or None if it could not be opened
                - error_message: Empty string on success, error description otherwise
        """
        error_msg = self._check_binary(docx_binary)
        if error_msg:
            return None, error_msg
        try:
            return Document(BytesIO(docx_binary)), ""
        except (PackageNotFoundError, BadZipFile):
            error_msg = "The uploaded file is not a valid DOCX file or is corrupted"
            _logger.warning("Template validation failed: %s", error_msg)
            return None, error_msg
        except Exception as e:
            error_msg = f"Unable to read template structure: {e}"
            _logger.error(
                "Template validation error: %s", error_msg, exc_info=True
            )
            return None, error_msg

    @classmethod
    def _iter_paragraphs(cls, container):
        """
        Yield every paragraph of a block container, including table content.

        Tables are descended into recursively, so paragraphs inside nested
        tables are yielded as well. Merged cells may be visited more than once;
        callers are expected to de-duplicate the extracted values.

        Args:
            container: Object exposing ``paragraphs`` and ``tables``
                (document body, header, footer or table cell)
        """
        yield from container.paragraphs
        for table in container.tables:
            for row in table.rows:
                for cell in row.cells:
                    yield from cls._iter_paragraphs(cell)

    def validate_template(self, docx_binary: bytes) -> Tuple[bool, str]:
        """
        Validate that the provided binary data is a valid DOCX file.

        Args:
            docx_binary: Binary content of the DOCX file

        Returns:
            Tuple[bool, str]: (is_valid, error_message)
                - is_valid: True if template is valid, False otherwise
                - error_message: Empty string if valid, error description if invalid
        """
        document, error_msg = self._load_document(docx_binary)
        return document is not None, error_msg

    def parse_template(self, docx_binary: bytes, use_cache: bool = True) -> List[str]:
        """
        Extract all placeholders from a DOCX template.

        This method scans the paragraphs and tables (including nested tables)
        of the document body and of every section header and footer to find
        text matching the placeholder pattern {key.field_name}.

        Performance optimization: Results are cached based on template content hash.

        Args:
            docx_binary: Binary content of the DOCX file
            use_cache: Whether to use caching (default: True)

        Returns:
            List[str]: Sorted list of unique placeholder strings found in the template

        Raises:
            ValueError: If the template is invalid or corrupted
        """
        # Reject unusable input before hashing it: hashing None or str would
        # raise TypeError instead of the documented ValueError.
        input_error = self._check_binary(docx_binary)
        if input_error:
            raise ValueError(input_error)

        cache_key = None
        if use_cache:
            cache_key = self._get_cache_key(docx_binary)
            cached_placeholders = self._get_cached_placeholders(cache_key)
            if cached_placeholders is not None:
                return cached_placeholders

        # Opening the document is the validation step; it is done only once.
        document, error_msg = self._load_document(docx_binary)
        if document is None:
            raise ValueError(error_msg)

        try:
            # Body first, then the header and footer of every section.
            containers = [document]
            for section in document.sections:
                containers.append(section.header)
                containers.append(section.footer)

            placeholders = set()
            for container in containers:
                for paragraph in self._iter_paragraphs(container):
                    placeholders.update(
                        self._extract_placeholders_from_text(paragraph.text)
                    )
        except Exception as e:
            error_msg = f"Error parsing template: {e}"
            _logger.error(error_msg, exc_info=True)
            raise ValueError(error_msg) from e

        # Return sorted list for consistency
        result = sorted(placeholders)
        if use_cache:
            self._cache_placeholders(cache_key, result)
        return result

    def _extract_placeholders_from_text(self, text: str) -> set:
        """
        Extract placeholders from a text string using regex.

        Args:
            text: Text string to search for placeholders

        Returns:
            set: Set of placeholder strings found in the text
        """
        if not text:
            return set()
        return set(re.findall(self.PLACEHOLDER_PATTERN, text))