# Listing metadata: 253 lines, 8.7 KiB, Python
# -*- coding: utf-8 -*-

# Standard library.
import hashlib
import logging
import re
from io import BytesIO
from typing import List, Optional, Tuple
# BadZipFile is caught alongside PackageNotFoundError when handling
# corrupted template files.
from zipfile import BadZipFile

# python-docx is imported defensively so that this module can still be
# imported when the library is missing. ``Document`` is set to ``None`` in
# that case, and ``PackageNotFoundError`` falls back to ``Exception`` so that
# ``except`` clauses naming it remain valid.
try:
    from docx import Document
    from docx.opc.exceptions import PackageNotFoundError
except ImportError:
    Document = None
    PackageNotFoundError = Exception

_logger = logging.getLogger(__name__)
|
|
|
|
|
|
class CertificateTemplateParser:
    """
    Service class for parsing DOCX certificate templates and extracting placeholders.

    This parser identifies placeholders in the format {key.field_name} within DOCX
    documents and validates template structure.

    Placeholders are collected from body paragraphs, from tables (including
    tables nested inside table cells), and from the header and footer of every
    section.

    Performance Optimizations:
    - Parsed placeholder lists are cached at class level, keyed by the SHA-256
      of the template bytes, with least-recently-used eviction
    - Regex-based placeholder matching on each paragraph's text
    """

    # Regex pattern for matching placeholders: {key.field_name}
    PLACEHOLDER_PATTERN = r'\{key\.[a-zA-Z0-9_]+\}'

    # Class-level LRU cache of parsed placeholders, shared by all instances.
    # A plain dict is used: insertion order doubles as recency order, so the
    # first key is always the least recently used one. No locking is performed
    # around cache access.
    _placeholder_cache = {}
    # Maximum number of templates kept in the cache; a value <= 0 disables
    # storing new entries.
    _placeholder_cache_max_size = 100

    def __init__(self):
        """
        Initialize the template parser.

        Raises:
            ImportError: If the python-docx library could not be imported
        """
        if Document is None:
            raise ImportError(
                "python-docx library is required. "
                "Install it with: pip install python-docx"
            )

    def get_placeholder_pattern(self) -> str:
        """
        Return the regex pattern used for placeholder matching.

        Returns:
            str: Regex pattern string for matching placeholders
        """
        return self.PLACEHOLDER_PATTERN

    @classmethod
    def _get_cache_key(cls, docx_binary: bytes) -> str:
        """
        Generate a cache key for a template based on its content.

        Args:
            docx_binary: Binary content of the template

        Returns:
            str: SHA256 hash of the template content
        """
        return hashlib.sha256(docx_binary).hexdigest()

    @classmethod
    def _get_cached_placeholders(cls, cache_key: str) -> Optional[List[str]]:
        """
        Retrieve cached placeholders if available.

        A hit marks the entry as most recently used.

        Args:
            cache_key: Cache key for the template

        Returns:
            List[str]: A copy of the cached placeholder list, or None if not cached
        """
        cache = cls._placeholder_cache
        cached = cache.get(cache_key)
        if cached is None:
            return None

        # Re-insert the entry so it moves to the end of the dict, i.e. becomes
        # the most recently used one.
        cache[cache_key] = cache.pop(cache_key)
        _logger.debug('Placeholder cache hit for key: %s...', cache_key[:16])
        # Hand out a copy so callers cannot mutate the cached list.
        return list(cached)

    @classmethod
    def _cache_placeholders(cls, cache_key: str, placeholders: List[str]):
        """
        Cache parsed placeholders.

        Implements LRU eviction when cache is full. Re-caching a key that is
        already present replaces its value without evicting another entry.

        Args:
            cache_key: Cache key for the template
            placeholders: List of parsed placeholders
        """
        cache = cls._placeholder_cache
        max_size = cls._placeholder_cache_max_size
        if max_size <= 0:
            # Nothing can be stored; also avoids evicting from an empty cache.
            return

        # Drop any existing entry first so it does not count towards the size
        # limit and the new value ends up in the most-recent position.
        cache.pop(cache_key, None)

        # A loop (rather than a single eviction) also copes with the maximum
        # size having been lowered after entries were stored.
        while len(cache) >= max_size:
            oldest_key = next(iter(cache))
            del cache[oldest_key]
            _logger.debug(
                'Evicted oldest placeholder cache entry: %s...', oldest_key[:16]
            )

        # Store a copy so later mutation of the caller's list cannot leak in.
        cache[cache_key] = list(placeholders)
        _logger.debug(
            'Cached %d placeholders with key: %s... (cache size: %d)',
            len(placeholders), cache_key[:16], len(cache),
        )

    @classmethod
    def clear_cache(cls):
        """
        Clear the placeholder cache.

        This can be called to free memory or when templates are updated.
        """
        cache_size = len(cls._placeholder_cache)
        cls._placeholder_cache.clear()
        _logger.info('Cleared placeholder cache (%d entries removed)', cache_size)

    def validate_template(self, docx_binary: bytes) -> Tuple[bool, str]:
        """
        Validate that the provided binary data is a valid DOCX file.

        Args:
            docx_binary: Binary content of the DOCX file

        Returns:
            Tuple[bool, str]: (is_valid, error_message)
                - is_valid: True if template is valid, False otherwise
                - error_message: Empty string if valid, error description if invalid
        """
        if not docx_binary:
            return False, "Template file is empty"

        if not isinstance(docx_binary, bytes):
            return False, "Template must be provided as binary data"

        try:
            # Attempt to open the document; success is the validity criterion.
            Document(BytesIO(docx_binary))
        except (PackageNotFoundError, BadZipFile):
            error_msg = "The uploaded file is not a valid DOCX file or is corrupted"
            _logger.warning("Template validation failed: %s", error_msg)
            return False, error_msg
        except Exception as e:
            error_msg = f"Unable to read template structure: {e}"
            _logger.error("Template validation error: %s", error_msg, exc_info=True)
            return False, error_msg

        return True, ""

    def parse_template(self, docx_binary: bytes, use_cache: bool = True) -> List[str]:
        """
        Extract all placeholders from a DOCX template.

        This method scans body paragraphs, tables (including nested tables) and
        the header and footer of every section to find text matching the
        placeholder pattern {key.field_name}.

        Performance optimization: Results are cached based on template content hash.

        Args:
            docx_binary: Binary content of the DOCX file
            use_cache: Whether to use caching (default: True)

        Returns:
            List[str]: Sorted list of unique placeholder strings found in the
                template. The list is a fresh object the caller may modify.

        Raises:
            ValueError: If the template is invalid or corrupted
        """
        # Only hash input that can actually be a template. Hashing None or a
        # str would raise TypeError before validation gets the chance to
        # report the problem as the documented ValueError.
        cache_key = None
        if use_cache and docx_binary and isinstance(docx_binary, bytes):
            cache_key = self._get_cache_key(docx_binary)
            cached_placeholders = self._get_cached_placeholders(cache_key)
            if cached_placeholders is not None:
                return cached_placeholders

        # NOTE: validate_template opens the document once and it is opened
        # again below. The call is kept so that subclasses overriding
        # validate_template are still honoured.
        is_valid, error_msg = self.validate_template(docx_binary)
        if not is_valid:
            raise ValueError(error_msg)

        try:
            document = Document(BytesIO(docx_binary))

            placeholders = set()

            # Body paragraphs and tables.
            self._collect_placeholders(document, placeholders)

            # Headers and footers of every section.
            for section in document.sections:
                self._collect_placeholders(section.header, placeholders)
                self._collect_placeholders(section.footer, placeholders)

            # Sorted for a deterministic result.
            result = sorted(placeholders)
        except Exception as e:
            error_msg = f"Error parsing template: {e}"
            _logger.error(error_msg, exc_info=True)
            raise ValueError(error_msg) from e

        if cache_key is not None:
            self._cache_placeholders(cache_key, result)

        return result

    def _collect_placeholders(self, container, found: set) -> None:
        """
        Add every placeholder found inside a block container to ``found``.

        Args:
            container: Object exposing ``paragraphs`` and ``tables`` attributes
                (the document body, a header, a footer or a table cell)
            found: Set of placeholder strings, updated in place
        """
        for paragraph in container.paragraphs:
            found.update(self._extract_placeholders_from_text(paragraph.text))

        for table in container.tables:
            for row in table.rows:
                for cell in row.cells:
                    # Cells are containers themselves; recursing picks up
                    # tables nested inside a cell as well.
                    self._collect_placeholders(cell, found)

    def _extract_placeholders_from_text(self, text: str) -> set:
        """
        Extract placeholders from a text string using regex.

        Args:
            text: Text string to search for placeholders

        Returns:
            set: Set of placeholder strings found in the text
        """
        if not text:
            return set()

        return set(re.findall(self.PLACEHOLDER_PATTERN, text))
|