Source code for llm_etl_pipeline.extraction.public.converters.pdfconverters

"""
The `PdfConverter` class provides a robust
interface for converting PDF documents to text.

It encapsulates the `docling` library's `DocumentConverter` to offer
configurable options for OCR, table structure detection, and cell matching
during the conversion process. This class ensures consistent PDF processing
within the LLM ETL pipeline, with integrated logging for clarity and error handling.
"""

import logging
from pathlib import Path
from typing import Any, Union

from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from pydantic import BaseModel, Field, PrivateAttr, validate_call

from llm_etl_pipeline.customized_logger import logger
from llm_etl_pipeline.extraction.internal import _SpecificWarningFilter

# Import-time side effect: fetch the logger registered under the name
# "docling_core.types.doc.document" and attach a filter to it so that one
# specific deprecation warning is ignored. Nothing else about that logger
# (level, handlers) is modified here.
docling_specific_logger = logging.getLogger("docling_core.types.doc.document")
# Text of the warning to ignore.
# NOTE(review): how `_SpecificWarningFilter` compares a record against this
# text (exact match vs. substring) is defined in
# `llm_etl_pipeline.extraction.internal` and is not visible here -- confirm
# before editing the wording of this constant.
SPECIFIC_MESSAGE_TO_IGNORE = (
    "Parameter `strict_text` has been deprecated and will be ignored."
)
# Build the filter for that message and register it on the logger fetched
# above; the filter is added to this one logger object only.
my_filter = _SpecificWarningFilter(SPECIFIC_MESSAGE_TO_IGNORE)
docling_specific_logger.addFilter(my_filter)


class PdfConverter(BaseModel):
    """
    A specialized class for the conversion of PDF documents to text.

    Configuration options are managed as Pydantic fields, and a `docling`
    `DocumentConverter` instance is built from them once, at construction
    time, and kept as a private attribute. The class offers a streamlined
    interface for converting PDF documents into text with configurable OCR,
    table structure detection, and cell matching.

    For the moment, the attributes are frozen.

    Attributes:
        do_ocr (bool): Indicates whether Optical Character Recognition (OCR)
            should be performed on the PDF document. Defaults to `False`.
            This field is `frozen=True`.
        do_table_structure (bool): Indicates whether to detect table
            structures within the PDF. Defaults to `True`.
            This field is `frozen=True`.
        do_cell_matching (bool): Indicates whether to perform cell matching
            for detected tables. Defaults to `False`.
            This field is `frozen=True`.
    """

    # Public configuration options. Each one is declared as a strict,
    # frozen boolean field.
    do_ocr: bool = Field(
        default=False,
        description="Indicates whether to perform OCR on the PDF document.",
        strict=True,
        frozen=True,
    )
    do_table_structure: bool = Field(
        default=True,
        description="Indicates whether to detect table structures.",
        strict=True,
        frozen=True,
    )
    do_cell_matching: bool = Field(
        default=False,
        description="Indicates whether to perform cell matching for tables.",
        strict=True,
        frozen=True,
    )

    # Holds the configured docling converter; it is assigned by
    # `_configure_document_converter`, not supplied by the caller.
    _doc_converter: DocumentConverter = PrivateAttr()

    def __init__(self, **data: Any):
        """
        Validate the configuration fields, then build the internal converter.

        Args:
            **data: Keyword arguments forwarded unchanged to the `BaseModel`
                constructor (the field values of this model).
        """
        # The fields must be validated and stored first, because the
        # converter setup below reads them from `self`.
        super().__init__(**data)
        self._configure_document_converter()

    def _configure_document_converter(self) -> None:
        """
        Build the internal `DocumentConverter` from this model's fields.

        A `PdfPipelineOptions` object is created and filled with the
        `do_ocr`, `do_table_structure` and `do_cell_matching` values of this
        instance. The resulting options are registered for the PDF input
        format and the new converter is stored in `_doc_converter`.
        """
        options = PdfPipelineOptions()
        options.do_ocr = self.do_ocr
        options.do_table_structure = self.do_table_structure
        # Cell matching is set on the nested table-structure options object,
        # not on the pipeline options themselves.
        options.table_structure_options.do_cell_matching = self.do_cell_matching

        pdf_format_option = PdfFormatOption(pipeline_options=options)
        self._doc_converter = DocumentConverter(
            format_options={InputFormat.PDF: pdf_format_option}
        )

    @validate_call
    def convert_to_text(self, input_pdf_path: Union[Path, str, DocumentStream]) -> str:
        """
        Convert a PDF document to plain text with the internal converter.

        The input is passed to the `convert` method of the pre-configured
        `DocumentConverter`, and the resulting document is exported to text.

        Args:
            input_pdf_path (Union[Path, str, DocumentStream]): The path to
                the input PDF document (as a `pathlib.Path` object or a
                string), or a `DocumentStream` object representing the PDF
                content.

        Returns:
            str: The text exported from the converted document.

        Raises:
            Exception: Any exception raised while converting or exporting is
                logged as an error and then re-raised unchanged.
        """
        logger.info(f"Attempting to convert PDF to text from input: {input_pdf_path}")
        try:
            # Convert and export in one chain; the success message is logged
            # only once the text has actually been produced.
            extracted_text = self._doc_converter.convert(
                input_pdf_path
            ).document.export_to_text()
            logger.success("PDF successfully converted to text.")
            return extracted_text
        except Exception as exc:
            logger.error(
                f"Error during PDF conversion for input '{input_pdf_path}': {exc}"
            )
            raise