ocrconverter

`to_plaintext(path, language)`

Convert a PDF containing ocr'd text to an iterable containing text paragraphs.

Pick up the tiff images created by to_tiff and use pytesseract to extract text from them.

Parameters:

Name	Type	Description	Default
`path`	`Path`	The path to the PDF file.	required
`language`	`str`	The language of the text in the PDF file.	required

Source code in corpustools/ocrconverter.py

def to_plaintext(path: Path, language: str) -> Iterable[str]:
    """Convert a PDF containing ocr'd text to an iterable containing text paragraphs.

    Pick up the tiff images created by to_tiff and use pytesseract to extract text from
    them. The images are processed in page order and are removed from /tmp when
    the iteration ends, whether it completes, fails or is abandoned early.

    Args:
        path (Path): The path to the PDF file.
        language (str): The language of the text in the PDF file.

    Yields:
        The text recognised in each image, split into paragraphs on double
        newlines.

    Raises:
        ConversionError: If to_tiff fails to extract the images.
    """
    to_tiff(path)

    # Path.glob makes no promise about ordering, so sort explicitly to keep the
    # pages in sequence. Sorting on (length, name) keeps zero-padded counters in
    # numeric order even when the counter grows an extra digit.
    image_files = sorted(
        Path("/tmp").glob(f"{path.stem}-*.tif"),
        key=lambda image_file: (len(image_file.name), image_file.name),
    )

    try:
        for image_file in image_files:
            # Close the image handle as soon as the text has been extracted.
            with Image.open(image_file) as image:
                page_text = pytesseract.image_to_string(image, lang=language)
            yield from page_text.split("\n\n")
    finally:
        # Always clean up, otherwise stale images would be picked up by a later
        # run on a PDF with the same stem.
        for image_file in image_files:
            if image_file.exists():
                image_file.unlink()

`to_tiff(path)`

Convert a PDF to a series of tiff images.

Parameters:

Name	Type	Description	Default
`path`	`Path`	The path to the PDF file.	required

Raises:

Type	Description
`ConversionError`	If the conversion fails.

Source code in corpustools/ocrconverter.py

def to_tiff(path: Path) -> None:
    """Convert a PDF to a series of tiff images.

    Runs pdfimages with /tmp as working directory, using the stem of the PDF
    as the image root name. On failure the captured stdout and stderr are
    written to a log file next to the PDF (its path with ".log" appended).

    Args:
        path (Path): The path to the PDF file.

    Raises:
        ConversionError: If the conversion fails.
    """
    # Build the argument list directly: splitting a formatted string on
    # whitespace would tear apart paths that contain spaces.
    # The command runs with cwd="/tmp", so hand pdfimages an absolute path;
    # a relative one would otherwise be looked up inside /tmp.
    command = ["pdfimages", "-tiff", str(path.absolute()), path.stem]

    runner = ExternalCommandRunner()
    runner.run(command, cwd="/tmp")

    if runner.returncode != 0:
        log_name = str(path) + ".log"
        with open(log_name, "w") as logfile:
            print(f"stdout\n{runner.stdout}\n", file=logfile)
            print(f"stderr\n{runner.stderr}\n", file=logfile)

        # Raise only after the log file has been closed and flushed.
        raise ConversionError(
            f"{command[0]} failed. More info in the log file: {log_name}"
        )

`to_xml(path, language)`

Convert a PDF containing ocr'd text to a Giella xml document.

Parameters:

Name	Type	Description	Default
`path`	`Path`	The path to the PDF file.	required
`language`	`str`	The language of the text in the PDF file.	required

Returns: (_Element): The xml document.

Source code in corpustools/ocrconverter.py

def to_xml(path: Path, language: str) -> _Element:
    """Convert a PDF containing ocr'd text to a Giella xml document.

    The document consists of an empty header element and a body element that
    holds one p element per non-blank paragraph returned by to_plaintext.

    Args:
        path (Path): The path to the PDF file.
        language (str): The language of the text in the PDF file.
    Returns:
        (_Element): The xml document.
    """
    # XML 1.0 forbids control characters below U+0020 other than tab, newline
    # and carriage return. OCR output can contain them (Tesseract's default
    # page separator is a form feed), and an XML library that validates text,
    # as lxml does, refuses such strings. Strip them before assigning.
    xml_illegal = {
        code: None for code in range(0x20) if code not in (0x09, 0x0A, 0x0D)
    }

    document = Element("document")
    SubElement(document, "header")
    body = SubElement(document, "body")
    for text in to_plaintext(path, language):
        cleaned = text.translate(xml_illegal)
        # Splitting on blank lines produces empty chunks; they are not paragraphs.
        if cleaned.strip():
            SubElement(body, "p").text = cleaned

    return document