Skip to content

ocrconverter

to_plaintext(path, language)

Convert a PDF containing OCR'd text into an iterable of text paragraphs.

Pick up the tiff images created by to_tiff and use pytesseract to extract text from them.

Parameters:

Name Type Description Default
path Path

The path to the PDF file.

required
language str

The language of the text in the PDF file.

required
Source code in corpustools/ocrconverter.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def to_plaintext(path: Path, language: str) -> Iterable[str]:
    """Convert a PDF containing ocr'd text to an iterable containing text paragraphs.

    Pick up the tiff images created by to_tiff and use pytesseract to extract text from
    them. The images are processed in the order of their file names, and all of
    them are removed from /tmp when the generator finishes, fails or is closed.

    Args:
        path (Path): The path to the PDF file.
        language (str): The language of the text in the PDF file.

    Yields:
        The text of each image, split into paragraphs on blank lines.
    """
    to_tiff(path)

    # Path.glob makes no ordering promise, so sort to keep the paragraphs in
    # image order. Comparing the name length first keeps the numeric order
    # intact even if the counter in the file name outgrows its zero padding.
    image_files = sorted(
        Path("/tmp").glob(f"{path.stem}-*.tif"),
        key=lambda image_file: (len(image_file.name), image_file.name),
    )

    try:
        for image_file in image_files:
            # Close the image as soon as the text has been extracted.
            with Image.open(image_file) as image:
                text = pytesseract.image_to_string(image, lang=language)
            # The text is in memory now, so the image can go before yielding.
            image_file.unlink()
            yield from text.split("\n\n")
    finally:
        # Sweep up whatever was not reached (OCR error, consumer stopped
        # early); stale images would otherwise match the glob on a later run.
        for leftover in image_files:
            if leftover.exists():
                leftover.unlink()

to_tiff(path)

Convert a PDF to a series of TIFF images.

Parameters:

Name Type Description Default
path Path

The path to the PDF file.

required

Raises:

Type Description
ConversionError

If the conversion fails.

Source code in corpustools/ocrconverter.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
def to_tiff(path: Path) -> None:
    """Convert a PDF to a series of tiff images.

    Runs ``pdfimages -tiff`` with the stem of the PDF as image root and /tmp
    as the working directory given to the command runner. If the command
    returns a non-zero exit code, its stdout and stderr are written to
    ``<path>.log`` before the error is raised.

    Args:
        path (Path): The path to the PDF file.

    Raises:
        ConversionError: If the conversion fails.
    """
    # Build the argument list directly: formatting a string and splitting it
    # on whitespace would tear apart paths that contain spaces.
    # NOTE(review): cwd is /tmp, so a relative path is presumably resolved
    # against /tmp by pdfimages -- confirm that callers pass absolute paths.
    command = ["pdfimages", "-tiff", str(path), path.stem]

    runner = ExternalCommandRunner()
    runner.run(command, cwd="/tmp")

    if runner.returncode != 0:
        log_path = f"{path}.log"
        with open(log_path, "w") as logfile:
            print(f"stdout\n{runner.stdout}\n", file=logfile)
            print(f"stderr\n{runner.stderr}\n", file=logfile)

        # Raise only after the log file has been closed and flushed.
        raise ConversionError(
            f"{command[0]} failed. More info in the log file: {log_path}"
        )

to_xml(path, language)

Convert a PDF containing OCR'd text to a Giella XML document.

Parameters:

Name Type Description Default
path Path

The path to the PDF file.

required
language str

The language of the text in the PDF file.

required

Returns: (_Element): The xml document.

Source code in corpustools/ocrconverter.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def to_xml(path: Path, language: str) -> _Element:
    """Build a Giella xml document from the text of a PDF.

    The document gets an empty ``header`` element and a ``body`` element
    holding one ``p`` element per paragraph produced by to_plaintext.

    Args:
        path (Path): The path to the PDF file.
        language (str): The language of the text in the PDF file.

    Returns:
        (_Element): The root ``document`` element.
    """
    root = Element("document")
    SubElement(root, "header")
    body_element = SubElement(root, "body")

    for paragraph in to_plaintext(path, language):
        paragraph_element = SubElement(body_element, "p")
        paragraph_element.text = paragraph

    return root