Skip to content

pdfconverter

Convert pdf files to the Giella xml format.

PDF2XMLConverter

Bases: basicconverter.BasicConverter

Class to convert the xml output of pdftohtml to Giella xml.

Attributes:

Name Type Description
extractor PDFTextExtractor

class to extract text from the xml that pdftohtml produces.

pdffontspecs PDFFontspecs

class to store fontspecs found in the xml pages.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
class PDF2XMLConverter(basicconverter.BasicConverter):
    """Class to convert the xml output of pdftohtml to Giella xml.

    Attributes:
        extractor (PDFTextExtractor): class to extract text from the xml that
            pdftohtml produces. NOTE(review): not assigned anywhere in this
            class; confirm whether it is still provided elsewhere.
        pdffontspecs (PDFFontspecs): class to store fontspecs found in the xml
            pages.
    """

    def __init__(self, filename):
        """Initialise the PDF2XMLConverter class.

        Args:
            filename (str): the path to the pdf file.
        """
        super().__init__(filename)
        self.pdffontspecs = PDFFontspecs()

    @staticmethod
    def strip_chars(content, extra=""):
        """Strip unwanted chars from the document.

        Removes the C0 control characters except tab, line feed and carriage
        return, the DEL character and everything given in ``extra``. All
        attributes are dropped from ``<a ...>`` start tags as well.

        Args:
            content (str): the xml document that pdftohtml produces
            extra (str): more characters that should be removed. The value
                is inserted verbatim into a regex character class.

        Returns:
            (str): containing the modified version of the document.
        """
        remove_re = re.compile(f"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F{extra}]")
        content = remove_re.sub("", content)

        # Microsoft Word PDF's have Latin-1 file names in links; we
        # don't actually need any link attributes:
        return re.sub("<a [^>]+>", "<a>", content)

    @staticmethod
    def replace_ligatures(content):
        """Replace unwanted strings with correct replacements.

        Every key is replaced twice: first together with one trailing
        space, then on its own.

        Args:
            content (str): content of an xml document.

        Returns:
            (str): String containing the new content of the xml document.
        """
        replacements = {
            "[dstrok]": "đ",
            "[Dstrok]": "Đ",
            "[tstrok]": "ŧ",
            "[Tstrok]": "Ŧ",
            "[scaron]": "š",
            "[Scaron]": "Š",
            "[zcaron]": "ž",
            "[Zcaron]": "Ž",
            "[ccaron]": "č",
            "[Ccaron]": "Č",
            "[eng": "ŋ",
            " ]": "",
            "Ď": "đ",  # cough
            "ď": "đ",  # cough
            # The ligatures are spelled as escapes on purpose: written as
            # glyphs they are easily normalised into plain ASCII pairs, and
            # an ASCII "fi" key would eat the space after every "fi".
            "\ufb01": "fi",
            "\ufb02": "fl",
            "\ufb00": "ff",
            "\ufb03": "ffi",
            "\ufb04": "ffl",
            "\ufb05": "ft",
            "\u017ft": "ft",  # decomposed spelling of the long s + t ligature
        }

        for key, value in replacements.items():
            content = content.replace(key + " ", value)
            content = content.replace(key, value)

        return content

    def convert2intermediate(self):
        """Convert from pdf to a corpus xml file.

        Returns:
            (lxml.etree.Element): A corpus xml etree with the content of
                the pdf file, but without most of the metadata.
        """
        # Build the argument list directly. Splitting a formatted string
        # would break a path that contains whitespace into several arguments.
        command = [
            "pdftohtml",
            "-hidden",
            "-enc",
            "UTF-8",
            "-stdout",
            "-nodrm",
            "-i",
            "-s",
            str(self.orig),
        ]
        return self.pdftohtml2intermediate(self.extract_text(command))

    @staticmethod
    def possibly_add_to_body(body, this_p):
        """Append this_p to body unless it has neither text nor children.

        Args:
            body (lxml.etree.Element): the element that collects paragraphs.
            this_p (lxml.etree.Element): the paragraph candidate.
        """
        if this_p.text or len(this_p):
            body.append(this_p)

    def pdftohtml2intermediate(self, pdftohtmloutput):
        """Convert output of pdftohtml to a corpus xml file.

        Args:
            pdftohtmloutput (str): the document that pdftohtml produced.

        Returns:
            (lxml.etree.Element): A corpus xml etree with the content of the
                pdf file, but without most of the metadata.

        Raises:
            util.ConversionError: raised by handle_syntaxerror when the
                cleaned output cannot be parsed.
        """
        pdf_content = self.split_by_br(
            self.replace_ligatures(self.strip_chars(pdftohtmloutput))
        )

        document = etree.Element("html")
        body = etree.SubElement(document, "body")

        try:
            parser = etree.HTMLParser()
            root_element = etree.fromstring(pdf_content.encode("utf8"), parser=parser)
        except etree.XMLSyntaxError as error:
            # handle_syntaxerror always raises, so root_element is bound
            # whenever execution continues below.
            self.handle_syntaxerror(error, util.lineno(), pdf_content)

        this_p = etree.Element("p")
        for paragraph in self.parse_pages(root_element):
            text = paragraph.xpath("string()").strip()
            if not text:
                continue
            # A first character that changes when lowercased starts a new
            # paragraph; anything else is merged into the current one.
            if text[0] != text[0].lower():
                self.possibly_add_to_body(body, this_p)
                this_p = etree.Element("p")
            this_p = merge(this_p, paragraph)

        self.possibly_add_to_body(body, this_p)

        return document

    def pdftohtml2html(self, pdftohtmloutput):
        """Convert output of pdftohtml to html (applying our regular fixes)

        Args:
            pdftohtmloutput (str): the document that pdftohtml produced.

        Returns:
            (bytes): The serialised html document (etree.tostring is called
                with encoding="utf8") with the content of the pdf file, but
                without most of the metadata.
        """
        doc = self.pdftohtml2intermediate(pdftohtmloutput)
        meta = etree.Element("meta")
        meta.attrib["charset"] = "utf-8"
        doc.insert(0, meta)
        for header in doc.findall("header"):
            doc.remove(header)
        doc.tag = "html"
        lang = self.metadata.get_variable("mainlang")
        if lang is None or lang == "":
            # No main language in the metadata: fall back to "se".
            lang = "se"
        doc.attrib["lang"] = lang
        return etree.tostring(doc, encoding="utf8", method="html", pretty_print=True)

    def parse_page(self, page):
        """Parse the page element.

        Args:
            page (Any): a pdf xml page element.

        Yields:
            The elements that PDFPage.pick_valid_text_elements produces,
            unless the page is listed in the metadata's skip_pages.

        Raises:
            util.ConversionError: if an xslsetter.XsltError is raised while
                handling the page.
        """
        try:
            pdfpage = PDFPage(
                page,
                metadata_margins=self.metadata.margins,
                metadata_inner_margins=self.metadata.inner_margins,
                linespacing=self.metadata.linespacing,
            )
            if not pdfpage.is_skip_page(self.metadata.skip_pages):
                # pdfpage.fix_font_id(self.pdffontspecs)
                yield from pdfpage.pick_valid_text_elements()
        except xslsetter.XsltError as error:
            raise util.ConversionError(str(error)) from error

    def parse_pages(self, root_element):
        """Parse the pages of the pdf xml document.

        Args:
            root_element (xml.etree.Element): the root element of the pdf2xml
                document.

        Returns:
            A generator over the paragraphs of every div whose id starts
            with "page".
        """
        return (
            paragraph
            for page in root_element.xpath('//div[starts-with(@id, "page")]')
            for paragraph in self.parse_page(page)
        )

    def add_fontspecs(self, page):
        """Extract font specs found in a pdf2xml page element.

        Args:
            page (etree.Element): a pdf page
        """
        for xmlfontspec in page.iter("fontspec"):
            self.pdffontspecs.add_fontspec(xmlfontspec)

    def split_by_br(self, text):
        """Rejoin the pieces of text that are separated by ``<br/>``.

        Each piece is passed to handle_br together with the piece that
        follows it; the last piece is appended unchanged.

        Args:
            text (str): the document that pdftohtml produced.

        Returns:
            (str): the rejoined document. When text has no ``<br/>`` it is
                returned untouched, i.e. ``&#160;`` is not replaced then.
        """
        brs = text.replace("&#160;", " ").split("<br/>")

        if len(brs) == 1:
            return text

        strings = [
            handle_br(brs[index], current) for index, current in enumerate(brs[1:])
        ]
        strings.append(brs[-1])

        return "".join(strings)

    def extract_text(self, command):
        """Extract the text from a document.

        Args:
            command (list[str]): a list containing the command and
                the arguments sent to ExternalCommandRunner.

        Returns:
            (str): the output of the program, decoded as utf8.

        Raises:
            util.ConversionError: if the program exits with a non-zero
                return code. Its stdout and stderr are written to
                ``<orig>.log`` first.
        """
        runner = util.ExternalCommandRunner()
        runner.run(command, cwd="/tmp")

        if runner.returncode != 0:
            log_name = self.orig + ".log"
            # Explicit encoding: the log must not depend on the locale.
            with open(
                log_name, "w", encoding="utf8", errors="backslashreplace"
            ) as logfile:
                print(f"stdout\n{runner.stdout}\n", file=logfile)
                print(f"stderr\n{runner.stderr}\n", file=logfile)
            raise util.ConversionError(
                f"{command[0]} failed. More info in the log file: {log_name}"
            )

        return runner.stdout.decode("utf8")

    def handle_syntaxerror(self, error, lineno, invalid_input):
        """Handle an xml syntax error.

        Writes the parser's error log and the invalid input to
        ``<orig>.log`` and then always raises.

        Args:
            error (Exception): an exception
            lineno (int): the line number in this module where the error happened.
            invalid_input (str): a string containing the invalid input.

        Raises:
            util.ConversionError: always, pointing at the log file.
        """
        log_name = self.orig + ".log"
        # The log is opened as utf8 text with backslashreplace, so any
        # message can be written as str. The former fallback wrote bytes to
        # a text file, which raises TypeError.
        with open(log_name, "w", encoding="utf8", errors="backslashreplace") as logfile:
            logfile.write(f"Error at: {lineno}")
            for entry in error.error_log:
                logfile.write(f"\n{entry.line}: {entry.column} ")
                logfile.write(entry.message)
                logfile.write("\n")

            logfile.write(invalid_input)

        raise util.ConversionError(
            f"{type(self).__name__}: log is found in {log_name}"
        )

__init__(filename)

Initialise the PDF2XMLConverter class.

Parameters:

Name Type Description Default
filename str

the path to the pdf file.

required
Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
446
447
448
449
450
451
452
453
def __init__(self, filename):
    """Set up a converter for one pdf file.

    Args:
        filename (str): path of the pdf file that will be converted.
    """
    super().__init__(filename)
    # Starts out empty; add_fontspecs() is what puts fontspecs into it.
    self.pdffontspecs = PDFFontspecs()

add_fontspecs(page)

Extract font specs found in a pdf2xml page element.

Parameters:

Name Type Description Default
page etree.Element

a pdf page

required
Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
614
615
616
617
618
619
620
621
def add_fontspecs(self, page):
    """Register every fontspec element of a pdf2xml page.

    Args:
        page (etree.Element): a pdf page; all of its ``fontspec``
            descendants are handed to ``self.pdffontspecs``.
    """
    for spec_element in page.iter("fontspec"):
        self.pdffontspecs.add_fontspec(spec_element)

convert2intermediate()

Convert from pdf to a corpus xml file.

Returns:

Type Description
lxml.etree.Element

A corpus xml etree with the content of the pdf file, but without most of the metadata.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
514
515
516
517
518
519
520
521
522
523
def convert2intermediate(self):
    """Convert from pdf to a corpus xml file.

    Runs pdftohtml on ``self.orig`` through extract_text and hands the
    output to pdftohtml2intermediate.

    Returns:
        (lxml.etree.Element): A corpus xml etree with the content of
            the pdf file, but without most of the metadata.
    """
    # Build the argument list directly. Splitting a formatted string would
    # break a path that contains whitespace into several arguments.
    command = [
        "pdftohtml",
        "-hidden",
        "-enc",
        "UTF-8",
        "-stdout",
        "-nodrm",
        "-i",
        "-s",
        str(self.orig),
    ]
    pdftohtmloutput = self.extract_text(command)
    return self.pdftohtml2intermediate(pdftohtmloutput)

extract_text(command)

Extract the text from a document.

Parameters:

Name Type Description Default
command list[str]

a list containing the command and the arguments sent to ExternalCommandRunner.

required

Returns:

Type Description
str

the output of the program, decoded as UTF-8

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
def extract_text(self, command):
    """Extract the text from a document.

    Args:
        command (list[str]): a list containing the command and
            the arguments sent to ExternalCommandRunner.

    Returns:
        (str): the output of the program, decoded as utf8.

    Raises:
        util.ConversionError: if the program exits with a non-zero return
            code. Its stdout and stderr are written to ``<orig>.log`` first.
    """
    runner = util.ExternalCommandRunner()
    runner.run(command, cwd="/tmp")

    if runner.returncode != 0:
        log_name = self.orig + ".log"
        # Explicit encoding: the log must not depend on the locale.
        with open(
            log_name, "w", encoding="utf8", errors="backslashreplace"
        ) as logfile:
            print(f"stdout\n{runner.stdout}\n", file=logfile)
            print(f"stderr\n{runner.stderr}\n", file=logfile)
        raise util.ConversionError(
            f"{command[0]} failed. More info in the log file: {log_name}"
        )

    return runner.stdout.decode("utf8")

handle_syntaxerror(error, lineno, invalid_input)

Handle an xml syntax error.

Parameters:

Name Type Description Default
error Exception

an exception

required
lineno int

the line number in this module where the error happened.

required
invalid_input str

a string containing the invalid input.

required
Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
def handle_syntaxerror(self, error, lineno, invalid_input):
    """Handle an xml syntax error.

    Writes the parser's error log and the invalid input to ``<orig>.log``
    and then always raises.

    Args:
        error (Exception): an exception
        lineno (int): the line number in this module where the error happened.
        invalid_input (str): a string containing the invalid input.

    Raises:
        util.ConversionError: always, pointing at the log file.
    """
    log_name = self.orig + ".log"
    # The log is opened as utf8 text with backslashreplace, so any message
    # can be written as str. The former fallback wrote bytes to a text file,
    # which raises TypeError.
    with open(log_name, "w", encoding="utf8", errors="backslashreplace") as logfile:
        logfile.write(f"Error at: {lineno}")
        for entry in error.error_log:
            logfile.write(f"\n{entry.line}: {entry.column} ")
            logfile.write(entry.message)
            logfile.write("\n")

        logfile.write(invalid_input)

    raise util.ConversionError(
        f"{type(self).__name__}: log is found in {log_name}"
    )

parse_page(page)

Parse the page element.

Parameters:

Name Type Description Default
page Any

a pdf xml page element.

required
Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
def parse_page(self, page):
    """Parse the page element.

    Args:
        page (Any): a pdf xml page element.

    Yields:
        The elements that PDFPage.pick_valid_text_elements produces, unless
        the page is listed in the metadata's skip_pages.

    Raises:
        util.ConversionError: if an xslsetter.XsltError is raised while
            handling the page.
    """
    try:
        pdfpage = PDFPage(
            page,
            metadata_margins=self.metadata.margins,
            metadata_inner_margins=self.metadata.inner_margins,
            linespacing=self.metadata.linespacing,
        )
        if not pdfpage.is_skip_page(self.metadata.skip_pages):
            # pdfpage.fix_font_id(self.pdffontspecs)
            yield from pdfpage.pick_valid_text_elements()
    except xslsetter.XsltError as error:
        # Chain explicitly so the original XsltError stays visible as the
        # cause of the conversion error.
        raise util.ConversionError(str(error)) from error

parse_pages(root_element)

Parse the pages of the pdf xml document.

Parameters:

Name Type Description Default
root_element xml.etree.Element

the root element of the pdf2xml document.

required
Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
601
602
603
604
605
606
607
608
609
610
611
612
def parse_pages(self, root_element):
    """Collect the paragraphs of all pages in the pdf2xml document.

    Args:
        root_element (xml.etree.Element): the root element of the pdf2xml
            document.

    Returns:
        A generator that walks every div whose id starts with "page" and
        produces what parse_page yields for it, page by page.
    """
    # The page lookup happens right away; parsing the pages is lazy.
    pages = root_element.xpath('//div[starts-with(@id, "page")]')
    return (item for page in pages for item in self.parse_page(page))

pdftohtml2html(pdftohtmloutput)

Convert output of pdftohtml to html (applying our regular fixes)

Returns:

Type Description
bytes

The serialised html document (UTF-8 encoded) with the content of the pdf file, but without most of the metadata.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
def pdftohtml2html(self, pdftohtmloutput):
    """Convert output of pdftohtml to html (applying our regular fixes)

    Args:
        pdftohtmloutput (str): the document that pdftohtml produced.

    Returns:
        (bytes): The serialised html document (etree.tostring is called
            with encoding="utf8") with the content of the pdf file, but
            without most of the metadata.
    """
    doc = self.pdftohtml2intermediate(pdftohtmloutput)
    meta = etree.Element("meta")
    meta.attrib["charset"] = "utf-8"
    doc.insert(0, meta)
    # findall returns a list, so removing while looping over it is safe.
    for header in doc.findall("header"):
        doc.remove(header)
    doc.tag = "html"
    lang = self.metadata.get_variable("mainlang")
    if lang is None or lang == "":
        # No main language in the metadata: fall back to "se".
        lang = "se"
    doc.attrib["lang"] = lang
    return etree.tostring(doc, encoding="utf8", method="html", pretty_print=True)

pdftohtml2intermediate(pdftohtmloutput)

Convert output of pdftohtml to a corpus xml file.

Returns:

Type Description
lxml.etree.Element

A corpus xml etree with the content of the pdf file, but without most of the metadata.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
def pdftohtml2intermediate(self, pdftohtmloutput):
    """Turn the output of pdftohtml into an html/body tree of paragraphs.

    The output is cleaned (strip_chars, replace_ligatures, split_by_br),
    parsed with an HTML parser, and the paragraphs found by parse_pages are
    merged into ``p`` elements below ``body``.

    Returns:
        (lxml.etree.Element): A corpus xml etree with the content of the
            pdf file, but without most of the metadata.
    """
    stripped = self.strip_chars(pdftohtmloutput)
    pdf_content = self.split_by_br(self.replace_ligatures(stripped))

    document = etree.Element("html")
    body = etree.SubElement(document, "body")

    try:
        root_element = etree.fromstring(
            pdf_content.encode("utf8"), parser=etree.HTMLParser()
        )
    except etree.XMLSyntaxError as error:
        # handle_syntaxerror logs the problem and raises.
        self.handle_syntaxerror(error, util.lineno(), pdf_content)

    pending = etree.Element("p")
    for paragraph in self.parse_pages(root_element):
        text = paragraph.xpath("string()").strip()
        if not text:
            continue
        # A first character that changes when lowercased opens a new
        # paragraph; everything else continues the pending one.
        if text[0] != text[0].lower():
            self.possibly_add_to_body(body, pending)
            pending = etree.Element("p")
        pending = merge(pending, paragraph)

    self.possibly_add_to_body(body, pending)

    return document

replace_ligatures(content) staticmethod

Replace unwanted strings with correct replacements.

Parameters:

Name Type Description Default
content str

content of an xml document.

required

Returns:

Type Description
str

String containing the new content of the xml document.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
@staticmethod
def replace_ligatures(content):
    """Replace unwanted strings with correct replacements.

    Every key is replaced twice: first together with one trailing space,
    then on its own.

    Args:
        content (str): content of an xml document.

    Returns:
        (str): String containing the new content of the xml document.
    """
    replacements = {
        "[dstrok]": "đ",
        "[Dstrok]": "Đ",
        "[tstrok]": "ŧ",
        "[Tstrok]": "Ŧ",
        "[scaron]": "š",
        "[Scaron]": "Š",
        "[zcaron]": "ž",
        "[Zcaron]": "Ž",
        "[ccaron]": "č",
        "[Ccaron]": "Č",
        "[eng": "ŋ",
        " ]": "",
        "Ď": "đ",  # cough
        "ď": "đ",  # cough
        # The ligatures are spelled as escapes on purpose: written as glyphs
        # they are easily normalised into plain ASCII pairs, and an ASCII
        # "fi" key would eat the space after every "fi".
        "\ufb01": "fi",
        "\ufb02": "fl",
        "\ufb00": "ff",
        "\ufb03": "ffi",
        "\ufb04": "ffl",
        "\ufb05": "ft",
        "\u017ft": "ft",  # decomposed spelling of the long s + t ligature
    }

    for key, value in replacements.items():
        content = content.replace(key + " ", value)
        content = content.replace(key, value)

    return content

strip_chars(content, extra='') staticmethod

Strip unwanted chars from the document.

Parameters:

Name Type Description Default
content str

the xml document that pdftohtml produces

required
extra str

additional characters that should be removed

''

Returns:

Type Description
str

containing the modified version of the document.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
@staticmethod
def strip_chars(content, extra=""):
    """Remove unwanted characters and link attributes from the document.

    The C0 control characters except tab, line feed and carriage return
    are removed, together with DEL and everything listed in ``extra``.

    Args:
        content (str): the xml document that pdftohtml produces
        extra (str): additional characters to remove; the value is placed
            verbatim inside a regex character class.

    Returns:
        (str): the cleaned version of the document.
    """
    unwanted = re.compile(f"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F{extra}]")
    without_controls = unwanted.sub("", content)

    # Microsoft Word PDF's have Latin-1 file names in links. None of the
    # link attributes are needed, so every <a ...> start tag becomes <a>.
    return re.sub("<a [^>]+>", "<a>", without_controls)

PDFEmptyPageError

Bases: Exception

Raise this exception if a pdf page is empty.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
162
163
class PDFEmptyPageError(Exception):
    """Exception that signals an empty pdf page."""

PDFFontspecs

Add font specs found in a pdf page to this class.

Attributes:

Name Type Description
pdffontspecs dict[PDFFontspec, int]

map fontspecs to fontspec ids.

duplicates dict[str, str]

map ids of duplicate fontspecs to the id of the first instance of this fontspec.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
class PDFFontspecs:
    """Registry of the font specs found in pdf pages.

    Attributes:
        pdffontspecs (dict): maps every distinct PDFFontspec to the id of
            the first fontspec element it was seen on.
        duplicates (dict): maps the id of a repeated fontspec to the id of
            the first instance of that fontspec.
    """

    def __init__(self):
        """Create an empty registry."""
        self.pdffontspecs, self.duplicates = {}, {}

    def add_fontspec(self, xmlfontspec):
        """Register a pdf2xml fontspec element.

        A fontspec that equals an already registered one is recorded in
        ``duplicates``; otherwise it is added to ``pdffontspecs``.

        Args:
            xmlfontspec (etree.Element): a PDF2XML fontspec element found in a
                PDF2XML page element. Its ``id``, ``size``, ``family`` and
                ``color`` attributes are read.
        """
        new_id = xmlfontspec.get("id")
        candidate = PDFFontspec(
            size=xmlfontspec.get("size"),
            family=xmlfontspec.get("family"),
            color=xmlfontspec.get("color"),
        )

        for known, first_id in self.pdffontspecs.items():
            if known == candidate:
                # Already registered: remember which id it duplicates.
                self.duplicates[new_id] = first_id
                return

        self.pdffontspecs[candidate] = new_id

    def corrected_id(self, font_id):
        """Return the id that should be used for a fontspec.

        Some xmlfontspecs have different id's for an identical font. The
        id of the first instance is returned for those.

        Args:
            font_id: the id of a fontspec.

        Returns:
            The id recorded in ``duplicates`` for font_id, or font_id
            itself when it is not a known duplicate.
        """
        return self.duplicates.get(font_id, font_id)

__init__()

Initialise the PDFFontspecs class.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
118
119
120
121
def __init__(self):
    """Create an empty fontspec registry."""
    # Two separate dicts: known fontspecs and the ids that duplicate them.
    self.pdffontspecs, self.duplicates = {}, {}

add_fontspec(xmlfontspec)

Add a pdf2xml fontspec to this class.

Parameters:

Name Type Description Default
xmlfontspec etree.Element

a PDF2XML fontspec element found in a PDF2XML page element.

required
Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def add_fontspec(self, xmlfontspec):
    """Register a pdf2xml fontspec element.

    A fontspec that equals an already registered one is recorded in
    ``self.duplicates``; otherwise it is added to ``self.pdffontspecs``.

    Args:
        xmlfontspec (etree.Element): a PDF2XML fontspec element found in a
            PDF2XML page element. Its ``id``, ``size``, ``family`` and
            ``color`` attributes are read.
    """
    new_id = xmlfontspec.get("id")
    candidate = PDFFontspec(
        size=xmlfontspec.get("size"),
        family=xmlfontspec.get("family"),
        color=xmlfontspec.get("color"),
    )

    for known, first_id in self.pdffontspecs.items():
        if known == candidate:
            # Already registered: remember which id it duplicates.
            self.duplicates[new_id] = first_id
            return

    self.pdffontspecs[candidate] = new_id

corrected_id(font_id)

Return a corrected id of a fontspec.

Some xmlfontspecs have different id's for an identical font. This function makes sure identical fonts have identical id's.

Parameters:

Name Type Description Default
font_id int

an integer that is the id of the fontspec.

required

Returns:

Type Description
int

an integer that is the corrected id of the fontspec.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
def corrected_id(self, font_id):
    """Return the id that should be used for a fontspec.

    Some xmlfontspecs have different id's for an identical font. The id of
    the first instance is returned for those.

    Args:
        font_id: the id of a fontspec.

    Returns:
        The id recorded in ``self.duplicates`` for font_id, or font_id
        itself when it is not a known duplicate.
    """
    return self.duplicates.get(font_id, font_id)

PDFPage

Reads a page element.

Attributes:

Name Type Description
textelements list of PDFTextElements

contains the text of the page

pdf_pagemetadata PDFPageMetadata

contains the metadata of the page

The textelements are manipulated in several ways, then ordered in the way they appear on the page and finally sent to PDFTextExtractor

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
class PDFPage:
    """Reads a page element.

    Attributes:
        page_element (etree.Element): the page element given to the
            constructor.
        pdf_pagemetadata (PDFPageMetadata): contains the metadata of the page
        linespacing_dict (dict): the linespacing settings given to the
            constructor. The ``linespacing`` property looks up the keys
            "all", "even", "odd" and the page number.

    NOTE(review): fix_font_id and remove_elements_outside_margin read
    ``self.textelements``, and the latter calls
    ``self.is_inside_inner_margins``. Neither is defined in this class;
    confirm whether these two methods are still in use.
    """

    def __init__(
        self,
        page_element,
        metadata_margins=None,
        metadata_inner_margins=None,
        linespacing=None,
    ):
        """Initialise the PDFPage class.

        Args:
            page_element (etree.Element): an etree element representing a pdf page
            metadata_margins (dict): a dict containing margins read from the metadata
                file.
            metadata_inner_margins (dict): a dict containing inner_margins read from
                the metadata file.
            linespacing (dict): linespacing settings; None means no
                settings, which makes the ``linespacing`` property return
                its default.
        """
        self.page_element = page_element
        # The linespacing property reads this attribute. It used to be left
        # unset, which made the property raise AttributeError.
        self.linespacing_dict = {} if linespacing is None else linespacing
        self.pdf_pagemetadata = PDFPageMetadata(
            page_id=page_element.get("id"),
            page_style=page_element.get("style"),
            metadata_margins=metadata_margins,
            metadata_inner_margins=metadata_inner_margins,
        )

    def is_skip_page(self, skip_pages):
        """Find out if this page should be skipped.

        Args:
            skip_pages (list of mixed): list of the pages that should be
                skipped. May contain "odd", "even" and page numbers.

        Returns:
            (bool): True if this page should be skipped, otherwise false.
        """
        page_number = self.pdf_pagemetadata.page_number
        return (
            ("odd" in skip_pages and (page_number % 2) == 1)
            or ("even" in skip_pages and (page_number % 2) == 0)
            or page_number in skip_pages
        )

    @property
    def linespacing(self):
        """Return linespacing.

        The first truthy setting wins, in this order: "all", "even" (on
        even pages), "odd" (on odd pages), the page number. Without a
        matching setting 1.5 is returned.
        """
        if self.linespacing_dict.get("all"):
            return self.linespacing_dict["all"]
        elif self.linespacing_dict.get("even") and (
            (self.pdf_pagemetadata.page_number % 2) == 0
        ):
            return self.linespacing_dict["even"]
        elif self.linespacing_dict.get("odd") and (
            (self.pdf_pagemetadata.page_number % 2) == 1
        ):
            return self.linespacing_dict["odd"]
        elif self.linespacing_dict.get(self.pdf_pagemetadata.page_number):
            return self.linespacing_dict[self.pdf_pagemetadata.page_number]
        else:
            return 1.5

    def fix_font_id(self, pdffontspecs):
        """Fix font id in text elements.

        Sometimes the same font has different ID's. Correct that ID
        if necessary.

        Args:
            pdffontspecs (PDFFontspecs): a PDFFontspecs instance.
        """
        # NOTE(review): self.textelements is not assigned in this class.
        for textelement in self.textelements:
            correct = pdffontspecs.corrected_id(textelement.font)
            textelement.text_elt.set("font", correct)

    def remove_elements_outside_margin(self):
        """Remove PDFTextElements from textelements if needed."""
        # NOTE(review): self.textelements and self.is_inside_inner_margins
        # are not defined in this class.
        margins = self.pdf_pagemetadata.compute_margins()
        inner_margins = self.pdf_pagemetadata.compute_inner_margins()

        self.textelements[:] = [
            t for t in self.textelements if self.is_inside_margins(t, margins)
        ]
        if inner_margins:
            self.textelements[:] = [
                t
                for t in self.textelements
                if not self.is_inside_inner_margins(t, inner_margins)
            ]

    @staticmethod
    def is_inside_margins(text, margins):
        """Check if text is inside the given margins.

        Args:
            text: a text element; the ``top`` and ``left`` values of its
                ``style`` attribute are used.
            margins (dict): has the keys "top_margin", "bottom_margin",
                "left_margin" and "right_margin".

        Returns:
            (bool): False when margins is empty or None, otherwise whether
                top and left lie strictly between the respective margins.
        """
        if not margins:
            return False

        style = styles(text.get("style"))
        top = int(style.get("top"))
        left = int(style.get("left"))

        return (
            margins["top_margin"] < top < margins["bottom_margin"]
            and margins["left_margin"] < left < margins["right_margin"]
        )

    def pick_valid_text_elements(self):
        """Pick the wanted text elements from a page.

        This is the main function of this class

        Yields:
            A deep copy of every ``p`` element of the page that is inside
            the margins and not inside the inner margins.
        """
        margins = self.pdf_pagemetadata.compute_margins()
        inner_margins = self.pdf_pagemetadata.compute_inner_margins()
        for paragraph in self.page_element.iter("p"):
            if self.is_inside_margins(
                paragraph, margins
            ) and not self.is_inside_margins(paragraph, inner_margins):
                yield deepcopy(paragraph)

linespacing property

Return linespacing.

__init__(page_element, metadata_margins=None, metadata_inner_margins=None, linespacing=None)

Initialise the PDFPage class.

Parameters:

Name Type Description Default
page_element etree.Element

an etree element representing a pdf page

required
metadata_margins dict

a dict containing margins read from the metadata file.

None
metadata_inner_margins dict

a dict containing inner_margins read from the metadata file.

None
Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
def __init__(
    self,
    page_element,
    metadata_margins=None,
    metadata_inner_margins=None,
    linespacing=None,
):
    """Set up a PDFPage from a page element.

    Args:
        page_element (etree.Element): an etree element representing a pdf
            page. Its "id" and "style" attributes are handed on to
            PDFPageMetadata.
        metadata_margins (dict): a dict containing margins read from the
            metadata file.
        metadata_inner_margins (dict): a dict containing inner_margins read
            from the metadata file.
        linespacing: not referenced by this initialiser.
    """
    self.page_element = page_element

    page_id = page_element.get("id")
    page_style = page_element.get("style")
    self.pdf_pagemetadata = PDFPageMetadata(
        page_id=page_id,
        page_style=page_style,
        metadata_margins=metadata_margins,
        metadata_inner_margins=metadata_inner_margins,
    )

fix_font_id(pdffontspecs)

Fix font id in text elements.

Sometimes the same font has different IDs. Correct the ID if necessary.

Parameters:

Name Type Description Default
pdffontspecs PDFFontspecs

a PDFFontspecs instance.

required
Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
376
377
378
379
380
381
382
383
384
385
386
387
def fix_font_id(self, pdffontspecs):
    """Rewrite the font attribute of every text element.

    The same font can show up under several ids. Each element's font id is
    replaced by the id that pdffontspecs reports as the corrected one.

    Args:
        pdffontspecs (PDFFontspecs): a PDFFontspecs instance.
    """
    for element in self.textelements:
        font_id = pdffontspecs.corrected_id(element.font)
        element.text_elt.set("font", font_id)

is_inside_margins(text, margins) staticmethod

Check if text is inside the given margins.

text is a text element

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
@staticmethod
def is_inside_margins(text, margins):
    """Check if t is inside the given margins.

    t is a text element
    """
    if not margins:
        return False

    style = styles(text.get("style"))
    top = int(style.get("top"))
    left = int(style.get("left"))

    return (
        margins["top_margin"] < top < margins["bottom_margin"]
        and margins["left_margin"] < left < margins["right_margin"]
    )

is_skip_page(skip_pages)

Find out if this page should be skipped.

Parameters:

Name Type Description Default
skip_pages list of mixed

list of the pages that should be skipped.

required

Returns:

Type Description
bool

True if this page should be skipped, otherwise false.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
def is_skip_page(self, skip_pages):
    """Find out if this page should be skipped.

    Args:
        skip_pages (list of mixed): list of the pages that should be
            skipped. May hold page numbers and the words "odd" and "even".

    Returns:
        (bool): True if this page should be skipped, otherwise false.
    """
    page_number = self.pdf_pagemetadata.page_number
    parity = "odd" if page_number % 2 == 1 else "even"

    return parity in skip_pages or page_number in skip_pages

pick_valid_text_elements()

Pick the wanted text elements from a page.

This is the main function of this class

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
422
423
424
425
426
427
428
429
430
431
432
433
def pick_valid_text_elements(self):
    """Yield copies of the paragraphs that should be kept from this page.

    A "p" element of the page is kept when it is inside the page margins
    and not inside the inner margins.

    Yields:
        a deep copy of each accepted "p" element.
    """
    metadata = self.pdf_pagemetadata
    outer = metadata.compute_margins()
    inner = metadata.compute_inner_margins()

    for candidate in self.page_element.iter("p"):
        if not self.is_inside_margins(candidate, outer):
            continue
        if self.is_inside_margins(candidate, inner):
            continue
        yield deepcopy(candidate)

remove_elements_outside_margin()

Remove PDFTextElements from textelements if needed.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
389
390
391
392
393
394
395
396
397
398
399
400
401
402
def remove_elements_outside_margin(self):
    """Filter self.textelements in place against the page margins.

    Elements that are not inside the page margins are dropped first. When
    inner margins are set, elements that self.is_inside_inner_margins
    accepts are dropped as well.
    NOTE(review): is_inside_inner_margins is not defined in the code shown
    here; confirm it exists on the class.
    """
    metadata = self.pdf_pagemetadata
    outer = metadata.compute_margins()
    inner = metadata.compute_inner_margins()

    # Slice assignment keeps the identity of the list object.
    self.textelements[:] = list(
        filter(
            lambda element: self.is_inside_margins(element, outer),
            self.textelements,
        )
    )
    if not inner:
        return

    self.textelements[:] = [
        element
        for element in self.textelements
        if not self.is_inside_inner_margins(element, inner)
    ]

PDFPageMetadata

Read pdf metadata from the metadata file into this class.

Compute metadata needed by the conversion from the data contained in this class.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
class PDFPageMetadata:
    """Read pdf metadata from the metadata file into this class.

    Compute metadata needed by the conversion from the data contained in
    this class.

    Attributes:
        page_number (int): number parsed from the page id.
        page_height (int): the "height" value of the page style.
        page_width (int): the "width" value of the page style.
        metadata_margins (dict): margin name -> per-page coefficients.
        metadata_inner_margins (dict): inner margin name -> per-page
            coefficients.
    """

    # Coefficients (in percent) used when the metadata has no matching entry.
    _DEFAULT_MARGIN = 7
    _DEFAULT_INNER_MARGIN = 0

    # Margin side -> (attribute holding the page extent, measured from the
    # far edge of the page?)
    _SIDES = {
        "left_margin": ("page_width", False),
        "right_margin": ("page_width", True),
        "top_margin": ("page_height", False),
        "bottom_margin": ("page_height", True),
    }

    def __init__(
        self, page_id, page_style, metadata_margins=None, metadata_inner_margins=None
    ):
        """Initialise the PDFPageMetadata class.

        Args:
            page_id (str): the page id
            page_style (str): the styles as a css string
            metadata_margins (dict): a dict containing margins read
                from the metadata file.
            metadata_inner_margins (dict): a dict containing inner_margins
                read from the metadata file.
        """
        self.page_number = int(page_id.replace("page", "").replace("-div", ""))
        style = styles(page_style)
        self.page_height = int(style.get("height"))
        self.page_width = int(style.get("width"))
        self.metadata_margins = metadata_margins or {}
        self.metadata_inner_margins = metadata_inner_margins or {}

    def compute_margins(self):
        """Compute the margins of a page in pixels.

        Returns:
            (dict): a dict containing the four margins in pixels
        """
        return {
            margin: self.compute_margin(margin)
            for margin in ["right_margin", "left_margin", "top_margin", "bottom_margin"]
        }

    def compute_margin(self, margin):
        """Compute a margin in pixels.

        Args:
            margin (str): the name of the margin

        Returns:
            (int): an int telling where the margin is on the page, or None
                when margin is not one of the four margin names.
        """
        return self._margin_position(margin, self.get_coefficient(margin))

    def get_coefficient(self, margin):
        """Get the width of the margin in percent.

        Args:
            margin (str): the name of the margin

        Returns:
            the coefficient found in metadata_margins, or 7 when there is
            no matching entry.
        """
        return self._lookup_coefficient(
            self.metadata_margins, margin, self._DEFAULT_MARGIN
        )

    def compute_inner_margins(self):
        """Compute inner margins of the document.

        Returns:
            (dict): A dict where the key is the name of the margin and the
                value is an integer indicating where the margin is on the page.
                The dict is empty when the inner margins coincide with the
                edges of the page.
        """
        margins = {
            margin.replace("inner_", ""): self.compute_inner_margin(margin)
            for margin in [
                "inner_right_margin",
                "inner_left_margin",
                "inner_top_margin",
                "inner_bottom_margin",
            ]
        }

        covers_whole_page = (
            margins["bottom_margin"] == self.page_height
            and margins["top_margin"] == 0
            and margins["left_margin"] == 0
            and margins["right_margin"] == self.page_width
        )

        return {} if covers_whole_page else margins

    def compute_inner_margin(self, margin):
        """Compute an inner margin in pixels.

        Args:
            margin (str): the name of the margin, prefixed with "inner_"

        Returns:
            (int): an int telling where the margin is on the page, or None
                when margin is not one of the four inner margin names.
        """
        return self._margin_position(
            margin, self.get_inner_coefficient(margin), prefix="inner_"
        )

    def get_inner_coefficient(self, margin):
        """Get the width of the inner margin in percent.

        Args:
            margin (str): the name of the inner margin

        Returns:
            the coefficient found in metadata_inner_margins, or 0 when there
            is no matching entry.
        """
        return self._lookup_coefficient(
            self.metadata_inner_margins, margin, self._DEFAULT_INNER_MARGIN
        )

    def _margin_position(self, margin, coefficient, prefix=""):
        """Turn a coefficient in percent into a position on the page."""
        for side, (dimension, from_far_edge) in self._SIDES.items():
            if margin == prefix + side:
                extent = getattr(self, dimension)
                offset = coefficient * extent / 100.0
                return int(extent - offset) if from_far_edge else int(offset)

        return None

    def _lookup_coefficient(self, margin_table, margin, default):
        """Find the coefficient for margin on this page in margin_table."""
        margin_data = margin_table.get(margin)
        if not margin_data:
            return default

        # Precedence: the exact page number, then "all", then the parity of
        # the page number.
        parity = "even" if self.page_number % 2 == 0 else "odd"
        for key in (str(self.page_number), "all", parity):
            coefficient = margin_data.get(key)
            if coefficient is not None:
                return coefficient

        return default

__init__(page_id, page_style, metadata_margins=None, metadata_inner_margins=None)

Initialise the PDFPageMetadata class.

Parameters:

Name Type Description Default
page_id str

the page id

required
page_style str

the styles as a css string

required
metadata_margins dict

a dict containing margins read from the metadata file.

None
metadata_inner_margins dict

a dict containing inner_margins read from the metadata file.

None
Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
def __init__(
    self, page_id, page_style, metadata_margins=None, metadata_inner_margins=None
):
    """Set up the page number, the page size and the margin metadata.

    Args:
        page_id (str): the page id; "page" and "-div" are removed from it
            and the rest is read as the page number.
        page_style (str): the styles as a css string, holding "height" and
            "width".
        metadata_margins (dict): a dict containing margins read
            from the metadata file.
        metadata_inner_margins (dict): a dict containing inner_margins
            read from the metadata file.
    """
    number_text = page_id.replace("page", "").replace("-div", "")
    self.page_number = int(number_text)

    dimensions = styles(page_style)
    self.page_height = int(dimensions.get("height"))
    self.page_width = int(dimensions.get("width"))

    # Falsy arguments (None, empty dict) are replaced by a fresh empty dict.
    self.metadata_margins = metadata_margins if metadata_margins else {}
    self.metadata_inner_margins = (
        metadata_inner_margins if metadata_inner_margins else {}
    )

compute_inner_margin(margin)

Compute a margin in pixels.

Parameters:

Name Type Description Default
margin str

the name of the margin

required

Returns:

Type Description
int

an int telling where the margin is on the page.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
def compute_inner_margin(self, margin):
    """Turn the coefficient of an inner margin into a position on the page.

    Args:
        margin (str): the name of the margin

    Returns:
        (int): an int telling where the margin is on the page. None when
            margin is not one of the four inner margin names.
    """
    percent = self.get_inner_coefficient(margin)

    if margin in ("inner_left_margin", "inner_right_margin"):
        extent = self.page_width
    elif margin in ("inner_top_margin", "inner_bottom_margin"):
        extent = self.page_height
    else:
        return None

    offset = percent * extent / 100.0
    if margin in ("inner_left_margin", "inner_top_margin"):
        return int(offset)

    # Right and bottom margins are measured from the far edge of the page.
    return int(extent - offset)

compute_inner_margins()

Compute inner margins of the document.

Returns:

Type Description
dict

A dict where the key is the name of the margin and the value is an integer indicating where the margin is on the page.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
def compute_inner_margins(self):
    """Collect the positions of the four inner margins.

    Returns:
        (dict): A dict where the key is the name of the margin (without the
            "inner_" prefix) and the value is an integer indicating where
            the margin is on the page. The dict is empty when the inner
            margins coincide with the edges of the page.
    """
    positions = {}
    for name in (
        "inner_right_margin",
        "inner_left_margin",
        "inner_top_margin",
        "inner_bottom_margin",
    ):
        positions[name.replace("inner_", "")] = self.compute_inner_margin(name)

    covers_whole_page = (
        positions["bottom_margin"] == self.page_height
        and positions["top_margin"] == 0
        and positions["left_margin"] == 0
        and positions["right_margin"] == self.page_width
    )

    return {} if covers_whole_page else positions

compute_margin(margin)

Compute a margin in pixels.

Parameters:

Name Type Description Default
margin str

the name of the margin

required

Returns:

Type Description
int

an int telling where the margin is on the page.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def compute_margin(self, margin):
    """Turn the coefficient of a margin into a position on the page.

    Args:
        margin (str): the name of the margin

    Returns:
        (int): an int telling where the margin is on the page. None when
            margin is not one of the four margin names.
    """
    percent = self.get_coefficient(margin)

    if margin in ("left_margin", "right_margin"):
        extent = self.page_width
    elif margin in ("top_margin", "bottom_margin"):
        extent = self.page_height
    else:
        return None

    offset = percent * extent / 100.0
    if margin in ("left_margin", "top_margin"):
        return int(offset)

    # Right and bottom margins are measured from the far edge of the page.
    return int(extent - offset)

compute_margins()

Compute the margins of a page in pixels.

Returns:

Type Description
dict

a dict containing the four margins in pixels

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
193
194
195
196
197
198
199
200
201
202
203
204
def compute_margins(self):
    """Collect the positions of the four page margins.

    Returns:
        (dict): a dict containing the four margins in pixels, keyed by
            "right_margin", "left_margin", "top_margin" and "bottom_margin".
    """
    positions = {}
    for side in ("right_margin", "left_margin", "top_margin", "bottom_margin"):
        positions[side] = self.compute_margin(side)

    return positions

get_coefficient(margin)

Get the width of the margin in percent.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
def get_coefficient(self, margin):
    """Get the width of the margin in percent.

    The entry for margin in self.metadata_margins is searched for, in this
    order: the page number as a string, "all", and then "even" or "odd"
    depending on the parity of the page number.

    Args:
        margin (str): the name of the margin

    Returns:
        the first value found that is not None, or 7 when the margin has no
        usable entry.
    """
    default = 7

    # A direct dict lookup replaces the membership test against a list of
    # keys, and a missing or empty entry falls back to the default.
    margin_data = self.metadata_margins.get(margin)
    if not margin_data:
        return default

    parity = "even" if self.page_number % 2 == 0 else "odd"
    for key in (str(self.page_number), "all", parity):
        coefficient = margin_data.get(key)
        if coefficient is not None:
            return coefficient

    return default

get_inner_coefficient(margin)

Get the width of the margin in percent.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
def get_inner_coefficient(self, margin):
    """Get the width of the inner margin in percent.

    The entry for margin in self.metadata_inner_margins is searched for, in
    this order: the page number as a string, "all", and then "even" or
    "odd" depending on the parity of the page number.

    Args:
        margin (str): the name of the inner margin

    Returns:
        the first value found that is not None, or 0 when the margin has no
        usable entry.
    """
    default = 0

    # A direct dict lookup replaces the membership test against a list of
    # keys, and a missing or empty entry falls back to the default.
    margin_data = self.metadata_inner_margins.get(margin)
    if not margin_data:
        return default

    parity = "even" if self.page_number % 2 == 0 else "odd"
    for key in (str(self.page_number), "all", parity):
        coefficient = margin_data.get(key)
        if coefficient is not None:
            return coefficient

    return default

handle_br(previous, current)

Handle br tags in p elements.

Parameters:

Name Type Description Default
previous str

the previous string in front of a particular br tag

required
current str

the current string following a particular br tag

required

Returns:

Type Description
str

A possibly modified version of previous

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def handle_br(previous, current):
    """Decide how the text in front of a br tag is joined to what follows.

    Args:
        previous (str): the previous string in front of a particular br tag
        current (str):  the current string following a particular br tag

    Returns:
        (str): previous without its last character when the line break
            probably splits a hyphenated word, previous unchanged when it
            ends with some other hyphen, otherwise previous followed by a
            space.
    """
    if is_probably_hyphenated(previous, current):
        # Hyphenation hyphen: drop it so the two word halves are joined.
        joined = previous[:-1]
    elif previous[-1:] == "-":
        # A real hyphen: keep it and add no space.
        joined = previous
    else:
        # The br tag stands in for a space.
        joined = f"{previous} "

    return joined

is_probably_hyphenated(previous, current)

Find out if previous is part of a hyphenated word.

Parameters:

Name Type Description Default
previous str

the previous string in front of a particular br tag

required
current str

the current string following a particular br tag

required

Returns:

Type Description
bool

True if previous is part of a hyphenated word, False otherwise

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def is_probably_hyphenated(previous, current):
    """Find out if previous is part of a hyphenated word.

    The last two characters of previous are matched against
    LETTER_HYPHEN_AT_END and the first two characters of current against
    LETTER_AT_START (both are patterns defined at module level). In
    addition the first character of current must be unchanged by lower().

    Args:
        previous (str): the previous string in front of a particular br tag
        current (str):  the current string following a particular br tag

    Returns:
        (bool): True if previous is part of a hyphenated word, False otherwise
    """
    # bool() makes the result match the documented return type rather than
    # leaking a match object or None. current[:1] cannot raise on an empty
    # string the way current[0] would.
    return bool(
        LETTER_HYPHEN_AT_END.match(previous[-2:])
        and LETTER_AT_START.match(current[:2])
        and current[:1] == current[:1].lower()
    )

merge(first, second)

Merge two paragraph elements into one.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
def merge(first, second):
    """Fold the content of the paragraph second into the paragraph first.

    The text of second is appended after the current content of first: to
    the tail of the last child of first when first has children, otherwise
    to the text of first. The children of second are then appended to first.

    Args:
        first: the element that receives the content.
        second: the element whose text and children are taken.

    Returns:
        first, after it has been modified in place.
    """
    leading = second.text
    if leading:
        if len(first):
            last_child = first[-1]
            last_child.tail = (
                f"{last_child.tail}{leading}" if last_child.tail else leading
            )
        else:
            first.text = f"{first.text}{leading}" if first.text else leading

    # Work on a snapshot of the children, as they are handed over to first.
    for child in list(second):
        first.append(child)

    return first

styles(page_style)

Turn inline css styles into a dict.

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
32
33
34
35
36
37
38
39
40
def styles(page_style):
    """Turn inline css styles into a dict.

    Args:
        page_style (str): inline css declarations separated by ";", for
            example "top:10px;left:20px".

    Returns:
        (dict): property name -> value. Names and values are stripped of
            surrounding whitespace and "px" is removed from the values.
            Fragments without a ":" are ignored.
    """
    parsed = {}
    for declaration in page_style.split(";"):
        # Split on the first colon only, so values that themselves contain
        # a colon are kept whole.
        name, separator, value = declaration.partition(":")
        if not separator:
            # Empty or whitespace-only fragment, e.g. after a trailing ";".
            continue
        parsed[name.strip()] = value.replace("px", "").strip()

    return parsed

to_html_elt(path)

Convert a pdf document to the Giella xml format.

Parameters:

Name Type Description Default
path str

path to the document

required

Returns:

Type Description
lxml.etree.Element

the root element of the Giella xml document

Source code in /home/anders/projects/CorpusTools/corpustools/pdfconverter.py
687
688
689
690
691
692
693
694
695
696
697
def to_html_elt(path):
    """Run a pdf document through PDF2XMLConverter.

    Args:
        path (str): path to the document

    Returns:
        (lxml.etree.Element): the result of the converter's
            convert2intermediate method.
    """
    return PDF2XMLConverter(path).convert2intermediate()