Skip to content

corpusxmlfile

Classes and functions to sentence align two files.

CorpusXMLFile

A class to handle all the info of a corpus xml file.

Source code in /home/anders/projects/CorpusTools/corpustools/corpusxmlfile.py
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
class CorpusXMLFile:
    """A class to handle all the info of a corpus xml file.

    Attributes:
        corpus_path: the value returned by corpuspath.make_corpus_path
            for the name given to the constructor.
        etree: the element tree parsed from the file.
        root: the root element of ``etree``.
    """

    def __init__(self, name):
        """Initialise the CorpusXMLFile class.

        Args:
            name (str): path to the xml file.

        Raises:
            util.ArgumentError: if the root element of the parsed file
                is not ``document`` (raised by sanity_check).
        """
        self.corpus_path = corpuspath.make_corpus_path(name)
        self.etree = etree.parse(name)
        self.root = self.etree.getroot()
        self.sanity_check()

    def sanity_check(self):
        """Check if the file really is a corpus xml file.

        Raises:
            util.ArgumentError: if the root tag is not ``document``.
        """
        if self.root.tag != "document":
            raise util.ArgumentError(
                "Expected Corpus XML file (output of convert2xml) with "
                "<document> as the root tag, got {} -- did you pass the "
                "wrong file?".format(
                    self.root.tag,
                )
            )

    @property
    def lang(self):
        """Get the lang of the file, as reported by corpus_path."""
        return self.corpus_path.lang

    @property
    def word_count(self):
        """Return the word count of the file.

        Returns:
            The text of the first ``wordcount`` element, unconverted.

        Raises:
            AttributeError: if the file has no ``wordcount`` element.
        """
        word_count = self.root.find(".//wordcount")
        if word_count is None:
            raise AttributeError("wordcount not found!")

        return word_count.text

    @property
    def ocr(self):
        """Check if the ocr element exists.

        Returns:
            The first ``ocr`` element, or None if there is none.
        """
        return self.root.find(".//ocr")

    @property
    def translated_from(self):
        """Get the language of the translated_from element.

        Returns:
            The ``xml:lang`` attribute of the first ``translated_from``
            element, or None if the file has no such element.
        """
        translated_from = self.root.find(".//translated_from")
        if translated_from is None:
            return None

        return translated_from.attrib["{http://www.w3.org/XML/1998/namespace}lang"]

    def remove_version(self):
        """Remove the version element, if there is one.

        This is often the only difference between the otherwise
        identical files in converted and prestable/converted.
        """
        version_element = self.root.find(".//version")
        # A file without a version element is already in the wanted
        # state; find() returns None then, so there is nothing to remove.
        if version_element is None:
            return

        version_element.getparent().remove(version_element)

    def remove_skip(self):
        """Remove all skip elements.

        These contain text that is not wanted in e.g. sentence alignment.
        """
        skip_list = self.root.findall(".//skip")

        for skip_element in skip_list:
            skip_element.getparent().remove(skip_element)

    def move_later(self):
        """Move the later elements to the end of the body element."""
        body = self.root.xpath("/document/body")[0]

        # The list is collected before anything is moved, so appending
        # does not disturb the iteration.
        later_list = self.root.xpath(".//later")

        for later_element in later_list:
            body.append(later_element)

    def set_body(self, new_body):
        """Replace the body element with new_body element.

        Nothing happens when the tag of new_body is not ``body``.

        Args:
            new_body: the element that takes the place of the first
                ``body`` element found in the tree.
        """
        if new_body.tag == "body":
            oldbody = self.etree.find(".//body")
            oldbody.getparent().replace(oldbody, new_body)

    def write(self, file_name=None):
        """Write self.etree as pretty printed xml with an xml declaration.

        Args:
            file_name: where to write the tree. When None, corpus_path
                is used as the target.
        """
        if file_name is None:
            # NOTE(review): corpus_path is whatever make_corpus_path
            # returned -- confirm that the tree's write() accepts it.
            file_name = self.corpus_path

        self.etree.write(
            file_name, encoding="utf8", pretty_print=True, xml_declaration=True
        )

lang property

Get the lang of the file.

ocr property

Check if the ocr element exists.

:returns: the ocr element or None

translated_from property

Get the translated_from element from the orig doc.

word_count property

Return the word count of the file.

__init__(name)

Initialise the CorpusXMLFile class.

Parameters:

Name Type Description Default
name str

path to the xml file.

required
Source code in /home/anders/projects/CorpusTools/corpustools/corpusxmlfile.py
30
31
32
33
34
35
36
37
38
39
def __init__(self, name):
    """Set up the object from the corpus xml file at ``name``.

    The file is parsed, its root element is stored and the sanity
    check is run on it.

    Args:
        name (str): path to the xml file.
    """
    self.corpus_path = corpuspath.make_corpus_path(name)
    parsed_tree = etree.parse(name)
    self.etree = parsed_tree
    self.root = parsed_tree.getroot()
    self.sanity_check()

move_later()

Move the later elements to the end of the body element.

Source code in /home/anders/projects/CorpusTools/corpustools/corpusxmlfile.py
101
102
103
104
105
106
107
108
def move_later(self):
    """Re-attach every ``later`` element as the last child of the body."""
    target_body = self.root.xpath("/document/body")[0]

    # xpath() hands back the full list before the loop starts moving
    # elements around.
    for postponed in self.root.xpath(".//later"):
        target_body.append(postponed)

remove_skip()

Remove the skip element.

This contains text that is not wanted in e.g. sentence alignment.

Source code in /home/anders/projects/CorpusTools/corpustools/corpusxmlfile.py
91
92
93
94
95
96
97
98
99
def remove_skip(self):
    """Detach every skip element from its parent.

    Skip elements hold text that is not wanted in e.g. sentence alignment.
    """
    for unwanted in self.root.findall(".//skip"):
        unwanted.getparent().remove(unwanted)

remove_version()

Remove the version element.

This is often the only difference between the otherwise identical files in converted and prestable/converted.

Source code in /home/anders/projects/CorpusTools/corpustools/corpusxmlfile.py
82
83
84
85
86
87
88
89
def remove_version(self):
    """Remove the version element, if there is one.

    This is often the only difference between the otherwise
    identical files in converted and prestable/converted.
    """
    version_element = self.root.find(".//version")
    # A file without a version element is already in the wanted
    # state; find() returns None then, so there is nothing to remove.
    if version_element is None:
        return

    version_element.getparent().remove(version_element)

sanity_check()

Check if the file really is a corpus xml file.

Source code in /home/anders/projects/CorpusTools/corpustools/corpusxmlfile.py
41
42
43
44
45
46
47
48
49
50
def sanity_check(self):
    """Verify that the root element of the file is ``document``.

    Raises:
        util.ArgumentError: if the root element has any other tag.
    """
    found_tag = self.root.tag
    if found_tag == "document":
        return

    message = (
        "Expected Corpus XML file (output of convert2xml) with "
        "<document> as the root tag, got {} -- did you pass the "
        "wrong file?"
    ).format(found_tag)
    raise util.ArgumentError(message)

set_body(new_body)

Replace the body element with new_body element.

Source code in /home/anders/projects/CorpusTools/corpustools/corpusxmlfile.py
110
111
112
113
114
def set_body(self, new_body):
    """Swap the current body element for ``new_body``.

    The call does nothing when the tag of ``new_body`` is not ``body``.
    """
    if new_body.tag != "body":
        return

    current_body = self.etree.find(".//body")
    current_body.getparent().replace(current_body, new_body)

write(file_name=None)

Write self.etree.

Source code in /home/anders/projects/CorpusTools/corpustools/corpusxmlfile.py
116
117
118
119
120
121
122
123
def write(self, file_name=None):
    """Serialise self.etree to ``file_name``.

    When no file name is given, self.corpus_path is used as the target.
    """
    target = self.corpus_path if file_name is None else file_name
    self.etree.write(
        target,
        encoding="utf8",
        pretty_print=True,
        xml_declaration=True,
    )