Skip to content

plaintextconverter

Convert plaintext files to the Giella xml format.

PlaintextConverter

Bases: basicconverter.BasicConverter

Convert plain text files to the Giella xml format.

Source code in /home/anders/projects/CorpusTools/corpustools/plaintextconverter.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
class PlaintextConverter(basicconverter.BasicConverter):
    """Convert plain text files to the Giella xml format."""

    def to_unicode(self):
        """Read the original file into a cleaned-up unicode string.

        The file named by ``self.orig`` is first decoded as utf-8. If that
        fails, the file is re-read as latin1, which accepts any byte
        sequence. The real encoding will be detected later.

        Windows line endings are normalised to ``\\n`` and the result is
        passed through :meth:`strip_chars`.

        Returns:
            (str): The decoded string
        """
        # UnicodeDecodeError is a subclass of ValueError, so a failed utf-8
        # decode lands in the except branch. The ``with`` blocks make sure
        # the file handle is closed on both the success and failure paths.
        try:
            with codecs.open(self.orig, encoding="utf8") as orig_file:
                content = orig_file.read()
        except ValueError:
            with codecs.open(self.orig, encoding="latin1") as orig_file:
                content = orig_file.read()

        content = self.strip_chars(content.replace("\r\n", "\n"))

        return content

    @staticmethod
    def strip_chars(content, extra=""):
        """Remove the characters found in plaintext_oddities from content.

        The (unwanted, replacement) pairs in ``plaintext_oddities`` are
        handed to ``util.replace_all``; afterwards the control characters
        U+0000-U+0008, U+000B-U+000C, U+000E-U+001F and U+007F, plus the
        characters in ``extra``, are deleted.

        Args:
            content (str): a string containing the content of a document.
            extra (str): a string containing even more characters to remove
                from content.

        Returns:
            (str): A string containing the content sans unwanted characters.
        """
        plaintext_oddities = [
            ("ÊÊ", "\n"),
            (r"<\!q>", ""),
            (r"<\!h>", ""),
            ("<*B>", ""),
            ("<*P>", ""),
            ("<*I>", ""),
            ("\r", "\n"),
            ("<ASCII-MAC>", ""),
            ("<vsn:3.000000>", ""),
            ("<0x010C>", "Č"),
            ("<0x010D>", "č"),
            ("<0x0110>", "Đ"),
            ("<0x0111>", "đ"),
            ("<0x014A>", "Ŋ"),
            ("<0x014B>", "ŋ"),
            ("<0x0160>", "Š"),
            ("<0x0161>", "š"),
            ("<0x0166>", "Ŧ"),
            ("<0x0167>", "ŧ"),
            ("<0x017D>", "Ž"),
            ("<0x017E>", "ž"),
            ("<0x2003>", " "),
            (
                "========================================================"
                "========================",
                "\n",
            ),
        ]
        content = util.replace_all(plaintext_oddities, content)
        # NOTE(review): ``extra`` is inserted verbatim into a regex character
        # class, so characters such as "]", "\\", "^" or "-" keep their regex
        # meaning -- confirm that callers only pass plain characters.
        remove_re = re.compile(f"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F{extra}]")
        content, _ = remove_re.subn("", content)

        return content

    @staticmethod
    def make_element(element_name, text):
        """Make an xml element.

        Every literal ``<hyph/>`` marker in ``text`` becomes a ``hyph``
        child element; the text following a marker is stored as the tail of
        that child.

        Args:
            element_name (str): Name of the xml element
            text (str): The text the xml should contain

        Returns:
            (lxml.etree.Element): an lxml.etree.Element
        """
        element = etree.Element(element_name)

        hyph_parts = text.split("<hyph/>")
        if len(hyph_parts) > 1:
            element.text = hyph_parts[0]
            for hyph_part in hyph_parts[1:]:
                hyph = etree.Element("hyph")
                hyph.tail = hyph_part
                element.append(hyph)
        else:
            element.text = text

        return element

    def content2xml(self, content):
        """Transform plaintext to an intermediate xml document.

        Consecutive non-blank lines are concatenated into one ``p`` element;
        a blank line ends the current paragraph. Lines whose 1-based number
        is found in ``self.metadata.skip_lines`` are ignored altogether.

        Args:
            content (Iterable[str]): the content of the plaintext document,
                as an iterable yielding one line at a time (e.g. an
                ``io.StringIO``).

        Returns:
            (lxml.etree.Element): A ``document`` element holding an empty
                ``header`` and a ``body`` with the paragraphs.
        """
        document = etree.Element("document")
        header = etree.Element("header")
        body = etree.Element("body")

        ptext = ""

        for line_no, line in enumerate(content, start=1):
            if line_no not in self.metadata.skip_lines:
                if line.strip() == "":
                    if ptext.strip() != "":
                        body.append(self.make_element("p", ptext))
                    ptext = ""
                else:
                    ptext = ptext + line

        # Flush the last paragraph when the input does not end in a blank line.
        if ptext != "":
            body.append(self.make_element("p", ptext))

        document.append(header)
        document.append(body)

        return document

content2xml(content)

Transform plaintext to an intermediate xml document.

Parameters:

Name Type Description Default
content Iterable[str]

the content of the plaintext document, as an iterable of lines (e.g. an open text file or io.StringIO).

required

Returns:

Type Description
lxml.etree.Element

An etree element.

Source code in /home/anders/projects/CorpusTools/corpustools/plaintextconverter.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def content2xml(self, content):
    """Build the intermediate xml document from the lines of a text.

    Runs of non-blank lines are joined into a single ``p`` element; a blank
    line closes the running paragraph. Lines whose 1-based number appears in
    ``self.metadata.skip_lines`` are dropped before any other processing.

    Args:
        content (Iterable[str]): the plaintext document, yielding one line
            per iteration (e.g. an ``io.StringIO``).

    Returns:
        (lxml.etree.Element): A ``document`` element with an empty
            ``header`` followed by the ``body`` holding the paragraphs.
    """
    body = etree.Element("body")
    paragraph = ""

    for number, text_line in enumerate(content, start=1):
        if number in self.metadata.skip_lines:
            continue

        if text_line.strip():
            paragraph += text_line
            continue

        # Blank line: emit whatever has been collected and start afresh.
        if paragraph.strip():
            body.append(self.make_element("p", paragraph))
        paragraph = ""

    # The input may end without a closing blank line.
    if paragraph:
        body.append(self.make_element("p", paragraph))

    root = etree.Element("document")
    root.append(etree.Element("header"))
    root.append(body)

    return root

make_element(element_name, text) staticmethod

Make an xml element.

Parameters:

Name Type Description Default
element_name str

Name of the xml element

required
text str

The text the xml should contain

required

Returns:

Type Description
lxml.etree.Element

an lxml.etree.Element

Source code in /home/anders/projects/CorpusTools/corpustools/plaintextconverter.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
@staticmethod
def make_element(element_name, text):
    """Make an xml element.

    Args:
        element_name (str): Name of the xml element
        text (str): The text the xml should contain

    Returns:
        (lxml.etree.Element): an lxml.etree.Element
    """
    element = etree.Element(element_name)

    hyph_parts = text.split("<hyph/>")
    if len(hyph_parts) > 1:
        element.text = hyph_parts[0]
        for hyph_part in hyph_parts[1:]:
            hyph = etree.Element("hyph")
            hyph.tail = hyph_part
            element.append(hyph)
    else:
        element.text = text

    return element

strip_chars(content, extra='') staticmethod

Remove the characters found in plaintext_oddities from content.

Parameters:

Name Type Description Default
content str

a string containing the content of a document.

required
extra str

a string containing even more characters to remove from content.

''

Returns:

Type Description
str

A string containing the content sans unwanted characters.

Source code in /home/anders/projects/CorpusTools/corpustools/plaintextconverter.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
@staticmethod
def strip_chars(content, extra=""):
    """Clean known plaintext oddities and control characters from content.

    A fixed table of (unwanted, replacement) pairs is handed to
    ``util.replace_all``. After that the control characters
    U+0000-U+0008, U+000B-U+000C, U+000E-U+001F and U+007F, together with
    the characters given in ``extra``, are deleted.

    Args:
        content (str): a string containing the content of a document.
        extra (str): a string containing even more characters to remove
            from content.

    Returns:
        (str): A string containing the content sans unwanted characters.
    """
    replacements = [
        ("ÊÊ", "\n"),
        (r"<\!q>", ""),
        (r"<\!h>", ""),
        ("<*B>", ""),
        ("<*P>", ""),
        ("<*I>", ""),
        ("\r", "\n"),
        ("<ASCII-MAC>", ""),
        ("<vsn:3.000000>", ""),
        ("<0x010C>", "Č"),
        ("<0x010D>", "č"),
        ("<0x0110>", "Đ"),
        ("<0x0111>", "đ"),
        ("<0x014A>", "Ŋ"),
        ("<0x014B>", "ŋ"),
        ("<0x0160>", "Š"),
        ("<0x0161>", "š"),
        ("<0x0166>", "Ŧ"),
        ("<0x0167>", "ŧ"),
        ("<0x017D>", "Ž"),
        ("<0x017E>", "ž"),
        ("<0x2003>", " "),
        (
            "========================================================"
            "========================",
            "\n",
        ),
    ]
    cleaned = util.replace_all(replacements, content)

    # NOTE(review): ``extra`` lands unescaped inside the character class, so
    # regex-special characters in it keep their meaning -- confirm callers
    # only pass plain characters.
    unwanted = f"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F{extra}]"

    return re.sub(unwanted, "", cleaned)

to_unicode()

Read a file into a unicode string.

If the content of the file is not utf-8, pretend the encoding is latin1. The real encoding will be detected later.

Returns:

Type Description
str

The decoded string

Source code in /home/anders/projects/CorpusTools/corpustools/plaintextconverter.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def to_unicode(self):
    """Read the original file into a cleaned-up unicode string.

    The file named by ``self.orig`` is first decoded as utf-8. If that
    fails, the file is re-read as latin1, which accepts any byte sequence.
    The real encoding will be detected later.

    Windows line endings are normalised to ``\\n`` and the result is passed
    through ``self.strip_chars``.

    Returns:
        (str): The decoded string
    """
    # UnicodeDecodeError is a subclass of ValueError, so a failed utf-8
    # decode lands in the except branch. The ``with`` blocks make sure the
    # file handle is closed on both the success and the failure path; the
    # original left that to the garbage collector.
    try:
        with codecs.open(self.orig, encoding="utf8") as orig_file:
            content = orig_file.read()
    except ValueError:
        with codecs.open(self.orig, encoding="latin1") as orig_file:
            content = orig_file.read()

    return self.strip_chars(content.replace("\r\n", "\n"))

convert2intermediate(filename)

Transform plaintext to an intermediate xml document.

Parameters:

Name Type Description Default
filename str

name of the file that should be converted

required

Returns:

Type Description
lxml.etree.Element

An etree element.

Source code in /home/anders/projects/CorpusTools/corpustools/plaintextconverter.py
155
156
157
158
159
160
161
162
163
164
165
166
def convert2intermediate(filename):
    """Convert a plaintext file to an intermediate xml document.

    A ``PlaintextConverter`` is created for ``filename``; its decoded text is
    wrapped in an ``io.StringIO`` so that ``content2xml`` can walk it line by
    line.

    Args:
        filename (str): name of the file that should be converted

    Returns:
        (lxml.etree.Element): An etree element.
    """
    converter = PlaintextConverter(filename)
    decoded = converter.to_unicode()
    line_source = io.StringIO(decoded)

    return converter.content2xml(line_source)