Skip to content

plaintextconverter

Convert plaintext files to the Giella xml format.

PlaintextConverter

Bases: BasicConverter

Convert plain text files to the Giella xml format.

Source code in corpustools/plaintextconverter.py
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
class PlaintextConverter(basicconverter.BasicConverter):
    """Convert plain text files to the Giella xml format."""

    def to_unicode(self) -> str:
        """Read a file into a unicode string.

        If the content of the file is not utf-8, pretend the encoding is
        latin1. The real encoding will be detected later.

        Windows line endings are normalised to a newline and the result is
        passed through strip_chars before it is returned.

        Returns:
            (str): The decoded string
        """
        path = self.orig.as_posix()
        # newline="" turns off newline translation, so the explicit
        # normalisation below (and in strip_chars) sees the raw line endings.
        # The with-blocks make sure the file handle is closed, also when
        # decoding fails halfway through the file.
        try:
            with open(path, encoding="utf8", newline="") as stream:
                content = stream.read()
        except UnicodeDecodeError:
            # latin1 maps every byte to a character, so this cannot fail on
            # the content itself.
            with open(path, encoding="latin1", newline="") as stream:
                content = stream.read()

        return self.strip_chars(content.replace("\r\n", "\n"))

    @staticmethod
    def strip_chars(content: str, extra: str = "") -> str:
        """Remove the characters found in plaintext_oddities from content.

        Args:
            content: a string containing the content of a document.
            extra: a string containing even more characters to remove
                from content. It is inserted verbatim into a regex
                character class, so characters that are special there
                (such as a closing bracket, a caret, a hyphen or a
                backslash) must be escaped by the caller.

        Returns:
            A string containing the content sans unwanted characters.
        """
        # Pairs of (text to look for, replacement text).
        plaintext_oddities = [
            ("ÊÊ", "\n"),
            (r"<\!q>", ""),
            (r"<\!h>", ""),
            ("<*B>", ""),
            ("<*P>", ""),
            ("<*I>", ""),
            ("\r", "\n"),
            ("<ASCII-MAC>", ""),
            ("<vsn:3.000000>", ""),
            ("<0x010C>", "Č"),
            ("<0x010D>", "č"),
            ("<0x0110>", "Đ"),
            ("<0x0111>", "đ"),
            ("<0x014A>", "Ŋ"),
            ("<0x014B>", "ŋ"),
            ("<0x0160>", "Š"),
            ("<0x0161>", "š"),
            ("<0x0166>", "Ŧ"),
            ("<0x0167>", "ŧ"),
            ("<0x017D>", "Ž"),
            ("<0x017E>", "ž"),
            ("<0x2003>", " "),
            (
                "========================================================"
                "========================",
                "\n",
            ),
        ]
        # NOTE(review): util.replace_all is presumably a sequential
        # search-and-replace over the pairs above; confirm against util.
        content = util.replace_all(plaintext_oddities, content)

        # Drop the ASCII control characters except tab, newline and carriage
        # return, the DEL character, and whatever the caller asked for.
        remove_re = re.compile(f"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F{extra}]")
        return remove_re.sub("", content)

    @staticmethod
    def make_element(element_name: str, text: str) -> etree._Element:
        """Make an xml element.

        Every literal "<hyph/>" marker found in text is turned into a real
        hyph child element; the text between the markers ends up as the
        element text and as the tails of the hyph children.

        Args:
            element_name: Name of the xml element
            text: The text the xml should contain

        Returns:
            an etree element
        """
        element = etree.Element(element_name)

        # str.split always returns at least one part, so the text without
        # any marker is simply the first (and only) part.
        first_part, *hyph_parts = text.split("<hyph/>")
        element.text = first_part
        for hyph_part in hyph_parts:
            hyph = etree.SubElement(element, "hyph")
            hyph.tail = hyph_part

        return element

    def lines2xml(self, content: io.StringIO) -> Iterable[etree._Element]:
        """Transform paragraphs to etree elements.

        Lines are collected into paragraphs, which are separated by one or
        more blank (whitespace only) lines. Blank lines never become part of
        a paragraph, so no empty paragraphs are produced.

        Args:
            content: the content of the plaintext document.

        Yields:
            An etree element.
        """
        # Look the attribute up once instead of once per line.
        skip_lines = self.metadata.skip_lines

        # Line numbers are 1-based.
        # NOTE(review): a line starting with "#" is kept even when its
        # number is listed in skip_lines; confirm that this is intended.
        valid_lines = (
            line
            for line_no, line in enumerate(content, start=1)
            if line_no not in skip_lines or line.startswith("#")
        )

        buffer: list[str] = []
        for line in valid_lines:
            if line.strip():
                buffer.append(line)
            elif buffer:
                # A blank line ends the paragraph collected so far. Blank
                # lines seen while the buffer is empty are dropped, so they
                # cannot leak into the next paragraph.
                yield self.make_element("p", "".join(buffer))
                buffer.clear()

        # The last paragraph need not be followed by a blank line.
        if buffer:
            yield self.make_element("p", "".join(buffer))

    def content2xml(self, content: io.StringIO) -> etree._Element:
        """Transform plaintext to an intermediate xml document.

        The document consists of an empty header element and a body element
        that contains the paragraphs produced by lines2xml.

        Args:
            content: the content of the plaintext document.

        Returns:
            An etree element.
        """
        document = etree.Element("document")
        etree.SubElement(document, "header")
        body = etree.SubElement(document, "body")
        body.extend(self.lines2xml(content))

        return document

content2xml(content)

Transform plaintext to an intermediate xml document.

Parameters:

Name Type Description Default
content StringIO

the content of the plaintext document.

required

Returns:

Type Description
_Element

An etree element.

Source code in corpustools/plaintextconverter.py
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
def content2xml(self, content: io.StringIO) -> etree._Element:
    """Build the intermediate xml document for a plaintext document.

    The result is a document element holding an empty header element
    followed by a body element. The body is filled with the paragraph
    elements that lines2xml produces from content.

    Args:
        content: the content of the plaintext document.

    Returns:
        The document element.
    """
    root = etree.Element("document")
    etree.SubElement(root, "header")
    # The paragraphs are added in the order lines2xml yields them.
    etree.SubElement(root, "body").extend(self.lines2xml(content))

    return root

lines2xml(content)

Transform paragraphs to etree elements.

Parameters:

Name Type Description Default
content StringIO

the content of the plaintext document.

required

Yields:

Type Description
Iterable[_Element]

An etree element.

Source code in corpustools/plaintextconverter.py
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
def lines2xml(self, content: io.StringIO) -> Iterable[etree._Element]:
    """Transform paragraphs to etree elements.

    Lines are collected into paragraphs, which are separated by one or
    more blank (whitespace only) lines. Blank lines never become part of
    a paragraph, so no empty paragraphs are produced.

    Args:
        content: the content of the plaintext document.

    Yields:
        An etree element.
    """
    # Look the attribute up once instead of once per line.
    skip_lines = self.metadata.skip_lines

    # Line numbers are 1-based.
    # NOTE(review): a line starting with "#" is kept even when its number
    # is listed in skip_lines; confirm that this is intended.
    valid_lines = (
        line
        for line_no, line in enumerate(content, start=1)
        if line_no not in skip_lines or line.startswith("#")
    )

    buffer: list[str] = []
    for line in valid_lines:
        if line.strip():
            buffer.append(line)
        elif buffer:
            # A blank line ends the paragraph collected so far. Blank lines
            # seen while the buffer is empty are dropped, so they cannot
            # leak into the next paragraph.
            yield self.make_element("p", "".join(buffer))
            buffer.clear()

    # The last paragraph need not be followed by a blank line.
    if buffer:
        yield self.make_element("p", "".join(buffer))

make_element(element_name, text) staticmethod

Make an xml element.

Parameters:

Name Type Description Default
element_name str

Name of the xml element

required
text str

The text the xml should contain

required

Returns:

Type Description
_Element

an etree element

Source code in corpustools/plaintextconverter.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
@staticmethod
def make_element(element_name: str, text: str) -> etree._Element:
    """Make an xml element.

    Args:
        element_name: Name of the xml element
        text: The text the xml should contain

    Returns:
        an etree element
    """
    element = etree.Element(element_name)

    hyph_parts = text.split("<hyph/>")
    if len(hyph_parts) > 1:
        element.text = hyph_parts[0]
        for hyph_part in hyph_parts[1:]:
            hyph = etree.Element("hyph")
            hyph.tail = hyph_part
            element.append(hyph)
    else:
        element.text = text

    return element

strip_chars(content, extra='') staticmethod

Remove the characters found in plaintext_oddities from content.

Parameters:

Name Type Description Default
content str

a string containing the content of a document.

required
extra

a string containing even more characters to remove from content.

''

Returns:

Type Description
str

A string containing the content sans unwanted characters.

Source code in corpustools/plaintextconverter.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
@staticmethod
def strip_chars(content: str, extra="") -> str:
    """Clean unwanted character sequences and characters out of content.

    First the (search text, replacement) pairs listed below are handed to
    util.replace_all. After that the ASCII control characters except tab,
    newline and carriage return, the DEL character, and the characters in
    extra are removed.

    Args:
        content: a string containing the content of a document.
        extra: a string containing even more characters to remove
            from content. It is inserted verbatim into a regex character
            class.

    Returns:
        A string containing the content sans unwanted characters.
    """
    separator_line = (
        "========================================================"
        "========================"
    )
    replacements = [
        ("ÊÊ", "\n"),
        (r"<\!q>", ""),
        (r"<\!h>", ""),
        ("<*B>", ""),
        ("<*P>", ""),
        ("<*I>", ""),
        ("\r", "\n"),
        ("<ASCII-MAC>", ""),
        ("<vsn:3.000000>", ""),
        ("<0x010C>", "Č"),
        ("<0x010D>", "č"),
        ("<0x0110>", "Đ"),
        ("<0x0111>", "đ"),
        ("<0x014A>", "Ŋ"),
        ("<0x014B>", "ŋ"),
        ("<0x0160>", "Š"),
        ("<0x0161>", "š"),
        ("<0x0166>", "Ŧ"),
        ("<0x0167>", "ŧ"),
        ("<0x017D>", "Ž"),
        ("<0x017E>", "ž"),
        ("<0x2003>", " "),
        (separator_line, "\n"),
    ]
    replaced = util.replace_all(replacements, content)

    unwanted = re.compile(f"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F{extra}]")
    return unwanted.sub("", replaced)

to_unicode()

Read a file into a unicode string.

If the content of the file is not utf-8, pretend the encoding is latin1. The real encoding will be detected later.

Returns:

Type Description
str

The decoded string

Source code in corpustools/plaintextconverter.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def to_unicode(self) -> str:
    """Read a file into a unicode string.

    If the content of the file is not utf-8, pretend the encoding is
    latin1. The real encoding will be detected later.

    Windows line endings are normalised to a newline and the result is
    passed through strip_chars before it is returned.

    Returns:
        (str): The decoded string
    """
    path = self.orig.as_posix()
    # newline="" turns off newline translation, so the explicit
    # normalisation below (and in strip_chars) sees the raw line endings.
    # The with-blocks make sure the file handle is closed, also when
    # decoding fails halfway through the file.
    try:
        with open(path, encoding="utf8", newline="") as stream:
            content = stream.read()
    except UnicodeDecodeError:
        # latin1 maps every byte to a character, so this cannot fail on the
        # content itself.
        with open(path, encoding="latin1", newline="") as stream:
            content = stream.read()

    return self.strip_chars(content.replace("\r\n", "\n"))

convert2intermediate(filename)

Transform plaintext to an intermediate xml document.

Parameters:

Name Type Description Default
filename Path

path of the file that should be converted

required

Returns:

Type Description
_Element

An etree element.

Source code in corpustools/plaintextconverter.py
170
171
172
173
174
175
176
177
178
179
180
181
def convert2intermediate(filename: Path) -> etree._Element:
    """Convert a plaintext file to an intermediate xml document.

    A PlaintextConverter is created for the file. The text it reads with
    to_unicode is wrapped in a StringIO and handed to its content2xml
    method.

    Args:
        filename: path of the file that should be converted

    Returns:
        An etree element.
    """
    plaintext_converter = PlaintextConverter(filename)
    text = plaintext_converter.to_unicode()

    return plaintext_converter.content2xml(io.StringIO(text))