Skip to content

test_plaintextconverter

Test conversion of plaintext files.

TestPlaintextConverter

Bases: xmltester.XMLTester

Test the PlaintextConverter.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_plaintextconverter.py
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
class TestPlaintextConverter(xmltester.XMLTester):
    """Test the PlaintextConverter.

    The tests cover decoding of a fixture file to unicode, replacement of
    markup-like character sequences, and conversion of plain text content
    to the XML document structure (paragraph splitting, preservation of
    <hyph/> and the skip_lines metadata variable).
    """

    def test_to_unicode(self):
        """Check that winsami2 is converted to unicode.

        The output of to_unicode() for the winsami2 fixture is compared with
        the content of the UTF-8 encoded reference file.
        """
        plaintext = plaintextconverter.PlaintextConverter(
            os.path.join(
                HERE,
                "converter_data/fakecorpus/orig/sme/riddu/winsami2-test-ws2.txt",
            )
        )
        got = plaintext.to_unicode()

        # Decode the reference explicitly as UTF-8 so that want is unicode.
        # The context manager closes the file even if reading raises.
        with codecs.open(
            os.path.join(HERE, "converter_data/winsami2-test-utf8.txt"), encoding="utf8"
        ) as file_:
            want = file_.read()

        self.assertEqual(got, want)

    def test_strip_chars1(self):
        """Check that weird chars are converted as expected.

        A carriage return and several tag-like markers are fed to
        strip_chars(); the expected result consists of newlines only.
        """
        plaintext = plaintextconverter.PlaintextConverter("orig/sme/riddu/tullball.txt")
        got = plaintext.strip_chars(
            "\x0d\n" "<ASCII-MAC>\n" "<vsn:3.000000>\n" "<\\!q>\n" "<\\!h>\n"
        )
        want = """\n\n\n\n\n\n"""

        self.assertEqual(got, want)

    def test_strip_chars2(self):
        """Check that special chars are converted as expected.

        Each <0xNNNN> marker in the input is expected to come out as a
        single character.
        """
        plaintext = plaintextconverter.PlaintextConverter("orig/sme/riddu/tullball.txt")
        got = plaintext.strip_chars(
            "<0x010C><0x010D><0x0110><0x0111><0x014A><0x014B><0x0160><0x0161>"
            "<0x0166><0x0167><0x017D><0x017E><0x2003>"
        )
        want = """ČčĐđŊŋŠšŦŧŽž """

        self.assertEqual(got, want)

    def test_plaintext(self):
        """Check that an empty line signals a new paragraph."""
        plaintext = plaintextconverter.PlaintextConverter("orig/sme/riddu/tullball.txt")
        got = plaintext.content2xml(
            io.StringIO(
                """Sámegiella.

Buot leat."""
            )
        )

        want = etree.fromstring(
            r"""<document>
    <header/>
    <body>
        <p>
            Sámegiella.
        </p>
        <p>
           Buot leat.
       </p>
    </body>
</document>"""
        )

        self.assertXmlEqual(got, want)

    def test_two_lines(self):
        """Test that two consecutive lines are treated as a paragraph."""
        newstext = plaintextconverter.PlaintextConverter("orig/sme/admin/tullball.txt")
        got = newstext.content2xml(
            io.StringIO(
                """Guovssahasa nieida.
Filbma lea.
"""
            )
        )
        want = etree.fromstring(
            """<document>
    <header/>
    <body>
        <p>Guovssahasa nieida.
Filbma lea.</p>
    </body>
</document>
"""
        )

        self.assertXmlEqual(got, want)

    def test_hyph(self):
        """Check that hyph is conserved.

        A <hyph/> element inside the text must show up unchanged inside the
        resulting paragraph.
        """
        newstext = plaintextconverter.PlaintextConverter("orig/sme/riddu/tullball.txt")
        got = newstext.content2xml(io.StringIO("Guovssa<hyph/>hasa"))
        want = etree.fromstring(
            """
            <document>
            <header/>
            <body>
                <p>Guovssa<hyph/>hasa</p>
            </body>
            </document> """
        )

        self.assertXmlEqual(got, want)

    def test_skip_lines(self):
        """Check that lines are skipped.

        With the skip_lines metadata variable set to "4-6", the paragraphs
        "b" and "c" of the input are expected to be absent from the output.
        """
        content = """
a

b

c

d

e
"""
        want_string = """
<document>
    <header/>
    <body>
        <p>a</p>
        <p>d</p>
        <p>e</p>
    </body>
</document>
"""
        text = plaintextconverter.PlaintextConverter("orig/sme/riddu/tullball.txt")
        text.metadata.set_variable("skip_lines", "4-6")
        got = text.content2xml(io.StringIO(content))
        want = etree.fromstring(want_string)

        self.assertXmlEqual(got, want)

test_hyph()

Check that hyph is conserved.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_plaintextconverter.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def test_hyph(self):
    """A <hyph/> element in the input must survive the conversion.

    The converted document is expected to hold one paragraph in which the
    <hyph/> element sits between the two halves of the word.
    """
    expected_xml = """
        <document>
        <header/>
        <body>
            <p>Guovssa<hyph/>hasa</p>
        </body>
        </document> """
    converter = plaintextconverter.PlaintextConverter("orig/sme/riddu/tullball.txt")

    source = io.StringIO("Guovssa<hyph/>hasa")
    got = converter.content2xml(source)
    want = etree.fromstring(expected_xml)

    self.assertXmlEqual(got, want)

test_plaintext()

Check that an empty line signals a new paragraph.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_plaintextconverter.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
    def test_plaintext(self):
        """Check that an empty line signal paragraph."""
        plaintext = plaintextconverter.PlaintextConverter("orig/sme/riddu/tullball.txt")
        got = plaintext.content2xml(
            io.StringIO(
                """Sámegiella.

Buot leat."""
            )
        )

        want = etree.fromstring(
            r"""<document>
    <header/>
    <body>
        <p>
            Sámegiella.
        </p>
        <p>
           Buot leat.
       </p>
    </body>
</document>"""
        )

        self.assertXmlEqual(got, want)

test_skip_lines()

Check that lines are skipped.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_plaintextconverter.py
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
    def test_skip_lines(self):
        """Check that lines are skipped."""
        content = """
a

b

c

d

e
"""
        want_string = """
<document>
    <header/>
    <body>
        <p>a</p>
        <p>d</p>
        <p>e</p>
    </body>
</document>
"""
        text = plaintextconverter.PlaintextConverter("orig/sme/riddu/tullball.txt")
        text.metadata.set_variable("skip_lines", "4-6")
        got = text.content2xml(io.StringIO(content))
        want = etree.fromstring(want_string)

        self.assertXmlEqual(got, want)

test_strip_chars1()

Check that weird chars are converted as expected.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_plaintextconverter.py
55
56
57
58
59
60
61
62
63
def test_strip_chars1(self):
    """A carriage return and tag-like markers are reduced to newlines.

    The expected result of strip_chars() consists of six newline
    characters and nothing else.
    """
    converter = plaintextconverter.PlaintextConverter("orig/sme/riddu/tullball.txt")
    raw_lines = (
        "\x0d\n",
        "<ASCII-MAC>\n",
        "<vsn:3.000000>\n",
        "<\\!q>\n",
        "<\\!h>\n",
    )

    got = converter.strip_chars("".join(raw_lines))

    self.assertEqual(got, "\n\n\n\n\n\n")

test_strip_chars2()

Check that special chars are converted as expected.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_plaintextconverter.py
65
66
67
68
69
70
71
72
73
74
def test_strip_chars2(self):
    """Every <0xNNNN> marker is replaced by a single character.

    The markers are passed as one string and the result is compared with
    the expected sequence of characters.
    """
    markers = (
        "<0x010C><0x010D><0x0110><0x0111><0x014A><0x014B><0x0160><0x0161>"
        "<0x0166><0x0167><0x017D><0x017E><0x2003>"
    )
    expected = """ČčĐđŊŋŠšŦŧŽž """
    converter = plaintextconverter.PlaintextConverter("orig/sme/riddu/tullball.txt")

    self.assertEqual(converter.strip_chars(markers), expected)

test_to_unicode()

Check that winsami2 is converted to unicode.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_plaintextconverter.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
def test_to_unicode(self):
    """Check that winsami2 is converted to unicode.

    The output of to_unicode() for the winsami2 fixture is compared with
    the content of the UTF-8 encoded reference file.
    """
    plaintext = plaintextconverter.PlaintextConverter(
        os.path.join(
            HERE,
            "converter_data/fakecorpus/orig/sme/riddu/winsami2-test-ws2.txt",
        )
    )
    got = plaintext.to_unicode()

    # Decode the reference explicitly as UTF-8 so that want is unicode.
    # The context manager closes the file even if reading raises.
    with codecs.open(
        os.path.join(HERE, "converter_data/winsami2-test-utf8.txt"), encoding="utf8"
    ) as file_:
        want = file_.read()

    self.assertEqual(got, want)

test_two_lines()

Test that two consecutive lines are treated as a paragraph.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_plaintextconverter.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
    def test_two_lines(self):
        """Test that two consecutive lines are treated as a paragraph."""
        newstext = plaintextconverter.PlaintextConverter("orig/sme/admin/tullball.txt")
        got = newstext.content2xml(
            io.StringIO(
                """Guovssahasa nieida.
Filbma lea.
"""
            )
        )
        want = etree.fromstring(
            """<document>
    <header/>
    <body>
        <p>Guovssahasa nieida.
Filbma lea.</p>
    </body>
</document>
"""
        )

        self.assertXmlEqual(got, want)