Skip to content

test_epubconverter

Test conversion of epub files.

TestEpubConverter

Bases: XMLTester

Test the epub converter.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_epubconverter.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
class TestEpubConverter(XMLTester):
    """Check the intermediate XML produced from the test.epub fixture."""

    def setUp(self):
        """Point self.testdoc at the test.epub fixture below HERE."""
        fixture = "converter_data/fakecorpus/orig/sme/riddu/test.epub"
        self.testdoc = os.path.join(HERE, fixture)

    def test_convert2intermediate_1(self):
        """Convert the fixture directly, with no skip_elements set."""
        expected = """
            <document>
                <body>
                    <p type="title">1 Bajilčála</p>
                    <p>1asdf</p>
                    <p type="title">1.1 Bajilčála</p>
                    <p>2asdf</p>
                    <p type="title">1.1.1 Bajilčála</p>
                    <p>3asdf</p>
                    <p type="title">2 Bajilčála</p>
                    <p>4asdf</p>
                    <p type="title">2.1 Bajilčála</p>
                    <p>5asdf</p>
                    <p type="title">2.1.1 Bajilčála</p>
                    <p>6asdf</p>
                    <p type="title">3.1 Bajilčála</p>
                    <p>7asdf</p>
                    <p type="title">3.1.1 Bajilčála</p>
                    <p>8asdf</p>
                </body>
            </document>
        """
        result = htmlcontentconverter.convert2intermediate(self.testdoc)

        self.assertXmlEqual(result, etree.fromstring(expected))

    def test_convert2intermediate_2(self):
        """Convert with a skip_elements range given by two xpaths.

        Only the first and the last title/paragraph pairs are expected
        to remain in the converted document.
        """
        # Start and end xpath of the range, separated by ";".
        skip_range = (
            ".//html:body/html:div[1]/html:h2[1];"
            ".//html:body/html:div[3]/html:div[1]/html:h3[1]"
        )
        with TempDirectory() as tmpdir:
            epub_copy = set_data(tmpdir, self.testdoc, skip_range)
            expected = """
                <document>
                    <body>
                        <p type="title">1 Bajilčála</p>
                        <p>1asdf</p>
                        <p type="title">3.1.1 Bajilčála</p>
                        <p>8asdf</p>
                    </body>
                </document>
            """
            result = htmlcontentconverter.convert2intermediate(epub_copy)

            self.assertXmlEqual(result, etree.fromstring(expected))

    def test_convert2intermediate_3(self):
        """Convert with skip_elements holding only the first xpath.

        The part after ";" is empty; compared to the unfiltered result
        the expected document lacks only the "1.1 Bajilčála" title.
        """
        skip_range = ".//html:body/html:div[1]/html:h2[1];"
        with TempDirectory() as tmpdir:
            epub_copy = set_data(tmpdir, self.testdoc, skip_range)
            expected = """
                <document>
                    <body>
                        <p type="title">1 Bajilčála</p>
                        <p>1asdf</p>
                        <p>2asdf</p>
                        <p type="title">1.1.1 Bajilčála</p>
                        <p>3asdf</p>
                        <p type="title">2 Bajilčála</p>
                        <p>4asdf</p>
                        <p type="title">2.1 Bajilčála</p>
                        <p>5asdf</p>
                        <p type="title">2.1.1 Bajilčála</p>
                        <p>6asdf</p>
                        <p type="title">3.1 Bajilčála</p>
                        <p>7asdf</p>
                        <p type="title">3.1.1 Bajilčála</p>
                        <p>8asdf</p>
                    </body>
                </document>
            """
            result = htmlcontentconverter.convert2intermediate(epub_copy)

            self.assertXmlEqual(result, etree.fromstring(expected))

setUp()

Setup epub content.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_epubconverter.py
56
57
58
59
60
def setUp(self):
    """Point self.testdoc at the test.epub fixture below HERE."""
    fixture = "converter_data/fakecorpus/orig/sme/riddu/test.epub"
    self.testdoc = os.path.join(HERE, fixture)

test_convert2intermediate_1()

Test without skip_elements.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_epubconverter.py
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def test_convert2intermediate_1(self):
    """Convert the fixture directly, with no skip_elements set."""
    expected = """
        <document>
            <body>
                <p type="title">1 Bajilčála</p>
                <p>1asdf</p>
                <p type="title">1.1 Bajilčála</p>
                <p>2asdf</p>
                <p type="title">1.1.1 Bajilčála</p>
                <p>3asdf</p>
                <p type="title">2 Bajilčála</p>
                <p>4asdf</p>
                <p type="title">2.1 Bajilčála</p>
                <p>5asdf</p>
                <p type="title">2.1.1 Bajilčála</p>
                <p>6asdf</p>
                <p type="title">3.1 Bajilčála</p>
                <p>7asdf</p>
                <p type="title">3.1.1 Bajilčála</p>
                <p>8asdf</p>
            </body>
        </document>
    """
    result = htmlcontentconverter.convert2intermediate(self.testdoc)

    self.assertXmlEqual(result, etree.fromstring(expected))

test_convert2intermediate_2()

Test with skip_elements.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_epubconverter.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
def test_convert2intermediate_2(self):
    """Convert with a skip_elements range given by two xpaths.

    Only the first and the last title/paragraph pairs are expected
    to remain in the converted document.
    """
    # Start and end xpath of the range, separated by ";".
    skip_range = (
        ".//html:body/html:div[1]/html:h2[1];"
        ".//html:body/html:div[3]/html:div[1]/html:h3[1]"
    )
    with TempDirectory() as tmpdir:
        epub_copy = set_data(tmpdir, self.testdoc, skip_range)
        expected = """
            <document>
                <body>
                    <p type="title">1 Bajilčála</p>
                    <p>1asdf</p>
                    <p type="title">3.1.1 Bajilčála</p>
                    <p>8asdf</p>
                </body>
            </document>
        """
        result = htmlcontentconverter.convert2intermediate(epub_copy)

        self.assertXmlEqual(result, etree.fromstring(expected))

test_convert2intermediate_3()

Test with skip_elements that only has first path defined.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_epubconverter.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def test_convert2intermediate_3(self):
    """Convert with skip_elements holding only the first xpath.

    The part after ";" is empty; compared to the unfiltered result
    the expected document lacks only the "1.1 Bajilčála" title.
    """
    skip_range = ".//html:body/html:div[1]/html:h2[1];"
    with TempDirectory() as tmpdir:
        epub_copy = set_data(tmpdir, self.testdoc, skip_range)
        expected = """
            <document>
                <body>
                    <p type="title">1 Bajilčála</p>
                    <p>1asdf</p>
                    <p>2asdf</p>
                    <p type="title">1.1.1 Bajilčála</p>
                    <p>3asdf</p>
                    <p type="title">2 Bajilčála</p>
                    <p>4asdf</p>
                    <p type="title">2.1 Bajilčála</p>
                    <p>5asdf</p>
                    <p type="title">2.1.1 Bajilčála</p>
                    <p>6asdf</p>
                    <p type="title">3.1 Bajilčála</p>
                    <p>7asdf</p>
                    <p type="title">3.1.1 Bajilčála</p>
                    <p>8asdf</p>
                </body>
            </document>
        """
        result = htmlcontentconverter.convert2intermediate(epub_copy)

        self.assertXmlEqual(result, etree.fromstring(expected))

TestEpubConverter1

Bases: XMLTester

Test the epub converter.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_epubconverter.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
class TestEpubConverter1(XMLTester):
    """Test skip_elements ranges when converting the test2.epub fixture."""

    def setUp(self):
        """Point self.testdoc at the test2.epub fixture below HERE."""
        self.testdoc = os.path.join(
            HERE, "converter_data/fakecorpus/orig/sme/riddu/test2.epub"
        )

    def test_convert2intermediate(self):
        """Range of same depth with the same name in the next to last level.

        The two xpaths end in different div[N]/div[1] branches of body.
        """
        with TempDirectory() as directory:
            temp_epub = set_data(
                directory,
                self.testdoc,
                ".//body/div[1]/div[1]/p[1];.//body/div[2]/div[1]/p[4]",
            )
            got = htmlcontentconverter.convert2intermediate(temp_epub)
            want = """
                <document>
                    <body>
                        <p>igjen går hesten</p>
                        <p>baklengs inni framtida</p>
                    </body>
                </document>
            """

            self.assertXmlEqual(got, etree.fromstring(want))

    def test_convert2intermediate1(self):
        """Range with same parents.

        Both xpaths point into the same div[2]/div[1] element.
        """
        with TempDirectory() as directory:
            temp_epub = set_data(
                directory,
                self.testdoc,
                ".//body/div[2]/div[1]/p[1];.//body/div[2]/div[1]/p[4]",
            )
            got = htmlcontentconverter.convert2intermediate(temp_epub)
            want = """
                <document>
                    <body>
                        <p>alle gir gass</p>
                        <p>men ikke</p>
                        <p>alle</p>
                        <p>har tass</p>
                        <p>igjen går hesten</p>
                        <p>baklengs inni framtida</p>
                    </body>
                </document>
            """

            self.assertXmlEqual(got, etree.fromstring(want))

    def test_convert2intermediate_invalid_skipelements(self):
        """Test that invalid skip_elements makes the conversion fail.

        The conversion is expected to raise util.ConversionError.
        """
        with TempDirectory() as directory:
            # NOTE(review): div[15] in the second xpath presumably does not
            # exist in test2.epub, which is what makes the range invalid
            # -- confirm against the fixture.
            temp_epub = set_data(
                directory,
                self.testdoc,
                ".//body/div[1]/div[1]/p[1];.//body/div[2]/div[15]/p[4]",
            )

            with self.assertRaises(util.ConversionError):
                htmlcontentconverter.convert2intermediate(temp_epub)

setUp()

Setup epub content.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_epubconverter.py
148
149
150
151
152
def setUp(self):
    """Point self.testdoc at the test2.epub fixture below HERE."""
    fixture = "converter_data/fakecorpus/orig/sme/riddu/test2.epub"
    self.testdoc = os.path.join(HERE, fixture)

test_convert2intermediate()

Range of same depth with the same name in the next to last level.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_epubconverter.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def test_convert2intermediate(self):
    """Skip a range whose xpaths have equal depth but different parents.

    Both xpaths pass through a div[1] in the next to last level, under
    different div[N] children of body.
    """
    skip_range = ".//body/div[1]/div[1]/p[1];.//body/div[2]/div[1]/p[4]"
    with TempDirectory() as tmpdir:
        epub_copy = set_data(tmpdir, self.testdoc, skip_range)
        expected = """
            <document>
                <body>
                    <p>igjen går hesten</p>
                    <p>baklengs inni framtida</p>
                </body>
            </document>
        """
        result = htmlcontentconverter.convert2intermediate(epub_copy)

        self.assertXmlEqual(result, etree.fromstring(expected))

test_convert2intermediate1()

Range with same parents.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_epubconverter.py
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
def test_convert2intermediate1(self):
    """Skip a range whose two xpaths share the same parent element.

    Both xpaths point into div[2]/div[1] of body.
    """
    skip_range = ".//body/div[2]/div[1]/p[1];.//body/div[2]/div[1]/p[4]"
    with TempDirectory() as tmpdir:
        epub_copy = set_data(tmpdir, self.testdoc, skip_range)
        expected = """
            <document>
                <body>
                    <p>alle gir gass</p>
                    <p>men ikke</p>
                    <p>alle</p>
                    <p>har tass</p>
                    <p>igjen går hesten</p>
                    <p>baklengs inni framtida</p>
                </body>
            </document>
        """
        result = htmlcontentconverter.convert2intermediate(epub_copy)

        self.assertXmlEqual(result, etree.fromstring(expected))

test_convert2intermediate_invalid_skipelements()

Test that invalid skip_elements makes the conversion raise a ConversionError.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_epubconverter.py
198
199
200
201
202
203
204
205
206
207
208
209
210
211
def test_convert2intermediate_invalid_skipelements(self):
    """Test that invalid skip_elements makes the conversion fail.

    The conversion is expected to raise util.ConversionError.
    """
    with TempDirectory() as directory:
        # NOTE(review): div[15] in the second xpath presumably does not
        # exist in test2.epub, which is what makes the range invalid
        # -- confirm against the fixture.
        temp_epub = set_data(
            directory,
            self.testdoc,
            ".//body/div[1]/div[1]/p[1];.//body/div[2]/div[15]/p[4]",
        )

        with self.assertRaises(util.ConversionError):
            htmlcontentconverter.convert2intermediate(temp_epub)

set_data(directory, testdoc, skip_elements)

Set needed testdata.

Parameters:

Name Type Description Default
directory testfixtures.TempDirectory

path to the directory

required
testdoc str

path to the test document

required
skip_elements str

the range of elements to skip

required

Returns:

Type Description
str

path to the test document in the temporary test directory

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_epubconverter.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def set_data(directory, testdoc, skip_elements):
    """Copy a test document into a temp directory and set skip_elements.

    A metadata file named after the copy, with ".xsl" appended, is
    created and its "skip_elements" variable is set before it is written.

    Args:
        directory (testfixtures.TempDirectory): the temporary directory;
            its ``path`` attribute is where the copy is placed
        testdoc (str): path to the test document
        skip_elements (str): the range of elements to skip

    Returns:
        (str): path to the test document in the temporary test directory
    """
    epub_copy = os.path.join(directory.path, os.path.basename(testdoc))
    copyfile(testdoc, epub_copy)

    xsl_path = epub_copy + ".xsl"
    handler = xslsetter.MetadataHandler(xsl_path, create=True)
    handler.set_variable("skip_elements", skip_elements)
    handler.write_file()

    return epub_copy