Skip to content

test_converter

Test the Converter class.

TestConverter

Bases: XMLTester

Test the converter class.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_converter.py
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
class TestConverter(XMLTester):
    """Check the file names Converter derives, plus validation and fixing."""

    def setUp(self):
        """Create a converter for an html file inside the fake corpus."""
        orig_path = os.path.join(
            HERE, "converter_data/fakecorpus/orig/nob/admin/samediggi-article-16.html"
        )
        self.converter_inside_orig = converter.Converter(orig_path, True)

    def test_get_orig(self):
        """Expect names.orig to be the path the converter was created with."""
        actual = self.converter_inside_orig.names.orig
        expected = os.path.join(
            HERE, "converter_data/fakecorpus/orig/nob/admin/samediggi-article-16.html"
        )
        self.assertEqual(actual, expected)

    def test_get_xsl(self):
        """Expect names.xsl to be the original path with ``.xsl`` appended."""
        actual = self.converter_inside_orig.names.xsl
        expected = os.path.join(
            HERE,
            "converter_data/fakecorpus/orig/nob/admin/samediggi-"
            "article-16.html.xsl",
        )
        self.assertEqual(actual, expected)

    def test_get_tmpdir(self):
        """Expect tmpdir to be the ``tmp`` directory of the fake corpus."""
        actual = self.converter_inside_orig.tmpdir
        expected = os.path.join(HERE, "converter_data/fakecorpus/tmp")
        self.assertEqual(actual, expected)

    def test_get_corpusdir(self):
        """Expect corpusdir to be the root directory of the fake corpus."""
        # A trailing path separator on corpusdir is tolerated by the test.
        actual = self.converter_inside_orig.corpusdir.rstrip(os.path.sep)
        expected = os.path.join(HERE, "converter_data/fakecorpus")
        self.assertEqual(actual, expected)

    def test_get_converted_name(self):
        """Expect names.converted to live under ``converted`` with ``.xml`` added."""
        actual = self.converter_inside_orig.names.converted
        expected = os.path.join(
            HERE,
            "converter_data/fakecorpus/converted/nob/admin/samediggi-"
            "article-16.html.xml",
        )
        self.assertEqual(actual, expected)

    def test_validate_complete(self):
        """Expect ConversionError when validating a bare ``<document/>``."""
        expected_error = util.ConversionError
        validate = self.converter_inside_orig.validate_complete
        empty_document = etree.fromstring("<document/>")

        with self.assertRaises(expected_error):
            validate(empty_document)

    def test_detect_quote_is_skipped_on_errormarkup_documents(self):
        """Quote detection should not be done in errormarkup documents.

        This test covers the case described in
        http://giellatekno.uit.no/bugzilla/show_bug.cgi?id=2151
        """
        expected_xml = """
            <document xml:lang="smj" id="no_id">
            <header>
                <title/>
                <genre code="ficti"/>
                <year>2011</year>
                <wordcount>15</wordcount>
            </header>
                <body>
                    <p>
                        Lev lähkám Skánen,
                        <errorort correct="Evenskjeran" errorinfo="vowm,á-a">
                            Evenskjerán
                        </errorort>
                        Sáme
                        <errorort correct="gilppusijn" errorinfo="infl">
                            gilppojn
                        </errorort>
                        ja lev aj dán vahko lähkám
                        <errorort
                            correct="&quot;hárjjidallamskåvlån&quot;"
                            errorinfo="conc,rj-rjj;cmp,2-X">
                                hárjidallam-"skåvlån"
                        </errorort>
                        <errorort correct="tjuojggusijn" errorinfo="vowlat,o-u">
                            tjuojggosijn
                        </errorort>.
                    </p>
                </body>
            </document>
        """
        document = etree.fromstring(expected_xml)

        errormarkup_converter = converter.Converter(
            "orig/sme/admin/blogg_5.correct.txt"
        )
        metadata = xslsetter.MetadataHandler(
            errormarkup_converter.names.xsl, create=True
        )
        errormarkup_converter.metadata = metadata
        metadata.set_variable("conversion_status", "correct")

        # The return value of fix_document is ignored; the tree that was
        # passed in must still match a fresh parse of the same XML.
        errormarkup_converter.fix_document(document)

        self.assertXmlEqual(document, etree.fromstring(expected_xml))

test_detect_quote_is_skipped_on_errormarkup_documents()

Quote detection should not be done in errormarkup documents.

This test covers the case described in http://giellatekno.uit.no/bugzilla/show_bug.cgi?id=2151

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_converter.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
def test_detect_quote_is_skipped_on_errormarkup_documents(self):
    """Quote detection should not be done in errormarkup documents.

    This test covers the case described in
    http://giellatekno.uit.no/bugzilla/show_bug.cgi?id=2151
    """
    expected_xml = """
        <document xml:lang="smj" id="no_id">
        <header>
            <title/>
            <genre code="ficti"/>
            <year>2011</year>
            <wordcount>15</wordcount>
        </header>
            <body>
                <p>
                    Lev lähkám Skánen,
                    <errorort correct="Evenskjeran" errorinfo="vowm,á-a">
                        Evenskjerán
                    </errorort>
                    Sáme
                    <errorort correct="gilppusijn" errorinfo="infl">
                        gilppojn
                    </errorort>
                    ja lev aj dán vahko lähkám
                    <errorort
                        correct="&quot;hárjjidallamskåvlån&quot;"
                        errorinfo="conc,rj-rjj;cmp,2-X">
                            hárjidallam-"skåvlån"
                    </errorort>
                    <errorort correct="tjuojggusijn" errorinfo="vowlat,o-u">
                        tjuojggosijn
                    </errorort>.
                </p>
            </body>
        </document>
    """
    document = etree.fromstring(expected_xml)

    errormarkup_converter = converter.Converter("orig/sme/admin/blogg_5.correct.txt")
    metadata = xslsetter.MetadataHandler(errormarkup_converter.names.xsl, create=True)
    errormarkup_converter.metadata = metadata
    metadata.set_variable("conversion_status", "correct")

    # The return value of fix_document is ignored; the tree that was passed
    # in must still match a fresh parse of the same XML.
    errormarkup_converter.fix_document(document)

    self.assertXmlEqual(document, etree.fromstring(expected_xml))

test_get_converted_name()

Get the name of the converted file.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_converter.py
80
81
82
83
84
85
86
87
88
89
def test_get_converted_name(self):
    """Expect names.converted to be the ``.xml`` path under ``converted``."""
    actual = self.converter_inside_orig.names.converted
    expected = os.path.join(
        HERE,
        "converter_data/fakecorpus/converted/nob/admin/samediggi-"
        "article-16.html.xml",
    )
    self.assertEqual(actual, expected)

test_get_corpusdir()

Get the corpus directory.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_converter.py
73
74
75
76
77
78
def test_get_corpusdir(self):
    """Expect corpusdir to be the root directory of the fake corpus."""
    # A trailing path separator on corpusdir is tolerated by the test.
    actual = self.converter_inside_orig.corpusdir.rstrip(os.path.sep)
    expected = os.path.join(HERE, "converter_data/fakecorpus")
    self.assertEqual(actual, expected)

test_get_orig()

Get the original name.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_converter.py
45
46
47
48
49
50
51
52
53
def test_get_orig(self):
    """Expect names.orig to be the html file under ``orig`` in the fake corpus."""
    actual = self.converter_inside_orig.names.orig
    expected = os.path.join(
        HERE, "converter_data/fakecorpus/orig/nob/admin/samediggi-article-16.html"
    )
    self.assertEqual(actual, expected)

test_get_tmpdir()

Get the temp dir.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_converter.py
66
67
68
69
70
71
def test_get_tmpdir(self):
    """Expect tmpdir to be the ``tmp`` directory of the fake corpus."""
    actual = self.converter_inside_orig.tmpdir
    expected = os.path.join(HERE, "converter_data/fakecorpus/tmp")
    self.assertEqual(actual, expected)

test_get_xsl()

Get the name of the metadata file.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_converter.py
55
56
57
58
59
60
61
62
63
64
def test_get_xsl(self):
    """Expect names.xsl to be the ``.xsl`` metadata path under ``orig``."""
    actual = self.converter_inside_orig.names.xsl
    expected = os.path.join(
        HERE,
        "converter_data/fakecorpus/orig/nob/admin/samediggi-"
        "article-16.html.xsl",
    )
    self.assertEqual(actual, expected)

test_validate_complete()

Check that an exception is raised if a document is invalid.

Source code in /home/anders/projects/CorpusTools/corpustools/test/test_converter.py
91
92
93
94
95
96
97
def test_validate_complete(self):
    """Expect ConversionError when validating a bare ``<document/>``."""
    expected_error = util.ConversionError
    validate = self.converter_inside_orig.validate_complete
    empty_document = etree.fromstring("<document/>")

    with self.assertRaises(expected_error):
        validate(empty_document)