Skip to content

compare_tmx_goldstandard

Compare prestable tmx files to files produced by the parallelizer.

TmxComparator

A class to compare two tmx-files

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
class TmxComparator:
    """Compare a reference ("want") tmx document with a produced ("got") one.

    The two objects only need to offer ``tmx_to_stringlist()`` and
    ``lang_to_stringlist(lang)``; no other attribute is used here.
    """

    def __init__(self, want_tmx, got_tmx):
        self.want_tmx = want_tmx
        self.got_tmx = got_tmx

    def get_lines_in_wantedfile(self):
        """Return the length of the reference document's string list."""
        reference_lines = self.want_tmx.tmx_to_stringlist()
        return len(reference_lines)

    def get_number_of_differing_lines(self):
        """Count the reference lines the unified diff reports as removed.

        Returns:
            The number of "-" lines in the diff, not counting the "---"
            header. An empty diff (equal documents) therefore gives -1.
        """
        reference_lines = self.want_tmx.tmx_to_stringlist()
        produced_lines = self.got_tmx.tmx_to_stringlist()
        minus_lines = sum(
            1
            for entry in difflib.unified_diff(reference_lines, produced_lines, n=0)
            if entry.startswith("-")
        )
        # The "---" file header is one of the counted lines; take it off.
        return minus_lines - 1

    def get_diff_as_text(self):
        """Return the unified diff (no context lines) of the documents as a list."""
        reference_lines = self.want_tmx.tmx_to_stringlist()
        produced_lines = self.got_tmx.tmx_to_stringlist()
        return list(difflib.unified_diff(reference_lines, produced_lines, n=0))

    def get_lang_diff_as_text(self, lang):
        """Return the unified diff for one language, each line ending in a newline.

        Args:
            lang: passed unchanged to ``lang_to_stringlist`` of both documents.
        """
        reference_lines = self.want_tmx.lang_to_stringlist(lang)
        produced_lines = self.got_tmx.lang_to_stringlist(lang)
        return [
            entry + "\n"
            for entry in difflib.unified_diff(reference_lines, produced_lines, n=0)
        ]

get_diff_as_text()

Return a stringlist containing the diff lines

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
61
62
63
64
65
66
67
68
69
def get_diff_as_text(self):
    """Return the unified diff (no context lines) of the two documents as a list."""
    reference_lines = self.want_tmx.tmx_to_stringlist()
    produced_lines = self.got_tmx.tmx_to_stringlist()
    return list(difflib.unified_diff(reference_lines, produced_lines, n=0))

get_lang_diff_as_text(lang)

Return a stringlist containing the diff lines

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
71
72
73
74
75
76
77
78
79
80
81
def get_lang_diff_as_text(self, lang):
    """Return the unified diff for one language, each line ending in a newline.

    Args:
        lang: passed unchanged to ``lang_to_stringlist`` of both documents.
    """
    reference_lines = self.want_tmx.lang_to_stringlist(lang)
    produced_lines = self.got_tmx.lang_to_stringlist(lang)
    return [
        entry + "\n"
        for entry in difflib.unified_diff(reference_lines, produced_lines, n=0)
    ]

get_lines_in_wantedfile()

Return the number of lines in the reference doc

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
40
41
42
def get_lines_in_wantedfile(self):
    """Return the length of the reference document's string list."""
    reference_lines = self.want_tmx.tmx_to_stringlist()
    return len(reference_lines)

get_number_of_differing_lines()

Find how many lines differ between two tmx documents.

Given a unified_diff, find out how many lines in the reference doc differ from the doc to be tested. A return value of -1 means that the docs are equal.

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def get_number_of_differing_lines(self):
    """Count the reference lines the unified diff reports as removed.

    Returns:
        The number of "-" lines in the diff, not counting the "---"
        header. An empty diff (equal documents) therefore gives -1.
    """
    reference_lines = self.want_tmx.tmx_to_stringlist()
    produced_lines = self.got_tmx.tmx_to_stringlist()
    minus_lines = sum(
        1
        for entry in difflib.unified_diff(reference_lines, produced_lines, n=0)
        if entry.startswith("-")
    )
    # The "---" file header is one of the counted lines; take it off.
    return minus_lines - 1

TmxGoldstandardTester

A class to test the alignment pipeline against the tmx goldstandard

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
class TmxGoldstandardTester:
    """Test the alignment pipeline against the tmx goldstandard.

    For every goldstandard file found, the matching xml file is
    parallelized, the result is compared with the goldstandard, one
    ``file`` element is added to the test run and a jspwiki diff file is
    written.
    """

    def __init__(self, testresult_filename, dateformat_addition=None):
        """Set up the result writer and the timestamp of this test run.

        Args:
            testresult_filename: file the test results are read from and
                written back to.
            dateformat_addition: optional string appended to the timestamp.
        """
        self.number_of_diff_lines = 0
        self.testresult_writer = TmxTestDataWriter(testresult_filename)
        if dateformat_addition is None:
            self.date = self.dateformat()
        else:
            self.date = self.dateformat() + dateformat_addition

    def set_number_of_diff_lines(self, diff_lines):
        """Increase the total number of difflines in this test run"""
        self.number_of_diff_lines += diff_lines

    def get_number_of_diff_lines(self):
        """Get the number of diff lines."""
        return self.number_of_diff_lines

    def dateformat(self):
        """Get the date and time, 20111209-1234. Used in a testrun element"""
        d = datetime.datetime.fromtimestamp(time.time())

        return d.strftime("%Y%m%d-%H%M")

    def run_test(self):
        """Make a testrun element.

        This element contains the result of the test: one ``file`` element
        per goldstandard file. The run is inserted first in the results
        document, which is then written to disk.
        """
        testrun = self.testresult_writer.make_testrun_element(self.date)

        # Go through each tmx goldstandard file
        for want_tmx_file in self.find_goldstandard_tmx_files():
            print(f"testing {want_tmx_file} …")

            # Calculate the parallel lang, to be used in parallelization
            paralang = "sme" if "nob2sme" in want_tmx_file else "nob"

            # Align files
            self.align_files(testrun, want_tmx_file, paralang, aligner="tca2")

        # All files have been tested, insert this run at the top of the
        # paragstest element
        self.testresult_writer.insert_testrun_element(testrun)
        # Write data to file
        self.testresult_writer.write_paragstesting_data()

    def align_files(self, testrun, want_tmx_file, paralang, aligner):
        """Align one file pair and record how it compares to the goldstandard.

        Compare the tmx produced by this parallelization with the tmx of
        the goldstandard file, append the result to ``testrun`` and write
        the diffs of the two tmx's to a separate file.

        Args:
            testrun: element the ``file`` result element is appended to.
            want_tmx_file: path to the goldstandard tmx file.
            paralang: language to parallelize the main file with.
            aligner: accepted for interface compatibility; not used here.
        """
        # Compute the name of the main file to parallelize
        xml_file = self.compute_xmlfilename(want_tmx_file)
        # The original code read this name from an undefined ``filelist``
        # and raised NameError.
        # NOTE(review): assumes the intended name is the basename of the
        # main xml file -- confirm against the parallelizer.
        basename = os.path.basename(xml_file)

        parallelizer = parallelize.Parallelize(xml_file, paralang)
        got_tmx = parallelizer.parallelize_files()

        # This is the tmx element fetched from the goldstandard file
        want_tmx = parallelize.Tmx(etree.parse(want_tmx_file))

        # Instantiate a comparator with the two tmxes
        comparator = TmxComparator(want_tmx, got_tmx)

        # Diff once and reuse the number below.
        differing_lines = comparator.get_number_of_differing_lines()

        # Make a file_element for our results file
        file_element = self.testresult_writer.make_file_element(
            basename,
            str(comparator.get_lines_in_wantedfile()),
            str(differing_lines),
        )

        # -1 is the comparator's "documents are equal" marker; it must not
        # be subtracted from the running total.
        self.set_number_of_diff_lines(max(differing_lines, 0))

        # Append the result for this file to the testrun element
        testrun.append(file_element)

        self.write_diff_files(comparator, parallelizer, basename)

    def compute_xmlfilename(self, want_tmx_file):
        """Compute the name of the xmlfile which should be aligned"""
        xml_file = want_tmx_file.replace("tmx/goldstandard/", "converted/")
        xml_file = xml_file.replace("nob2sme", "nob")
        xml_file = xml_file.replace("sme2nob", "sme")
        xml_file = xml_file.replace(".toktmx", ".xml")

        return xml_file

    def write_diff_files(self, comparator, parallelizer, filename):
        """Write the tmx diff and both per-language diffs to a jspwiki file.

        The file is named ``<filename>_<date>.jspwiki`` and is placed in a
        ``tca2testing`` directory next to the test result file.

        Args:
            comparator: provides ``get_diff_as_text()`` and
                ``get_lang_diff_as_text(lang)``.
            parallelizer: provides ``get_lang1()`` and ``get_lang2()``.
            filename: base name used for the diff file.
        """
        print(f"write_diff_files {filename}")
        filename = f"{filename}_{self.date}.jspwiki"
        dirname = os.path.join(
            os.path.dirname(self.testresult_writer.get_filename()), "tca2testing"
        )
        os.makedirs(dirname, exist_ok=True)

        # "{{{" and "}}}" are literal jspwiki markers. str.format() treats
        # them as (broken) replacement fields and raises ValueError, so the
        # language is inserted with %-formatting instead.
        section_header = "\n}}}\n!!%s diff\n{{{\n"

        with open(
            os.path.join(dirname, filename), "w", encoding="utf-8"
        ) as diff_file:
            diff_file.write(f"!!!{filename}\n")
            diff_file.write("!!TMX diff\n{{{\n")
            diff_file.writelines(comparator.get_diff_as_text())
            for lang in (parallelizer.get_lang1(), parallelizer.get_lang2()):
                diff_file.write(section_header % lang)
                diff_file.writelines(comparator.get_lang_diff_as_text(lang))
            diff_file.write("\n}}}\n")

    def find_goldstandard_tmx_files(self):
        """Find the goldstandard tmx files, return them as a list

        Walks ``$GTFREE/prestable/toktmx`` and collects every file whose
        name ends in ``.toktmx``.

        Raises:
            KeyError: if the GTFREE environment variable is not set.
        """
        file_list = []
        for root, _dirs, files in os.walk(
            os.path.join(os.environ["GTFREE"], "prestable/toktmx")
        ):
            for f in files:
                if f.endswith(".toktmx"):
                    print(util.lineno(), f)
                    file_list.append(os.path.join(root, f))

        return file_list

__init__(testresult_filename, dateformat_addition=None)

Set the name where the testresults should be written

Find all goldstandard tmx files

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
87
88
89
90
91
92
93
94
95
96
97
def __init__(self, testresult_filename, dateformat_addition=None):
    """Set up the result writer and the timestamp of this test run.

    Args:
        testresult_filename: handed to ``TmxTestDataWriter``.
        dateformat_addition: optional string appended to the timestamp
            stored in ``self.date``.
    """
    self.number_of_diff_lines = 0
    self.testresult_writer = TmxTestDataWriter(testresult_filename)
    stamp = self.dateformat()
    self.date = stamp if dateformat_addition is None else stamp + dateformat_addition

align_files(testrun, want_tmx_file, paralang, aligner)

Align files

Compare the tmx produced by this parallelization with the tmx of the goldstandard file. Write the result to a file. Write the diffs of these two tmx's to a separate file.

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
def align_files(self, testrun, want_tmx_file, paralang, aligner):
    """Align one file pair and record how it compares to the goldstandard.

    Compare the tmx produced by this parallelization with the tmx of the
    goldstandard file, append the result to ``testrun`` and write the
    diffs of the two tmx's to a separate file.

    Args:
        testrun: element the ``file`` result element is appended to.
        want_tmx_file: path to the goldstandard tmx file.
        paralang: language to parallelize the main file with.
        aligner: accepted for interface compatibility; not used here.
    """
    # Compute the name of the main file to parallelize
    xml_file = self.compute_xmlfilename(want_tmx_file)
    # The original code read this name from an undefined ``filelist`` and
    # raised NameError.
    # NOTE(review): assumes the intended name is the basename of the main
    # xml file -- confirm against the parallelizer.
    basename = os.path.basename(xml_file)

    parallelizer = parallelize.Parallelize(xml_file, paralang)
    got_tmx = parallelizer.parallelize_files()

    # This is the tmx element fetched from the goldstandard file
    want_tmx = parallelize.Tmx(etree.parse(want_tmx_file))

    # Instantiate a comparator with the two tmxes
    comparator = TmxComparator(want_tmx, got_tmx)

    # Diff once and reuse the number below.
    differing_lines = comparator.get_number_of_differing_lines()

    # Make a file_element for our results file
    file_element = self.testresult_writer.make_file_element(
        basename,
        str(comparator.get_lines_in_wantedfile()),
        str(differing_lines),
    )

    # -1 is the comparator's "documents are equal" marker; it must not be
    # subtracted from the running total.
    self.set_number_of_diff_lines(max(differing_lines, 0))

    # Append the result for this file to the testrun element
    testrun.append(file_element)

    self.write_diff_files(comparator, parallelizer, basename)

compute_xmlfilename(want_tmx_file)

Compute the name of the xmlfile which should be aligned

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
175
176
177
178
179
180
181
182
def compute_xmlfilename(self, want_tmx_file):
    """Derive the name of the xml file to align from a goldstandard tmx name.

    The substitutions are applied one after another, in the order listed.
    """
    substitutions = (
        ("tmx/goldstandard/", "converted/"),
        ("nob2sme", "nob"),
        ("sme2nob", "sme"),
        (".toktmx", ".xml"),
    )
    xml_file = want_tmx_file
    for old, new in substitutions:
        xml_file = xml_file.replace(old, new)
    return xml_file

dateformat()

Get the date and time, 20111209-1234. Used in a testrun element

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
107
108
109
110
111
def dateformat(self):
    """Return the current local date and time, formatted like 20111209-1234.

    The value is used in a testrun element.
    """
    return datetime.datetime.now().strftime("%Y%m%d-%H%M")

find_goldstandard_tmx_files()

Find the goldstandard tmx files, return them as a list

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
206
207
208
209
210
211
212
213
214
215
216
217
def find_goldstandard_tmx_files(self):
    """Collect every ``.toktmx`` file below ``$GTFREE/prestable/toktmx``.

    Each hit is also printed together with ``util.lineno()``.

    Returns:
        A list with the path of every matching file.
    """
    toktmx_root = os.path.join(os.environ["GTFREE"], "prestable/toktmx")
    found = []
    for folder, _subdirs, names in os.walk(toktmx_root):
        for name in names:
            if not name.endswith(".toktmx"):
                continue
            print(util.lineno(), name)
            found.append(os.path.join(folder, name))

    return found

get_number_of_diff_lines()

Get the number of diff lines.

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
103
104
105
def get_number_of_diff_lines(self):
    """Return the diff-line total accumulated so far in this test run."""
    total = self.number_of_diff_lines
    return total

run_test()

Make a testrun element.

This element contains the result of the test.

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def run_test(self):
    """Build a testrun element holding the result of this test run.

    Every goldstandard tmx file is aligned and compared. The finished
    testrun is then inserted first in the results document, which is
    written to file.
    """
    writer = self.testresult_writer
    testrun = writer.make_testrun_element(self.date)

    for want_tmx_file in self.find_goldstandard_tmx_files():
        print(f"testing {want_tmx_file} …")

        # The language to parallelize with is the one the goldstandard
        # file does not start from.
        paralang = "sme" if "nob2sme" in want_tmx_file else "nob"

        self.align_files(testrun, want_tmx_file, paralang, aligner="tca2")

    # Every file is tested: newest run goes first, then save.
    writer.insert_testrun_element(testrun)
    writer.write_paragstesting_data()

set_number_of_diff_lines(diff_lines)

Increase the total number of difflines in this test run

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
 99
100
101
def set_number_of_diff_lines(self, diff_lines):
    """Add ``diff_lines`` to the running diff-line total of this test run."""
    self.number_of_diff_lines = self.number_of_diff_lines + diff_lines

write_diff_files(comparator, parallelizer, filename)

Write diffs to a jspwiki file

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
def write_diff_files(self, comparator, parallelizer, filename):
    """Write the tmx diff and both per-language diffs to a jspwiki file.

    The file is named ``<filename>_<date>.jspwiki`` and is placed in a
    ``tca2testing`` directory next to the test result file.

    Args:
        comparator: provides ``get_diff_as_text()`` and
            ``get_lang_diff_as_text(lang)``.
        parallelizer: provides ``get_lang1()`` and ``get_lang2()``.
        filename: base name used for the diff file.
    """
    print(f"write_diff_files {filename}")
    filename = f"{filename}_{self.date}.jspwiki"
    dirname = os.path.join(
        os.path.dirname(self.testresult_writer.get_filename()), "tca2testing"
    )
    os.makedirs(dirname, exist_ok=True)

    # "{{{" and "}}}" are literal jspwiki markers. str.format() treats them
    # as (broken) replacement fields and raises ValueError, so the language
    # is inserted with %-formatting instead.
    section_header = "\n}}}\n!!%s diff\n{{{\n"

    with open(os.path.join(dirname, filename), "w", encoding="utf-8") as diff_file:
        diff_file.write(f"!!!{filename}\n")
        diff_file.write("!!TMX diff\n{{{\n")
        diff_file.writelines(comparator.get_diff_as_text())
        for lang in (parallelizer.get_lang1(), parallelizer.get_lang2()):
            diff_file.write(section_header % lang)
            diff_file.writelines(comparator.get_lang_diff_as_text(lang))
        diff_file.write("\n}}}\n")

TmxTestDataWriter

A class that writes tmx test data to a file

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
class TmxTestDataWriter:
    """A class that writes tmx test data to a file

    The existing results document is parsed on construction; new test
    runs are inserted into its root element and the whole document is
    written back to the same file.
    """

    def __init__(self, filename):
        """Parse ``filename`` and keep its root element.

        Exits the program with status 1 if reading the file raises OSError.
        """
        self.filename = filename

        try:
            tree = etree.parse(filename)
            self.set_parags_testing_element(tree.getroot())
        except OSError as error:
            util.note(f"I/O error({error.errno}): {error.strerror}")
            sys.exit(1)

    def get_filename(self):
        """Return the name of the results file."""
        return self.filename

    def make_file_element(self, name, gspairs, diffpairs):
        """Make the element file, set the attributes"""
        file_element = etree.Element("file")
        file_element.attrib["name"] = name
        file_element.attrib["gspairs"] = gspairs
        file_element.attrib["diffpairs"] = diffpairs

        return file_element

    def set_parags_testing_element(self, paragstesting):
        """Store the root element that test runs are inserted into."""
        self.paragstesting = paragstesting

    def make_testrun_element(self, datetime):
        """Make the testrun element, set the attribute"""
        testrun_element = etree.Element("testrun")
        testrun_element.attrib["datetime"] = datetime

        return testrun_element

    def make_paragstesting_element(self):
        """Make the paragstesting element"""
        paragstesting_element = etree.Element("paragstesting")

        return paragstesting_element

    def insert_testrun_element(self, testrun):
        """Insert ``testrun`` as the first child of the root element."""
        self.paragstesting.insert(0, testrun)

    def write_paragstesting_data(self):
        """Write the paragstesting data to a file"""
        # Serialising with encoding="utf-8" produces bytes, so the target
        # must be opened in binary mode; a text-mode file rejects bytes.
        with open(self.filename, "wb") as paragstesting:
            et = etree.ElementTree(self.paragstesting)
            et.write(
                paragstesting, pretty_print=True, encoding="utf-8", xml_declaration=True
            )

make_file_element(name, gspairs, diffpairs)

Make the element file, set the attributes

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
236
237
238
239
240
241
242
243
def make_file_element(self, name, gspairs, diffpairs):
    """Build a ``file`` element carrying name, gspairs and diffpairs attributes."""
    file_element = etree.Element("file")
    attributes = (("name", name), ("gspairs", gspairs), ("diffpairs", diffpairs))
    for key, value in attributes:
        file_element.set(key, value)

    return file_element

make_paragstesting_element()

Make the paragstesting element

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
255
256
257
258
259
def make_paragstesting_element(self):
    """Return a new, empty ``paragstesting`` element."""
    return etree.Element("paragstesting")

make_testrun_element(datetime)

Make the testrun element, set the attribute

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
248
249
250
251
252
253
def make_testrun_element(self, datetime):
    """Build a ``testrun`` element whose ``datetime`` attribute is the given value."""
    testrun_element = etree.Element("testrun")
    testrun_element.set("datetime", datetime)

    return testrun_element

write_paragstesting_data()

Write the paragstesting data to a file

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
264
265
266
267
268
269
270
def write_paragstesting_data(self):
    """Write the paragstesting data to a file

    The document rooted at ``self.paragstesting`` is serialised as
    pretty-printed UTF-8 with an XML declaration, overwriting
    ``self.filename``.
    """
    # Serialising with encoding="utf-8" produces bytes, so the target must
    # be opened in binary mode; a text-mode file rejects bytes.
    with open(self.filename, "wb") as paragstesting:
        et = etree.ElementTree(self.paragstesting)
        et.write(
            paragstesting, pretty_print=True, encoding="utf-8", xml_declaration=True
        )

parse_options()

Parse the command line.

Expected input is one or more tmx goldstandard files.

Source code in /home/anders/projects/CorpusTools/corpustools/compare_tmx_goldstandard.py
273
274
275
276
277
278
279
280
281
282
283
284
def parse_options():
    """Run the command line through argparse.

    No options or positional arguments are registered here, so the call
    only provides ``-h/--help`` and rejects anything else. The parsed
    namespace is discarded and nothing is returned.
    """
    summary = (
        "Compare goldstandard tmx files to files produced by the "
        "parallelizer pipeline."
    )
    cli = argparse.ArgumentParser(description=summary)
    cli.parse_args()