Skip to content

bibel_no_aligner

Functions to align bible texts from bibel.no.

add_filename_id(filename)

Add the tmx filename as a prop element in the header.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_aligner.py
40
41
42
43
44
45
46
def add_filename_id(filename):
    """Build the ``prop`` element that records a filename in a tmx header.

    Args:
        filename: Path to a file; only its basename is stored.

    Returns:
        A ``prop`` element with ``type="x-filename"`` whose text is the
        basename of ``filename``.
    """
    filename_prop = etree.Element("prop")
    filename_prop.set("type", "x-filename")
    filename_prop.text = os.path.basename(filename)

    return filename_prop

common_verses(filename, parallel_path)

Return string pairs from common verses.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_aligner.py
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def common_verses(filename, parallel_path):
    """Pair up the texts of verses present in both XML files.

    Every ``verse`` element is keyed by its ``number`` attribute; if a number
    occurs more than once in a file, the last occurrence wins.

    Args:
        filename: Path to the original XML file.
        parallel_path: Path to the parallel XML file.

    Returns:
        A list of ``(original_text, parallel_text)`` tuples, one per verse
        number found in both files, ordered by ``sorted()`` of the verse
        numbers (attribute values, so the comparison is string based).
    """

    def verse_texts(xml_path):
        # Map each verse's "number" attribute to the verse's text.
        return {
            verse.get("number"): verse.text
            for verse in etree.parse(xml_path).iter("verse")
        }

    source_verses = verse_texts(filename)
    target_verses = verse_texts(parallel_path)

    # Dict key views support set operations directly.
    shared_numbers = source_verses.keys() & target_verses.keys()

    pairs = []
    for number in sorted(shared_numbers):
        pairs.append((source_verses[number], target_verses[number]))

    return pairs

main()

Make tmx files.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_aligner.py
145
146
147
148
149
150
151
152
153
def main():
    """Create one tmx file per source file that has a parallel file.

    Reads the source and target language from the command line, builds a tmx
    element for every path yielded by ``valid_path`` and writes it to the
    location given by ``path.prestable_tmx(target_lang)``.
    """
    options = parse_options()
    source_lang = options.source_lang
    target_lang = options.target_lang

    for corpus_path in valid_path(source_lang, target_lang):
        # Build the tmx before asking for the output filename, as before.
        tmx = make_tmx(corpus_path, source_lang, target_lang)
        write_tmx(tmx, corpus_path.prestable_tmx(target_lang))

make_tmx(path, source_lang, target_lang)

Make a tmx element with verse pairs.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_aligner.py
119
120
121
122
123
124
125
126
127
128
129
def make_tmx(path, source_lang, target_lang):
    """Build a tmx element holding one ``tu`` per common verse pair.

    Args:
        path: Object providing ``orig``, ``parallel(lang)`` and
            ``prestable_tmx(lang)``.
        source_lang: Language of the original text.
        target_lang: Language of the parallel text.

    Returns:
        A ``tmx`` element with a header and a ``body`` whose ``tu`` children
        each hold a source-language and a target-language ``tuv``.
    """
    print("Making", path.prestable_tmx(target_lang))

    root = make_tmx_template(path.orig, source_lang)
    tmx_body = etree.SubElement(root, "body")

    verse_pairs = common_verses(path.orig, path.parallel(target_lang))
    for source_text, target_text in verse_pairs:
        unit = etree.SubElement(tmx_body, "tu")
        unit.append(make_tuv(source_text, source_lang))
        unit.append(make_tuv(target_text, target_lang))

    return root

make_tmx_header(filename, source_lang)

Make a tmx header based on the lang variable.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_aligner.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def make_tmx_header(filename, source_lang):
    """Build the tmx ``header`` element.

    Args:
        filename: Path whose basename is recorded in an ``x-filename`` prop.
        source_lang: Value for the header's ``srclang`` attribute.

    Returns:
        A ``header`` element carrying the fixed attributes below plus the
        filename ``prop`` child.
    """
    header_attributes = (
        ("segtype", "sentence"),
        ("o-tmf", "OmegaT TMX"),
        ("adminlang", "en-US"),
        ("srclang", source_lang),
        ("datatype", "plaintext"),
    )

    header = etree.Element("header")
    # Set attributes one by one so they are added in exactly this order.
    for name, value in header_attributes:
        header.set(name, value)

    header.append(add_filename_id(filename))

    return header

make_tmx_template(filename, source_lang)

Make tmx file based on the output of the aligner.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_aligner.py
65
66
67
68
69
70
71
def make_tmx_template(filename, source_lang):
    """Build a ``tmx`` root element that contains only the header.

    Args:
        filename: Passed on to ``make_tmx_header``.
        source_lang: Passed on to ``make_tmx_header``.

    Returns:
        A ``tmx`` element whose single child is the header element.
    """
    root = etree.Element("tmx")
    root.append(make_tmx_header(filename, source_lang))

    return root

make_tuv(line, lang)

Make a tuv element given an input line and a lang variable.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_aligner.py
29
30
31
32
33
34
35
36
37
def make_tuv(line, lang):
    """Build a ``tuv`` element wrapping one segment of text.

    Args:
        line: Text of the segment; surrounding whitespace is stripped.
        lang: Value for the element's ``xml:lang`` attribute.

    Returns:
        A ``tuv`` element with one ``seg`` child holding the stripped text.
    """
    xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"

    tuv = etree.Element("tuv")
    tuv.set(xml_lang, lang)
    etree.SubElement(tuv, "seg").text = line.strip()

    return tuv

parse_options()

Gather options.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_aligner.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
def parse_options():
    """Parse the command line.

    Returns:
        The namespace produced by ``parse_args()``, with the positional
        ``source_lang`` and ``target_lang`` arguments.
    """
    positional_arguments = (
        ("source_lang", "Source language"),
        ("target_lang", "Target language"),
    )

    parser = argparse.ArgumentParser(
        description="Align bibel.no texts.",
        parents=[argparse_version.parser],
    )
    for name, help_text in positional_arguments:
        parser.add_argument(name, help=help_text)

    options = parser.parse_args()
    return options

valid_path(source_lang, target_lang)

Yield a CorpusPath if the parallel file exists.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_aligner.py
105
106
107
108
109
110
111
112
113
114
115
116
def valid_path(source_lang, target_lang):
    """Yield corpus paths of source files that have an existing parallel file.

    Looks for ``*.xml`` files under
    ``$GTBOUND/orig/<source_lang>/bible/<testament>/bibel.no`` for the
    testaments ``nt`` and ``ot``, in that order.

    Args:
        source_lang: Language directory to search for source files.
        target_lang: Language whose parallel file must exist.

    Yields:
        The object returned by ``corpuspath.make_corpus_path`` for each
        source file whose ``parallel(target_lang)`` path exists.
    """
    for testament in ("nt", "ot"):
        source_dir = os.path.join(
            os.getenv("GTBOUND"), "orig", source_lang, "bible", testament, "bibel.no"
        )
        xml_files = glob.glob(f"{source_dir}/*.xml")

        for xml_file in xml_files:
            corpus_path = corpuspath.make_corpus_path(xml_file)

            # Skip source files without a parallel file in the target language.
            if not os.path.exists(corpus_path.parallel(target_lang)):
                continue

            yield corpus_path

write_tmx(tmx, tmx_filename)

Write a tmx file.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_aligner.py
132
133
134
135
136
137
138
139
140
141
142
def write_tmx(tmx, tmx_filename):
    """Serialize a tmx element and write it to ``tmx_filename``.

    The output is UTF-8 encoded, pretty printed and starts with an XML
    declaration. Missing parent directories are created first.

    Args:
        tmx: The tmx element to serialize.
        tmx_filename: Path of the file to write; an existing file is
            overwritten.

    Raises:
        OSError: If the parent directory cannot be created or the file
            cannot be written.
    """
    # Serialize before touching the filesystem so a serialization failure
    # cannot leave an empty, truncated file behind.
    serialized = etree.tostring(
        tmx, encoding="utf8", pretty_print=True, xml_declaration=True
    )

    # A bare filename has an empty dirname, which os.makedirs rejects.
    parent_dir = os.path.dirname(tmx_filename)
    if parent_dir:
        # exist_ok only tolerates an existing directory; other errors, such
        # as missing permissions, surface here with their real cause.
        os.makedirs(parent_dir, exist_ok=True)

    with open(tmx_filename, "wb") as tmx_stream:
        tmx_stream.write(serialized)