Skip to content

ces2homegrown

Turn cesDoc xml into our homegrown xml.

get_verses(chapter)

Extract the chapter content.

Source code in /home/anders/projects/CorpusTools/corpustools/ces2homegrown.py
66
67
68
69
70
71
72
73
74
def get_verses(chapter):
    """Extract the chapter content.

    Args:
        chapter: an element whose descendant ``seg`` elements hold the verses.

    Returns:
        A new ``body`` element with one ``verse`` child per ``seg``, in
        document order. Each verse's ``number`` attribute is the part of the
        seg's ``id`` after the last dot, and its text is the seg's leading
        text with surrounding whitespace removed.
    """
    body = etree.Element("body")
    for seg in chapter.iter("seg"):
        verse = etree.SubElement(body, "verse")
        verse.set("number", seg.get("id").split(".")[-1])
        # seg.text is None for an empty seg (or one that starts with a child
        # element); fall back to "" instead of failing on None.strip().
        verse.text = (seg.text or "").strip()

    return body

main()

Turn a cesDoc file into homegrown xml.

Source code in /home/anders/projects/CorpusTools/corpustools/ces2homegrown.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def main():
    """Convert the cesDoc given on the command line to homegrown xml.

    Every chapter of every book in the cesDoc is saved as its own file,
    named after the 1-based book and chapter positions, and the saved
    chapters are then handed to set_parallels.
    """
    args = parse_options()
    tree = etree.parse(args.cesdoc)
    source_name = os.path.basename(args.cesdoc)
    numbered_books = enumerate(tree.xpath(".//div[@type='book']"), start=1)

    def saved_chapters():
        # Lazy on purpose: a chapter is only saved when the consumer asks
        # for the next path.
        for book_no, book in numbered_books:
            chapters = book.xpath(".//div[@type='chapter']")
            for chapter_no, chapter in enumerate(chapters, start=1):
                yield save_chapter(
                    args.lang,
                    args.testament,
                    f"{book_no:0>2}_{chapter_no:0>3}",
                    get_verses(chapter),
                    source_name,
                )

    set_parallels(saved_chapters(), args.testament, args.lang)

parse_options()

Parse the options for this script.

Source code in /home/anders/projects/CorpusTools/corpustools/ces2homegrown.py
30
31
32
33
34
35
36
37
38
39
40
41
def parse_options():
    """Build the command line parser and return the parsed arguments.

    The script takes three positional arguments: the language, the
    testament ("ot" or "nt") and the cesDoc file to convert.
    """
    positionals = (
        ("lang", {"help": "Language of the file"}),
        ("testament", {"choices": ["ot", "nt"], "help": "Old or new testament"}),
        ("cesdoc", {"help": "The cesdoc that should be converted"}),
    )

    parser = argparse.ArgumentParser(
        parents=[argparse_version.parser],
        description="Turn cesDoc xml into our homegrown xml.",
    )
    for dest, options in positionals:
        parser.add_argument(dest, **options)

    return parser.parse_args()

save_chapter(language, testament, filename, body, address)

Save chapter info.

Source code in /home/anders/projects/CorpusTools/corpustools/ces2homegrown.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def save_chapter(language, testament, filename, body, address):
    """Save chapter info.

    Writes the metadata for the chapter and then the chapter itself as a
    ``document`` element wrapping ``body``.

    Args:
        language: language code, used both as a path component and as the
            ``mainlang`` metadata value.
        testament: testament name, used as a path component.
        filename: name of the chapter file, without the ``.xml`` suffix.
        body: the element that becomes the only child of the document root.
        address: value stored in the ``filename`` metadata variable.

    Returns:
        The corpus path object made for the written chapter file.
    """
    # Years are plain integers, like the datetime.now().year fallback below.
    language_year = {"nob": 2011, "sme": 2019}
    name = os.path.join(
        os.getenv("GTBOUND"),
        "orig",
        language,
        "bible",
        testament,
        "bibel.no",
        f"{filename}.xml",
    )
    # Only "directory already exists" is tolerated; other OS errors (e.g.
    # permission problems) surface here instead of being silently swallowed.
    os.makedirs(os.path.dirname(name), exist_ok=True)

    path = corpuspath.make_corpus_path(name)
    path.metadata.set_variable("filename", address)
    path.metadata.set_variable("mainlang", language)
    path.metadata.set_variable("genre", "bible")
    path.metadata.set_variable("monolingual", "1")
    path.metadata.set_variable("license_type", "standard")
    # Languages without a known year get the current year.
    path.metadata.set_variable("year", language_year.get(language, datetime.now().year))

    path.metadata.write_file()
    root = etree.Element("document")
    root.append(body)

    with open(name, "wb") as page_stream:
        page_stream.write(etree.tostring(root, encoding="utf8", pretty_print=True))

    return path

set_parallels(chapter_paths, testament, new_lang)

Set the parallels.

Use the nob names as the base, since nob has all the books and chapters.

Source code in /home/anders/projects/CorpusTools/corpustools/ces2homegrown.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def set_parallels(chapter_paths, testament, new_lang):
    """Register the new chapters and the existing texts as parallels.

    The sorted nob file names of the testament are used as the base.
    Chapters and nob files are paired by position, so pairing stops when
    either sequence runs out.
    """
    nob_dir = os.path.join(os.getenv("GTBOUND"), "orig/nob/bible", testament, "bibel.no")
    nob_names = glob.glob(f"{nob_dir}/*.xml")
    nob_names.sort()

    for (chapter_path, nob_name) in zip(chapter_paths, nob_names):
        nob_path = corpuspath.make_corpus_path(nob_name)
        nob_meta = nob_path.metadata
        chapter_meta = chapter_path.metadata

        # Link the new chapter and its nob counterpart to each other.
        chapter_meta.set_parallel_text("nob", os.path.basename(nob_name))
        chapter_basename = os.path.basename(chapter_path.orig)
        nob_meta.set_parallel_text(new_lang, chapter_basename)
        nob_meta.write_file()

        # Pass every parallel known to nob on to the new chapter, and tell
        # each of those parallels about the new chapter.
        for (lang, filename) in nob_meta.get_parallel_texts().items():
            chapter_meta.set_parallel_text(lang, filename)
            sibling = corpuspath.make_corpus_path(nob_path.parallel(lang))
            sibling.metadata.set_parallel_text(new_lang, chapter_basename)
            sibling.metadata.write_file()

        chapter_meta.write_file()