Skip to content

bibel_no_crawler

Functions to fetch bible texts from bibel.no.

fetch_other_languages(book_name, bookindex, chapternumber, address)

Given an address, fetch all parallels.

Source code in corpustools/bibel_no_crawler.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
def fetch_other_languages(book_name, bookindex, chapternumber, address):
    """Fetch one chapter in every supported language and link the results.

    For each language the translation identifier "bokmal11" in ``address``
    is swapped for that language's identifier, the page is fetched and, if
    it contains a chapter, saved. Afterwards every saved file is registered
    as a parallel text of every other saved file.

    Args:
        book_name: Name of the book group the chapter belongs to; passed on
            to ``save_page`` as its ``bookname``.
        bookindex: Index of the book, used as a zero-padded filename prefix.
        chapternumber: Chapter number (or label), used zero-padded in the
            filename.
        address: Address of the chapter page containing "bokmal11".
    """
    languages = {
        "nob": "bokmal11",
        "sme": "nordsamisk19",
        "sma": "sorsamisk",
        "smj": "lulesamisk",
    }

    parallels = []
    for lang in ["nob", "sme", "smj", "sma"]:
        new_address = address.replace("bokmal11", languages[lang])
        chapter_page = fetch_page(new_address)
        body = get_verses(chapter_page)
        if body is None:
            # This translation does not have the chapter.
            continue

        header = chapter_page.find(".//h1").text.strip()
        parallels.append(
            save_page(
                lang,
                book_name,
                filename=namechanger.normalise_filename(
                    f"{bookindex:0>2}_{chapternumber:0>3}_{header}"
                ),
                body=body,
                address=new_address,
            )
        )

    # Link the parallels once, after every language has been processed,
    # rather than re-linking and rewriting metadata after each language.
    for this_parallel in parallels:
        for that_parallel in parallels:
            if this_parallel != that_parallel:
                this_parallel.metadata.set_parallel_text(
                    that_parallel.metadata.get_variable("mainlang"),
                    os.path.basename(that_parallel.orig),
                )
        this_parallel.metadata.write_file()

fetch_page(address) cached

Fetch a page.

Source code in corpustools/bibel_no_crawler.py
30
31
32
33
34
@functools.lru_cache
def fetch_page(address):
    """Fetch a page and parse it as an HTML document.

    Results are cached per address by ``functools.lru_cache``, so a repeated
    address returns the same parsed tree object without a new request.
    Callers that modify the returned tree therefore affect later callers
    asking for the same address.

    Args:
        address: The address of the page to download.

    Returns:
        The root element produced by ``html.document_fromstring``.
    """
    # Without a timeout a stalled server would block the crawl indefinitely.
    main_content = requests.get(address, timeout=60)
    return html.document_fromstring(main_content.text)

get_books(tree)

Get the addresses for the books on bibel.no.

Source code in corpustools/bibel_no_crawler.py
37
38
39
40
41
42
43
44
45
46
47
48
49
def get_books(tree):
    """Collect the book addresses listed on the bibel.no front page.

    Each row of the ``booklist`` table is scanned for links in ``tablePR``
    cells; the link at position 1 is stored as an Old Testament book and
    the link at position 3 as a New Testament book.

    Args:
        tree: Parsed page exposing an ``xpath`` method.

    Returns:
        A dict with the keys "ot" and "nt", each mapping to a list of
        ``href`` values in document order.
    """
    # Position of the link within a row -> testament it belongs to.
    testament_by_position = {1: "ot", 3: "nt"}
    collected = {"ot": [], "nt": []}

    for row in tree.xpath(".//table[@class='booklist']/tr"):
        links = row.xpath("./td[@class='tablePR']/a")
        for position, link in enumerate(links):
            testament = testament_by_position.get(position)
            if testament is not None:
                collected[testament].append(link.get("href"))

    return collected

get_chapter_addresses(first_chapter_page)

Extract the addresses to the other chapters.

Source code in corpustools/bibel_no_crawler.py
52
53
54
55
56
57
def get_chapter_addresses(first_chapter_page):
    """Yield label and address for each chapter link on a chapter page.

    Args:
        first_chapter_page: Parsed page exposing an ``xpath`` method.

    Returns:
        A generator of ``(stripped link text, href)`` tuples, one per
        ``versechapter`` link, in document order.
    """
    chapter_links = first_chapter_page.xpath(".//a[@class='versechapter']")
    return ((link.text.strip(), link.get("href")) for link in chapter_links)

get_verses(chapter_page)

Extract the chapter content.

If the table does not exist, then this chapter does not exist. This is the case for some Sámi translations.

Source code in corpustools/bibel_no_crawler.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def get_verses(chapter_page):
    """Extract the chapter content.

    If the table does not exist, then this chapter does not exist. This is
    the case for some Sámi translations.

    The direct children of the content cell are walked in order: verse
    number markers set the number for the following verses, verse elements
    become ``verse`` elements, and verse headers with text open a new
    ``section`` that receives the verses that follow.

    Args:
        chapter_page: Parsed chapter page. Its ``bibleref`` div elements are
            removed from the tree as a side effect.

    Returns:
        A ``body`` element holding the verses, or None if the page has no
        ``biblesingle`` table cell.
    """
    content_element = chapter_page.find(".//table[@class='biblesingle']/tr/td")
    if content_element is None:
        return None

    for bibleref in content_element.xpath(".//div[@class='bibleref']"):
        bibleref.getparent().remove(bibleref)

    body = etree.Element("body")
    lastparent = body
    # No number is known until the first verse number marker has been seen.
    verse_number = None
    for element in content_element:
        element_class = element.get("class")
        if element_class in ("versenumberdropcap", "versenumber"):
            verse_number = element.get("name")
        elif element_class == "verse":
            # Collapse every run of whitespace to a single space.
            text = " ".join("".join(element.itertext()).split())
            if text:
                verse = etree.SubElement(lastparent, "verse")
                # A verse may precede any marker, or the marker may lack a
                # name; keep the text instead of failing on the attribute.
                if verse_number is not None:
                    verse.set("number", verse_number)
                verse.text = text
        elif element_class == "verseheader" and element.text is not None:
            lastparent = etree.SubElement(body, "section")
            lastparent.set("title", element.text.strip())

    return body

main()

Fetch bible texts from bibel.no

Source code in corpustools/bibel_no_crawler.py
166
167
168
169
170
171
172
173
174
175
176
177
178
179
def main():
    """Crawl bibel.no and store every chapter in all available languages.

    Starting from the Bokmål front page, each book's first chapter page is
    fetched and saved together with its parallels, followed by every
    chapter linked from that page.
    """
    site_root = "https://bibel.no"
    front_page = fetch_page("https://bibel.no/nettbibelen?slang=bokmal11")

    for testament, book_links in get_books(front_page).items():
        for book_number, relative_link in enumerate(book_links, start=1):
            book_url = f"{site_root}{relative_link}"
            chapter_one_page = fetch_page(book_url)
            fetch_other_languages(testament, book_number, 1, book_url)

            chapter_links = get_chapter_addresses(chapter_one_page)
            for chapter_label, chapter_link in chapter_links:
                chapter_url = f"{site_root}{chapter_link}"
                fetch_other_languages(
                    testament, book_number, chapter_label, chapter_url
                )

save_page(language, bookname, filename, body, address)

Save chapter page.

Source code in corpustools/bibel_no_crawler.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def save_page(language, bookname, filename, body, address):
    """Save a chapter as an XML file and write its metadata.

    The file is written to
    ``$GTBOUND/orig/<language>/bible/<bookname>/bibel.no/<filename>.xml``;
    missing directories are created. ``os.path.join`` raises TypeError if
    the GTBOUND environment variable is unset.

    Args:
        language: Language code of the text; stored as ``mainlang``.
        bookname: Name of the directory below ``bible`` to save into.
        filename: File name without the ``.xml`` extension.
        body: The ``body`` element holding the chapter content.
        address: Address the chapter was fetched from; stored as the
            ``filename`` metadata variable.

    Returns:
        The corpus path object created for the saved file.
    """
    # Years of the translations that have one recorded here; any other
    # language falls back to the current year.
    language_year = {"nob": 2011, "sme": 2019}
    name = os.path.join(
        os.getenv("GTBOUND"),
        "orig",
        language,
        "bible",
        bookname,
        "bibel.no",
        f"{filename}.xml",
    )
    # An existing directory is fine; any other failure should surface here
    # rather than later when the files are written.
    os.makedirs(os.path.dirname(name), exist_ok=True)

    path = corpuspath.make_corpus_path(name)
    path.metadata.set_variable("filename", address)
    path.metadata.set_variable("mainlang", language)
    path.metadata.set_variable("genre", "bible")
    path.metadata.set_variable("monolingual", "1")
    path.metadata.set_variable("license_type", "standard")
    path.metadata.set_variable("publisher", "Det Norske Bibelselskap")
    path.metadata.set_variable("publChannel", "https://bibel.no/nettbibelen")
    path.metadata.set_variable("year", language_year.get(language, datetime.now().year))

    path.metadata.write_file()
    root = etree.Element("document")
    root.append(body)

    with open(name, "wb") as page_stream:
        page_stream.write(etree.tostring(root, encoding="utf8", pretty_print=True))

    return path