Skip to content

bibel_no_crawler

Functions to fetch bible texts from bibel.no.

fetch_other_languages(book_name, bookindex, chapternumber, address)

Given an address, fetch all parallels.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_crawler.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def fetch_other_languages(book_name, bookindex, chapternumber, address):
    """Given an address, fetch all parallels.

    The chapter is fetched for each known translation, saved with
    ``save_page`` and finally every saved page is registered as a parallel
    text of all the other saved pages.

    Args:
        book_name: passed on to ``save_page`` as the book name.
        bookindex: index of the book, zero-padded to two characters in the
            file name.
        chapternumber: chapter number, zero-padded to three characters in
            the file name.
        address: address of the chapter; every occurrence of ``bokmal11``
            in it is replaced by the identifier of each translation.
    """
    # Insertion order is the order in which the translations are fetched.
    languages = {
        "nob": "bokmal11",
        "sme": "nordsamisk19",
        "smj": "lulesamisk",
        "sma": "sorsamisk",
    }

    parallels = []
    for lang, translation in languages.items():
        new_address = address.replace("bokmal11", translation)
        first_page = fetch_page(new_address)
        body = get_verses(first_page)
        if body is None:
            # get_verses found no chapter content for this translation.
            continue

        header = first_page.find(".//h1").text.strip()
        parallels.append(
            save_page(
                lang,
                book_name,
                filename=namechanger.normalise_filename(
                    f"{bookindex:0>2}_{chapternumber:0>3}_{header}"
                ),
                body=body,
                address=new_address,
            )
        )

    # Link the parallels once, after all translations have been collected,
    # instead of rewriting every metadata file on each language iteration.
    for this_parallel in parallels:
        for that_parallel in parallels:
            if this_parallel != that_parallel:
                this_parallel.metadata.set_parallel_text(
                    that_parallel.metadata.get_variable("mainlang"),
                    os.path.basename(that_parallel.orig),
                )
        this_parallel.metadata.write_file()

fetch_page(address) cached

Fetch a page.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_crawler.py
30
31
32
33
34
@functools.lru_cache
def fetch_page(address):
    """Fetch a page and parse it as HTML.

    Results are kept in a ``functools.lru_cache``, so an address that is
    still in the cache is not downloaded again.

    Args:
        address: the address of the page to fetch.

    Returns:
        The document produced by ``html.document_fromstring`` from the
        text of the response.
    """
    # Without a timeout requests waits indefinitely on a stalled
    # connection, which would hang the whole crawl.
    main_content = requests.get(address, timeout=60)
    return html.document_fromstring(main_content.text)

get_books(tree)

Get the addresses for the books on bibel.no.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_crawler.py
37
38
39
40
41
42
43
44
45
46
47
def get_books(tree):
    """Collect the book addresses found in the book list table.

    Args:
        tree: parsed page containing a ``table`` with class ``booklist``.

    Returns:
        A dict with the keys ``ot`` and ``nt``, each holding the list of
        ``href`` values found for that testament, in table order.
    """
    # Position of the link within a table row -> testament it belongs to.
    testament_by_position = {1: "ot", 3: "nt"}

    books = {"ot": [], "nt": []}
    for row in tree.xpath(".//table[@class='booklist']/tr"):
        links = row.xpath("./td[@class='tablePR']/a")
        for position, link in enumerate(links):
            testament = testament_by_position.get(position)
            if testament is not None:
                books[testament].append(link.get("href"))

    return books

get_chapter_addresses(first_chapter_page)

Extract the addresses to the other chapters.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_crawler.py
50
51
52
53
54
55
def get_chapter_addresses(first_chapter_page):
    """Find the links to the other chapters on a chapter page.

    Args:
        first_chapter_page: parsed chapter page.

    Returns:
        A generator of ``(link text, href)`` tuples, one for each ``a``
        element with class ``versechapter``; the link text is stripped.
    """
    # The lookup is done here, only the pairing is left to the generator.
    chapter_links = first_chapter_page.xpath(".//a[@class='versechapter']")
    return ((link.text.strip(), link.get("href")) for link in chapter_links)

get_verses(chapter_page)

Extract the chapter content.

If the table does not exist, then this chapter does not exist. This is the case for some Sámi translations.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_crawler.py
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def _remove_keeping_tail(element):
    """Remove element from its parent, keeping the text that follows it."""
    parent = element.getparent()
    if parent is None:
        return

    if element.tail:
        # lxml drops the tail together with the removed element, so hand it
        # over to whatever precedes the element first.
        previous = element.getprevious()
        if previous is not None:
            previous.tail = (previous.tail or "") + element.tail
        else:
            parent.text = (parent.text or "") + element.tail

    parent.remove(element)


def get_verses(chapter_page):
    """Extract the chapter content.

    If the table does not exist, then this chapter does not exist. This is the
    case for some Sámi translations.

    Args:
        chapter_page: parsed chapter page.

    Returns:
        A ``body`` element containing ``verse`` elements, grouped in
        ``section`` elements from the first verse header onwards, or None
        if the page has no ``biblesingle`` table.
    """
    content_element = chapter_page.find(".//table[@class='biblesingle']/tr/td")
    if content_element is None:
        return None

    for bibleref in content_element.xpath(".//div[@class='bibleref']"):
        _remove_keeping_tail(bibleref)

    body = etree.Element("body")
    lastparent = body
    # No verse number has been seen yet; a verse that turns up before any
    # number is written without the number attribute instead of crashing.
    verse_number = None
    for element in content_element:
        element_class = element.get("class")
        if element_class in ("versenumberdropcap", "versenumber"):
            verse_number = element.get("name")
        elif element_class == "verse":
            # Collapse all runs of whitespace into single spaces.
            text = " ".join("".join(element.itertext()).split())
            if text:
                verse = etree.SubElement(lastparent, "verse")
                if verse_number is not None:
                    verse.set("number", verse_number)
                verse.text = text
        elif element_class == "verseheader" and element.text is not None:
            # Verses that follow a header are collected in a new section.
            lastparent = etree.SubElement(body, "section")
            lastparent.set("title", element.text.strip())

    return body

main()

Fetch bible texts from bibel.no

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_crawler.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def main():
    """Crawl bibel.no, book by book and chapter by chapter.

    The book list is read from the Bokmål front page. For every book the
    first chapter and each chapter linked from it are handed to
    ``fetch_other_languages``.
    """
    prefix = "https://bibel.no"
    front_page = fetch_page("https://bibel.no/nettbibelen?slang=bokmal11")

    for book_name, book_addresses in get_books(front_page).items():
        for bookindex, first_address in enumerate(book_addresses, start=1):
            book_url = f"{prefix}{first_address}"
            book_page = fetch_page(book_url)

            # The page of the book itself is saved as chapter 1.
            fetch_other_languages(book_name, bookindex, 1, book_url)

            for chapter_number, relative_address in get_chapter_addresses(book_page):
                fetch_other_languages(
                    book_name,
                    bookindex,
                    chapter_number,
                    f"{prefix}{relative_address}",
                )

save_page(language, bookname, filename, body, address)

Save chapter page.

Source code in /home/anders/projects/CorpusTools/corpustools/bibel_no_crawler.py
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def save_page(language, bookname, filename, body, address):
    """Save chapter page.

    The chapter is written as an XML file below the directory named by the
    ``GTBOUND`` environment variable, and its metadata is written next.

    Args:
        language: language code; used as a directory name and as the
            ``mainlang`` metadata variable.
        bookname: directory name placed below ``bible``.
        filename: name of the file without extension; ``.xml`` is appended.
        body: element that is appended to the ``document`` root element.
        address: stored in the ``filename`` metadata variable.

    Returns:
        The path object made by ``corpuspath.make_corpus_path``.
    """
    # Years are plain ints; "sme" used to be the float 2019.0.
    language_year = {"nob": 2011, "sme": 2019}
    # os.getenv gives None when GTBOUND is unset, which makes os.path.join
    # raise TypeError.
    name = os.path.join(
        os.getenv("GTBOUND"),
        "orig",
        language,
        "bible",
        bookname,
        "bibel.no",
        f"{filename}.xml",
    )
    # Errors from makedirs are ignored on purpose, e.g. when the directory
    # already exists.
    with util.ignored(OSError):
        os.makedirs(os.path.dirname(name))

    path = corpuspath.make_corpus_path(name)
    path.metadata.set_variable("filename", address)
    path.metadata.set_variable("mainlang", language)
    path.metadata.set_variable("genre", "bible")
    path.metadata.set_variable("monolingual", "1")
    path.metadata.set_variable("license_type", "standard")
    path.metadata.set_variable("publisher", "Det Norske Bibelselskap")
    path.metadata.set_variable("publChannel", "https://bibel.no/nettbibelen")
    # Languages without a known year get the current year.
    path.metadata.set_variable("year", language_year.get(language, datetime.now().year))

    path.metadata.write_file()
    root = etree.Element("document")
    root.append(body)

    with open(name, "wb") as page_stream:
        page_stream.write(etree.tostring(root, encoding="utf8", pretty_print=True))

    return path