
samediggi_no_crawler

This module contains routines to crawl sites containing Saami text.

SamediggiNoCrawler

Bases: Crawler

Crawl samediggi.no and save html documents to the corpus.

Source code in corpustools/samediggi_no_crawler.py
class SamediggiNoCrawler(crawler.Crawler):
    """Crawl samediggi.no and save html documents to the corpus."""

    langs = ["nob", "sma", "sme", "smj"]

    def __init__(self) -> None:
        """Initialise the SamediggiNoCrawler class."""
        super().__init__()
        self.unvisited_links.add("https://sametinget.no/")
        self.vcs = {
            lang: versioncontrol.vcs(self.corpus_parent / f"corpus-{lang}-orig")
            for lang in self.langs
        }

        self.dupe_table = self.make_dupe_dict()

    def samediggi_corpus_dirs(self) -> Iterator[Path]:
        """Return the samediggi.no corpus directory for each language."""
        return (
            self.corpus_parent / f"corpus-{lang}-orig" / "admin/sd/samediggi.no"
            for lang in self.langs
        )

    def samediggi_corpus_files(self) -> Iterator[Path]:
        """Return all html files in the samediggi.no corpus directories."""
        return (
            html_path
            for corpus_dir in self.samediggi_corpus_dirs()
            for html_path in corpus_dir.rglob("*.html")
        )

    def make_dupe_dict(self) -> dict[str, Path]:
        """Make a dict to map md5-digest to filename."""
        return {
            make_digest(fullpath.read_bytes()): fullpath
            for fullpath in self.samediggi_corpus_files()
        }

    def crawl_page(self, link) -> SamediggiNoPage | None:
        """Collect links from a page."""
        self.visited_links.add(link)
        result = requests.get(link)

        if not result.ok:
            return None

        content_type = result.headers.get("content-type")
        if content_type is None:
            return None

        if "html" not in content_type.lower():
            return None

        tree = etree.HTML(result.text)

        if tree is None:
            return None

        orig_page = SamediggiNoPage(
            result.url, tree, self.corpus_parent, self.dupe_table
        )

        orig_page.sanity_test()
        self.visited_links.add(orig_page.url)
        self.unvisited_links.update(orig_page.links)

        return orig_page

    def crawl_site(self):
        """Crawl samediggi.no."""
        while self.unvisited_links:
            link = self.unvisited_links.pop()

            if link not in self.visited_links:
                self.crawl_pageset(link)

            self.unvisited_links.difference_update(self.visited_links)

    def is_page_addable(self, page: SamediggiNoPage | None) -> bool:
        """Check whether a page can be added to the set of parallel pages."""
        if page is None:
            return False

        return page.saveable and page.claimed_lang == page.real_lang

    @staticmethod
    def set_parallel_info(parallel_pages):
        """Set the parallels for this set of parallel pages."""
        lang_combinations = (
            (parallel_page1, parallel_page2)
            for parallel_page1 in parallel_pages
            for parallel_page2 in parallel_pages
            if parallel_page1 != parallel_page2
        )

        for parallel_page1, parallel_page2 in lang_combinations:
            parallel_page1.set_parallel_file(
                parallel_page2.lang, parallel_page2.basename
            )

    def get_page_set(self, orig_page) -> list[SamediggiNoPage]:
        """Get parallel pages for the original page.

        Args:
            orig_page: The original page to get parallel pages for.

        Returns:
            A list of parallel pages.
        """
        crawled_pages = [orig_page]
        crawled_pages.extend(
            self.crawl_page(link) for link in orig_page.parallel_links
        )

        pages = [page for page in crawled_pages if self.is_page_addable(page)]

        # We are interested in the Saami pages; a Norwegian page is
        # valuable only if there is a Saami page to compare it to,
        # so discard the set if no Saami page was found.
        if not any(page.claimed_lang != "nob" for page in pages):
            return []

        return pages

    def crawl_pageset(self, link):
        """Crawl a pageset that link gives us."""

        pages = self.get_page_set(self.crawl_page(link))

        self.set_parallel_info(pages)
        for page in pages:
            self.dupe_table[page.digest] = page.corpuspath.orig
            page.save()
            self.vcs[page.real_lang].add(page.corpuspath.orig)
            self.vcs[page.real_lang].add(page.corpuspath.xsl)

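A minimal usage sketch (an assumption, not shown in the module itself: it presumes the corpus-{lang}-orig checkouts already exist under corpus_parent, since __init__ opens a version-control handle for each of them):

from corpustools.samediggi_no_crawler import SamediggiNoCrawler

crawler = SamediggiNoCrawler()
crawler.crawl_site()  # crawl until the frontier of unvisited links is empty
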
__init__()

Initialise the SamediggiNoCrawler class.

Source code in corpustools/samediggi_no_crawler.py
def __init__(self) -> None:
    """Initialise the SamediggiNoCrawler class."""
    super().__init__()
    self.unvisited_links.add("https://sametinget.no/")
    self.vcs = {
        lang: versioncontrol.vcs(self.corpus_parent / f"corpus-{lang}-orig")
        for lang in self.langs
    }

    self.dupe_table = self.make_dupe_dict()

crawl_page(link)

Fetch a page, collect its links and return the parsed page.

Source code in corpustools/samediggi_no_crawler.py
def crawl_page(self, link) -> SamediggiNoPage | None:
    """Collect links from a page."""
    self.visited_links.add(link)
    result = requests.get(link)

    if not result.ok:
        return None

    content_type = result.headers.get("content-type")
    if content_type is None:
        return None

    if "html" not in content_type.lower():
        return None

    tree = etree.HTML(result.text)

    if tree is None:
        return None

    orig_page = SamediggiNoPage(
        result.url, tree, self.corpus_parent, self.dupe_table
    )

    orig_page.sanity_test()
    self.visited_links.add(orig_page.url)
    self.unvisited_links.update(orig_page.links)

    return orig_page

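crawl_page can also be used to fetch a single page on its own. A minimal sketch (assuming the site is reachable; the attributes printed are the ones is_page_addable consults):

crawler = SamediggiNoCrawler()
page = crawler.crawl_page("https://sametinget.no/")
if page is not None:  # None signals a failed request or a non-html response
    print(page.url, page.claimed_lang, page.real_lang)
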
crawl_pageset(link)

Crawl the pageset that the link gives us.

Source code in corpustools/samediggi_no_crawler.py
def crawl_pageset(self, link):
    """Crawl a pageset that link gives us."""

    pages = self.get_page_set(self.crawl_page(link))

    self.set_parallel_info(pages)
    for page in pages:
        self.dupe_table[page.digest] = page.corpuspath.orig
        page.save()
        self.vcs[page.real_lang].add(page.corpuspath.orig)
        self.vcs[page.real_lang].add(page.corpuspath.xsl)

crawl_site()

Crawl samediggi.no.

Source code in corpustools/samediggi_no_crawler.py
def crawl_site(self):
    """Crawl samediggi.no."""
    while self.unvisited_links:
        link = self.unvisited_links.pop()

        if link not in self.visited_links:
            self.crawl_pageset(link)

        self.unvisited_links.difference_update(self.visited_links)

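Design note: unvisited_links is a set, so pages are crawled in no particular order. Pruning visited links with difference_update after each pageset keeps the frontier from re-queuing pages that crawl_pageset has already handled.
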
get_page_set(orig_page)

Get parallel pages for the original page.

Parameters:

    orig_page: The original page to get parallel pages for. Required.

Returns:

    list[SamediggiNoPage]: A list of parallel pages.

Source code in corpustools/samediggi_no_crawler.py
def get_page_set(self, orig_page) -> list[SamediggiNoPage]:
    """Get parallel pages for the original page.

    Args:
        orig_page: The original page to get parallel pages for.

    Returns:
        A list of parallel pages.
    """
    crawled_pages = [orig_page]
    crawled_pages.extend(
        self.crawl_page(link) for link in orig_page.parallel_links
    )

    pages = [page for page in crawled_pages if self.is_page_addable(page)]

    # We are interested in the Saami pages; a Norwegian page is
    # valuable only if there is a Saami page to compare it to,
    # so discard the set if no Saami page was found.
    if not any(page.claimed_lang != "nob" for page in pages):
        return []

    return pages

is_page_addable(page)

Check whether a page can be added to the set of parallel pages.

Source code in corpustools/samediggi_no_crawler.py
def is_page_addable(self, page: SamediggiNoPage | None) -> bool:
    """Check whether a page can be added to the set of parallel pages."""
    if page is None:
        return False

    return page.saveable and page.claimed_lang == page.real_lang

make_dupe_dict()

Make a dict that maps md5 digests to filenames.

Source code in corpustools/samediggi_no_crawler.py
def make_dupe_dict(self) -> dict[str, Path]:
    """Make a dict to map md5-digest to filename."""
    return {
        make_digest(fullpath.read_bytes()): fullpath
        for fullpath in self.samediggi_corpus_files()
    }

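make_digest is imported from elsewhere in corpustools and is not shown on this page. A minimal sketch of an md5 helper consistent with how it is used above (the real implementation may differ):

import hashlib

def make_digest(content: bytes) -> str:
    # Hex md5 digest of the raw document bytes, used as the
    # duplicate-detection key in dupe_table.
    return hashlib.md5(content).hexdigest()
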
set_parallel_info(parallel_pages) staticmethod

Set the parallels for this set of parallel pages.

Source code in corpustools/samediggi_no_crawler.py
@staticmethod
def set_parallel_info(parallel_pages):
    """Set the parallels for this set of parallel pages."""
    lang_combinations = (
        (parallel_page1, parallel_page2)
        for parallel_page1 in parallel_pages
        for parallel_page2 in parallel_pages
        if parallel_page1 != parallel_page2
    )

    for parallel_page1, parallel_page2 in lang_combinations:
        parallel_page1.set_parallel_file(
            parallel_page2.lang, parallel_page2.basename
        )
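
The nested generator enumerates every ordered pair of distinct pages. An equivalent formulation using the standard library (valid as long as the pages compare pairwise unequal, which holds for distinct page objects) would be:

from itertools import permutations

for parallel_page1, parallel_page2 in permutations(parallel_pages, 2):
    parallel_page1.set_parallel_file(parallel_page2.lang, parallel_page2.basename)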