nrk_no_crawler

This file contains routines to crawl pages on nrk.no that contain Saami text.

NrkNoCrawler

Bases: Crawler

Collect pages from nrk.no.

Source code in corpustools/nrk_no_crawler.py
class NrkNoCrawler(Crawler):
    """Collect pages from nrk.no."""

    langs: list[str] = ["sme", "sma", "smj", "nob"]
    limit: int = 1000
    counter: defaultdict[str, int] = defaultdict(int)

    def __init__(self) -> None:
        super().__init__()
        print("init nrk.no")
        self.visited_links = self.get_fetched_ids()
        print("visited links:", len(self.visited_links))
        self.unvisited_links = self.fetchable_ids()
        print("unvisited links:", len(self.unvisited_links))
        self.vcs = {
            lang: vcs(self.corpus_parent / f"corpus-{lang}-orig-x-closed")
            for lang in self.langs
        }

    def get_article_ids(self) -> set[str]:
        """Get article ids from NRK Sápmi.

        Returns:
            A set of article ids.
        """
        json_sources = [
            f"https://www.nrk.no/serum/api/content/json/1.11160953?start=2&limit={self.limit}",  # https://www.nrk.no/sapmi/nyheter/
            f"https://www.nrk.no/serum/api/content/json/1.13572949?start=2&limit={self.limit}&context=items",  # https://www.nrk.no/sapmi/davvisamegillii/
            f"https://www.nrk.no/serum/api/content/json/1.13572946?start=2&limit={self.limit}&context=items",  # https://www.nrk.no/sapmi/julevsabmaj/
            f"https://www.nrk.no/serum/api/content/json/1.13572943?start=2&limit={self.limit}&context=items",  # https://www.nrk.no/sapmi/aaarjelsaemiengielesne/
        ]

        responses = (requests.get(url) for url in json_sources)

        response_jsons = (response.json() for response in responses)

        return {
            relation.get("id")
            for data in response_jsons
            for relation in data.get("relations")
        }

    def fetchable_ids(self) -> set[str]:
        article_ids = self.get_article_ids()
        return article_ids - self.visited_links

    def get_fetched_ids(self) -> set[str]:
        """Find article ids of already fetched documents.

        Returns:
            A set of strings, where the strings are ids of the
            fetched articles.
        """
        corpus_dirs = [
            self.corpus_parent / f"corpus-{lang}-orig-x-closed" / "news/nrk.no"
            for lang in self.langs
        ]

        return {
            file_.stem.replace(".html", "").split("-")[-1]
            for path in corpus_dirs
            for file_ in Path(path).glob("*.xsl")
        }

    def crawl_page(self, article_id: str) -> NrkNoPage | None:
        """Collect links from a page."""
        self.visited_links.add(article_id)
        try:
            result = requests.get(f"https://nrk.no/sapmi/{article_id}", timeout=10)
        except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout):
            return None

        if not result.ok:
            return None

        content_type = result.headers.get("content-type")
        if content_type is None:
            return None

        if "html" not in content_type.lower():
            return None

        tree = etree.HTML(result.text)

        if tree is None:
            return None

        orig_page = NrkNoPage(result.url, tree, self.corpus_parent)

        self.unvisited_links.update(orig_page.links)

        return orig_page

    def crawl_site(self):
        print("Crawling nrk.no.")
        while self.unvisited_links:
            article_id = self.unvisited_links.pop()
            if article_id not in self.visited_links:
                try:
                    self.crawl_pageset(article_id)
                except NrkNoUnknownPageError as error:
                    print(f"Error: {error}")
                sleep(0.5)

            self.unvisited_links.difference_update(self.visited_links)
            print(
                article_id,
                "U:",
                len(self.unvisited_links),
                "V:",
                len(self.visited_links),
                end="\r",
            )

        pprint(self.counter)

    @staticmethod
    def set_parallel_info(parallel_pages):
        """Set the parallels for this set of parallel pages."""
        lang_combinations = (
            (parallel_page1, parallel_page2)
            for parallel_page1 in parallel_pages
            for parallel_page2 in parallel_pages
            if parallel_page1 != parallel_page2
        )

        for parallel_page1, parallel_page2 in lang_combinations:
            parallel_page1.set_parallel_file(
                parallel_page2.lang, parallel_page2.basename
            )

    def crawl_pageset(self, article_id: str) -> None:
        orig_page = self.crawl_page(article_id)
        if orig_page is None:
            print(f"Could not crawl {article_id}.")
            return

        pages = self.get_page_set(orig_page=orig_page)

        self.set_parallel_info(pages)
        for page in pages:
            page.save()
            self.vcs[page.lang].add(page.fullpath.orig)
            self.vcs[page.lang].add(page.fullpath.xsl)
            self.counter[page.lang] += 1

    def get_page_set(self, orig_page) -> list[NrkNoPage]:
        """Get parallel pages for the original page.

        Args:
            orig_page: The original page to get parallel pages for.

        Returns:
            A list of parallel pages.
        """
        pages = [orig_page]
        pages.extend([self.crawl_page(link) for link in orig_page.parallel_ids])

        # If we only have Norwegian, we don't want to save any pages
        page_langs = {page.lang for page in pages if page is not None}
        if len(page_langs) == 1 and "nob" in page_langs:
            return []

        return [page for page in pages if page is not None]
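
A minimal usage sketch, assuming the class is imported from corpustools.nrk_no_crawler and that the corpus-{lang}-orig-x-closed directories expected by the crawler already exist under the corpus parent directory:

from corpustools.nrk_no_crawler import NrkNoCrawler

# Instantiating the crawler collects the ids of already fetched articles
# and the ids listed by the NRK Sápmi JSON endpoints that remain to fetch.
crawler = NrkNoCrawler()

# Crawl until no unvisited article ids remain; each saved page is added
# to the version control system for its language.
crawler.crawl_site()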

crawl_page(article_id)

Collect links from a page.

Source code in corpustools/nrk_no_crawler.py
def crawl_page(self, article_id: str) -> NrkNoPage | None:
    """Collect links from a page."""
    self.visited_links.add(article_id)
    try:
        result = requests.get(f"https://nrk.no/sapmi/{article_id}", timeout=10)
    except (requests.exceptions.ConnectionError, requests.exceptions.ReadTimeout):
        return None

    if not result.ok:
        return None

    content_type = result.headers.get("content-type")
    if content_type is None:
        return None

    if "html" not in content_type.lower():
        return None

    tree = etree.HTML(result.text)

    if tree is None:
        return None

    orig_page = NrkNoPage(result.url, tree, self.corpus_parent)

    self.unvisited_links.update(orig_page.links)

    return orig_page
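
A hedged sketch of calling crawl_page on its own; the article id below is invented for illustration:

page = crawler.crawl_page("1.16000000")  # hypothetical article id
if page is None:
    # The request failed, timed out, or the response was not HTML.
    print("Could not fetch the article.")
else:
    print(page.lang, page.basename)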

get_article_ids()

Get article ids from NRK Sápmi.

Returns:

    set[str]: A set of article ids.

Source code in corpustools/nrk_no_crawler.py
def get_article_ids(self) -> set[str]:
    """Get article ids from NRK Sápmi.

    Returns:
        A set of article ids.
    """
    json_sources = [
        f"https://www.nrk.no/serum/api/content/json/1.11160953?start=2&limit={self.limit}",  # https://www.nrk.no/sapmi/nyheter/
        f"https://www.nrk.no/serum/api/content/json/1.13572949?start=2&limit={self.limit}&context=items",  # https://www.nrk.no/sapmi/davvisamegillii/
        f"https://www.nrk.no/serum/api/content/json/1.13572946?start=2&limit={self.limit}&context=items",  # https://www.nrk.no/sapmi/julevsabmaj/
        f"https://www.nrk.no/serum/api/content/json/1.13572943?start=2&limit={self.limit}&context=items",  # https://www.nrk.no/sapmi/aaarjelsaemiengielesne/
    ]

    responses = (requests.get(url) for url in json_sources)

    response_jsons = (response.json() for response in responses)

    return {
        relation.get("id")
        for data in response_jsons
        for relation in data.get("relations")
    }
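
The set comprehension above only depends on a relations list whose entries carry an id. A sketch of the assumed shape of one serum API response, with invented id values:

# Assumed, simplified response shape; only the fields the comprehension
# reads are shown, and the id values are made up.
example_response = {
    "relations": [
        {"id": "1.16000001"},
        {"id": "1.16000002"},
    ]
}

article_ids = {
    relation.get("id") for relation in example_response.get("relations")
}
# article_ids == {"1.16000001", "1.16000002"}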

get_fetched_ids()

Find article ids of already fetched documents.

Returns:

    set[str]: A set of strings, where the strings are ids of the fetched articles.

Source code in corpustools/nrk_no_crawler.py
def get_fetched_ids(self) -> set[str]:
    """Find article ids of already fetched documents.

    Returns:
        A set of strings, where the strings are ids of the
        fetched articles.
    """
    corpus_dirs = [
        self.corpus_parent / f"corpus-{lang}-orig-x-closed" / "news/nrk.no"
        for lang in self.langs
    ]

    return {
        file_.stem.replace(".html", "").split("-")[-1]
        for path in corpus_dirs
        for file_ in Path(path).glob("*.xsl")
    }
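
The id is recovered by stripping the .html part of the file stem and taking the last dash-separated segment. A small sketch with an invented filename (the real naming scheme may differ):

from pathlib import Path

file_ = Path("some-article-title-1.16000001.html.xsl")  # invented example
article_id = file_.stem.replace(".html", "").split("-")[-1]
# file_.stem is "some-article-title-1.16000001.html", so article_id == "1.16000001"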

get_page_set(orig_page)

Get parallel pages for the original page.

Parameters:

    orig_page: The original page to get parallel pages for. (required)

Returns:

    list[NrkNoPage]: A list of parallel pages.

Source code in corpustools/nrk_no_crawler.py
def get_page_set(self, orig_page) -> list[NrkNoPage]:
    """Get parallel pages for the original page.

    Args:
        orig_page: The original page to get parallel pages for.

    Returns:
        A list of parallel pages.
    """
    pages = [orig_page]
    pages.extend([self.crawl_page(link) for link in orig_page.parallel_ids])

    # If we only have Norwegian, we don't want to save any pages
    page_langs = {page.lang for page in pages if page is not None}
    if len(page_langs) == 1 and "nob" in page_langs:
        return []

    return [page for page in pages if page is not None]
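
A small illustration of the Norwegian-only filter, using stand-in objects that only carry a lang attribute (real NrkNoPage instances carry much more):

from types import SimpleNamespace

pages = [SimpleNamespace(lang="nob"), None, SimpleNamespace(lang="nob")]
page_langs = {page.lang for page in pages if page is not None}
# page_langs == {"nob"}: only Norwegian was found, so the whole page set
# is discarded and nothing is saved.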

set_parallel_info(parallel_pages) staticmethod

Set the parallels for this set of parallel pages.

Source code in corpustools/nrk_no_crawler.py
@staticmethod
def set_parallel_info(parallel_pages):
    """Set the parallels for this set of parallel pages."""
    lang_combinations = (
        (parallel_page1, parallel_page2)
        for parallel_page1 in parallel_pages
        for parallel_page2 in parallel_pages
        if parallel_page1 != parallel_page2
    )

    for parallel_page1, parallel_page2 in lang_combinations:
        parallel_page1.set_parallel_file(
            parallel_page2.lang, parallel_page2.basename
        )
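
A sketch of the pairwise loop with stand-in page objects; FakePage is invented here purely to show which parallel information each page ends up with:

class FakePage:
    """Stand-in for NrkNoPage, only for illustrating set_parallel_info."""

    def __init__(self, lang, basename):
        self.lang = lang
        self.basename = basename
        self.parallels = {}

    def set_parallel_file(self, lang, basename):
        self.parallels[lang] = basename


pages = [FakePage("sme", "article_sme.html"), FakePage("nob", "article_nob.html")]
NrkNoCrawler.set_parallel_info(pages)
# pages[0].parallels == {"nob": "article_nob.html"}
# pages[1].parallels == {"sme": "article_sme.html"}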