nrk_no_crawler

This module contains routines to crawl pages on nrk.no that contain Saami text.

NrkSmeCrawler

Collect Northern Saami pages from nrk.no.

Attributes:

    language_guesser (text_cat.Classifier): guess language from a given string
    goaldir (str): the directory where the working copy of the corpus is
    corpus_adder (adder.AddToCorpus): the workhorse, adds urls to the corpus
    tags (dict of str to str): numerical tags that point to a specific topic on nrk.no
    invalid_links (set of str): all links containing 'gammelsystem'
    counter (collections.defaultdict of int): collect interesting statistics, such as the number of links visited and links fetched within a tag
    fetched_links (set of str): links to articles that have already been fetched
    authors (set of str): authors of articles
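
A minimal usage sketch, assuming the GTBOUND environment variable points at the working copy of the corpus (the class attributes above, including corpus_adder, are created at import time from that variable):

# Hypothetical usage sketch; GTBOUND must be set before the module is imported.
from corpustools.nrk_no_crawler import NrkSmeCrawler

crawler = NrkSmeCrawler()  # loads the ids of already fetched articles
crawler.crawl_site()       # crawls oanehaččat and the known tags, then prints a report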

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
class NrkSmeCrawler:
    """Collect Northern Saami pages from nrk.no.

    Attributes:
        language_guesser (text_cat.Classifier): guess language from a given
            string
        goaldir (str): the directory where the working copy of the corpus is
        corpus_adder (adder.AddToCorpus): the workhorse, adds urls to
            the corpus
        tags (dict of str to str): numerical tags that point to a specific
            topic on nrk.no
        invalid_links (set of str): all links containing 'gammelsystem'
        counter (collections.defaultdict of int): collect interesting
            statistics, such as the number of links visited and links
            fetched within a tag
        fetched_links (set of str): links to articles that have already been
            fetched
        authors (set of str): authors of articles
    """

    language_guesser = text_cat.Classifier(None)
    goaldir = str(os.getenv("GTBOUND"))
    corpus_adder = adder.AddToCorpus(goaldir, "sme", "news/nrk.no")
    tags = defaultdict(str)
    invalid_links = set()
    counter = defaultdict(int)
    authors = set()

    def __init__(self):
        """Initialise the NrkSmeCrawler class."""
        self.fetched_ids = self.get_fetched_links(self.corpus_adder.goaldir)
        # Ids containing norwegian text
        self.fetched_ids |= {
            "1.11060139",
            "1.11205504",
            "1.11518300",
            "1.11526579",
            "1.11876027",
            "1.11909062",
            "1.12274706",
            "1.13050654",
            "1.13077542",
            "1.13599435",
            "1.13683886",
            "1.13683979",
            "1.13684081",
            "1.2265333",
            "1.4708759",
            "1.4837038",
            "1.5174999",
            "1.6129908",
            "1.6431307",
            "1.6439563",
            "1.6468432",
            "1.6469363",
            "1.6538125",
            "1.6563405",
            "1.6776103",
            "1.6784213",
            "1.6857178",
            "1.7066094",
            "1.7222473",
            "1.7391316",
            "1.7397359",
            "1.7826351",
            "1.7971308",
            "1.7990373",
            "1.8065147",
            "1.8231915",
            "1.8239588",
            "1.8836268",
            "1.4178483",
            "1.6474023",
            "1.7096768",
            "1.12593187",
            "1.6479890",
            "1.6136593",
            "1.6602458",
        }

    def guess_lang(self, address):
        """Guess the language of the address element.

        Args:
            address (html.Element): An element where interesting text is found

        Returns:
            (str): str containing the language of the text
        """
        # This bytes hoopla is done because the text
        # comes out as utf8 encoded as latin1 …
        try:
            text = bytes(
                address.find('.//p[@class="plug-preamble"]').text, encoding="latin1"
            )
        except AttributeError:
            text = bytes(address.find('.//h2[@class="title"]').text, encoding="latin1")
        lang = self.language_guesser.classify(text)
        if lang == "sme":
            util.print_frame(text)

        return lang

    def get_tag_page_trees(self, tag):
        """Fetch topic pages containing links to articles.

        By using the page_links_template, one can fetch `quantity` number of
        links to articles within `tag` at a time.

        Attributes:
            page_links_template: a url to a specific topic on nrk.no.
            quantity (int): the number of links to fetch at a time
            limit (int): max number of links that one tries to fetch

        Args:
            tag (str): a numerical tag, pointing to a specific topic on nrk.no

        Yields:
            (lxml.html.HtmlElement): a parsed html document.
        """
        page_links_template = (
            "https://www.nrk.no/serum/api/render/{tag}?"
            "size=18&perspective=BRIEF&alignment=AUTO&"
            "classes=surrogate-content&"
            "display=false&arrangement.offset={offset}&"
            "arrangement.quantity={quantity}&"
            "arrangement.repetition=PATTERN&"
            "arrangement.view[0].perspective=BRIEF&"
            "arrangement.view[0].size=6&"
            "arrangement.view[0].alignment=LEFT&"
            "paged=SIMPLE"
        )
        quantity = 10
        limit = 10000

        for offset in range(0, limit, quantity):
            print(".", end="")
            sys.stdout.flush()
            try:
                result = requests.get(
                    page_links_template.format(
                        tag=tag, offset=offset, quantity=quantity
                    )
                )
            except requests.exceptions.ConnectionError:
                util.note(f"Connection error when fetching {tag}")
                break
            else:
                try:
                    yield html.document_fromstring(result.content)
                except etree.ParserError:
                    util.note(f"No more articles in tag: «{self.tags[tag]}»")
                    break

    def interesting_links(self, tag):
        """Find interesting pages inside a topic.

        Args:
            tag (str): a numerical tag pointing to a specific topic.

        Yields:
            (str): a url to an nrk.no article
        """
        for tree in self.get_tag_page_trees(tag):
            for address in tree.xpath('//a[@class="autonomous lp_plug"]'):
                self.counter[tag + "_total"] += 1
                href = address.get("href")
                article_id = href.strip().split("-")[-1]
                if "systemtest" in href:
                    self.invalid_links.add(href)
                if (
                    "systemtest" not in href
                    and article_id not in self.fetched_ids
                    and self.guess_lang(address) == "sme"
                ):
                    self.counter[tag + "_fetched"] += 1
                    yield href

    @staticmethod
    def pick_tags(path):
        """Find tags in an nrk.no article.

        Tags potentially contain more Northern Sámi articles.

        Args:
            path (str): path to an nrk.no article

        Yields:
            (tuple[str, str]): a numerical tag, used internally by nrk.no to
                point to a specific topic and a short description of the topic.
        """
        article = html.parse(path)

        for address in article.xpath(
            '//a[@class="universe widget reference article-universe-link '
            'universe-teaser skin-border skin-text lp_universe_link"]'
        ):
            href = address.get("href")
            yield href[href.rfind("-") + 1 :], address[0].tail.strip()

    def crawl_tag(self, tag, tagname):
        """Look for articles in nrk.no tags.

        Args:
            tag (str): an internal nrk.no tag
            tagname (str): a short description of the tag
        """
        if tag not in self.tags:
            util.note(f"Fetching articles from «{tagname}»")
            self.tags[tag] = tagname
            for href in self.interesting_links(tag):
                self.add_nrk_article(href)

            self.counter["total"] += self.counter[tag + "_total"]
            self.counter["fetched"] += self.counter[tag + "_fetched"]

    def add_nrk_article(self, href):
        """Copy an article to the working copy.

        Args:
            href (str): a url to an nrk article.
        """
        self.fetched_ids.add(href.split("-")[-1])
        try:
            path = self.corpus_adder.copy_url_to_corpus(href)
            self.add_metadata(path)
        except (
            requests.exceptions.TooManyRedirects,
            adder.AdderError,
            UserWarning,
        ) as error:
            util.note(href)
            util.note(error)

    def crawl_site(self):
        """Fetch Northern Saami pages from nrk.no."""
        self.crawl_oanehaccat()
        self.crawl_existing_tags()
        # self.crawl_authors()
        # self.corpus_adder.add_files_to_working_copy()
        self.report()

    def find_nrk_files(self):
        """Find all nrk.no files."""
        for root, _, files in os.walk(self.corpus_adder.goaldir):
            for file_ in files:
                if file_.endswith(".html"):
                    yield os.path.join(root, file_)

    def crawl_existing_tags(self):
        """Crawl all tags found in nrk.no documents."""
        for nrk_file in self.find_nrk_files():
            for additional_tag, tag_name in self.pick_tags(nrk_file):
                self.crawl_tag(additional_tag, tag_name)

    def crawl_oanehaccat(self):
        """Crawl short news, provided by a json dict.

        This feed only contains Northern Sámi articles.
        """
        util.note("Fetching articles from {}".format("oanehaččat"))
        self.tags["oanehaččat"] = "oanehaččat"
        oanehaccat = requests.get(
            "https://www.nrk.no/serum/api/content/json/"
            "1.13572949?v=2&limit=1000&context=items"
        )
        for relation in oanehaccat.json()["relations"]:
            self.counter["oanehaččat_total"] += 1
            if relation["id"] not in self.fetched_ids:
                self.counter["oanehaččat_fetched"] += 1
                self.add_nrk_article(
                    "https://www.nrk.no/sapmi/{}".format(relation["id"])
                )

        self.counter["total"] += self.counter["oanehaččat_total"]
        self.counter["fetched"] += self.counter["oanehaččat_fetched"]

    def handle_search_hits(self, hits):
        """Decide whether articles found in search results should be saved."""
        for hit in hits:
            if hit["url"].split("-")[-1] not in self.fetched_ids and hit.get(
                "description"
            ):
                lang = self.language_guesser.classify(hit["description"])
                if lang == "sme":
                    util.print_frame(len(hit["description"]), hit["description"], "\n")
                    if len(hit["description"]) > 15:
                        self.counter["authors_fetched"] += 1
                        self.add_nrk_article(hit["url"])

    def crawl_authors(self):
        """Search for authors on nrk.no.

        Not all articles are represented under the tags found, so
        a search on author names is also done.
        """
        self.tags["authors"] = "authors"
        for nrk_file in self.find_nrk_files():
            self.counter["nrk_file"] += 1
            article = html.parse(nrk_file)
            for author_body in article.xpath('.//div[@class="author__body"]'):
                self.counter["author__body"] += 1
                author_name = author_body.find('./a[@class="author__name"]')
                if author_name is not None and author_name.text is not None:
                    self.authors.add(author_name.text.strip().split()[-1].lower())
                    self.counter["name"] += 1

        for author_parts in self.authors:
            util.print_frame(author_parts, "\n")
            index = 0
            total = 100001
            while True:
                hits = self.get_search_page(
                    "https://www.nrk.no/sok/?format=json&scope=nrkno"
                    "&filter=nrkno&q={}&from={}".format(author_parts, str(index))
                )
                if not hits:
                    util.print_frame("empty hits, should break")
                    break
                if int(hits["total"]) < total:
                    total = int(hits["total"])
                self.handle_search_hits(hits["hits"])
                if index > total:
                    break
                index += 20

        self.counter["fetched"] += self.counter["authors_fetched"]

    @staticmethod
    def get_search_page(search_link):
        """Get search pages, containing links to author search.

        Args:
            search_link (str): query string to nrk.no

        Returns:
            (dict): dict containing the search results
        """
        result = requests.get(search_link)
        content = result.content.decode("utf8")

        try:
            return json.loads(content)
        except json.decoder.JSONDecodeError:
            util.print_frame(search_link)
            util.print_frame(result)
            util.print_frame(content)

            if content:
                return {"hits": [], "from": "-1", "total": "100000"}
            else:
                return content

    def report(self):
        """Print a report on what was found."""
        print(f"{len(self.invalid_links)} invalid links.")
        for invalid_link in self.invalid_links:
            print(invalid_link)
        print()
        print(f"Searched through {len(self.tags)} tags")
        print(f"Searched through {len(self.authors)} authors")
        print("Fetched {fetched} pages".format(**self.counter))
        for tag in self.tags:
            if self.counter[tag + "_fetched"]:
                print(
                    "Fetched {} articles from category {} from nrk.no".format(
                        self.counter[tag + "_fetched"], self.tags[tag]
                    )
                )

    @staticmethod
    def valid_authors(article):
        """Find authors with the correct roles.

        Args:
            article (etree.Element): The parsed html document.

        Yields:
            (tuple[str, ...]): Authors
        """
        for author_role in article.xpath('.//span[@class="author__role"]'):
            # Guard against missing role text before stripping
            text = (author_role.text or "").strip()
            if text and (
                text.startswith("Journ")
                or text.startswith("Komm")
                or text.startswith("Arti")
            ):
                parts = author_role.getprevious().text.strip().split()

                yield parts

    def add_metadata(self, path):
        """Get metadata from the nrk.no article.

        Args:
            path (str): path to the nrk.no article
        """
        article = html.parse(path)
        metadata = xslsetter.MetadataHandler(path + ".xsl")

        for count, author_parts in enumerate(self.valid_authors(article), start=1):
            metadata.set_variable("author" + str(count) + "_ln", author_parts[-1])
            metadata.set_variable(
                "author" + str(count) + "_fn", " ".join(author_parts[:-1])
            )

        time = article.find('//time[@itemprop="datePublished"]')
        if time is None:
            time = article.find('//time[@class="relative bulletin-time"]')
        date = dateutil.parser.parse(time.get("datetime"))
        metadata.set_variable("year", date.year)

        metadata.set_variable("publisher", "NRK")
        metadata.set_variable("license_type", "standard")
        metadata.write_file()

    @staticmethod
    def get_fetched_links(path):
        """Find fetched links.

        Args:
            path (str): path to the directory where nrk articles are found.

        Returns:
            (set[str]): Set of strings, where the strings are ids to the
                article.
        """
        return {
            xslsetter.MetadataHandler(os.path.join(root, file_))
            .get_variable("filename")
            .split("-")[-1]
            for root, _, files in os.walk(path)
            for file_ in files
            if file_.endswith(".xsl")
        }

__init__()

Initialise the NrkSmeCrawler class.

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def __init__(self):
    """Initialise the NrkSmeCrawler class."""
    self.fetched_ids = self.get_fetched_links(self.corpus_adder.goaldir)
    # Ids containing norwegian text
    self.fetched_ids |= {
        "1.11060139",
        "1.11205504",
        "1.11518300",
        "1.11526579",
        "1.11876027",
        "1.11909062",
        "1.12274706",
        "1.13050654",
        "1.13077542",
        "1.13599435",
        "1.13683886",
        "1.13683979",
        "1.13684081",
        "1.2265333",
        "1.4708759",
        "1.4837038",
        "1.5174999",
        "1.6129908",
        "1.6431307",
        "1.6439563",
        "1.6468432",
        "1.6469363",
        "1.6538125",
        "1.6563405",
        "1.6776103",
        "1.6784213",
        "1.6857178",
        "1.7066094",
        "1.7222473",
        "1.7391316",
        "1.7397359",
        "1.7826351",
        "1.7971308",
        "1.7990373",
        "1.8065147",
        "1.8231915",
        "1.8239588",
        "1.8836268",
        "1.4178483",
        "1.6474023",
        "1.7096768",
        "1.12593187",
        "1.6479890",
        "1.6136593",
        "1.6602458",
    }

add_metadata(path)

Get metadata from the nrk.no article.

Parameters:

    path (str): path to the nrk.no article. Required.
Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def add_metadata(self, path):
    """Get metadata from the nrk.no article.

    Args:
        path (str): path to the nrk.no article
    """
    article = html.parse(path)
    metadata = xslsetter.MetadataHandler(path + ".xsl")

    for count, author_parts in enumerate(self.valid_authors(article), start=1):
        metadata.set_variable("author" + str(count) + "_ln", author_parts[-1])
        metadata.set_variable(
            "author" + str(count) + "_fn", " ".join(author_parts[:-1])
        )

    time = article.find('//time[@itemprop="datePublished"]')
    if time is None:
        time = article.find('//time[@class="relative bulletin-time"]')
    date = dateutil.parser.parse(time.get("datetime"))
    metadata.set_variable("year", date.year)

    metadata.set_variable("publisher", "NRK")
    metadata.set_variable("license_type", "standard")
    metadata.write_file()

add_nrk_article(href)

Copy an article to the working copy.

Parameters:

    href (str): a url to an nrk article. Required.
Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def add_nrk_article(self, href):
    """Copy an article to the working copy.

    Args:
        href (str): a url to an nrk article.
    """
    self.fetched_ids.add(href.split("-")[-1])
    try:
        path = self.corpus_adder.copy_url_to_corpus(href)
        self.add_metadata(path)
    except (
        requests.exceptions.TooManyRedirects,
        adder.AdderError,
        UserWarning,
    ) as error:
        util.note(href)
        util.note(error)

crawl_authors()

Search for authors on nrk.no.

Not all articles are represented under the tags found, so a search on author names is also done.

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def crawl_authors(self):
    """Search for authors on nrk.no.

    Not all articles are represented under the tags found, so
    a search on author names is also done.
    """
    self.tags["authors"] = "authors"
    for nrk_file in self.find_nrk_files():
        self.counter["nrk_file"] += 1
        article = html.parse(nrk_file)
        for author_body in article.xpath('.//div[@class="author__body"]'):
            self.counter["author__body"] += 1
            author_name = author_body.find('./a[@class="author__name"]')
            if author_name is not None and author_name.text is not None:
                self.authors.add(author_name.text.strip().split()[-1].lower())
                self.counter["name"] += 1

    for author_parts in self.authors:
        util.print_frame(author_parts, "\n")
        index = 0
        total = 100001
        while True:
            hits = self.get_search_page(
                "https://www.nrk.no/sok/?format=json&scope=nrkno"
                "&filter=nrkno&q={}&from={}".format(author_parts, str(index))
            )
            if not hits:
                util.print_frame("empty hits, should break")
                break
            if int(hits["total"]) < total:
                total = int(hits["total"])
            self.handle_search_hits(hits["hits"])
            if index > total:
                break
            index += 20

    self.counter["fetched"] += self.counter["authors_fetched"]

crawl_existing_tags()

Crawl all tags found in nrk.no documents.

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def crawl_existing_tags(self):
    """Crawl all tags found in nrk.no documents."""
    for nrk_file in self.find_nrk_files():
        for additional_tag, tag_name in self.pick_tags(nrk_file):
            self.crawl_tag(additional_tag, tag_name)

crawl_oanehaccat()

Crawl short news, provided by a json dict.

This feed only contains Northern Sámi articles.
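
The short-news feed is read as JSON; below is a minimal sketch of the access pattern the crawler relies on (the endpoint and field names are taken from the source shown after it):

import requests

# Same feed endpoint as crawl_oanehaccat; only the "relations" list and each
# relation's "id" field are used by the crawler.
feed = requests.get(
    "https://www.nrk.no/serum/api/content/json/"
    "1.13572949?v=2&limit=1000&context=items"
).json()
article_ids = [relation["id"] for relation in feed["relations"]]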

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def crawl_oanehaccat(self):
    """Crawl short news, provided by a json dict.

    This feed only contains Northern Sámi articles.
    """
    util.note("Fetching articles from {}".format("oanehaččat"))
    self.tags["oanehaččat"] = "oanehaččat"
    oanehaccat = requests.get(
        "https://www.nrk.no/serum/api/content/json/"
        "1.13572949?v=2&limit=1000&context=items"
    )
    for relation in oanehaccat.json()["relations"]:
        self.counter["oanehaččat_total"] += 1
        if relation["id"] not in self.fetched_ids:
            self.counter["oanehaččat_fetched"] += 1
            self.add_nrk_article(
                "https://www.nrk.no/sapmi/{}".format(relation["id"])
            )

    self.counter["total"] += self.counter["oanehaččat_total"]
    self.counter["fetched"] += self.counter["oanehaččat_fetched"]

crawl_site()

Fetch Northern Saami pages from nrk.no.

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def crawl_site(self):
    """Fetch Northern Saami pages from nrk.no."""
    self.crawl_oanehaccat()
    self.crawl_existing_tags()
    # self.crawl_authors()
    # self.corpus_adder.add_files_to_working_copy()
    self.report()

crawl_tag(tag, tagname)

Look for articles in nrk.no tags.

Parameters:

    tag (str): an internal nrk.no tag. Required.
    tagname (str): a short description of the tag. Required.
Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def crawl_tag(self, tag, tagname):
    """Look for articles in nrk.no tags.

    Args:
        tag (str): an internal nrk.no tag
        tagname (str): a short description of the tag
    """
    if tag not in self.tags:
        util.note(f"Fetching articles from «{tagname}»")
        self.tags[tag] = tagname
        for href in self.interesting_links(tag):
            self.add_nrk_article(href)

        self.counter["total"] += self.counter[tag + "_total"]
        self.counter["fetched"] += self.counter[tag + "_fetched"]

find_nrk_files()

Find all nrk.no files.

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def find_nrk_files(self):
    """Find all nrk.no files."""
    for root, _, files in os.walk(self.corpus_adder.goaldir):
        for file_ in files:
            if file_.endswith(".html"):
                yield os.path.join(root, file_)

get_fetched_links(path) staticmethod

Find fetched links.

Parameters:

    path (str): path to the directory where nrk articles are found. Required.

Returns:

    set[str]: Set of strings, where the strings are ids to the article.

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
@staticmethod
def get_fetched_links(path):
    """Find fetched links.

    Args:
        path (str): path to the directory where nrk articles are found.

    Returns:
        (set[str]): Set of strings, where the strings are ids to the
            article.
    """
    return {
        xslsetter.MetadataHandler(os.path.join(root, file_))
        .get_variable("filename")
        .split("-")[-1]
        for root, _, files in os.walk(path)
        for file_ in files
        if file_.endswith(".xsl")
    }

get_search_page(search_link) staticmethod

Get a search page containing results from an author search.

Parameters:

    search_link (str): query string to nrk.no. Required.

Returns:

    dict: dict containing the search results

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
@staticmethod
def get_search_page(search_link):
    """Get search pages, containing links to author search.

    Args:
        search_link (str): query string to nrk.no

    Returns:
        (dict): dict containing the search results
    """
    result = requests.get(search_link)
    content = result.content.decode("utf8")

    try:
        return json.loads(content)
    except json.decoder.JSONDecodeError:
        util.print_frame(search_link)
        util.print_frame(result)
        util.print_frame(content)

        if content:
            return {"hits": [], "from": "-1", "total": "100000"}
        else:
            return content

get_tag_page_trees(tag)

Fetch topic pages containing links to articles.

By using the page_links_template, one can fetch quantity number of links to articles within tag at a time.

Attributes:

    page_links_template: a url to a specific topic on nrk.no
    quantity (int): the number of links to fetch at a time
    limit (int): max number of links that one tries to fetch

Parameters:

    tag (str): a numerical tag, pointing to a specific topic on nrk.no. Required.

Yields:

    lxml.html.HtmlElement: a parsed html document.

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def get_tag_page_trees(self, tag):
    """Fetch topic pages containing links to articles.

    By using the page_links_template, one can fetch `quantity` number of
    links to articles within `tag` at a time.

    Attributes:
        page_links_template: a url to a specific topic on nrk.no.
        quantity (int): the number of links to fetch at a time
        limit (int): max number of links that one tries to fetch

    Args:
        tag (str): a numerical tag, pointing to a specific topic on nrk.no

    Yields:
        (lxml.html.HtmlElement): a parsed html document.
    """
    page_links_template = (
        "https://www.nrk.no/serum/api/render/{tag}?"
        "size=18&perspective=BRIEF&alignment=AUTO&"
        "classes=surrogate-content&"
        "display=false&arrangement.offset={offset}&"
        "arrangement.quantity={quantity}&"
        "arrangement.repetition=PATTERN&"
        "arrangement.view[0].perspective=BRIEF&"
        "arrangement.view[0].size=6&"
        "arrangement.view[0].alignment=LEFT&"
        "paged=SIMPLE"
    )
    quantity = 10
    limit = 10000

    for offset in range(0, limit, quantity):
        print(".", end="")
        sys.stdout.flush()
        try:
            result = requests.get(
                page_links_template.format(
                    tag=tag, offset=offset, quantity=quantity
                )
            )
        except requests.exceptions.ConnectionError:
            util.note(f"Connection error when fetching {tag}")
            break
        else:
            try:
                yield html.document_fromstring(result.content)
            except etree.ParserError:
                util.note(f"No more articles in tag: «{self.tags[tag]}»")
                break

guess_lang(address)

Guess the language of the address element.

Parameters:

    address (html.Element): An element where interesting text is found. Required.

Returns:

    str: str containing the language of the text
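
The source below encodes the element text back to bytes with latin1 before classification. A minimal illustration of why that round-trip recovers the original UTF-8 bytes (the sample string is only for demonstration):

# Assumed scenario: the server sends UTF-8 bytes, but somewhere upstream they
# are decoded as latin1, producing mojibake.
original = "oanehaččat"                              # Northern Sámi text
garbled = original.encode("utf8").decode("latin1")   # what the parser hands over
recovered = bytes(garbled, encoding="latin1")        # same trick as guess_lang
assert recovered.decode("utf8") == original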

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def guess_lang(self, address):
    """Guess the language of the address element.

    Args:
        address (html.Element): An element where interesting text is found

    Returns:
        (str): str containing the language of the text
    """
    # This bytes hoopla is done because the text
    # comes out as utf8 encoded as latin1 …
    try:
        text = bytes(
            address.find('.//p[@class="plug-preamble"]').text, encoding="latin1"
        )
    except AttributeError:
        text = bytes(address.find('.//h2[@class="title"]').text, encoding="latin1")
    lang = self.language_guesser.classify(text)
    if lang == "sme":
        util.print_frame(text)

    return lang

handle_search_hits(hits)

Decide whether articles found in search results should be saved.

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def handle_search_hits(self, hits):
    """Decide whether articles found in search results should be saved."""
    for hit in hits:
        if hit["url"].split("-")[-1] not in self.fetched_ids and hit.get(
            "description"
        ):
            lang = self.language_guesser.classify(hit["description"])
            if lang == "sme":
                util.print_frame(len(hit["description"]), hit["description"], "\n")
                if len(hit["description"]) > 15:
                    self.counter["authors_fetched"] += 1
                    self.add_nrk_article(hit["url"])

interesting_links(tag)

Find interesting pages inside a topic.

Parameters:

    tag (str): a numerical tag pointing to a specific topic. Required.

Yields:

    str: a url to an nrk.no article

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def interesting_links(self, tag):
    """Find interesting pages inside a topic.

    Args:
        tag (str): a numerical tag pointing to a specific topic.

    Yields:
        (str): a url to an nrk.no article
    """
    for tree in self.get_tag_page_trees(tag):
        for address in tree.xpath('//a[@class="autonomous lp_plug"]'):
            self.counter[tag + "_total"] += 1
            href = address.get("href")
            article_id = href.strip().split("-")[-1]
            if "systemtest" in href:
                self.invalid_links.add(href)
            if (
                "systemtest" not in href
                and article_id not in self.fetched_ids
                and self.guess_lang(address) == "sme"
            ):
                self.counter[tag + "_fetched"] += 1
                yield href

pick_tags(path) staticmethod

Find tags in an nrk.no article.

Tags potentially contain more Northern Sámi articles.

Parameters:

    path (str): path to an nrk.no article. Required.

Yields:

    tuple[str, str]: a numerical tag, used internally by nrk.no to point to a specific topic, and a short description of the topic.

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
@staticmethod
def pick_tags(path):
    """Find tags in an nrk.no article.

    Tags potentially contain more Northern Sámi articles.

    Args:
        path (str): path to an nrk.no article

    Yields:
        (tuple[str, str]): a numerical tag, used internally by nrk.no to
            point to a specific topic and a short description of the topic.
    """
    article = html.parse(path)

    for address in article.xpath(
        '//a[@class="universe widget reference article-universe-link '
        'universe-teaser skin-border skin-text lp_universe_link"]'
    ):
        href = address.get("href")
        yield href[href.rfind("-") + 1 :], address[0].tail.strip()

report()

Print a report on what was found.

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
def report(self):
    """Print a report on what was found."""
    print(f"{len(self.invalid_links)} invalid links.")
    for invalid_link in self.invalid_links:
        print(invalid_link)
    print()
    print(f"Searched through {len(self.tags)} tags")
    print(f"Searched through {len(self.authors)} authors")
    print("Fetched {fetched} pages".format(**self.counter))
    for tag in self.tags:
        if self.counter[tag + "_fetched"]:
            print(
                "Fetched {} articles from category {} from nrk.no".format(
                    self.counter[tag + "_fetched"], self.tags[tag]
                )
            )

valid_authors(article) staticmethod

Find authors with the correct roles.

Parameters:

    article (etree.Element): The parsed html document. Required.

Yields:

    tuple[str, ...]: Authors

Source code in /home/anders/projects/CorpusTools/corpustools/nrk_no_crawler.py
@staticmethod
def valid_authors(article):
    """Find authors with the correct roles.

    Args:
        article (etree.Element): The parsed html document.

    Yields:
        (tuple[str, ...]): Authors
    """
    for author_role in article.xpath('.//span[@class="author__role"]'):
        # Guard against missing role text before stripping
        text = (author_role.text or "").strip()
        if text and (
            text.startswith("Journ")
            or text.startswith("Komm")
            or text.startswith("Arti")
        ):
            parts = author_role.getprevious().text.strip().split()

            yield parts