Skip to content

nrk_no_page

NrkNoPage

Save an NRK Sápmi page to the corpus.

Source code in corpustools/nrk_no_page.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
class NrkNoPage:
    """Save an NRK Sápmi page to the corpus.

    The constructor guesses the language of the page content, collects the
    ``pp:`` prefixed ``ec-id`` values of the links on the page, computes the
    path of the file inside the corpus and fills in the initial metadata.
    """

    languageguesser = Classifier()
    language_codes = {
        "nob",
        "sma",
        "sme",
        "smj",
    }

    def __init__(
        self,
        original_url: str,
        html_element: etree._Element,
        corpus_parent: Path,
    ):
        """Initialise the NrkNoPage class.

        Args:
            original_url: The url of the page.
            html_element: The parsed html document.
            corpus_parent: The directory in which the
                ``corpus-<lang>-orig-x-closed`` directory is placed.

        Raises:
            NrkNoUnknownPageError: If none of the known content elements
                are found in the document.
            SystemExit: If the canonical url, title, publishing date or
                author is missing from the document.
        """
        self.corpus_parent = corpus_parent
        self.url = original_url
        self.tree = html_element
        self.lang = self.languageguesser.classify(
            self.body_text, langs=list(self.language_codes)
        )
        self.article_id = self.url.split("/")[-1]

        # Only look for links on NRK sápmi pages
        self.links: set[str] = set()
        if "/sapmi/" in self.url:
            # The xpath only matches anchors that have an ec-id attribute,
            # so no extra check for a missing attribute is needed.
            ec_ids = (
                link.get("ec-id", "") for link in self.tree.findall(".//a[@ec-id]")
            )
            self.links = {ec_id.replace("pp:", "") for ec_id in ec_ids if "pp:" in ec_id}

        self.fullpath = make_corpus_path(
            self.corpus_parent
            / f"corpus-{self.lang}-orig-x-closed"
            / "news/nrk.no"
            / f"{self.article_id}.html"
        )

        self.set_initial_metadata()

    @property
    def basename(self) -> str:
        """Get the file name of the original file in the corpus."""
        return self.fullpath.orig.name

    @property
    def canonical_url(self) -> str:
        """Return the link to the article.

        Raises:
            SystemExit: If the document has no canonical link element, or
                if that element has no href attribute.
        """
        url = self.tree.find('.//link[@rel="canonical"]')
        if url is None:
            raise SystemExit(f"No url found in {self.url}.")

        href = url.get("href")
        if href is None:
            raise SystemExit(f"No href found in {self.url}.")

        return href

    @property
    def body_text(self) -> str:
        """Get all the text inside the content element, joined by spaces."""
        return " ".join(self.content.xpath(".//text()"))

    @property
    def content(self) -> etree._Element:
        """Extract only the content that is interesting to save from the web page.

        The candidates are tried in this order: the main article element,
        the parent of the bulletin text body and the kortstokk-app element.

        Raises:
            NrkNoUnknownPageError: If none of the candidates are found.
        """
        article_content = self.tree.find(".//article[@role='main']")
        if article_content is not None:
            return article_content

        bulletin_content = self.tree.find(".//div[@class='bulletin-text-body']")
        if bulletin_content is not None:
            article_content = bulletin_content.getparent()
            if article_content is not None:
                return article_content

        kortstokk_content = self.tree.find(".//kortstokk-app")
        if kortstokk_content is not None:
            return kortstokk_content

        raise NrkNoUnknownPageError(f"No content found in {self.url}.")

    @property
    def content_string(self) -> bytes:
        """This will be the content of the saved file."""
        return etree.tostring(self.content, encoding="utf8", pretty_print=True)

    @property
    def valid_authors(self) -> Iterable[list[str]]:
        """Find the authors of the article.

        The text after the first ``/`` in an author string is dropped, and
        the rest is split on whitespace. Authors that end up without any
        name parts are skipped, so that consumers can safely index into
        the parts.

        Returns:
            An iterable with the name parts of each author.

        Raises:
            SystemExit: If no author element is found in the document.
        """
        # NOTE(review): this reads the text of the meta elements; meta
        # elements usually carry their value in the content attribute, so
        # this list may always be empty -- confirm before changing.
        authors = [
            author_role.text
            for author_role in self.tree.findall('.//meta[@property="article:author"]')
            if author_role.text is not None
        ]

        if not authors:
            author = self.tree.find('.//meta[@name="author"]')
            if author is None:
                raise SystemExit(f"No authors found in {self.url}.")

            authors = [author.get("content", "")]

        all_name_parts = (author.split("/")[0].split() for author in authors)

        # An empty author string gives an empty list of name parts, which
        # would make set_initial_metadata fail on author_parts[-1].
        return (name_parts for name_parts in all_name_parts if name_parts)

    @property
    def parallel_ids(self) -> set[str]:
        """Get the ids of the parallel documents.

        A link inside the content is seen as a link to a parallel document
        when its ec-name attribute contains one of the "read in" phrases
        below. The ``pp:`` prefix is removed from the ids.
        """
        read_in_phrases = [
            "lohkh",
            "lågå",
            "loga",
            "les på",
        ]

        parallel_ids = [
            article_element.get("ec-id")
            for article_element in self.content.findall(".//a")
            if any(
                text in article_element.get("ec-name", "").lower()
                for text in read_in_phrases
            )
        ]

        return {
            parallel_id.replace("pp:", "")
            for parallel_id in parallel_ids
            if parallel_id is not None
        }

    @property
    def title(self) -> str:
        """Get the title of the article from the og:title meta element.

        Raises:
            SystemExit: If the element or its content attribute is missing.
        """
        title_element = self.tree.find('.//meta[@property="og:title"]')
        if title_element is None:
            raise SystemExit(f"No title element found in {self.url}.")

        title = title_element.get("content")

        if title is None:
            raise SystemExit(f"No content found in {self.url}.")

        return title

    @property
    def year(self) -> str:
        """Get the first four characters of the dc.date.issued meta element.

        Raises:
            SystemExit: If the element or its content attribute is missing.
        """
        time_element = self.tree.find('.//meta[@name="dc.date.issued"]')
        if time_element is None:
            raise SystemExit(f"No time found in {self.url}.")

        time_str = time_element.get("content")

        if time_str is None:
            raise SystemExit(f"No time found in {self.url}.")

        return time_str[:4]

    def set_initial_metadata(self):
        """Set the metadata for the page.

        The last name part of an author is stored as the last name, the
        remaining parts are stored as the first name.
        """
        for count, author_parts in enumerate(self.valid_authors, start=1):
            self.fullpath.metadata.set_variable(
                "author" + str(count) + "_ln", author_parts[-1]
            )
            self.fullpath.metadata.set_variable(
                "author" + str(count) + "_fn", " ".join(author_parts[:-1])
            )

        self.fullpath.metadata.set_variable("filename", self.canonical_url)
        self.fullpath.metadata.set_variable("title", self.title)
        self.fullpath.metadata.set_variable("year", self.year)
        self.fullpath.metadata.set_variable("publisher", "NRK")
        self.fullpath.metadata.set_variable("publChannel", "https://nrk.no/sapmi")
        self.fullpath.metadata.set_variable("license_type", "standard")

    def set_parallel_file(self, lang, name):
        """Update metadata info on parallel files."""
        self.fullpath.metadata.set_parallel_text(lang, name)

    def save(self):
        """Save html and metadata."""
        self.fullpath.orig.parent.mkdir(parents=True, exist_ok=True)
        self.fullpath.orig.write_bytes(self.content_string)
        self.fullpath.metadata.write_file()

basename property

Get the name of the corpus path.

body_text property

Get all the text inside the extracted content element (see the content property), joined by spaces.

canonical_url property

Return the link to the article.

content property

Extract only the content that is interesting to save from the web page.

content_string property

This will be the content of the saved file.

parallel_ids property

Get the ids of the parallel documents that are linked to from the content.

valid_authors property

Find authors with the correct roles.

Parameters:

Name Type Description Default
article Element

The parsed html document.

required

Yields:

Type Description
list[str]

Authors

__init__(original_url, html_element, corpus_parent)

Initialise the NrkNoPage class.

Source code in corpustools/nrk_no_page.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def __init__(
    self,
    original_url: str,
    html_element: etree._Element,
    corpus_parent: Path,
):
    """Set up the page: guess its language, collect links, set metadata.

    Args:
        original_url: The url of the page.
        html_element: The parsed html document.
        corpus_parent: The directory in which the
            ``corpus-<lang>-orig-x-closed`` directory is placed.
    """
    self.corpus_parent = corpus_parent
    self.url = original_url
    self.tree = html_element

    guessed_lang = self.languageguesser.classify(
        self.body_text, langs=list(self.language_codes)
    )
    self.lang = guessed_lang

    last_url_segment = self.url.split("/")[-1]
    self.article_id = last_url_segment

    # Only look for links on NRK sápmi pages
    found_links = set()
    if "/sapmi/" in self.url:
        for anchor in self.tree.findall(".//a[@ec-id]"):
            ec_id = anchor.get("ec-id")
            if ec_id is None or "pp:" not in ec_id:
                continue
            found_links.add(ec_id.replace("pp:", ""))
    self.links = found_links

    corpus_dir = self.corpus_parent / f"corpus-{self.lang}-orig-x-closed"
    self.fullpath = make_corpus_path(
        corpus_dir / "news/nrk.no" / f"{last_url_segment}.html"
    )

    self.set_initial_metadata()

save()

Save html and metadata.

Source code in corpustools/nrk_no_page.py
204
205
206
207
208
def save(self):
    """Write the html content and the metadata file to the corpus."""
    html_path = self.fullpath.orig

    # The directory of the html file may not exist yet.
    html_path.parent.mkdir(parents=True, exist_ok=True)
    html_path.write_bytes(self.content_string)

    self.fullpath.metadata.write_file()

set_initial_metadata()

Set the metadata for the page.

Source code in corpustools/nrk_no_page.py
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def set_initial_metadata(self):
    """Fill in author, source, title, year and publisher metadata.

    The last name part of an author is stored as the last name, the
    remaining parts are stored as the first name.
    """
    for index, name_parts in enumerate(self.valid_authors, start=1):
        last_name = name_parts[-1]
        self.fullpath.metadata.set_variable(f"author{index}_ln", last_name)

        first_name = " ".join(name_parts[:-1])
        self.fullpath.metadata.set_variable(f"author{index}_fn", first_name)

    metadata = self.fullpath.metadata

    # Values read from the page
    metadata.set_variable("filename", self.canonical_url)
    metadata.set_variable("title", self.title)
    metadata.set_variable("year", self.year)

    # Values that are the same for all pages
    metadata.set_variable("publisher", "NRK")
    metadata.set_variable("publChannel", "https://nrk.no/sapmi")
    metadata.set_variable("license_type", "standard")

set_parallel_file(lang, name)

Update metadata info on parallel files.

Source code in corpustools/nrk_no_page.py
200
201
202
def set_parallel_file(self, lang, name):
    """Register ``name`` as the parallel file for ``lang`` in the metadata."""
    metadata = self.fullpath.metadata
    metadata.set_parallel_text(lang, name)

NrkNoUnknownPageError

Bases: Exception

Raise an error if the page is not recognised.

Source code in corpustools/nrk_no_page.py
10
11
12
13
class NrkNoUnknownPageError(Exception):
    """Signal that the page is not of a recognised kind."""