This file contains routines to crawl sites containing saami text.
Bases: Crawler
Crawl samediggi.no and save html documents to the corpus.
Source code in corpustools/samediggi_no_crawler.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
class SamediggiNoCrawler(crawler.Crawler):
    """Crawl samediggi.no and save html documents to the corpus."""

    # Languages harvested from the site: Norwegian bokmål plus three
    # saami languages (south, north, lule).
    langs = ["nob", "sma", "sme", "smj"]

    def __init__(self) -> None:
        """Initialise the SamediggiNoCrawler class."""
        super().__init__()
        # Seed the crawl with the site front page.
        self.unvisited_links.add("https://sametinget.no/")
        # One version-control handle per language corpus checkout.
        self.vcs = {
            lang: versioncontrol.vcs(self.corpus_parent / f"corpus-{lang}-orig")
            for lang in self.langs
        }
        self.dupe_table = self.make_dupe_dict()

    def samediggi_corpus_dirs(self) -> Iterator[Path]:
        """Yield the samediggi.no corpus directory of each language."""
        return (
            self.corpus_parent / f"corpus-{lang}-orig" / "admin/sd/samediggi.no"
            for lang in self.langs
        )

    def samediggi_corpus_files(self) -> Iterator[Path]:
        """Yield every html file already saved in the samediggi.no corpora."""
        return (
            html_path
            for corpus_dir in self.samediggi_corpus_dirs()
            for html_path in corpus_dir.rglob("*.html")
        )

    def make_dupe_dict(self) -> dict[str, Path]:
        """Make a dict to map md5-digest to filename."""
        return {
            make_digest(fullpath.read_bytes()): fullpath
            for fullpath in self.samediggi_corpus_files()
        }

    def crawl_page(self, link) -> SamediggiNoPage | None:
        """Collect links from a page.

        Args:
            link: The url to fetch.

        Returns:
            The parsed page, or None when the url is unreachable,
            not html, or unparsable.
        """
        self.visited_links.add(link)
        # A timeout keeps the crawler from hanging on an unresponsive server.
        result = requests.get(link, timeout=30)
        if not result.ok:
            return None

        content_type = result.headers.get("content-type")
        if content_type is None:
            return None

        if "html" not in content_type.lower():
            return None

        tree = etree.HTML(result.text)
        if tree is None:
            return None

        # Reuse the already-parsed tree instead of parsing result.text twice.
        orig_page = SamediggiNoPage(
            result.url, tree, self.corpus_parent, self.dupe_table
        )
        orig_page.sanity_test()
        # result.url may differ from link after redirects; mark it visited too.
        self.visited_links.add(orig_page.url)
        self.unvisited_links.update(orig_page.links)

        return orig_page

    def crawl_site(self):
        """Crawl samediggi.no."""
        while self.unvisited_links:
            link = self.unvisited_links.pop()
            if link not in self.visited_links:
                self.crawl_pageset(link)
            # Drop anything crawl_pageset has already handled.
            self.unvisited_links.difference_update(self.visited_links)

    def is_page_addable(self, page: SamediggiNoPage | None):
        """Decide whether a crawled page belongs in a parallel-page set."""
        if page is None:
            return False
        return page.saveable and page.claimed_lang == page.real_lang

    @staticmethod
    def set_parallel_info(parallel_pages):
        """Set the parallels for this set of parallel pages."""
        lang_combinations = (
            (parallel_page1, parallel_page2)
            for parallel_page1 in parallel_pages
            for parallel_page2 in parallel_pages
            if parallel_page1 != parallel_page2
        )
        for parallel_page1, parallel_page2 in lang_combinations:
            parallel_page1.set_parallel_file(
                parallel_page2.lang, parallel_page2.basename
            )

    def get_page_set(self, orig_page) -> list[SamediggiNoPage]:
        """Get parallel pages for the original page.

        Args:
            orig_page: The original page, or None when fetching it failed.

        Returns:
            A list of parallel pages.
        """
        # crawl_page may have returned None (fetch error, non-html, …);
        # without this guard orig_page.parallel_links would raise.
        if orig_page is None:
            return []

        crawled_pages = [orig_page]
        crawled_pages.extend(
            self.crawl_page(link) for link in orig_page.parallel_links
        )
        pages = [page for page in crawled_pages if self.is_page_addable(page)]

        # We are interested in the saami pages; a norwegian page is
        # valuable only when there is a saami page to compare it to.
        # NOTE(review): this drops the whole set whenever the FIRST addable
        # page claims "nob", even if saami parallels follow — presumably the
        # intent is "only a norwegian page left"; confirm before tightening.
        if pages and pages[0].claimed_lang == "nob":
            return []

        return pages

    def crawl_pageset(self, link):
        """Crawl a pageset that link gives us."""
        pages = self.get_page_set(self.crawl_page(link))
        self.set_parallel_info(pages)
        for page in pages:
            # Remember the content digest so future duplicates are skipped.
            self.dupe_table[page.digest] = page.corpuspath.orig
            page.save()
            # Register both the document and its metadata file.
            self.vcs[page.real_lang].add(page.corpuspath.orig)
            self.vcs[page.real_lang].add(page.corpuspath.xsl)
|
Initialise the SamediggiNoCrawler class.
Source code in corpustools/samediggi_no_crawler.py
41
42
43
44
45
46
47
48
49
def __init__(self) -> None:
    """Initialise the SamediggiNoCrawler class."""
    super().__init__()
    # Seed the crawl with the site front page.
    self.unvisited_links.add("https://sametinget.no/")
    # One version-control handle per language corpus checkout.
    self.vcs = {
        language: versioncontrol.vcs(self.corpus_parent / f"corpus-{language}-orig")
        for language in self.langs
    }
    # Digest -> path of every html file already in the corpora.
    self.dupe_table = self.make_dupe_dict()
|
crawl_page(link)
Collect links from a page.
Source code in corpustools/samediggi_no_crawler.py
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def crawl_page(self, link) -> SamediggiNoPage | None:
    """Collect links from a page.

    Args:
        link: The url to fetch.

    Returns:
        The parsed page, or None when the url is unreachable,
        not html, or unparsable.
    """
    self.visited_links.add(link)
    # A timeout keeps the crawler from hanging on an unresponsive server.
    result = requests.get(link, timeout=30)
    if not result.ok:
        return None

    content_type = result.headers.get("content-type")
    if content_type is None:
        return None

    if "html" not in content_type.lower():
        return None

    tree = etree.HTML(result.text)
    if tree is None:
        return None

    # Reuse the already-parsed tree instead of parsing result.text twice.
    orig_page = SamediggiNoPage(
        result.url, tree, self.corpus_parent, self.dupe_table
    )
    orig_page.sanity_test()
    # result.url may differ from link after redirects; mark it visited too.
    self.visited_links.add(orig_page.url)
    self.unvisited_links.update(orig_page.links)

    return orig_page
|
crawl_pageset(link)
Crawl a pageset that link gives us.
Source code in corpustools/samediggi_no_crawler.py
158
159
160
161
162
163
164
165
166
167
def crawl_pageset(self, link):
    """Crawl a pageset that link gives us."""
    pages = self.get_page_set(self.crawl_page(link))
    self.set_parallel_info(pages)
    for page in pages:
        # Remember the content digest so future duplicates are skipped.
        self.dupe_table[page.digest] = page.corpuspath.orig
        page.save()
        # Register both the document and its metadata file in
        # the version-control system of the page's language.
        vcs = self.vcs[page.real_lang]
        vcs.add(page.corpuspath.orig)
        vcs.add(page.corpuspath.xsl)
|
Crawl samediggi.no.
Source code in corpustools/samediggi_no_crawler.py
102
103
104
105
106
107
108
109
def crawl_site(self):
    """Crawl samediggi.no."""
    while self.unvisited_links:
        candidate = self.unvisited_links.pop()
        if candidate not in self.visited_links:
            self.crawl_pageset(candidate)
        # Prune anything the pageset crawl has already marked visited.
        self.unvisited_links -= self.visited_links
|
get_page_set(orig_page)
Get parallel pages for the original page.
Parameters:

| Name      | Type | Description                                   | Default  |
|-----------|------|-----------------------------------------------|----------|
| orig_page |      | The original page to get parallel pages for.  | required |
Returns:
Source code in corpustools/samediggi_no_crawler.py
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def get_page_set(self, orig_page) -> list[SamediggiNoPage]:
    """Get parallel pages for the original page.

    Args:
        orig_page: The original page, or None when fetching it failed.

    Returns:
        A list of parallel pages.
    """
    # crawl_page may have returned None (fetch error, non-html, …);
    # without this guard orig_page.parallel_links would raise.
    if orig_page is None:
        return []

    crawled_pages = [orig_page]
    crawled_pages.extend(
        self.crawl_page(link) for link in orig_page.parallel_links
    )
    pages = [page for page in crawled_pages if self.is_page_addable(page)]

    # We are interested in the saami pages; a norwegian page is
    # valuable only when there is a saami page to compare it to.
    # NOTE(review): this drops the whole set whenever the FIRST addable
    # page claims "nob", even if saami parallels follow — presumably the
    # intent is "only a norwegian page left"; confirm before tightening.
    if pages and pages[0].claimed_lang == "nob":
        return []

    return pages
|
is_page_addable(page)
Add a page to the list of parallel pages.
Source code in corpustools/samediggi_no_crawler.py
def is_page_addable(self, page: SamediggiNoPage | None):
    """Decide whether a crawled page belongs in a parallel-page set."""
    if page is None:
        return False
    if not page.saveable:
        return False
    # Only keep pages whose declared language matches the detected one.
    return page.claimed_lang == page.real_lang
|
Make a dict to map md5-digest to filename.
Source code in corpustools/samediggi_no_crawler.py
def make_dupe_dict(self) -> dict[str, Path]:
    """Make a dict to map md5-digest to filename."""
    digests: dict[str, Path] = {}
    for html_file in self.samediggi_corpus_files():
        digests[make_digest(html_file.read_bytes())] = html_file
    return digests
|
Set the parallels for this set of parallel pages.
Source code in corpustools/samediggi_no_crawler.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132 | @staticmethod
def set_parallel_info(parallel_pages):
"""Set the parallels for this set of parallel pages."""
lang_combinations = (
(parallel_page1, parallel_page2)
for parallel_page1 in parallel_pages
for parallel_page2 in parallel_pages
if parallel_page1 != parallel_page2
)
for parallel_page1, parallel_page2 in lang_combinations:
parallel_page1.set_parallel_file(
parallel_page2.lang, parallel_page2.basename
)
|