Check if this is an address that should be crawled.
Source code in corpustools/samediggi_no_links.py
64
65
66
67
68
69
70
71
72
73
74
75
76
def is_valid_address(href: str) -> bool | None:
    """Decide whether a link target is one the crawler should follow.

    Args:
        href: The address to check.

    Returns:
        The falsy result of ``ADDRESS_RE.match`` (``None`` for a compiled
        pattern) when ``href`` does not match ``ADDRESS_RE``; ``False`` when
        ``href`` contains one of the excluded fragments or ends with one of
        ``unwanted_endings``; ``True`` otherwise.
    """
    address_match = ADDRESS_RE.match(href)
    if not address_match:
        # Hand back the failed match result itself rather than False; this
        # is why the return annotation admits None.
        return address_match

    # Alternation of path fragments that disqualify an address.
    # NOTE(review): the "." in "endresprak.aspx" and "innsyn.aspx" is an
    # unescaped regex wildcard, so it matches any single character.
    excluded_fragments = (
        "sametingets-vedtak-1989-2004|endresprak.aspx|innsyn.aspx|/english/|/#|"
        "sametingets-representanter|samedikki-airasat|samedikke-ajrrasa|saemiedigkien-tjirkijh|"
        "plenumssaker|dievascoahkkinassit|allestjahkanimassje|stoerretjaanghkoeaamhtesh|"
        "ofte-stilte-sporsmal|davja-jerron-gazaldagat"
    )
    if re.search(excluded_fragments, href):
        return False

    # unwanted_endings is defined elsewhere in the module.
    return not href.endswith(unwanted_endings)
|