Skip to content

samediggi_no_links

is_valid_address(href)

Check if this is an address that should be crawled.

Source code in corpustools/samediggi_no_links.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def is_valid_address(href: str) -> bool | None:
    """Decide whether a link target should be followed by the crawler.

    The address is checked in three steps, stopping at the first failure:

    1. It must match ``ADDRESS_RE`` from its start.
    2. It must not contain any of the excluded path fragments.
    3. It must not end with any of ``unwanted_endings``.

    Args:
        href: The address to examine.

    Returns:
        The falsy result of ``ADDRESS_RE.match`` (``None``) when the address
        does not match at all, ``False`` when it is excluded by step 2 or 3,
        and ``True`` when it passes every check.
    """
    # Fragments that disqualify an address wherever they occur in it.
    excluded_fragments = (
        "sametingets-vedtak-1989-2004|endresprak.aspx|innsyn.aspx|/english/|/#|"
        "sametingets-representanter|samedikki-airasat|samedikke-ajrrasa|saemiedigkien-tjirkijh|"
        "plenumssaker|dievascoahkkinassit|allestjahkanimassje|stoerretjaanghkoeaamhtesh|"
        "ofte-stilte-sporsmal|davja-jerron-gazaldagat"
    )

    accepted = ADDRESS_RE.match(href)
    if not accepted:
        # Hand back the falsy match result itself rather than coercing it.
        return accepted

    if re.search(excluded_fragments, href):
        return False

    return not href.endswith(unwanted_endings)