Skip to content

saami_crawler

This file contains routines for crawling sites that contain Saami text.

main()

Crawl sites.

Source code in /home/anders/projects/CorpusTools/corpustools/saami_crawler.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
def main():
    """Crawl the sites named on the command line.

    Every requested site is checked against the table of supported crawlers
    before any crawling starts, so a misspelled site name is reported
    immediately instead of surfacing as a bare KeyError after earlier sites
    have already been crawled.

    Raises:
        SystemExit: if one or more of the requested sites has no crawler.
    """
    args = parse_options()

    crawlers = {
        "www.samediggi.fi": samediggi_fi_crawler.SamediggiFiCrawler(),
        "samediggi.no": samediggi_no_crawler.SamediggiNoCrawler(),
        # "nrk.no": nrk_no_crawler.NrkSmeCrawler(),
        # "samas.no": samas_crawler.SamasCrawler(),
    }

    # Validate up front: fail fast on typos rather than halfway through a run.
    unknown_sites = [site for site in args.sites if site not in crawlers]
    if unknown_sites:
        raise SystemExit(
            f"Unsupported site(s): {', '.join(unknown_sites)}. "
            f"Supported sites: {', '.join(sorted(crawlers))}."
        )

    for site in args.sites:
        crawlers[site].crawl_site()

parse_options()

Parse the commandline options.

Returns:

| Type | Description |
| --- | --- |
| argparse.Namespace | the parsed commandline arguments |

Source code in /home/anders/projects/CorpusTools/corpustools/saami_crawler.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def parse_options():
    """Read the crawler's commandline.

    One or more site names are accepted as positional arguments; the
    parser also inherits the options of the shared version parser.

    Returns:
        (argparse.Namespace): the parsed commandline arguments
    """
    description = (
        "Crawl saami sites (for now, only samediggi.no and www.samediggi.fi)."
    )
    option_parser = argparse.ArgumentParser(
        description=description,
        parents=[argparse_version.parser],
    )
    option_parser.add_argument("sites", help="The sites to crawl", nargs="+")

    return option_parser.parse_args()