
crawler

This file contains routines to crawl sites containing Saami text.

Crawler

A base class to save downloaded files to the corpus.

Source code in /home/anders/projects/CorpusTools/corpustools/crawler.py
class Crawler:
    """A base class to save downloaded files to the corpus."""

    def __init__(self):
        """Initialise the Crawler class."""
        self.goaldir = Path(os.getenv("GTLANGS"))
        self.unvisited_links = set()
        self.visited_links = set()
        self.download_links = set()
        self.corpus_adders = {}
        self.downloader = adder.UrlDownloader(os.path.join(self.goaldir, "tmp"))

    def __del__(self):
        """Add all files to the corpus."""
        for _, corpus_adder in self.corpus_adders.items():
            corpus_adder.add_files_to_working_copy()

    def save_pages(self, pages):
        """Write pages to disk.

        pages is a list of (url, lang) tuples
        """
        parallelpath = ""

        for url, lang in pages:
            try:
                (_, tmpname) = self.downloader.download(url)
            except adder.AdderError as error:
                util.print_frame(debug=str(error) + "\n")
            else:
                normalised_name = namechanger.normalise_filename(
                    os.path.basename(tmpname)
                )
                normalised_path = os.path.join(
                    self.corpus_adders[lang].goalpath, normalised_name
                )

                if not os.path.exists(normalised_path):
                    parallelpath = self.corpus_adders[lang].copy_file_to_corpus(
                        tmpname, url, parallelpath=parallelpath
                    )
                    util.print_frame(debug=f"adding {parallelpath}")
                else:
                    parallelpath = normalised_path
        print(file=sys.stderr)

    def name_from_url(self, url):
        """Return the basename of the url."""
        return os.path.basename(url)
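
The class is meant to be used as a base: a concrete crawler seeds unvisited_links, moves links to visited_links as it fetches them, and hands the harvested (url, lang) pairs to save_pages. A minimal sketch of such a subclass follows; the class name and crawl logic are illustrative, not part of CorpusTools:

from corpustools.crawler import Crawler


class SiteCrawler(Crawler):
    """Hypothetical single-site crawler, for illustration only."""

    def crawl(self):
        # Visit every queued link exactly once.
        while self.unvisited_links:
            url = self.unvisited_links.pop()
            if url in self.visited_links:
                continue
            self.visited_links.add(url)
            # A real implementation would fetch the page here, queue any
            # newly found links, detect each page's language, and call
            # self.save_pages([(url, lang), ...]) with parallel versions.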

__del__()

Add all files to the corpus.

Source code in /home/anders/projects/CorpusTools/corpustools/crawler.py
def __del__(self):
    """Add all files to the corpus."""
    for _, corpus_adder in self.corpus_adders.items():
        corpus_adder.add_files_to_working_copy()
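
Because the commit is tied to object finalisation, it only runs when the crawler is garbage-collected, which can happen late or, at interpreter shutdown, unreliably. The same effect can be triggered explicitly at a deterministic point; a minimal sketch, assuming crawler is a Crawler instance:

# Commit all downloaded files now instead of waiting for __del__.
for corpus_adder in crawler.corpus_adders.values():
    corpus_adder.add_files_to_working_copy()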

__init__()

Initialise the Crawler class.

Source code in /home/anders/projects/CorpusTools/corpustools/crawler.py
def __init__(self):
    """Initialise the Crawler class."""
    self.goaldir = Path(os.getenv("GTLANGS"))
    self.unvisited_links = set()
    self.visited_links = set()
    self.download_links = set()
    self.corpus_adders = {}
    self.downloader = adder.UrlDownloader(os.path.join(self.goaldir, "tmp"))
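
Note that goaldir is read from the GTLANGS environment variable; if it is unset, os.getenv returns None and Path(None) raises a TypeError, so GTLANGS must be set before the class is instantiated. A minimal sketch, with a placeholder path:

import os

from corpustools.crawler import Crawler

os.environ.setdefault("GTLANGS", "/home/user/langtech")  # placeholder path

crawler = Crawler()
print(crawler.goaldir)  # downloads are staged under <goaldir>/tmp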

save_pages(pages)

Write pages to disk.

pages is a list of (url, lang) tuples

Source code in /home/anders/projects/CorpusTools/corpustools/crawler.py
def save_pages(self, pages):
    """Write pages to disk.

    pages is a list of (url, lang) tuples
    """
    parallelpath = ""

    for url, lang in pages:
        try:
            (_, tmpname) = self.downloader.download(url)
        except adder.AdderError as error:
            util.print_frame(debug=str(error) + "\n")
        else:
            normalised_name = namechanger.normalise_filename(
                os.path.basename(tmpname)
            )
            normalised_path = os.path.join(
                self.corpus_adders[lang].goalpath, normalised_name
            )

            if not os.path.exists(normalised_path):
                parallelpath = self.corpus_adders[lang].copy_file_to_corpus(
                    tmpname, url, parallelpath=parallelpath
                )
                util.print_frame(debug=f"adding {parallelpath}")
            else:
                parallelpath = normalised_path
    print(file=sys.stderr)
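
A hedged usage sketch: save_pages looks up self.corpus_adders[lang] for every tuple it is given, so the example below assumes a crawler whose subclass has registered adders for "sme" and "nob"; the URLs are placeholders:

pages = [
    ("https://example.com/ossodat.html", "sme"),
    ("https://example.com/avdelinger.html", "nob"),
]
# Each page is downloaded and copied into the corpus for its language;
# the second file is registered as a parallel text of the first.
crawler.save_pages(pages)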