
crawler

This file contains routines to crawl sites containing Saami text.

Crawler

A base class to save downloaded files to the corpus.

Source code in /home/anders/projects/CorpusTools/corpustools/crawler.py
class Crawler:
    """A base class to save downloaded files to the corpus."""

    def __init__(self):
        """Initialise the Crawler class."""
        self.goaldir = Path(os.getenv("GTLANGS"))
        self.unvisited_links = set()
        self.visited_links = set()
        self.download_links = set()
        self.corpus_adders = {}
        self.downloader = adder.UrlDownloader(os.path.join(self.goaldir, "tmp"))

    def __del__(self):
        """Add all files to the corpus."""
        for _, corpus_adder in self.corpus_adders.items():
            corpus_adder.add_files_to_working_copy()

    def save_pages(self, pages):
        """Write pages to disk.

        pages is a list of (url, lang) tuples
        """
        parallelpath = ""

        for url, lang in pages:
            try:
                (_, tmpname) = self.downloader.download(url)
            except adder.AdderError as error:
                util.print_frame(debug=str(error) + "\n")
            else:
                normalised_name = namechanger.normalise_filename(
                    os.path.basename(tmpname)
                )
                normalised_path = os.path.join(
                    self.corpus_adders[lang].goalpath, normalised_name
                )

                if not os.path.exists(normalised_path):
                    parallelpath = self.corpus_adders[lang].copy_file_to_corpus(
                        tmpname, url, parallelpath=parallelpath
                    )
                    util.print_frame(debug=f"adding {parallelpath}")
                else:
                    parallelpath = normalised_path
        print(file=sys.stderr)

    def name_from_url(self, url):
        """Return the basename of the url."""
        return os.path.basename(url)
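
The class is meant to be used as a base: a concrete crawler seeds unvisited_links, moves links to visited_links as it fetches them, and hands the harvested (url, lang) pairs to save_pages. A minimal sketch of such a subclass follows; the class name and crawl logic are illustrative, not part of CorpusTools:

from corpustools.crawler import Crawler


class SiteCrawler(Crawler):
    """Hypothetical single-site crawler, for illustration only."""

    def crawl(self):
        # Visit every queued link exactly once.
        while self.unvisited_links:
            url = self.unvisited_links.pop()
            if url in self.visited_links:
                continue
            self.visited_links.add(url)
            # A real implementation would fetch the page here, queue any
            # newly found links, detect each page's language, and call
            # self.save_pages([(url, lang), ...]) with parallel versions.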

__del__()

Add all files to the corpus.

Source code in /home/anders/projects/CorpusTools/corpustools/crawler.py
def __del__(self):
    """Add all files to the corpus."""
    for _, corpus_adder in self.corpus_adders.items():
        corpus_adder.add_files_to_working_copy()
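
Because the commit is tied to object finalisation, it only runs when the crawler is garbage-collected, which can happen late or, at interpreter shutdown, unreliably. The same effect can be triggered explicitly at a deterministic point; a minimal sketch, assuming crawler is a Crawler instance:

# Commit all downloaded files now instead of waiting for __del__.
for corpus_adder in crawler.corpus_adders.values():
    corpus_adder.add_files_to_working_copy()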

__init__()

Initialise the Crawler class.

Source code in /home/anders/projects/CorpusTools/corpustools/crawler.py
def __init__(self):
    """Initialise the Crawler class."""
    self.goaldir = Path(os.getenv("GTLANGS"))
    self.unvisited_links = set()
    self.visited_links = set()
    self.download_links = set()
    self.corpus_adders = {}
    self.downloader = adder.UrlDownloader(os.path.join(self.goaldir, "tmp"))
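
Note that goaldir is read from the GTLANGS environment variable; if it is unset, os.getenv returns None and Path(None) raises a TypeError, so GTLANGS must be set before the class is instantiated. A minimal sketch, with a placeholder path:

import os

from corpustools.crawler import Crawler

os.environ.setdefault("GTLANGS", "/home/user/langtech")  # placeholder path

crawler = Crawler()
print(crawler.goaldir)  # downloads are staged under <goaldir>/tmp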

save_pages(pages)

Write pages to disk.

pages is a list of (url, lang) tuples

Source code in /home/anders/projects/CorpusTools/corpustools/crawler.py
def save_pages(self, pages):
    """Write pages to disk.

    pages is a list of (url, lang) tuples
    """
    parallelpath = ""

    for url, lang in pages:
        try:
            (_, tmpname) = self.downloader.download(url)
        except adder.AdderError as error:
            util.print_frame(debug=str(error) + "\n")
        else:
            normalised_name = namechanger.normalise_filename(
                os.path.basename(tmpname)
            )
            normalised_path = os.path.join(
                self.corpus_adders[lang].goalpath, normalised_name
            )

            if not os.path.exists(normalised_path):
                parallelpath = self.corpus_adders[lang].copy_file_to_corpus(
                    tmpname, url, parallelpath=parallelpath
                )
                util.print_frame(debug=f"adding {parallelpath}")
            else:
                parallelpath = normalised_path
    print(file=sys.stderr)
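
A hedged usage sketch: save_pages looks up self.corpus_adders[lang] for every tuple it is given, so the example below assumes a crawler whose subclass has registered adders for "sme" and "nob"; the URLs are placeholders:

pages = [
    ("https://example.com/ossodat.html", "sme"),
    ("https://example.com/avdelinger.html", "nob"),
]
# Each page is downloaded and copied into the corpus for its language;
# the second file is registered as a parallel text of the first.
crawler.save_pages(pages)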