Skip to content

samas_crawler

This file contains routines to crawl samas.no.

SamasCrawler

Collect pages from samas.no.

We only want to fetch saami pages, and their parallels.

    The <ul class="language-switcher-locale-url"> element tells which language is active. If se is active, save the page and its parallels. If se is not active, check to see if it has a parallel. Save the page and its parallels. If the link of one of the list elements contains /node, skip it.
    Source code in /home/anders/projects/CorpusTools/corpustools/samas_crawler.py
     29
     30
     31
     32
     33
     34
     35
     36
     37
     38
     39
     40
     41
     42
     43
     44
     45
     46
     47
     48
     49
     50
     51
     52
     53
     54
     55
     56
     57
     58
     59
     60
     61
     62
     63
     64
     65
     66
     67
     68
     69
     70
     71
     72
     73
     74
     75
     76
     77
     78
     79
     80
     81
     82
     83
     84
     85
     86
     87
     88
     89
     90
     91
     92
     93
     94
     95
     96
     97
     98
     99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    class SamasCrawler:
        """Collect pages from samas.no.

        We only want to fetch saami pages, and their parallels.

        <ul class="language-switcher-locale-url"> tells which language is active.
        If se is active, save the page and its parallels.
        If se is not active, check to see if it has a parallel. Save the page
        and its parallels.
        If the link of one of the list elements contains /node, skip it.
        """

        # Corpus root; the GTFREE environment variable must point at it.
        goaldir = str(os.getenv("GTFREE"))
        external_links = set()
        # samas.no language code -> corpus (ISO 639-3) language code.
        samas_languages = {"se": "sme", "nb": "nob", "en-UK": "eng"}

        def __init__(self):
            # The three front pages are navigation hubs only; pre-mark them as
            # fetched so they are never saved to the corpus.
            self.fetched_links = {
                "http://samas.no/en",
                "http://samas.no/nb",
                "http://samas.no/se",
            }
            # One corpus adder per language, all writing below the same
            # admin/allaskuvla/samas.no corpus directory.
            self.corpus_adders = {
                lang: adder.AddToCorpus(
                    self.goaldir, self.samas_languages[lang], "admin/allaskuvla/samas.no"
                )
                for lang in self.samas_languages
            }
            self.downloader = adder.UrlDownloader(os.path.join(self.goaldir, "tmp"))

        @staticmethod
        def get_samas_href(href):
            """Turn a site-relative href into an absolute samas.no url."""
            return f"http://samas.no{href}"

        def harvest_links(self, content):
            """Find interesting pages inside a topic.

            Args:
                content (etree.Element): content of a samas page, without the
                    language_switcher element.

            Yields:
                (str): a url to a samas.no page
            """
            # Drop the language switcher so its links are not harvested as
            # ordinary content links.
            lang_switcher = content.find('.//ul[@class="language-switcher-locale-url"]')
            if lang_switcher is not None:
                lang_switcher.getparent().remove(lang_switcher)

            for address in content.xpath("//a"):
                if self.is_internal(address.get("href")):
                    yield self.get_samas_href(address.get("href").strip())

        def is_internal(self, href):
            """Return a truthy value if href points at an interesting internal page."""
            return (
                href
                and "/node" not in href
                and "/Node" not in href
                and href.startswith("/")
                and "field_" not in href
                and "page=" not in href
                and "/user" not in href
            )

        def get_uff(self, tmpname):
            """Map language codes to hrefs found in the language switcher.

            Args:
                tmpname (str): path to a downloaded samas.no page.

            Returns:
                (dict): samas.no language code -> site-relative href, for the
                    internal links of the language switcher.
            """
            content = html.parse(tmpname).getroot()
            lang_switcher = content.find('.//ul[@class="language-switcher-locale-url"]')

            return {
                address.get("xml:lang"): address.get("href")
                for address in lang_switcher.xpath(".//a")
                if self.is_internal(address.get("href"))
            }

        def add_samas_page(self, link):
            """Get a saami samas.no page and its parallels.

            Args:
                link (str): a url to samas.no page, that has been vetted by
                    the is_internal function.
            """
            paths = set()
            if link not in self.fetched_links:
                try:
                    (_request, tmpname) = self.downloader.download(link)
                    uff = self.get_uff(tmpname)

                    if "se" in uff:
                        util.note("")
                        util.print_frame(link, uff)
                        # Save the saami page first and capture its corpus path:
                        # it becomes the parallelpath for the other languages.
                        # (set.add returns None, so the path must be captured
                        # before it is added to the set.)
                        path = self.uff_fetcher(uff, "se", link, tmpname, "")
                        paths.add(path)

                        for lang in ["nb", "en-UK"]:
                            if lang in uff:
                                paths.add(self.uff_fetcher(uff, lang, link, tmpname, path))

                except (adder.AdderError, UserWarning) as error:
                    util.note(error)

            # Recurse into the pages just saved to the corpus.
            for path in paths:
                for found_link in self.harvest_links(html.parse(path).getroot()):
                    self.add_samas_page(found_link)

        def uff_fetcher(self, uff, lang, link, tmpname, path):
            """Save one language version of a page to the corpus.

            Args:
                uff (dict): language code -> href, as returned by get_uff.
                lang (str): the samas.no language code to save.
                link (str): the url that was downloaded to tmpname.
                tmpname (str): path of the already downloaded page.
                path (str): corpus path of the parallel page, or "" for none.

            Returns:
                (str): the corpus path the page was saved to.
            """
            lunk = self.get_samas_href(uff[lang])
            self.fetched_links.add(lunk)
            if lunk == link:
                # Already downloaded; reuse the temporary file.
                return self.corpus_adders[lang].copy_file_to_corpus(
                    tmpname, lunk, parallelpath=path
                )
            else:
                return self.corpus_adders[lang].copy_url_to_corpus(lunk, parallelpath=path)

        def crawl_site(self):
            """Crawl samas.no starting from the three language front pages."""
            for lang in self.samas_languages:
                # "en-UK"[:2] == "en": front pages use two-letter codes.
                (_request, tmpname) = self.downloader.download(f"http://samas.no/{lang[:2]}")
                for link in self.harvest_links(html.parse(tmpname).getroot()):
                    self.add_samas_page(link)

            for lang in self.corpus_adders:
                self.corpus_adders[lang].add_files_to_working_copy()
    

    add_samas_page(link)

    Get a saami samas.no page and its parallels.

    Parameters:

    Name Type Description Default
    link str

    a url to samas.no page, that has been vetted by the is_internal function.

    required
    Source code in /home/anders/projects/CorpusTools/corpustools/samas_crawler.py
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    def add_samas_page(self, link):
        """Get a saami samas.no page and its parallels.

        Args:
            link (str): a url to samas.no page, that has been vetted by
                the is_internal function.
        """
        paths = set()
        if link not in self.fetched_links:
            try:
                (_request, tmpname) = self.downloader.download(link)
                uff = self.get_uff(tmpname)

                if "se" in uff:
                    util.note("")
                    util.print_frame(link, uff)
                    # Save the saami page first and capture its corpus path:
                    # it becomes the parallelpath for the other languages.
                    # (set.add returns None, so the path must be captured
                    # before it is added to the set.)
                    path = self.uff_fetcher(uff, "se", link, tmpname, "")
                    paths.add(path)

                    for lang in ["nb", "en-UK"]:
                        if lang in uff:
                            paths.add(self.uff_fetcher(uff, lang, link, tmpname, path))

            except (adder.AdderError, UserWarning) as error:
                util.note(error)

        # Recurse into the pages just saved to the corpus.
        for path in paths:
            for found_link in self.harvest_links(html.parse(path).getroot()):
                self.add_samas_page(found_link)
    

    Find interesting pages inside a topic.

    Parameters:

    Name Type Description Default
    content etree.Element

    content of a samas page, without the language_switcher element.

    required

    Yields:

    Type Description
    str

    a url to a samas.no page

    Source code in /home/anders/projects/CorpusTools/corpustools/samas_crawler.py
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    def harvest_links(self, content):
        """Find interesting pages inside a topic.

        Args:
            content (etree.Element): content of a samas page, without the
                language_switcher element.

        Yields:
            (str): a url to a samas.no page
        """
        # Strip the language switcher so its links are not treated as content.
        switcher = content.find('.//ul[@class="language-switcher-locale-url"]')
        switcher.getparent().remove(switcher)

        hrefs = (anchor.get("href") for anchor in content.xpath("//a"))
        for href in hrefs:
            if self.is_internal(href):
                yield self.get_samas_href(href.strip())