Skip to content

samas_crawler

This file contains routines to crawl samas.no.

SamasCrawler

Collect pages from samas.no.

We only want to fetch saami pages, and their parallels.

    The <ul class="language-switcher-locale-url"> element tells which language is active. If se is active, save the page and its parallels. If se is not active, check to see if it has a parallel. Save the page and its parallels. If the link of one of the list elements contains /node, skip it.
    Source code in /home/anders/projects/CorpusTools/corpustools/samas_crawler.py
     29
     30
     31
     32
     33
     34
     35
     36
     37
     38
     39
     40
     41
     42
     43
     44
     45
     46
     47
     48
     49
     50
     51
     52
     53
     54
     55
     56
     57
     58
     59
     60
     61
     62
     63
     64
     65
     66
     67
     68
     69
     70
     71
     72
     73
     74
     75
     76
     77
     78
     79
     80
     81
     82
     83
     84
     85
     86
     87
     88
     89
     90
     91
     92
     93
     94
     95
     96
     97
     98
     99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    class SamasCrawler:
        """Collect pages from samas.no.

        We only want to fetch saami pages, and their parallels.

        <ul class="language-switcher-locale-url"> tells which language is active.
        If se is active, save the page and its parallels.
        If se is not active, check to see if it has a parallel. Save the page
        and its parallels.
        If the link of one of the list elements contains /node, skip it.
        """

        # Corpus root; the GTFREE environment variable must point at it.
        goaldir = str(os.getenv("GTFREE"))
        external_links = set()
        # samas.no language code -> corpus (ISO 639-3) language code.
        samas_languages = {"se": "sme", "nb": "nob", "en-UK": "eng"}

        def __init__(self):
            # The three front pages are navigation hubs only; pre-mark them as
            # fetched so they are never saved to the corpus.
            self.fetched_links = {
                "http://samas.no/en",
                "http://samas.no/nb",
                "http://samas.no/se",
            }
            # One corpus adder per language, all writing below the same
            # admin/allaskuvla/samas.no corpus directory.
            self.corpus_adders = {
                lang: adder.AddToCorpus(
                    self.goaldir, self.samas_languages[lang], "admin/allaskuvla/samas.no"
                )
                for lang in self.samas_languages
            }
            self.downloader = adder.UrlDownloader(os.path.join(self.goaldir, "tmp"))

        @staticmethod
        def get_samas_href(href):
            """Turn a site-relative href into an absolute samas.no url."""
            return f"http://samas.no{href}"

        def harvest_links(self, content):
            """Find interesting pages inside a topic.

            Args:
                content (etree.Element): content of a samas page, without the
                    language_switcher element.

            Yields:
                (str): a url to a samas.no page
            """
            # Drop the language switcher so its links are not harvested as
            # ordinary content links.
            lang_switcher = content.find('.//ul[@class="language-switcher-locale-url"]')
            if lang_switcher is not None:
                lang_switcher.getparent().remove(lang_switcher)

            for address in content.xpath("//a"):
                if self.is_internal(address.get("href")):
                    yield self.get_samas_href(address.get("href").strip())

        def is_internal(self, href):
            """Return a truthy value if href points at an interesting internal page."""
            return (
                href
                and "/node" not in href
                and "/Node" not in href
                and href.startswith("/")
                and "field_" not in href
                and "page=" not in href
                and "/user" not in href
            )

        def get_uff(self, tmpname):
            """Map language codes to hrefs found in the language switcher.

            Args:
                tmpname (str): path to a downloaded samas.no page.

            Returns:
                (dict): samas.no language code -> site-relative href, for the
                    internal links of the language switcher.
            """
            content = html.parse(tmpname).getroot()
            lang_switcher = content.find('.//ul[@class="language-switcher-locale-url"]')

            return {
                address.get("xml:lang"): address.get("href")
                for address in lang_switcher.xpath(".//a")
                if self.is_internal(address.get("href"))
            }

        def add_samas_page(self, link):
            """Get a saami samas.no page and its parallels.

            Args:
                link (str): a url to samas.no page, that has been vetted by
                    the is_internal function.
            """
            paths = set()
            if link not in self.fetched_links:
                try:
                    (_request, tmpname) = self.downloader.download(link)
                    uff = self.get_uff(tmpname)

                    if "se" in uff:
                        util.note("")
                        util.print_frame(link, uff)
                        # Save the saami page first and capture its corpus path:
                        # it becomes the parallelpath for the other languages.
                        # (set.add returns None, so the path must be captured
                        # before it is added to the set.)
                        path = self.uff_fetcher(uff, "se", link, tmpname, "")
                        paths.add(path)

                        for lang in ["nb", "en-UK"]:
                            if lang in uff:
                                paths.add(self.uff_fetcher(uff, lang, link, tmpname, path))

                except (adder.AdderError, UserWarning) as error:
                    util.note(error)

            # Recurse into the pages just saved to the corpus.
            for path in paths:
                for found_link in self.harvest_links(html.parse(path).getroot()):
                    self.add_samas_page(found_link)

        def uff_fetcher(self, uff, lang, link, tmpname, path):
            """Save one language version of a page to the corpus.

            Args:
                uff (dict): language code -> href, as returned by get_uff.
                lang (str): the samas.no language code to save.
                link (str): the url that was downloaded to tmpname.
                tmpname (str): path of the already downloaded page.
                path (str): corpus path of the parallel page, or "" for none.

            Returns:
                (str): the corpus path the page was saved to.
            """
            lunk = self.get_samas_href(uff[lang])
            self.fetched_links.add(lunk)
            if lunk == link:
                # Already downloaded; reuse the temporary file.
                return self.corpus_adders[lang].copy_file_to_corpus(
                    tmpname, lunk, parallelpath=path
                )
            else:
                return self.corpus_adders[lang].copy_url_to_corpus(lunk, parallelpath=path)

        def crawl_site(self):
            """Crawl samas.no starting from the three language front pages."""
            for lang in self.samas_languages:
                # "en-UK"[:2] == "en": front pages use two-letter codes.
                (_request, tmpname) = self.downloader.download(f"http://samas.no/{lang[:2]}")
                for link in self.harvest_links(html.parse(tmpname).getroot()):
                    self.add_samas_page(link)

            for lang in self.corpus_adders:
                self.corpus_adders[lang].add_files_to_working_copy()
    

    add_samas_page(link)

    Get a saami samas.no page and its parallels.

    Parameters:

    Name Type Description Default
    link str

    a url to samas.no page, that has been vetted by the is_internal function.

    required
    Source code in /home/anders/projects/CorpusTools/corpustools/samas_crawler.py
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    def add_samas_page(self, link):
        """Get a saami samas.no page and its parallels.

        Args:
            link (str): a url to samas.no page, that has been vetted by
                the is_internal function.
        """
        paths = set()
        if link not in self.fetched_links:
            try:
                (_request, tmpname) = self.downloader.download(link)
                uff = self.get_uff(tmpname)

                if "se" in uff:
                    util.note("")
                    util.print_frame(link, uff)
                    # Save the saami page first and capture its corpus path:
                    # it becomes the parallelpath for the other languages.
                    # (set.add returns None, so the path must be captured
                    # before it is added to the set.)
                    path = self.uff_fetcher(uff, "se", link, tmpname, "")
                    paths.add(path)

                    for lang in ["nb", "en-UK"]:
                        if lang in uff:
                            paths.add(self.uff_fetcher(uff, lang, link, tmpname, path))

            except (adder.AdderError, UserWarning) as error:
                util.note(error)

        # Recurse into the pages just saved to the corpus.
        for path in paths:
            for found_link in self.harvest_links(html.parse(path).getroot()):
                self.add_samas_page(found_link)
    

    Find interesting pages inside a topic.

    Parameters:

    Name Type Description Default
    content etree.Element

    content of a samas page, without the language_switcher element.

    required

    Yields:

    Type Description
    str

    a url to a samas.no page

    Source code in /home/anders/projects/CorpusTools/corpustools/samas_crawler.py
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    def harvest_links(self, content):
        """Find interesting pages inside a topic.

        Args:
            content (etree.Element): content of a samas page, without the
                language_switcher element.

        Yields:
            (str): a url to a samas.no page
        """
        # Strip the language switcher so its links are not treated as content.
        switcher = content.find('.//ul[@class="language-switcher-locale-url"]')
        switcher.getparent().remove(switcher)

        hrefs = (anchor.get("href") for anchor in content.xpath("//a"))
        for href in hrefs:
            if self.is_internal(href):
                yield self.get_samas_href(href.strip())