
pick_titles

Program to pick out documents from samediggi.se to be saved to the corpus.

The documents have been fetched using wget.
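
The exact wget invocation is not part of this module, so the following is only a rough sketch of how such a mirror could be produced, wrapped in Python for consistency with the rest of the docs; the flags and target directory are assumptions, not the options actually used.

# Hypothetical sketch: mirror samediggi.se with wget before running pick_titles.
# The flags and paths below are assumptions; the original crawl options are not
# documented here.
import subprocess

subprocess.run(
    [
        "wget",
        "--recursive",                 # follow links within the site
        "--no-parent",                 # stay below the start URL
        "--wait=2",                    # pause between requests
        "--directory-prefix", "downloads/samediggi.se",
        "https://www.samediggi.se/",
    ],
    check=True,
)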

DocumentPicker

Pick documents from samediggi.se to be added to the corpus.

Source code in /home/anders/projects/CorpusTools/corpustools/pick_titles.py
class DocumentPicker:
    """Pick documents from samediggi.se to be added to the corpus."""

    def __init__(self, source_dir):
        self.freecorpus = os.getenv("GTFREE")
        self.source_dir = source_dir
        self.file_dict = {}
        self.file_dict.setdefault("sma", [])
        self.file_dict.setdefault("sme", [])
        self.file_dict.setdefault("smj", [])
        self.file_dict.setdefault("swe", [])
        self.file_dict.setdefault("none", [])
        self.parallel_dict = {}
        self.total_file = 0

    def classify_files(self):
        """Iterate through all files, classify them according to language"""
        for root, dirs, files in os.walk(self.source_dir):
            for f in files:
                if f.endswith(".xsl"):
                    self.total_file += 1
                    self.classify_file(os.path.join(root, f))

    def classify_file(self, file_):
        """Identify the language of the file"""
        mh = xslsetter.MetadataHandler(file_, create=True)
        url = mh.get_variable("filename")
        if (
            "regjeringen.no" in url
            and "regjeringen.no" not in file_
            and ".pdf" not in file_
        ):
            try:
                remote = urllib2.urlopen(urllib2.Request(url.encode("utf8")))
                self.copyfile(remote, file_)
            except urllib2.HTTPError:
                print(
                    util.lineno(),
                    "Could not fetch",
                    file_.replace(".xsl", ""),
                    file=sys.stderr,
                )
            except UnicodeEncodeError:
                print(util.lineno(), "Unicode error in url", url, file=sys.stderr)
            print(util.lineno(), "sleeping …")
            time.sleep(2)

    def copyfile(self, remote, file_):
        try:
            with open(file_.replace(".xsl", ""), "wb") as f:
                print(util.lineno(), "Fetching", file_.replace(".xsl", ""))
                shutil.copyfileobj(remote, f)
        finally:
            remote.close()
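
As the constructor and classify_files show, the class only needs a directory of .xsl metadata files to work on. A minimal usage sketch follows; the source directory is an example path, not a real location.

# Hypothetical usage sketch; the directory below is an example path.
from corpustools.pick_titles import DocumentPicker

picker = DocumentPicker("orig/swe/admin/sd/samediggi.se")
picker.classify_files()                  # walk the tree and handle every .xsl file
print(picker.total_file, "metadata files seen")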

classify_file(file_)

Identify the language of the file

Source code in /home/anders/projects/CorpusTools/corpustools/pick_titles.py
def classify_file(self, file_):
    """Identify the language of the file"""
    mh = xslsetter.MetadataHandler(file_, create=True)
    url = mh.get_variable("filename")
    if (
        "regjeringen.no" in url
        and "regjeringen.no" not in file_
        and ".pdf" not in file_
    ):
        try:
            remote = urllib2.urlopen(urllib2.Request(url.encode("utf8")))
            self.copyfile(remote, file_)
        except urllib2.HTTPError:
            print(
                util.lineno(),
                "Could not fetch",
                file_.replace(".xsl", ""),
                file=sys.stderr,
            )
        except UnicodeEncodeError:
            print(util.lineno(), "Unicode error in url", url, file=sys.stderr)
        print(util.lineno(), "sleeping …")
        time.sleep(2)
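
The listed code relies on urllib2, which only exists in Python 2. A rough Python 3 equivalent of the same fetch-and-save step is sketched below; the function name fetch_original is illustrative and not part of the module.

# Hypothetical Python 3 sketch of the same pattern: download the original
# document that a .xsl metadata file points at, then pause between requests.
import shutil
import sys
import time
import urllib.error
import urllib.request


def fetch_original(url, xsl_path):
    target = xsl_path.replace(".xsl", "")
    try:
        with urllib.request.urlopen(url) as remote, open(target, "wb") as out:
            shutil.copyfileobj(remote, out)
    except urllib.error.HTTPError:
        print("Could not fetch", target, file=sys.stderr)
    time.sleep(2)  # stay polite towards the remote server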

classify_files()

Iterate through all files, classify them according to language

Source code in /home/anders/projects/CorpusTools/corpustools/pick_titles.py
def classify_files(self):
    """Iterate through all files, classify them according to language"""
    for root, dirs, files in os.walk(self.source_dir):
        for f in files:
            if f.endswith(".xsl"):
                self.total_file += 1
                self.classify_file(os.path.join(root, f))
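
The traversal only filters on the .xsl suffix before delegating to classify_file. The same filtering step can be expressed with pathlib; the sketch below is not part of the module.

# Hypothetical sketch: the ".xsl" filtering of classify_files expressed with pathlib.
from pathlib import Path


def iter_metadata_files(source_dir):
    """Yield every .xsl metadata file below source_dir."""
    yield from Path(source_dir).rglob("*.xsl")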