Skip to content

one-off-functions

One-off functions to set metadata.

Might be useful in other contexts.

find_endings(directories, suffix)

Find all files with the given suffix within directories.

Parameters:

Name Type Description Default
directories list of str

list of directories to walk

required
suffix str

file suffix to be searched for

required

Yields:

Type Description
str

path to file with suffix

Source code in /home/anders/projects/CorpusTools/corpustools/one-off-functions.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
def find_endings(directories, suffix):
    """Yield the path of every file below directories whose name ends with suffix.

    Args:
        directories (list of str): directories that are walked recursively
        suffix (str): the ending a file name must have to be yielded

    Yields:
        (str): the name of a matching file joined onto the directory
            it was found in
    """
    for top in directories:
        for dirpath, _subdirs, filenames in os.walk(top):
            yield from (
                os.path.join(dirpath, name)
                for name in filenames
                if name.endswith(suffix)
            )

regjeringen_no(directories)

Set metadata for regjeringen.no html files.

Parameters:

Name Type Description Default
directories list of str

list of directories to walk

required
Source code in /home/anders/projects/CorpusTools/corpustools/one-off-functions.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def regjeringen_no(directories):
    """Copy selected html meta elements into the metadata of each html file.

    Every ``.html`` file found below directories is opened with
    ``converter.HTMLConverter`` and parsed. The author, creation year and
    publisher meta elements are looked up; each one that is present is
    stored as a metadata variable. The metadata file is only written when
    at least one of them was found.

    Args:
        directories (list of str): list of directories to walk
    """
    # (xpath of the meta element, metadata variable, conversion of its content)
    wanted = (
        ('.//meta[@name="AUTHOR"]', "author1_ln", lambda value: value),
        (
            './/meta[@name="creation_date"]',
            "year",
            lambda value: parse(value).year,
        ),
        ('.//meta[@name="DC.Publisher"]', "publisher", lambda value: value),
    )

    for html_path in find_endings(directories, ".html"):
        conv = converter.HTMLConverter(html_path)
        tree = html.document_fromstring(conv.content)

        found_any = False
        for xpath, variable, convert in wanted:
            element = tree.find(xpath)
            if element is None:
                continue
            found_any = True
            conv.md.set_variable(variable, convert(element.get("content")))

        # Leave the metadata file untouched when nothing was found.
        if found_any:
            conv.md.write_file()

skuvla_historja(directories)

Find skuvlahistorja directories in paths, set year.

Parameters:

Name Type Description Default
directories list of str

list of directories to walk

required
Source code in /home/anders/projects/CorpusTools/corpustools/one-off-functions.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def skuvla_historja(directories):
    """Find skuvlahistorja directories in paths, set year.

    Every ``.xsl`` file below directories whose path contains
    "skuvlahistorja" gets its ``year`` metadata variable set according to
    the volume directory (``skuvlahistorja1`` to ``skuvlahistorja6``) that
    appears in its path. Files whose path contains no known volume
    directory are reported and left untouched.

    Args:
        directories (list of str): list of directories to walk
    """
    years = {
        "skuvlahistorja1": "2005",
        "skuvlahistorja2": "2007",
        "skuvlahistorja3": "2009",
        "skuvlahistorja4": "2010",
        "skuvlahistorja5": "2011",
        "skuvlahistorja6": "2013",
    }

    for file_ in find_endings(directories, ".xsl"):
        if "skuvlahistorja" not in file_:
            continue

        # The last path component is the .xsl file name, which can never be
        # a key of years, so look for the volume among all the components,
        # preferring the one closest to the file.
        volume = next(
            (part for part in reversed(file_.split("/")) if part in years),
            None,
        )
        if volume is None:
            print(f"{file_}: no known skuvlahistorja volume in path, skipping")
            continue

        print(file_)
        metadata = xslsetter.MetadataHandler(file_)
        metadata.set_variable("year", years[volume])
        metadata.write_file()

to_free(path)

Set the license type.

Source code in /home/anders/projects/CorpusTools/corpustools/one-off-functions.py
81
82
83
84
85
86
87
88
89
def to_free(path):
    """Set the license type of the files collected from path to "free".

    Args:
        path: handed to ``ConverterManager.collect_files`` as the only
            element of a list; presumably a file or directory path
            (confirm against ``collect_files``).
    """
    manager = converter.ConverterManager(False, False)
    manager.collect_files([path])

    for collected in manager.FILES:
        metadata = manager.converter(collected).md
        metadata.set_variable("license_type", "free")
        metadata.write_file()

translated_from(url_part, mainlang, directories)

Set all docs from url_part to be translated from mainlang.

Parameters:

Name Type Description Default
url_part str

the defining part of the url

required
mainlang str

three character long language code

required
directories list of str

list of directories to walk

required
Source code in /home/anders/projects/CorpusTools/corpustools/one-off-functions.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
def translated_from(url_part, mainlang, directories):
    """Set all docs from url_part to be translated from mainlang.

    For every ``.xsl`` file below directories whose ``filename`` metadata
    contains url_part and whose ``mainlang`` metadata equals mainlang, the
    metadata file of each parallel document gets ``translated_from`` set
    to mainlang. A count of matching files and visited parallels is
    printed at the end.

    Args:
        url_part (str): the defining part of the url
        mainlang (str): three character long language code
        directories (list of str): list of directories to walk

    Raises:
        UserWarning: if url_part contains no "." or mainlang is not a
            three character string.
    """
    # Check if the arguments are valid
    if "." not in url_part:
        raise UserWarning(f"{url_part} does not seem to part of a url")
    # Check the type first so that len() is only applied to strings.
    if not isinstance(mainlang, str) or len(mainlang) != 3:
        raise UserWarning(f"{mainlang} does not seem to be a valid language code")

    counter = collections.defaultdict(int)
    for file_ in find_endings(directories, ".xsl"):
        corpus_path = corpuspath.make_corpus_path(file_)
        if (
            url_part in corpus_path.metadata.get_variable("filename")
            and corpus_path.metadata.get_variable("mainlang") == mainlang
        ):
            counter[mainlang] += 1
            for parallel in corpus_path.parallels():
                counter["parallels"] += 1
                try:
                    metadata = xslsetter.MetadataHandler(parallel + ".xsl")
                except util.ArgumentError as error:
                    util.note(error)
                    util.note(f"Referenced from {file_}")
                else:
                    # Only touch the metadata when the handler was actually
                    # created; after a failure there is nothing valid to
                    # write for this parallel.
                    metadata.set_variable("translated_from", mainlang)
                    metadata.write_file()

    print(counter)