trainingcorpusmaker

Classes and functions to make training corpus files.

TrainingCorpusMaker

Turn analysed giella xml files into a training corpus.

Filter out all sentences containing words unknown to the giella fst analysers.

Attributes:

    Name         Type             Description
    only_words   str              regex catching words made up of letters.
    xml_printer  ccat.XMLPrinter  extracts the dependency analysis from the giella xml files.
    lang         str              the language of the training corpus.
Source code in /home/anders/projects/CorpusTools/corpustools/trainingcorpusmaker.py
class TrainingCorpusMaker:
    """Turn analysed giella xml files into training corpus.

    Filter out all sentences containing words unknown to the
    giella fst analysers.

    Attributes:
        only_words (str): regex catching words made up of letters.
        xml_printer (ccat.XMLPrinter): extracts the dependency analysis
            from the giella xml files.
        lang (str): the language of the training corpus.
    """

    only_words = regex.compile(r"\p{L}+")
    xml_printer = ccat.XMLPrinter(dependency=True)

    def __init__(self, lang):
        """Initialise the TrainingCorpusMaker class.

        Args:
            lang (str): three-letter language code
        """
        self.lang = lang

    def parse_dependency(self, text):
        """Parse the dependency element found in a giella xml file.

        Args:
            text (str): contains the dependency element of a giella xml file.

        Yields:
            (str): a sentence containing only words known to the giella fst
                analysers and containing at least one word as matched by
                the only_words regex.
        """
        sentence_buffer = []
        uff_buffer = []
        for line in io.StringIO(text):
            line = line.rstrip()
            if line == ":" or line == ":\\n":
                sentence_buffer.append(" ")
            elif line.startswith(":"):
                uff_buffer.append(line)
            elif line.startswith('"'):
                sentence_buffer.append(line[2:-2])
            elif "CLB" in line:
                # Flush only at sentence-final clause boundaries: . ¶ ? ! …
                if (
                    '".' in line
                    or '"¶"' in line
                    or '"?"' in line
                    or '"!"' in line
                    or '"…"' in line
                ):
                    if uff_buffer:
                        for uff in uff_buffer:
                            util.print_frame(uff)
                    else:
                        sentence_line = (
                            "".join(sentence_buffer).replace("¶", "").strip()
                        )
                        if self.only_words.search(sentence_line):
                            yield sentence_line
                    uff_buffer[:] = []
                    sentence_buffer[:] = []
            elif '" ?' in line:
                uff_buffer.append(line)

    def file_to_sentences(self, filename):
        """Turn a giella xml into a list of sentences.

        Args:
            filename (str): name of the giella xml file containing a
                dependency element.

        Returns:
            (list[str]): list of the sentences
        """
        self.xml_printer.parse_file(filename)
        text = self.xml_printer.process_file().getvalue()
        if text.strip():
            return [sentence for sentence in self.parse_dependency(text) if sentence]
        else:
            return []

    def analysed_files(self):
        """Find analysed files.

        Yields:
            (str): filename of an analysed file.
        """
        for corpus in [
            os.path.join(os.getenv("GTFREE"), "analysed", self.lang),
            os.path.join(os.getenv("GTBOUND"), "analysed", self.lang),
        ]:
            for root, _, files in os.walk(corpus):
                for file_ in files:
                    yield os.path.join(root, file_)

    def make_corpus_files(self):
        """Make .txt files from .xml files.

        The .txt files contain only sentences with words known to the
        giella fsts.
        """
        for analysed_file in self.analysed_files():
            if analysed_file.endswith(".xml"):
                with open(analysed_file.replace(".xml", ".txt"), "w") as txt_stream:
                    txt_stream.write("\n".join(self.file_to_sentences(analysed_file)))
                    txt_stream.write("\n")

    def pytextcat_corpus(self):
        """Turn the free and bound corpus into a pytextcat training corpus."""
        corpus_dir = os.path.join("pytextcat", self.lang)
        with util.ignored(OSError):
            os.makedirs(corpus_dir)

        with open(f"{os.path.join(corpus_dir, self.lang)}.txt", "w") as corpusfile:
            for analysed_file in self.analysed_files():
                if analysed_file.endswith(".txt"):
                    with open(analysed_file) as analysed:
                        corpusfile.write(analysed.read())

    def langid_corpus(self):
        """Turn the free and bound corpus into a langid training corpus."""
        for analysed_file in self.analysed_files():
            if analysed_file.endswith(".txt"):
                langid_dir = "langid/{}/{}".format(
                    util.split_path(analysed_file).genre, self.lang
                )
                with util.ignored(OSError):
                    os.makedirs(langid_dir)
                copy(analysed_file, langid_dir)
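
Taken together, the methods form a small pipeline. A minimal end-to-end sketch, mirroring what main() does, assuming analysed "sme" corpora exist under $GTFREE and $GTBOUND:

maker = TrainingCorpusMaker("sme")
maker.make_corpus_files()   # write a .txt next to every analysed .xml
maker.pytextcat_corpus()    # gather the .txt files into pytextcat/sme/sme.txt
maker.langid_corpus()       # copy the .txt files into langid/<genre>/sme/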

__init__(lang)

Initialise the TrainingCorpusMaker class.

Parameters:

    Name  Type  Description                 Default
    lang  str   three-letter language code  required
Source code in /home/anders/projects/CorpusTools/corpustools/trainingcorpusmaker.py
def __init__(self, lang):
    """Initialise the TrainingCorpusMaker class.

    Args:
        lang (str): three-letter language code
    """
    self.lang = lang

analysed_files()

Find analysed files.

Yields:

    Type  Description
    str   filename of an analysed file.

Source code in /home/anders/projects/CorpusTools/corpustools/trainingcorpusmaker.py
def analysed_files(self):
    """Find analysed files.

    Yields:
        (str): filename of an analysed file.
    """
    for corpus in [
        os.path.join(os.getenv("GTFREE"), "analysed", self.lang),
        os.path.join(os.getenv("GTBOUND"), "analysed", self.lang),
    ]:
        for root, _, files in os.walk(corpus):
            for file_ in files:
                yield os.path.join(root, file_)
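
Both GTFREE and GTBOUND must be set: os.getenv() returns None for a missing variable, and os.path.join() then raises a TypeError. A sketch that collects only the .xml files, assuming both variables point at corpus checkouts:

maker = TrainingCorpusMaker("sme")
xml_files = [path for path in maker.analysed_files() if path.endswith(".xml")]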

file_to_sentences(filename)

Turn a giella xml file into a list of sentences.

Parameters:

    Name      Type  Description                                                    Default
    filename  str   name of the giella xml file containing a dependency element.  required

Returns:

    Type       Description
    list[str]  list of the sentences

Source code in /home/anders/projects/CorpusTools/corpustools/trainingcorpusmaker.py
def file_to_sentences(self, filename):
    """Turn a giella xml into a list of sentences.

    Args:
        filename (str): name of the giella xml file containing a
            dependency element.

    Returns:
        (list[str]): list of the sentences
    """
    self.xml_printer.parse_file(filename)
    text = self.xml_printer.process_file().getvalue()
    if text.strip():
        return [sentence for sentence in self.parse_dependency(text) if sentence]
    else:
        return []
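
A usage sketch with a hypothetical path to one analysed file:

maker = TrainingCorpusMaker("sme")
for sentence in maker.file_to_sentences(
    "/path/to/freecorpus/analysed/sme/admin/document.xml"  # hypothetical path
):
    print(sentence)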

langid_corpus()

Turn the free and bound corpus into a langid training corpus.

Source code in /home/anders/projects/CorpusTools/corpustools/trainingcorpusmaker.py
def langid_corpus(self):
    """Turn the free and bound corpus into a langid training corpus."""
    for analysed_file in self.analysed_files():
        if analysed_file.endswith(".txt"):
            langid_dir = "langid/{}/{}".format(
                util.split_path(analysed_file).genre, self.lang
            )
            with util.ignored(OSError):
                os.makedirs(langid_dir)
            copy(analysed_file, langid_dir)
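
The genre component comes from util.split_path(), so each copied file lands in a genre- and language-specific directory. For a hypothetical admin-genre file:

maker.langid_corpus()
# e.g. .../analysed/sme/admin/document.txt -> langid/admin/sme/document.txt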

make_corpus_files()

Make .txt files from .xml files.

The .txt files contain only sentences with words known to the giella fsts.

Source code in /home/anders/projects/CorpusTools/corpustools/trainingcorpusmaker.py
def make_corpus_files(self):
    """Make .txt files from .xml files.

    The .txt files contain only sentences with words known to the
    giella fsts.
    """
    for analysed_file in self.analysed_files():
        if analysed_file.endswith(".xml"):
            with open(analysed_file.replace(".xml", ".txt"), "w") as txt_stream:
                txt_stream.write("\n".join(self.file_to_sentences(analysed_file)))
                txt_stream.write("\n")
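
Each output file keeps its input's path, with only the suffix changed. Note that str.replace() would also rewrite a ".xml" occurring earlier in the path, though corpus paths normally contain it only as the suffix:

maker.make_corpus_files()
# e.g. .../analysed/sme/admin/document.xml -> .../analysed/sme/admin/document.txt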

parse_dependency(text)

Parse the dependency element found in a giella xml file.

Parameters:

    Name  Type  Description                                             Default
    text  str   contains the dependency element of a giella xml file.  required

Yields:

    Type  Description
    str   a sentence containing only words known to the giella fst analysers
          and containing at least one word as matched by the only_words regex.

Source code in /home/anders/projects/CorpusTools/corpustools/trainingcorpusmaker.py
def parse_dependency(self, text):
    """Parse the dependency element found in a giella xml file.

    Args:
        text (str): contains the dependency element of a giella xml file.

    Yields:
        (str): a sentence containing only words known to the giella fst
            analysers and containing at least one word as matched by
            the only_words regex.
    """
    sentence_buffer = []
    uff_buffer = []
    for line in io.StringIO(text):
        line = line.rstrip()
        if line == ":" or line == ":\\n":
            sentence_buffer.append(" ")
        elif line.startswith(":"):
            uff_buffer.append(line)
        elif line.startswith('"'):
            sentence_buffer.append(line[2:-2])
        elif "CLB" in line:
            # Flush only at sentence-final clause boundaries: . ¶ ? ! …
            if (
                '".' in line
                or '"¶"' in line
                or '"?"' in line
                or '"!"' in line
                or '"…"' in line
            ):
                if uff_buffer:
                    for uff in uff_buffer:
                        util.print_frame(uff)
                else:
                    sentence_line = (
                        "".join(sentence_buffer).replace("¶", "").strip()
                    )
                    if self.only_words.search(sentence_line):
                        yield sentence_line
                uff_buffer[:] = []
                sentence_buffer[:] = []
        elif '" ?' in line:
            uff_buffer.append(line)
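
A sketch of the input this expects: a vislcg3-style stream in which '"<...>"' lines carry the word forms, lone ":" lines carry the spacing between them, and a "CLB" reading on a sentence-final token flushes the buffer. The word forms, tags and dependency numbers below are illustrative, not real fst output:

maker = TrainingCorpusMaker("sme")
dependency_text = (
    '"<Dát>"\n'
    '\t"dát" Pron Dem Sg Nom #1->2\n'
    ':\n'
    '"<lea>"\n'
    '\t"leat" V IV Ind Prs Sg3 #2->0\n'
    ':\n'
    '"<buorre>"\n'
    '\t"buorre" A Attr #3->2\n'
    '"<.>"\n'
    '\t"." CLB #4->2\n'
)
print(list(maker.parse_dependency(dependency_text)))
# ['Dát lea buorre.']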

pytextcat_corpus()

Turn the free and bound corpus into a pytextcat training corpus.

Source code in /home/anders/projects/CorpusTools/corpustools/trainingcorpusmaker.py
def pytextcat_corpus(self):
    """Turn the free and bound corpus into a pytextcat training corpus."""
    corpus_dir = os.path.join("pytextcat", self.lang)
    with util.ignored(OSError):
        os.makedirs(corpus_dir)

    with open(f"{os.path.join(corpus_dir, self.lang)}.txt", "w") as corpusfile:
        for analysed_file in self.analysed_files():
            if analysed_file.endswith(".txt"):
                with open(analysed_file) as analysed:
                    corpusfile.write(analysed.read())
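
All .txt files for the language are concatenated into one file named after the language, one sentence per line:

maker.pytextcat_corpus()
# result: pytextcat/sme/sme.txt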

main()

Turn the corpus into pytextcat and langid training corpora.

Source code in /home/anders/projects/CorpusTools/corpustools/trainingcorpusmaker.py
def main():
    """Turn the corpus into a pytextcat training corpus."""
    args = parse_options()

    for lang in args.langs:
        sentence_maker = TrainingCorpusMaker(lang)
        sentence_maker.make_corpus_files()
        sentence_maker.pytextcat_corpus()
        sentence_maker.langid_corpus()

    print(
        "Now you will find training corpus for pytextcat and langid "
        "in the pytextcat and langid directories in the current directory."
    )

parse_options()

Parse the options given to the program.

Source code in /home/anders/projects/CorpusTools/corpustools/trainingcorpusmaker.py
def parse_options():
    """Parse the options given to the program."""
    parser = argparse.ArgumentParser(
        parents=[argparse_version.parser],
        description="Make training corpus from analysed giella xml files.\n"
        "Sentences with words unknown for the giella fsts are not included.",
    )
    parser.add_argument(
        "langs", nargs="+", help="The languages to make a training corpus for."
    )

    return parser.parse_args()
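
parse_options() reads sys.argv, so a programmatic sketch has to fake the argument vector (the program name here is hypothetical):

import sys

sys.argv = ["trainingcorpusmaker", "sme", "smj"]
args = parse_options()
print(args.langs)  # ['sme', 'smj']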