Skip to content

compile_cwb_mono

Take the "korp-ready" files in a "corpus-xxx/korp_mono" directory of a corpus, and create the binary CWB files (the files that are in data/ and registry/).

Category dataclass

A category in a stage of a corpus. The category may exist in only the open corpus, only the closed corpus, or both. Its directory is corpus-xxx[-x-closed]/STAGE/CATEGORY.

Source code in corpustools/compile_cwb_mono.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
@dataclass
class Category:
    """One category directory inside a stage of a language's corpora.

    A category can be present in the open corpus, in the closed corpus,
    or in both. On disk it is the directory
    corpus-xxx[-x-closed]/STAGE/CATEGORY
    """

    # the language; used to build the corpus-xxx directory name
    lang: str
    # the stage: "converted", "analysed", "korp_mono", etc...
    stage: str
    # the category: "admin", "blogs", "facta", "ficit", "laws", etc...
    category: str

    # the directory in which all the corpus-xxx directories are kept
    root: Path

    # whether the category is present in the open corpus, and whether it
    # is present in the closed corpus.
    # invariant: at least one of the two is True (if both were False,
    #   there would be no such category at all)
    in_open: bool
    in_closed: bool

    def __post_init__(self):
        """Reject a category that is in neither the open nor the closed
        corpus."""
        if not (self.in_open or self.in_closed):
            raise ValueError(
                "either in_open must be true, or in_closed must be true, "
                "or both. they cannot both be False"
            )

    def directory(self, openorclosed):
        """The Path to this category's directory in the open or the closed
        corpus, as selected by `openorclosed` ("open" or "closed")."""
        corpus_dir = self.root / corpusname(self.lang, False, openorclosed)
        return corpus_dir / self.stage / self.category

    def files(self, suffix=""):
        """Yield every path below the category directory whose name ends
        with `suffix`, first from the open corpus, then from the closed
        one (each only if the category exists there)."""
        pattern = f"**/*{suffix}"
        for which in ("open", "closed"):
            present = self.in_open if which == "open" else self.in_closed
            if present:
                yield from self.directory(which).glob(pattern)

directory(openorclosed)

The Path to the category directory in the open or the closed corpus, as selected by openorclosed ("open" or "closed").

Source code in corpustools/compile_cwb_mono.py
112
113
114
115
def directory(self, openorclosed):
    """The Path to this category's directory in the open or the closed
    corpus, as selected by `openorclosed` ("open" or "closed")."""
    corpus_dir = self.root / corpusname(self.lang, False, openorclosed)
    return corpus_dir / self.stage / self.category

Corp dataclass

Source code in corpustools/compile_cwb_mono.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
@dataclass
class Corp:
    """The open and/or closed corpus of one language, below a common root
    directory."""

    # the language; used to build the corpus-xxx directory names
    lang: str
    # the directory in which the corpus-xxx directories are kept
    root: Path

    @classmethod
    def from_root_and_lang(cls, root: Path, lang: str):
        """Create a Corp for `lang` below `root` (which is resolved first).

        Returns None if neither the open nor the closed corpus directory
        of that language is a directory in `root`.
        """
        root = root.resolve()
        candidates = (
            root / corpusname(lang, False, "open"),
            root / corpusname(lang, False, "closed"),
        )
        if any(candidate.is_dir() for candidate in candidates):
            return cls(lang, root)
        return None

    def categories(self, stage: str):
        """Yield a Category for every category of a stage, that is, for
        every subdirectory name found in the open and/or the closed
        corpus of this language. The directories looked at are:
           corpus-{self.lang}/{stage}/SUBDIR
           corpus-{self.lang}-x-closed/{stage}/SUBDIR
        Each yielded Category records whether it was seen in the open
        corpus, the closed corpus, or both.
        """

        # category name -> which of "open" / "closed" it was found in
        found_in = defaultdict(list)
        for which in ("open", "closed"):
            stage_dir = self.root / corpusname(self.lang, False, which) / stage
            if stage_dir.is_dir():
                # a missing stage directory just contributes nothing;
                # plain files inside the stage directory are ignored
                for entry in stage_dir.iterdir():
                    if entry.is_dir():
                        found_in[entry.name].append(which)

        for name, where in found_in.items():
            yield Category(
                self.lang,
                stage,
                name,
                self.root,
                "open" in where,
                "closed" in where,
            )

categories(stage)

Get all categories for a stage, that is, the set of all subdirectories in both the open and closed corpora of this language. That would be all of these subdirectories: corpus-{self.lang}/{stage}/SUBDIR and corpus-{self.lang}-x-closed/{stage}/SUBDIR.

Source code in corpustools/compile_cwb_mono.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
def categories(self, stage: str):
    """Yield a Category for every category of a stage, that is, for
    every subdirectory name found in the open and/or the closed
    corpus of this language. The directories looked at are:
       corpus-{self.lang}/{stage}/SUBDIR
       corpus-{self.lang}-x-closed/{stage}/SUBDIR
    Each yielded Category records whether it was seen in the open
    corpus, the closed corpus, or both.
    """

    # category name -> which of "open" / "closed" it was found in
    found_in = defaultdict(list)
    for which in ("open", "closed"):
        stage_dir = self.root / corpusname(self.lang, False, which) / stage
        if stage_dir.is_dir():
            # a missing stage directory just contributes nothing;
            # plain files inside the stage directory are ignored
            for entry in stage_dir.iterdir():
                if entry.is_dir():
                    found_in[entry.name].append(which)

    for name, where in found_in.items():
        yield Category(
            self.lang,
            stage,
            name,
            self.root,
            "open" in where,
            "closed" in where,
        )

Corpus dataclass

Source code in corpustools/compile_cwb_mono.py
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
@dataclass
class Corpus:
    """A corpus, or a selected part of one, as identified from a path.

    Use `Corpus.from_path()` to build one from a path that contains a
    'corpus-LANG[...]' folder.
    """

    # The original path that was given to us. This is kept because if only
    # a specific directory is given, we only want to recurse starting from
    # that directory. If only a single file is given, we only want to
    # process that single file
    path: Path

    lang: str

    # directory where corpus-LANG[-orig][-x-closed] directories is stored
    root_dir: Path

    # the path to the orig folder
    orig_dir: Path

    # the path to the other folder (the ones with converted, analysed, etc)
    processed_dir: Path

    # is the corpora a closed one (i.e. "not open", one that uses source
    # material bound by copyright or such things)
    closed: bool

    # Which module we have selected, module being "converted", "analysed", etc
    module: Module | None

    # which category we have selected, if any
    category: str | None = None

    # a specific subpath inside of the category that is selected.
    # if given, only recurses from this directory
    # (from_path() stores a Path here)
    subpath: Path | str | None = None

    def has_module(self, module: Module):
        """Returns True if this corpus has module `module`, else False"""
        return (self.processed_dir / module).is_dir()

    @staticmethod
    def from_path(path):
        """Create a Corpus from a path (str or Path) that lies in or below
        a corpus directory.

        Raises:
            FileNotFoundError: if the path does not exist (it is resolved
                strictly)
            ValueError: if no corpus directory was found in the path
        """
        path = Path(path).resolve(strict=True)
        return Corpus(*Corpus._find_corpus_folder(path))

    @staticmethod
    def _find_corpus_folder(path):
        """Find the corpus directory in the given path.

        The first path component on the form 'corpus-LANG[...]' is taken
        to be the corpus directory. The components after it are, in
        order, the module, the category, and the subpath.

        Args:
            path (pathlib.Path): the path to search

        Raises:
            ValueError: if no corpus directory was found

        Returns:
            (tuple): (path, lang, root_dir, orig_dir, processed_dir,
                closed, module, category, subpath), i.e. the fields of
                Corpus, in order
        """
        prefix = "corpus-"
        parts = path.parts
        for idx, folder in enumerate(parts):
            if not folder.startswith(prefix):
                continue

            # the language is what comes between "corpus-" and the next
            # "-" (or the end of the name, if there is no further "-")
            lang = folder.removeprefix(prefix).partition("-")[0]
            if not lang:
                # a folder such as "corpus-" has no language code, so it
                # is not a corpus directory. keep looking
                continue

            closed = folder.endswith("-x-closed")
            closed_suffix = "-x-closed" if closed else ""
            root_dir = Path(*parts[0:idx])
            orig_dir = root_dir / f"corpus-{lang}-orig{closed_suffix}"
            processed_dir = root_dir / f"corpus-{lang}{closed_suffix}"

            module = None if idx + 1 >= len(parts) else parts[idx + 1]
            category = None if idx + 2 >= len(parts) else parts[idx + 2]
            subpath = None if idx + 3 >= len(parts) else Path(*parts[idx + 3 :])

            return (
                path,
                lang,
                root_dir,
                orig_dir,
                processed_dir,
                closed,
                module,
                category,
                subpath,
            )

        raise ValueError(
            f"no corpus directory found in path {Path(*parts)}\n"
            "Hint: The first folder in the path that is named in the form "
            "'corpus-LANG[...]' (LANG being the language code), is "
            "considered the corpus directory. In the path given, no such "
            "folder was found"
        )

    def categories(self):
        """Yield one Corpus per category.

        If a category is already selected, only `self` is yielded.
        Otherwise a new Corpus, with module "korp_mono", is yielded for
        every subdirectory of <processed_dir>/korp_mono.
        """
        if self.category:
            # only a specific category selected, so only yield one result
            yield self
            return

        # iterate over all categories in
        # CORPUS_ROOT/corpus-xxx/korp_mono/<category>
        for entry in (self.processed_dir / "korp_mono").iterdir():
            if not entry.is_dir():
                # a category is a directory. stray files in korp_mono
                # are not categories
                continue
            yield Corpus(
                path=self.path,
                lang=self.lang,
                root_dir=self.root_dir,
                orig_dir=self.orig_dir,
                processed_dir=self.processed_dir,
                closed=self.closed,
                module="korp_mono",
                category=entry.name,
                subpath=None,
            )

    def iter_files(self, suffix=""):
        """Yield every path whose name ends with `suffix`, recursively,
        below the most specific selected directory: processed_dir,
        narrowed down by module, category and subpath, where given."""
        directory = self.processed_dir
        if self.module:
            directory /= self.module
        if self.category:
            directory /= self.category
        if self.subpath is not None:
            directory /= self.subpath
        yield from directory.glob(f"**/*{suffix}")

categories()

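Yields one Corpus object per category: the corpus itself if a category is already selected, otherwise one for each category found in the korp_mono directory.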

Source code in corpustools/compile_cwb_mono.py
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
def categories(self):
    """Yield one Corpus per category.

    If a category is already selected, only `self` is yielded.
    Otherwise a new Corpus, with module "korp_mono", is yielded for
    every subdirectory of <processed_dir>/korp_mono.
    """
    if self.category:
        # only a specific category selected, so only yield one result
        yield self
        return

    # iterate over all categories in
    # CORPUS_ROOT/corpus-xxx/korp_mono/<category>
    for entry in (self.processed_dir / "korp_mono").iterdir():
        if not entry.is_dir():
            # a category is a directory. stray files in korp_mono
            # are not categories
            continue
        yield Corpus(
            path=self.path,
            lang=self.lang,
            root_dir=self.root_dir,
            orig_dir=self.orig_dir,
            processed_dir=self.processed_dir,
            closed=self.closed,
            module="korp_mono",
            category=entry.name,
            subpath=None,
        )

has_module(module)

Returns True if this corpus has the module `module`, else False.

Source code in corpustools/compile_cwb_mono.py
221
222
223
def has_module(self, module: Module):
    """Tell whether this corpus has the module `module`, i.e. whether
    there is a directory with that name in the processed directory."""
    module_dir = self.processed_dir / module
    return module_dir.is_dir()

corpusname(lang, origs=False, openorclosed='open')

The name of a corpus directory for a given lang, whether or not it is origs, or open or closed.

Source code in corpustools/compile_cwb_mono.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def corpusname(lang: str, origs=False, openorclosed="open"):
    """The name of a corpus directory for a given lang,
    whether or not it is origs, or open or closed.

    Args:
        lang: the language, placed right after "corpus-"
        origs: if true, name the "-orig" directory. Any truthy value
            counts, not only the bool True.
        openorclosed: "open" or "closed". "closed" adds "-x-closed"

    Returns:
        one of corpus-LANG, corpus-LANG-orig, corpus-LANG-x-closed or
        corpus-LANG-orig-x-closed

    Raises:
        ValueError: if openorclosed is neither "open" nor "closed"
    """
    if openorclosed not in ("open", "closed"):
        raise ValueError("openorclosed must be 'open' or 'closed'")

    # built from truthiness rather than by matching on literal True/False,
    # which would let a value like 1 fall through and give None
    orig_part = "-orig" if origs else ""
    closed_part = "-x-closed" if openorclosed == "closed" else ""
    return f"corpus-{lang}{orig_part}{closed_part}"

create_korp_settings(korp_config_dir, vrt_file)

Create the Korp backend corpus config (.yaml) file that Korp needs, i.e. KORP_CORPUS_CONFIG_DIR/corpora/LANG_CATEGORY_DATE.yaml

Fill it with default values

Source code in corpustools/compile_cwb_mono.py
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
def create_korp_settings(korp_config_dir, vrt_file):
    """Create the Korp backend corpus config (.yaml) file that Korp needs,
    i.e. KORP_CORPUS_CONFIG_DIR/corpora/LANG_CATEGORY_DATE.yaml

    Fill it with default values: KORP_SETTINGS_TEMPLATE, formatted with
    the corpus id (the stem of the .vrt file name), and the title and
    description registered for the category (or the "__DEFAULT__" ones,
    if there are none for the category).

    Args:
        korp_config_dir: Path to the Korp corpus config directory. The
            file is written to its "corpora" subdirectory.
        vrt_file: Path to the .vrt file, named lang_category_date.vrt

    Raises:
        IndexError: if the name of the .vrt file contains no "_"
    """
    category = vrt_file.name.split("_")[1]  # lang_category_date
    title_and_description = CORPUS_CONFIG_TITLE_AND_DESCRIPTIONS.get(
        category,
        CORPUS_CONFIG_TITLE_AND_DESCRIPTIONS["__DEFAULT__"],
    )
    corpus_id = vrt_file.stem
    file_contents = KORP_SETTINGS_TEMPLATE.format(
        corpus_id=corpus_id,
        title_and_description=title_and_description,
    )
    file = (korp_config_dir / "corpora" / vrt_file.name).with_suffix(".yaml")
    print(f"Write korp-backend yaml config to {file}")
    # explicit utf-8, so that what ends up in the file does not depend on
    # the locale of whoever runs this
    with open(file, "w", encoding="utf-8") as f:
        f.write(file_contents)

encode_corpus(vrt_file, date, lang, data_dir, registry_dir, cwb_binaries_directory)

Run the CWB tools on the given .vrt file, to create the data/ and registry/ folder contents for a corpus.

Parameters:

Name Type Description Default
vrt_file Path

the .vrt file for a corpus, as output by the previous steps.

required
date date

The date that we created this corpus

required
lang str

which language this corpus is in. 3-letter language code

required
cwb_binaries_directory Path

path to where the CWB binaries are located

required
target_directory Path

path to the directory where the final encoded corpus resides (the directory that has subfolders data/ and registry/)

required
Source code in corpustools/compile_cwb_mono.py
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
def encode_corpus(
    vrt_file: Path,
    date: date,
    lang: str,
    data_dir: Path,
    registry_dir: Path,
    cwb_binaries_directory: Path,
):
    """Run the CWB tools on a .vrt file, to create the data/ and registry/
    folder contents for a corpus.

    The corpus is named after the .vrt file: its file name, up to the
    first ".".

    Args:
        vrt_file (Path): the .vrt file of the corpus
        date (date): The date that we created this corpus. Written to the
            .info file as "Updated"
        lang (str): which language this corpus is in. Passed on to
            update_registry()
        data_dir (Path): the data/ directory. The directory
            data_dir/<corpus name> is created if it is missing, and
            the .info file is written to it
        registry_dir (Path): the registry/ directory
        cwb_binaries_directory (Path): path to where the CWB binaries are
            located

    Raises:
        ValueError: if the name of the .vrt file contains no "."
    """

    n_sentences, first_date, last_date = read_vrt_xml(vrt_file)

    file_name = vrt_file.name
    corpus_name = file_name[: file_name.index(".")]
    # cwb_makeall, cwb_huffcode and cwb_compress_rdx are given the
    # name in upper case
    registry_name = corpus_name.upper()
    # in metadata: id name title description lang updated
    # TODO this is supposed to be the "NAME" field in the file registry/<corpus>/<id>

    # sh loc_encode_gt_corpus_20181106.sh "$input_data" "$date" "$ln" "$lang_code" "$corpus_name" "$fd" "$ld"

    corpus_data_dir = data_dir / corpus_name
    corpus_data_dir.mkdir(parents=True, exist_ok=True)
    with (corpus_data_dir / ".info").open("w") as info_file:
        info_file.write(
            f"Sentences: {n_sentences}\n"
            f"Updated: {date}\n"
            f"FirstDate: {first_date}\n"
            f"LastDate: {last_date}\n"
        )

    cwb_encode(
        cwb_binaries_directory, vrt_file, corpus_name, data_dir, registry_dir
    )
    cwb_makeall(cwb_binaries_directory, registry_dir, registry_name)
    cwb_huffcode(cwb_binaries_directory, registry_dir, registry_name)
    cwb_compress_rdx(cwb_binaries_directory, registry_dir, registry_name)
    rm_unneeded_data_files(data_dir, corpus_name)
    update_registry(
        registry_dir, corpus_name, "DESCRIPTIVE " + corpus_name, lang
    )

read_vrt_xml(vrt_file)

Read a (xml based) .vrt file, and return the number of sentences it contains, as well as the first and last date of the texts

Source code in corpustools/compile_cwb_mono.py
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
def read_vrt_xml(vrt_file):
    """Read a (xml based) .vrt file, and return the number of sentences it
    contains, as well as the first and last date of the texts.

    The dates are read from the "datefrom" attribute of the "text"
    elements directly below the root element. Texts where it is missing,
    empty, or not an ISO format date, are left out. Sentences are
    counted at any depth below the root element.

    Returns:
        (tuple): (n_sentences, first_date, last_date). The dates are
            datetime.date objects, or None if no text had a usable date
    """
    tree = ET.parse(vrt_file)

    dates = []
    for text_el in tree.findall("text"):
        # .get(), because a text without the attribute is treated the
        # same way as one where it is empty
        datefrom = text_el.get("datefrom")
        if not datefrom:
            continue
        try:
            dates.append(date.fromisoformat(datefrom))
        except ValueError:
            # not a date we can read. leave this text out
            continue

    # ".//" to also find the sentences that are inside of the texts,
    # and not only those directly below the root element
    n_sentences = sum(1 for _ in tree.iterfind(".//sentence"))

    first_date = min(dates, default=None)
    last_date = max(dates, default=None)

    return n_sentences, first_date, last_date