Skip to content

compile_cwb_mono

Take the "korp-ready" files in a "corpus-xxx/korp_mono" directory of a corpus, and create the binary CWB files (the files that are in data/ and registry/).

Corpus dataclass

Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
@dataclass
class Corpus:
    """A giella-style corpus directory, resolved from a filesystem path.

    Use `Corpus.from_path()` to construct one: it locates the corpus
    root directory (the first path component named "corpus-LANG[...]")
    and derives every other field from it.
    """

    # The original path that was given to us. This is kept because if only
    # a specific directory is given, we only want to recurse starting from
    # that directory. If only a single file is given, we only want to
    # process that single file
    path: Path

    # the language code parsed out of the "corpus-LANG[...]" folder name
    lang: str

    # root corpus directory (some people call it "corpus", other "corpora",
    # others maybe "giella-corpora", who knows)
    root_dir: Path

    # the path to the orig folder
    orig_dir: Path

    # the path to the other folder (the ones with converted, analysed, etc)
    processed_dir: Path

    # is the corpora a closed one (i.e. "not open", one that uses source
    # material bound by copyright or such things)
    closed: bool

    # Which module we have selected, module being "converted", "analysed", etc
    module: Module | None

    # which category we have selected, if any
    category: str | None = None

    # a specific subpath inside of the category that is selected.
    # if given, only recurses from this directory
    # (annotation widened: _find_corpus_folder() passes a Path here)
    subpath: str | Path | None = None

    def has_module(self, module: Module):
        """Returns True if this corpus has module `module`, else False"""
        return (self.processed_dir / module).is_dir()

    @staticmethod
    def from_path(path):
        """Create a Corpus from a path (str or pathlib.Path).

        Raises:
            FileNotFoundError: if the path does not exist
                (resolve() is strict)
            ValueError: if no corpus directory is found in the path
        """
        if isinstance(path, str):
            path = Path(path)
        path = path.resolve(strict=True)

        info = Corpus._find_corpus_folder(path)
        return Corpus(*info)

    @staticmethod
    def _find_corpus_folder(path):
        """Find the corpus directory in the given path.

        Args:
            path (pathlib.Path): the path to search

        Raises:
            ValueError: if no corpus directory was found

        Returns:
            (tuple): the Corpus constructor arguments, in order:
                (path, lang, root_dir, orig_dir, processed_dir, closed,
                module, category, subpath)
        """
        parts = path.parts
        for idx, folder in enumerate(parts):
            # len("corpus-") == 7, so the language code starts at index 7
            if len(folder) >= 7 and folder.startswith("corpus-"):
                try:
                    lang_end_index = folder.index("-", 7)
                    # BUGFIX: was folder[7:lang_end_index + 1], which kept
                    # the trailing "-" in the language code (e.g. "sme-"
                    # for "corpus-sme-orig"), producing malformed sibling
                    # paths such as "corpus-sme--orig"
                    lang = folder[7:lang_end_index]
                except ValueError:
                    # no "-" after position 7: folder is plain "corpus-LANG"
                    lang = folder[7:]

                closed = folder.endswith("-x-closed")
                root_dir = Path(*parts[0:idx])
                if not closed:
                    orig_dir = root_dir / f"corpus-{lang}-orig"
                    processed_dir = root_dir / f"corpus-{lang}"
                else:
                    orig_dir = root_dir / f"corpus-{lang}-orig-x-closed"
                    processed_dir = root_dir / f"corpus-{lang}-x-closed"

                # path components below the corpus folder select the
                # module / category / subpath, when present
                module = None if idx + 1 >= len(parts) else parts[idx + 1]
                category = None if idx + 2 >= len(parts) else parts[idx + 2]
                subpath = None if idx + 3 >= len(parts) else Path(*parts[idx + 3:])

                return (path, lang, root_dir, orig_dir, processed_dir, closed,
                        module, category, subpath)

        raise ValueError(
            f"no corpus directory found in path {Path(*parts)}\n"
            "Hint: The first folder in the path that is named in the form "
            "'corpus-LANG[...]' (LANG being the language code), is "
            "considered the corpus directory. In the path given, no such "
            "folder was found"
        )

    def categories(self):
        """Yields category dictionaries"""
        if self.category:
            # only a specific category selected, so only yield one result
            yield self
        else:
            # iterate over all categories in
            # CORPUS_ROOT/corpus-xxx/korp_mono/<category>
            for p in (self.processed_dir / "korp_mono").iterdir():
                yield Corpus(
                    path=self.path,
                    lang=self.lang,
                    root_dir=self.root_dir,
                    orig_dir=self.orig_dir,
                    processed_dir=self.processed_dir,
                    closed=self.closed,
                    module="korp_mono",
                    category=p.parts[-1],
                    subpath=None,
                )

    def iter_files(self, suffix=""):
        """Yield every file under the selected module/category/subpath
        whose name ends with `suffix` (all files when suffix is empty)."""
        directory = self.processed_dir
        if self.module:
            directory /= self.module
        if self.category:
            directory /= self.category
        if self.subpath is not None:
            directory /= self.subpath
        yield from directory.glob(f"**/*{suffix}")

categories()

Yields category dictionaries

Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def categories(self):
    """Yield one Corpus per category.

    If a category is already selected, yield only this corpus itself;
    otherwise yield a Corpus for each directory found under
    processed_dir/korp_mono.
    """
    if self.category:
        # a specific category was selected -- nothing to enumerate
        yield self
        return

    korp_mono_dir = self.processed_dir / "korp_mono"
    for entry in korp_mono_dir.iterdir():
        yield Corpus(
            path=self.path,
            lang=self.lang,
            root_dir=self.root_dir,
            orig_dir=self.orig_dir,
            processed_dir=self.processed_dir,
            closed=self.closed,
            module="korp_mono",
            category=entry.name,
            subpath=None,
        )

has_module(module)

Returns True if this corpus has module `module`, else False

Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
84
85
86
def has_module(self, module: Module):
    """Return True when the directory for `module` exists under the
    processed corpus directory, False otherwise."""
    module_dir = self.processed_dir / module
    return module_dir.is_dir()

concat_corpus(corpus, date, parallel=None)

Concatenate all the vrt files in a corpus, and store it in one file.

This function replaces what the compile_corpus.xsl script does.

Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
@timed
def concat_corpus(corpus, date, parallel=None):
    """Concatenate all the vrt files in a corpus, and store it in one file.

    This function replaces what the compile_corpus.xsl script does.

    Args:
        corpus (Corpus): the corpus whose categories will be concatenated
        date (datetime.date): build date, used in output directory and
            corpus id names
        parallel: accepted for interface compatibility; not used here
    """
    print("Concatenating corpora...")
    # e.g. 2023-01-31 -> "20230131"
    date_s = str(date).replace('-', '')
    compiled_directory = Path(f"vrt_{corpus.lang}_{date_s}")
    if compiled_directory.exists():
        remove_directory_contents(compiled_directory)
    compiled_directory.mkdir(exist_ok=True)

    errors = []

    for corpus_category in corpus.categories():
        category = corpus_category.category
        corpus_id = f"{corpus.lang}_{category}_{date_s}"
        print(f"  processing corpus {corpus_id}...")
        root_element = ET.Element("corpus")
        root_element.attrib["id"] = corpus_id
        n_tot_sentences, n_tot_tokens = 0, 0
        text_num = 1
        file_list = list(corpus_category.iter_files(suffix="xml"))
        nfiles = len(file_list)

        for i, file in enumerate(file_list, start=1):
            print(f"    processing file [{i}/{nfiles}] {file}...",
                  end=" ", flush=True)
            try:
                text_el, n_sentences, n_tokens = process_input_xml(
                        file, category, text_num)
            except ET.ParseError:
                errors.append(f"file {file} could not be parsed (invalid xml?)")
                print("failed (could not parse xml)")
            else:
                # NOTE(review): Element truthiness is False both for None
                # and for an element without children, so a <text> with no
                # sentences is reported as "had no text" -- confirm intended
                if text_el:
                    text_num += 1
                    root_element.append(text_el)
                    n_tot_tokens += n_tokens
                    n_tot_sentences += n_sentences
                    print("done")
                else:
                    errors.append(f"file {file} had no text")
                    print("failed (file contains no text)")

        ET.indent(root_element, "")
        # FIX: dropped the redundant Path() wrapper (the / operator already
        # yields a Path) and pinned the encoding so the .vrt output does
        # not depend on the platform default
        with open(compiled_directory / f"{corpus_id}.vrt", "w",
                  encoding="utf-8") as f:
            f.write(ET.tostring(root_element, encoding="unicode"))

    for error in errors:
        print(error)

create_korp_settings(korp_corpus_config_dir, vrt_directory, corpus_name)

Create the Korp corpus config .yaml file that Korp needs, i.e. KORP_CORPUS_CONFIG_DIR/corpora/LANG_CATEGORY_DATE.yaml

Fill it with default values

Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
def create_korp_settings(korp_corpus_config_dir, vrt_directory, corpus_name):
    """Create the Korp corpus config-.yaml file that Korp needs, i.e.
    KORP_CORPUS_CONFIG_DIR/corpora/LANG_CATEGORY_DATE.yaml

    Fill it with default values

    Args:
        korp_corpus_config_dir (Path): directory where Korp corpus config
            files live; the .yaml file is written here
        vrt_directory (str): name of the vrt directory (e.g.
            "vrt_fao_20230131"); everything before the last "_" becomes
            the default title
        corpus_name (str): corpus id; also used as the output file stem

    Raises:
        NotImplementedError: always -- this function is a stub; the code
            below the raise is an unreachable draft kept for later
    """
    raise NotImplementedError
    # --- everything below is dead code (never executed) ---
    # create the Korp settings files,
    # vrt_fao_DATE
    default_title = " ".join(vrt_directory.split("_")[:-1])
    data = dedent(f"""
    description: blank
    id: {corpus_name}
    mode:
    - name: default
    title: {default_title}
    context:
      - label:
          eng: 1 sentence
          nob: 1 mening
        value: 1 sentence
    within:
      - label:
          eng: sentence
          swe: mening
        value: sentence
    """).strip()
    file = (korp_corpus_config_dir / corpus_name).with_suffix(".yaml")
    with open(file, "w") as f:
        f.write(data)

encode_corpus(vrt_directory, date, lang, data_dir, registry_dir, cwb_binaries_directory)

Run the CWB tools on the given folder that contains .vrt files, to create the data/ and registry/ folder contents for a corpus.

Parameters:

Name Type Description Default
vrt_directory Path

the output directory from the previous steps, that contains a .vrt file for a corpus.

required
date date

The date that we created this corpus

required
lang str

which language this corpus is in. 3-letter language code

required
cwb_binaries_directory Path

path to where the CWB binaries are located

required
target_directory Path

path to the directory where the final encoded corpus resides (the directory that has subfolders data/ and registry/)

required
Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
def encode_corpus(
    vrt_directory: Path,
    date: date,
    lang: str,
    data_dir: Path,
    registry_dir: Path,
    cwb_binaries_directory: Path,
):
    """Run the CWB tools on the given folder that contains .vrt files, to
    create the data/ and registry/ folder contents for a corpus.

    Args:
        vrt_directory (Path): the output directory from the previous steps,
            that contains a .vrt file for a corpus.
        date (date): The date that we created this corpus
        lang (str): which language this corpus is in. 3-letter language code
        data_dir (Path): the CWB data/ directory; one subdirectory per
            corpus is created here
        registry_dir (Path): the CWB registry/ directory that the cwb
            tools write their registry entries to
        cwb_binaries_directory (Path): path to where the CWB binaries are
            located
    """

    for vrt_file in vrt_directory.iterdir():
        print(f"{vrt_file.name}...")
        n_sentences, first_date, last_date = read_vrt_xml(vrt_file)
        # FIX: split() instead of index() -- index() raises ValueError on a
        # file name without a "."; split() just keeps the whole name
        corpus_name = vrt_file.name.split(".")[0]
        upper_corpus_name = corpus_name.upper()
        # in metadata: id name title description lang updated
        # TODO this is supposed to be the "NAME" field in the file registry/<corpus>/<id>
        long_name = ""  # in the metadata file, indexed by id=corpus_name

        corpus_data_dir = data_dir / corpus_name
        corpus_data_dir.mkdir(parents=True, exist_ok=True)
        # encoding pinned so the .info file is platform-independent
        with open(corpus_data_dir / ".info", "w", encoding="utf-8") as f:
            f.write(
                f"Sentences: {n_sentences}\nUpdated: {date}\n"
                f"FirstDate: {first_date}\nLastDate: {last_date}\n"
            )

        cwb_encode(vrt_file, corpus_name, cwb_binaries_directory, data_dir, registry_dir)
        cwb_makeall(cwb_binaries_directory, registry_dir, upper_corpus_name)
        cwb_huffcode(cwb_binaries_directory, registry_dir, upper_corpus_name)
        cwb_compress_rdx(cwb_binaries_directory, registry_dir, upper_corpus_name)
        rm_unneeded_data_files(data_dir, corpus_name)
        DESCRIPTIVE_NAME = "DESCRIPTIVE " + corpus_name
        update_registry(registry_dir, corpus_name, DESCRIPTIVE_NAME, lang)

read_vrt_xml(vrt_file)

Read a (xml based) .vrt file, and return the number of sentences it contains, as well as the first and last date of the texts

Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
def read_vrt_xml(vrt_file):
    """Read a (xml based) .vrt file, and return the number of sentences it
    contains, as well as the first and last date of the texts.

    Args:
        vrt_file: path to the .vrt file (anything ET.parse() accepts)

    Returns:
        (tuple): (n_sentences, first_date, last_date). The dates are
            datetime.date objects, or both None when no <text> element
            carries a parsable "datefrom" attribute.
    """
    xml_root = ET.parse(vrt_file)
    dates = []
    for text_el in xml_root.findall("text"):
        # ROBUSTNESS: .get() instead of .attrib[...] so a <text> without a
        # "datefrom" attribute is skipped instead of raising KeyError
        datefrom = text_el.get("datefrom")
        if datefrom:
            try:
                dates.append(date.fromisoformat(datefrom))
            except ValueError:
                # not a valid ISO date -- ignore this text's date
                pass
    # BUGFIX: was findall("sentence"), which only matches *direct children*
    # of the root <corpus> element; the <sentence> elements sit inside
    # <text> elements, so the count was always 0. ".//sentence" searches
    # the whole subtree.
    n_sentences = len(xml_root.findall(".//sentence"))

    if not dates:
        first_date = None
        last_date = None
    else:
        # only the endpoints are needed; min/max avoids sorting
        first_date = min(dates)
        last_date = max(dates)

    return n_sentences, first_date, last_date