Skip to content

compile_cwb_mono

Take the "korp-ready" files in a "corpus-xxx/korp_mono" directory of a corpus, and create the binary CWB files (the files that are in data/ and registry/).

Corpus dataclass

Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
@dataclass
class Corpus:
    """A giella-style corpus directory, resolved from a filesystem path.

    Use `Corpus.from_path()` to construct one: it locates the corpus
    root directory (the first path component named "corpus-LANG[...]")
    and derives every other field from it.
    """

    # The original path that was given to us. This is kept because if only
    # a specific directory is given, we only want to recurse starting from
    # that directory. If only a single file is given, we only want to
    # process that single file
    path: Path

    # the language code parsed out of the "corpus-LANG[...]" folder name
    lang: str

    # root corpus directory (some people call it "corpus", other "corpora",
    # others maybe "giella-corpora", who knows)
    root_dir: Path

    # the path to the orig folder
    orig_dir: Path

    # the path to the other folder (the ones with converted, analysed, etc)
    processed_dir: Path

    # is the corpora a closed one (i.e. "not open", one that uses source
    # material bound by copyright or such things)
    closed: bool

    # Which module we have selected, module being "converted", "analysed", etc
    module: Module | None

    # which category we have selected, if any
    category: str | None = None

    # a specific subpath inside of the category that is selected.
    # if given, only recurses from this directory
    # (annotation widened: _find_corpus_folder() passes a Path here)
    subpath: str | Path | None = None

    def has_module(self, module: Module):
        """Returns True if this corpus has module `module`, else False"""
        return (self.processed_dir / module).is_dir()

    @staticmethod
    def from_path(path):
        """Create a Corpus from a path (str or pathlib.Path).

        Raises:
            FileNotFoundError: if the path does not exist
                (resolve() is strict)
            ValueError: if no corpus directory is found in the path
        """
        if isinstance(path, str):
            path = Path(path)
        path = path.resolve(strict=True)

        info = Corpus._find_corpus_folder(path)
        return Corpus(*info)

    @staticmethod
    def _find_corpus_folder(path):
        """Find the corpus directory in the given path.

        Args:
            path (pathlib.Path): the path to search

        Raises:
            ValueError: if no corpus directory was found

        Returns:
            (tuple): the Corpus constructor arguments, in order:
                (path, lang, root_dir, orig_dir, processed_dir, closed,
                module, category, subpath)
        """
        parts = path.parts
        for idx, folder in enumerate(parts):
            # len("corpus-") == 7, so the language code starts at index 7
            if len(folder) >= 7 and folder.startswith("corpus-"):
                try:
                    lang_end_index = folder.index("-", 7)
                    # BUGFIX: was folder[7:lang_end_index + 1], which kept
                    # the trailing "-" in the language code (e.g. "sme-"
                    # for "corpus-sme-orig"), producing malformed sibling
                    # paths such as "corpus-sme--orig"
                    lang = folder[7:lang_end_index]
                except ValueError:
                    # no "-" after position 7: folder is plain "corpus-LANG"
                    lang = folder[7:]

                closed = folder.endswith("-x-closed")
                root_dir = Path(*parts[0:idx])
                if not closed:
                    orig_dir = root_dir / f"corpus-{lang}-orig"
                    processed_dir = root_dir / f"corpus-{lang}"
                else:
                    orig_dir = root_dir / f"corpus-{lang}-orig-x-closed"
                    processed_dir = root_dir / f"corpus-{lang}-x-closed"

                # path components below the corpus folder select the
                # module / category / subpath, when present
                module = None if idx + 1 >= len(parts) else parts[idx + 1]
                category = None if idx + 2 >= len(parts) else parts[idx + 2]
                subpath = None if idx + 3 >= len(parts) else Path(*parts[idx + 3:])

                return (path, lang, root_dir, orig_dir, processed_dir, closed,
                        module, category, subpath)

        raise ValueError(
            f"no corpus directory found in path {Path(*parts)}\n"
            "Hint: The first folder in the path that is named in the form "
            "'corpus-LANG[...]' (LANG being the language code), is "
            "considered the corpus directory. In the path given, no such "
            "folder was found"
        )

    def categories(self):
        """Yields category dictionaries"""
        if self.category:
            # only a specific category selected, so only yield one result
            yield self
        else:
            # iterate over all categories in
            # CORPUS_ROOT/corpus-xxx/korp_mono/<category>
            for p in (self.processed_dir / "korp_mono").iterdir():
                yield Corpus(
                    path=self.path,
                    lang=self.lang,
                    root_dir=self.root_dir,
                    orig_dir=self.orig_dir,
                    processed_dir=self.processed_dir,
                    closed=self.closed,
                    module="korp_mono",
                    category=p.parts[-1],
                    subpath=None,
                )

    def iter_files(self, suffix=""):
        """Yield every file under the selected module/category/subpath
        whose name ends with `suffix` (all files when suffix is empty)."""
        directory = self.processed_dir
        if self.module:
            directory /= self.module
        if self.category:
            directory /= self.category
        if self.subpath is not None:
            directory /= self.subpath
        yield from directory.glob(f"**/*{suffix}")

categories()

Yields category dictionaries

Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
def categories(self):
    """Yield one Corpus per category.

    If a category is already selected, yield only this corpus itself;
    otherwise yield a Corpus for each directory found under
    processed_dir/korp_mono.
    """
    if self.category:
        # a specific category was selected -- nothing to enumerate
        yield self
        return

    korp_mono_dir = self.processed_dir / "korp_mono"
    for entry in korp_mono_dir.iterdir():
        yield Corpus(
            path=self.path,
            lang=self.lang,
            root_dir=self.root_dir,
            orig_dir=self.orig_dir,
            processed_dir=self.processed_dir,
            closed=self.closed,
            module="korp_mono",
            category=entry.name,
            subpath=None,
        )

has_module(module)

Returns True if this corpus has module `module`, else False

Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
84
85
86
def has_module(self, module: Module):
    """Return True when the directory for `module` exists under the
    processed corpus directory, False otherwise."""
    module_dir = self.processed_dir / module
    return module_dir.is_dir()

concat_corpus(corpus, date, parallel=None)

Concatenate all the vrt files in a corpus, and store it in one file.

This function replaces what the compile_corpus.xsl script does.

Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
@timed
def concat_corpus(corpus, date, parallel=None):
    """Concatenate all the vrt files in a corpus, and store it in one file.

    This function replaces what the compile_corpus.xsl script does.

    Args:
        corpus (Corpus): the corpus whose categories will be concatenated
        date (datetime.date): build date, used in output directory and
            corpus id names
        parallel: accepted for interface compatibility; not used here
    """
    print("Concatenating corpora...")
    # e.g. 2023-01-31 -> "20230131"
    date_s = str(date).replace('-', '')
    compiled_directory = Path(f"vrt_{corpus.lang}_{date_s}")
    if compiled_directory.exists():
        remove_directory_contents(compiled_directory)
    compiled_directory.mkdir(exist_ok=True)

    errors = []

    for corpus_category in corpus.categories():
        category = corpus_category.category
        corpus_id = f"{corpus.lang}_{category}_{date_s}"
        print(f"  processing corpus {corpus_id}...")
        root_element = ET.Element("corpus")
        root_element.attrib["id"] = corpus_id
        n_tot_sentences, n_tot_tokens = 0, 0
        text_num = 1
        file_list = list(corpus_category.iter_files(suffix="xml"))
        nfiles = len(file_list)

        for i, file in enumerate(file_list, start=1):
            print(f"    processing file [{i}/{nfiles}] {file}...",
                  end=" ", flush=True)
            try:
                text_el, n_sentences, n_tokens = process_input_xml(
                        file, category, text_num)
            except ET.ParseError:
                errors.append(f"file {file} could not be parsed (invalid xml?)")
                print("failed (could not parse xml)")
            else:
                # NOTE(review): Element truthiness is False both for None
                # and for an element without children, so a <text> with no
                # sentences is reported as "had no text" -- confirm intended
                if text_el:
                    text_num += 1
                    root_element.append(text_el)
                    n_tot_tokens += n_tokens
                    n_tot_sentences += n_sentences
                    print("done")
                else:
                    errors.append(f"file {file} had no text")
                    print("failed (file contains no text)")

        ET.indent(root_element, "")
        # FIX: dropped the redundant Path() wrapper (the / operator already
        # yields a Path) and pinned the encoding so the .vrt output does
        # not depend on the platform default
        with open(compiled_directory / f"{corpus_id}.vrt", "w",
                  encoding="utf-8") as f:
            f.write(ET.tostring(root_element, encoding="unicode"))

    for error in errors:
        print(error)

create_korp_settings(korp_corpus_config_dir, vrt_directory, corpus_name)

Create the Korp corpus config .yaml file that Korp needs, i.e. KORP_CORPUS_CONFIG_DIR/corpora/LANG_CATEGORY_DATE.yaml

Fill it with default values

Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
def create_korp_settings(korp_corpus_config_dir, vrt_directory, corpus_name):
    """Create the Korp corpus config-.yaml file that Korp needs, i.e.
    KORP_CORPUS_CONFIG_DIR/corpora/LANG_CATEGORY_DATE.yaml

    Fill it with default values

    Args:
        korp_corpus_config_dir (Path): directory where Korp corpus config
            files live; the .yaml file is written here
        vrt_directory (str): name of the vrt directory (e.g.
            "vrt_fao_20230131"); everything before the last "_" becomes
            the default title
        corpus_name (str): corpus id; also used as the output file stem

    Raises:
        NotImplementedError: always -- this function is a stub; the code
            below the raise is an unreachable draft kept for later
    """
    raise NotImplementedError
    # --- everything below is dead code (never executed) ---
    # create the Korp settings files,
    # vrt_fao_DATE
    default_title = " ".join(vrt_directory.split("_")[:-1])
    data = dedent(f"""
    description: blank
    id: {corpus_name}
    mode:
    - name: default
    title: {default_title}
    context:
      - label:
          eng: 1 sentence
          nob: 1 mening
        value: 1 sentence
    within:
      - label:
          eng: sentence
          swe: mening
        value: sentence
    """).strip()
    file = (korp_corpus_config_dir / corpus_name).with_suffix(".yaml")
    with open(file, "w") as f:
        f.write(data)

encode_corpus(vrt_directory, date, lang, data_dir, registry_dir, cwb_binaries_directory)

Run the CWB tools on the given folder that contains .vrt files, to create the data/ and registry/ folder contents for a corpus.

Parameters:

Name Type Description Default
vrt_directory Path

the output directory from the previous steps, that contains a .vrt file for a corpus.

required
date date

The date that we created this corpus

required
lang str

which language this corpus is in. 3-letter language code

required
cwb_binaries_directory Path

path to where the CWB binaries are located

required
target_directory Path

path to the directory where the final encoded corpus resides (the directory that has subfolders data/ and registry/)

required
Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
def encode_corpus(
    vrt_directory: Path,
    date: date,
    lang: str,
    data_dir: Path,
    registry_dir: Path,
    cwb_binaries_directory: Path,
):
    """Run the CWB tools on the given folder that contains .vrt files, to
    create the data/ and registry/ folder contents for a corpus.

    Args:
        vrt_directory (Path): the output directory from the previous steps,
            that contains a .vrt file for a corpus.
        date (date): The date that we created this corpus
        lang (str): which language this corpus is in. 3-letter language code
        data_dir (Path): the CWB data/ directory; one subdirectory per
            corpus is created here
        registry_dir (Path): the CWB registry/ directory that the cwb
            tools write their registry entries to
        cwb_binaries_directory (Path): path to where the CWB binaries are
            located
    """

    for vrt_file in vrt_directory.iterdir():
        print(f"{vrt_file.name}...")
        n_sentences, first_date, last_date = read_vrt_xml(vrt_file)
        # FIX: split() instead of index() -- index() raises ValueError on a
        # file name without a "."; split() just keeps the whole name
        corpus_name = vrt_file.name.split(".")[0]
        upper_corpus_name = corpus_name.upper()
        # in metadata: id name title description lang updated
        # TODO this is supposed to be the "NAME" field in the file registry/<corpus>/<id>
        long_name = ""  # in the metadata file, indexed by id=corpus_name

        corpus_data_dir = data_dir / corpus_name
        corpus_data_dir.mkdir(parents=True, exist_ok=True)
        # encoding pinned so the .info file is platform-independent
        with open(corpus_data_dir / ".info", "w", encoding="utf-8") as f:
            f.write(
                f"Sentences: {n_sentences}\nUpdated: {date}\n"
                f"FirstDate: {first_date}\nLastDate: {last_date}\n"
            )

        cwb_encode(vrt_file, corpus_name, cwb_binaries_directory, data_dir, registry_dir)
        cwb_makeall(cwb_binaries_directory, registry_dir, upper_corpus_name)
        cwb_huffcode(cwb_binaries_directory, registry_dir, upper_corpus_name)
        cwb_compress_rdx(cwb_binaries_directory, registry_dir, upper_corpus_name)
        rm_unneeded_data_files(data_dir, corpus_name)
        DESCRIPTIVE_NAME = "DESCRIPTIVE " + corpus_name
        update_registry(registry_dir, corpus_name, DESCRIPTIVE_NAME, lang)

read_vrt_xml(vrt_file)

Read a (xml based) .vrt file, and return the number of sentences it contains, as well as the first and last date of the texts

Source code in /home/anders/projects/CorpusTools/corpustools/compile_cwb_mono.py
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
def read_vrt_xml(vrt_file):
    """Read a (xml based) .vrt file, and return the number of sentences it
    contains, as well as the first and last date of the texts.

    Args:
        vrt_file: path to the .vrt file (anything ET.parse() accepts)

    Returns:
        (tuple): (n_sentences, first_date, last_date). The dates are
            datetime.date objects, or both None when no <text> element
            carries a parsable "datefrom" attribute.
    """
    xml_root = ET.parse(vrt_file)
    dates = []
    for text_el in xml_root.findall("text"):
        # ROBUSTNESS: .get() instead of .attrib[...] so a <text> without a
        # "datefrom" attribute is skipped instead of raising KeyError
        datefrom = text_el.get("datefrom")
        if datefrom:
            try:
                dates.append(date.fromisoformat(datefrom))
            except ValueError:
                # not a valid ISO date -- ignore this text's date
                pass
    # BUGFIX: was findall("sentence"), which only matches *direct children*
    # of the root <corpus> element; the <sentence> elements sit inside
    # <text> elements, so the count was always 0. ".//sentence" searches
    # the whole subtree.
    n_sentences = len(xml_root.findall(".//sentence"))

    if not dates:
        first_date = None
        last_date = None
    else:
        # only the endpoints are needed; min/max avoids sorting
        first_date = min(dates)
        last_date = max(dates)

    return n_sentences, first_date, last_date