parallelize

Classes and functions to sentence-align two files.

`is_translated_from_lang2(path, lang2)`

Find out if the given doc is translated from lang2.

Source code in corpustools/parallelize.py

def is_translated_from_lang2(path: corpuspath.CorpusPath, lang2: str) -> bool:
    """Tell whether the document's metadata names lang2 as its origin language.

    Args:
        path (corpuspath.CorpusPath): the document whose "translated_from"
            metadata variable is inspected
        lang2 (str): the language code to compare against

    Returns:
        (bool): True only when "translated_from" has a value and that value
            equals lang2
    """
    origin_lang = path.metadata.get_variable("translated_from")

    # A None value never counts as a match, whatever lang2 is.
    return origin_lang is not None and origin_lang == lang2

`main()`

Parallelise files.

Source code in corpustools/parallelize.py

def main():
    """Sentence align every .xml document found under the given sources.

    Each collected file is paired with its counterpart in the language given
    by the -l2 option and handed to parallelise_file.

    Raises:
        SystemExit: if a source document is itself written in the -l2
            language, or if aligning raises util.ArgumentError
    """
    options = parse_options()

    for xml_file in corpuspath.collect_files(options.sources, suffix=".xml"):
        orig_corpuspath = corpuspath.make_corpus_path(xml_file.as_posix())

        # A document cannot be aligned against its own language.
        if orig_corpuspath.lang == options.lang2:
            message = (
                "Error: change the value of the -l2 option.\n"
                f"The -l2 value ({options.lang2}) cannot be the same as the "
                f"language as the source documents ({orig_corpuspath.lang})"
            )
            raise SystemExit(message)

        try:
            para_path, source_path = get_filepair(orig_corpuspath, options.lang2)
        except TypeError:
            # NOTE(review): presumably get_filepair returns something that
            # cannot be unpacked (e.g. None) when there is no pair -- confirm.
            # Any TypeError raised inside get_filepair is skipped as well.
            continue

        try:
            # The dictionary lookup stays inside the try so that its errors
            # are reported the same way as alignment errors.
            anchor_file = options.dict
            if anchor_file is None:
                anchor_file = get_dictionary(
                    lang1=source_path.lang, lang2=para_path.lang
                )
            parallelise_file(source_path, para_path, anchor_file=anchor_file)
        except (OSError, UserWarning) as error:
            # Report the problem and carry on with the next document.
            print(error)
        except util.ArgumentError as error:
            raise SystemExit(
                f"{error}\nMore info here: "
                "https://divvun.github.io/CorpusTools/scripts/parallelize/#compile-dependencies",
            ) from error

`parallelise_file(source_lang_file, para_lang_file, anchor_file=None)`

Align sentences of two parallel files.

Source code in corpustools/parallelize.py

def parallelise_file(
    source_lang_file: corpuspath.CorpusPath,
    para_lang_file: corpuspath.CorpusPath,
    anchor_file: str | None = None,
):
    """Sentence align two parallel documents and write the result as TMX.

    Args:
        source_lang_file (corpuspath.CorpusPath): the source language document
        para_lang_file (corpuspath.CorpusPath): the parallel language document
        anchor_file (str | None): file to load anchor words from; when None
            the anchor word list is left as constructed
    """
    anchors = AnchorWordList()
    if anchor_file is not None:
        anchors.load_from_file(anchor_file)

    source_sentences = sentencedivider.make_valid_sentences(source_lang_file)
    para_sentences = sentencedivider.make_valid_sentences(para_lang_file)
    model = AlignmentModel(
        sentences_tuple=(source_sentences, para_sentences),
        anchor_word_list=anchors,
    )

    sentence_pairs = model.suggest_without_gui().non_empty_pairs()

    tmx_tree = make_tmx(
        file1_name=source_lang_file.orig.name,
        language_pair=(source_lang_file.lang, para_lang_file.lang),
        aligned_text_pairs=sentence_pairs,
    )

    # The output location is derived from the source document and the
    # language of the parallel document.
    output_path = source_lang_file.tmx(para_lang_file.lang)
    serialised = etree.tostring(tmx_tree, pretty_print=True, encoding="utf-8")
    output_path.write_bytes(serialised)
    print(f"TMX file created: {output_path}")

`parse_options()`

Parse the commandline options.

Returns:

Type	Description
`Namespace`	the parsed commandline arguments

Source code in corpustools/parallelize.py

def parse_options():
    """Parse the commandline options.

    Defines one or more positional "sources", the optional -d/--dict seed
    dictionary and the required -l2/--lang2 language.

    Returns:
        (argparse.Namespace): the parsed commandline arguments, with the
            attributes sources, dict and lang2
    """
    parser = argparse.ArgumentParser(
        parents=[argparse_version.parser], description="Sentence align file pairs."
    )

    parser.add_argument(
        "sources",
        nargs="+",
        help="Files or directories to search for parallelisable files",
    )
    # The help text names the options this parser actually defines
    # ("sources" and --lang2), so users are not pointed at flags that do
    # not exist.
    parser.add_argument(
        "-d",
        "--dict",
        default=None,
        help="Use a different bilingual seed dictionary. "
        "Must have two columns, with the language of the source files "
        "first, and the --lang2 language second, separated "
        "by '/'. By default, python_tca2 files are used, but these "
        "files only support pairings between "
        "sme/sma/smj/fin/eng/nob. ",
    )
    parser.add_argument(
        "-l2",
        "--lang2",
        help="Indicate which language the given file should be parallelised with",
        required=True,
    )

    return parser.parse_args()

`setup_anchors(lang1, lang2)`

Set up the anchor file.

Parameters:

Name	Type	Description	Default
`lang1`	`str`	language 1	required
`lang2`	`str`	language 2	required

Returns:

Type	Description
`GenerateAnchorList | None`	The anchor list, or `None` if no anchor file exists for the language pair

Source code in corpustools/parallelize.py

def setup_anchors(
    lang1: str, lang2: str
) -> generate_anchor_list.GenerateAnchorList | None:
    """Build an anchor list from the first anchor file found for the pair.

    The file anchor-{lang1}-{lang2}.txt is tried first, then
    anchor-{lang2}-{lang1}.txt, both under $GTHOME/gt/common/src.

    Args:
        lang1 (str): language 1
        lang2 (str): language 2

    Returns:
        (generate_anchor_list.GenerateAnchorList | None): The anchor list, or
            None when neither anchor file exists

    Raises:
        KeyError: if the GTHOME environment variable is not set
    """
    # The column order passed on follows the order of the languages in the
    # name of the file that was found.
    for first, second in ((lang1, lang2), (lang2, lang1)):
        candidate = os.path.join(
            os.environ["GTHOME"],
            f"gt/common/src/anchor-{first}-{second}.txt",
        )
        if os.path.exists(candidate):
            return generate_anchor_list.GenerateAnchorList(
                lang1, lang2, [first, second], candidate
            )

    return None