Skip to content

normalise_filenames

Normalise the files in the given directory.

main()

Normalise filenames.

Source code in corpustools/normalise_filenames.py
96
97
98
99
def main():
    """Run filename normalisation on each directory given on the command line."""
    args = normalise_parse_args()
    for directory in args.target_dirs:
        normalise(Path(directory))

normalise(target_dir)

Normalise the filenames in the corpuses.

Parameters:

Name Type Description Default
target_dir Path

directory where filenames should be normalised

required
Source code in corpustools/normalise_filenames.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def normalise(target_dir: Path):
    """Normalise the filenames in the corpuses.

    Args:
        target_dir (str): directory where filenames should be normalised
    """
    print(f"Normalising names in {target_dir}")
    files = (
        root / file_
        for root, _, filenames in target_dir.walk()
        for file_ in filenames
        if ".git" not in str(root) and not file_.endswith(".xsl")
    )

    normalised_paths = (
        (
            make_corpus_path(str(file_.with_name(normalise_filename(file_.name)))),
            make_corpus_path(str(file_)),
        )
        for file_ in files
        if normalise_filename(file_.name) != file_.name
    )

    for normalised_path, orig_corpus_path in normalised_paths:
        print(f"\t\tmove {orig_corpus_path.orig} -> {normalised_path.orig}")
        orig_corpus_path.orig.rename(normalised_path.orig)
        if orig_corpus_path.xsl.exists():
            orig_corpus_path.xsl.rename(normalised_path.xsl)
        if orig_corpus_path.converted.exists():
            orig_corpus_path.converted.rename(normalised_path.converted)

        for parallel_path in orig_corpus_path.parallels():
            if parallel_path is not None and parallel_path.exists():
                parallel_corpuspath = make_corpus_path(str(parallel_path))
                parallel_corpuspath.metadata.set_parallel_text(
                    normalised_path.lang, normalised_path.orig.name
                )
                parallel_corpuspath.metadata.write_file()

                if orig_corpus_path.tmx(parallel_corpuspath.lang).exists():
                    orig_corpus_path.tmx(parallel_corpuspath.lang).rename(
                        normalised_path.tmx(parallel_corpuspath.lang)
                    )

normalise_parse_args()

Parse the commandline options.

Returns:

Type Description
Namespace

the parsed commandline arguments

Source code in corpustools/normalise_filenames.py
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def normalise_parse_args():
    """Build the command line parser and parse the given arguments.

    Returns:
        (argparse.Namespace): the parsed commandline arguments
    """
    description = (
        "Program to normalise names in given directories. "
        "The filenames are downcased, non ascii characters are replaced "
        "by ascii ones and some unwanted characters are removed."
    )
    arg_parser = argparse.ArgumentParser(
        parents=[argparse_version.parser],
        description=description,
    )
    # One or more directories are required.
    arg_parser.add_argument(
        "target_dirs",
        nargs="+",
        help="The directory/ies where filenames should be normalised.",
    )

    return arg_parser.parse_args()