Skip to content

realign

Sentence align a given file anew.

convert_and_copy(corpus_path1, corpus_path2)

Reconvert and copy files to prestable/converted.

Parameters:

Name Type Description Default
corpus_path1 make_corpus_path

A CorpusPath representing the lang1 file that should be reconverted.

required
corpus_path2 make_corpus_path

A CorpusPath representing the lang2 file that should be reconverted.

required
Source code in corpustools/realign.py
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def convert_and_copy(corpus_path1, corpus_path2):
    """Reconvert the original files behind two corpus paths.

    Any existing converted file is removed first, then both originals are
    handed to a ConverterManager and converted serially.

    Args:
        corpus_path1 (corpuspath.make_corpus_path): A CorpusPath representing the
            lang1 file that should be reconverted.
        corpus_path2 (corpuspath.make_corpus_path): A CorpusPath representing the
            lang2 file that should be reconverted.
    """
    corpus_paths = (corpus_path1, corpus_path2)

    # Remove previously converted output; a missing file is not an error.
    for stale in corpus_paths:
        stale.converted.unlink(missing_ok=True)

    convertermanager.sanity_check()
    manager = convertermanager.ConverterManager()
    originals = [corpus_path.orig.as_posix() for corpus_path in corpus_paths]
    manager.collect_files(originals)
    manager.convert_serially()

main()

Sentence align a given file anew.

Source code in corpustools/realign.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
def main():
    """Sentence align a given file anew.

    Resolves the tmx.html file given on the command line to its source
    document and the parallel document, prints the involved filenames,
    reconverts both documents, parallelises them and finally runs
    tmx.tmx2html on the resulting tmx file.

    With --files the program stops after printing the filenames, with
    --convert it stops after the conversion step.

    Raises:
        SystemExit: If the source document does not exist, the parallel
            language or parallel file cannot be determined, conversion
            fails, parallelisation reports an ArgumentError, or one of the
            "only ..." options was given.
    """
    convertermanager.LOGGER.setLevel(logging.DEBUG)
    args = parse_options()

    tmxhtml = Path(args.tmxhtml).resolve()
    # Strip a trailing ".html" so that "x.tmx.html" and "x.tmx" are equivalent.
    path = tmxhtml.with_suffix("") if tmxhtml.suffix == ".html" else tmxhtml
    source_path = corpuspath.make_corpus_path(path)

    if not source_path.orig.exists():
        raise SystemExit(
            f"\nERROR: You should delete\n«{args.tmxhtml}»\n"
            f"The source of it does not exist."
        )

    # The first path component below the "tmx" directory names the second
    # language. Fail with a readable message instead of an IndexError when
    # the given file is not below a "tmx" directory.
    _, separator, below_tmx = path.as_posix().partition("/tmx/")
    if not separator or not below_tmx:
        raise SystemExit(
            f"\nERROR: «{args.tmxhtml}» is not inside a tmx directory, "
            "cannot determine the parallel language."
        )
    lang2 = Path(below_tmx).parts[0]

    parallel = source_path.parallel(lang2)
    if parallel is None:
        raise SystemExit(f"Could not find parallel file of {source_path.orig}")

    para_path = corpuspath.make_corpus_path(parallel)

    print_filenames(source_path, para_path)

    if args.files:
        raise SystemExit("Only printing file names")

    try:
        convert_and_copy(source_path, para_path)
    except Exception as error:
        # A bare SystemExit has no exit code and would terminate silently
        # with status 0; report the failure and exit non-zero instead.
        raise SystemExit(f"\nERROR: Conversion failed: {error}") from error

    if args.convert:
        raise SystemExit("Only converting")

    try:
        parallelize.parallelise_file(
            source_path,
            para_path,
            anchor_file=parallelize.get_dictionary(para_path.lang, source_path.lang),
        )
        tmx.tmx2html(source_path.tmx(para_path.lang))
    except util.ArgumentError as error:
        raise SystemExit(
            f"\n{error}\n"
            f"Run «make install» in lang-{source_path.lang} "
            f"and/or lang-{para_path.lang} first."
        ) from error

parse_options()

Parse the commandline options.

Returns:

Type Description
Namespace

the parsed commandline arguments

Source code in corpustools/realign.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def parse_options():
    """Parse the commandline options.

    The parser takes one positional argument, the tmx.html file, and the
    two flags --files and --convert.

    Returns:
        (argparse.Namespace): the parsed commandline arguments
    """
    description = (
        "Sentence align a given file anew.\n"
        "Files are converted before being parallelised.\n"
        "This is mainly thought of as a debugging program "
        "when trying to solve issues in parallelised files."
    )
    files_help = (
        "Only show the interesting filenames "
        "that are needed for improving sentence "
        "alignment."
    )
    convert_help = (
        "Only convert the original files "
        "that are the source of the .tmx.html file. "
        "This is useful when improving the content of "
        "the converted files."
    )

    parser = argparse.ArgumentParser(
        parents=[argparse_version.parser], description=description
    )
    parser.add_argument("--files", action="store_true", help=files_help)
    parser.add_argument("--convert", action="store_true", help=convert_help)
    parser.add_argument("tmxhtml", help="The tmx.html file to realign.")

    return parser.parse_args()

print_filename(corpus_path)

Print interesting filenames for doing sentence alignment.

Parameters:

Name Type Description Default
corpus_path make_corpus_path

The corpus path whose original, metadata and converted filenames are printed.

required
Source code in corpustools/realign.py
38
39
40
41
42
43
44
45
46
47
48
def print_filename(corpus_path):
    """Print interesting filenames for doing sentence alignment.

    Writes one tab-indented line each for the original, metadata and
    converted file of the given corpus path to standard output.

    Args:
        corpus_path (corpuspath.make_corpus_path): the corpus path whose
            ``orig``, ``xsl`` and ``converted`` attributes are printed.
    """
    print(
        f"\toriginal: {corpus_path.orig}\n"
        f"\tmetadata: {corpus_path.xsl}\n"
        f"\tconverted: {corpus_path.converted}"
    )

print_filenames(corpus_path1, corpus_path2)

Print interesting filenames for doing sentence alignment.

Parameters:

Name Type Description Default
corpus_path1 make_corpus_path

filenames for the lang1 file.

required
corpus_path2 make_corpus_path

filenames for the lang2 file.

required
Source code in corpustools/realign.py
51
52
53
54
55
56
57
58
59
60
61
def print_filenames(corpus_path1, corpus_path2):
    """Print the interesting filenames of both files of a parallel pair.

    Each file gets a heading line followed by the output of print_filename.

    Args:
        corpus_path1 (corpuspath.make_corpus_path): filenames for the lang1 file.
        corpus_path2 (corpuspath.make_corpus_path): filenames for the lang2 file.
    """
    labelled_paths = (
        ("\nLanguage 1 filenames:", corpus_path1),
        ("\nLanguage 2 filenames:", corpus_path2),
    )
    for heading, corpus_path in labelled_paths:
        print(heading)
        print_filename(corpus_path)