Skip to content

realign

Sentence align a given file anew.

calculate_paths(tmxhtml)

Calculate paths, given a file from the command line.

Parameters:

Name Type Description Default
tmxhtml str

path to a .tmx or a .tmx.html file

required

Returns:

Type Description
tuple[CorpusPath, CorpusPath]

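A pair of CorpusPath objects, as created by corpuspath.make_corpus_path: the first for the given file, the second for its parallel file in the other language.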

Source code in /home/anders/projects/CorpusTools/corpustools/realign.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def calculate_paths(tmxhtml):
    """Calculate paths, given a file from the command line.

    Args:
        tmxhtml (str | Path): path to a .tmx or a .tmx.html file. The
            path is expected to contain a ``/tmx/<lang2>/`` part; the
            component right after ``/tmx/`` is used as the second language.

    Returns:
        (tuple[CorpusPath, CorpusPath]): the corpus path made from the given
            file (with a trailing ``.html`` suffix removed) and the corpus
            path of its parallel file in the second language.

    Raises:
        IndexError: if the path has no component following ``/tmx/``.
    """
    # The documented argument type is a string, but the attribute access
    # below needs a Path. Coercing here makes both kinds of input work.
    tmxhtml = Path(tmxhtml)

    # A .tmx.html file is the html rendering of a .tmx file; strip ".html"
    # so that the corpus path is always built from the .tmx name.
    path = tmxhtml.with_suffix("") if tmxhtml.suffix == ".html" else tmxhtml
    corpus_path1 = corpuspath.make_corpus_path(path)

    # The first path component after "/tmx/" names the second language.
    lang2 = Path(path.as_posix().split("/tmx/")[1]).parts[0]
    corpus_path2 = corpuspath.make_corpus_path(corpus_path1.parallel(lang2))

    return corpus_path1, corpus_path2

convert_and_copy(corpus_path1, corpus_path2)

Reconvert and copy files to prestable/converted.

Parameters:

Name Type Description Default
corpus_path1 corpuspath.make_corpus_path

A CorpusPath representing the lang1 file that should be reconverted.

required
corpus_path2 corpuspath.make_corpus_path

A CorpusPath representing the lang2 file that should be reconverted.

required
Source code in /home/anders/projects/CorpusTools/corpustools/realign.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def convert_and_copy(corpus_path1, corpus_path2):
    """Throw away the existing converted files and convert the originals again.

    Args:
        corpus_path1 (corpuspath.make_corpus_path): A CorpusPath representing
            the lang1 file that should be reconverted.
        corpus_path2 (corpuspath.make_corpus_path): A CorpusPath representing
            the lang2 file that should be reconverted.
    """
    both_paths = (corpus_path1, corpus_path2)

    # Remove any earlier conversion results; a missing file is not an error.
    for stale in both_paths:
        stale.converted.unlink(missing_ok=True)

    convertermanager.sanity_check()

    manager = convertermanager.ConverterManager()
    originals = [current.orig.as_posix() for current in both_paths]
    manager.collect_files(originals)
    manager.convert_serially()

main()

Sentence align a given file anew.

Source code in /home/anders/projects/CorpusTools/corpustools/realign.py
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
def main():
    """Sentence align the file named on the command line anew.

    The interesting filenames are always printed. With ``--files`` the
    program stops after printing, with ``--convert`` it stops after the
    original files have been converted again. Otherwise the two files are
    parallelised and the resulting tmx file is turned into html.
    """
    convertermanager.LOGGER.setLevel(logging.DEBUG)
    options = parse_options()

    tmx_file = Path(options.tmxhtml).resolve()
    lang1_path, lang2_path = calculate_paths(tmx_file)
    print_filenames(lang1_path, lang2_path)

    if options.files:
        raise SystemExit("Only printing file names")

    # Any conversion failure ends the program with the error as exit message.
    try:
        convert_and_copy(lang1_path, lang2_path)
    except Exception as error:
        raise SystemExit(error)

    if options.convert:
        raise SystemExit("Only converting")

    dictionary = parallelize.get_dictionary(lang2_path, lang1_path)
    parallelize.parallelise_file(lang1_path, lang2_path, dictionary=dictionary)
    tmx.tmx2html(lang1_path.tmx(lang2_path.lang))

parse_options()

Parse the commandline options.

Returns:

Type Description
argparse.Namespace

the parsed commandline arguments

Source code in /home/anders/projects/CorpusTools/corpustools/realign.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
def parse_options():
    """Build the command line parser and parse the command line.

    Returns:
        (argparse.Namespace): the parsed commandline arguments, with the
            boolean flags ``files`` and ``convert`` and the positional
            ``tmxhtml``.
    """
    description = (
        "Sentence align a given file anew.\n"
        "Files are converted before being parallelised.\n"
        "This is mainly thought of as a debugging program "
        "when trying to solve issues in parallelised files."
    )
    files_help = (
        "Only show the interesting filenames "
        "that are needed for improving sentence alignment."
    )
    convert_help = (
        "Only convert the original files "
        "that are the source of the .tmx.html file. "
        "This is useful when improving the content of the converted files."
    )

    parser = argparse.ArgumentParser(
        parents=[argparse_version.parser], description=description
    )
    parser.add_argument("--files", action="store_true", help=files_help)
    parser.add_argument("--convert", action="store_true", help=convert_help)
    parser.add_argument("tmxhtml", help="The tmx.html file to realign.")

    return parser.parse_args()

print_filename(corpus_path)

Print interesting filenames for doing sentence alignment.

Parameters:

Name Type Description Default
corpus_path corpuspath.make_corpus_path

The CorpusPath whose original, metadata and converted filenames are printed.

required
Source code in /home/anders/projects/CorpusTools/corpustools/realign.py
31
32
33
34
35
36
37
38
39
40
41
def print_filename(corpus_path):
    """Print interesting filenames for doing sentence alignment.

    Writes three tab-indented lines to stdout: the original file, its
    metadata (xsl) file and the converted file.

    Args:
        corpus_path (corpuspath.make_corpus_path): object whose ``orig``,
            ``xsl`` and ``converted`` attributes are printed.
    """
    # The label used to be misspelled "metatada" in the printed output.
    print(
        f"\toriginal: {corpus_path.orig}\n"
        f"\tmetadata: {corpus_path.xsl}\n"
        f"\tconverted: {corpus_path.converted}"
    )

print_filenames(corpus_path1, corpus_path2)

Print interesting filenames for doing sentence alignment.

Parameters:

Name Type Description Default
corpus_path1 corpuspath.make_corpus_path

filenames for the lang1 file.

required
corpus_path2 corpuspath.make_corpus_path

filenames for the lang2 file.

required
Source code in /home/anders/projects/CorpusTools/corpustools/realign.py
44
45
46
47
48
49
50
51
52
53
54
def print_filenames(corpus_path1, corpus_path2):
    """Print the interesting filenames of both languages, each under a heading.

    Args:
        corpus_path1 (corpuspath.make_corpus_path): filenames for the lang1 file.
        corpus_path2 (corpuspath.make_corpus_path): filenames for the lang2 file.
    """
    sections = (
        ("\nLanguage 1 filenames:", corpus_path1),
        ("\nLanguage 2 filenames:", corpus_path2),
    )
    for heading, current_path in sections:
        print(heading)
        print_filename(current_path)