Skip to content

corpuspath

This file contains classes to handle corpus filenames.

CorpusPath dataclass

Map filenames in a corpus.

Source code in /home/anders/projects/CorpusTools/corpustools/corpuspath.py
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
@dataclass
class CorpusPath:
    """Map filenames in a corpus.

    The fields are combined as follows:

    * ``root`` is the directory that holds the ``corpus-<lang>...``
      directories.
    * ``lang`` is the language part of the corpus directory name.
    * ``filepath`` is the path of the file relative to a corpus directory
      (for derived files: relative to the module directory).
    * ``dirsuffix`` is an optional suffix that is appended to the corpus
      directory name, separated by a hyphen.

    A ``metadata`` attribute is attached in ``__post_init__``.
    """

    root: Path
    lang: str
    filepath: Path
    dirsuffix: str = ""

    def __post_init__(self):
        """Initialise the metadata attribute."""
        self.metadata = xslsetter.MetadataHandler(self.xsl, create=True)

    @property
    def orig_corpus_dir(self) -> Path:
        """Return the corpus directory that holds the original files."""
        return self.corpus_dir()

    @property
    def converted_corpus_dir(self) -> Path:
        """Return the corpus directory that holds the derived files."""
        # Any module value other than None drops the "-orig" part of the
        # directory name; the actual string is not used for anything else.
        return self.corpus_dir(module="trigger_no_orig")

    @property
    def orig(self) -> Path:
        """Return the path of the original file."""
        return self.orig_corpus_dir / self.filepath

    @property
    def xsl(self) -> Path:
        """Return the path of the metadata file."""
        return self.orig.with_name(f"{self.orig.name}.xsl")

    @property
    def log(self) -> Path:
        """Return the path of the log file."""
        return self.orig.with_name(f"{self.orig.name}.log")

    def corpus_dir(self, module=None, corpus_lang=None) -> Path:
        """Return the corpus directory for the given module and language.

        Args:
            module (str): a corpus module. When it is None, the directory
                of the original files ("-orig") is returned.
            corpus_lang (str): corpus language. Defaults to the language
                of this instance.

        Returns:
            (pathlib.Path): the corpus directory below the root.
        """
        this_lang = self.lang if corpus_lang is None else corpus_lang
        orig_part = "-orig" if module is None else ""
        suffix_part = f"-{self.dirsuffix}" if self.dirsuffix else ""
        return self.root / f"corpus-{this_lang}{orig_part}{suffix_part}"

    def name(
        self,
        module=None,
        corpus_lang=None,
        target_lang=None,
        filepath=None,
        suffix=None,
    ) -> Path:
        """Returns a path based on the module and extension.

        Args:
            module (str): string containing some corpus module
            corpus_lang (str): corpus language, as a three letter language code
            target_lang (str): string containing the target language of a tmx
                file
            filepath (str): path to the file
            suffix (str): file suffix

        Returns:
            (pathlib.Path): corpus directory / module / target language /
                file path, where the suffix is appended to the file path
                and omitted parts are left out.
        """
        # Joining an empty string onto a Path leaves the path unchanged, so
        # an omitted module or target language simply disappears.
        this_module = "" if module is None else module
        this_target_lang = "" if target_lang is None else target_lang
        this_filepath = (
            f"{self.filepath if filepath is None else filepath}"
            f"{'' if suffix is None else suffix}"
        )
        return (
            self.corpus_dir(module=module, corpus_lang=corpus_lang)
            / this_module
            / this_target_lang
            / this_filepath
        )

    @property
    def converted(self) -> Path:
        """Return the path to the converted file.

        The module directory depends on the ``conversion_status`` metadata
        variable: "correct" and "correct-no-gs" each have their own
        directory, every other value maps to "converted".
        """
        # Read the status once instead of querying the metadata per branch.
        status = self.metadata.get_variable("conversion_status")
        if status == "correct-no-gs":
            module = "correct-no-gs/converted"
        elif status == "correct":
            module = "goldstandard/converted"
        else:
            module = "converted"

        return self.name(module=module, suffix=".xml")

    @property
    def analysed(self) -> Path:
        """Return the path to the analysed file."""
        return self.name(module="analysed", suffix=".xml")

    @property
    def korp_mono(self) -> Path:
        """Return the path to the korp_mono file."""
        return self.name(module="korp_mono", suffix=".xml")

    def korp_tmx(self, target_language) -> Path:
        """Return the path to korp processed tmx file."""
        return self.name(
            module="korp_tmx",
            target_lang=target_language,
            suffix=".tmx",
        )

    def parallel(self, language):
        """Check if there is a parallel for language.

        Args:
            language (str): language of the parallel file.

        Returns:
            (pathlib.Path): path to the parallel file if it exists, else None
        """
        # Look the name up once; a missing language and a None entry are
        # both treated as "no parallel".
        parallel_name = self.metadata.get_parallel_texts().get(language)
        if parallel_name is None:
            return None

        return self.name(
            corpus_lang=language,
            filepath=self.filepath.with_name(parallel_name),
        )

    def parallels(self):
        """Return paths to all parallel files.

        Yields:
            the result of `parallel` for each language listed in the
            metadata's parallel texts.
        """
        return (
            self.parallel(language) for language in self.metadata.get_parallel_texts()
        )

    def tmx(self, target_language) -> Path:
        """Name of the tmx file.

        Args:
            target_language (str): language of the parallel

        Returns:
            (pathlib.Path): path to the tmx file
        """
        return self.name(
            module="tmx",
            target_lang=target_language,
            suffix=".tmx",
        )

    @property
    def tca2_input(self) -> Path:
        """Compute the name of the tca2 input file.

        Returns:
            (pathlib.Path): the name of the tca2 input file
        """
        # Ensure we have 20 bytes of leeway to let TCA2 append
        # lang_sent_new.txt without going over the 255 byte limit:
        origfilename = self.crop_to_bytes(self.filepath.name, (255 - 20))
        return Path("/tmp") / f"{origfilename}_{self.lang}.sent"

    @property
    def tca2_output(self) -> Path:
        """Compute the name of the tca2 output file.

        Returns:
            (pathlib.Path): the name of the tca2 output file
        """
        return self.tca2_input.with_name(
            self.tca2_input.name.replace(".sent", "_new.txt")
        )

    @property
    def tmp_filename(self) -> Path:
        """Return the path of this file inside the "tmp" directory."""
        return self.converted_corpus_dir / "tmp" / self.filepath.name

    @staticmethod
    def crop_to_bytes(name, max_bytes) -> str:
        """Ensure `name` is at most `max_bytes` bytes when UTF-8 encoded.

        Characters are removed from the end of `name`; a multi-byte
        character is never split.

        Args:
            name (str): the string to crop.
            max_bytes (int): the maximum size in bytes. Zero or a negative
                value gives an empty string.

        Returns:
            (str): the longest prefix of `name` that fits in `max_bytes`.
        """
        encoded = name.encode("utf-8")
        if len(encoded) <= max_bytes:
            return name

        # Cut on the byte level and let the decoder drop a trailing,
        # incomplete character. Clamping to zero keeps a negative limit
        # from being read as "all but the last bytes".
        return encoded[: max(max_bytes, 0)].decode("utf-8", errors="ignore")

analysed property

Return the path to the analysed file.

converted property

Return the path to the converted file.

korp_mono property

Return the path to the korp_mono file.

log property

Return the path of the log file.

orig property

Return the path of the original file.

tca2_input property

Compute the name of the tca2 input file.

Returns:

Type Description
pathlib.Path

the name of the tca2 input file

tca2_output property

Compute the name of the tca2 output file.

Returns:

Type Description
pathlib.Path

the name of the tca2 output file

xsl property

Return the path of the metadata file.

__post_init__()

Initialise the metadata attribute.

Source code in /home/anders/projects/CorpusTools/corpustools/corpuspath.py
93
94
95
def __post_init__(self):
    """Attach a metadata handler for the xsl path as `self.metadata`."""
    metadata_handler = xslsetter.MetadataHandler(self.xsl, create=True)
    self.metadata = metadata_handler

crop_to_bytes(name, max_bytes) staticmethod

Ensure name is at most max_bytes bytes when UTF-8 encoded.

Do not split name in the middle of a multi-byte character.

Source code in /home/anders/projects/CorpusTools/corpustools/corpuspath.py
257
258
259
260
261
262
263
264
265
@staticmethod
def crop_to_bytes(name, max_bytes):
    """Ensure `name` is at most `max_bytes` bytes when UTF-8 encoded.

    Characters are removed from the end of `name`; a multi-byte character
    is never split.

    Args:
        name (str): the string to crop.
        max_bytes (int): the maximum size in bytes. Zero or a negative
            value gives an empty string.

    Returns:
        (str): the longest prefix of `name` that fits in `max_bytes`.
    """
    encoded = name.encode("utf-8")
    if len(encoded) <= max_bytes:
        return name

    # Cut on the byte level and let the decoder drop a trailing, incomplete
    # character. Clamping to zero keeps a negative limit from being read as
    # "all but the last bytes".
    return encoded[: max(max_bytes, 0)].decode("utf-8", errors="ignore")

korp_tmx(target_language)

Return the path to korp processed tmx file.

Source code in /home/anders/projects/CorpusTools/corpustools/corpuspath.py
180
181
182
183
184
185
186
def korp_tmx(self, target_language):
    """Build the korp_tmx path of this file for `target_language`."""
    name_options = {
        "module": "korp_tmx",
        "target_lang": target_language,
        "suffix": ".tmx",
    }
    return self.name(**name_options)

name(module=None, corpus_lang=None, target_lang=None, filepath=None, suffix=None)

Returns a path based on the module and extension.

Parameters:

Name Type Description Default
module str

string containing some corpus module

None
corpus_lang str

corpus language, as a three letter language code

None
target_lang str

string containing the target language of a tmx file

None
filepath str

path to the file

None
suffix str

file suffix

None
Source code in /home/anders/projects/CorpusTools/corpustools/corpuspath.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
def name(
    self,
    module=None,
    corpus_lang=None,
    target_lang=None,
    filepath=None,
    suffix=None,
):
    """Build a path below the corpus directory of a module.

    The result is corpus directory / module / target language / file path,
    with the suffix appended to the file path. Parts that are left as None
    are omitted.

    Args:
        module (str): string containing some corpus module
        corpus_lang (str): corpus language, as a three letter language code
        target_lang (str): string containing the target language of a tmx
            file
        filepath (str): path to the file; defaults to the file path of
            this instance
        suffix (str): file suffix
    """
    # An empty string joined onto a path leaves the path unchanged.
    module_part = module if module is not None else ""
    target_part = target_lang if target_lang is not None else ""

    base_file = filepath if filepath is not None else self.filepath
    extension = suffix if suffix is not None else ""
    file_part = f"{base_file}{extension}"

    corpus_directory = self.corpus_dir(module=module, corpus_lang=corpus_lang)
    return corpus_directory.joinpath(module_part, target_part, file_part)

parallel(language)

Check if there is a parallel for language.

Parameters:

Name Type Description Default
language str

language of the parallel file.

required

Returns:

Type Description
pathlib.Path

path to the parallel file if it exists, else None

Source code in /home/anders/projects/CorpusTools/corpustools/corpuspath.py
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
def parallel(self, language):
    """Check if there is a parallel for language.

    Args:
        language (str): language of the parallel file.

    Returns:
        (pathlib.Path): path to the parallel file if it exists, else None
    """
    # Look the name up once; a missing language and a None entry are both
    # treated as "no parallel".
    parallel_name = self.metadata.get_parallel_texts().get(language)
    if parallel_name is None:
        return None

    return self.name(
        corpus_lang=language,
        filepath=self.filepath.with_name(parallel_name),
    )

parallels()

Return paths to all parallel files.

Yields:

Type Description
str

path to the original file of a parallel text.

Source code in /home/anders/projects/CorpusTools/corpustools/corpuspath.py
205
206
207
208
209
210
211
212
213
def parallels(self):
    """Give the parallel path for every language in the metadata.

    Yields:
        the result of `parallel` for each language listed in the
        metadata's parallel texts.
    """
    parallel_languages = self.metadata.get_parallel_texts()
    return (self.parallel(lang_code) for lang_code in parallel_languages)

tmx(target_language)

Name of the tmx file.

Parameters:

Name Type Description Default
target_language str

language of the parallel

required

Returns:

Type Description
str

path to the tmx file

Source code in /home/anders/projects/CorpusTools/corpustools/corpuspath.py
215
216
217
218
219
220
221
222
223
224
225
226
227
228
def tmx(self, target_language):
    """Build the tmx path of this file for `target_language`.

    Args:
        target_language (str): language of the parallel

    Returns:
        path to the tmx file, as produced by `name`
    """
    name_options = {
        "module": "tmx",
        "target_lang": target_language,
        "suffix": ".tmx",
    }
    return self.name(**name_options)

collect_files(entities, suffix)

Collect files with the specified suffix.

Source code in /home/anders/projects/CorpusTools/corpustools/corpuspath.py
268
269
270
271
272
273
274
275
276
def collect_files(entities, suffix):
    """Yield the resolved paths of files that have the given suffix.

    An entity that is a file with the wanted suffix is yielded itself;
    every other entity is searched recursively for matching files.
    """
    pattern = f"*{suffix}"
    for entity in entities:
        resolved = Path(entity).resolve()
        if not (resolved.is_file() and resolved.suffix == suffix):
            yield from resolved.rglob(pattern)
            continue
        yield resolved

make_corpus_path(path)

Returns a CorpusPath from a given path.

Parameters:

Name Type Description Default
path str

a path to a corpus file

required

Raises:

Type Description
ValueError

the path is not part of a corpus.

Source code in /home/anders/projects/CorpusTools/corpustools/corpuspath.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
def make_corpus_path(path):
    """Build a CorpusPath from a path inside a corpus.

    The resolved path is matched against CORPUS_DIR_RE; its "parent",
    "corpusdir" and "corpusfile" groups supply the root, the language and
    directory suffix, and the file path respectively.

    Args:
        path (str): a path to a corpus file

    Raises:
        ValueError: the path is not part of a corpus.
    """

    def fix_filepath(filepath):
        # Reduce a path below a corpus directory to the path of the
        # original file: strip a leading module directory (plus the
        # language directory for tmx modules) and the added suffix.
        posix_path = filepath.as_posix()
        module = next(
            (candidate for candidate in MODULES if posix_path.startswith(candidate)),
            None,
        )
        if module is not None:
            skipped_parts = len(module.split("/"))
            if module.endswith("tmx"):
                skipped_parts += 1
            remaining = filepath.parts[skipped_parts:]
            return Path().joinpath(*remaining).with_suffix("")

        if filepath.suffix in (".xsl", ".log"):
            return Path(filepath).with_suffix("")

        return Path(filepath)

    resolved_path = Path(path).resolve().as_posix()
    corpus_match = CORPUS_DIR_RE.search(resolved_path)

    if not corpus_match:
        raise ValueError(f"File is not part of a corpus: {path}")

    groups = corpus_match.groupdict()
    # The first hyphen-separated part is the language; whatever follows
    # (once "-orig" is removed) is the directory suffix.
    name_parts = groups["corpusdir"].replace("-orig", "").split("-")
    lang, *dirsuffixes = name_parts

    corpus_root = Path(groups["parent"])
    joined_suffix = "-".join(dirsuffixes)
    relative_file = fix_filepath(Path(groups["corpusfile"]))

    return CorpusPath(
        root=corpus_root,
        lang=lang,
        dirsuffix=joined_suffix,
        filepath=relative_file,
    )