sentencedivider

Classes and functions to sentence-align two files.

get_tokeniser(lang)

Check if resources needed by modes exist.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `lang` | `str` | the language that modes is asked to serve. | required |

Returns:

| Type | Description |
| --- | --- |
| `Path` | A path to the tokeniser file. |

Raises:

| Type | Description |
| --- | --- |
| `ArgumentError` | if no resources are found. |

Source code in corpustools/sentencedivider.py, lines 32-49
def get_tokeniser(lang: str) -> Path:
    """Check if resources needed by modes exists.

    Args:
        lang: the language that modes is asked to serve.

    Returns:
        A path to the tokeniser file.

    Raises:
        utils.ArgumentError: if no resources are found.
    """
    for lang_dir in lang_resource_dirs(lang):
        full_path = lang_dir / "tokeniser-disamb-gt-desc.pmhfst"
        if full_path.exists():
            return full_path

    raise ArgumentError(f"ERROR: no tokeniser for {lang}")
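
A minimal usage sketch; "sme" (North Sámi) is only an example language code, and the call assumes the Giella tokeniser resources are installed in one of the directories searched by lang_resource_dirs:

from corpustools.sentencedivider import get_tokeniser

# Resolves the path to tokeniser-disamb-gt-desc.pmhfst for the language,
# or raises ArgumentError if no resource directory contains it.
tokeniser_path = get_tokeniser("sme")
print(tokeniser_path)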

make_sentences(tokenised_output)

Turn ccat output into cleaned-up sentences.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `tokenised_output` | `str` | output of hfst-tokenise run on ccat output, one token per line. | required |

Yields:

| Type | Description |
| --- | --- |
| `str` | a cleaned-up sentence |

Source code in corpustools/sentencedivider.py, lines 69-87
def make_sentences(tokenised_output: str) -> Iterator[str]:
    """Turn ccat output into cleaned up sentences.

    Args:
        tokenised_output (str): output of hfst-tokenise, one token per line.

    Yields:
        (str): a cleaned up sentence
    """

    token_buffer: list[str] = []
    for token in tokenised_output.split("\n"):
        # Skip paragraph marks; keep all other tokens, whitespace included.
        if token != "¶":
            token_buffer.append(token)
        # A sentence-final token (listed in STOPS) closes the sentence.
        if token.strip() in STOPS:
            yield "".join(token_buffer).strip()
            token_buffer[:] = []
    if token_buffer:
        yield "".join(token_buffer).strip()

make_valid_sentences(corpus_path)

Turn ccat output into full sentences.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `corpus_path` | `CorpusPath` | The path to the corpus file. | required |

Returns:

| Type | Description |
| --- | --- |
| `list[str]` | The ccat output as a list of full sentences. |

Source code in corpustools/sentencedivider.py, lines 90-109
def make_valid_sentences(corpus_path: CorpusPath) -> list[str]:
    """Turn ccat output into full sentences.

    Args:
        corpus_path (CorpusPath): The path to the corpus file.

    Returns:
        The ccat output as a list of full sentences.
    """
    return [
        # Collapse runs of whitespace within each sentence to single spaces
        # (str.split() with no argument already drops empty strings).
        " ".join(sentence.split())
        for sentence in make_sentences(
            tokenised_output=run_external_command(
                command="hfst-tokenise --print-all "
                f"{get_tokeniser(corpus_path.lang)}".split(),
                instring=ccat.ccatter(corpus_path),
            )
        )
        if sentence.strip()
    ]
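
A hedged usage sketch; the CorpusPath import location and the corpus file path below are assumptions, and running it needs hfst-tokenise on the PATH plus installed resources for the file's language:

from corpustools.corpuspath import CorpusPath  # assumed module location
from corpustools.sentencedivider import make_valid_sentences

# Hypothetical path to a converted corpus file.
corpus_path = CorpusPath("corpus-sme/converted/admin/example.html.xml")
for sentence in make_valid_sentences(corpus_path):
    print(sentence)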

tokenise(text, lang)

Turn a string into a list of tokens.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `text` | `str` | the text to be tokenised | required |
| `lang` | `str` | the language of the text | required |

Returns:

| Type | Description |
| --- | --- |
| `str` | The tokenised text, one token per line. |

Source code in corpustools/sentencedivider.py, lines 52-66
def tokenise(text: str, lang: str) -> str:
    """Turn a string into a list of tokens.

    Args:
        text: the text to be tokenised
        lang: the language of the text

    Returns:
        The tokenised text, one token per line.
    """

    return run_external_command(
        command=f"hfst-tokenise --print-all {get_tokeniser(lang)}".split(),
        instring=text,
    )
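
A minimal sketch of calling tokenise directly; it needs hfst-tokenise on the PATH and installed resources for the language ("sme" is only an example code):

from corpustools.sentencedivider import tokenise

# Prints one token per line; --print-all keeps whitespace tokens,
# so spaces appear on lines of their own.
print(tokenise("Dát lea cealkka.", "sme"))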