Skip to content

sentencedivider

Classes and functions to divide plain text into sentences.

SentenceDivider

A class to divide plain text output into sentences.

Uses hfst-tokenise as the engine for splitting text into sentences.

Attributes:

Name Type Description
stops list[str]

tokens that imply where a sentence ends.

lang str

three character language code

relative_path str

relative path to where files needed by modes.xml are found.

tokeniser modes.Pipeline

tokeniser pipeline

Source code in /home/anders/projects/CorpusTools/corpustools/sentencedivider.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
class SentenceDivider:
    """Split plain text output into individual sentences.

    The actual tokenisation is delegated to hfst-tokenise.

    Attributes:
        stops (list[str]): tokens that mark where a sentence ends.
        lang (str): three character language code
        relative_path (str): relative path to where files needed by
            modes.xml are found.
        tokeniser (modes.Pipeline): tokeniser pipeline
    """

    stops = [";", "!", "?", ".", "..", "...", "¶", "…"]

    def __init__(self, lang, giella_prefix=None):
        """Set up the tokeniser pipeline for the given language.

        Args:
            lang (str): language the analyser can tokenise
            giella_prefix (str): passed on to the tokeniser pipeline.
        """
        self.tokeniser = modes.Pipeline("tokenise", lang, giella_prefix)

    def make_sentences(self, tokenised_output):
        """Assemble tokenised output into cleaned up sentences.

        Args:
            tokenised_output (str): plain text output of ccat.

        Yields:
            (str): a cleaned up sentence
        """
        pending = []
        for token in tokenised_output.split("\n"):
            # The pilcrow never becomes part of the sentence text …
            if token != "¶":
                pending.append(token)
            # … yet, like every other stop token, it terminates one.
            if token.strip() in self.stops:
                yield "".join(pending).strip()
                pending = []
        if pending:
            yield "".join(pending).strip()

    def make_valid_sentences(self, ccat_output):
        """Tokenise ccat output and keep only the non-empty sentences.

        Args:
            ccat_output (str): the plain text output of ccat

        Returns:
            (list[str]): The ccat output has been turned into a list
                of full sentences.
        """
        tokenised = self.tokeniser.run(ccat_output.encode("utf8"))
        return [sentence for sentence in self.make_sentences(tokenised) if sentence]

__init__(lang, giella_prefix=None)

Set the files needed by the tokeniser.

Parameters:

Name Type Description Default
lang str

language the analyser can tokenise

required
Source code in /home/anders/projects/CorpusTools/corpustools/sentencedivider.py
61
62
63
64
65
66
67
def __init__(self, lang, giella_prefix=None):
    """Set up the tokeniser pipeline for the given language.

    Args:
        lang (str): language the analyser can tokenise
        giella_prefix (str): passed on to the tokeniser pipeline.
    """
    self.tokeniser = modes.Pipeline("tokenise", lang, giella_prefix)

make_sentences(tokenised_output)

Turn ccat output into cleaned up sentences.

Parameters:

Name Type Description Default
tokenised_output str

plain text output of ccat.

required

Yields:

Type Description
str

a cleaned up sentence

Source code in /home/anders/projects/CorpusTools/corpustools/sentencedivider.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def make_sentences(self, tokenised_output):
    """Assemble tokenised output into cleaned up sentences.

    Args:
        tokenised_output (str): plain text output of ccat.

    Yields:
        (str): a cleaned up sentence
    """
    pending = []
    for token in tokenised_output.split("\n"):
        # The pilcrow never becomes part of the sentence text …
        if token != "¶":
            pending.append(token)
        # … yet, like every other stop token, it terminates one.
        if token.strip() in self.stops:
            yield "".join(pending).strip()
            pending = []
    if pending:
        yield "".join(pending).strip()

make_valid_sentences(ccat_output)

Turn ccat output into full sentences.

Parameters:

Name Type Description Default
ccat_output str

the plain text output of ccat

required

Returns:

Type Description
list[str]

The ccat output has been turned into a list of full sentences.

Source code in /home/anders/projects/CorpusTools/corpustools/sentencedivider.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def make_valid_sentences(self, ccat_output):
    """Tokenise ccat output and keep only the non-empty sentences.

    Args:
        ccat_output (str): the plain text output of ccat

    Returns:
        (list[str]): The ccat output has been turned into a list
            of full sentences.
    """
    tokenised = self.tokeniser.run(ccat_output.encode("utf8"))
    return [sentence for sentence in self.make_sentences(tokenised) if sentence]

to_plain_text(file_path)

Turn an xml formatted file into clean text.

Parameters:

Name Type Description Default
file_path CorpusPath

The path to the file

required

Raises:

Type Description
UserWarning

if there is no text, raise a UserWarning

Returns:

Type Description
str

the content of ccat output

Source code in /home/anders/projects/CorpusTools/corpustools/sentencedivider.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def to_plain_text(file_path):
    """Turn an xml formatted file into clean text.

    Args:
        file_path (CorpusPath): The path to the file

    Raises:
        UserWarning: if there is no text, raise a UserWarning

    Returns:
        (str): the content of ccat output
    """
    printer = ccat.XMLPrinter(lang=file_path.lang, all_paragraphs=True)
    printer.parse_file(file_path.converted)
    text = printer.process_file().getvalue()
    if not text:
        raise UserWarning(f"Empty file {file_path.converted}")
    return text