Skip to content

error_annotated_sentence

CorrectionSegment dataclass

Source code in corpustools/error_annotated_sentence.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
@dataclass
class CorrectionSegment:
    error_info: str | None
    suggestions: list[str]

    def to_xml(self) -> etree.Element:
        """Convert the correction segment to its XML representation.

        Returns:
            An lxml Element representing the correction segment in XML format.
        """
        correct_elem = etree.Element("correct")
        if self.error_info:
            correct_elem.set("errorinfo", self.error_info)
        correct_elem.text = "///".join(self.suggestions)
        return correct_elem

to_xml()

Convert the correction segment to its XML representation.

Returns:

Type Description
Element

An lxml Element representing the correction segment in XML format.

Source code in corpustools/error_annotated_sentence.py
16
17
18
19
20
21
22
23
24
25
26
def to_xml(self) -> etree.Element:
    """Build the ``<correct>`` element for this correction segment.

    Returns:
        An lxml Element named ``correct`` whose text is the suggestions
        joined by ``///`` and which carries an ``errorinfo`` attribute
        when ``error_info`` is set.
    """
    element = etree.Element("correct")
    # All suggestions share one text node, separated by "///".
    element.text = "///".join(self.suggestions)
    if self.error_info:
        element.set("errorinfo", self.error_info)
    return element

ErrorAnnotatedSentence dataclass

Represents a sentence with zero or more error markups.

Source code in corpustools/error_annotated_sentence.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
@dataclass
class ErrorAnnotatedSentence:
    """A sentence with zero or more error markups.

    Attributes:
        head: Text that precedes the first error markup segment.
        errors: The error markup segments, in order; may be empty.
    """

    head: str
    errors: list[ErrorMarkupSegment]

    def uncorrected_text(self) -> str:
        """Return the uncorrected text of the sentence.

        The result is ``head`` followed by the uncorrected text of every
        error markup segment, in order.
        """
        return self.head + "".join(
            segment.uncorrected_text() for segment in self.errors
        )

    def to_xml(self, parent: etree.Element | None = None) -> etree.Element:
        """Render the sentence into an XML element.

        Args:
            parent: Element to fill in. Its ``text`` is replaced by ``head``
                and one child is appended per error markup segment. When
                omitted, a new ``p`` element is created.

        Returns:
            The element that was filled in: ``parent`` when one was given,
            otherwise the newly created ``p`` element.
        """
        root = etree.Element("p") if parent is None else parent
        root.text = self.head
        for segment in self.errors:
            root.append(segment.to_xml())

        return root

to_xml(parent=None)

Convert the error annotated sentence to its XML representation.

Parameters:

Name Type Description Default
parent Element | None

The parent XML element to which the sentence will be appended.

None

Returns:

Type Description
Element

An lxml Element representing the error annotated sentence in XML format.

Source code in corpustools/error_annotated_sentence.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def to_xml(self, parent: etree.Element | None = None) -> etree.Element:
    """Convert the error annotated sentence to its XML representation.

    Args:
        parent: The parent XML element to which the sentence will be appended.

    Returns:
        An lxml Element representing the error annotated sentence in XML format.
    """
    if parent is None:
        parent = etree.Element("p")
    parent.text = self.head
    for error_segment in self.errors:
        parent.append(error_segment.to_xml())

    return parent

uncorrected_text()

Get the uncorrected text of the sentence.

Source code in corpustools/error_annotated_sentence.py
116
117
118
119
120
121
122
123
def uncorrected_text(self) -> str:
    """Return the uncorrected text of the sentence.

    The result is ``head`` followed by the uncorrected text of every
    error markup segment, in order.
    """
    return self.head + "".join(
        segment.uncorrected_text() for segment in self.errors
    )

ErrorMarkup dataclass

Represents a marked up error in a sentence.

Attributes:

Name Type Description
error 'ErrorAnnotatedSentence'

"ErrorAnnotatedSentence" representing the erroneous segment

errortype ErrorType

Type of error

correction CorrectionSegment

The CorrectionSegment holding the suggested correction(s) for the error

Source code in corpustools/error_annotated_sentence.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
@dataclass
class ErrorMarkup:
    """A marked up error in a sentence.

    Attributes:
        error: "ErrorAnnotatedSentence" holding the erroneous segment.
        errortype: Type of error; its lowercased ``name`` becomes the XML tag.
        correction: CorrectionSegment rendered as the last child element.
    """

    error: "ErrorAnnotatedSentence"
    errortype: ErrorType
    correction: CorrectionSegment

    def uncorrected_text(self) -> str:
        """Return the uncorrected text, i.e. that of the erroneous segment."""
        erroneous = self.error
        return erroneous.uncorrected_text()

    def to_xml(self) -> etree.Element:
        """Render the error markup as an XML element.

        Returns:
            An lxml Element named after the lowercased error type name,
            filled with the erroneous segment and ending with the
            correction element.
        """
        tag = self.errortype.name.lower()
        element = etree.Element(tag)
        # The erroneous segment writes its content directly into the element.
        self.error.to_xml(parent=element)
        # The correction always goes after the erroneous content.
        element.append(self.correction.to_xml())

        return element

to_xml()

Convert the error markup to its XML representation.

Returns:

Type Description
Element

An lxml Element representing the error markup in XML format.

Source code in corpustools/error_annotated_sentence.py
75
76
77
78
79
80
81
82
83
84
85
86
def to_xml(self) -> etree.Element:
    """Render the error markup as an XML element.

    Returns:
        An lxml Element named after the lowercased error type name,
        filled with the erroneous segment and ending with the
        correction element.
    """
    tag = self.errortype.name.lower()
    element = etree.Element(tag)
    # The erroneous segment writes its content directly into the element.
    self.error.to_xml(parent=element)
    # The correction always goes after the erroneous content.
    element.append(self.correction.to_xml())

    return element

uncorrected_text()

Get the uncorrected text of the error markup.

Source code in corpustools/error_annotated_sentence.py
71
72
73
def uncorrected_text(self) -> str:
    """Return the uncorrected text, i.e. that of the erroneous segment."""
    erroneous = self.error
    return erroneous.uncorrected_text()

ErrorMarkupSegment dataclass

Source code in corpustools/error_annotated_sentence.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
@dataclass
class ErrorMarkupSegment:
    """An error markup together with the text that follows it.

    Attributes:
        error_markup: The marked up error.
        tail: Text placed directly after the error markup.
    """

    error_markup: ErrorMarkup
    tail: str

    def uncorrected_text(self) -> str:
        """Return the uncorrected text of the markup followed by the tail."""
        marked = self.error_markup.uncorrected_text()
        return marked + self.tail

    def to_xml(self) -> etree.Element:
        """Render the error markup segment as an XML element.

        Returns:
            The lxml Element produced by the error markup, with its
            ``tail`` set to this segment's tail text.
        """
        element = self.error_markup.to_xml()
        element.tail = self.tail

        return element

to_xml()

Convert the error markup segment to its XML representation.

Returns: An lxml Element representing the error markup segment in XML format.

Source code in corpustools/error_annotated_sentence.py
 98
 99
100
101
102
103
104
105
106
def to_xml(self) -> etree.Element:
    """Render the error markup segment as an XML element.

    Returns:
        The lxml Element produced by the error markup, with its
        ``tail`` set to this segment's tail text.
    """
    element = self.error_markup.to_xml()
    element.tail = self.tail

    return element

uncorrected_text()

Get the uncorrected text of the error markup segment.

Source code in corpustools/error_annotated_sentence.py
94
95
96
def uncorrected_text(self) -> str:
    """Return the uncorrected text of the markup followed by the tail."""
    marked = self.error_markup.uncorrected_text()
    return marked + self.tail

parse_markup_to_correction_segment(markup)

Parse a correction segment from the markup iterator.

Args: markup: An iterator over strings representing the markup.

Returns: A CorrectionSegment representing the parsed content.

Source code in corpustools/error_annotated_sentence.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def parse_markup_to_correction_segment(markup: Iterator[str]) -> CorrectionSegment:
    """Parse a correction segment from the markup iterator.

    The first item is discarded, then items are consumed up to and including
    the next ``}`` (or until the iterator is exhausted). The collected text
    has the form ``error_info|suggestions`` or just ``suggestions``, where
    the suggestions are separated by ``///``.

    Args:
        markup: An iterator over strings representing the markup.

    Returns:
        A CorrectionSegment representing the parsed content. ``error_info``
        is ``None`` when no non-empty error info precedes a ``|``.
    """
    # Drop the leading item, presumably the opening '{' (it is not checked).
    next(markup)

    body_chars: list[str] = []
    for char in markup:
        if char == "}":
            break
        body_chars.append(char)
    body = "".join(body_chars)

    # Only the first '|' separates the error info from the suggestions.
    info, separator, alternatives = body.partition("|")
    if not separator:
        info, alternatives = "", body

    return CorrectionSegment(
        error_info=info or None,
        suggestions=alternatives.split("///"),
    )

parse_markup_to_error_markup_segment(markup)

Parse an ErrorMarkupSegment from the markup iterator.

Args: markup: An iterator over strings representing the markup.

Returns: A tuple of (ErrorMarkupSegment, next delimiter character or None). The delimiter is either '{', '}', or None if end of markup.

Source code in corpustools/error_annotated_sentence.py
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
def parse_markup_to_error_markup_segment(
    markup: Iterator[str],
) -> tuple[ErrorMarkupSegment, str | None]:
    """Parse an ErrorMarkupSegment from the markup iterator.

    Reads, in order: the erroneous content, the error type symbol, the
    correction segment, and the tail text following the correction.

    Args:
        markup: An iterator over strings representing the markup.

    Returns:
        A tuple of (parsed ErrorMarkupSegment, delimiter), where the
        delimiter is the second value returned by ``parse_tail``.

    Raises:
        ValueError: If the symbol following the erroneous content does not
            map to a known error type.
    """
    erroneous = parse_markup_to_sentence(markup)

    symbol = next(markup)
    errortype = error_type_from_symbol(symbol)
    if errortype is None:
        raise ValueError(f"Unknown error symbol: «{symbol}»")

    correction = parse_markup_to_correction_segment(markup)
    tail, delimiter = parse_tail(markup)

    error_markup = ErrorMarkup(
        error=erroneous,
        errortype=errortype,
        correction=correction,
    )
    segment = ErrorMarkupSegment(error_markup=error_markup, tail=tail)
    return segment, delimiter

parse_markup_to_sentence(markup)

Parse error annotated sentence from markup iterator.

Parameters:

Name Type Description Default
markup Iterator[str]

An iterator over strings representing the markup.

required

Returns: An ErrorAnnotatedSentence representing the parsed content.

Source code in corpustools/error_annotated_sentence.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
def parse_markup_to_sentence(markup: Iterator[str]) -> ErrorAnnotatedSentence:
    """Parse an error annotated sentence from the markup iterator.

    Plain text is collected as the head until a ``{`` starts an error markup.
    A ``}`` or the end of the iterator while reading the head ends the
    sentence with no errors. After a ``{``, error markup segments are parsed
    for as long as each segment is followed by another ``{``.

    Args:
        markup: An iterator over strings representing the markup.

    Returns:
        An ErrorAnnotatedSentence representing the parsed content.
    """
    head_chars: list[str] = []
    segments: list[ErrorMarkupSegment] = []

    def build() -> ErrorAnnotatedSentence:
        # Assemble the sentence from whatever has been collected so far.
        return ErrorAnnotatedSentence(
            head="".join(head_chars),
            errors=segments,
        )

    # Phase 1: read the head text.
    try:
        while True:
            char = next(markup)
            # An empty item or '{' ends the head and starts segment parsing.
            if not char or char == "{":
                break
            if char == "}":  # end of the enclosing error markup
                return build()
            head_chars.append(char)
    except StopIteration:
        return build()

    # Phase 2: read consecutive error markup segments.
    while True:
        try:
            segment, delimiter = parse_markup_to_error_markup_segment(markup)
        except StopIteration:
            # Markup ran out mid-segment; the partial segment is not added.
            break
        segments.append(segment)
        # Only '{' announces another segment; '}' or None ends the sentence.
        if delimiter != "{":
            break

    return build()

parse_tail(markup)

Parse tail from markup iterator.

Parameters:

Name Type Description Default
markup Iterator[str]

An iterator over strings representing the markup.

required

Returns: A tuple of (tail string, next delimiter character or None). The delimiter is either '{', '}', or None if end of markup.

Source code in corpustools/error_annotated_sentence.py
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
def parse_tail(markup: Iterator[str]) -> tuple[str, str | None]:
    """Parse tail from markup iterator.

    Args:
        markup: An iterator over strings representing the markup.
    Returns:
        A tuple of (tail string, next delimiter character or None).
        The delimiter is either '{', '}', or None if end of markup.
    """
    tail_chars: list[str] = []
    delimiter = None
    for char in markup:
        if char in "{}":
            # Start or end of next error markup segment
            delimiter = char
            break
        tail_chars.append(char)

    return "".join(tail_chars), delimiter