Skip to content

errormarkup

Classes and functions to convert errormarkup to xml.

ErrorMarkupError

Bases: Exception

This is raised for errors in this module.

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
240
241
class ErrorMarkupError(Exception):
    """Exception raised for errors detected by this module."""

add_error_markup(element)

Convert error markup to xml in this element and its children.

This is the starting point for doing markup.

Parameters:

Name Type Description Default
element etree._Element

The element where error markup should be converted to xml.

required
Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
def add_error_markup(element):
    """Convert error markup to xml in the children of this element.

    This is the entry point of the module: the markup is validated first
    and nothing is converted unless the validation passes.

    Args:
        element (etree._Element): The element where error markup should
            be converted to xml. Its children are modified in place.

    Raises:
        ErrorMarkupError: If validate_markup reports any problem. The
            message contains all reported problems, one per line.
    """
    problems = list(validate_markup(element))
    if problems:
        raise ErrorMarkupError("\n".join(problems))

    # Iterate the element directly: the conversion adds new siblings while
    # the loop runs.
    for child in element:
        convert_to_errormarkupxml(child)

convert_to_errormarkupxml(element)

Convert errormarkup found in the element to xml.

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
198
199
200
201
202
203
204
205
206
207
def convert_to_errormarkupxml(element):
    """Convert errormarkup found in the element to xml.

    The text and the tail of the element are converted first, then the
    function recurses into the children of the element.

    Args:
        element: The element to convert; it is modified in place.
            Presumably an etree._Element, as documented for
            add_error_markup.
    """
    # fix_text and fix_tail are only called when there is something to
    # convert.
    if element.text:
        fix_text(element)

    if element.tail:
        fix_tail(element)

    # The children are visited after fix_text has run, so the error
    # elements it inserted into this element are visited as well.
    for child in element:
        convert_to_errormarkupxml(child)

errormarkup_to_xml(text, last_correction)

Turn the errormarkup into error xml.

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
168
169
170
171
172
173
174
175
176
177
178
179
180
def errormarkup_to_xml(text, last_correction):
    """Turn the error markup that ends with the given correction into xml.

    Args:
        text (str): The text containing the error markup.
        last_correction: A regex match inside text with a group named
            "correction". The first character of that group is looked up
            in ERROR_TYPES to get the tag of the error element; the
            characters between the second and the last one are used as
            the correction content.

    Returns:
        (tuple): The text in front of the error and the new error element.
            The text that followed the correction is the tail of the error
            element.
    """
    following = text[last_correction.end() :]
    preceding, error_text = scan_for_error(text[: last_correction.start()])

    correction = last_correction.group("correction")
    error_element = make_error_element(
        error_text, ERROR_TYPES[correction[0]], correction[2:-1]
    )
    error_element.tail = following

    # The text of the error may itself contain error markup.
    fix_text(error_element)

    return preceding, error_element

fix_tail(element)

Replace error markup in the tail of the element with error xml.

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
183
184
185
186
187
188
189
190
191
192
193
194
195
def fix_tail(element):
    """Replace error markup in the tail of the element with error xml.

    For every correction found in the tail an error element is added to
    the parent of the element, and the tail is shortened to the text in
    front of that error.

    Args:
        element (etree._Element): The element whose tail is converted.
            Nothing is done when the tail is empty.
    """
    if not element.tail:
        return

    parent = element.getparent()
    # Corrections are handled from the end of the tail towards its start
    # (assuming LAST_CORRECTION_REGEX matches the last correction, the same
    # order fix_text relies on when it inserts at index 0). Each new error
    # element belongs in front of the ones made before it, so all of them
    # go into the same slot: directly after the element.
    position = parent.index(element) + 1

    text = element.tail
    last_correction = LAST_CORRECTION_REGEX.search(text)
    while last_correction:
        text, error_element = errormarkup_to_xml(text, last_correction)
        element.tail = text
        parent.insert(position, error_element)
        last_correction = LAST_CORRECTION_REGEX.search(text)

fix_text(element)

Replace error markup in the text of the element with error xml.

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
156
157
158
159
160
161
162
163
164
165
def fix_text(element):
    """Replace error markup in the text of the element with error xml.

    For every correction found in the text an error element is inserted as
    the first child of the element, and the text is shortened to what was
    in front of that error.

    Args:
        element (etree._Element): The element whose text is converted.
            Nothing is done when the text is empty.
    """
    remaining = element.text
    if not remaining:
        return

    while True:
        found = LAST_CORRECTION_REGEX.search(remaining)
        if not found:
            break

        remaining, new_error = errormarkup_to_xml(remaining, found)
        element.text = remaining
        # Always inserted first, i.e. in front of the error elements made
        # in earlier rounds.
        element.insert(0, new_error)

has_not_valid_pairs(text)

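Check if the text has valid pairs. Returns the first correction that is left after all simple errors have been removed, or an empty string if no correction is left.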

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def has_not_valid_pairs(text):
    """Look for a correction that is left when simple errors are removed.

    Simple errors are removed from the text again and again. When a round
    removes nothing and a correction is still found, that correction is
    reported.

    Args:
        text (str): The text to check.

    Returns:
        (str): The "correction" group of the first correction that is
            left, or an empty string when no correction is left.
    """
    if not CORRECTION_REGEX.search(text):
        return ""

    current = text
    while True:
        stripped = remove_simple_errors(current)
        leftover = CORRECTION_REGEX.search(stripped)

        if leftover is None:
            return ""

        # Nothing was removed in this round, yet a correction remains.
        if stripped == current:
            return leftover.group("correction")

        current = stripped

invalid_corrections(text)

Check if all corrections are valid. Returns the corrections that contain two or more «|».

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
45
46
47
48
49
50
51
52
def invalid_corrections(text):
    """Collect the corrections that contain too many «|».

    Args:
        text (str): The text to search for corrections.

    Returns:
        (list[str]): Every part of a correction, the parts being separated
            by "///", that contains two or more «|».
    """
    flawed = []
    for match in CORRECTION_REGEX.finditer(text):
        for part in match.group("correction").split("///"):
            if part.count("|") >= 2:
                flawed.append(part)

    return flawed

look_for_extended_attributes(correction)

Extract attributes and correction from a correction string.

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
126
127
128
129
130
131
132
133
def look_for_extended_attributes(correction):
    """Split a correction string into the correction and its attributes.

    Args:
        correction (str): Either "correction" or "attributes|correction".

    Returns:
        (tuple): The correction text and the attributes. The attributes
            are None when the string contains no «|».
    """
    first, *others = correction.split("|")

    if others:
        return (others[0], first)

    return (first, None)

make_correction_element(correction_content)

Make correction elements.

Parameters:

Name Type Description Default
correction_content str

string containing the correction(s)

required

Yields:

Type Description
lxml.etree.Element

A correction element for each correction

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def make_correction_element(correction_content):
    """Make one correct element for each correction.

    Args:
        correction_content (str): string containing the correction(s),
            separated by "///"

    Yields:
        (lxml.etree.Element): A correct element for each correction. The
            errorinfo attribute is set only when the correction has
            extended attributes.
    """
    for alternative in correction_content.split("///"):
        content, errorinfo = look_for_extended_attributes(alternative)

        element = etree.Element("correct")
        if errorinfo is not None:
            element.set("errorinfo", errorinfo)
        element.text = content

        yield element

make_error_element(error_text, error_name, correction)

Make an error xml element.

Parameters:

Name Type Description Default
error_text str

the text of the error element

required
error_name str

the tag of the error element

required
correction str

the correction(s) for the error

required

Returns:

Type Description
lxml.etree.Element

An etree._Element

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
def make_error_element(error_text, error_name, correction):
    """Build an error xml element with its correction children.

    Args:
        error_text (str): the text of the error element
        error_name (str): the tag of the error element
        correction (str): the correction(s) for the error

    Returns:
        (lxml.etree.Element): The error element, with one child for each
            element made by make_correction_element.
    """
    error_element = etree.Element(error_name)
    error_element.text = error_text
    error_element.extend(make_correction_element(correction))

    return error_element

remove_simple_errors(text)

Remove non nested errors from the text.

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
55
56
57
58
59
60
61
62
63
64
65
66
def remove_simple_errors(text):
    """Remove everything matched by SIMPLE_ERROR_REGEX from the text.

    Args:
        text (str): The text to remove the simple, non nested errors from.

    Returns:
        (str): The text without the matched parts.
    """
    return SIMPLE_ERROR_REGEX.sub("", text)

scan_for_error(text)

Scan for error markup in the given text.

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
def scan_for_error(text):
    """Split the given text into leading text and the error that ends it.

    The text is scanned backwards while counting curly brackets. The scan
    stops where the brackets are balanced; the first character of the text
    is never inspected.

    Args:
        text (str): Text that is expected to end with an error in curly
            brackets.

    Returns:
        (tuple): The text in front of the stop position, and the text
            between the stop position and the last character, both
            excluded.
    """
    depth = 0
    for start in range(len(text) - 1, 0, -1):
        char = text[start]
        if char == "}":
            depth += 1
        elif char == "{":
            depth -= 1

        if depth == 0:
            break
    else:
        # The scan ran out of characters: the error starts at the very
        # beginning of the text (or the text is empty).
        start = 0 if text else -1

    leading = text[:start] if start else ""

    return leading, text[start + 1 : -1]

validate_markup(element)

Check if the markup is valid.

Source code in /home/anders/projects/CorpusTools/corpustools/errormarkup.py
210
211
212
213
214
215
216
217
218
219
def validate_markup(element):
    """Report the problems found in the error markup of the children.

    Args:
        element (etree._Element): The element whose children are checked.

    Yields:
        (str): A message for each correction with too many «|», and two
            messages for each child where has_not_valid_pairs finds a
            correction.
    """
    for child in element:
        serialized = etree.tostring(child, encoding="unicode")

        yield from (
            f"Too many «|» in {flawed}" for flawed in invalid_corrections(serialized)
        )

        leftover = has_not_valid_pairs(serialized)
        if leftover:
            # Presumably skips the opening tag of the child; the excerpt
            # ends at character 50 of the serialized child.
            excerpt = serialized[len(child.tag) + 2 : 50]
            yield f"In text starting with\n\t{excerpt}"
            yield f"\tError in front of\n\t\t{leftover}"