Convert files containing error-annotated sentences to the GiellaLT xml format.
Source code in corpustools/error_annotated_converter.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def convert2intermediate(filename: Path) -> etree.Element:
    """Convert a file of error-annotated sentences to a GiellaLT xml document.

    Every line of the file is parsed as one error-annotated sentence. The
    xml produced for each successfully parsed line is appended to the
    ``body`` of a new ``document`` element, which also gets an empty
    ``header`` child.

    Args:
        filename: Path to the UTF-8 encoded file to convert.

    Returns:
        The ``document`` element containing the converted sentences.

    Raises:
        ConversionError: If at least one line could not be parsed. Before
            raising, all failing lines are written to a log file next to
            ``filename`` (same name, ``.log`` suffix). The exception is
            chained to the first parsing error.
    """
    document = etree.Element("document")
    etree.SubElement(document, "header")
    body = etree.SubElement(document, "body")

    errors: list[tuple[int, str, ValueError]] = []
    lines = filename.read_text(encoding="utf-8").splitlines()
    # Count from 1 so the numbers in the log match what editors display.
    for line_number, line in enumerate(lines, start=1):
        try:
            error_annotated = parse_markup_to_sentence(iter(line))
            body.append(error_annotated.to_xml())
        except ValueError as error:
            # Keep going so that every faulty line is reported in one run.
            errors.append((line_number, line, error))

    if errors:
        log_file = filename.with_suffix(".log")
        # The log echoes the input lines, so it is written with the same
        # encoding as the input rather than the locale default.
        log_file.write_text(
            "\n".join(
                f"Error parsing line nr {line_number}:\n{line}\n{error}\n\n"
                for line_number, line, error in errors
            ),
            encoding="utf-8",
        )
        raise ConversionError(
            f"Error markup parsing error, see log file {log_file}."
        ) from errors[0][2]

    return document
|