Skip to content

error_annotated_converter

convert2intermediate(filename)

Convert files containing error-annotated sentences to the GiellaLT xml format.

Source code in corpustools/error_annotated_converter.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def convert2intermediate(filename: Path) -> etree.Element:
    """Convert files containing error-annotated sentences to the GiellaLT xml format.

    The file is read as UTF-8 and every line is parsed on its own with
    ``parse_markup_to_sentence``; the XML of each parsed line is appended to
    the ``body`` of a new ``document`` element.

    Lines that raise ``ValueError`` do not abort the run immediately. They are
    collected so that a single pass reports every bad line in the file.

    Args:
        filename: Path to the text file with error-annotated sentences.

    Returns:
        A ``document`` element with an empty ``header`` child and a ``body``
        child holding one converted element per input line.

    Raises:
        ConversionError: If at least one line failed to parse. The details of
            all failing lines are written to a ``.log`` file next to
            ``filename`` (same name, suffix replaced), and the exception is
            chained from the first ``ValueError`` encountered.
    """
    document = etree.Element("document")
    etree.SubElement(document, "header")
    body = etree.SubElement(document, "body")

    # (zero-based line index, raw line text, the parse error) per failing line.
    errors: list[tuple[int, str, ValueError]] = []
    for index, line in enumerate(filename.read_text(encoding="utf-8").splitlines()):
        try:
            # The parser consumes the line as a stream of characters.
            error_annotated = parse_markup_to_sentence(iter(line))
            body.append(error_annotated.to_xml())
        except ValueError as error:
            errors.append((index, line, error))

    if errors:
        log_file = filename.with_suffix(".log")
        # The log echoes the offending source lines verbatim, so it must be
        # written with the same encoding the input was read with. Relying on
        # the locale default could raise UnicodeEncodeError here and hide the
        # ConversionError below.
        log_file.write_text(
            "\n".join(
                f"Error parsing line nr {index}:\n{line}\n{error}\n\n"
                for index, line, error in errors
            ),
            encoding="utf-8",
        )
        raise ConversionError(
            f"Error markup parsing error, see log file {log_file}."
        ) from errors[0][2]

    return document