korp_mono

Turn analysed files into Korp files.

extract_original_analysis(used_analysis, language)

Filter all Err- and Sem-tags from the string.

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def extract_original_analysis(used_analysis, language):
    """Filter all Err- and Sem-tags from the string."""
    # lang-nob produces analyses like:
    # Use/Circ Use/SpellNoSugg"CWD" N Prop Sem/Org ACR Dyn Err/Orth Msc Sg Indef
    # insert a space before the quote that directly follows these Use tags
    for strange_use in ["Circ", "SpellNoSugg"]:
        used_analysis = used_analysis.replace(
            f'Use/{strange_use}"', f'Use/{strange_use} "'
        )
    if language == "sme":
        used_analysis = group_sem(used_analysis)
    else:
        used_analysis = re.sub(r"Sem/[^\s]+\s", "", used_analysis)

    for regex in [
        r"Use/[^\s]+\s",
        r"Gram/[^\s]+\s",
        r"OLang/[^\s]+\s",
        r"Dial/[^\s]+\s",
        r"CmpN/[^\s]+\s",
        r"CmpNP/[^\s]+\s",
        r"G3+\s",
        r"v9+\s",
        r"Err/[^\s]+\s",
    ]:
        used_analysis = re.sub(regex, "", used_analysis)

    return used_analysis
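
A minimal usage sketch, assuming the function is imported from corpustools.korp_mono; the analysis string is an invented example modelled on the lang-nob line in the comment above, and the expected result is indicative only.

from corpustools.korp_mono import extract_original_analysis

cleaned = extract_original_analysis(
    'Use/Circ Use/SpellNoSugg"CWD" N Prop Sem/Org ACR Dyn Err/Orth Msc Sg Indef ',
    "nob",
)
# Use/..., Sem/... and Err/... tags are stripped, leaving roughly
# '"CWD" N Prop ACR Dyn Msc Sg Indef '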

lemma_generation(original_analysis, pos, _current_lang)

Generate lemma.

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def lemma_generation(original_analysis, pos, _current_lang):
    """Generate lemma."""
    if "Ex/" in original_analysis or "_™_" in original_analysis:
        lemma_generation_string = get_generation_string(
            original_analysis, pos, _current_lang
        )

        if lemma_generation_string:
            return generate_lemma(lemma_generation_string, _current_lang)

    return ""
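
A small sketch with invented arguments, assuming an import from corpustools.korp_mono: an analysis without a derivation ("Ex/") or composition ("_™_") marker falls straight through and yields an empty string.

from corpustools.korp_mono import lemma_generation

# No "Ex/" or "_™_" marker in the analysis, so no lemma is generated.
assert lemma_generation("N Sg Nom", "N", "sme") == ""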

make_morpho_syntactic_description(rest)

Extract morpho_syntactic_description.

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def make_morpho_syntactic_description(rest):
    """Extract morpho_syntactic_description"""
    ex_in_r = rest.find("_©_")
    tm_in_r = rest.find("_™_")

    # Split the derivation/composition string off the rest of the
    # morpho-syntactic description (MSD) and return only the MSD part.
    # no derivation, no composition
    if ex_in_r == -1 and tm_in_r == -1:
        return rest
    # no derivation, but composition
    elif (ex_in_r == -1 and tm_in_r != -1) or (
        ex_in_r != -1 and tm_in_r != -1 and tm_in_r < ex_in_r
    ):
        return rest.split("_™_", 1)[0]
    # derivation, but no composition
    elif (ex_in_r != -1 and tm_in_r == -1) or (
        ex_in_r != -1 and tm_in_r != -1 and ex_in_r < tm_in_r
    ):
        return rest.split("_©_", 1)[0]
    # covered all relevant combinations?
    else:
        return ""
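
A few illustrative calls with invented tag strings, assuming an import from corpustools.korp_mono; in this analysis format "_©_" introduces the derivation part and "_™_" the composition part, and only the text before the first such marker is kept.

from corpustools.korp_mono import make_morpho_syntactic_description

make_morpho_syntactic_description("N Sg Nom")             # -> 'N Sg Nom'
make_morpho_syntactic_description("N Sg Nom_™_N Sg Gen")  # -> 'N Sg Nom'
make_morpho_syntactic_description("V Inf_©_Ex/V TV")      # -> 'V Inf'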

make_sentences(sentences, current_lang)

Make sentences from the current analysis.

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def make_sentences(sentences, current_lang):
    """Make sentences from the current analysis."""
    return (
        make_sentence(current_sentence, current_lang) for current_sentence in sentences
    )
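
Note that the return value is a lazy generator, not a list. A minimal sketch, assuming an import from corpustools.korp_mono (an empty iterable is used only to show the shape of the call):

from corpustools.korp_mono import make_sentences

assert list(make_sentences([], "sme")) == []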

make_vrt_xml(current_file, lang)

Convert the analysis of a file into a VRT file.

Convert the analysis output into an XML format suitable for VRT transformation (VRT is the CWB input format).

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def make_vrt_xml(current_file, lang):
    """Convert analysis of a file into a vrt file

    Converting the analysis output into a suitable xml format for vrt
    transformation (vrt is the cwb input format)
    """
    p = etree.XMLParser(encoding="utf-8", huge_tree=True)
    xml_tree = etree.parse(current_file, parser=p)
    old_root = xml_tree.getroot()

    f_root = make_root_element(old_root)
    for s_id, sentence in enumerate(
        make_sentences(valid_sentences(old_root.find(".//body/dependency").text), lang)
    ):
        current_sentence = etree.SubElement(f_root, "sentence")
        current_sentence.set("id", str(s_id + 1))
        current_sentence.text = sentence

    pad_elements(f_root)

    return f_root
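
A usage sketch, assuming an import from corpustools.korp_mono; the file path is hypothetical and must point to an analysed corpus file containing a body/dependency element, as the function expects.

from lxml import etree

from corpustools.korp_mono import make_vrt_xml

vrt_root = make_vrt_xml("analysed/sme/admin/document.xml", "sme")
print(etree.tostring(vrt_root, encoding="unicode"))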

pad_elements(elem)

Make sure empty text or tail is padded with newline.

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def pad_elements(elem):
    """Make sure empty text or tail is padded with newline."""
    padding = "\n"
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = padding
        for child in elem:
            pad_elements(child)
    if not elem.tail or not elem.tail.strip():
        elem.tail = padding
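
A small self-contained sketch of the effect, using lxml directly; the element content is invented.

from lxml import etree

from corpustools.korp_mono import pad_elements

root = etree.fromstring("<text><sentence>okta\tokta\tNum\n</sentence></text>")
pad_elements(root)
etree.tostring(root)
# b'<text>\n<sentence>okta\tokta\tNum\n</sentence>\n</text>\n'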

process_file(file)

Convert analysed file into vrt format file.

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def process_file(file):
    """Convert analysed file into vrt format file."""
    analysed_file = corpuspath.make_corpus_path(file)
    path = analysed_file.korp_mono
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(
        etree.tostring(
            make_vrt_xml(file, analysed_file.lang),
            xml_declaration=False,
            encoding="utf-8",
        )
    )
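
A usage sketch; the path is hypothetical and must lie inside a corpus directory layout that corpuspath.make_corpus_path can interpret, so that the korp_mono output path can be derived from it.

from corpustools.korp_mono import process_file

process_file("/path/to/analysed/corpus/file.xml")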