korp_mono

Turn analysed files into Korp files.

extract_original_analysis(used_analysis, language)

Filter all Err- and Sem-tags from the string.

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def extract_original_analysis(used_analysis, language):
    """Filter all Err- and Sem-tags from the string."""
    # lang-nob produces analyses like:
    # Use/Circ Use/SpellNoSugg"CWD" N Prop Sem/Org ACR Dyn Err/Orth Msc Sg Indef
    # insert a space before the quote that directly follows these Use tags
    for strange_use in ["Circ", "SpellNoSugg"]:
        used_analysis = used_analysis.replace(
            f'Use/{strange_use}"', f'Use/{strange_use} "'
        )
    if language == "sme":
        used_analysis = group_sem(used_analysis)
    else:
        used_analysis = re.sub(r"Sem/[^\s]+\s", "", used_analysis)

    for regex in [
        r"Use/[^\s]+\s",
        r"Gram/[^\s]+\s",
        r"OLang/[^\s]+\s",
        r"Dial/[^\s]+\s",
        r"CmpN/[^\s]+\s",
        r"CmpNP/[^\s]+\s",
        r"G3+\s",
        r"v9+\s",
        r"Err/[^\s]+\s",
    ]:
        used_analysis = re.sub(regex, "", used_analysis)

    return used_analysis
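
A minimal usage sketch, assuming the function is imported from corpustools.korp_mono; the analysis string is an invented example modelled on the lang-nob line in the comment above, and the expected result is indicative only.

from corpustools.korp_mono import extract_original_analysis

cleaned = extract_original_analysis(
    'Use/Circ Use/SpellNoSugg"CWD" N Prop Sem/Org ACR Dyn Err/Orth Msc Sg Indef ',
    "nob",
)
# Use/..., Sem/... and Err/... tags are stripped, leaving roughly
# '"CWD" N Prop ACR Dyn Msc Sg Indef '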

lemma_generation(original_analysis, pos, _current_lang)

Generate lemma.

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def lemma_generation(original_analysis, pos, _current_lang):
    """Generate lemma."""
    if "Ex/" in original_analysis or "_™_" in original_analysis:
        lemma_generation_string = get_generation_string(
            original_analysis, pos, _current_lang
        )

        if lemma_generation_string:
            return generate_lemma(lemma_generation_string, _current_lang)

    return ""
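
A small sketch with invented arguments, assuming an import from corpustools.korp_mono: an analysis without a derivation ("Ex/") or composition ("_™_") marker falls straight through and yields an empty string.

from corpustools.korp_mono import lemma_generation

# No "Ex/" or "_™_" marker in the analysis, so no lemma is generated.
assert lemma_generation("N Sg Nom", "N", "sme") == ""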

make_morpho_syntactic_description(rest)

Extract morpho_syntactic_description.

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def make_morpho_syntactic_description(rest):
    """Extract morpho_syntactic_description"""
    ex_in_r = rest.find("_©_")
    tm_in_r = rest.find("_™_")

    # Split the derivation/composition string off the rest of the
    # morpho-syntactic description (MSD) and return only the MSD part.
    # no derivation, no composition
    if ex_in_r == -1 and tm_in_r == -1:
        return rest
    # no derivation, but composition
    elif (ex_in_r == -1 and tm_in_r != -1) or (
        ex_in_r != -1 and tm_in_r != -1 and tm_in_r < ex_in_r
    ):
        return rest.split("_™_", 1)[0]
    # derivation, but no composition
    elif (ex_in_r != -1 and tm_in_r == -1) or (
        ex_in_r != -1 and tm_in_r != -1 and ex_in_r < tm_in_r
    ):
        return rest.split("_©_", 1)[0]
    # covered all relevant combinations?
    else:
        return ""
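
A few illustrative calls with invented tag strings, assuming an import from corpustools.korp_mono; in this analysis format "_©_" introduces the derivation part and "_™_" the composition part, and only the text before the first such marker is kept.

from corpustools.korp_mono import make_morpho_syntactic_description

make_morpho_syntactic_description("N Sg Nom")             # -> 'N Sg Nom'
make_morpho_syntactic_description("N Sg Nom_™_N Sg Gen")  # -> 'N Sg Nom'
make_morpho_syntactic_description("V Inf_©_Ex/V TV")      # -> 'V Inf'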

make_sentences(sentences, current_lang)

Make sentences from the current analysis.

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def make_sentences(sentences, current_lang):
    """Make sentences from the current analysis."""
    return (
        make_sentence(current_sentence, current_lang) for current_sentence in sentences
    )
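
Note that the return value is a lazy generator, not a list. A minimal sketch, assuming an import from corpustools.korp_mono (an empty iterable is used only to show the shape of the call):

from corpustools.korp_mono import make_sentences

assert list(make_sentences([], "sme")) == []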

make_vrt_xml(current_file, lang)

Convert the analysis of a file into a VRT file.

Convert the analysis output into an XML format suitable for VRT transformation (VRT is the CWB input format).

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def make_vrt_xml(current_file, lang):
    """Convert analysis of a file into a vrt file

    Converting the analysis output into a suitable xml format for vrt
    transformation (vrt is the cwb input format)
    """
    p = etree.XMLParser(encoding="utf-8", huge_tree=True)
    xml_tree = etree.parse(current_file, parser=p)
    old_root = xml_tree.getroot()

    f_root = make_root_element(old_root)
    for s_id, sentence in enumerate(
        make_sentences(valid_sentences(old_root.find(".//body/dependency").text), lang)
    ):
        current_sentence = etree.SubElement(f_root, "sentence")
        current_sentence.set("id", str(s_id + 1))
        current_sentence.text = sentence

    pad_elements(f_root)

    return f_root
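
A usage sketch, assuming an import from corpustools.korp_mono; the file path is hypothetical and must point to an analysed corpus file containing a body/dependency element, as the function expects.

from lxml import etree

from corpustools.korp_mono import make_vrt_xml

vrt_root = make_vrt_xml("analysed/sme/admin/document.xml", "sme")
print(etree.tostring(vrt_root, encoding="unicode"))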

pad_elements(elem)

Make sure empty text or tail is padded with newline.

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def pad_elements(elem):
    """Make sure empty text or tail is padded with newline."""
    padding = "\n"
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = padding
        for child in elem:
            pad_elements(child)
    if not elem.tail or not elem.tail.strip():
        elem.tail = padding
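
A small self-contained sketch of the effect, using lxml directly; the element content is invented.

from lxml import etree

from corpustools.korp_mono import pad_elements

root = etree.fromstring("<text><sentence>okta\tokta\tNum\n</sentence></text>")
pad_elements(root)
etree.tostring(root)
# b'<text>\n<sentence>okta\tokta\tNum\n</sentence>\n</text>\n'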

process_file(file)

Convert analysed file into vrt format file.

Source code in /home/anders/projects/CorpusTools/corpustools/korp_mono.py
def process_file(file):
    """Convert analysed file into vrt format file."""
    analysed_file = corpuspath.make_corpus_path(file)
    path = analysed_file.korp_mono
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(
        etree.tostring(
            make_vrt_xml(file, analysed_file.lang),
            xml_declaration=False,
            encoding="utf-8",
        )
    )
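
A usage sketch; the path is hypothetical and must lie inside a corpus directory layout that corpuspath.make_corpus_path can interpret, so that the korp_mono output path can be derived from it.

from corpustools.korp_mono import process_file

process_file("/path/to/analysed/corpus/file.xml")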