Skip to content

finder

Manage corpus files in various ways.

move_twenty_percent_to_goldcorpus()

Move twenty percent of the files to the goldcorpus

Source code in /home/anders/projects/CorpusTools/corpustools/finder.py
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
def move_twenty_percent_to_goldcorpus():
    """Move twenty percent of the files to the goldcorpus.

    Walks a fixed list of directories below the directory named by the
    ``GTFREE`` environment variable. For every ``.xsl`` file found, the
    path without the ``.xsl`` suffix is recorded, grouped by the size of
    that suffix-less file. The recorded paths are then visited from the
    largest size to the smallest, and every fifth one is passed to
    ``move_files.mover`` with ``orig/`` replaced by ``goldstandard/orig/``
    in its path.

    Raises:
        TypeError: If ``GTFREE`` is not set (``os.path.join`` receives
            ``None``).
        OSError: If a ``.xsl`` file has no suffix-less counterpart, since
            ``os.path.getsize`` is called on that counterpart.
    """
    directories = [
        "orig/sme/admin/sd/cealkamusat_fi",
        "orig/sme/admin/sd/davviriikkalas_samekonvensuvdna_fi",
        "orig/sme/admin/sd/inaugurations_fi",
        "orig/sme/admin/sd/ohcan_lahkai_fi",
        "orig/sme/admin/sd/sami_parlamentarals_raddi_fi",
        "orig/sme/admin/sd/www.samediggi.fi",
        "orig/sme/facta/samediggi.fi/",
    ]
    corpus_root = os.getenv("GTFREE")

    # Group the documents by size so the selection below is spread over
    # the whole size range instead of following directory order.
    documents_by_size = collections.defaultdict(list)
    for directory in directories:
        for root, _dirs, files in os.walk(os.path.join(corpus_root, directory)):
            for filename in files:
                if filename.endswith(".xsl"):
                    # The document is the .xsl path minus its suffix.
                    document = os.path.join(root, filename[:-4])
                    size = os.path.getsize(document)
                    documents_by_size[size].append(document)

    # Move every fifth document (5th, 10th, 15th, ...). The previous
    # counter was reset to 0 and then incremented in the same iteration,
    # so after the first hit it moved every fourth document (25 %).
    position = 0
    for size in sorted(documents_by_size, reverse=True):
        for document in documents_by_size[size]:
            position += 1
            if position % 5 == 0:
                move_files.mover(
                    document, document.replace("orig/", "goldstandard/orig/")
                )

remove_files_with_duplicate_content()

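Re-fetch the sms documents under admin/sd/www.samediggi.fi whose file names contain "itemid=256", using "Itemid=195" in the URL instead, and hand the old files to move_files.mover. Original note: "To replace: 123, , 339, 340".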

Source code in /home/anders/projects/CorpusTools/corpustools/finder.py
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def remove_files_with_duplicate_content():
    """Re-fetch sms documents stored under itemid=256 as itemid=195.

    Walks ``orig/sms/admin/sd/www.samediggi.fi`` below the directory named
    by the ``GTFREE`` environment variable. For every ``.xsl`` file whose
    name contains ``itemid=256``:

    * the ``filename`` metadata variable is read (presumably the source
      URL, since it is passed to ``copy_url_to_corpus`` -- confirm),
    * the path of the Finnish counterpart is derived from the sms path,
    * the Finnish document is fetched if that counterpart does not exist,
    * the sms document is fetched again with ``Itemid=195`` and the
      Finnish counterpart as ``parallelpath``,
    * the old suffix-less file is handed to ``move_files.mover`` with an
      empty destination (presumably this removes it -- confirm).

    Finally both getters add their files to the working copy.
    """
    lang_names = {
        "fin": "finnish",
        "eng": "english",
        "sme": "davvi",
        "smn": "anaras",
        "sms": "nuortta",
    }

    this_lang = "sms"
    site_dir = "admin/sd/www.samediggi.fi"
    lang_param = "lang=" + lang_names[this_lang]

    fingetter = adder.AddToCorpus(str(os.getenv("GTFREE")), "fin", site_dir)
    smsgetter = adder.AddToCorpus(str(os.getenv("GTFREE")), this_lang, site_dir)

    walk_root = os.path.join(os.getenv("GTFREE"), "orig", this_lang, site_dir)
    for root, _dirs, files in os.walk(walk_root):
        print(root)
        for xsl_name in files:
            if not (xsl_name.endswith(".xsl") and "itemid=256" in xsl_name):
                continue

            xsl_path = os.path.join(root, xsl_name)
            source_url = xslsetter.MetadataHandler(xsl_path).get_variable(
                "filename"
            )

            # Path of the Finnish counterpart, derived from the sms path.
            parallellfile = (
                xsl_path.replace("/" + this_lang + "/", "/fin/")
                .replace(".xsl", "")
                .replace(lang_param, "lang=finnish")
                .replace("itemid=256", "itemid=195")
            )
            new_item_url = source_url.replace("Itemid=256", "Itemid=195")

            if not os.path.exists(parallellfile) and this_lang != "fin":
                fingetter.copy_url_to_corpus(
                    new_item_url.replace(lang_param, "lang=finnish")
                )

            smsgetter.copy_url_to_corpus(new_item_url, parallelpath=parallellfile)
            move_files.mover(xsl_path.replace(".xsl", ""), "")

    smsgetter.add_files_to_working_copy()
    fingetter.add_files_to_working_copy()