Skip to content

namechanger

Classes and functions to change the names of corpus files.

CorpusFileRemover

Remove an original file and all its derived files.

Source code in /home/anders/projects/CorpusTools/corpustools/namechanger.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
class CorpusFileRemover:
    """Remove an original corpus file together with its derived files."""

    def __init__(self, oldpath):
        """Prepare the removal of a corpus file.

        Args:
            oldpath (str): path to the original file that should be removed

        Raises:
            SystemExit: if oldpath does not exist
        """
        self.old_corpus_path = corpuspath.make_corpus_path(oldpath)
        if not Path(oldpath).exists():
            raise SystemExit(f"{oldpath} does not exist!")

        # One version control handle per working copy: originals and
        # converted files live in separate directories.
        corpus_path = self.old_corpus_path
        self.orig_vcs = versioncontrol.vcs(corpus_path.orig_corpus_dir)
        self.conv_vcs = versioncontrol.vcs(corpus_path.converted_corpus_dir)

    def remove_files(self):
        """Remove the original, its xsl file, the converted file and tmx files.

        A tmx file is only handed to version control when it exists on disk.
        """
        corpus_path = self.old_corpus_path
        self.orig_vcs.remove(corpus_path.orig)
        self.orig_vcs.remove(corpus_path.xsl)
        self.conv_vcs.remove(corpus_path.converted)
        for lang in corpus_path.metadata.get_parallel_texts():
            tmx_path = corpus_path.tmx(lang)
            if os.path.exists(tmx_path):
                self.conv_vcs.remove(tmx_path)

__init__(oldpath)

Class to remove corpus files.

Parameters:

Name Type Description Default
oldpath unicode

the old path

required
Source code in /home/anders/projects/CorpusTools/corpustools/namechanger.py
44
45
46
47
48
49
50
51
52
53
54
55
def __init__(self, oldpath):
    """Prepare the removal of a corpus file.

    Args:
        oldpath (str): path to the original file that should be removed

    Raises:
        SystemExit: if oldpath does not exist
    """
    self.old_corpus_path = corpuspath.make_corpus_path(oldpath)
    if not Path(oldpath).exists():
        raise SystemExit(f"{oldpath} does not exist!")

    # One version control handle per working copy: originals and
    # converted files live in separate directories.
    corpus_path = self.old_corpus_path
    self.orig_vcs = versioncontrol.vcs(corpus_path.orig_corpus_dir)
    self.conv_vcs = versioncontrol.vcs(corpus_path.converted_corpus_dir)

remove_files()

Remove all the files that are under version control.

Source code in /home/anders/projects/CorpusTools/corpustools/namechanger.py
57
58
59
60
61
62
63
64
def remove_files(self):
    """Remove the original, its xsl file, the converted file and tmx files.

    A tmx file is only handed to version control when it exists on disk.
    """
    corpus_path = self.old_corpus_path
    self.orig_vcs.remove(corpus_path.orig)
    self.orig_vcs.remove(corpus_path.xsl)
    self.conv_vcs.remove(corpus_path.converted)
    for lang in corpus_path.metadata.get_parallel_texts():
        tmx_path = corpus_path.tmx(lang)
        if os.path.exists(tmx_path):
            self.conv_vcs.remove(tmx_path)

NamechangerError

Bases: Exception

This exception is raised when an error occurs in this module.

Source code in /home/anders/projects/CorpusTools/corpustools/namechanger.py
34
35
class NamechangerError(Exception):
    """Raised when an error occurs in this module."""

are_duplicates(oldpath, newpath)

Check if oldpath and newpath are duplicates of each other.

Parameters:

Name Type Description Default
oldpath unicode

old path of the file

required
newpath unicode

the wanted, new path of the file

required

Returns:

Type Description
bool

a boolean indicating if the two files are duplicates

Source code in /home/anders/projects/CorpusTools/corpustools/namechanger.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def are_duplicates(oldpath, newpath):
    """Tell whether oldpath and newpath hold identical content.

    Args:
        oldpath (str): old path of the file
        newpath (str): the wanted, new path of the file

    Returns:
        (bool): True if both paths are regular files whose hexdigests are
            equal, False otherwise
    """
    # Anything that is not a regular file can never be a duplicate.
    if not (os.path.isfile(oldpath) and os.path.isfile(newpath)):
        return False

    with open(oldpath, "rb") as old_stream, open(newpath, "rb") as new_stream:
        old_digest = compute_hexdigest(old_stream)
        new_digest = compute_hexdigest(new_stream)

    return old_digest == new_digest

compute_hexdigest(afile, blocksize=65536)

Compute the hexdigest of the content of a file-like object.

Parameters:

Name Type Description Default
afile file

a file like object

required

Returns:

Type Description
str

a hexdigest of the file

Source code in /home/anders/projects/CorpusTools/corpustools/namechanger.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
def compute_hexdigest(afile, blocksize=65536):
    """Return the md5 hexdigest of the content read from a file object.

    Args:
        afile (file): a file like object whose read() yields bytes
        blocksize (int): number of bytes requested in each read() call

    Returns:
        (str): a hexdigest of the file
    """
    digest = hashlib.md5()
    # Read in chunks so the whole content never has to be held in memory.
    while True:
        chunk = afile.read(blocksize)
        if not chunk:
            break
        digest.update(chunk)

    return digest.hexdigest()

compute_new_basename(orig_path)

Compute the new path.

Parameters:

Name Type Description Default
orig_path Path

path to file, basename should possibly be normalised

required

Returns:

Type Description
pathlib.Path

lower cased, ascii path

Source code in /home/anders/projects/CorpusTools/corpustools/namechanger.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def compute_new_basename(orig_path):
    """Find a normalised, unused path next to orig_path.

    The basename is normalised first. While the resulting path exists, a
    running number is added to the basename until an unused path is found.

    Args:
        orig_path (Path): path to file, basename should possibly be normalised

    Returns:
        (pathlib.Path): orig_path with a normalised, not yet used basename

    Raises:
        UserWarning: if an existing candidate has the same content as
            orig_path
    """
    wanted_basename = normalise_filename(orig_path.name)
    # Split at the last dot once; `dot` is empty when there is no extension.
    stem, dot, extension = wanted_basename.rpartition(".")

    candidate = orig_path.with_name(wanted_basename)
    counter = 1
    while os.path.exists(candidate):
        if are_duplicates(orig_path, candidate):
            raise UserWarning(f"{orig_path} and {candidate} are duplicates. ")

        if dot:
            numbered = f"{stem}_{counter}{dot}{extension}"
        else:
            numbered = f"{wanted_basename}{counter}"
        candidate = orig_path.with_name(numbered)
        counter += 1

    return candidate

normalise_filename(filename)

Normalise filename to ascii only.

Downcase filename, replace non-ascii characters with ascii ones and remove or replace unwanted characters.

Parameters:

Name Type Description Default
filename str

name of the file

required

Returns:

Type Description
str

a downcased string containing only ascii chars

Source code in /home/anders/projects/CorpusTools/corpustools/namechanger.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
def normalise_filename(filename):
    """Normalise filename to ascii only.

    Downcase filename, replace non-ascii characters with ascii ones and
    remove or replace unwanted characters.

    Args:
        filename (str | bytes): name of the file; bytes are decoded as utf8

    Returns:
        (str): a downcased string containing only ascii chars

    Raises:
        NamechangerError: if filename contains the path separator
    """
    # Decode before any str operation: testing `os.sep in filename` on
    # bytes raises TypeError, so the separator check must see a str.
    if isinstance(filename, (bytes, bytearray)):
        filename = filename.decode("utf8")

    if os.sep in filename:
        raise NamechangerError(
            f"Invalid filename {filename}.\n"
            f"Filename is not allowed to contain {os.sep}"
        )

    # unquote replaces %xx escapes by their single-character equivalent,
    # unidecode.unidecode then transliterates the result to ascii only.
    # Leading dashes and underscores are dropped from the ascii name.
    asciiname = unidecode.unidecode(unquote(filename)).lstrip("-_")

    # Each run of unwanted characters collapses into a single underscore.
    unwanted = re.compile("[+ ()'–?,!,<>\"&;&#\\|$]+")

    return unwanted.sub("_", asciiname).lower()