Skip to content

dupe_finder

Classes to find and handle duplicate files in the repository.

The classes work on converted files.

DupeFinder

Handle duplicates in the corpus.

Source code in /home/anders/projects/CorpusTools/corpustools/dupe_finder.py
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
class DupeFinder:
    """Handle duplicates in the corpus.

    The instance keeps two attributes:
        files: maps the path of each xml file found in the directory to the
            text that ``ccat.XMLPrinter`` produced for it.
        dupe_files: a set filled by the comparison methods; it receives
            single filenames from ``remove_dupe_file`` and
            ``(filename1, filename2)`` tuples from ``compare_files``.
    """

    def __init__(self, directory):
        """Read all xml files in the directory.

        Args:
            directory (str): the directory to collect xml files from.
        """
        self.files = self._get_files(directory)
        self.dupe_files = set()

    @staticmethod
    def _get_files(directory):
        """Get the xml documents from the directory.

        Only entries directly inside ``directory`` whose name ends with
        ``.xml`` are read.

        Args:
            directory (str): the directory to collect xml files from.

        Returns:
            (dict): maps each xml file path to the value returned by
                ``xmlprinter.process_file().getvalue()`` for that file.
        """
        files = {}
        xmlprinter = ccat.XMLPrinter(all_paragraphs=True)
        for f in os.listdir(directory):
            if f.endswith(".xml"):
                filename = os.path.join(directory, f)
                xmlprinter.parse_file(filename)
                files[filename] = xmlprinter.process_file().getvalue()

        return files

    @staticmethod
    def get_parallel_texts(filename1):
        """Get the parallel_text elements of a file.

        Args:
            filename1 (str): name of the file that should be searched.

        Returns:
            The result of the xpath query ``.//parallel_text`` on the
            parsed file.
        """
        return etree.parse(filename1).xpath(".//parallel_text")

    def remove_dupe_file(self, filename1, filename2):
        """Remove one of two files if their texts are identical.

        Nothing happens when the texts differ. When they are identical,
        the file with the fewest parallel_text elements is chosen for
        removal; filename1 is chosen when both have the same number.

        Args:
            filename1 (str): name of the first file to be compared.
            filename2 (str): name of the second file to be compared.
        """
        result = list(
            difflib.unified_diff(
                self.files[filename1].splitlines(keepends=True),
                self.files[filename2].splitlines(keepends=True),
            )
        )
        # An empty diff means that the two texts are identical.
        if result:
            return

        print("Parallels:", filename1, filename2)
        # Compare the number of parallel texts. Ordering the element lists
        # themselves with ">" compares the elements, which is not supported.
        to_remove = filename1
        if len(self.get_parallel_texts(filename1)) > len(
            self.get_parallel_texts(filename2)
        ):
            to_remove = filename2

        self.dupe_files.add(to_remove)
        origname = corpuspath.make_corpus_path(to_remove)
        if os.path.exists(origname.orig):
            # NOTE(review): an empty target presumably tells the mover to
            # remove the original file -- confirm against move_files.mover.
            move_files.mover(origname.orig, "")
        print()

    @staticmethod
    def get_wc(filename):
        """Get the wordcount of a file.

        Args:
            filename (str): name of the file to retrieve the word count from.

        Returns:
            (float): the word count
        """
        tree = etree.parse(filename)
        w = tree.find(".//wordcount").text

        return float(w)

    def good_word_ratio(self, filename1, filename2):
        """Check if the word counts of two files are nearly equal.

        Args:
            filename1 (str): name of the first file.
            filename2 (str): name of the second file.

        Returns:
            (bool): True if the ratio between the smaller and the larger
                word count is larger than 0.9, False otherwise. Two files
                that both have a word count of 0 are considered equal.
        """
        w1 = self.get_wc(filename1)
        w2 = self.get_wc(filename2)

        largest = max(w1, w2)
        if largest == 0:
            # Both counts are 0: equal sizes, and the division is undefined.
            return True

        return min(w1, w2) / largest > 0.9

    def compare_files(self, filename1, filename2):
        """Compare two files and print a diff if they are almost equal.

        The pair is added to ``dupe_files`` and a unified diff is written
        to stdout when the similarity ratio of the two texts is larger
        than 0.90.

        Args:
            filename1 (str): name of the first file.
            filename2 (str): name of the second file.
        """
        sm = difflib.SequenceMatcher(a=self.files[filename1], b=self.files[filename2])
        ratio = sm.ratio()
        if ratio > 0.90:
            self.dupe_files.add((filename1, filename2))
            print()
            print(round(ratio, 2), filename1, filename2)

            result = difflib.unified_diff(
                self.files[filename1].splitlines(keepends=True),
                self.files[filename2].splitlines(keepends=True),
                fromfile=os.path.basename(filename1),
                tofile=os.path.basename(filename2),
            )
            sys.stdout.writelines(result)

    def iterate_all_files(self, remove=False):
        """Compare all files to each other.

        Every unordered pair of files is visited once. Pairs with a good
        word ratio are handed to ``remove_dupe_file`` or ``compare_files``;
        the other pairs are only counted.

        Args:
            remove (bool): Defaults to False. If True, remove files,
                otherwise keep files.
        """
        wrong_ratio = 0
        good_ratio = 0
        filenames = list(self.files)
        for index, filename1 in enumerate(filenames):
            # Pairing a file only with the files that follow it visits each
            # unordered pair exactly once and never pairs a file with itself.
            for filename2 in filenames[index + 1 :]:
                if not self.good_word_ratio(filename1, filename2):
                    wrong_ratio += 1
                    continue

                good_ratio += 1
                if remove:
                    self.remove_dupe_file(filename1, filename2)
                else:
                    self.compare_files(filename1, filename2)

        util.print_frame(debug=good_ratio)
        util.print_frame(debug=wrong_ratio)
        print("Almost dupes", len(self.dupe_files))

compare_files(filename1, filename2)

Compare two files.

Parameters:

Name Type Description Default
filename1 str

name of the first file.

required
filename2 str

name of the second file.

required
Source code in /home/anders/projects/CorpusTools/corpustools/dupe_finder.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
def compare_files(self, filename1, filename2):
    """Report two files as near duplicates when their texts are similar.

    When the similarity ratio of the two texts is larger than 0.90 the
    pair is added to ``self.dupe_files``, the rounded ratio and the two
    names are printed, and a unified diff is written to stdout.
    Otherwise nothing happens.

    Args:
        filename1 (str): name of the first file.
        filename2 (str): name of the second file.
    """
    first_text = self.files[filename1]
    second_text = self.files[filename2]

    similarity = difflib.SequenceMatcher(a=first_text, b=second_text).ratio()
    if similarity <= 0.90:
        return

    self.dupe_files.add((filename1, filename2))
    print()
    print(round(similarity, 2), filename1, filename2)

    # Keep the line endings so that the diff can be written out verbatim.
    diff_lines = difflib.unified_diff(
        first_text.splitlines(True),
        second_text.splitlines(True),
        fromfile=os.path.basename(filename1),
        tofile=os.path.basename(filename2),
    )
    sys.stdout.writelines(diff_lines)

get_parallel_texts(filename1) staticmethod

Get the names of the parallel files.

filename1 (str): name of the file that should be searched.

Source code in /home/anders/projects/CorpusTools/corpustools/dupe_finder.py
59
60
61
62
63
64
65
@staticmethod
def get_parallel_texts(filename1):
    """Find the parallel_text elements of a file.

    Args:
        filename1 (str): name of the file that should be searched.

    Returns:
        The result of the xpath query ``.//parallel_text`` on the parsed
        file.
    """
    document = etree.parse(filename1)
    return document.xpath(".//parallel_text")

get_wc(filename) staticmethod

Get the wordcount of a file.

Parameters:

Name Type Description Default
filename str

name of the file to retrieve the word count from.

required

Returns:

Type Description
float

the word count

Source code in /home/anders/projects/CorpusTools/corpustools/dupe_finder.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
@staticmethod
def get_wc(filename):
    """Read the word count stored in a file.

    The count is taken from the text of the first ``wordcount`` element
    found in the parsed file.

    Args:
        filename (str): name of the file to retrieve the word count from.

    Returns:
        (float): the word count
    """
    wordcount_element = etree.parse(filename).find(".//wordcount")
    return float(wordcount_element.text)

good_word_ratio(filename1, filename2)

Check if the word counts of two files are nearly equal.

Parameters:

Name Type Description Default
filename1 str

name of the first file.

required
filename2 str

name of the second file.

required

Returns:

Type Description
bool

True if the ratio between the smaller and the larger word count is larger than 0.9, False otherwise.

Source code in /home/anders/projects/CorpusTools/corpustools/dupe_finder.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
def good_word_ratio(self, filename1, filename2):
    """Check if the word counts of two files are nearly equal.

    Args:
        filename1 (str): name of the first file.
        filename2 (str): name of the second file.

    Returns:
        (bool): True if the ratio between the smaller and the larger word
            count is larger than 0.9, False otherwise. Two files that both
            have a word count of 0 are considered equal.
    """
    w1 = self.get_wc(filename1)
    w2 = self.get_wc(filename2)

    largest = max(w1, w2)
    if largest == 0:
        # Both counts are 0: equal sizes, and the division is undefined.
        return True

    return min(w1, w2) / largest > 0.9

iterate_all_files(remove=False)

Compare all files to each other.

Parameters:

Name Type Description Default
remove bool

Defaults to False. If True, remove files, otherwise keep files.

False
Source code in /home/anders/projects/CorpusTools/corpustools/dupe_finder.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
def iterate_all_files(self, remove=False):
    """Compare all files to each other.

    Every unordered pair of files is visited once. Pairs with a good word
    ratio are handed to ``remove_dupe_file`` or ``compare_files``; the
    other pairs are only counted. The two counters and the size of
    ``self.dupe_files`` are printed at the end.

    Args:
        remove (bool): Defaults to False. If True, remove files,
            otherwise keep files.
    """
    wrong_ratio = 0
    good_ratio = 0
    filenames = list(self.files)
    for index, filename1 in enumerate(filenames):
        # Pairing a file only with the files that follow it visits each
        # unordered pair exactly once and never pairs a file with itself.
        for filename2 in filenames[index + 1 :]:
            if not self.good_word_ratio(filename1, filename2):
                wrong_ratio += 1
                continue

            good_ratio += 1
            if remove:
                self.remove_dupe_file(filename1, filename2)
            else:
                self.compare_files(filename1, filename2)

    util.print_frame(debug=good_ratio)
    util.print_frame(debug=wrong_ratio)
    print("Almost dupes", len(self.dupe_files))

remove_dupe_file(filename1, filename2)

Remove duplicate files.

filename1 (str): name of the first file to be compared.

filename2 (str): name of the second file to be compared.

Source code in /home/anders/projects/CorpusTools/corpustools/dupe_finder.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def remove_dupe_file(self, filename1, filename2):
    """Remove one of two files if their texts are identical.

    Nothing happens when the texts differ. When they are identical, the
    file with the fewest parallel_text elements is chosen for removal;
    filename1 is chosen when both have the same number.

    Args:
        filename1 (str): name of the first file to be compared.
        filename2 (str): name of the second file to be compared.
    """
    result = list(
        difflib.unified_diff(
            self.files[filename1].splitlines(keepends=True),
            self.files[filename2].splitlines(keepends=True),
        )
    )
    # An empty diff means that the two texts are identical.
    if result:
        return

    print("Parallels:", filename1, filename2)
    # Compare the number of parallel texts. Ordering the element lists
    # themselves with ">" compares the elements, which is not supported.
    to_remove = filename1
    if len(self.get_parallel_texts(filename1)) > len(
        self.get_parallel_texts(filename2)
    ):
        to_remove = filename2

    self.dupe_files.add(to_remove)
    origname = corpuspath.make_corpus_path(to_remove)
    if os.path.exists(origname.orig):
        # NOTE(review): an empty target presumably tells the mover to remove
        # the original file -- confirm against move_files.mover.
        move_files.mover(origname.orig, "")
    print()