Skip to content

generate_anchor_list

Generate an anchor file needed by the java aligner.

GenerateAnchorList

Generate anchor list used by tca2.

Source code in /home/anders/projects/CorpusTools/corpustools/generate_anchor_list.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
class GenerateAnchorList:
    """Generate anchor list used by tca2.

    Reads an anchor file in which every data line holds one
    "/"-separated column per language in ``columns`` and extracts the
    word pairs for one language pair.
    """

    def __init__(self, lang1, lang2, columns, input_file):
        """Initialise the GenerateAnchorList class.

        Args:
            lang1 (str): the main lang
            lang2 (str): the translated lang
            columns (list of str): contains all the possible langs
                found in the main anchor file.
            input_file (str): path of the existing anchor file.

        Raises:
            ValueError: if lang1 or lang2 is not found in columns.
        """
        self.lang1 = lang1
        self.lang2 = lang2
        self.lang1_index = columns.index(lang1)
        self.lang2_index = columns.index(lang2)
        self.columns = columns
        self.input_file = input_file

    def words_of_line(self, lineno, line):
        """Extract the word pair for lang1 and lang2 from one line.

        Args:
            lineno (int): number of the line, used only in the error
                message.
            line (str): one raw line of the anchor file.

        Returns:
            tuple of (str, str) or None: the stripped lang1 and lang2
                words, or None when the line starts with "#" or "&",
                has the wrong number of columns, or lacks a word for
                either language.
        """
        line = line.strip()
        # The original test joined the two negated checks with ``or``,
        # which is true for every line; lines starting with either
        # marker are meant to be skipped.
        if line.startswith(("#", "&")):
            return None

        words = line.split("/")
        if len(words) != len(self.columns):
            print(
                f"Invalid line at {lineno} in {self.input_file}",
                file=sys.stderr,
            )
            return None

        word1 = words[self.lang1_index].strip()
        word2 = words[self.lang2_index].strip()
        if word1 and word2:
            return word1, word2
        return None

    def read_anchors(self, quiet=False):
        """Read the word pairs found in the input file.

        Args:
            quiet (bool): if false, the number of anchors read is
                reported through util.note.

        Returns:
            list of tuple of (str, str): the word pairs, in file order.
                Lines for which words_of_line returns None are left
                out.
        """
        with codecs.open(self.input_file, encoding="utf8") as f:
            # Number lines from 1 so the "Invalid line" message matches
            # what an editor shows.
            out = [
                self.words_of_line(lineno, line)
                for lineno, line in enumerate(f.readlines(), start=1)
            ]
            out = [pair for pair in out if pair]
            if not quiet:
                util.note(f"Read {len(out)} anchors from {self.input_file}")
            return out

    def generate_file(self, outpath, quiet=False):
        """Write the word pairs of the input file to outpath.

        Each pair is written as "word1 / word2" on a line of its own.

        Args:
            outpath (str): path of the file to write.
            quiet (bool): passed on to read_anchors; if false, a note
                naming outpath is also emitted through util.note.
        """
        anchors = self.read_anchors(quiet)

        with codecs.open(outpath, "w", encoding="utf8") as outfile:
            if not quiet:
                util.note(f"Generating anchor word list to {outpath}")
            out = "\n".join(f"{w1} / {w2}" for w1, w2 in anchors)
            outfile.write(out)
            outfile.write("\n")

__init__(lang1, lang2, columns, input_file)

Initialise the GenerateAnchorList class.

Parameters:

Name Type Description Default
lang1 str

the main lang

required
lang2 str

the translated lang

required
columns list of str

contains all the possible langs found in the main anchor file.

required
input_file str

path of the existing anchor file.

required
Source code in /home/anders/projects/CorpusTools/corpustools/generate_anchor_list.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def __init__(self, lang1, lang2, columns, input_file):
    """Set up the language pair and the anchor file to read.

    Args:
        lang1 (str): the main lang
        lang2 (str): the translated lang
        columns (list of str): all the langs that may occur in the
            main anchor file; the position of a lang in this list is
            stored as the index of its column.
        input_file (str): path of the existing anchor file.
    """
    self.input_file = input_file
    self.columns = columns
    self.lang1, self.lang2 = lang1, lang2
    # Column positions of the two languages within an anchor line.
    self.lang1_index = columns.index(lang1)
    self.lang2_index = columns.index(lang2)

generate_file(outpath, quiet=False)

Write the anchor word pairs read from the input file to outpath, one "word1 / word2" pair per line.

Source code in /home/anders/projects/CorpusTools/corpustools/generate_anchor_list.py
76
77
78
79
80
81
82
83
84
85
def generate_file(self, outpath, quiet=False):
    """Write the anchor word pairs to outpath.

    Each pair returned by read_anchors is written as "word1 / word2"
    on a line of its own.

    Args:
        outpath (str): path of the file to write.
        quiet (bool): passed on to read_anchors; if false, a note
            naming outpath is also emitted through util.note.
    """
    word_pairs = self.read_anchors(quiet)

    with codecs.open(outpath, mode="w", encoding="utf8") as anchor_file:
        if not quiet:
            util.note(f"Generating anchor word list to {outpath}")
        rendered = [f"{first} / {second}" for first, second in word_pairs]
        anchor_file.write("\n".join(rendered))
        anchor_file.write("\n")

read_anchors(quiet=False)

Return the list of word pairs read from the input file; empty and malformed lines are skipped.

Source code in /home/anders/projects/CorpusTools/corpustools/generate_anchor_list.py
67
68
69
70
71
72
73
74
def read_anchors(self, quiet=False):
    """Read the word pairs found in the input file.

    Args:
        quiet (bool): if false, the number of anchors read is reported
            through util.note.

    Returns:
        list: the truthy results of words_of_line, in file order.
            Lines for which words_of_line returns None are left out.
    """
    with codecs.open(self.input_file, encoding="utf8") as f:
        # Number lines from 1 so the line number handed to
        # words_of_line matches what an editor shows.
        out = [
            self.words_of_line(lineno, line)
            for lineno, line in enumerate(f.readlines(), start=1)
        ]
        out = [pair for pair in out if pair]
        if not quiet:
            util.note(f"Read {len(out)} anchors from {self.input_file}")
        return out

words_of_line(lineno, line)

Either a word-pair or None, if no word-pair on that line.

Source code in /home/anders/projects/CorpusTools/corpustools/generate_anchor_list.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def words_of_line(self, lineno, line):
    """Extract the word pair for lang1 and lang2 from one line.

    Args:
        lineno (int): number of the line, used only in the error
            message.
        line (str): one raw line of the anchor file.

    Returns:
        tuple of (str, str) or None: the stripped lang1 and lang2
            words, or None when the line starts with "#" or "&", has
            the wrong number of columns, or lacks a word for either
            language.
    """
    line = line.strip()
    # The original test joined the two negated checks with ``or``,
    # which is true for every line; lines starting with either marker
    # are meant to be skipped.
    if line.startswith(("#", "&")):
        return None

    words = line.split("/")
    if len(words) != len(self.columns):
        print(
            f"Invalid line at {lineno} in {self.input_file}",
            file=sys.stderr,
        )
        return None

    word1 = words[self.lang1_index].strip()
    word2 = words[self.lang2_index].strip()
    if word1 and word2:
        return word1, word2
    return None