Skip to content

languagedetector

This file contains classes that fix converted documents.

LanguageDetector

Detect and set the languages of a document.

Source code in /home/anders/projects/CorpusTools/corpustools/languagedetector.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
class LanguageDetector:
    """Detect and set the languages of a document.

    The detector walks the ``p`` elements of a document and marks
    paragraphs, and quote spans inside them, with an ``xml:lang``
    attribute when the language guesser reports a language other than
    the main language of the document.
    """

    # Clark-notation name of the ``xml:lang`` attribute.
    _XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"

    def __init__(self, document, language_guesser):
        """Initialise the LanguageDetector class.

        Args:
            document (etree.Element): an etree element.
            language_guesser (text_cat.Classifier): a text_cat.Classifier.
        """
        self.document = document
        self.language_guesser = language_guesser

    @property
    def inlangs(self):
        """Return the predefined possible languages of the document.

        Returns:
            list: the ``xml:lang`` value of every
            ``header/multilingual/language`` element, followed by
            ``mainlang`` when at least one such element exists.
            An empty list otherwise.
        """
        inlangs = [
            language.get(self._XML_LANG)
            for language in self.document.findall("header/multilingual/language")
        ]
        # The main language is only a candidate when other languages are
        # declared; an empty list is returned unchanged.
        if inlangs:
            inlangs.append(self.mainlang)

        return inlangs

    @property
    def mainlang(self):
        """Get the mainlang of the file.

        Raises:
            KeyError: if ``self.document`` has no ``xml:lang`` attribute.
        """
        return self.document.attrib[self._XML_LANG]

    def set_paragraph_language(self, paragraph):
        """Set xml:lang of paragraph.

        Extract the text outside the quotes, use this text to set
        language of the paragraph.
        Set the language of the quotes in the paragraph.

        Paragraphs that already carry ``xml:lang`` are left untouched,
        and nothing is changed when there is no language guesser or when
        its ``get_langs`` returns a falsy value for ``self.inlangs``.

        Args:
            paragraph (etree.Element): the paragraph element to mark.

        Returns:
            etree.Element: the same ``paragraph`` object.
        """
        if paragraph.get(self._XML_LANG) is not None:
            return paragraph

        paragraph_text = self.remove_quote(paragraph)
        if self.language_guesser is None or not self.language_guesser.get_langs(
            self.inlangs
        ):
            return paragraph

        lang = self.language_guesser.classify(paragraph_text, langs=self.inlangs)
        # Only deviations from the main language are written out.
        if lang != self.mainlang:
            paragraph.set(self._XML_LANG, lang)

        self.set_span_language(paragraph)

        return paragraph

    def set_span_language(self, paragraph):
        """Set xml:lang of span element.

        Every ``span`` with ``type="quote"`` and a non-empty ``text``
        below ``paragraph`` is classified on its own; the attribute is
        set only when the result differs from ``mainlang``.

        Args:
            paragraph (etree.Element): the element whose spans are marked.
        """
        for element in paragraph.iter("span"):
            if element.get("type") != "quote" or element.text is None:
                continue

            lang = self.language_guesser.classify(element.text, langs=self.inlangs)
            if lang != self.mainlang:
                element.set(self._XML_LANG, lang)

    @staticmethod
    def remove_quote(paragraph):
        """Extract all text except the one inside <span type='quote'>.

        Args:
            paragraph (etree.Element): the element to read text from.

        Returns:
            str: for every element yielded by ``paragraph.iter()``, its
            ``text`` (skipped for quote spans) followed by its ``tail``.
        """
        parts = []
        for element in paragraph.iter():
            is_quote = element.tag == "span" and element.get("type") == "quote"
            # The text of a quote span is skipped whether or not the span
            # has a tail; a quote ending its parent has tail None and must
            # not leak its quoted text into the result.
            if not is_quote and element.text is not None:
                parts.append(element.text)
            if element.tail is not None:
                parts.append(element.tail)

        return "".join(parts)

    def detect_language(self):
        """Detect language in all the paragraphs in self.document.

        Does nothing unless the document has a ``header/multilingual``
        element.
        """
        if self.document.find("header/multilingual") is None:
            return

        for paragraph in self.document.iter("p"):
            self.set_paragraph_language(paragraph)

inlangs property

Return the predefined possible languages of the document.

mainlang property

Get the mainlang of the file.

__init__(document, language_guesser)

Initialise the LanguageDetector class.

Parameters:

Name Type Description Default
document etree.Element

an etree element.

required
language_guesser text_cat.Classifier

a text_cat.Classifier.

required
Source code in /home/anders/projects/CorpusTools/corpustools/languagedetector.py
28
29
30
31
32
33
34
35
36
def __init__(self, document, language_guesser):
    """Store the document and the guesser used for language detection.

    Args:
        document (etree.Element): the element kept as ``self.document``.
        language_guesser (text_cat.Classifier): the classifier kept as
            ``self.language_guesser``.
    """
    self.document, self.language_guesser = document, language_guesser

detect_language()

Detect language in all the paragraphs in self.document.

Source code in /home/anders/projects/CorpusTools/corpustools/languagedetector.py
107
108
109
110
111
def detect_language(self):
    """Run paragraph language detection on a multilingual document.

    Every ``p`` element of ``self.document`` is handed to
    ``set_paragraph_language``; documents without a
    ``header/multilingual`` element are left alone.
    """
    document = self.document
    if document.find("header/multilingual") is None:
        return

    for p_element in document.iter("p"):
        self.set_paragraph_language(p_element)

remove_quote(paragraph) staticmethod

Extract all text except the one inside `<span type='quote'>`.

Source code in /home/anders/projects/CorpusTools/corpustools/languagedetector.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
@staticmethod
def remove_quote(paragraph):
    """Extract all text except the one inside <span type='quote'>.

    Args:
        paragraph (etree.Element): the element to read text from.

    Returns:
        str: for every element yielded by ``paragraph.iter()``, its
        ``text`` (skipped for quote spans) followed by its ``tail``.
    """
    parts = []
    for element in paragraph.iter():
        is_quote = element.tag == "span" and element.get("type") == "quote"
        # The text of a quote span is skipped whether or not the span has
        # a tail; a quote ending its parent has tail None and must not
        # leak its quoted text into the result.
        if not is_quote and element.text is not None:
            parts.append(element.text)
        if element.tail is not None:
            parts.append(element.tail)

    return "".join(parts)

set_paragraph_language(paragraph)

Set xml:lang of paragraph.

Extract the text outside the quotes, use this text to set language of the paragraph. Set the language of the quotes in the paragraph.

Source code in /home/anders/projects/CorpusTools/corpustools/languagedetector.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def set_paragraph_language(self, paragraph):
    """Mark a paragraph, and the quotes in it, with their language.

    A paragraph that already has ``xml:lang`` is returned untouched.
    Otherwise the text outside the quotes is classified and the result
    is written to ``xml:lang`` when it differs from the main language;
    the quote spans are then handled by ``set_span_language``. Nothing
    is classified when there is no language guesser or when its
    ``get_langs`` returns a falsy value for ``self.inlangs``.

    Args:
        paragraph (etree.Element): the paragraph element to mark.

    Returns:
        etree.Element: the same ``paragraph`` object.
    """
    xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"
    if paragraph.get(xml_lang) is not None:
        return paragraph

    outside_quotes = self.remove_quote(paragraph)
    guesser = self.language_guesser
    if guesser is None or not guesser.get_langs(self.inlangs):
        return paragraph

    guessed = guesser.classify(outside_quotes, langs=self.inlangs)
    if guessed != self.mainlang:
        paragraph.set(xml_lang, guessed)

    self.set_span_language(paragraph)
    return paragraph

set_span_language(paragraph)

Set xml:lang of span element.

Source code in /home/anders/projects/CorpusTools/corpustools/languagedetector.py
77
78
79
80
81
82
83
84
85
86
def set_span_language(self, paragraph):
    """Mark the quote spans of a paragraph with their language.

    Each ``span`` with ``type="quote"`` and a ``text`` that is not None
    is classified separately; ``xml:lang`` is set on the span only when
    the result differs from the main language.

    Args:
        paragraph (etree.Element): the element whose spans are marked.
    """
    for span in paragraph.iter("span"):
        if span.get("type") != "quote" or span.text is None:
            continue

        guessed = self.language_guesser.classify(span.text, langs=self.inlangs)
        if guessed != self.mainlang:
            span.set("{http://www.w3.org/XML/1998/namespace}lang", guessed)