Skip to content

languagedetector

This file contains classes to fix converted documents.

LanguageDetector

Detect and set the languages of a document.

Source code in corpustools/languagedetector.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
class LanguageDetector:
    """Detect and set the languages of a document.

    Paragraphs and quote spans whose guessed language differs from the
    main language of the document get an ``xml:lang`` attribute.
    """

    # Clark-notation name of the xml:lang attribute.
    _XML_LANG = "{http://www.w3.org/XML/1998/namespace}lang"

    def __init__(self, document, language_guesser):
        """Initialise the LanguageDetector class.

        Args:
            document (etree.Element): an etree element.
            language_guesser (text_cat.Classifier): a text_cat.Classifier.
                May be None, in which case set_paragraph_language leaves
                paragraphs untouched.
        """
        self.document = document
        self.language_guesser = language_guesser

    @property
    def inlangs(self):
        """Return the predefined possible languages of the document.

        Returns:
            list: the xml:lang value of every header/multilingual/language
                element, followed by the main language. The list is empty
                when the document has no such elements.
        """
        inlangs = [
            language.get(self._XML_LANG)
            for language in self.document.findall("header/multilingual/language")
        ]
        # The main language is only a candidate when other languages
        # are declared as well.
        if inlangs:
            inlangs.append(self.mainlang)

        return inlangs

    @property
    def mainlang(self):
        """Get the mainlang of the file.

        Raises:
            KeyError: if the document root has no xml:lang attribute.
        """
        return self.document.attrib[self._XML_LANG]

    def set_paragraph_language(self, paragraph):
        """Set xml:lang of paragraph.

        Extract the text outside the quotes, use this text to set
        language of the paragraph.
        Set the language of the quotes in the paragraph.

        Paragraphs that already carry xml:lang are left untouched, and
        nothing is done when there is no language guesser or when its
        get_langs() returns a falsy value for the document languages.

        Args:
            paragraph (etree.Element): the paragraph to annotate in place.

        Returns:
            etree.Element: the same paragraph element.
        """
        if paragraph.get(self._XML_LANG) is None:
            paragraph_text = self.remove_quote(paragraph)
            if self.language_guesser is not None and self.language_guesser.get_langs(
                self.inlangs
            ):
                lang = self.language_guesser.classify(
                    paragraph_text, langs=self.inlangs
                )
                # Only deviations from the main language are marked.
                if lang != self.mainlang:
                    paragraph.set(self._XML_LANG, lang)

                self.set_span_language(paragraph)

        return paragraph

    def set_span_language(self, paragraph):
        """Set xml:lang of span element.

        Only ``<span type="quote">`` elements that have text are
        classified; xml:lang is set when the guessed language differs
        from the main language.

        Args:
            paragraph (etree.Element): element whose spans are annotated
                in place.
        """
        for element in paragraph.iter("span"):
            if element.get("type") == "quote" and element.text is not None:
                lang = self.language_guesser.classify(
                    element.text, langs=self.inlangs
                )
                if lang != self.mainlang:
                    element.set(self._XML_LANG, lang)

    @staticmethod
    def remove_quote(paragraph):
        """Extract all text except the one inside <span type='quote'>.

        The text and tail of every element are collected in the order
        text, tail, children. For a quote span only the tail is kept:
        its own text and everything nested inside it are skipped, also
        when the span has no tail.

        Args:
            paragraph (etree.Element): the element to extract text from.

        Returns:
            str: the collected text.
        """
        parts = []

        def visit(element):
            is_quote = element.tag == "span" and element.get("type") == "quote"
            if not is_quote and element.text is not None:
                parts.append(element.text)
            # The tail follows the element, so it is outside the quote.
            if element.tail is not None:
                parts.append(element.tail)
            if not is_quote:
                for child in element:
                    visit(child)

        visit(paragraph)

        return "".join(parts)

    def detect_language(self):
        """Detect language in all the paragraphs in self.document.

        Does nothing unless the document has a header/multilingual element.
        """
        if self.document.find("header/multilingual") is not None:
            for paragraph in self.document.iter("p"):
                self.set_paragraph_language(paragraph)

inlangs property

Return the predefined possible languages of the document.

mainlang property

Get the mainlang of the file.

__init__(document, language_guesser)

Initialise the LanguageDetector class.

Parameters:

Name Type Description Default
document Element

an etree element.

required
language_guesser Classifier

a text_cat.Classifier.

required
Source code in corpustools/languagedetector.py
28
29
30
31
32
33
34
35
36
def __init__(self, document, language_guesser):
    """Store the document and the language guesser on the instance.

    Args:
        document (etree.Element): an etree element.
        language_guesser (text_cat.Classifier): a text_cat.Classifier.
    """
    self.document, self.language_guesser = document, language_guesser

detect_language()

Detect language in all the paragraphs in self.document.

Source code in corpustools/languagedetector.py
107
108
109
110
111
def detect_language(self):
    """Run set_paragraph_language on every p element of self.document.

    Nothing happens unless the document has a header/multilingual element.
    """
    if self.document.find("header/multilingual") is None:
        return

    for para in self.document.iter("p"):
        self.set_paragraph_language(para)

remove_quote(paragraph) staticmethod

Extract all text except the one inside `<span type='quote'>`.

Source code in corpustools/languagedetector.py
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
@staticmethod
def remove_quote(paragraph):
    """Extract all text except the one inside <span type='quote'>.

    The text and tail of every element are collected in the order
    text, tail, children. For a quote span only the tail is kept:
    its own text and everything nested inside it are skipped, also
    when the span has no tail.

    Args:
        paragraph (etree.Element): the element to extract text from.

    Returns:
        str: the collected text.
    """
    parts = []

    def visit(element):
        is_quote = element.tag == "span" and element.get("type") == "quote"
        if not is_quote and element.text is not None:
            parts.append(element.text)
        # The tail follows the element, so it is outside the quote.
        if element.tail is not None:
            parts.append(element.tail)
        if not is_quote:
            for child in element:
                visit(child)

    visit(paragraph)

    return "".join(parts)

set_paragraph_language(paragraph)

Set xml:lang of paragraph.

Extract the text outside the quotes, use this text to set language of the paragraph. Set the language of the quotes in the paragraph.

Source code in corpustools/languagedetector.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def set_paragraph_language(self, paragraph):
    """Guess the language of a paragraph and mark it with xml:lang.

    The text outside the quotes decides the language of the paragraph;
    the quotes inside it are handled by set_span_language. A paragraph
    that already has xml:lang is returned untouched, as is any paragraph
    when there is no language guesser or its get_langs() result for the
    document languages is falsy.

    Args:
        paragraph (etree.Element): the paragraph to annotate in place.

    Returns:
        etree.Element: the same paragraph element.
    """
    xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"
    if paragraph.get(xml_lang) is not None:
        return paragraph

    body_text = self.remove_quote(paragraph)
    guesser = self.language_guesser
    if guesser is None or not guesser.get_langs(self.inlangs):
        return paragraph

    guessed = guesser.classify(body_text, langs=self.inlangs)
    # Only deviations from the main language are marked.
    if guessed != self.mainlang:
        paragraph.set(xml_lang, guessed)
    self.set_span_language(paragraph)

    return paragraph

set_span_language(paragraph)

Set xml:lang of span element.

Source code in corpustools/languagedetector.py
77
78
79
80
81
82
83
84
85
86
def set_span_language(self, paragraph):
    """Mark quote spans with xml:lang when they differ from the main language.

    Only span elements with type="quote" that have text are classified.

    Args:
        paragraph (etree.Element): element whose spans are annotated in place.
    """
    xml_lang = "{http://www.w3.org/XML/1998/namespace}lang"
    for span in paragraph.iter("span"):
        if span.get("type") != "quote" or span.text is None:
            continue

        guessed = self.language_guesser.classify(span.text, langs=self.inlangs)
        if guessed != self.mainlang:
            span.set(xml_lang, guessed)