Skip to content

htmlconverter

Convert html files to the Giella xml format.

HTMLError

Bases: Exception

Raise this error in this module.

Source code in /home/anders/projects/CorpusTools/corpustools/htmlconverter.py
26
27
class HTMLError(Exception):
    """Exception type raised by the functions in this module."""

remove_declared_encoding(content)

Remove the declared encoding.

lxml raises an error if it is given a decoded Unicode string that still carries an XML encoding declaration; see http://lxml.de/parsing.html#python-unicode-strings

Parameters:

Name Type Description Default
content str

the contents of a html document

required

Returns:

Type Description
str

content without the declared encoding

Source code in /home/anders/projects/CorpusTools/corpustools/htmlconverter.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
def remove_declared_encoding(content):
    """Strip a leading xml declaration that names an encoding.

    lxml refuses to parse an already decoded str that still carries an
    xml encoding declaration, see
    http://lxml.de/parsing.html#python-unicode-strings

    Args:
        content (str): the contents of a html document

    Returns:
        (str): content without the xml declaration; unchanged when the
            text does not start with a declaration naming an encoding
    """
    # Anchored at the very start of the text; spaces and line breaks that
    # directly follow the declaration are removed together with it.
    declaration = re.compile(
        r"^<\?xml [^>]*encoding=[\"']([^\"']+)[^>]*\?>[ \r\n]*", re.IGNORECASE
    )
    return declaration.sub("", content)

to_html_elt(filename)

Return the content of the html doc as a parsed lxml element.

Parameters:

Name Type Description Default
filename str

path to the webpage

required

Returns:

Type Description
lxml.etree.Element

the content of the webpage parsed with lxml's HTMLParser (comments removed).

Source code in /home/anders/projects/CorpusTools/corpustools/htmlconverter.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def to_html_elt(filename):
    """Parse the html document in filename into an lxml element.

    The file is decoded with each candidate encoding in turn; the first
    one that does not raise a UnicodeDecodeError is used.

    Args:
        filename (str): path to the webpage

    Returns:
        (lxml.etree.Element): the content of the webpage parsed by an
            etree.HTMLParser that discards comments.

    Raises:
        HTMLError: if none of the candidate encodings could be used.
    """
    candidate_encodings = ("utf-8", "windows-1252", "latin1")

    for candidate in candidate_encodings:
        try:
            with codecs.open(filename, encoding=candidate) as handle:
                comment_free_parser = etree.HTMLParser(remove_comments=True)
                decoded = handle.read()
                # The xml encoding declaration must go before lxml sees
                # the already decoded text.
                cleaned = remove_declared_encoding(decoded)
                return html.document_fromstring(
                    cleaned, parser=comment_free_parser
                )
        except UnicodeDecodeError:
            # Wrong guess, move on to the next candidate.
            continue

    raise HTMLError(f"{filename}: encoding trouble")