htmlconverter

Convert html files to the Giella xml format.

`HTMLError`

Bases: Exception

Raise this error in this module.

Source code in corpustools/htmlconverter.py

class HTMLError(Exception):
    """Exception type used by the htmlconverter module to report failures."""

`remove_declared_encoding(content)`

Remove declared encoding.

lxml explodes if we send it a decoded Unicode string that carries an xml-declared encoding; see http://lxml.de/parsing.html#python-unicode-strings

Parameters:

Name	Type	Description	Default
`content`	`str`	the contents of a html document	required

Returns:

Type	Description
`str`	content sans the declared encoding

Source code in corpustools/htmlconverter.py

def remove_declared_encoding(content):
    """Remove the xml encoding declaration from the start of content.

    lxml explodes if we send a decoded Unicode string with an
    xml-declared encoding
    http://lxml.de/parsing.html#python-unicode-strings

    Only a declaration at the very beginning of content is removed,
    together with the spaces and line breaks directly following it.
    Content without such a declaration is returned unchanged.

    Args:
        content (str): the contents of a html document

    Returns:
        (str): content sans the declared encoding
    """
    # Whitespace around "=" and an empty encoding value are tolerated, so
    # variants such as encoding = "utf-8" are stripped as well. The closing
    # quote is consumed by the trailing [^>]*, as it always has been.
    xml_encoding_declaration_re = re.compile(
        r"^<\?xml [^>]*encoding\s*=\s*[\"'][^\"']*[^>]*\?>[ \r\n]*",
        re.IGNORECASE,
    )

    return xml_encoding_declaration_re.sub("", content)

`to_html_elt(filename)`

Return the content of the html doc as an lxml HtmlElement.

Parameters:

Name	Type	Description	Default
`filename`	`Path`	path to the webpage	required

Returns:

Type	Description
`HtmlElement`	The content of the webpage parsed with lxml's HTML parser (etree.HTMLParser).

Source code in corpustools/htmlconverter.py

def to_html_elt(filename: Path) -> html.HtmlElement:
    """Read a html file and parse it into an element.

    The file is opened with each candidate encoding in turn. The first
    attempt that does not raise UnicodeDecodeError provides the text, which
    is stripped of its declared xml encoding and then parsed.

    Args:
        filename: path to the webpage

    Returns:
        The webpage parsed with etree.HTMLParser, with comments removed.

    Raises:
        HTMLError: if every candidate encoding raised UnicodeDecodeError.
    """
    candidate_encodings = ("utf-8", "windows-1252", "latin1")

    for candidate in candidate_encodings:
        try:
            with open(filename, encoding=candidate) as webpage:
                html_parser = etree.HTMLParser(remove_comments=True)
                markup = remove_declared_encoding(webpage.read())
                return html.document_fromstring(markup, parser=html_parser)
        except UnicodeDecodeError:
            # Wrong guess; fall through to the next candidate encoding.
            continue

    raise HTMLError(f"{filename}: encoding trouble")