Skip to content

convert_using_soffice

Convert documents that LibreOffice can read to HTML.

to_html_elt(filename)

Convert the content of a LibreOffice-readable document to an ElementTree.

Parameters:

Name Type Description Default
filename Path

path to the document

required

Returns:

Type Description
ElementTree

An element containing the HTML version of the given file.

Source code in corpustools/convert_using_soffice.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def to_html_elt(filename: Path) -> ElementTree:
    """Convert a document that LibreOffice can read to an HTML ElementTree.

    The document is converted by running ``soffice --convert-to html`` and
    parsing the file it produces. The conversion output is written to a
    private temporary directory, so nothing is created, overwritten or
    deleted next to ``filename``, and the intermediate output is removed
    even when parsing fails.

    Args:
        filename: path to the document

    Returns:
        An element containing the HTML version of the given file.

    Note:
        The exit status of ``soffice`` is deliberately not checked. A failed
        conversion therefore surfaces as the error raised by ``html.parse``
        when the expected output file does not exist.
    """
    # Imported locally so this function does not depend on the module's
    # top-level import block.
    import tempfile

    # On macOS the binary lives inside the application bundle and is
    # normally not on PATH.
    soffice = (
        "/Applications/LibreOffice.app/Contents/MacOS/soffice"
        if sys.platform == "darwin"
        else "soffice"
    )

    # Converting into the document's own directory would clobber (and later
    # delete) any pre-existing "<stem>.html" sitting beside the input, and
    # would leave the output behind if parsing raised.
    with tempfile.TemporaryDirectory() as tmpdir:
        subprocess.run(
            [
                soffice,
                "--convert-to",
                "html",
                "--outdir",
                tmpdir,
                filename.as_posix(),
            ],
            encoding="utf-8",
            check=False,
        )

        # Parse while the temporary directory still exists; the returned
        # tree is held in memory.
        converted = Path(tmpdir) / f"{filename.stem}.html"
        return html.parse(converted.as_posix())