epubconverter

Convert epub documents to the Giella xml format.

Epub files are zip files that contain text in xhtml files. This class reads all xhtml files found in this archive. The body element of these files are converted to div elements, and appended inside a new body element.

It is possible to filter away ranges of elements from this new xhtml file. These ranges consist pairs of xpath paths, specified inside the metadata file that belongs to this epub file.

`chapters(book, metadata)`

Get the all linear chapters of the epub book.

Parameters:

Name	Type	Description	Default
`book`	`Book`	The epub book element	required

Yields:

Type	Description
`_Element`	The body of an xhtml file found in the epub file.

Source code in corpustools/epubconverter.py

def chapters(book: Book, metadata: MetadataHandler) -> Iterator[etree._Element]:
    """Get the all linear chapters of the epub book.

    Args:
        book: The epub book element

    Yields:
        The body of an xhtml file found in the epub file.
    """
    excluded = metadata.epub_excluded_chapters
    for index, chapter in enumerate(book.chapters):
        if index not in excluded:
            chapterbody = read_chapter(chapter).find(
                "{http://www.w3.org/1999/xhtml}body"
            )
            if chapterbody is not None:
                chapterbody.tag = "{http://www.w3.org/1999/xhtml}div"
                yield chapterbody

`extract_content(filename, metadata)`

Extract content from the epub file.

Parameters:

Name	Type	Description	Default
`filename`	`Path`	path to the document	required

Returns:

Type	Description
`_Element`	The content of the epub file wrapped in html element

Source code in corpustools/epubconverter.py

def extract_content(filename: Path, metadata: MetadataHandler) -> etree._Element:
    """Extract content from the epub file.

    Args:
        filename: path to the document

    Returns:
        The content of the epub file wrapped in html element
    """
    mainbody = etree.Element("{http://www.w3.org/1999/xhtml}body")
    html = etree.Element("{http://www.w3.org/1999/xhtml}html")
    html.append(etree.Element("{http://www.w3.org/1999/xhtml}head"))
    html.append(mainbody)

    book = Book(open_epub(filename))

    for chapterbody in chapters(book, metadata):
        mainbody.append(chapterbody)

    return html

`read_chapter(chapter)`

Read the contents of a epub_file chapter.

Parameters:

Name	Type	Description	Default
`chapter`	`BookChapter`	the chapter of an epub file	required

Returns:

Type	Description
`_Element`	The contents of a chapter

Raises:

Type	Description
`ConversionException`	on conversion error

Source code in corpustools/epubconverter.py

def read_chapter(chapter: BookChapter) -> etree._Element:
    """Read the contents of a epub_file chapter.

    Args:
        chapter: the chapter of an epub file

    Returns:
        The contents of a chapter

    Raises:
        util.ConversionException: on conversion error
    """
    try:
        return etree.fromstring(chapter.read())
    except KeyError as error:
        raise ConversionError(error) from error

`remove_first_element(path1, content)`

Remove the first element in the range.

Parameters:

Name	Type	Description	Default
`path1`	`str`	path to the first element to remove.	required
`content`	`_Element`	the xhtml document that should be altered.	required

Source code in corpustools/epubconverter.py

def remove_first_element(path1: str, content: etree._Element) -> None:
    """Remove the first element in the range.

    Args:
        path1: path to the first element to remove.
        content: the xhtml document that should be altered.
    """
    first_start = content.find(
        path1, namespaces={"html": "http://www.w3.org/1999/xhtml"}
    )
    if first_start is not None:
        parent = first_start.getparent()
        if parent is not None:
            parent.remove(first_start)

`remove_range(path1, path2, content)`

Remove a range of elements from an xhtml document.

Parameters:

Name	Type	Description	Default
`path1`	`str`	path to first element	required
`path2`	`str`	path to second element	required
`content`	`_Element`	xhtml document	required

Source code in corpustools/epubconverter.py

def remove_range(path1: str, path2: str, content: etree._Element) -> None:
    """Remove a range of elements from an xhtml document.

    Args:
        path1: path to first element
        path2: path to second element
        content: xhtml document
    """
    if path2:
        starts, ends = remove_trees_1(path1, path2, content)
        remove_trees_2(starts, ends, content)
        remove_first_element(path1, content)
    else:
        found = content.find(path1, namespaces={"html": "http://www.w3.org/1999/xhtml"})
        if found is not None:
            parent = found.getparent()
            if parent is not None:
                parent.remove(found)

`remove_ranges(metadata, html)`

Remove ranges of html elements.

Source code in corpustools/epubconverter.py

def remove_ranges(metadata: MetadataHandler, html: etree._Element) -> None:
    """Remove ranges of html elements."""
    if metadata.skip_elements:
        for pairs in metadata.skip_elements:
            remove_range(pairs[0], pairs[1], html)

`remove_siblings_shorten_path(parts, content, preceding=False)`

Remove all siblings before or after an element.

Parameters:

Name	Type	Description	Default
`parts`	`list[str]`	a xpath path split on /	required
`content`	`_Element`	an xhtml document	required
`preceding`	`bool`	When True, iterate through the preceding siblings of the found element, otherwise iterate throughe the following siblings.	`False`

Returns:

Type	Description
`list[str]`	The path to the parent of parts.

Source code in corpustools/epubconverter.py

def remove_siblings_shorten_path(
    parts: list[str], content: etree._Element, preceding: bool = False
) -> list[str]:
    """Remove all siblings before or after an element.

    Args:
        parts: a xpath path split on /
        content: an xhtml document
        preceding: When True, iterate through the preceding siblings
            of the found element, otherwise iterate throughe the following
            siblings.

    Returns:
        The path to the parent of parts.
    """
    path = "/".join(parts)
    found = content.find(path, namespaces={"html": "http://www.w3.org/1999/xhtml"})
    if found is not None:
        parent = found.getparent()
        if parent is not None:
            for sibling in found.itersiblings(preceding=preceding):
                parent.remove(sibling)

    return parts[:-1]

`remove_trees_1(path1, path2, content)`

Remove tree nodes that do not have the same parents.

While the parents in starts and ends are unequal (that means that starts and ends belong in different trees), remove elements following starts and preceding ends. Shorten the path to the parents of starts and ends and remove more elements if needed. starts and ends are of equal length.

Parameters:

Name	Type	Description	Default
`path1`	`str`	path to first element	required
`path2`	`str`	path to second element	required
`content`	`_Element`	xhtml document, where elements are removed.	required

Returns:

Type	Description
`tuple[list[str], list[str]]`	Paths to the new start and end element.

Source code in corpustools/epubconverter.py

def remove_trees_1(
    path1: str, path2: str, content: etree._Element
) -> tuple[list[str], list[str]]:
    """Remove tree nodes that do not have the same parents.

    While the parents in starts and ends are unequal (that means that
    starts and ends belong in different trees), remove elements
    following starts and preceding ends. Shorten the path to the parents
    of starts and ends and remove more elements if needed. starts and
    ends are of equal length.

    Args:
        path1: path to first element
        path2: path to second element
        content: xhtml document, where elements are removed.

    Returns:
        Paths to the new start and end element.
    """
    starts, ends = shorten_longest_path(path1, path2, content)

    while starts[:-1] != ends[:-1]:
        starts = remove_siblings_shorten_path(starts, content)
        ends = remove_siblings_shorten_path(ends, content, preceding=True)

    return starts, ends

`remove_trees_2(starts, ends, content)`

Remove tree nodes that have the same parents.

Now that the parents of starts and ends are equal, remove the last trees of nodes between starts and ends (if necessary).

Parameters:

Name	Type	Description	Default
`starts`	`list[str]`	path to first element	required
`ends`	`list[str]`	path to second element	required
`content`	`_Element`	xhtml document, where elements are removed.	required

Source code in corpustools/epubconverter.py

def remove_trees_2(starts: list[str], ends: list[str], content: etree._Element) -> None:
    """Remove tree nodes that have the same parents.

    Now that the parents of starts and ends are equal, remove the last
    trees of nodes between starts and ends (if necessary).

    Args:
        starts: path to first element
        ends: path to second element
        content: xhtml document, where elements are removed.
    """
    deepest_start = content.find(
        "/".join(starts), namespaces={"html": "http://www.w3.org/1999/xhtml"}
    )
    deepest_end = content.find(
        "/".join(ends), namespaces={"html": "http://www.w3.org/1999/xhtml"}
    )
    if deepest_start is not None:
        parent = deepest_start.getparent()
        for sibling in deepest_start.itersiblings():
            if sibling == deepest_end:
                break
            elif parent is not None:
                parent.remove(sibling)

`shorten_longest_path(path1, path2, content)`

Remove elements from the longest path.

If starts is longer than ends, remove the siblings following starts, shorten starts with one step (going to the parent). If starts still is longer than ends, remove the siblings following the parent. This is done untill starts and ends are of equal length.

If on the other hand ends is longer than starts, remove the siblings preceding ends, then shorten ends (going to its parent).

Parameters:

Name	Type	Description	Default
`path1`	`str`	path to first element	required
`path2`	`str`	path to second element	required
`content`	`_Element`	xhtml document, where elements are removed.	required

Returns:

Type	Description
`tuple[list[str], list[str]]`	Paths to the new start and end element, now with the same length.

Source code in corpustools/epubconverter.py

def shorten_longest_path(
    path1: str, path2: str, content: etree._Element
) -> tuple[list[str], list[str]]:
    """Remove elements from the longest path.

    If starts is longer than ends, remove the siblings following starts,
    shorten starts with one step (going to the parent). If starts still is
    longer than ends, remove the siblings following the parent. This is
    done untill starts and ends are of equal length.

    If on the other hand ends is longer than starts, remove the siblings
    preceding ends, then shorten ends (going to its parent).

    Args:
        path1: path to first element
        path2: path to second element
        content: xhtml document, where elements are removed.

    Returns:
        Paths to the new start and end element, now with the same length.
    """
    starts, ends = path1.split("/"), path2.split("/")

    while len(starts) > len(ends):
        starts = remove_siblings_shorten_path(starts, content)

    while len(ends) > len(starts):
        ends = remove_siblings_shorten_path(ends, content, preceding=True)

    return starts, ends

`to_html_elt(filename)`

Append all chapter bodies as divs to an html file.

Returns:

Type	Description
`_Element`	An etree.Element containing the content of all xhtml files found in
`_Element`	the epub file as one xhtml document.

Source code in corpustools/epubconverter.py

def to_html_elt(filename: Path) -> etree._Element:
    """Append all chapter bodies as divs to an html file.

    Returns:
        An etree.Element containing the content of all xhtml files found in
        the epub file as one xhtml document.
    """
    filename = filename.with_suffix(filename.suffix + ".xsl")
    metadata = MetadataHandler(filename, create=True)
    html = extract_content(filename, metadata)
    try:
        remove_ranges(metadata, html)
    except AttributeError as error:
        raise ConversionError(
            "Check that skip_elements in the metadata file has the correct format"
        ) from error

    return html