Skip to content

CorpusTools documentation

convertermanager

giellalt/CorpusTools

convertermanager

This file manages conversion of files to the Giella xml format.

`ConverterManager`

Manage the conversion of original files to corpus xml.

Class/static variables: _languageguesser (text_cat.Classifier): Language guesser to indicate languages in the converted document.

Attributes:

Name	Type	Description
`write_intermediate`	`bool`	indicate whether intermediate versions of the converted document should be written to disk.
`goldstandard`	`bool`	indicating whether goldstandard documents should be converted.
`files`	`list of str`	list of paths to original files that should be converted from original format to xml.

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

class ConverterManager:
    """Manage the conversion of original files to corpus xml.

    Class/static variables:
        _languageguesser (text_cat.Classifier): Language guesser to indicate
            languages in the converted document.
    Attributes:
        write_intermediate (bool): indicate whether intermediate versions
            of the converted document should be written to disk.
        goldstandard (bool): indicating whether goldstandard documents
            should be converted.
        files (list of str): list of paths to original files that should
            be converted from original format to xml.
    """

    _languageguesser = None

    def languageguesser(self):
        """Return our language guesser.
        This is a class variable, but since it takes a while to initialise,
        we don't do it until it's needed."""
        if self._languageguesser is None:
            ConverterManager._languageguesser = text_cat.Classifier(None)
        return self._languageguesser

    def __init__(
        self, lazy_conversion=False, write_intermediate=False, goldstandard=False
    ):
        """Initialise the ConverterManager class.

        Args:
            lazy_conversion (bool): indicate whether conversion depends on the
                fact that metadata have changed since last conversion.
            write_intermediate (bool): indicating whether intermediate versions
                of the converted document should be written to disk.
            goldstandard (bool): indicating whether goldstandard documents
                should be converted.
        """
        self.lazy_conversion = lazy_conversion
        self.write_intermediate = write_intermediate
        self.goldstandard = goldstandard
        self.files = []

    def convert(self, orig_file):
        """Convert file to corpus xml format.

        Args:
            orig_file (str): the path to the original file.
        """
        try:
            conv = converter.Converter(orig_file, lazy_conversion=self.lazy_conversion)
            conv.write_complete(self.languageguesser())
        except (
            util.ConversionError,
            ValueError,
            IndexError,
        ) as error:
            LOGGER.warn("Could not convert %s\n%s", orig_file, error)
            raise

    def convert_in_parallel(self, pool_size):
        """Convert files using the multiprocessing module."""
        # util.run_in_parallel(self.convert, pool_size, self.files)
        # LOGGER.info("Starting the conversion of %d files (using %d cpus)",
        #             len(self.files), pool_size)

        nfiles = len(self.files)
        futures = {}  # Future -> filename
        print(f"Starting parallel conversion with {pool_size} workers")
        failed = []
        with ProcessPoolExecutor(max_workers=pool_size) as pool:
            for file in self.files:
                fut = pool.submit(partial(self.convert, file))
                futures[fut] = file

            for i, future in enumerate(as_completed(futures), start=1):
                exc = future.exception()
                filename = futures[future]
                if exc is not None:
                    failed.append(filename)
                    print(f"[{i}/{nfiles} FAILED: {filename}")
                    print(exc)
                else:
                    print(f"[{i}/{nfiles}] done: {filename}")

        n_ok = nfiles - len(failed)
        print(f"all done converting. {n_ok} files converted ok, {len(failed)} failed")
        if failed:
            print("the files that failed to convert are:")
            for filename in failed:
                print(filename)

        # pool = multiprocessing.Pool(processes=pool_size)
        # pool.map(unwrap_self_convert, list(zip([self] * len(self.files), self.files)))
        # pool.close()
        # pool.join()

    def convert_serially(self):
        """Convert the files in one process."""
        LOGGER.info("Starting the conversion of %d files", len(self.files))

        for orig_file in self.files:
            LOGGER.debug("converting %s", orig_file)
            self.convert(orig_file)

    def add_file(self, xsl_file):
        """Add file for conversion.

        Args:
            xsl_file (str): path to a metadata file
        """
        if os.path.isfile(xsl_file) and os.path.isfile(xsl_file[:-4]):
            metadata = xslsetter.MetadataHandler(xsl_file)
            if (
                metadata.get_variable("conversion_status") == "standard"
                and not self.goldstandard
            ) or (
                metadata.get_variable("conversion_status").startswith("correct")
                and self.goldstandard
            ):
                self.files.append(xsl_file[:-4])
        else:
            LOGGER.warn("%s does not exist", xsl_file[:-4])

    @staticmethod
    def make_xsl_file(source):
        """Write an xsl file if it does not exist."""
        xsl_file = source if source.endswith(".xsl") else source + ".xsl"
        if not os.path.isfile(xsl_file):
            metadata = xslsetter.MetadataHandler(xsl_file, create=True)
            metadata.set_lang_genre_xsl()
            metadata.write_file()

        return xsl_file

    def add_directory(self, directory):
        """Add all files in a directory for conversion."""
        for root, _, files in os.walk(directory):
            for file_ in files:
                if file_.endswith(".xsl"):
                    self.add_file(os.path.join(root, file_))

    def collect_files(self, sources):
        """Find all convertible files in sources.

        Args:
            sources (list of str): a list of files or directories where
                convertable files are found.
        """
        LOGGER.info("Collecting files to convert")

        for source in sources:
            if os.path.isfile(source):
                xsl_file = self.make_xsl_file(source)
                self.add_file(xsl_file)
            elif os.path.isdir(source):
                self.add_directory(source)
            else:
                LOGGER.error(
                    "Can not process %s\n" "This is neither a file nor a directory.",
                    source,
                )

`init(lazy_conversion=False, write_intermediate=False, goldstandard=False)`

Initialise the ConverterManager class.

Parameters:

Name	Type	Description	Default
`lazy_conversion`	`bool`	indicate whether conversion depends on the fact that metadata have changed since last conversion.	`False`
`write_intermediate`	`bool`	indicating whether intermediate versions of the converted document should be written to disk.	`False`
`goldstandard`	`bool`	indicating whether goldstandard documents should be converted.	`False`

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

def __init__(
    self, lazy_conversion=False, write_intermediate=False, goldstandard=False
):
    """Initialise the ConverterManager class.

    Args:
        lazy_conversion (bool): indicate whether conversion depends on the
            fact that metadata have changed since last conversion.
        write_intermediate (bool): indicating whether intermediate versions
            of the converted document should be written to disk.
        goldstandard (bool): indicating whether goldstandard documents
            should be converted.
    """
    self.lazy_conversion = lazy_conversion
    self.write_intermediate = write_intermediate
    self.goldstandard = goldstandard
    self.files = []

`add_directory(directory)`

Add all files in a directory for conversion.

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

def add_directory(self, directory):
    """Add all files in a directory for conversion."""
    for root, _, files in os.walk(directory):
        for file_ in files:
            if file_.endswith(".xsl"):
                self.add_file(os.path.join(root, file_))

`add_file(xsl_file)`

Add file for conversion.

Parameters:

Name	Type	Description	Default
`xsl_file`	`str`	path to a metadata file	required

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

def add_file(self, xsl_file):
    """Add file for conversion.

    Args:
        xsl_file (str): path to a metadata file
    """
    if os.path.isfile(xsl_file) and os.path.isfile(xsl_file[:-4]):
        metadata = xslsetter.MetadataHandler(xsl_file)
        if (
            metadata.get_variable("conversion_status") == "standard"
            and not self.goldstandard
        ) or (
            metadata.get_variable("conversion_status").startswith("correct")
            and self.goldstandard
        ):
            self.files.append(xsl_file[:-4])
    else:
        LOGGER.warn("%s does not exist", xsl_file[:-4])

`collect_files(sources)`

Find all convertible files in sources.

Parameters:

Name	Type	Description	Default
`sources`	`list of str`	a list of files or directories where convertable files are found.	required

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

def collect_files(self, sources):
    """Find all convertible files in sources.

    Args:
        sources (list of str): a list of files or directories where
            convertable files are found.
    """
    LOGGER.info("Collecting files to convert")

    for source in sources:
        if os.path.isfile(source):
            xsl_file = self.make_xsl_file(source)
            self.add_file(xsl_file)
        elif os.path.isdir(source):
            self.add_directory(source)
        else:
            LOGGER.error(
                "Can not process %s\n" "This is neither a file nor a directory.",
                source,
            )

`convert(orig_file)`

Convert file to corpus xml format.

Parameters:

Name	Type	Description	Default
`orig_file`	`str`	the path to the original file.	required

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

def convert(self, orig_file):
    """Convert file to corpus xml format.

    Args:
        orig_file (str): the path to the original file.
    """
    try:
        conv = converter.Converter(orig_file, lazy_conversion=self.lazy_conversion)
        conv.write_complete(self.languageguesser())
    except (
        util.ConversionError,
        ValueError,
        IndexError,
    ) as error:
        LOGGER.warn("Could not convert %s\n%s", orig_file, error)
        raise

`convert_in_parallel(pool_size)`

Convert files using the multiprocessing module.

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

def convert_in_parallel(self, pool_size):
    """Convert files using the multiprocessing module."""
    # util.run_in_parallel(self.convert, pool_size, self.files)
    # LOGGER.info("Starting the conversion of %d files (using %d cpus)",
    #             len(self.files), pool_size)

    nfiles = len(self.files)
    futures = {}  # Future -> filename
    print(f"Starting parallel conversion with {pool_size} workers")
    failed = []
    with ProcessPoolExecutor(max_workers=pool_size) as pool:
        for file in self.files:
            fut = pool.submit(partial(self.convert, file))
            futures[fut] = file

        for i, future in enumerate(as_completed(futures), start=1):
            exc = future.exception()
            filename = futures[future]
            if exc is not None:
                failed.append(filename)
                print(f"[{i}/{nfiles} FAILED: {filename}")
                print(exc)
            else:
                print(f"[{i}/{nfiles}] done: {filename}")

    n_ok = nfiles - len(failed)
    print(f"all done converting. {n_ok} files converted ok, {len(failed)} failed")
    if failed:
        print("the files that failed to convert are:")
        for filename in failed:
            print(filename)

`convert_serially()`

Convert the files in one process.

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

def convert_serially(self):
    """Convert the files in one process."""
    LOGGER.info("Starting the conversion of %d files", len(self.files))

    for orig_file in self.files:
        LOGGER.debug("converting %s", orig_file)
        self.convert(orig_file)

`languageguesser()`

Return our language guesser. This is a class variable, but since it takes a while to initialise, we don't do it until it's needed.

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

def languageguesser(self):
    """Return our language guesser.
    This is a class variable, but since it takes a while to initialise,
    we don't do it until it's needed."""
    if self._languageguesser is None:
        ConverterManager._languageguesser = text_cat.Classifier(None)
    return self._languageguesser

`make_xsl_file(source)` `staticmethod`

Write an xsl file if it does not exist.

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

@staticmethod
def make_xsl_file(source):
    """Write an xsl file if it does not exist."""
    xsl_file = source if source.endswith(".xsl") else source + ".xsl"
    if not os.path.isfile(xsl_file):
        metadata = xslsetter.MetadataHandler(xsl_file, create=True)
        metadata.set_lang_genre_xsl()
        metadata.write_file()

    return xsl_file

`main()`

Convert documents to giellatekno xml format.

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

def main():
    """Convert documents to giellatekno xml format."""
    LOGGER.setLevel(logging.WARNING)
    try:
        sanity_check()
    except (util.SetupError, util.ExecutableMissingError) as error:
        raise SystemExit(str(error))

    args = parse_options()

    manager = ConverterManager(
        args.lazy_conversion, args.write_intermediate, args.goldstandard
    )
    manager.collect_files(args.sources)

    try:
        if args.serial:
            LOGGER.setLevel(logging.DEBUG)
            manager.convert_serially()
        else:
            manager.convert_in_parallel(args.ncpus)
    except util.ExecutableMissingError as error:
        raise SystemExit(str(error))

`parse_options()`

Parse the commandline options.

Returns:

Type	Description
`argparse.Namespace`	the parsed commandline arguments

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

def parse_options():
    """Parse the commandline options.

    Returns:
        (argparse.Namespace): the parsed commandline arguments
    """
    parser = argparse.ArgumentParser(
        parents=[argparse_version.parser],
        description="Convert original files to giellatekno xml.",
    )

    parser.add_argument("--ncpus", action=NCpus, default=multiprocessing.cpu_count())
    parser.add_argument(
        "--skip-existing",
        action="store_true",
        help="skip converting files that are already converted (that already "
             "exist in the converted/ folder"
    )
    parser.add_argument(
        "--serial",
        action="store_true",
        help="use this for debugging the conversion \
                        process. When this argument is used files will \
                        be converted one by one.",
    )
    parser.add_argument(
        "--lazy-conversion",
        action="store_true",
        help="Reconvert only if metadata have changed.",
    )
    parser.add_argument(
        "--write-intermediate",
        action="store_true",
        help="Write the intermediate XML representation \
                        to ORIGFILE.im.xml, for debugging the XSLT.\
                        (Has no effect if the converted file already exists.)",
    )
    parser.add_argument(
        "--goldstandard",
        action="store_true",
        help="Convert goldstandard and .correct files",
    )
    parser.add_argument(
        "sources",
        nargs="+",
        help="The original file(s) or \
                        directory/ies where the original files exist",
    )

    args = parser.parse_args()

    return args

`sanity_check()`

Check that needed programs and environment variables are set.

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

def sanity_check():
    """Check that needed programs and environment variables are set."""
    # util.sanity_check(['wvHtml', 'pdftotext', 'latex2html'])
    util.sanity_check(["pdftotext", "latex2html"])
    if not os.path.isfile(converter.Converter.get_dtd_location()):
        raise util.SetupError(
            "Couldn't find {}\n"
            "Check that GTHOME points at the right directory "
            "(currently: {}).".format(
                converter.Converter.get_dtd_location(), os.environ["GTHOME"]
            )
        )

`unwrap_self_convert(arg, **kwarg)`

Unpack self from the arguments and call convert again.

This is due to how multiprocess works: http://www.rueckstiess.net/research/snippets/show/ca1d7d90

Source code in /home/anders/projects/CorpusTools/corpustools/convertermanager.py

def unwrap_self_convert(arg, **kwarg):
    """Unpack self from the arguments and call convert again.

    This is due to how multiprocess works:
    http://www.rueckstiess.net/research/snippets/show/ca1d7d90
    """
    return ConverterManager.convert(*arg, **kwarg)