CERN Accelerating science

Conversion tools

The WebSubmit Conversion Tools library (websubmit_file_converter.py) lets you convert from one fulltext format into another and perform OCR.

Python API

def get_best_format_to_extract_text_from(filelist, best_formats=CFG_WEBSUBMIT_BEST_FORMATS_TO_EXTRACT_TEXT_FROM):
    """
    Return, among the files in filelist, the one whose format is best suited
    for extracting text.
    @param filelist the candidate files to choose from.
    @param best_formats the formats considered good for text extraction
        (defaults to CFG_WEBSUBMIT_BEST_FORMATS_TO_EXTRACT_TEXT_FROM);
        presumably ordered by preference -- TODO confirm.
    @return the best file found in filelist.
    """

def get_missing_formats(filelist, desired_conversion=CFG_WEBSUBMIT_DESIRED_CONVERSIONS):
    """
    Given a list of files, return a dictionary of the form:
    file1 : missing formats to generate from it...
    @param filelist the files to inspect.
    @param desired_conversion the conversions that are wanted (defaults to
        CFG_WEBSUBMIT_DESIRED_CONVERSIONS); NOTE(review): its exact
        structure is not shown here -- confirm against the configuration.
    @return a dictionary mapping a file to the formats still missing that
        can be generated from it.
    """

def can_convert(input_format, output_format, max_intermediate_conversions=2):
    """
    Return the chain of conversions needed to transform input_format into
    output_format, if any.
    @param input_format the format to convert from.
    @param output_format the format to convert to.
    @param max_intermediate_conversions (default 2) presumably the maximum
        number of intermediate steps allowed in the chain -- TODO confirm.
    """

def can_pdfopt():
    """
    Tell whether PDF optimization is available.
    @return True if it's possible to optimize PDFs.
    """

def can_pdfa():
    """
    Tell whether PDF/A generation is available.
    @return True if it's possible to generate PDF/As.
    """

def can_perform_ocr():
    """
    Tell whether OCR is available.
    @return True if it's possible to perform OCR.
    """

def can_spell_check(ln='en'):
    """
    Tell whether spell checking is available.
    @param ln a language code (default 'en'); presumably the language for
        which spell checking is wanted -- TODO confirm.
    @return True if it's possible to perform spell checking.
    """

def guess_is_OCR_needed(input_file, ln='en'):
    """
    Try to see if enough text is retrievable from input_file.
    @param input_file the path to the document to inspect.
    @param ln a language code (default 'en'); not used by the part of the
        function shown here.
    @return True if OCR is needed, False if it's already
        possible to retrieve information from the document.
    """
    # convert_file() names its format parameter ``output_format``. A
    # ``format`` keyword would only be forwarded to the converter through
    # **params, leaving the target format undetermined because no
    # output_file is given either.
    # NOTE(review): the decision logic that follows the conversion is not
    # shown in this excerpt.
    output_file = convert_file(input_file, output_format='.txt', perform_ocr=False)

def convert_file(input_file, output_file=None, output_format=None, **params):
    """
    Convert a file from one format to another.
    @param input_file [string] the path to an existing file
    @param output_file [string] the path to the desired output (if None a
        temporary file is generated)
    @param output_format [string] the desired format (if None it is taken from
        output_file)
    @param params other parameters to pass to the particular converter
    @return [string] the final output_file
    """

def pdf2hocr2pdf(input_file, output_file=None, font="Courier", author=None, keywords=None, subject=None, title=None, draft=False, ln='en', pdfopt=True, **args):
    """
    Transform a scanned PDF into a PDF with OCRed text.
    @param input_file the scanned PDF to process.
    @param output_file the desired output, handed over to hocr2pdf
        (default None).
    @param font the default font (e.g. Courier, Times-Roman).
    @param author the author name.
    @param keywords forwarded to hocr2pdf; presumably the keywords of the
        document -- TODO confirm.
    @param subject the subject of the document.
    @param title the title of the document.
    @param draft whether to enable debug information in the output.
    @param ln is a two letter language code to give the OCR tool a hint.
    @param pdfopt NOTE(review): accepted but not used in this function body.
    @param args extra keyword arguments; accepted but not used here.
    @return the output file as returned by hocr2pdf.
    """
    # Step 1: derive the name of the intermediate hOCR file.
    input_file, output_hocr_file, dummy = prepare_io(input_file, output_ext='.hocr', need_working_dir=False)
    # Step 2: OCR the PDF into hOCR; the working directory is requested back
    # because it is passed on to hocr2pdf below.
    output_hocr_file, working_dir = pdf2hocr(input_file, output_file=output_hocr_file, ln=ln, return_working_dir=True)
    try:
        # Step 3: build the final PDF from the hOCR output.
        return hocr2pdf(output_hocr_file, output_file, working_dir, font=font, author=author, keywords=keywords, subject=subject, title=title, draft=draft)
    finally:
        # Always remove the working directory, even when hocr2pdf fails,
        # so that a failed conversion does not leave it behind.
        clean_working_dir(working_dir)

See websubmit_file_converter API for a complete API description.