| Hlavní stránka > Hacking Invenio > WebSubmit Internals > Conversion tools |
The WebSubmit Conversion Tools library (websubmit_file_converter.py) lets you convert a fulltext file from one format into another and perform OCR.
def get_best_format_to_extract_text_from(filelist, best_formats=CFG_WEBSUBMIT_BEST_FORMATS_TO_EXTRACT_TEXT_FROM):
    """
    Return, among the files in filelist, the one whose format is best
    suited for extracting text.
    @param filelist the candidate files to choose from.
    @param best_formats the formats considered suitable for text extraction
        (defaults to CFG_WEBSUBMIT_BEST_FORMATS_TO_EXTRACT_TEXT_FROM);
        presumably ordered by preference -- TODO confirm.
    @return the selected file.
    """
def get_missing_formats(filelist, desired_conversion=CFG_WEBSUBMIT_DESIRED_CONVERSIONS):
    """Given a list of files, return a dictionary of the form:
    file1 : missing formats to generate from it...
    @param filelist the list of files to inspect.
    @param desired_conversion the conversions that should be available
        (defaults to CFG_WEBSUBMIT_DESIRED_CONVERSIONS); its exact structure
        is not visible here -- TODO confirm.
    @return [dict] mapping each file to the formats that are still missing
        and are to be generated from it.
    """
def can_convert(input_format, output_format, max_intermediate_conversions=2):
    """
    Return the chain of conversions needed to transform input_format into
    output_format, if any.
    @param input_format the format to convert from.
    @param output_format the format to convert to.
    @param max_intermediate_conversions presumably the maximum number of
        intermediate steps allowed in the chain (default 2) -- TODO confirm.
    @return the chain of conversions; what is returned when no chain exists
        is not visible here.
    """
def can_pdfopt():
    """
    Return True if it's possible to optimize PDFs.
    @return [bool] True when PDF optimization is available (presumably
        False otherwise -- TODO confirm).
    """
def can_pdfa():
    """
    Return True if it's possible to generate PDF/As.
    @return [bool] True when PDF/A generation is available (presumably
        False otherwise -- TODO confirm).
    """
def can_perform_ocr():
    """
    Return True if it's possible to perform OCR.
    @return [bool] True when OCR is available (presumably False
        otherwise -- TODO confirm).
    """
def can_spell_check(ln='en'):
    """
    Return True if it's possible to perform spell checking.
    @param ln language code (default 'en'); presumably the language for
        which spell checking is requested -- TODO confirm.
    @return [bool] True when spell checking is available (presumably
        False otherwise -- TODO confirm).
    """
def guess_is_OCR_needed(input_file, ln='en'):
    """
    Try to determine whether enough text is retrievable from input_file.
    @param input_file the path of the document to inspect.
    @param ln language code; not used in the visible part of the body.
    @return True if OCR is needed, False if it's already possible to
        retrieve information from the document.
    """
    # convert_file() names its target-format parameter ``output_format``.
    # A ``format`` keyword would only be collected by **params, leaving both
    # output_file and output_format as None so the target format could not
    # be determined.
    output_file = convert_file(input_file, output_format='.txt', perform_ocr=False)
    # NOTE(review): this excerpt ends here; the logic that inspects
    # output_file and produces the documented return value is not shown.
def convert_file(input_file, output_file=None, output_format=None, **params):
    """
    Convert files from one format to another.
    @param input_file [string] the path to an existing file
    @param output_file [string] the path to the desired output. (if None a
        temporary file is generated)
    @param output_format [string] the desired format (if None it is taken from
        output_file)
    @param params other parameters to pass to the particular converter
    @return [string] the final output_file
    """
def pdf2hocr2pdf(input_file, output_file=None, font="Courier", author=None, keywords=None, subject=None, title=None, draft=False, ln='en', pdfopt=True, **args):
    """
    Transform a scanned PDF into a PDF with OCRed text.
    @param input_file the scanned PDF to process.
    @param output_file the desired output path; handed to hocr2pdf as is.
    @param font the default font (e.g. Courier, Times-Roman).
    @param author the author name.
    @param keywords the keywords of the document (forwarded to hocr2pdf).
    @param subject the subject of the document.
    @param title the title of the document.
    @param draft whether to enable debug information in the output.
    @param ln is a two letter language code to give the OCR tool a hint.
    @param pdfopt NOTE(review): accepted but not used in this body.
    @param args extra keyword arguments; accepted but not used in this body.
    @return the output file as returned by hocr2pdf.
    """
    input_file, output_hocr_file, dummy = prepare_io(input_file, output_ext='.hocr', need_working_dir=False)
    output_hocr_file, working_dir = pdf2hocr(input_file, output_file=output_hocr_file, ln=ln, return_working_dir=True)
    try:
        output_file = hocr2pdf(output_hocr_file, output_file, working_dir, font=font, author=author, keywords=keywords, subject=subject, title=title, draft=draft)
    finally:
        # Always remove the working directory created by pdf2hocr, even when
        # the hOCR -> PDF step fails, so that it is not leaked on error.
        clean_working_dir(working_dir)
    return output_file
See websubmit_file_converter API for a complete API description.