Home > Hacking Invenio > WebSubmit Internals > Conversion tools |
The WebSubmit Conversion Tools library (websubmit_file_converter.py) lets you convert from one fulltext format into another and perform OCR.
def get_best_format_to_extract_text_from(filelist, best_formats=CFG_WEBSUBMIT_BEST_FORMATS_TO_EXTRACT_TEXT_FROM): """ Return, from the filelist, the file whose format is best suited for extracting text. """ def get_missing_formats(filelist, desired_conversion=CFG_WEBSUBMIT_DESIRED_CONVERSIONS): """Given a list of files it will return a dictionary of the form: file1 : missing formats to generate from it... """ def can_convert(input_format, output_format, max_intermediate_conversions=2): """Return the chain of conversions to transform input_format into output_format, if any.""" def can_pdfopt(): """Return True if it's possible to optimize PDFs.""" def can_pdfa(): """Return True if it's possible to generate PDF/As.""" def can_perform_ocr(): """Return True if it's possible to perform OCR.""" def can_spell_check(ln='en'): """Return True if it's possible to perform spell checking.""" def guess_is_OCR_needed(input_file, ln='en'): """ Tries to see if enough text is retrievable from input_file. Return True if OCR is needed, False if it's already possible to retrieve information from the document. """ output_file = convert_file(input_file, format='.txt', perform_ocr=False) def convert_file(input_file, output_file=None, output_format=None, **params): """ Convert files from one format to another. @param input_file [string] the path to an existing file @param output_file [string] the path to the desired output. (if None a temporary file is generated) @param output_format [string] the desired format (if None it is taken from output_file) @param params other parameters to pass to the particular converter @return [string] the final output_file """ def pdf2hocr2pdf(input_file, output_file=None, font="Courier", author=None, keywords=None, subject=None, title=None, draft=False, ln='en', pdfopt=True, **args): """ Transform a scanned PDF into a PDF with OCRed text. @param font the default font (e.g. Courier, Times-Roman). @param author the author name. @param keywords the keywords of the document.
@param subject the subject of the document. @param title the title of the document. @param draft whether to enable debug information in the output. @param ln is a two letter language code to give the OCR tool a hint. """ input_file, output_hocr_file, dummy = prepare_io(input_file, output_ext='.hocr', need_working_dir=False) output_hocr_file, working_dir = pdf2hocr(input_file, output_file=output_hocr_file, ln=ln, return_working_dir=True) output_file = hocr2pdf(output_hocr_file, output_file, working_dir, font=font, author=author, keywords=keywords, subject=subject, title=title, draft=draft) clean_working_dir(working_dir) return output_file
See websubmit_file_converter API for a complete API description.