diff options
Diffstat (limited to 'graphics/ocropus/files/patch-ocroscript__scripts__rec-tess-complete.lua')
| -rw-r--r-- | graphics/ocropus/files/patch-ocroscript__scripts__rec-tess-complete.lua | 96 |
1 files changed, 96 insertions, 0 deletions
diff --git a/graphics/ocropus/files/patch-ocroscript__scripts__rec-tess-complete.lua b/graphics/ocropus/files/patch-ocroscript__scripts__rec-tess-complete.lua new file mode 100644 index 000000000000..d428d7d718cb --- /dev/null +++ b/graphics/ocropus/files/patch-ocroscript__scripts__rec-tess-complete.lua @@ -0,0 +1,96 @@ +--- ./ocroscript/scripts/rec-tess-complete.lua.orig 2008-10-16 05:40:35.000000000 +0900 ++++ ./ocroscript/scripts/rec-tess-complete.lua 2009-05-26 21:22:11.000000000 +0900 +@@ -20,11 +20,20 @@ + -- Reviewer: + -- Primary Repository: + -- Web Sites: www.iupr.org, www.dfki.de, www.ocropus.org ++-- ++-- Patch applied: ++-- http://code.google.com/p/ocropus/issues/detail?id=137 + + + require 'lib.util' + require 'lib.headings' + require 'lib.paragraphs' ++require 'lib.path' ++require 'lib.hocr' ++import_all(ocr) ++import_all(graphics) ++import_all(iulib) ++import_all(nustring) + + remove_hyphens = true + +@@ -74,7 +83,7 @@ + -- RecognizedPage is a transport object of tesseract_recognize_blockwise(). + -- This function will convert it to a DOM. + function convert_RecognizedPage_to_DOM(p, image_path, keep_char_boxes) +- page_DOM = get_page_DOM(p, image_path) ++ page_DOM = hocr.get_page_DOM(p, image_path) + for i = 0, p:linesCount() - 1 do + local bbox = p:bbox(i) + local text = nustring() +@@ -85,13 +94,12 @@ + bboxes = narray_to_table(r) + end + p:text(text, i) +- line_DOM = get_line_DOM(bbox, text, bboxes, p) ++ line_DOM = hocr.get_line_DOM(bbox, text, bboxes, p) + table.insert(page_DOM, line_DOM) + end + return page_DOM + end + +- + function get_images_DOM(tiseg_image, html_path, images_dir, page_image) + os.execute('mkdir -p "'..images_dir..'"') + local rects = rectarray() +@@ -102,12 +110,11 @@ + local dom = {{tag = 'hr', size = '0'}} + for i = 0, rects:length() - 1 do + local src = images_dir .. ('/%04d.png'):format(i + 1) +- local img_path = util.combine_paths(html_path, src) + img = bytearray() + r = rects:at(i) + extract_subimage(img, page_image, r.x0, r.y0, r.x1, r.y1) +- write_image_gray(img_path, img) +- local props = {bbox = bbox_to_string(page_image, r)} ++ iulib.write_image_gray(src, img) ++ local props = {bbox = hocr.bbox_to_string(page_image, r)} + local link = {tag = 'a', href=src} + local width = r.x1 - r.x0 + local height = r.y1 -r.y0 +@@ -119,7 +126,7 @@ + height = "200px" + end + local tag = {tag = 'img', src = src, width=width, height=height, +- class = 'ocr_image', title = hocr_properties_attribute(props)} ++ class = 'ocr_image', title = hocr.properties_attribute(props)} + table.insert(link, tag) + table.insert(dom, link) + table.insert(dom, '\n') +@@ -146,8 +153,8 @@ + get_nontext_mask(nontext_mask,tiseg_image) + remove_masked_region(text_image,nontext_mask,clean_image) + segmenter:segment(page_segmentation,text_image) +- local p = RecognizedPage() +- tesseract_recognize_blockwise(p, page_image, page_segmentation) ++ local p = tesseract.RecognizedPage() ++ tesseract.recognize_blockwise(p, page_image, page_segmentation) + page_DOM = convert_RecognizedPage_to_DOM(p, pages:getFileName(), + option("charboxes")) + page_DOM = detect_headings(page_DOM, page_image) +@@ -157,10 +164,10 @@ + table.insert(body_DOM, page_DOM) + end + --end +-doc_DOM = get_html_tag() +-table.insert(doc_DOM, get_head_tag()) ++doc_DOM = hocr.get_html_tag() ++table.insert(doc_DOM, hocr.get_head_tag()) + table.insert(doc_DOM, '\n') + table.insert(doc_DOM, body_DOM) + file = io.open(output_file, 'w') +-dump_DOM(file, doc_DOM, html_preamble) ++hocr.dump(file, doc_DOM, html_preamble) + file:close() |
