From bd61cc63b26b7c5ef84fce9a3d095bb6cc374757 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 28 Jan 2021 12:30:23 +0100 Subject: [PATCH 001/194] recognize: try to self.resolve_resource model --- ocrd_cis/ocropy/recognize.py | 28 +++++++++++++++++----------- setup.py | 2 +- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 8d24b9d0..8ebddca3 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,5 +1,6 @@ from __future__ import absolute_import +import sys import os.path import numpy as np from PIL import Image @@ -102,19 +103,24 @@ def setup(self): x.allocate(5000) def get_model(self): - """Search for the model file. First checks if - parameter['model'] is a valid readeable file and returns it. - If not, it checks if the model can be found in the + """Search for the model file. First checks if parameter['model'] can + be resolved with OcrdResourceManager to a valid readeable file and + returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" canread = lambda p: os.path.isfile(p) and os.access(p, os.R_OK) - model = self.parameter['model'] - if canread(model): - return model - ocropydir = os.path.dirname(os.path.abspath(__file__)) - path = os.path.join(ocropydir, 'models', model) - if canread(path): - return path - return model + try: + model = self.resolve_resource(self.parameter['model']) + if canread(model): + return model + except SystemExit: + ocropydir = os.path.dirname(os.path.abspath(__file__)) + path = os.path.join(ocropydir, 'models', self.parameter['model']) + self.logger.info("Failed to resolve model with OCR-D/core mechanism, trying %s", path) + if canread(path): + return path + self.logger.error("Could not find model %s. 
Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s", + self.parameter['model'], self.parameter['model']) + sys.exit(1) def process(self): """Recognize lines / words / glyphs of the workspace. diff --git a/setup.py b/setup.py index 11bbf0a6..b60d3f2f 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=2.13', + 'ocrd>=2.22.3', 'click', 'scipy', 'numpy>=1.17.0', From db584d8342beed51f785cbf857f1e3c6881b6116 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 11 Mar 2022 19:39:37 +0100 Subject: [PATCH 002/194] resegment: fix method=baseline --- ocrd_cis/ocropy/resegment.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index d9d661b2..ee7f55b2 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -266,9 +266,10 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l continue line_polygon = baseline_of_segment(line, parent_coords) line_ltr = line_polygon[0,0] < line_polygon[-1,0] - line_polygon = make_valid(join_polygons(LineString(line_polygon).buffer( + line_polygon = make_valid(join_polygons([LineString(line_polygon).buffer( # left-hand side if left-to-right, and vice versa - scale * (-1) ** line_ltr, single_sided=True), loc=line.id)) + scale * (-1) ** line_ltr, single_sided=True)], + loc=line.id, scale=scale)) line_polygon = np.array(line_polygon.exterior, np.int)[:-1] line_y, line_x = draw.polygon(line_polygon[:, 1], line_polygon[:, 0], From 56affe216a0cbdb559e642c05aec4bdee4ecc617 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 11 Mar 2022 19:41:10 +0100 Subject: [PATCH 003/194] resegment: join_polygons: allow non-contiguous input, too --- ocrd_cis/ocropy/resegment.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index ee7f55b2..4456a8e9 100644 --- 
a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,6 +1,7 @@ from __future__ import absolute_import import os.path +from itertools import chain import numpy as np from skimage import draw from shapely.geometry import Polygon, asPolygon, LineString @@ -482,6 +483,10 @@ def join_polygons(polygons, loc=''): # compoundp = unary_union(polygons) # jointp = compoundp.convex_hull LOG = getLogger('processor.OcropyResegment') + polygons = list(chain.from_iterable([ + poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] + else [poly] + for poly in polygons])) if len(polygons) == 1: return polygons[0] # get equidistant list of points along hull From b856f5b75ad0b3c61e0e6acf06599da7460022ac Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 11 Mar 2022 19:43:37 +0100 Subject: [PATCH 004/194] resegment: join_polygons: make equidistant points relative to estimated scale --- ocrd_cis/ocropy/resegment.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 4456a8e9..4bcc203a 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -385,7 +385,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # combine all assigned new lines to single outline polygon if len(new_lines) > 1: LOG.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) - new_polygon = join_polygons([intersections[(i, j)] for i in new_lines], loc=line.id) + new_polygon = join_polygons([intersections[(i, j)] for i in new_lines], + loc=line.id, scale=scale) line_polygons[j] = new_polygon # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], @@ -460,7 +461,8 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, else: # get alpha shape poly = join_polygons([make_valid(Polygon(contour)) - for contour in contours], loc=line.id) 
+ for contour in contours], + loc=line.id, scale=scale) poly = poly.exterior.coords[:-1] polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) @@ -478,7 +480,7 @@ def diff_polygons(poly1, poly2): poly = make_valid(poly) return poly -def join_polygons(polygons, loc=''): +def join_polygons(polygons, loc='', scale=20): """construct concave hull (alpha shape) from input polygons""" # compoundp = unary_union(polygons) # jointp = compoundp.convex_hull @@ -493,14 +495,14 @@ def join_polygons(polygons, loc=''): # (otherwise alphashape will jump across the interior) points = [poly.exterior.interpolate(dist).coords[0] # .xy for poly in polygons - for dist in np.arange(0, poly.length, 5.0)] + for dist in np.arange(0, poly.length, scale / 2)] #alpha = alphashape.optimizealpha(points) # too slow - alpha = 0.05 + alpha = 0.03 jointp = alphashape.alphashape(points, alpha) tries = 0 # from descartes import PolygonPatch # import matplotlib.pyplot as plt - while jointp.type in ['MultiPolygon', 'GeometryCollection']: + while jointp.type in ['MultiPolygon', 'GeometryCollection'] or len(jointp.interiors): # plt.figure() # plt.gca().scatter(*zip(*points)) # for geom in jointp.geoms: From d75e58da30cd681c0be37424f43c2859f64da220 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 25 Mar 2022 15:27:46 +0100 Subject: [PATCH 005/194] update to shapely 1.8 --- ocrd_cis/ocropy/common.py | 2 +- ocrd_cis/ocropy/resegment.py | 16 ++++++++-------- ocrd_cis/ocropy/segment.py | 11 ++++++----- setup.py | 2 +- 4 files changed, 16 insertions(+), 15 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index d84e42b3..dc8ed20c 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -1141,7 +1141,7 @@ def compute_segmentation(binary, LOG.debug('sorting labels by reading order') llabels = morph.reading_order(llabels,rl,bt)[llabels] DSAVE('llabels_ordered', llabels) - + #segmentation = 
llabels*binary #return segmentation return llabels, hlines, vlines, images, colseps, scale diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 4bcc203a..85da6c32 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -4,7 +4,7 @@ from itertools import chain import numpy as np from skimage import draw -from shapely.geometry import Polygon, asPolygon, LineString +from shapely.geometry import Polygon, LineString from shapely.prepared import prep from shapely.ops import unary_union import alphashape @@ -209,7 +209,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) line_polygons.append(prep(segment_polygon)) - segment_polygon = np.array(segment_polygon.exterior, np.int)[:-1] + segment_polygon = np.array(segment_polygon.exterior.coords, np.int)[:-1] # draw.polygon: If any segment_polygon lies outside of parent # (causing negative/above-max indices), either fully or partially, # then this will silently ignore them. 
The caller does not need @@ -224,7 +224,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l segment.id, page_id if fullpage else parent.id) segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) - segment_polygon = np.array(segment_polygon.exterior, np.int)[:-1] + segment_polygon = np.array(segment_polygon.exterior.coords, np.int)[:-1] ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = False @@ -271,7 +271,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # left-hand side if left-to-right, and vice versa scale * (-1) ** line_ltr, single_sided=True)], loc=line.id, scale=scale)) - line_polygon = np.array(line_polygon.exterior, np.int)[:-1] + line_polygon = np.array(line_polygon.exterior.coords, np.int)[:-1] line_y, line_x = draw.polygon(line_polygon[:, 1], line_polygon[:, 0], parent_bin.shape) @@ -284,8 +284,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin, seps=ignore_bin, zoom=zoom, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: - LOG.warning('Cannot line-segment %s "%s": %s', - tag, page_id if fullpage else parent.id, err) + LOG.error('Cannot line-segment %s "%s": %s', + tag, page_id if fullpage else parent.id, err) return LOG.info("Found %d new line labels for %d existing lines on %s '%s'", new_line_labels.max(), len(lines), tag, parent.id) @@ -476,7 +476,7 @@ def diff_polygons(poly1, poly2): if poly.type == 'MultiPolygon': poly = poly.convex_hull if poly.minimum_clearance < 1.0: - poly = asPolygon(np.round(poly.exterior.coords)) + poly = Polygon(np.round(poly.exterior.coords)) poly = make_valid(poly) return poly @@ -517,7 +517,7 @@ def join_polygons(polygons, loc='', scale=20): if jointp.minimum_clearance < 1.0: # follow-up calculations will necessarily be integer; # so 
anticipate rounding here and then ensure validity - jointp = asPolygon(np.round(jointp.exterior.coords)) + jointp = Polygon(np.round(jointp.exterior.coords)) jointp = make_valid(jointp) return jointp diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index b782fdde..eeaccf2d 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -5,7 +5,7 @@ from skimage import draw from skimage.morphology import convex_hull_image import cv2 -from shapely.geometry import Polygon, asPolygon +from shapely.geometry import Polygon, LineString from shapely.prepared import prep from shapely.ops import unary_union @@ -125,15 +125,16 @@ def masks2polygons(bg_labels, fg_bin, name, min_area=None, simplify=None): polygon = polygon.simplify(tolerance) if polygon.is_valid: break - polygon = polygon.exterior.coords[:-1] # keep open - if len(polygon) < 4: + poly = polygon.exterior.coords[:-1] # keep open + if len(poly) < 4: LOG.warning('Label %d contour %d has less than 4 points for %s', label, i, name) continue - results.append((label, polygon)) + results.append((label, poly)) result_labels[contour_labels == i+1] = len(results) return results, result_labels + class OcropySegment(Processor): def __init__(self, *args, **kwargs): @@ -761,7 +762,7 @@ def make_intersection(poly1, poly2): if interp.minimum_clearance < 1.0: # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity - interp = asPolygon(np.round(interp.exterior.coords)) + interp = Polygon(np.round(interp.exterior.coords)) interp = make_valid(interp) return interp diff --git a/setup.py b/setup.py index 72e11280..a0c371ed 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ 'scipy', 'numpy>=1.17.0', 'pillow>=7.1.2', - 'shapely>=1.7.1,<1.8', + 'shapely>=1.7.1', 'scikit-image', 'alphashape', 'opencv-python-headless', From 6a06f36238589f8b35d43f7c9be5d707e97fb97a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 19 Mar 2022 13:08:55 +0100 
Subject: [PATCH 006/194] fix Workspace.save_image_file args --- ocrd_cis/ocropy/binarize.py | 18 ++++++------------ ocrd_cis/ocropy/clip.py | 6 ++---- ocrd_cis/ocropy/denoise.py | 6 ++---- ocrd_cis/ocropy/deskew.py | 6 ++---- ocrd_cis/ocropy/dewarp.py | 4 ++-- ocrd_cis/ocropy/segment.py | 10 ++++------ 6 files changed, 18 insertions(+), 32 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 6092d3d5..872185c3 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -213,10 +213,8 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id): file_id += '.IMG-BIN' features += ',binarized' file_path = self.workspace.save_image_file( - bin_image, - file_id, - page_id=page_id, - file_grp=self.output_file_grp) + bin_image, file_id, self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): page.add_AlternativeImage(AlternativeImageType( filename=file_path, @@ -263,10 +261,8 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ file_id += '.IMG-BIN' features += ',binarized' file_path = self.workspace.save_image_file( - bin_image, - file_id, - page_id=page_id, - file_grp=self.output_file_grp) + bin_image, file_id, self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): region.add_AlternativeImage(AlternativeImageType( filename=file_path, @@ -306,10 +302,8 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, fi file_id += '.IMG-BIN' features += ',binarized' file_path = self.workspace.save_image_file( - bin_image, - file_id, - page_id=page_id, - file_grp=self.output_file_grp) + bin_image, file_id, self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): line.add_AlternativeImage(AlternativeImageType( filename=file_path, diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 8f84efe6..a305f09e 100644 --- a/ocrd_cis/ocropy/clip.py +++ 
b/ocrd_cis/ocropy/clip.py @@ -257,10 +257,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, segment_image = crop_image(segment_image,box=segment_bbox) # update METS (add the image file): file_path = self.workspace.save_image_file( - segment_image, - file_id=file_id + '.IMG-CLIP', - page_id=page_id, - file_grp=self.output_file_grp) + segment_image, file_id + '.IMG-CLIP', self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): segment.add_AlternativeImage(AlternativeImageType( filename=file_path, diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 61a77141..cbbdf8cf 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -127,10 +127,8 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, f maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt # update METS (add the image file): file_path = self.workspace.save_image_file( - bin_image, - file_id + '.IMG-DESPECK', - page_id=page_id, - file_grp=self.output_file_grp) + bin_image, file_id + '.IMG-DESPECK', self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): segment.add_AlternativeImage(AlternativeImageType( filename=file_path, diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index aabbce3e..bb9904e0 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -142,10 +142,8 @@ def _process_segment(self, segment, segment_image, segment_coords, segment_id, p segment_coords['features'] += ',deskewed' # update METS (add the image file): file_path = self.workspace.save_image_file( - segment_image, - file_id + '.IMG-DESKEW', - page_id=page_id, - file_grp=self.output_file_grp) + segment_image, file_id + '.IMG-DESKEW', self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): segment.add_AlternativeImage(AlternativeImageType( filename=file_path, diff --git a/ocrd_cis/ocropy/dewarp.py 
b/ocrd_cis/ocropy/dewarp.py index dc083eaf..7d3251bf 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -168,8 +168,8 @@ def process(self): file_path = self.workspace.save_image_file( dew_image, file_id + '_' + region.id + '_' + line.id + '.IMG-DEWARP', - page_id=input_file.pageId, - file_grp=self.output_file_grp) + self.output_file_grp, + page_id=input_file.pageId) # update PAGE (reference the image file): alternative_image = line.get_AlternativeImage() line.add_AlternativeImage(AlternativeImageType( diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index eeaccf2d..7e94f495 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -669,9 +669,8 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', - page_id=page_id, - file_grp=self.output_file_grp) + image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, + page_id=page_id) element.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=coords['features'] + ',clipped')) else: @@ -708,9 +707,8 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', - page_id=page_id, - file_grp=self.output_file_grp) + image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, + page_id=page_id) # update PAGE (reference the image file): element.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=coords['features'] + ',clipped')) From 2cdfa7949dbf2e965aa8f44e91cb2353994d9464 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 24 Mar 2022 12:37:59 +0100 Subject: [PATCH 007/194] revert e673544 
(crashes OpenCV) --- ocrd_cis/ocropy/ocrolib/morph.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index a0170c43..75d86b69 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -21,7 +21,8 @@ def label(image,**kw): """ # default connectivity in OpenCV: 8 (which is equivalent to...) # default connectivity in scikit-image: 2 - n, labels = cv2.connectedComponents(image.astype(uint8), connectivity=4) + # connectivity=4 crashes (segfaults) OpenCV#21366 + n, labels = cv2.connectedComponents(image.astype(uint8)) #n, labels = cv2.connectedComponentsWithAlgorithm(image.astype(uint8), connectivity=4, ltype=2, ccltype=cv2.CCL_DEFAULT) return labels, n-1 # try: return measurements.label(image,**kw) From 8f6cfc54ada9bfb6be3a10ff1bb98f996b14e9f0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 18 Mar 2022 15:25:58 +0100 Subject: [PATCH 008/194] segment: annotate baselines, too --- ocrd_cis/ocropy/common.py | 81 +++++++++++++++++++++++- ocrd_cis/ocropy/resegment.py | 2 +- ocrd_cis/ocropy/segment.py | 118 ++++++++++++++++++++++++++--------- 3 files changed, 167 insertions(+), 34 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index dc8ed20c..7afb03af 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -996,7 +996,9 @@ def h_compatible(obj1, obj2, center1, center2): # (which must be split anyway) # - with tighter polygonal spread around foreground # - with spread of line labels against separator labels +# - with baseline extraction # - return bg line and sep labels intead of just fg line labels +# - return baseline coords, too @checks(ABINARY2) def compute_segmentation(binary, zoom=1.0, @@ -1046,6 +1048,7 @@ def compute_segmentation(binary, foreground may remain unlabelled for separators and other non-text like small noise, or large drop-capitals / images), + - list of Numpy arrays of baseline 
coordinates [y, x points in lr order] - Numpy array of horizontal foreground lines mask, - Numpy array of vertical foreground lines mask, - Numpy array of large/non-text foreground component mask, @@ -1144,7 +1147,81 @@ def compute_segmentation(binary, #segmentation = llabels*binary #return segmentation - return llabels, hlines, vlines, images, colseps, scale + blines = compute_baselines(bottom, top, llabels, scale) + return llabels, blines, hlines, vlines, images, colseps, scale + +@checks(AFLOAT2,AFLOAT2,SEGMENTATION,NUMBER) +def compute_baselines(bottom, top, linelabels, scale, method='bottom'): + """Get the coordinates of baselines running along each bottom gradient peak.""" + seeds = linelabels > 0 + # smooth bottom+top maps horizontally for centerline estimation + bot = filters.gaussian_filter(bottom, (scale*0.25,scale), mode='constant') + top = filters.gaussian_filter(top, (scale*0.25,scale), mode='constant') + # idea: center is where bottom and top gradient meet in the middle + # (but between top and bottom, not between bottom and top) + # - calculation via numpy == or isclose is too fragile numerically: + #clines = np.isclose(top, bottom, rtol=0.5) & (np.diff(top - bottom, axis=0, append=0) < 0) + # - calculation via zero crossing of bop-bottom is more robust, + # but needs post-processing for lines with much larger height than scale + if method == 'center': + blines = (np.diff(np.sign(top - bottom), axis=0, append=0) < 0) & seeds + #DSAVE('centerlines', blines) + # - calculation via peak gradient + elif method == 'bottom': + bot1d = np.diff(bot, axis=0, append=0) + bot1d = np.diff(np.sign(bot1d), axis=0, append=0) < 0 + bot1d &= bot > 0 + #DSAVE('bot1d', bot1d) + blines = bot1d + baselabels, nbaselabels = morph.label(blines) + baseslices = [(slice(0,0),slice(0,0))] + morph.find_objects(baselabels) + # if multiple labels per seed, ignore the ones above others + # (can happen due to mis-estimation of scale) + corrs = morph.correspondences(linelabels, 
baselabels).T + labelmap = {} + #DSAVE('baselines', baselabels) + def partitions(adj, starti, startpart=None): + for i in range(starti, len(adj)): + if startpart is None: + yield from partitions(adj, i + 1, [i]) + elif all(adj[i][j] for j in startpart): + yield from partitions(adj, i + 1, [i] + startpart) + if startpart is not None: + yield startpart + for line in np.unique(linelabels): + if not line: continue # ignore bg line + corrinds = corrs[:, 0] == line + corrinds[corrs[:, 1] == 0] = False # ignore bg baseline + if not np.any(corrinds): continue + corrinds = corrinds.nonzero()[0] + if len(corrinds) == 1: + labelmap.setdefault(line, list()).append(corrs[corrinds[0], 1]) + continue + nonoverlapping = ~np.eye(len(corrinds), dtype=np.bool) + for i, indi in enumerate(corrinds[:-1]): + baselabeli = corrs[indi, 1] + baseslicei = baseslices[baselabeli] + for j, indj in enumerate(corrinds[i + 1:], i + 1): + baselabelj = corrs[indj, 1] + baseslicej = baseslices[baselabelj] + if sl.xoverlaps(baseslicei, baseslicej): + nonoverlapping[i, j] = False + nonoverlapping[j, i] = False + def pathlen(path): + return sum(corrs[corrinds[pos], 2] for pos in path) + corrgroups = sorted(partitions(nonoverlapping, 0), key=pathlen) + # select longest path + corrinds = corrinds[corrgroups[-1]] + labelmap.setdefault(line, list()).extend(corrs[corrinds, 1]) + basepoints = [] + for line in np.unique(linelabels): + if line not in labelmap: continue + linemask = linelabels == line + points = [] + for label in labelmap[line]: + points.extend(list(zip(*np.where((baselabels == label) & linemask)))) + basepoints.append(points) + return basepoints # from ocropus-gpageseg, but # - on both foreground and background, @@ -1741,7 +1818,7 @@ def find_topological(): npartitions > len(gaps)+1 or # partitions without the cut still score better than after sum(map(sl.height if prefer_vertical else sl.width, - (morph.find_objects(partitions)))) > np.max( + morph.find_objects(partitions))) > np.max( 
partitionscores, initial=0))): # continue on each partition by suppressing the others, respectively order = morph.reading_order(partitions,rl,bt) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 85da6c32..81166432 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -280,7 +280,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l scale=scale, loc=parent.id, threshold=threshold) return try: - new_line_labels, _, _, _, _, scale = compute_segmentation( + new_line_labels, _, _, _, _, _, scale = compute_segmentation( parent_bin, seps=ignore_bin, zoom=zoom, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 7e94f495..4f0e87e4 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -19,6 +19,7 @@ AlternativeImageType ) from ocrd_models.ocrd_page_generateds import ( + BaselineType, TableRegionType, ImageRegionType, RegionRefType, @@ -55,30 +56,38 @@ TOOL = 'ocrd-cis-ocropy-segment' -def masks2polygons(bg_labels, fg_bin, name, min_area=None, simplify=None): +def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, + (optionally) a Numpy array of a scalar field ``baselines``, and a Numpy array of the foreground ``fg_bin``, iterate through all labels (except zero and those labels which do not correspond to any foreground at all) to find - their outer contours. Each contour part which is not too - small and gives a (simplified) polygon of at least 4 points - becomes a polygon. (Thus, labels can be split into multiple - polygons.) + their outer contours and inner baselines. + Each contour part which is not too small and gives a + (simplified) polygon of at least 4 points becomes a polygon. 
+ (Thus, labels can be split into multiple polygons.) Return a tuple: - - these polygons as a list of label, polygon tuples, and + - these polygons as a list of label, polygon, baseline tuples, and - a Numpy array of new background labels for that list. """ LOG = getLogger('processor.OcropySegment') + # find sharp baseline + if baselines is not None: + def getx(xy): + return xy[0] + baselines = [LineString(sorted([p[::-1] for p in line], key=getx)).simplify(5) + for line in baselines + if len(line) >= 2] results = list() result_labels = np.zeros_like(bg_labels, dtype=bg_labels.dtype) for label in np.unique(bg_labels): if not label: # ignore if background continue - bg_mask = np.array(bg_labels == label, np.uint8) + bg_mask = np.array(bg_labels == label, np.bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground LOG.debug('skipping label %d in %s due to empty fg', @@ -86,16 +95,16 @@ def masks2polygons(bg_labels, fg_bin, name, min_area=None, simplify=None): continue # simplify to convex hull if simplify is not None: - hull = convex_hull_image(bg_mask).astype(np.uint8) - conflicts = np.setdiff1d((hull>0) * simplify, - (bg_mask>0) * simplify) + hull = convex_hull_image(bg_mask.astype(np.uint8)).astype(np.bool) + conflicts = np.setdiff1d(hull * simplify, + bg_mask * simplify) if conflicts.any(): LOG.debug('Cannot simplify %d: convex hull would create additional intersections %s', label, str(conflicts)) else: bg_mask = hull # find outer contour (parts): - contours, _ = cv2.findContours(bg_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + contours, _ = cv2.findContours(bg_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # determine areas of parts: areas = [cv2.contourArea(contour) for contour in contours] total_area = sum(areas) @@ -130,7 +139,49 @@ def masks2polygons(bg_labels, fg_bin, name, min_area=None, simplify=None): LOG.warning('Label %d contour %d has less than 4 points for %s', label, i, name) continue - 
results.append((label, poly)) + # get baseline segments intersecting with this line mask + # and concatenate them from left to right + if baselines is not None: + base = [] + for baseline in baselines: + baseline = baseline.intersection(polygon) + # post-process + if (baseline.is_empty or + baseline.type in ['Point', 'MultiPoint']): + continue + base_x = [pt[0] for pt in base] + base_left = min(base_x, default=0) + base_right = max(base_x, default=0) + left = baseline.bounds[0] + right = baseline.bounds[2] + if (baseline.type == 'GeometryCollection' or + baseline.type.startswith('Multi')): + # heterogeneous result: filter point + for geom in baseline.geoms: + if geom.type == 'Point': + continue + left = geom.bounds[0] + right = geom.bounds[2] + if left > base_right: + base.extend(geom.coords) + base_right = right + elif right < base_left: + base = list(geom.coords) + base + base_left = left + else: + LOG.warning("baseline part component crosses existing x") + continue + elif left > base_right: + base.extend(baseline.coords) + elif right < base_left: + base = list(baseline.coords) + base + else: + LOG.warning("baseline part crosses existing x") + continue + assert all(p1[0] < p2[0] for p1, p2 in zip(base[:-1],base[1:])), base + else: + base = None + results.append((label, poly, base)) result_labels[contour_labels == i+1] = len(results) return results, result_labels @@ -472,7 +523,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, try: if report: raise Exception(report) - line_labels, hlines, vlines, images, colseps, scale = compute_segmentation( + line_labels, baselines, hlines, vlines, images, colseps, scale = compute_segmentation( # suppress separators and ignored regions for textline estimation # but keep them for h/v-line detection (in fullpage mode): element_bin, seps=(sep_bin+ignore_labels)>0, @@ -568,17 +619,17 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, seps=np.maximum(sepmask, 
colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(region_mask * region_label, element_bin, + regions, _ = masks2polygons(region_mask * region_label, None, element_bin, '%s "%s"' % (element_name, element_id), min_area=6000/zoom/zoom, simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(region_line_labels, element_bin, + lines, _ = masks2polygons(region_line_labels, baselines, element_bin, 'region "%s"' % element_id, min_area=640/zoom/zoom) # create new lines in new regions (allocating by intersection) - line_polys = [Polygon(polygon) for _, polygon in lines] - for _, region_polygon in regions: + line_polys = [Polygon(polygon) for _, polygon, _ in lines] + for _, region_polygon, _ in regions: region_poly = prep(Polygon(region_polygon)) # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(region_polygon, image, coords) @@ -598,7 +649,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, for i, line_poly in enumerate(line_polys): if not region_poly.intersects(line_poly): # .contains continue - line_label, line_polygon = lines[i] + line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(line_polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, region) @@ -610,9 +661,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, line_no += 1 line_id = region_id + "_line%04d" % line_no LOG.debug('Line label %d becomes ID "%s"', line_label, line_id) - line = TextLineType( - id=line_id, Coords=CoordsType( - points=points_from_polygon(line_polygon))) + line = TextLineType(id=line_id, + Coords=CoordsType(points=points_from_polygon(line_polygon))) + if line_baseline: + line_baseline = coordinates_for_segment(line_baseline, image, coords) + 
line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) region.add_TextLine(line) # if the region has received text lines, keep it if region.get_TextLine(): @@ -627,9 +680,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, LOG.info('Found %d large non-text/image regions for %s "%s"', num_images, element_name, element_id) # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(image_labels, element_bin, + image_polygons, _ = masks2polygons(image_labels, None, element_bin, '%s "%s"' % (element_name, element_id)) - for image_label, polygon in image_polygons: + for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) @@ -648,11 +701,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, LOG.info('Found %d/%d h/v-lines for %s "%s"', num_hlines, num_vlines, element_name, element_id) # find contours around region labels (can be non-contiguous): - hline_polygons, _ = masks2polygons(hline_labels, element_bin, + hline_polygons, _ = masks2polygons(hline_labels, None, element_bin, '%s "%s"' % (element_name, element_id)) - vline_polygons, _ = masks2polygons(vline_labels, element_bin, + vline_polygons, _ = masks2polygons(vline_labels, None, element_bin, '%s "%s"' % (element_name, element_id)) - for _, polygon in hline_polygons + vline_polygons: + for _, polygon, _ in hline_polygons + vline_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) @@ -683,11 +736,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around 
labels (can be non-contiguous): - line_polygons, _ = masks2polygons(line_labels, element_bin, + line_polygons, _ = masks2polygons(line_labels, baselines, element_bin, 'region "%s"' % element_id, min_area=640/zoom/zoom) line_no = 0 - for line_label, polygon in line_polygons: + for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, element) @@ -698,9 +751,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # annotate result: line_no += 1 line_id = element_id + "_line%04d" % line_no - element.add_TextLine(TextLineType( - id=line_id, Coords=CoordsType( - points=points_from_polygon(line_polygon)))) + line = TextLineType(id=line_id, + Coords=CoordsType(points=points_from_polygon(line_polygon))) + if baseline: + line_baseline = coordinates_for_segment(baseline, image, coords) + line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) + element.add_TextLine(line) if not sep_bin.any(): return # no derived image # annotate a text/image-separated image From b50c51b3bd575e262d17289aca881300390792a5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 6 Apr 2022 02:24:30 +0200 Subject: [PATCH 009/194] segment: fix lines2regions non-continguous partitions --- ocrd_cis/ocropy/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 7afb03af..b0d97594 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -1818,7 +1818,7 @@ def find_topological(): npartitions > len(gaps)+1 or # partitions without the cut still score better than after sum(map(sl.height if prefer_vertical else sl.width, - morph.find_objects(partitions))) > np.max( + filter(None, morph.find_objects(partitions)))) > np.max( partitionscores, initial=0))): # continue on each partition by suppressing the others, 
respectively order = morph.reading_order(partitions,rl,bt) From c4eaf3d44a649ed5e1ee93ad7c5a19020967f0c0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 6 Apr 2022 22:38:05 +0200 Subject: [PATCH 010/194] =?UTF-8?q?re/segment:=20alpha=20shape:=20smaller?= =?UTF-8?q?=20=CE=B1=20to=20avoid=20holes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_cis/ocropy/resegment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 81166432..b0bd1d4e 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -259,6 +259,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # use depth to flatten overlapping lines as seed labels new_labels = np.argmax(distances, axis=0) else: + # 'baseline' new_labels = np.zeros_like(parent_bin, np.uint8) for i, line in enumerate(lines): if line.Baseline is None: @@ -497,7 +498,7 @@ def join_polygons(polygons, loc='', scale=20): for poly in polygons for dist in np.arange(0, poly.length, scale / 2)] #alpha = alphashape.optimizealpha(points) # too slow - alpha = 0.03 + alpha = 0.01 jointp = alphashape.alphashape(points, alpha) tries = 0 # from descartes import PolygonPatch From 8b0e7b87326463aff326e8be3162c0d925cce521 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 6 Apr 2022 22:39:04 +0200 Subject: [PATCH 011/194] resegment (ccomps/baseline): propagate/spread twice to catch diacritics/punctuation, too --- ocrd_cis/ocropy/resegment.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index b0bd1d4e..56840258 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -31,7 +31,7 @@ from .common import ( pil2array, odd, - # DSAVE, + DSAVE, # binarize, check_page, check_region, @@ -294,8 +294,8 @@ def _process_segment(self, parent, 
parent_image, parent_coords, page_id, zoom, l new_line_polygons, new_line_labels = masks2polygons( new_line_labels, parent_bin, '%s "%s"' % (tag, parent.id), min_area=640/zoom/zoom) - # DSAVE('line_labels', [np.mean(line_labels, axis=0), parent_bin]) - # DSAVE('new_line_labels', [new_line_labels, parent_bin], disabled=False) + DSAVE('line_labels', [np.mean(line_labels, axis=0), parent_bin]) + DSAVE('new_line_labels', [new_line_labels, parent_bin]) new_line_polygons = [make_valid(Polygon(line_poly)) for line_label, line_poly in new_line_polygons] # polygons for intersecting pairs @@ -421,12 +421,20 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, scale=43, loc='', threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" LOG = getLogger('processor.OcropyResegment') + DSAVE('baseline-seeds', [new_labels, (components>0)]) # allocate to connected components consistently (by majority, # ignoring smallest components like punctuation) #new_labels = morph.propagate_labels_majority(binarized, new_labels) new_labels = morph.propagate_labels_majority(components > 0, new_labels) + DSAVE('majority-propagated', [new_labels, (components>0) & (new_labels==0)]) # dilate/grow labels from connected components against each other and bg + new_labels = morph.spread_labels(new_labels, maxdist=scale*2) + DSAVE('scale-spread', [new_labels, (components>0)]) + # now propagate again to catch smallest components like punctuation + new_labels = morph.propagate_labels_majority(components > 0, new_labels) + DSAVE('propagated-again', [new_labels, (components>0) & (new_labels==0)]) new_labels = morph.spread_labels(new_labels, maxdist=scale/2) + DSAVE('spread-again', [new_labels, (components>0)]) # find polygon hull and modify line coords for i, line in enumerate(lines): new_label = new_labels == i + 1 From 0f359d09fee3a3873ed10c44d7a7e243a6a566a7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 
7 Apr 2022 00:36:58 +0200 Subject: [PATCH 012/194] resegment: if method=lineest, then annotate baselines, too --- ocrd_cis/ocropy/resegment.py | 78 +++++--------------- ocrd_cis/ocropy/segment.py | 134 +++++++++++++++++++++++++---------- 2 files changed, 115 insertions(+), 97 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 56840258..997c68f0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,17 +1,15 @@ from __future__ import absolute_import import os.path -from itertools import chain import numpy as np from skimage import draw from shapely.geometry import Polygon, LineString from shapely.prepared import prep from shapely.ops import unary_union -import alphashape from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - to_xml, PageType + to_xml, PageType, BaselineType ) from ocrd import Processor from ocrd_utils import ( @@ -42,7 +40,10 @@ masks2polygons, polygon_for_parent, make_valid, - make_intersection + make_intersection, + join_baselines, + join_polygons, + diff_polygons ) TOOL = 'ocrd-cis-ocropy-resegment' @@ -281,7 +282,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l scale=scale, loc=parent.id, threshold=threshold) return try: - new_line_labels, _, _, _, _, _, scale = compute_segmentation( + new_line_labels, new_baselines, _, _, _, _, scale = compute_segmentation( parent_bin, seps=ignore_bin, zoom=zoom, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: @@ -292,12 +293,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_line_labels.max(), len(lines), tag, parent.id) # polygonalize and prepare comparison new_line_polygons, new_line_labels = masks2polygons( - new_line_labels, parent_bin, '%s "%s"' % (tag, parent.id), + new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), min_area=640/zoom/zoom) DSAVE('line_labels', 
[np.mean(line_labels, axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) - new_line_polygons = [make_valid(Polygon(line_poly)) - for line_label, line_poly in new_line_polygons] + new_line_polygons, new_baselines = zip(*[(make_valid(Polygon(line_poly)), LineString(baseline)) + for _, line_poly, baseline in new_line_polygons]) # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line @@ -386,9 +387,11 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # combine all assigned new lines to single outline polygon if len(new_lines) > 1: LOG.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) - new_polygon = join_polygons([intersections[(i, j)] for i in new_lines], - loc=line.id, scale=scale) + new_polygon = join_polygons([intersections[(i, j)] + for i in new_lines], loc=line.id, scale=scale) line_polygons[j] = new_polygon + new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) + for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) @@ -398,6 +401,10 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l return # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) + if new_baseline is not None: + new_baseline = coordinates_for_segment(new_baseline.coords, + parent_image, parent_coords) + line.set_Baseline(BaselineType(points=points_from_polygon(new_baseline))) # now also ensure the assigned lines do not overlap other existing lines for i in new_lines: for otherj in np.nonzero(fits_fg[i] > 0.1)[0]: @@ -480,58 +487,9 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, continue line.get_Coords().set_points(points_from_polygon(polygon)) -def diff_polygons(poly1, poly2): - poly = poly1.difference(poly2) - if 
poly.type == 'MultiPolygon': - poly = poly.convex_hull - if poly.minimum_clearance < 1.0: - poly = Polygon(np.round(poly.exterior.coords)) - poly = make_valid(poly) - return poly - -def join_polygons(polygons, loc='', scale=20): - """construct concave hull (alpha shape) from input polygons""" - # compoundp = unary_union(polygons) - # jointp = compoundp.convex_hull - LOG = getLogger('processor.OcropyResegment') - polygons = list(chain.from_iterable([ - poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] - else [poly] - for poly in polygons])) - if len(polygons) == 1: - return polygons[0] - # get equidistant list of points along hull - # (otherwise alphashape will jump across the interior) - points = [poly.exterior.interpolate(dist).coords[0] # .xy - for poly in polygons - for dist in np.arange(0, poly.length, scale / 2)] - #alpha = alphashape.optimizealpha(points) # too slow - alpha = 0.01 - jointp = alphashape.alphashape(points, alpha) - tries = 0 - # from descartes import PolygonPatch - # import matplotlib.pyplot as plt - while jointp.type in ['MultiPolygon', 'GeometryCollection'] or len(jointp.interiors): - # plt.figure() - # plt.gca().scatter(*zip(*points)) - # for geom in jointp.geoms: - # plt.gca().add_patch(PolygonPatch(geom, alpha=0.2)) - # plt.show() - alpha *= 0.7 - tries += 1 - if tries > 10: - LOG.warning("cannot find alpha for concave hull on '%s'", loc) - alpha = 0 - jointp = alphashape.alphashape(points, alpha) - if jointp.minimum_clearance < 1.0: - # follow-up calculations will necessarily be integer; - # so anticipate rounding here and then ensure validity - jointp = Polygon(np.round(jointp.exterior.coords)) - jointp = make_valid(jointp) - return jointp - # zzz should go into core ocrd_utils def baseline_of_segment(segment, coords): line = np.array(polygon_from_points(segment.get_Baseline().points)) line = transform_coordinates(line, coords['transform']) return np.round(line).astype(np.int32) + diff --git a/ocrd_cis/ocropy/segment.py 
b/ocrd_cis/ocropy/segment.py index 4f0e87e4..11a018a5 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,6 +1,7 @@ from __future__ import absolute_import import os.path +from itertools import chain import numpy as np from skimage import draw from skimage.morphology import convex_hull_image @@ -8,6 +9,7 @@ from shapely.geometry import Polygon, LineString from shapely.prepared import prep from shapely.ops import unary_union +import alphashape from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -142,43 +144,10 @@ def getx(xy): # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: - base = [] - for baseline in baselines: - baseline = baseline.intersection(polygon) - # post-process - if (baseline.is_empty or - baseline.type in ['Point', 'MultiPoint']): - continue - base_x = [pt[0] for pt in base] - base_left = min(base_x, default=0) - base_right = max(base_x, default=0) - left = baseline.bounds[0] - right = baseline.bounds[2] - if (baseline.type == 'GeometryCollection' or - baseline.type.startswith('Multi')): - # heterogeneous result: filter point - for geom in baseline.geoms: - if geom.type == 'Point': - continue - left = geom.bounds[0] - right = geom.bounds[2] - if left > base_right: - base.extend(geom.coords) - base_right = right - elif right < base_left: - base = list(geom.coords) + base - base_left = left - else: - LOG.warning("baseline part component crosses existing x") - continue - elif left > base_right: - base.extend(baseline.coords) - elif right < base_left: - base = list(baseline.coords) + base - else: - LOG.warning("baseline part crosses existing x") - continue - assert all(p1[0] < p2[0] for p1, p2 in zip(base[:-1],base[1:])), base + base = join_baselines([baseline.intersection(polygon) + for baseline in baselines], name) + if base is not None: + base = base.coords else: base = None results.append((label, poly, base)) @@ -834,6 
+803,97 @@ def make_valid(polygon): polygon = polygon.simplify(tolerance) return polygon +def diff_polygons(poly1, poly2): + poly = poly1.difference(poly2) + if poly.type == 'MultiPolygon': + poly = poly.convex_hull + if poly.minimum_clearance < 1.0: + poly = Polygon(np.round(poly.exterior.coords)) + poly = make_valid(poly) + return poly + +def join_polygons(polygons, loc='', scale=20): + """construct concave hull (alpha shape) from input polygons""" + # compoundp = unary_union(polygons) + # jointp = compoundp.convex_hull + LOG = getLogger('processor.OcropyResegment') + polygons = list(chain.from_iterable([ + poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] + else [poly] + for poly in polygons])) + if len(polygons) == 1: + return polygons[0] + # get equidistant list of points along hull + # (otherwise alphashape will jump across the interior) + points = [poly.exterior.interpolate(dist).coords[0] # .xy + for poly in polygons + for dist in np.arange(0, poly.length, scale / 2)] + #alpha = alphashape.optimizealpha(points) # too slow + alpha = 0.01 + jointp = alphashape.alphashape(points, alpha) + tries = 0 + # from descartes import PolygonPatch + # import matplotlib.pyplot as plt + while jointp.type in ['MultiPolygon', 'GeometryCollection'] or len(jointp.interiors): + # plt.figure() + # plt.gca().scatter(*zip(*points)) + # for geom in jointp.geoms: + # plt.gca().add_patch(PolygonPatch(geom, alpha=0.2)) + # plt.show() + alpha *= 0.7 + tries += 1 + if tries > 10: + LOG.warning("cannot find alpha for concave hull on '%s'", loc) + alpha = 0 + jointp = alphashape.alphashape(points, alpha) + if jointp.minimum_clearance < 1.0: + # follow-up calculations will necessarily be integer; + # so anticipate rounding here and then ensure validity + jointp = Polygon(np.round(jointp.exterior.coords)) + jointp = make_valid(jointp) + return jointp + +def join_baselines(baselines, loc=''): + LOG = getLogger('processor.OcropyResegment') + result = [] + for baseline in 
baselines: + if (baseline.is_empty or + baseline.type in ['Point', 'MultiPoint']): + continue + base_x = [pt[0] for pt in result] + base_left = min(base_x, default=0) + base_right = max(base_x, default=0) + left = baseline.bounds[0] + right = baseline.bounds[2] + if (baseline.type == 'GeometryCollection' or + baseline.type.startswith('Multi')): + # heterogeneous result: filter point + for geom in baseline.geoms: + if geom.type == 'Point': + continue + left = geom.bounds[0] + right = geom.bounds[2] + if left > base_right: + result.extend(geom.coords) + base_right = right + elif right < base_left: + result = list(geom.coords) + result + base_left = left + else: + LOG.warning("baseline part component crosses existing x in %s", loc) + continue + elif left > base_right: + result.extend(baseline.coords) + elif right < base_left: + result = list(baseline.coords) + result + else: + LOG.warning("baseline part crosses existing x in %s", loc) + continue + assert all(p1[0] < p2[0] for p1, p2 in zip(result[:-1], result[1:])), result + if not len(result): + return None + return LineString(result) + def page_get_reading_order(ro, rogroup): """Add all elements from the given reading order group to the given dictionary. From b3018eb0b176b24eb09a86d73537d55169de2f43 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Thu, 7 Apr 2022 14:28:50 +0200 Subject: [PATCH 013/194] ocrd-tool.json: typo cr{,e}ate --- ocrd_cis/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index 74c0d0c9..91a8722b 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -439,7 +439,7 @@ }, "model": { "type": "string", - "description": "load model or crate new one (e.g. fraktur.pyrnn)" + "description": "load model or create new one (e.g. 
fraktur.pyrnn)" }, "ntrain": { "type": "number", From 97af16c769483f91461c41815b17cc371ff061f9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 Apr 2022 09:34:36 +0200 Subject: [PATCH 014/194] =?UTF-8?q?segment:=20rewrite=20separator=20detect?= =?UTF-8?q?ion=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit instead of detecting hlines and vlines independently, and via costly horizontal/vertical morphology operations, analyse image by medial axis transform (skeleton and distance transform of all connected components); then filter components that are too compact (inner vs outer size), also filter by statistics of distance along the skeleton: filter if too wide on average or too variant; then apply morphological closing to reconnect broken segments, linking only those components that roughly extend each other in the same direction; finally, sort by size and filter components that are too small in inner (skeleton length) or outer size (bbox diagonal), selecting only the topmost candidates; propagate from skeleton to full component and then spread a little into the background --- ocrd_cis/ocropy/common.py | 187 ++++++++++++++++++++++++++++------- ocrd_cis/ocropy/resegment.py | 2 +- ocrd_cis/ocropy/segment.py | 33 +++---- 3 files changed, 163 insertions(+), 59 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index b0d97594..3b8d0f60 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -7,6 +7,7 @@ from scipy.ndimage import measurements, filters, interpolation, morphology from scipy import stats, signal #from skimage.morphology import convex_hull_image +from skimage.morphology import medial_axis from PIL import Image from . import ocrolib @@ -450,18 +451,18 @@ def on_press(event): @checks(ABINARY2,NUMBER) def compute_images(binary, scale, maximages=5): - """Finds (and removes) large connected foreground components. 
+ """Detects large connected foreground components that could be images. Parameters: - ``binary``, a bool or int array of the page image, with 1=black - ``scale``, square root of average bbox area of characters - - ``maximages``, maximum number of large components to keep + - ``maximages``, maximum number of images to find (This could be drop-capitals, line drawings or photos.) - Returns a same-size bool array as a mask image. + Returns a same-size image label array. """ if maximages == 0: - return binary == -1 + return np.zeros_like(binary, np.int) images = binary # d0 = odd(max(2,scale/5)) # d1 = odd(max(2,scale/8)) @@ -473,7 +474,7 @@ def compute_images(binary, scale, maximages=5): images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) DSAVE('images1_large', images+0.6*binary) if not images.any(): - return images > 0 + return np.zeros_like(binary, np.int) # 2- open horizontally and vertically to suppress # v/h-lines; these will be detected separately, # and it is dangerous to combine them into one @@ -498,14 +499,130 @@ def compute_images(binary, scale, maximages=5): images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) DSAVE('images5_selected', images+0.6*binary) if not images.any(): - return images > 0 + return np.zeros_like(binary, np.int) # 6- dilate a little to get a smooth contour without gaps dilated = morph.r_dilation(images, (odd(scale),odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 + images, _ = morph.label(images) DSAVE('images6_dilated', images+0.6*binary) # we could repeat reconstruct-dilate here... - return images > 0 + return images + +@checks(ABINARY2,NUMBER) +def compute_seplines(binary, scale, maxseps=0): + """Detects thin connected foreground components that could be separators. 
+ + Parameters: + - ``binary``, a bool or int array of the page image, with 1=black + - ``scale``, square root of average bbox area of characters + - ``maxseps``, maximum number of separators to find + (This could be horizontal, vertical or oblique, even slightly warped and discontinuous lines.) + + Returns a same-size separator label array. + """ + if maxseps == 0: + return np.zeros_like(binary, np.int) + skel, dist = medial_axis(binary, return_distance=True) + DSAVE("medial-axis", [dist, skel]) + labels, nlabels = morph.label(skel) + slices = [None] + morph.find_objects(labels) + DSAVE("skel-labels", labels) + # determine those components which could be separators + # (filter by compactness, and by mean+variance of distances) + sepmap = np.zeros(nlabels + 1, np.int) + numsep = 0 + sepsizes = [0] + sepslices = [None] + for label in range(1, nlabels + 1): + labelslice = slices[label] + labelmask = labels == label + labelsize = np.count_nonzero(labelmask) # sum of skel pixels, i.e. "inner length" + labellength = np.hypot(*sl.dims(labelslice)) # length of bbox diagonal, i.e. "outer length" + #LOG.debug("skel label %d has inner size %d and outer size %d", label, labelsize, labellength) + if labelsize > 1.4 * labellength: + # not long / stretched out / too compact + # todo: maybe just check aspect ratio for orthogonal lines? + continue + distances = dist[labelmask] + mean_dist = np.mean(distances) + var_dist = np.var(distances) + #LOG.debug("skel label %d has dist %.1f±%.2f", label, mean_dist, np.sqrt(var_dist)) + # todo: empirical analysis of ideal thresholds + if mean_dist < scale / 4 and var_dist < 0.3: + numsep += 1 + sepmap[label] = numsep + sepsizes.append(labelsize) + sepslices.append(labelslice) + # todo: we could also use the mean+var as a criterion to split components + # where the distance exceeds the threshold; e.g. 
vlines that touch + # letters or images + sepsizes = np.array(sepsizes) + sepslices = np.array(sepslices) + LOG.debug("detected %d separator candidates", numsep) + DSAVE("seps-raw", sepmap[labels]) + d0 = odd(max(1,scale/2)) + d1 = odd(max(1,scale/4)) + closed = morph.rb_closing(sepmap[labels] > 0, (d0,d1)) + DSAVE("seps-closed", [dist, closed]) + labels2, nlabels2 = morph.label(closed) + corrs = morph.correspondences(sepmap[labels], labels2, return_counts=False).T + corrmap = np.arange(numsep + 1) + for sep2 in range(1, nlabels2 + 1): + corrinds = corrs[:, 1] == sep2 + corrinds[corrs[:, 0] == 0] = False # ignore bg + corrinds = corrinds.nonzero()[0] + if len(corrinds) == 1: + continue + for i, indi in enumerate(corrinds[:-1]): + sepi = corrs[indi, 0] + labeli = np.flatnonzero(sepmap == sepi)[0] + slicei = slices[labeli] + lengthi = np.hypot(*sl.dims(slicei)) + for j, indj in enumerate(corrinds[i + 1:], i + 1): + sepj = corrs[indj, 0] + labelj = np.flatnonzero(sepmap == sepj)[0] + slicej = slices[labelj] + lengthj = np.hypot(*sl.dims(slicej)) + #inter = sl.intersect(slicei, slicej) + union = sl.union(slicei, slicej) + length = np.hypot(*sl.dims(union)) + if length > 0.9 * (lengthi + lengthj): + #if sl.empty(inter) or sl.area(inter) / sl.area(union) < 0.2: + corrmap[sepj] = corrmap[sepi] + _, corrmap = np.unique(corrmap, return_inverse=True) # make contiguous + numsep = corrmap.max() + LOG.debug("linked to %d separator candidates", numsep) + def union(slices): + if len(slices) > 1: + return sl.union(slices[0], union(slices[1:])) + return slices[0] + for sep in range(1, numsep + 1): + sepsizes[sep] = sum(sepsizes[corrmap == sep]) + sepslices[sep] = union(sepslices[corrmap == sep]) + sepsizes = sepsizes[:numsep + 1] + sepslices = sepslices[:numsep + 1] + seplengths = np.array([np.hypot(*sl.dims(sepslice)) if sepslice else 0 + for sepslice in sepslices]) + sepmap = corrmap[sepmap] + DSAVE("seps-raw-linked", sepmap[labels]) + # order by size, filter minsize and filter 
top maxseps + order = np.argsort(sepsizes)[::-1] + # no more than maxseps and no smaller than scale + minsize = np.flatnonzero((sepsizes[order] < scale) | (seplengths[order] < 3 * scale)) + if np.any(minsize): + maxseps = min(maxseps, minsize[0]) + maxseps = min(maxseps, numsep) + ordermap = np.zeros(numsep + 1, np.int) + ordermap[order[:maxseps]] = np.arange(1, maxseps + 1) + sepmap = ordermap[sepmap] + DSAVE("sep-top", sepmap[labels]) + sepseeds = morph.propagate_labels_simple(binary, sepmap[labels]) + DSAVE("seps-top-propagated", sepseeds) + # FIXME: perhaps hclose / vclose first? + seplabels = morph.spread_labels(sepseeds, maxdist=scale / 2) + DSAVE("seps-top-spread", seplabels) + return seplabels # from ocropus-gpageseg, but with horizontal opening @deprecated @@ -974,20 +1091,20 @@ def h_compatible(obj1, obj2, center1, center2): relabel[relabel == label2] = new_label # apply re-assignments: seeds = relabel[seeds] - DSAVE("hmerge5_connected", seeds) + # DSAVE("hmerge5_connected", seeds) return seeds # from ocropus-gpageseg, but: # - with fullpage switch -# (opt-in for h/v-line and column detection), +# (opt-in for separator line and column detection), # - with external separator mask -# (opt-in for h/v-line pass-through) +# (opt-in for separator line pass-through) # - with zoom parameter # (make fixed dimension params relative to pixel density, # instead of blind 300 DPI assumption) -# - with improved h/v-line and column detection -# - with v-line detection _before_ column detection -# - with h/v-line suppression _after_ large component filtering +# - with improved separator line and column detection +# - with separator detection _before_ column detection +# - with separator suppression _after_ large component filtering # - with more robust line seed estimation, # - with horizontal merge instead of blur, # - with component majority for foreground @@ -1005,10 +1122,9 @@ def compute_segmentation(binary, fullpage=False, seps=None, maxcolseps=2, + csminheight=4, 
maxseps=0, maximages=0, - csminheight=4, - hlminwidth=10, spread_dist=None, rl=False, bt=False): @@ -1026,13 +1142,10 @@ def compute_segmentation(binary, - for up to ``maxcolseps`` multi-line vertical whitespaces (as column separators, counted piece-wise) of at least ``csminheight`` multiples of ``scale``, - - for up to ``maxseps`` vertical black lines - (as column separators, counted piece-wise) of at least - ``csminheight`` multiples of ``scale``, and - - for any number of horizontal lines of at least - ``hlminwidth`` multiples of ``scale``, + - for up to ``maxseps`` black separator lines (horizontal, vertical + or oblique; counted piece-wise), - for anything in ``seps`` if given, - then suppress these separator components and return them separately. + then suppress these non-text components and return them separately. Labels will be projected ("spread") from the foreground to the surrounding background within ``spread_dist`` distance (or half @@ -1049,8 +1162,7 @@ def compute_segmentation(binary, separators and other non-text like small noise, or large drop-capitals / images), - list of Numpy arrays of baseline coordinates [y, x points in lr order] - - Numpy array of horizontal foreground lines mask, - - Numpy array of vertical foreground lines mask, + - Numpy array of foreground separator lines mask, - Numpy array of large/non-text foreground component mask, - Numpy array of vertical background separators mask, - the estimated scale (i.e. median sqrt bbox area of glyph components). 
@@ -1062,18 +1174,17 @@ def compute_segmentation(binary, LOG.debug('height: %d, zoom: %.2f, scale: %d', binary.shape[0], zoom, scale) if fullpage: - LOG.debug('computing images') + LOG.debug('detecting images') images = compute_images(binary, scale, maximages=maximages) - LOG.debug('computing horizontal/vertical line separators') - hlines = compute_hlines(binary, scale, hlminwidth=hlminwidth, images=images) - vlines = compute_separators_morph(binary, scale, csminheight=csminheight, maxseps=maxseps, images=images) - binary = np.minimum(binary,1-hlines) - binary = np.minimum(binary,1-vlines) - binary = np.minimum(binary,1-images) + LOG.debug('detecting separators') + #hlines = compute_hlines(binary, scale, hlminwidth=hlminwidth, images=images) + #vlines = compute_separators_morph(binary, scale, csminheight=csminheight, maxseps=maxseps, images=images) + slines = compute_seplines(binary, scale, maxseps=maxseps) + binary = np.minimum(binary, 1 - (slines > 0)) + binary = np.minimum(binary, 1 - (images > 0)) else: - hlines = np.zeros_like(binary, np.bool) - vlines = np.zeros_like(binary, np.bool) - images = np.zeros_like(binary, np.bool) + slines = np.zeros_like(binary, np.uint8) + images = np.zeros_like(binary, np.uint8) if seps is not None and not seps.all(): # suppress separators/images for line estimation # (unless it encompasses the full image for some reason) @@ -1092,8 +1203,7 @@ def compute_segmentation(binary, # get a larger (closed) mask of all separators # (both bg boundary and fg line seps, detected # and passed in) to separate line/column labels - sepmask = np.maximum(hlines, vlines) - sepmask = np.maximum(sepmask, images) + sepmask = np.maximum(slines > 0, images > 0) sepmask = np.maximum(sepmask, colseps) if seps is not None: sepmask = np.maximum(sepmask, seps) @@ -1148,7 +1258,7 @@ def compute_segmentation(binary, #segmentation = llabels*binary #return segmentation blines = compute_baselines(bottom, top, llabels, scale) - return llabels, blines, hlines, 
vlines, images, colseps, scale + return llabels, blines, slines, images, colseps, scale @checks(AFLOAT2,AFLOAT2,SEGMENTATION,NUMBER) def compute_baselines(bottom, top, linelabels, scale, method='bottom'): @@ -1180,6 +1290,9 @@ def compute_baselines(bottom, top, linelabels, scale, method='bottom'): corrs = morph.correspondences(linelabels, baselabels).T labelmap = {} #DSAVE('baselines', baselabels) + # FIXME: this is slow and should be replace by some graph clustering algorithm + # (we want a permutation matrix which maximizes triangles in the adjacency matrix, + # then pick the triangle-subgraph with the largest sum of pixels at its nodes) def partitions(adj, starti, startpart=None): for i in range(starti, len(adj)): if startpart is None: @@ -1530,7 +1643,7 @@ def find_topological(): seplabs, counts = np.unique(seplab * bin, return_counts=True) kept = np.in1d(seplab.ravel(), seplabs[counts > scale * min_line]) seplab = seplab * kept.reshape(*seplab.shape) - DSAVE('seplab', seplab) + #DSAVE('seplab', seplab) sepobj = morph.find_objects(seplab) if not len(sepobj): return diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 997c68f0..f96f2750 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -282,7 +282,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l scale=scale, loc=parent.id, threshold=threshold) return try: - new_line_labels, new_baselines, _, _, _, _, scale = compute_segmentation( + new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( parent_bin, seps=ignore_bin, zoom=zoom, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 11a018a5..b70997e5 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -191,7 +191,7 @@ def process(self): When ``level-of-operation`` is ``page`` or ``table``, this also entails detecting - up to 
``maximages`` large foreground images, - - up to ``maxseps`` foreground h/v-line separators and + - up to ``maxseps`` foreground line separators and - up to ``maxcolseps`` background column separators before text line segmentation itself, as well as aggregating text lines to text regions afterwards. @@ -492,7 +492,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, try: if report: raise Exception(report) - line_labels, baselines, hlines, vlines, images, colseps, scale = compute_segmentation( + line_labels, baselines, seplines, images, colseps, scale = compute_segmentation( # suppress separators and ignored regions for textline estimation # but keep them for h/v-line detection (in fullpage mode): element_bin, seps=(sep_bin+ignore_labels)>0, @@ -502,8 +502,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], maximages=self.parameter['maximages'] if element_name != 'table' else 0, - csminheight=self.parameter['csminheight'], - hlminwidth=self.parameter['hlminwidth']) + csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): LOG.error('Cannot line-segment region "%s": %s', element_id, err) @@ -526,8 +525,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (these cannot be split or grouped together with other regions) line_labels = np.where(line_labels, line_labels+len(ignore), ignore_labels) # suppress separators/images in fg and try to use for partitioning slices - sepmask = np.maximum(np.maximum(hlines, vlines), - np.maximum(sep_bin, images)) + sepmask = np.maximum(sep_bin, np.maximum(seplines > 0, images > 0)) region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, @@ -645,11 +643,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, index = page_add_to_reading_order(rogroup, region.id, 
index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - image_labels, num_images = morph.label(images) - LOG.info('Found %d large non-text/image regions for %s "%s"', - num_images, element_name, element_id) + LOG.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(image_labels, None, element_bin, + image_polygons, _ = masks2polygons(images, None, element_bin, '%s "%s"' % (element_name, element_id)) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: @@ -664,22 +660,17 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType( points=points_from_polygon(region_polygon)))) - # split rulers into separator regions: - hline_labels, num_hlines = morph.label(hlines) - vline_labels, num_vlines = morph.label(vlines) - LOG.info('Found %d/%d h/v-lines for %s "%s"', - num_hlines, num_vlines, element_name, element_id) + # split detected separator labels into separator regions: + LOG.info('Found %d separator lines for %s "%s"', seplines.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): - hline_polygons, _ = masks2polygons(hline_labels, None, element_bin, - '%s "%s"' % (element_name, element_id)) - vline_polygons, _ = masks2polygons(vline_labels, None, element_bin, - '%s "%s"' % (element_name, element_id)) - for _, polygon, _ in hline_polygons + vline_polygons: + sep_polygons, _ = masks2polygons(seplines, None, element_bin, + '%s "%s"' % (element_name, element_id)) + for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - 
LOG.warning('Ignoring extant region contour for separator') + LOG.warning('Ignoring extant region contour for separator %d', sep_label) continue # annotate result: region_no += 1 From c10b692fcbff544e8a2e485fd00c51cfc7eeed42 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 12 Apr 2022 09:35:39 +0200 Subject: [PATCH 015/194] segment: for more robust bg separator detection, combine criteria of gradient maximum and percentile --- ocrd_cis/ocropy/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 3b8d0f60..062d792e 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -802,7 +802,7 @@ def compute_colseps_conv(binary, scale=1.0, csminheight=10, maxcolseps=2): grad = filters.gaussian_filter(1.0*binary,(scale,scale*0.5),order=(0,1)) grad = filters.uniform_filter(grad,(10.0*scale,1)) # csminheight DSAVE("colwsseps2_grad-raw",grad) - grad = (grad>0.5*np.amax(grad)) + grad = grad > np.minimum(0.5 * np.amax(grad), np.percentile(grad, 99.5)) DSAVE("colwsseps2_grad",grad) # combine dilated edges and whitespace seps = np.minimum(thresh,filters.maximum_filter(grad,(odd(10*scale),odd(5*scale)))) From 789361500fd7926476033bcf1ffc882eb0e6fb71 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 28 Apr 2022 22:28:53 +0200 Subject: [PATCH 016/194] remove Calamari dependency (not used, only CLI callout) --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index a0c371ed..fc98d00d 100644 --- a/setup.py +++ b/setup.py @@ -47,8 +47,7 @@ 'scikit-image', 'alphashape', 'opencv-python-headless', - 'python-Levenshtein', - 'calamari_ocr == 0.3.5' + 'python-Levenshtein' ], extras_require={ 'debug': ['matplotlib>3.0.0'], From ca15800f86e6ece0e390bd41a6b2d66295f5bc74 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 28 Apr 2022 22:29:27 +0200 Subject: [PATCH 017/194] join_polygons (alpha shape): make more robust --- 
ocrd_cis/ocropy/segment.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index b70997e5..8320deb5 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -818,14 +818,14 @@ def join_polygons(polygons, loc='', scale=20): # (otherwise alphashape will jump across the interior) points = [poly.exterior.interpolate(dist).coords[0] # .xy for poly in polygons - for dist in np.arange(0, poly.length, scale / 2)] + for dist in np.arange(0, poly.length, min(scale / 2, poly.length / 4))] #alpha = alphashape.optimizealpha(points) # too slow alpha = 0.01 jointp = alphashape.alphashape(points, alpha) tries = 0 # from descartes import PolygonPatch # import matplotlib.pyplot as plt - while jointp.type in ['MultiPolygon', 'GeometryCollection'] or len(jointp.interiors): + while jointp.is_empty or jointp.area == 0.0 or jointp.type in ['MultiPolygon', 'GeometryCollection'] or len(jointp.interiors): # plt.figure() # plt.gca().scatter(*zip(*points)) # for geom in jointp.geoms: From b490d3f8144cd78ce23c19722bef5f72d10c9872 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Jun 2022 23:49:20 +0200 Subject: [PATCH 018/194] re/segment: join_polygons directly instead of alphashape --- ocrd_cis/ocropy/segment.py | 50 +++++++++++++++++--------------------- setup.py | 1 - 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 8320deb5..8ed2042c 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -3,13 +3,14 @@ import os.path from itertools import chain import numpy as np +from scipy.sparse.csgraph import minimum_spanning_tree from skimage import draw from skimage.morphology import convex_hull_image import cv2 from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from shapely.ops import unary_union -import alphashape +from shapely.ops import unary_union, 
nearest_points +from shapely.validation import explain_validity from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -803,40 +804,33 @@ def diff_polygons(poly1, poly2): poly = make_valid(poly) return poly -def join_polygons(polygons, loc='', scale=20): +def join_polygons(polygons, scale=20): """construct concave hull (alpha shape) from input polygons""" # compoundp = unary_union(polygons) # jointp = compoundp.convex_hull - LOG = getLogger('processor.OcropyResegment') polygons = list(chain.from_iterable([ poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] else [poly] for poly in polygons])) - if len(polygons) == 1: + npoly = len(polygons) + if npoly == 1: return polygons[0] - # get equidistant list of points along hull - # (otherwise alphashape will jump across the interior) - points = [poly.exterior.interpolate(dist).coords[0] # .xy - for poly in polygons - for dist in np.arange(0, poly.length, min(scale / 2, poly.length / 4))] - #alpha = alphashape.optimizealpha(points) # too slow - alpha = 0.01 - jointp = alphashape.alphashape(points, alpha) - tries = 0 - # from descartes import PolygonPatch - # import matplotlib.pyplot as plt - while jointp.is_empty or jointp.area == 0.0 or jointp.type in ['MultiPolygon', 'GeometryCollection'] or len(jointp.interiors): - # plt.figure() - # plt.gca().scatter(*zip(*points)) - # for geom in jointp.geoms: - # plt.gca().add_patch(PolygonPatch(geom, alpha=0.2)) - # plt.show() - alpha *= 0.7 - tries += 1 - if tries > 10: - LOG.warning("cannot find alpha for concave hull on '%s'", loc) - alpha = 0 - jointp = alphashape.alphashape(points, alpha) + # find min-dist path through all polygons (travelling salesman) + pairs = itertools.combinations(range(npoly), 2) + dists = np.eye(npoly, dtype=float) + for i, j in pairs: + dists[i, j] = polygons[i].distance(polygons[j]) + dists[j, i] = dists[i, j] + dists = minimum_spanning_tree(dists, overwrite=True) + # add bridge polygons (where necessary) + 
for prevp, nextp in zip(*dists.nonzero()): + prevp = polygons[prevp] + nextp = polygons[nextp] + nearest = nearest_points(prevp, nextp) + bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) + polygons.append(bridgep) + jointp = unary_union(polygons) + assert jointp.type == 'Polygon', jointp.wkt if jointp.minimum_clearance < 1.0: # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity diff --git a/setup.py b/setup.py index fc98d00d..4a37603e 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,6 @@ 'pillow>=7.1.2', 'shapely>=1.7.1', 'scikit-image', - 'alphashape', 'opencv-python-headless', 'python-Levenshtein' ], From 2bc033cce570ef32b172df9fb7f4e309970669b1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 8 Jun 2022 23:51:58 +0200 Subject: [PATCH 019/194] re/segment: sort text lines in reading order --- ocrd_cis/ocropy/segment.py | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 8ed2042c..3fc150ff 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -59,7 +59,7 @@ TOOL = 'ocrd-cis-ocropy-segment' -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None): +def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, reorder=True): """Convert label masks into polygon coordinates. 
Given a Numpy array of background labels ``bg_labels``, @@ -114,11 +114,15 @@ def getx(xy): if not total_area: # ignore if too small continue - # sort contours in reading order + # redraw label array contour_labels = np.zeros_like(bg_mask, np.uint8) for i, contour in enumerate(contours): - cv2.drawContours(contour_labels, contours[i:i+1], -1, i+1, cv2.FILLED) - order = np.argsort(morph.reading_order(contour_labels)[1:]) + cv2.drawContours(contour_labels, contours, i, i+1, cv2.FILLED) + if reorder: + # sort contours in reading order + order = np.argsort(morph.reading_order(contour_labels)[1:]) + else: + order = range(len(contours)) # convert to polygons for i in order: contour = contours[i] @@ -133,14 +137,13 @@ def getx(xy): polygon = contour[:, 0, ::] # already ordered x,y # simplify and validate: polygon = Polygon(polygon) - for tolerance in range(2, int(area)): - polygon = polygon.simplify(tolerance) - if polygon.is_valid: - break + if not polygon.is_valid: + #LOG.debug(polygon.wkt) + LOG.debug(explain_validity(polygon)) + polygon = make_valid(polygon) poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: - LOG.warning('Label %d contour %d has less than 4 points for %s', - label, i, name) + LOG.warning('Label %d contour %d for %s has less than 4 points', label, i, name) continue # get baseline segments intersecting with this line mask # and concatenate them from left to right @@ -572,7 +575,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, "region label %d has both existing regions and new lines (%s)" % ( region_label, str(region_line_labels0)) region = ignore[region_line_labels0[0] - 1] - if rogroup and region.parent_object_ == element and not isinstance(region, SeparatorRegionType): + if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) LOG.debug('Region label %d is for ignored region "%s"', region_label, 
region.id) @@ -662,10 +665,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, id=region_id, Coords=CoordsType( points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - LOG.info('Found %d separator lines for %s "%s"', seplines.max(), element_name, element_id) + LOG.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): sep_polygons, _ = masks2polygons(seplines, None, element_bin, - '%s "%s"' % (element_name, element_id)) + '%s "%s"' % (element_name, element_id), + reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -772,8 +776,7 @@ def make_intersection(poly1, poly2): interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) if interp.type == 'MultiPolygon': # homogeneous result: construct convex hull to connect - # FIXME: construct concave hull / alpha shape - interp = interp.convex_hull + interp = join_polygons(interp.geoms) if interp.minimum_clearance < 1.0: # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity From 956f4a399d1ac3a135544261ce2c5dfb86fcbbe3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 00:25:14 +0200 Subject: [PATCH 020/194] baseline extraction: partition by finding largest cliques --- ocrd_cis/ocropy/common.py | 24 +++++++----------------- setup.py | 1 + 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 062d792e..3a51d6e5 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -8,6 +8,7 @@ from scipy import stats, signal #from skimage.morphology import convex_hull_image from skimage.morphology import medial_axis +import networkx as nx from PIL import Image from . 
import ocrolib @@ -1281,7 +1282,7 @@ def compute_baselines(bottom, top, linelabels, scale, method='bottom'): bot1d = np.diff(bot, axis=0, append=0) bot1d = np.diff(np.sign(bot1d), axis=0, append=0) < 0 bot1d &= bot > 0 - #DSAVE('bot1d', bot1d) + DSAVE('bot1d', bot1d) blines = bot1d baselabels, nbaselabels = morph.label(blines) baseslices = [(slice(0,0),slice(0,0))] + morph.find_objects(baselabels) @@ -1289,18 +1290,7 @@ def compute_baselines(bottom, top, linelabels, scale, method='bottom'): # (can happen due to mis-estimation of scale) corrs = morph.correspondences(linelabels, baselabels).T labelmap = {} - #DSAVE('baselines', baselabels) - # FIXME: this is slow and should be replace by some graph clustering algorithm - # (we want a permutation matrix which maximizes triangles in the adjacency matrix, - # then pick the triangle-subgraph with the largest sum of pixels at its nodes) - def partitions(adj, starti, startpart=None): - for i in range(starti, len(adj)): - if startpart is None: - yield from partitions(adj, i + 1, [i]) - elif all(adj[i][j] for j in startpart): - yield from partitions(adj, i + 1, [i] + startpart) - if startpart is not None: - yield startpart + DSAVE('baselines-raw', baselabels) for line in np.unique(linelabels): if not line: continue # ignore bg line corrinds = corrs[:, 0] == line @@ -1320,11 +1310,11 @@ def partitions(adj, starti, startpart=None): if sl.xoverlaps(baseslicei, baseslicej): nonoverlapping[i, j] = False nonoverlapping[j, i] = False + # find all maximal cliques in the graph (i.e. 
all fully connected subgraphs) + # and then pick the partition with the largest sum of pixels at its nodes def pathlen(path): - return sum(corrs[corrinds[pos], 2] for pos in path) - corrgroups = sorted(partitions(nonoverlapping, 0), key=pathlen) - # select longest path - corrinds = corrinds[corrgroups[-1]] + return sum(corrs[corrinds[path], 2]) + corrinds = corrinds[max(nx.find_cliques(nx.Graph(nonoverlapping)), key=pathlen)] labelmap.setdefault(line, list()).extend(corrs[corrinds, 1]) basepoints = [] for line in np.unique(linelabels): diff --git a/setup.py b/setup.py index 4a37603e..a5e19979 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,7 @@ 'pillow>=7.1.2', 'shapely>=1.7.1', 'scikit-image', + 'networkx', 'opencv-python-headless', 'python-Levenshtein' ], From 62d0729779d771a57fdd67d8057485ae6c4f6176 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 00:30:14 +0200 Subject: [PATCH 021/194] sepline detection linking: partition by finding largest cliques --- ocrd_cis/ocropy/common.py | 131 ++++++++++++++++++++++++++++++++------ 1 file changed, 112 insertions(+), 19 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 3a51d6e5..a195a4c1 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -535,33 +535,112 @@ def compute_seplines(binary, scale, maxseps=0): numsep = 0 sepsizes = [0] sepslices = [None] + sepdists = [0] for label in range(1, nlabels + 1): labelslice = slices[label] labelmask = labels == label labelsize = np.count_nonzero(labelmask) # sum of skel pixels, i.e. "inner length" + labelarea = sl.area(labelslice) + labelaspect = sl.aspect(labelslice) + if labelaspect > 1: + labelaspect = 1 / labelaspect labellength = np.hypot(*sl.dims(labelslice)) # length of bbox diagonal, i.e. 
"outer length" #LOG.debug("skel label %d has inner size %d and outer size %d", label, labelsize, labellength) - if labelsize > 1.4 * labellength: - # not long / stretched out / too compact - # todo: maybe just check aspect ratio for orthogonal lines? + if labelsize > 1.5 * labellength and labelaspect >= 0.1 and labelsize < 15 * scale: #and labelsize > 0.1 * labelarea + # not long / straight, but very compact continue distances = dist[labelmask] - mean_dist = np.mean(distances) - var_dist = np.var(distances) - #LOG.debug("skel label %d has dist %.1f±%.2f", label, mean_dist, np.sqrt(var_dist)) + avg_dist = np.median(distances) #np.mean(distances) + std_dist = np.std(distances) # todo: empirical analysis of ideal thresholds - if mean_dist < scale / 4 and var_dist < 0.3: - numsep += 1 - sepmap[label] = numsep - sepsizes.append(labelsize) - sepslices.append(labelslice) - # todo: we could also use the mean+var as a criterion to split components - # where the distance exceeds the threshold; e.g. vlines that touch - # letters or images + if avg_dist > scale / 4 or std_dist/avg_dist > 0.7: + continue + #LOG.debug("skel label %d has dist %.1f±%.2f", label, avg_dist, std_dist) + numsep += 1 + sepmap[label] = numsep + sepsizes.append(labelsize) + sepslices.append(labelslice) + sepdists.append(avg_dist) + if labelsize > 10 * scale and avg_dist > 0 and std_dist / avg_dist > 0.2: + # try to split this large label up along neighbouring spans of similar distances: + # (e.g. vlines that touch letters or images) + # 1. 
get optimal (by variability) spans as bin intervals, then merge largest spans + disthist, distedges = np.histogram(distances, bins='scott', density=True) # stone + disthist *= np.diff(distedges) # get probability masses + disthistlarge = disthist > 0.1 + if np.count_nonzero(disthistlarge) < 2: + continue # only 1 large bin + disthistlarge[-1] = True # ensure full interval + distedges = distedges[1:][disthistlarge] + disthist = np.cumsum(disthist)[disthistlarge] + disthist = np.diff(disthist, prepend=0) + distbin = np.digitize(distances, distedges, right=True) + # 2. now find connected components within bins, but map all tiny components + # to a single label so they can be replaced by their neighbours later-on + sublabels = np.zeros_like(labels) + sublabels[labelmask] = distbin + 1 + DSAVE("sublabels", sublabels) + sublabels2 = np.zeros_like(labels) + sublabel = 1 + sublabelmap = [0, 1] + for bin in range(len(distedges)): + binmask = sublabels == bin + 1 + binlabels, nbinlabels = morph.label(binmask) + _, binlabelcounts = np.unique(binlabels, return_counts=True) + largemask = (binlabelcounts > 2 * scale)[binlabels] + smallmask = (binlabelcounts <= 2 * scale)[binlabels] + sublabels2[binmask & smallmask] = 1 + if not np.any(binmask & largemask): + continue + sublabels2[binmask & largemask] = binlabels[binmask & largemask] + sublabel + sublabel += nbinlabels + sublabelmap.extend(nbinlabels*[bin + 1]) + if sublabel == 1: + continue # only tiny sublabels here + DSAVE("sublabels_connected", sublabels2) + sublabelmap = np.array(sublabelmap) + # 3. 
finally, replace tiny components by nearest components, + # and recombine survivors to bin labels + smallmask = sublabels2 == 1 + sublabels2[smallmask] = 0 + sublabels2[smallmask] = morph.spread_labels(sublabels2)[smallmask] + sublabels = sublabelmap[sublabels2] + DSAVE("sublabels_final", sublabels) + # now apply as multiple separators + numsep -= 1 + sepmap[label] = 0 + slices[label] = None + sepsizes = sepsizes[:-1] + sepslices = sepslices[:-1] + sepdists = sepdists[:-1] + for sublabel in np.unique(sublabels[labelmask]): + sublabelmask = sublabels == sublabel + sublabelsize = np.count_nonzero(sublabelmask) + sublabelslice = sublabelmask.nonzero() + sublabelslice = sl.box(sublabelslice[0].min(), + sublabelslice[0].max(), + sublabelslice[1].min(), + sublabelslice[1].max()) + subdistances = dist[sublabelmask] + nlabels += 1 + numsep += 1 + sepmap = np.append(sepmap, numsep) + labels[sublabelmask] = nlabels + slices.append(sublabelslice) + sepsizes.append(sublabelsize) + sepslices.append(sublabelslice) + sepdists.append(np.median(subdistances)) + #LOG.debug("adding sublabel %d as sep %d (size %d [%s])", sublabel, numsep, sublabelsize, str(sublabelslice)) sepsizes = np.array(sepsizes) sepslices = np.array(sepslices) LOG.debug("detected %d separator candidates", numsep) DSAVE("seps-raw", sepmap[labels]) + # now dilate+erode to link neighbouring candidates, + # but allow only such links which + # - stay consistent regarding avg/std width + # - do not enclose large areas in between + # - do not "change direction" (roughly adds up their diagonals) + # then combine mutual neighbourships to largest allowed partitions d0 = odd(max(1,scale/2)) d1 = odd(max(1,scale/4)) closed = morph.rb_closing(sepmap[labels] > 0, (d0,d1)) @@ -574,23 +653,37 @@ def compute_seplines(binary, scale, maxseps=0): corrinds[corrs[:, 0] == 0] = False # ignore bg corrinds = corrinds.nonzero()[0] if len(corrinds) == 1: - continue + continue # nothing to link + nonoverlapping = np.zeros((len(corrinds), 
len(corrinds)), dtype=np.bool) for i, indi in enumerate(corrinds[:-1]): sepi = corrs[indi, 0] labeli = np.flatnonzero(sepmap == sepi)[0] slicei = slices[labeli] lengthi = np.hypot(*sl.dims(slicei)) + areai = sl.area(slicei) for j, indj in enumerate(corrinds[i + 1:], i + 1): sepj = corrs[indj, 0] labelj = np.flatnonzero(sepmap == sepj)[0] slicej = slices[labelj] lengthj = np.hypot(*sl.dims(slicej)) - #inter = sl.intersect(slicei, slicej) + areaj = sl.area(slicej) union = sl.union(slicei, slicej) length = np.hypot(*sl.dims(union)) - if length > 0.9 * (lengthi + lengthj): - #if sl.empty(inter) or sl.area(inter) / sl.area(union) < 0.2: - corrmap[sepj] = corrmap[sepi] + if length < 0.9 * (lengthi + lengthj): + continue + if sl.area(union) > 1.3 * (areai + areaj): + continue + if not (0.8 < sepdists[sepi] / sepdists[sepj] < 1.2): + continue + inter = sl.intersect(slicei, slicej) + if (sl.empty(inter) or + (sl.area(inter) / areai < 0.2 and + sl.area(inter) / areaj < 0.2)): + nonoverlapping[i, j] = True + nonoverlapping[j, i] = True + # find largest maximal clique (i.e. fully connected subgraphs) + corrinds = corrinds[max(nx.find_cliques(nx.Graph(nonoverlapping)), key=len)] + corrmap[corrs[corrinds, 0]] = corrs[corrinds[0], 0] _, corrmap = np.unique(corrmap, return_inverse=True) # make contiguous numsep = corrmap.max() LOG.debug("linked to %d separator candidates", numsep) From ff326ca273a00ebd391c0bd29dbf16c63cd54c0d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 00:32:40 +0200 Subject: [PATCH 022/194] sepline detection linking: filter results entirely composed by tiny components --- ocrd_cis/ocropy/common.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index a195a4c1..012d55fd 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -522,6 +522,14 @@ def compute_seplines(binary, scale, maxseps=0): Returns a same-size separator label array. 
""" + # tries to find a compromise for the following issues, + # potentially occurring in combination (or all at once): + # - non-congiguous or broken lines (due to thin ink or low contrast) + # - skewed, curved or warped lines (due to non-planar photography or irregular typography) + # - very close or overlapping text (due to show-through or bad binarization) + # - superimposed fg noise (due to bad binarization) that may connect text and non-text + # - intersecting vertical and horizontal lines, even closed shapes (enclosing text) + # - line-like glyphs (i.e. false positives) if maxseps == 0: return np.zeros_like(binary, np.int) skel, dist = medial_axis(binary, return_distance=True) @@ -692,7 +700,7 @@ def union(slices): return sl.union(slices[0], union(slices[1:])) return slices[0] for sep in range(1, numsep + 1): - sepsizes[sep] = sum(sepsizes[corrmap == sep]) + sepsizes[sep] = max(sepsizes[corrmap == sep]) # sum sepslices[sep] = union(sepslices[corrmap == sep]) sepsizes = sepsizes[:numsep + 1] sepslices = sepslices[:numsep + 1] From 65eee888d19f74dddf3c47bbb60138d5e7e26994 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 00:34:24 +0200 Subject: [PATCH 023/194] sepline detection masking: spread against bg and non-separator fg --- ocrd_cis/ocropy/common.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 012d55fd..6b6eab78 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -719,11 +719,17 @@ def union(slices): ordermap[order[:maxseps]] = np.arange(1, maxseps + 1) sepmap = ordermap[sepmap] DSAVE("sep-top", sepmap[labels]) - sepseeds = morph.propagate_labels_simple(binary, sepmap[labels]) - DSAVE("seps-top-propagated", sepseeds) - # FIXME: perhaps hclose / vclose first? 
+ # spread into fg against other fg + sepseeds = sepmap[labels] + sepseeds = morph.spread_labels(sepseeds, maxdist=max(sepdists)) + sepseeds[~binary] = 0 + #labels = morph.propagate_labels_simple(binary, labels) + #DSAVE("seps-top-spread-fg", sepseeds) + # spread into bg against other fg + sepseeds[binary & (sepseeds == 0)] = maxseps + 1 seplabels = morph.spread_labels(sepseeds, maxdist=scale / 2) - DSAVE("seps-top-spread", seplabels) + seplabels[seplabels == maxseps + 1] = 0 + DSAVE("seps-top-spread-bg", seplabels) return seplabels # from ocropus-gpageseg, but with horizontal opening From 81276131abe8425a3bf67c13ce8d3fbed4604322 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 00:46:11 +0200 Subject: [PATCH 024/194] ocropy.lines2region: improve splitting by separators (fix 6d8c0d36) - when trying to partition slices by separators topologically, also treat pre-existing regions like separators, but prevent placing them into distinct partitions - when trying to partition slices by separators morphologically, also merge partitions that share any significant line labels --- ocrd_cis/ocropy/common.py | 52 +++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 6b6eab78..e98ab4a2 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -1675,12 +1675,6 @@ def finalize(): sepm = sl.cut(sepmask, box) if isinstance(mask, np.ndarray): sepm = np.where(mask, sepm, 1) - if isinstance(rlabels, np.ndarray): - # treat existing regions like separators - rlab = sl.cut(rlabels, box) - if isinstance(mask, np.ndarray): - rlab = np.where(mask, rlab, 0) - sepm = np.where(rlab, 1, sepm) # provide `partitions` for next step partitions, npartitions = 1-sepm, 1 new_partition_type = None @@ -1690,31 +1684,35 @@ def finalize(): # try to apply in this cut like another separator partitions, npartitions = morph.label(1-sepm) if npartitions > 1: - # 
first, delete partitions that have no significant line labels - splitmap = np.zeros(len(objects)+1, dtype=np.int) - for label in range(1, npartitions+1): - linecounts = np.bincount(lbin[partitions==label], minlength=len(objects)) + # delete partitions that have no significant line labels, + # merge partitions that share any significant line labels + splitmap = np.zeros((len(objects), npartitions), dtype=np.bool) + for label in range(npartitions): + linecounts = np.bincount(lbin[partitions==label+1], minlength=len(objects)) linecounts[0] = 0 # without bg # get significant line labels for this partition # (but keep insignificant non-empty labels if complete) mincounts = np.minimum(min_line * scale, np.maximum(1, bincounts)) linelabels = np.nonzero(linecounts >= mincounts)[0] if linelabels.size: - splitmap[linelabels] = label - if debug: LOG.debug(' sepmask partition %d: %s', label, str(linelabels)) + splitmap[linelabels, label] = True + if debug: LOG.debug(' sepmask partition %d: %s', label+1, str(linelabels)) else: - partitions[partitions==label] = 0 - # second, merge partitions that share any significant line labels - for label1 in range(1, npartitions+1): - if not np.any(splitmap == label1): + partitions[partitions==label+1] = 0 + if isinstance(rlabels, np.ndarray): + # keep existing regions in distinct partitions if possible + rlab = sl.cut(rlabels, box) + if isinstance(mask, np.ndarray): + rlab = np.where(mask, rlab, 0) + splitmap[np.unique(lbin[rlab>0])] = False + mergemap = np.arange(npartitions + 1) + for line in splitmap: + if not np.any(line): continue - for label2 in range(label1+1, npartitions+1): - if not np.any(splitmap == label2): - continue - if np.any((splitmap == label1) & (splitmap == label2)): - splitmap[splitmap == label2] = label1 - partitions[partitions==label2] = label1 - npartitions = len(np.setdiff1d(np.unique(splitmap), [0])) + parts = np.flatnonzero(line)+1 + mergemap[parts] = mergemap[parts[0]] + partitions = mergemap[partitions] + 
npartitions = len(np.setdiff1d(np.unique(mergemap), [0])) new_partition_type = 'splitmask' if debug: LOG.debug(' %d sepmask partitions after filtering and merging', npartitions) if partition_type != 'topological': @@ -1722,10 +1720,16 @@ def finalize(): # get current slice's line labels def find_topological(): # run only if needed (no other partition/slicing possible) - nonlocal partitions, npartitions, new_partition_type + nonlocal sepm, partitions, npartitions, new_partition_type llab = sl.cut(llabels, box) if isinstance(mask, np.ndarray): llab = np.where(mask, llab, 0) + if isinstance(rlabels, np.ndarray): + # treat existing regions like separators + rlab = sl.cut(rlabels, box) + if isinstance(mask, np.ndarray): + rlab = np.where(mask, rlab, 0) + sepm = np.where(rlab, 1, sepm) obj = [sl.intersect(o, box) for o in objects] # get current slice's foreground bin = sl.cut(binary, box) From 2849464c14bf51b06cf01a36916a48e9424dc71a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 00:58:03 +0200 Subject: [PATCH 025/194] ocrolib.morph.all_neighbors: no diagonals --- ocrd_cis/ocropy/ocrolib/morph.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 75d86b69..7d6ffc85 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -170,6 +170,20 @@ def rg_closing(image,size,origin=0): # image = r_dilation(image,size,origin=0) # return r_erosion(image,size,origin=-1) +@checks(GRAYSCALE,ABINARY2) +def rg_reconstruction(image,mask,step=1,maxsteps=None): + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2*step+1,2*step+1)) + dilated = image + while maxsteps is None or maxsteps > 0: + dilated = cv2.dilate(src=dilated, kernel=kernel) + dilated = np.where(mask, dilated, image) + # did result change? 
+ if (image == dilated).all(): + return dilated + if maxsteps: + maxsteps -= step + return dilated + @checks(SEGMENTATION) def showlabels(x,n=7): import matplotlib.pyplot as plt @@ -337,8 +351,8 @@ def all_neighbors(image, dist=1, bg=NaN): assert amin(image)>=0 u = unique(q*image+shift(image,(dist,0),order=0,cval=bg)) d = unique(q*image+shift(image,(-dist,0),order=0,cval=bg)) - l = unique(q*image+shift(image,(dist,dist),order=0,cval=bg)) - r = unique(q*image+shift(image,(-dist,dist),order=0,cval=bg)) + l = unique(q*image+shift(image,(0,dist),order=0,cval=bg)) + r = unique(q*image+shift(image,(0,-dist),order=0,cval=bg)) all = unique(r_[u,d,l,r]) all = all[all!=bg] all = c_[all//q,all%q] From 4d7a0831e9072c56375e7cd9c2fc5ed107f565b3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 08:27:29 +0200 Subject: [PATCH 026/194] sepline detection polygonization: cut inner holes open --- ocrd_cis/ocropy/segment.py | 90 +++++++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 5 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 3fc150ff..73b54fa8 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -59,7 +59,7 @@ TOOL = 'ocrd-cis-ocropy-segment' -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, reorder=True): +def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): """Convert label masks into polygon coordinates. 
Given a Numpy array of background labels ``bg_labels``, @@ -106,8 +106,88 @@ def getx(xy): label, str(conflicts)) else: bg_mask = hull - # find outer contour (parts): - contours, _ = cv2.findContours(bg_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + if open_holes: + # def plot_poly(contour, color): + # import matplotlib.pyplot as plt + # from matplotlib.patches import Polygon as PolygonPatch + # plt.figure() + # plt.imshow(fg_bin) + # plt.gca().scatter(*zip(*contour[:,0])) + # plt.gca().add_patch(PolygonPatch(contour[:,0], alpha=0.5, color=color, closed=False)) + # plt.show() + # find outer contour (parts) plus direct holes (if any) + contours = [] + cont, hier = cv2.findContours(bg_mask.astype(np.uint8), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE) + idx = 0 + while idx >= 0: + contour = cont[idx] + if len(contour) < 3: + idx = hier[0, idx, 0] + continue + #plot_poly(contour, 'red') + idx_hole = hier[0, idx, 2] + while idx_hole >= 0: + hole = cont[idx_hole] + if len(hole) < 3: + idx_hole = hier[0, idx_hole, 0] + continue + LOG.debug("label %d contour %d [%d pts] has hole %d [%d pts]", + label, idx, len(contour), idx_hole, len(hole)) + #plot_poly(hole, 'blue') + # cut child from outside... 
+ # first get nearest point on child + hole_idx = np.argmin([cv2.pointPolygonTest(contour, tuple(pt[0]), True) + for pt in hole]) + # now get nearest point on parent + # (we cannot use PolygonTest directly, because we must also interpolate + # to prevent crossing edges; at least each 10px) + contour = np.append(contour, contour[0:1], axis=0) + contour2 = np.diff(contour, axis=0) + contourtics = np.maximum(1, np.linalg.norm(contour2, axis=2).astype(np.int)[:,0] // 10) + interpol = [] + for i, ntics in enumerate(contourtics): + interpol.extend(np.array(contour[i:i+1] + + contour2[i:i+1] * + np.linspace(0, 1, ntics)[:,np.newaxis,np.newaxis], + np.int)) + interpol.append(contour[-1]) + interpol = np.array(interpol) + contourtics = np.insert(np.cumsum(contourtics), 0, 0) + assert np.all(contour == interpol[contourtics]) + interpol_idx = np.linalg.norm(interpol - hole[hole_idx], axis=2).argmin() + contour_idx = np.searchsorted(contourtics, interpol_idx) + if interpol_idx in contourtics: + contour_idx2 = contour_idx + 1 + else: + contour_idx2 = contour_idx + if contour_idx2 >= len(contour): + contour_idx2 = 0 + cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx+1] + if interpol_idx == 0: + diff1 = (interpol[-1:] - cispoint1) // 5 + else: + diff1 = (interpol[interpol_idx-1:interpol_idx] - cispoint1) // 5 + if interpol_idx + 1 >= len(interpol): + diff2 = (interpol[0:1] - cispoint2) // 5 + else: + diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 + cispoint1 = cispoint1 + diff1 + cispoint2 = cispoint2 + diff2 + LOG.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) + # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) + # (this works, because inner contours have inverse direction) + contour = np.concatenate([contour[:contour_idx], cispoint1, + hole[hole_idx:], hole[:hole_idx], + cispoint2, contour[contour_idx:]]) + #plot_poly(contour, 'green') + idx_hole = hier[0, idx_hole, 0] + 
#plot_poly(contour, 'red') + LOG.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) + contours.append(contour) + idx = hier[0, idx, 0] + else: + # find outer contour (parts): + contours, _ = cv2.findContours(bg_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) # determine areas of parts: areas = [cv2.contourArea(contour) for contour in contours] total_area = sum(areas) @@ -669,7 +749,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # find contours around region labels (can be non-contiguous): sep_polygons, _ = masks2polygons(seplines, None, element_bin, '%s "%s"' % (element_name, element_id), - reorder=False) + open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -822,7 +902,7 @@ def join_polygons(polygons, scale=20): pairs = itertools.combinations(range(npoly), 2) dists = np.eye(npoly, dtype=float) for i, j in pairs: - dists[i, j] = polygons[i].distance(polygons[j]) + dists[i, j] = polygons[i].distance(polygons[j]) dists[j, i] = dists[i, j] dists = minimum_spanning_tree(dists, overwrite=True) # add bridge polygons (where necessary) From c90b29f4c6f3369b5eecae1617903dada14a3553 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 9 Jun 2022 10:34:57 +0200 Subject: [PATCH 027/194] re/segment: join_polygons: fix b490d3f8 imports --- ocrd_cis/ocropy/segment.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 73b54fa8..f9948579 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,7 +1,7 @@ from __future__ import absolute_import import os.path -from itertools import chain +import itertools import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree from skimage import draw @@ -887,11 +887,11 @@ def diff_polygons(poly1, poly2): poly = 
make_valid(poly) return poly -def join_polygons(polygons, scale=20): +def join_polygons(polygons, loc='', scale=20): """construct concave hull (alpha shape) from input polygons""" # compoundp = unary_union(polygons) # jointp = compoundp.convex_hull - polygons = list(chain.from_iterable([ + polygons = list(itertools.chain.from_iterable([ poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] else [poly] for poly in polygons])) From 2ec107eea4fa29f672641c5ea55616e958330437 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:34:23 +0100 Subject: [PATCH 028/194] re/segment: join_polygons: connect touching neighbours, too --- ocrd_cis/ocropy/segment.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index f9948579..43005088 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -902,8 +902,11 @@ def join_polygons(polygons, loc='', scale=20): pairs = itertools.combinations(range(npoly), 2) dists = np.eye(npoly, dtype=float) for i, j in pairs: - dists[i, j] = polygons[i].distance(polygons[j]) - dists[j, i] = dists[i, j] + dist = polygons[i].distance(polygons[j]) + if dist == 0: + dist = 1e-5 # if pair merely touches, we still need to get an edge + dists[i, j] = dist + dists[j, i] = dist dists = minimum_spanning_tree(dists, overwrite=True) # add bridge polygons (where necessary) for prevp, nextp in zip(*dists.nonzero()): From b2aba78d072486edf8cc441b9a9dd6543fe91937 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:37:42 +0100 Subject: [PATCH 029/194] re/segment: join_baselines: for complex subtypes, apply recursively --- ocrd_cis/ocropy/segment.py | 39 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 43005088..6f7848ae 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -927,40 
+927,33 @@ def join_polygons(polygons, loc='', scale=20): def join_baselines(baselines, loc=''): LOG = getLogger('processor.OcropyResegment') result = [] - for baseline in baselines: - if (baseline.is_empty or - baseline.type in ['Point', 'MultiPoint']): - continue + def add_baseline(baseline): + nonlocal result base_x = [pt[0] for pt in result] base_left = min(base_x, default=0) base_right = max(base_x, default=0) left = baseline.bounds[0] right = baseline.bounds[2] - if (baseline.type == 'GeometryCollection' or - baseline.type.startswith('Multi')): - # heterogeneous result: filter point - for geom in baseline.geoms: - if geom.type == 'Point': - continue - left = geom.bounds[0] - right = geom.bounds[2] - if left > base_right: - result.extend(geom.coords) - base_right = right - elif right < base_left: - result = list(geom.coords) + result - base_left = left - else: - LOG.warning("baseline part component crosses existing x in %s", loc) - continue - elif left > base_right: + if baseline.coords[0][0] > baseline.coords[-1][0]: + baseline.coords = list(baseline.coords[::-1]) + if left > base_right: result.extend(baseline.coords) elif right < base_left: result = list(baseline.coords) + result else: LOG.warning("baseline part crosses existing x in %s", loc) - continue + return assert all(p1[0] < p2[0] for p1, p2 in zip(result[:-1], result[1:])), result + for baseline in baselines: + if (baseline.is_empty or + baseline.type in ['Point', 'MultiPoint']): + continue + if (baseline.type == 'GeometryCollection' or + baseline.type.startswith('Multi')): + for geom in baseline.geoms: + add_baseline(geom) + continue + add_baseline(baseline) if not len(result): return None return LineString(result) From 77d60ca5006caffd406a29c67acd964933558efc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:40:12 +0100 Subject: [PATCH 030/194] re/segment: join_baselines: skip lines outside of polygon --- ocrd_cis/ocropy/segment.py | 6 +++++- 1 file changed, 5 insertions(+), 
1 deletion(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 6f7848ae..cfa99e62 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -221,6 +221,9 @@ def getx(xy): #LOG.debug(polygon.wkt) LOG.debug(explain_validity(polygon)) polygon = make_valid(polygon) + if not polygon.is_valid: + #LOG.debug(polygon.wkt) + LOG.warning(explain_validity(polygon)) poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: LOG.warning('Label %d contour %d for %s has less than 4 points', label, i, name) @@ -229,7 +232,8 @@ def getx(xy): # and concatenate them from left to right if baselines is not None: base = join_baselines([baseline.intersection(polygon) - for baseline in baselines], name) + for baseline in baselines + if baseline.intersects(polygon)], name) if base is not None: base = base.coords else: From 8c986be4246a9e7c2a3123e6b76862bfeed8ed7c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:41:43 +0100 Subject: [PATCH 031/194] re/segment: improve polygon simplification --- ocrd_cis/ocropy/segment.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index cfa99e62..e83dcacd 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -869,17 +869,18 @@ def make_intersection(poly1, poly2): return interp def make_valid(polygon): - for split in range(1, len(polygon.exterior.coords)-1): + points = list(polygon.exterior.coords) + for split in range(1, len(points)): if polygon.is_valid or polygon.simplify(polygon.area).is_valid: break # simplification may not be possible (at all) due to ordering # in that case, try another starting point - polygon = Polygon(polygon.exterior.coords[-split:]+polygon.exterior.coords[:-split]) - for tolerance in range(1, int(polygon.area)): + polygon = Polygon(points[-split:]+points[:-split]) + for tolerance in range(int(polygon.area)): if polygon.is_valid: break # 
simplification may require a larger tolerance - polygon = polygon.simplify(tolerance) + polygon = polygon.simplify(tolerance + 1) return polygon def diff_polygons(poly1, poly2): From 0acc6f292dfb69c4f91660b43a2d30c36484815e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:45:05 +0100 Subject: [PATCH 032/194] resegment: list instead of generator --- ocrd_cis/ocropy/resegment.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index f96f2750..3bb270d0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -297,8 +297,9 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l min_area=640/zoom/zoom) DSAVE('line_labels', [np.mean(line_labels, axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) - new_line_polygons, new_baselines = zip(*[(make_valid(Polygon(line_poly)), LineString(baseline)) - for _, line_poly, baseline in new_line_polygons]) + new_line_polygons, new_baselines = list(zip(*[ + (make_valid(Polygon(line_poly)), LineString(baseline)) + for _, line_poly, baseline in new_line_polygons])) or ([], []) # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line From 38206f0fc138077ecc1e3dd0d3409293e9807c85 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:46:56 +0100 Subject: [PATCH 033/194] adapt to Numpy 1.24 dtypes --- ocrd_cis/ocropy/common.py | 36 +++++++++++++++--------------- ocrd_cis/ocropy/ocrolib/lineest.py | 2 +- ocrd_cis/ocropy/resegment.py | 22 +++++++++--------- ocrd_cis/ocropy/segment.py | 18 +++++++-------- 4 files changed, 39 insertions(+), 39 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index e98ab4a2..86372eeb 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -463,7 +463,7 @@ def compute_images(binary, scale, maximages=5): Returns a 
same-size image label array. """ if maximages == 0: - return np.zeros_like(binary, np.int) + return np.zeros_like(binary, int) images = binary # d0 = odd(max(2,scale/5)) # d1 = odd(max(2,scale/8)) @@ -475,7 +475,7 @@ def compute_images(binary, scale, maximages=5): images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) DSAVE('images1_large', images+0.6*binary) if not images.any(): - return np.zeros_like(binary, np.int) + return np.zeros_like(binary, int) # 2- open horizontally and vertically to suppress # v/h-lines; these will be detected separately, # and it is dangerous to combine them into one @@ -500,7 +500,7 @@ def compute_images(binary, scale, maximages=5): images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) DSAVE('images5_selected', images+0.6*binary) if not images.any(): - return np.zeros_like(binary, np.int) + return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps dilated = morph.r_dilation(images, (odd(scale),odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) @@ -531,7 +531,7 @@ def compute_seplines(binary, scale, maxseps=0): # - intersecting vertical and horizontal lines, even closed shapes (enclosing text) # - line-like glyphs (i.e. 
false positives) if maxseps == 0: - return np.zeros_like(binary, np.int) + return np.zeros_like(binary, int) skel, dist = medial_axis(binary, return_distance=True) DSAVE("medial-axis", [dist, skel]) labels, nlabels = morph.label(skel) @@ -539,7 +539,7 @@ def compute_seplines(binary, scale, maxseps=0): DSAVE("skel-labels", labels) # determine those components which could be separators # (filter by compactness, and by mean+variance of distances) - sepmap = np.zeros(nlabels + 1, np.int) + sepmap = np.zeros(nlabels + 1, int) numsep = 0 sepsizes = [0] sepslices = [None] @@ -662,7 +662,7 @@ def compute_seplines(binary, scale, maxseps=0): corrinds = corrinds.nonzero()[0] if len(corrinds) == 1: continue # nothing to link - nonoverlapping = np.zeros((len(corrinds), len(corrinds)), dtype=np.bool) + nonoverlapping = np.zeros((len(corrinds), len(corrinds)), dtype=bool) for i, indi in enumerate(corrinds[:-1]): sepi = corrs[indi, 0] labeli = np.flatnonzero(sepmap == sepi)[0] @@ -715,7 +715,7 @@ def union(slices): if np.any(minsize): maxseps = min(maxseps, minsize[0]) maxseps = min(maxseps, numsep) - ordermap = np.zeros(numsep + 1, np.int) + ordermap = np.zeros(numsep + 1, int) ordermap[order[:maxseps]] = np.arange(1, maxseps + 1) sepmap = ordermap[sepmap] DSAVE("sep-top", sepmap[labels]) @@ -1163,7 +1163,7 @@ def h_compatible(obj1, obj2, center1, center2): label1_y, label1_x = np.where(seeds == label) label2_y, label2_x = np.where(seed2) shared_y = np.intersect1d(label1_y, label2_y) - gap = np.zeros_like(seed2, np.bool) + gap = np.zeros_like(seed2, bool) for y in shared_y: can_x_min = label2_x[label2_y == y][0] can_x_max = label2_x[label2_y == y][-1] @@ -1407,7 +1407,7 @@ def compute_baselines(bottom, top, linelabels, scale, method='bottom'): if len(corrinds) == 1: labelmap.setdefault(line, list()).append(corrs[corrinds[0], 1]) continue - nonoverlapping = ~np.eye(len(corrinds), dtype=np.bool) + nonoverlapping = ~np.eye(len(corrinds), dtype=bool) for i, indi in 
enumerate(corrinds[:-1]): baselabeli = corrs[indi, 1] baseslicei = baseslices[baselabeli] @@ -1577,7 +1577,7 @@ def lines2regions(binary, llabels, bincounts = np.bincount(lbinary.flatten()) LOG.debug('combining lines to regions') - relabel = np.zeros(np.amax(llabels)+1, np.int) + relabel = np.zeros(np.amax(llabels)+1, int) num_regions = 0 def recursive_x_y_cut(box, mask=None, partition_type=None, debug=False): """Split lbinary at horizontal or vertical gaps recursively. @@ -1624,7 +1624,7 @@ def finalize(): llab = sl.cut(llabels, box) if isinstance(mask, np.ndarray): llab = np.where(mask, llab, 0) - linelabels0 = np.zeros(llabels.max()+1, dtype=np.bool) + linelabels0 = np.zeros(llabels.max()+1, dtype=bool) linelabels0[linelabels] = True llab *= linelabels0[llab] newregion = rlab.max()+1 @@ -1686,7 +1686,7 @@ def finalize(): if npartitions > 1: # delete partitions that have no significant line labels, # merge partitions that share any significant line labels - splitmap = np.zeros((len(objects), npartitions), dtype=np.bool) + splitmap = np.zeros((len(objects), npartitions), dtype=bool) for label in range(npartitions): linecounts = np.bincount(lbin[partitions==label+1], minlength=len(objects)) linecounts[0] = 0 # without bg @@ -1753,8 +1753,8 @@ def find_topological(): linelabels = np.setdiff1d(np.unique(lbin), [0]) nlines = linelabels.max() + 1 # find pairs of lines above each other with a separator next to them - leftseps = np.zeros((nlines, nseps), np.bool) - rghtseps = np.zeros((nlines, nseps), np.bool) + leftseps = np.zeros((nlines, nseps), bool) + rghtseps = np.zeros((nlines, nseps), bool) for line in linelabels: for i, sep in enumerate(sepobj): if sep is None: @@ -1775,7 +1775,7 @@ def find_topological(): if not np.any(trueseps): return if debug: LOG.debug("trueseps: %s", str(trueseps)) - neighbours = np.zeros((nlines, nlines), np.bool) + neighbours = np.zeros((nlines, nlines), bool) for i in linelabels: for j in linelabels[i+1:]: if sl.yoverlap_rel(obj[i], 
obj[j]) > 0.5: @@ -1791,7 +1791,7 @@ def find_topological(): # group neighbours by adjacency (i.e. put any contiguous pairs # of such line labels into the same group) nlabels = llab.max() + 1 - splitmap = np.zeros(nlabels, dtype=np.int) + splitmap = np.zeros(nlabels, dtype=int) for i, j in zip(*neighbours.nonzero()): if splitmap[i] > 0: splitmap[j] = splitmap[i] @@ -1879,8 +1879,8 @@ def find_topological(): if not gaps.shape[0]: continue for start, stop, height in sorted(zip( - props['left_ips'].astype(np.int), - props['right_ips'].astype(np.int), + props['left_ips'].astype(int), + props['right_ips'].astype(int), props['peak_heights']), key=lambda x: x[2]): if is_horizontal: llab[box[0].start+int(scale/2):box[0].stop-int(scale/2),box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) diff --git a/ocrd_cis/ocropy/ocrolib/lineest.py b/ocrd_cis/ocropy/ocrolib/lineest.py index 669b12ca..42ef2237 100644 --- a/ocrd_cis/ocropy/ocrolib/lineest.py +++ b/ocrd_cis/ocropy/ocrolib/lineest.py @@ -31,7 +31,7 @@ def check(self,line, max_ignore=0.02): #DSAVE('lineest check 1 dilated', smoothed + 0.5*line) smoothed = filters.gaussian_filter(smoothed, (1, h//10), mode='constant') # 2 #DSAVE('lineest check 2 smoothed', smoothed + 0.5*line) - smoothed = np.array(smoothed > np.median(smoothed), dtype=np.float) # 3 # or 0.05 instead of median? + smoothed = np.array(smoothed > np.median(smoothed), dtype=float) # 3 # or 0.05 instead of median? 
#DSAVE('lineest check 3 thresholded', smoothed + 0.5*line) smoothed = filters.minimum_filter(smoothed, (2, h//5)) # 4: undo 1/2 #DSAVE('lineest check 4 eroded', smoothed + 0.5*line) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 3bb270d0..5b5c37b4 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -188,8 +188,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # prepare line segmentation parent_array = pil2array(parent_image) #parent_array, _ = common.binarize(parent_array, maxskew=0) # just in case still raw - parent_bin = np.array(parent_array <= midrange(parent_array), np.bool) - ignore_bin = np.ones_like(parent_bin, np.bool) + parent_bin = np.array(parent_array <= midrange(parent_array), bool) + ignore_bin = np.ones_like(parent_bin, bool) if isinstance(parent, PageType): tag = 'page' fullpage = True @@ -203,14 +203,14 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l page_id if fullpage else parent.id, report) return # get existing line labels: - line_labels = np.zeros_like(parent_bin, np.bool) + line_labels = np.zeros_like(parent_bin, bool) line_labels = np.tile(line_labels[np.newaxis], (len(lines), 1, 1)) line_polygons = [] for i, segment in enumerate(lines): segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) line_polygons.append(prep(segment_polygon)) - segment_polygon = np.array(segment_polygon.exterior.coords, np.int)[:-1] + segment_polygon = np.array(segment_polygon.exterior.coords, int)[:-1] # draw.polygon: If any segment_polygon lies outside of parent # (causing negative/above-max indices), either fully or partially, # then this will silently ignore them. 
The caller does not need @@ -225,7 +225,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l segment.id, page_id if fullpage else parent.id) segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) - segment_polygon = np.array(segment_polygon.exterior.coords, np.int)[:-1] + segment_polygon = np.array(segment_polygon.exterior.coords, int)[:-1] ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = False @@ -273,7 +273,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # left-hand side if left-to-right, and vice versa scale * (-1) ** line_ltr, single_sided=True)], loc=line.id, scale=scale)) - line_polygon = np.array(line_polygon.exterior.coords, np.int)[:-1] + line_polygon = np.array(line_polygon.exterior.coords, int)[:-1] line_y, line_x = draw.polygon(line_polygon[:, 1], line_polygon[:, 0], parent_bin.shape) @@ -303,11 +303,11 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line - fits_bg = np.zeros((len(new_line_polygons), len(line_polygons)), np.float) - fits_fg = np.zeros((len(new_line_polygons), len(line_polygons)), np.float) + fits_bg = np.zeros((len(new_line_polygons), len(line_polygons)), float) + fits_fg = np.zeros((len(new_line_polygons), len(line_polygons)), float) # ratio of overlap between intersection and existing line - covers_bg = np.zeros((len(new_line_polygons), len(line_polygons)), np.float) - covers_fg = np.zeros((len(new_line_polygons), len(line_polygons)), np.float) + covers_bg = np.zeros((len(new_line_polygons), len(line_polygons)), float) + covers_fg = np.zeros((len(new_line_polygons), len(line_polygons)), float) # compare segmentations, calculating ratios of overlapping fore/background area for i, new_line_poly 
in enumerate(new_line_polygons): for j, line_poly in enumerate(line_polygons): @@ -333,7 +333,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # fits_bg[i,j]*100, covers_bg[i,j]*100, # fits_fg[i,j]*100, covers_fg[i,j]*100) # assign new lines to existing lines, if possible - assignments = np.ones(len(new_line_polygons), np.int) * -1 + assignments = np.ones(len(new_line_polygons), int) * -1 for i, new_line_poly in enumerate(new_line_polygons): if not fits_bg[i].any(): LOG.debug("new line %d fits no existing line's background", i) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index e83dcacd..34182f20 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -90,7 +90,7 @@ def getx(xy): if not label: # ignore if background continue - bg_mask = np.array(bg_labels == label, np.bool) + bg_mask = np.array(bg_labels == label, bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground LOG.debug('skipping label %d in %s due to empty fg', @@ -98,7 +98,7 @@ def getx(xy): continue # simplify to convex hull if simplify is not None: - hull = convex_hull_image(bg_mask.astype(np.uint8)).astype(np.bool) + hull = convex_hull_image(bg_mask.astype(np.uint8)).astype(bool) conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): @@ -136,20 +136,20 @@ def getx(xy): #plot_poly(hole, 'blue') # cut child from outside... 
# first get nearest point on child - hole_idx = np.argmin([cv2.pointPolygonTest(contour, tuple(pt[0]), True) + hole_idx = np.argmin([cv2.pointPolygonTest(contour, pt[0].tolist(), True) for pt in hole]) # now get nearest point on parent # (we cannot use PolygonTest directly, because we must also interpolate # to prevent crossing edges; at least each 10px) contour = np.append(contour, contour[0:1], axis=0) contour2 = np.diff(contour, axis=0) - contourtics = np.maximum(1, np.linalg.norm(contour2, axis=2).astype(np.int)[:,0] // 10) + contourtics = np.maximum(1, np.linalg.norm(contour2, axis=2).astype(int)[:,0] // 10) interpol = [] for i, ntics in enumerate(contourtics): interpol.extend(np.array(contour[i:i+1] + contour2[i:i+1] * np.linspace(0, 1, ntics)[:,np.newaxis,np.newaxis], - np.int)) + int)) interpol.append(contour[-1]) interpol = np.array(interpol) contourtics = np.insert(np.cumsum(contourtics), 0, 0) @@ -537,9 +537,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, LOG.warning("Skipping '%s' with zero size", element_id) return element_array = pil2array(image) - element_bin = np.array(element_array <= midrange(element_array), np.bool) - sep_bin = np.zeros_like(element_bin, np.bool) - ignore_labels = np.zeros_like(element_bin, np.int) + element_bin = np.array(element_array <= midrange(element_array), bool) + sep_bin = np.zeros_like(element_bin, bool) + ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): LOG.debug('masking foreground of %s "%s" for "%s"', type(segment).__name__[:-4], segment.id, element_id) @@ -778,7 +778,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, else: # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) - region_mask = np.zeros_like(element_bin, np.bool) + region_mask = np.zeros_like(element_bin, bool) region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True 
From 6eed14cb5ca71c16b56c6c5985ca853680f9fa9e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 25 Mar 2023 02:47:17 +0100 Subject: [PATCH 034/194] adapt to Shapely 2.0 deprecations --- ocrd_cis/ocropy/segment.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 34182f20..478d1c05 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -855,10 +855,10 @@ def make_intersection(poly1, poly2): # post-process if interp.is_empty or interp.area == 0.0: return None - if interp.type == 'GeometryCollection': + if interp.geom_type == 'GeometryCollection': # heterogeneous result: filter zero-area shapes (LineString, Point) interp = unary_union([geom for geom in interp.geoms if geom.area > 0]) - if interp.type == 'MultiPolygon': + if interp.geom_type == 'MultiPolygon': # homogeneous result: construct convex hull to connect interp = join_polygons(interp.geoms) if interp.minimum_clearance < 1.0: @@ -885,7 +885,7 @@ def make_valid(polygon): def diff_polygons(poly1, poly2): poly = poly1.difference(poly2) - if poly.type == 'MultiPolygon': + if poly.geom_type == 'MultiPolygon': poly = poly.convex_hull if poly.minimum_clearance < 1.0: poly = Polygon(np.round(poly.exterior.coords)) @@ -897,7 +897,7 @@ def join_polygons(polygons, loc='', scale=20): # compoundp = unary_union(polygons) # jointp = compoundp.convex_hull polygons = list(itertools.chain.from_iterable([ - poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection'] + poly.geoms if poly.geom_type in ['MultiPolygon', 'GeometryCollection'] else [poly] for poly in polygons])) npoly = len(polygons) @@ -921,7 +921,7 @@ def join_polygons(polygons, loc='', scale=20): bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) polygons.append(bridgep) jointp = unary_union(polygons) - assert jointp.type == 'Polygon', jointp.wkt + assert jointp.geom_type == 'Polygon', jointp.wkt if 
jointp.minimum_clearance < 1.0: # follow-up calculations will necessarily be integer; # so anticipate rounding here and then ensure validity @@ -951,10 +951,10 @@ def add_baseline(baseline): assert all(p1[0] < p2[0] for p1, p2 in zip(result[:-1], result[1:])), result for baseline in baselines: if (baseline.is_empty or - baseline.type in ['Point', 'MultiPoint']): + baseline.geom_type in ['Point', 'MultiPoint']): continue - if (baseline.type == 'GeometryCollection' or - baseline.type.startswith('Multi')): + if (baseline.geom_type == 'GeometryCollection' or + baseline.geom_type.startswith('Multi')): for geom in baseline.geoms: add_baseline(geom) continue From 2bf18e0e786683d857ca60a00ced26195a1811d4 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 14 Apr 2023 10:04:36 +0200 Subject: [PATCH 035/194] check_page: double max page size to 20k by 20k pixels --- ocrd_cis/ocropy/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index d84e42b3..728fabb1 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -370,9 +370,9 @@ def check_page(binary, zoom=1.0): if np.mean(binary)10000/zoom: return "image too tall for a page image %s"%(binary.shape,) + if h>20000/zoom: return "image too tall for a page image %s"%(binary.shape,) if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) - if w>10000/zoom: return "image too wide for a page image %s"%(binary.shape,) + if w>20000/zoom: return "image too wide for a page image %s"%(binary.shape,) # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) From cd08aab3d1e3352e5c838f0c0b203de9e27e8f8b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 May 2023 21:24:36 +0200 Subject: [PATCH 036/194] check_page/region/line: skip assumptions on number of components --- ocrd_cis/ocropy/common.py | 3 +++ 
1 file changed, 3 insertions(+) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 86372eeb..189c0db5 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -319,6 +319,7 @@ def check_line(binary, zoom=1.0): ##if w<1.5*h: return "line too short %s"%(binary.shape,) if w<1.5*h and w<32/zoom: return "image too short for a line image %s"%(binary.shape,) if w>4000/zoom: return "image too long for a line image %s"%(binary.shape,) + return None ratio = w*1.0/h _, ncomps = measurements.label(binary) lo = int(0.5*ratio+0.5) @@ -348,6 +349,7 @@ def check_region(binary, zoom=1.0): if h>5000/zoom: return "image too tall for a region image %s"%(binary.shape,) if w<100/zoom: return "image too narrow for a region image %s"%(binary.shape,) if w>5000/zoom: return "image too wide for a region image %s"%(binary.shape,) + return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) @@ -375,6 +377,7 @@ def check_page(binary, zoom=1.0): if h>10000/zoom: return "image too tall for a page image %s"%(binary.shape,) if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) if w>10000/zoom: return "image too wide for a page image %s"%(binary.shape,) + return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) From 70b21919f2f62dd8152992ba4d73d9d0f16f575b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 May 2023 21:25:55 +0200 Subject: [PATCH 037/194] resegment: add param baseline_only --- ocrd_cis/ocrd-tool.json | 7 ++++++- ocrd_cis/ocropy/resegment.py | 26 +++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index 74c0d0c9..e82a1c75 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -191,7 +191,7 
@@ "output_file_grp": [ "OCR-D-SEG-LINE" ], - "description": "Resegment text lines", + "description": "Improve coordinates of text lines", "parameters": { "level-of-operation": { "type": "string", @@ -205,6 +205,11 @@ "description": "source for new line polygon candidates ('lineest' for line estimation, i.e. how Ocropy would have segmented text lines; 'baseline' tries to re-polygonize from the baseline annotation; 'ccomps' avoids crossing connected components by majority rule)", "default": "lineest" }, + "baseline_only": { + "type": "boolean", + "description": "ignore existing textline coords completely and use baseline as input if possible", + "default": false + }, "dpi": { "type": "number", "format": "float", diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 5b5c37b4..9242773d 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -207,8 +207,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_labels = np.tile(line_labels[np.newaxis], (len(lines), 1, 1)) line_polygons = [] for i, segment in enumerate(lines): - segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) + if self.parameter['baseline_only'] and segment.Baseline: + segment_baseline = baseline_of_segment(segment, parent_coords) + segment_polygon = polygon_from_baseline(segment_baseline, 30/zoom) + else: + segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) + segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) line_polygons.append(prep(segment_polygon)) segment_polygon = np.array(segment_polygon.exterior.coords, int)[:-1] # draw.polygon: If any segment_polygon lies outside of parent @@ -267,12 +271,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l LOG.warning("Skipping '%s' without baseline", line.id) new_labels[line_labels[i]] = i + 1 
continue - line_polygon = baseline_of_segment(line, parent_coords) - line_ltr = line_polygon[0,0] < line_polygon[-1,0] - line_polygon = make_valid(join_polygons([LineString(line_polygon).buffer( - # left-hand side if left-to-right, and vice versa - scale * (-1) ** line_ltr, single_sided=True)], - loc=line.id, scale=scale)) + line_baseline = baseline_of_segment(line, parent_coords) + line_polygon = polygon_from_baseline(line_baseline, scale) line_polygon = np.array(line_polygon.exterior.coords, int)[:-1] line_y, line_x = draw.polygon(line_polygon[:, 1], line_polygon[:, 0], @@ -460,7 +460,7 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, continue count = np.count_nonzero(old_label * binarized) if not count: - LOG.warning("skipping binarizy-empty line '%s'", line.id) + LOG.warning("skipping binary-empty line '%s'", line.id) continue covers = np.count_nonzero(new_label * binarized) / count if covers < threshold: @@ -494,3 +494,11 @@ def baseline_of_segment(segment, coords): line = transform_coordinates(line, coords['transform']) return np.round(line).astype(np.int32) +# zzz should go into core ocrd_utils +def polygon_from_baseline(baseline, scale): + ltr = baseline[0,0] < baseline[-1,0] + # left-hand side if left-to-right, and vice versa + polygon = make_valid(join_polygons([LineString(baseline).buffer(scale * (-1) ** ltr, + single_sided=True)], + scale=scale)) + return polygon From 35227a9bdc1977af7b2e51940f47516bcd1e10f0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 30 May 2023 21:26:38 +0200 Subject: [PATCH 038/194] resegment (baseline/ccomps): improve handling of fg conflicts --- ocrd_cis/ocropy/resegment.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 9242773d..ad05792e 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -2,7 +2,7 @@ import os.path import numpy as np -from 
skimage import draw +from skimage import draw, segmentation from shapely.geometry import Polygon, LineString from shapely.prepared import prep from shapely.ops import unary_union @@ -429,20 +429,22 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, scale=43, loc='', threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" LOG = getLogger('processor.OcropyResegment') - DSAVE('baseline-seeds', [new_labels, (components>0)]) - # allocate to connected components consistently (by majority, - # ignoring smallest components like punctuation) - #new_labels = morph.propagate_labels_majority(binarized, new_labels) - new_labels = morph.propagate_labels_majority(components > 0, new_labels) - DSAVE('majority-propagated', [new_labels, (components>0) & (new_labels==0)]) + DSAVE('seeds', [new_labels, (components>0)]) + # allocate to connected components consistently + # (ignoring smallest components like punctuation) + # but when there are conflicts, meet in the middle via watershed + new_labels2 = morph.propagate_labels(components > 0, new_labels, conflict=0) + new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=(components > 0)) + DSAVE('propagated', new_labels2) # dilate/grow labels from connected components against each other and bg - new_labels = morph.spread_labels(new_labels, maxdist=scale*2) - DSAVE('scale-spread', [new_labels, (components>0)]) + new_labels = morph.spread_labels(new_labels2, maxdist=scale*2) + DSAVE('spread', new_labels) # now propagate again to catch smallest components like punctuation - new_labels = morph.propagate_labels_majority(components > 0, new_labels) - DSAVE('propagated-again', [new_labels, (components>0) & (new_labels==0)]) - new_labels = morph.spread_labels(new_labels, maxdist=scale/2) - DSAVE('spread-again', [new_labels, (components>0)]) + new_labels2 = morph.propagate_labels(binarized, new_labels, conflict=0) + new_labels2 = 
segmentation.watershed(new_labels2, markers=new_labels, mask=binarized) + DSAVE('propagated-again', [new_labels2, binarized & (new_labels2==0)]) + new_labels = morph.spread_labels(new_labels2, maxdist=scale/2) + DSAVE('spread-again', [new_labels, binarized]) # find polygon hull and modify line coords for i, line in enumerate(lines): new_label = new_labels == i + 1 From 1abc3b7b617b1c342908e6b69f6e706a14fc666f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 2 Jun 2023 03:50:12 +0200 Subject: [PATCH 039/194] segment: adapt to OpenCV changes --- ocrd_cis/ocropy/segment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 478d1c05..9e2a6ee3 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -136,7 +136,7 @@ def getx(xy): #plot_poly(hole, 'blue') # cut child from outside... # first get nearest point on child - hole_idx = np.argmin([cv2.pointPolygonTest(contour, pt[0].tolist(), True) + hole_idx = np.argmin([cv2.pointPolygonTest(contour, tuple(pt[0].tolist()), True) for pt in hole]) # now get nearest point on parent # (we cannot use PolygonTest directly, because we must also interpolate From 4c5542208fe2a65b5529e629c9a84d94c74fe705 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Jun 2023 23:09:41 +0200 Subject: [PATCH 040/194] ocrd-tool: rm old ocrd-cis-ocropy-rec (gone in 9e20991) --- ocrd_cis/ocrd-tool.json | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index e4224263..f518c051 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -302,26 +302,6 @@ } } }, - "ocrd-cis-ocropy-rec": { - "executable": "ocrd-cis-ocropy-rec", - "categories": [ - "Text recognition and optimization" - ], - "steps": [ - "recognition/text-recognition" - ], - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], - "description": "Recognize text snippets", - 
"parameters": { - "model": { - "type": "string", - "description": "ocropy model to apply (e.g. fraktur.pyrnn)" - } - } - }, "ocrd-cis-ocropy-segment": { "executable": "ocrd-cis-ocropy-segment", "categories": [ From 4c9ad27d1ae088f6df3c08fe126bc462977e7fd9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Jun 2023 01:55:14 +0200 Subject: [PATCH 041/194] =?UTF-8?q?ocropy-train:=20improve/update=20OCR-D?= =?UTF-8?q?=20wrapper=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - discern constructor vs. processing ctxt (initLogging, setup) - resolve start model via resmgr - get all text regions (recursively) - use true tempdir for extracted files - use CWD for model output paths (instead of dist dir) - use binarized derived image, if possible - use ocropy nlbin (instead of OpenCV thresholding), otherwise - skip segment if no/empty text transcription - simplify/deduplify hierarchy levels - improve logging and docs --- ocrd_cis/ocrd-tool.json | 10 +- ocrd_cis/ocropy/train.py | 245 ++++++++++++++------------------------- 2 files changed, 97 insertions(+), 158 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index f518c051..953ea1f8 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -298,7 +298,9 @@ }, "model": { "type": "string", - "description": "ocropy model to apply (e.g. fraktur.pyrnn)" + "format": "uri", + "content-type": "application/gzip", + "description": "ocropy model to apply (e.g. fraktur.pyrnn.gz)" } } }, @@ -418,13 +420,15 @@ "parameters": { "textequiv_level": { "type": "string", - "description": "PAGE XML hierarchy level granularity", + "description": "hierarchy level to extract GT pairs from", "enum": ["line", "word", "glyph"], "default": "line" }, "model": { "type": "string", - "description": "load model or create new one (e.g. fraktur.pyrnn)" + "format": "uri", + "content-type": "application/gzip", + "description": "load model (e.g. 
'fraktur.pyrnn.gz') to init weights, or none to train from scratch" }, "ntrain": { "type": "number", diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 4427d47c..ceb26d21 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,25 +1,16 @@ from __future__ import absolute_import -import sys, os.path, cv2 +import sys +import os +import tempfile + from ocrd_modelfactory import page_from_file from ocrd import Processor from ocrd_utils import getLogger from ocrd_cis import get_ocrd_tool -sys.path.append(os.path.dirname(os.path.abspath(__file__))) - - from .ocropus_rtrain import * - -np.seterr(divide='raise',over='raise',invalid='raise',under='ignore') - - - - -def bounding_box(coord_points): - point_list = [[int(p) for p in pair.split(',')] for pair in coord_points.split(' ')] - x_coordinates, y_coordinates = zip(*point_list) - return (min(x_coordinates), min(y_coordinates), max(x_coordinates), max(y_coordinates)) +from .binarize import binarize def deletefiles(filelist): @@ -36,172 +27,116 @@ def resize_keep_ratio(image, baseheight=48): return image -def binarize(pil_image): - # Convert RGB to OpenCV - img = cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2GRAY) - - # global thresholding - #ret1,th1 = cv2.threshold(img,127,255,cv2.THRESH_BINARY) - - # Otsu's thresholding - #ret2,th2 = cv2.threshold(img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - - # Otsu's thresholding after Gaussian filtering - blur = cv2.GaussianBlur(img,(5,5),0) - ret3,th3 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) - - bin_img = Image.fromarray(th3) - return bin_img - - - class OcropyTrain(Processor): def __init__(self, *args, **kwargs): - self.log = getLogger('OcropyTrain') + self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-ocropy-train'] kwargs['version'] = ocrd_tool['version'] super(OcropyTrain, self).__init__(*args, **kwargs) + if hasattr(self, 'input_file_grp'): + # processing 
context + self.setup() - - def process(self): - """ - Performs the training - """ + def setup(self): + self.log = getLogger('processor.OcropyTrain') #print(self.parameter) - if self.parameter['textequiv_level'] not in ['line', 'word', 'glyph']: - raise Exception("currently only implemented at the line/glyph level") - - filepath = os.path.dirname(os.path.abspath(__file__)) - - - - if 'model' in self.parameter: model = self.parameter['model'] - modelpath = filepath + '/models/' + model + '.gz' - outputpath = filepath + '/output/' + model + try: + modelpath = self.resolve_resource(model) + except SystemExit: + ocropydir = os.path.dirname(os.path.abspath(__file__)) + modelpath = os.path.join(ocropydir, 'models', model) + self.log.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) + if not os.path.isfile(modelpath): + self.log.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", + model, model) + sys.exit(1) + outputpath = os.path.join(self.oldcwd, 'output', model) if 'outputpath' in self.parameter: - outputpath = self.parameter + '/' + model - if os.path.isfile(modelpath) == False: - raise Exception("configured model " + model + " is not in models folder") + outputpath = os.path.join(self.parameter, model) else: modelpath = None - outputpath = filepath + '/output/' + 'lstm' + outputpath = os.path.join(self.oldcwd, 'output', 'lstm') if 'outputpath' in self.parameter: - outputpath = self.parameter + '/' +'lstm' - - if 'ntrain' in self.parameter: - ntrain = self.parameter['ntrain'] - - + outputpath = os.path.join(self.parameter, 'lstm') + os.makedirs(os.path.dirname(outputpath)) + self.modelpath = modelpath + self.outputpath = outputpath + def process(self): + """ + Trains a new model on the text lines from the input fileGrp, + extracted as temporary image-text file pairs. 
+ """ filelist = [] - + filepath = tempfile.mkdtemp(prefix='ocrd-cis-ocropy-train-') #self.log.info("Using model %s in %s for recognition", model) for (n, input_file) in enumerate(self.input_files): #self.log.info("INPUT FILE %i / %s", n, input_file) pcgts = page_from_file(self.workspace.download_file(input_file)) - pil_image = self.workspace.resolve_image_as_pil(pcgts.get_Page().imageFilename) - + page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) + page = pcgts.get_Page() + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - self.log.info("page %s", pcgts) - for region in pcgts.get_Page().get_TextRegion(): + self.log.info("Extracting from page '%s'", page_id) + for region in page.get_AllRegions(classes=['Text']): textlines = region.get_TextLine() - self.log.info("About to extract %i lines in region '%s'", len(textlines), region.id) + self.log.info("Extracting %i lines from region '%s'", len(textlines), region.id) for line in textlines: - if self.parameter['textequiv_level'] == 'line': - self.log.debug("Extracting line '%s'", line.id) - - #get box from points - box = bounding_box(line.get_Coords().points) - - #crop word from page - croped_image = pil_image.crop(box=box) - - #binarize with Otsu's thresholding after Gaussian filtering - bin_image = binarize(croped_image) - - #resize image to 48 pixel height - final_img = resize_keep_ratio(bin_image) - - #save temp image - path = os.path.join(filepath, 'temp', str(input_file.ID) + str(region.id) + str(line.id)) - imgpath = path + '.png' - final_img.save(imgpath) - - filelist.append(imgpath) - - #ground truth - gt = line.get_TextEquiv()[0].Unicode.strip() - gtpath = path + '.gt.txt' - with open(gtpath, "w", encoding='utf-8') as f: - f.write(gt) - - - - if self.parameter['textequiv_level'] == 'word' or 'glyph': - for word in line.get_Word(): - - if self.parameter['textequiv_level'] == 'word': - self.log.debug("Extracting word '%s'", word.id) - - #get box from 
points - box = bounding_box(word.get_Coords().points) - - #crop word from page - croped_image = pil_image.crop(box=box) - - #binarize with Otsu's thresholding after Gaussian filtering - bin_image = binarize(croped_image) - - #resize image to 48 pixel height - final_img = resize_keep_ratio(bin_image) - - #save temp image - path = os.path.join(filepath, 'temp', str(input_file.ID) + str(region.id) + str(line.id) + str(word.id)) - imgpath = path + '.png' - final_img.save(imgpath) - + path = os.path.join(filepath, page_id + region.id + line.id) + imgpath = self.extract_segment(path, line, page_image, page_coords) + if imgpath: + filelist.append(imgpath) + continue + for word in line.get_Word(): + if self.parameter['textequiv_level'] == 'word': + path = os.path.join(filepath, page_id + region.id + line.id + word.id) + imgpath = self.extract_segment(path, word, page_image, page_coords) + if imgpath: + filelist.append(imgpath) + continue + for glyph in word.get_Glyph(): + path = os.path.join(filepath, page_id + region.id + line.id + glyph.id) + imgpath = self.extract_segment(path, glyph, page_image, page_coords) + if imgpath: filelist.append(imgpath) - #ground truth - gt = word.get_TextEquiv()[0].Unicode.strip() - gtpath = path + '.gt.txt' - - with open(gtpath, "w", encoding='utf-8') as f: - f.write(gt) - - else: - for glyph in word.get_Glyph(): - self.log.debug("Extracting glyph '%s'", glyph.id) - - #get box from points - box = bounding_box(glyph.get_Coords().points) - - #crop word from page - croped_image = pil_image.crop(box=box) - - #binarize with Otsu's thresholding after Gaussian filtering - bin_image = binarize(croped_image) - - #resize image to 48 pixel height - final_img = resize_keep_ratio(bin_image) - - #save temp image - path = os.path.join(filepath, 'temp', str(input_file.ID) + str(region.id) + str(line.id) + str(word.id) + str(glyph.id)) - imgpath = path + '.png' - final_img.save(imgpath) - - filelist.append(imgpath) - - #ground truth - gt = 
glyph.get_TextEquiv()[0].Unicode.strip() - with open(gtpath, "w", encoding='utf-8') as f: - f.write(gt) - - - rtrain(filelist, modelpath, outputpath, ntrain) + self.log.info("Training %s from %s on %i file pairs", + self.outputpath, + self.modelpath or 'scratch', + len(filelist)) + rtrain(filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) deletefiles(filelist) + + def extract_segment(self, path, segment, page_image, page_coords): + #ground truth + gt = segment.TextEquiv + if not gt: + return None + gt = gt[0].Unicode + if not gt or not gt.strip(): + return None + gt = gt.strip() + gtpath = path + '.gt.txt' + with open(gtpath, "w", encoding='utf-8') as f: + f.write(gt) + + self.log.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) + image, coords = self.workspace.image_from_segment(segment, page_image, page_coords) + + if 'binarized' not in coords['features'].split(','): + # binarize with nlbin + image, _ = binarize(image, maxskew=0) + + # resize image to 48 pixel height + image = resize_keep_ratio(image) + + #save temp image + imgpath = path + '.png' + image.save(imgpath) + + return imgpath From 43a356a03221c6dd95b68c9bc0cb7b563f2b4870 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Jun 2023 02:48:09 +0200 Subject: [PATCH 042/194] =?UTF-8?q?postcorrect:=20improve/update=20OCR-D?= =?UTF-8?q?=20wrapper=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - discern constructor vs. 
processing ctxt (initLogging) - pass effective log level (instead of global CLI override) - use mets_target instead of fixed `mets.xml` - simplify --- ocrd_cis/postcorrect/cli.py | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index a5125b8d..42bedc04 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -5,19 +5,14 @@ from ocrd import Processor from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor -from ocrd_utils import getLogger +from ocrd_utils import getLogger, getLevelName from ocrd_models.ocrd_mets import OcrdMets from ocrd_cis import JavaPostCorrector from ocrd_cis import get_ocrd_tool -LOG_LEVEL = 'INFO' - @click.command() @ocrd_cli_options def ocrd_cis_postcorrect(*args, **kwargs): - if 'log_level' in kwargs and kwargs['log_level']: - global LOG_LEVEL - LOG_LEVEL = kwargs['log_level'] return ocrd_cli_wrap_processor(PostCorrector, *args, **kwargs) class PostCorrector(Processor): @@ -26,21 +21,22 @@ def __init__(self, *args, **kwargs): kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-postcorrect'] kwargs['version'] = ocrd_tool['version'] super(PostCorrector, self).__init__(*args, **kwargs) - self.log = getLogger('cis.Processor.PostCorrector') def process(self): - ifgs = self.input_file_grp.split(",") # input file groups - ofg = self.output_file_grp + self.log = getLogger('processor.CISPostCorrector') profiler = {} profiler["path"] = self.parameter["profilerPath"] profiler["config"] = self.parameter["profilerConfig"] profiler["noCache"] = True self.parameter["profiler"] = profiler self.parameter["runDM"] = True - metspath = os.path.join(self.workspace.directory, "mets.xml") - print(json.dumps(self.parameter, indent=4)) - p = JavaPostCorrector(metspath, ",".join(ifgs), ofg, self.parameter, LOG_LEVEL) + self.log.debug(json.dumps(self.parameter, indent=4)) + p = 
JavaPostCorrector(self.workspace.mets_target, + self.input_file_grp, + self.output_file_grp, + self.parameter, + getLevelName(self.log.getEffectiveLevel())) p.exe() # reload the mets file to prevent it from overriding the # updated version from the java process - self.workspace.mets = OcrdMets(filename=metspath) + self.reload_mets() From 2783f615359bf6464d77084b80983b72b3461bae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 11 Jul 2023 17:01:15 +0200 Subject: [PATCH 043/194] segment: fix baseline extraction --- ocrd_cis/ocropy/deskew.py | 2 +- ocrd_cis/ocropy/segment.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index bb9904e0..4ed04218 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -39,7 +39,7 @@ def __init__(self, *args, **kwargs): super(OcropyDeskew, self).__init__(*args, **kwargs) def process(self): - """Deskew the regions of the workspace. + """Deskew the pages or regions of the workspace. Open and deserialise PAGE input files and their respective images, then iterate over the element hierarchy down to the TextRegion level. 
diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 9e2a6ee3..ac25a1fb 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -959,7 +959,7 @@ def add_baseline(baseline): add_baseline(geom) continue add_baseline(baseline) - if not len(result): + if len(result) < 2: return None return LineString(result) From fcc02fddedb66c47472eddeb16d437bdca32172d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 17 Aug 2023 18:30:07 +0200 Subject: [PATCH 044/194] adapt to Numpy and Pillow deprecations --- ocrd_cis/div/cutter.py | 2 +- ocrd_cis/ocropy/ocrolib/time_morphology.py | 4 ++-- ocrd_cis/ocropy/ocropus_rtrain.py | 2 +- ocrd_cis/ocropy/recognize.py | 2 +- ocrd_cis/ocropy/train.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ocrd_cis/div/cutter.py b/ocrd_cis/div/cutter.py index ee187a1b..6dc6a9a9 100644 --- a/ocrd_cis/div/cutter.py +++ b/ocrd_cis/div/cutter.py @@ -26,7 +26,7 @@ def bounding_box(coord_points): def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) wsize = int((float(image.size[0] * float(hpercent)))) - image = image.resize((wsize, baseheight), Image.ANTIALIAS) + image = image.resize((wsize, baseheight), Image.LANCZOS) return image diff --git a/ocrd_cis/ocropy/ocrolib/time_morphology.py b/ocrd_cis/ocropy/ocrolib/time_morphology.py index 51a8e406..2e241d94 100644 --- a/ocrd_cis/ocropy/ocrolib/time_morphology.py +++ b/ocrd_cis/ocropy/ocrolib/time_morphology.py @@ -29,10 +29,10 @@ def cv_contours(bin): return zip((contour[:,0,::-1], cv2.contourArea(contour)) for contour in contours) def rb_opening(bin, size): - return filters.uniform_filter(filters.uniform_filter(bin, size, np.float, mode='constant', cval=1) == 1, size, np.float, origin=-1) > 1e-7 + return filters.uniform_filter(filters.uniform_filter(bin, size, float, mode='constant', cval=1) == 1, size, float, origin=-1) > 1e-7 def rb_closing(bin, size): - return 
filters.uniform_filter(filters.uniform_filter(bin, size, np.float) > 1e-7, size, mode='constant', cval=1, origin=-1) == 1 + return filters.uniform_filter(filters.uniform_filter(bin, size, float) > 1e-7, size, mode='constant', cval=1, origin=-1) == 1 def r_closing(bin, size): return filters.minimum_filter(filters.maximum_filter(bin, size), size, origin=-1) diff --git a/ocrd_cis/ocropy/ocropus_rtrain.py b/ocrd_cis/ocropy/ocropus_rtrain.py index fc34ad20..b1469e42 100644 --- a/ocrd_cis/ocropy/ocropus_rtrain.py +++ b/ocrd_cis/ocropy/ocropus_rtrain.py @@ -45,7 +45,7 @@ def resize_keep_ratio(image, baseheight): baseheight = 48 hpercent = (baseheight / float(image.size[1])) wsize = int((float(image.size[0] * float(hpercent)))) - image = image.resize((wsize, baseheight), Image.ANTIALIAS) + image = image.resize((wsize, baseheight), Image.LANCZOS) return image # make sure an output file has been set diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 07c2ebd4..e9259c6e 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -35,7 +35,7 @@ def resize_keep_ratio(image, baseheight=48): scale = baseheight / image.height wsize = round(image.width * scale) - image = image.resize((wsize, baseheight), Image.ANTIALIAS) + image = image.resize((wsize, baseheight), Image.LANCZOS) return image, scale # from ocropus-rpred process1, but without input files and without lineest/dewarping diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index ceb26d21..d257a61f 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -23,7 +23,7 @@ def deletefiles(filelist): def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) wsize = int((float(image.size[0] * float(hpercent)))) - image = image.resize((wsize, baseheight), Image.ANTIALIAS) + image = image.resize((wsize, baseheight), Image.LANCZOS) return image From 1512c81de109d44bd8ff105d52552d958bc5fdbb Mon Sep 17 00:00:00 2001 From: 
joschrew Date: Thu, 1 Feb 2024 09:12:16 +0100 Subject: [PATCH 045/194] Remove testing from Dockerfile --- Dockerfile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index f58112b8..cb81e78e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -53,6 +53,8 @@ RUN apt-get update \ && apt-get -y install --no-install-recommends gcc wget default-jre-headless \ && cd /build \ && make install \ - && make test \ + # test always fail, resources not available for download. Resources should be made available + # somewhere else, e.g. github.com/OCR-D/assets + # && make test \ && cd / \ && rm -rf /build From 67905d73675b7b1ce2ea80caefb2fdf36d1f3ee1 Mon Sep 17 00:00:00 2001 From: joschrew Date: Fri, 9 Feb 2024 15:52:29 +0100 Subject: [PATCH 046/194] Add metadata to Dockerfile --- Dockerfile | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Dockerfile b/Dockerfile index cb81e78e..71e8b09f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,12 @@ FROM ocrd/core:latest AS base +ARG VCS_REF +ARG BUILD_DATE +LABEL \ + maintainer="https://github.com/OCR-D/ocrd_cis/issues" \ + org.label-schema.vcs-ref=$VCS_REF \ + org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_cis" \ + org.label-schema.build-date=$BUILD_DATE + ENV VERSION="Di 12. Mai 13:26:35 CEST 2020" ENV GITURL="https://github.com/cisocrgroup" ENV DOWNLOAD_URL="http://cis.lmu.de/~finkf" From d5e81876e43cb016a0f148c315f0a292bf30fff1 Mon Sep 17 00:00:00 2001 From: joschrew Date: Mon, 12 Feb 2024 11:26:35 +0100 Subject: [PATCH 047/194] Set docker metadata with makefile --- Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 730ba3f4..eebe029a 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,10 @@ uninstall: ${PIP} uninstall ${PKG} docker-build: Dockerfile - docker build -t flobar/ocrd_cis:latest . 
+ docker build \ + --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ + --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ + -t flobar/ocrd_cis:latest . docker-push: docker-build docker push flobar/ocrd_cis:latest From 320d5fd69d2b6efbacaee9b70275cf5e66c9794e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 12 Feb 2024 15:44:19 +0100 Subject: [PATCH 048/194] try to fix tests by adapting URLs --- tests/test_lib.bash | 45 +++++++++++++++------------------------------ 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 5d38f482..8cfb0018 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -4,11 +4,13 @@ tmpdir=$(mktemp -d) trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://ocr-d-repo.scc.kit.edu/api/v1/dataresources/75ad9f94-dbaa-43e0-ab06-2ce24c497c61/data" +# fixme: it does not work like this - the OCR-D GT repo uses different URL paths for different datasets +# this is merely the path for blumenbach_anatomie_1805.ocrd.zip +data_url="https://ocr-d-repo.scc.kit.edu/api/v1/dataresources/75ad9f94-dbaa-43e0-ab06-2ce24c497c61/data/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$tmpdir/download" - wget -P "$tmpdir/download" "$url" + wget -nc -P "$tmpdir/download" "$url" } function ocrd_cis_init_ws() { @@ -19,33 +21,16 @@ function ocrd_cis_init_ws() { function ocrd_cis_align() { # download ocr models - wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur1-00085000.pyrnn.gz" - wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur2-00062000.pyrnn.gz" + ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz + ocrd resmgr download ocrd-cis-ocropy-recognize fraktur-jze.pyrnn.gz # run ocr - ocrd-cis-ocropy-recognize --log-level DEBUG \ - --input-file-grp "OCR-D-GT-SEG-LINE" \ - --output-file-grp OCR-D-CIS-OCR-1 \ - --mets "$tmpws/mets.xml" \ - --parameter <(cat < 
Date: Mon, 12 Feb 2024 15:51:52 +0100 Subject: [PATCH 049/194] add CircleCI config --- .circleci/config.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 .circleci/config.yml diff --git a/.circleci/config.yml b/.circleci/config.yml new file mode 100644 index 00000000..87ec16ba --- /dev/null +++ b/.circleci/config.yml @@ -0,0 +1,19 @@ +version: 2.1 +jobs: + + test-python3: + docker: + - image: ocrd/core + environment: + PIP: pip3 + PYTHON: python3 + steps: + - checkout + - run: make install + - run: make test + +workflows: + version: 2 + build-and-test: + jobs: + - test-python3 From a4dc20f0de12cebd8ba1b54b3478d6a065d52f35 Mon Sep 17 00:00:00 2001 From: joschrew Date: Tue, 13 Feb 2024 12:56:23 +0100 Subject: [PATCH 050/194] Update CircleCI config This is just a dummy commit to try to trigger circleci --- .circleci/config.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 87ec16ba..9f1b1685 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,6 +1,5 @@ version: 2.1 jobs: - test-python3: docker: - image: ocrd/core @@ -11,7 +10,6 @@ jobs: - checkout - run: make install - run: make test - workflows: version: 2 build-and-test: From c4f0724f97dc5880a64835d40ed265a540c4b2fd Mon Sep 17 00:00:00 2001 From: joschrew Date: Tue, 13 Feb 2024 14:27:05 +0100 Subject: [PATCH 051/194] Another try to fix tests --- tests/test_lib.bash | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 8cfb0018..7e560fbe 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -4,13 +4,12 @@ tmpdir=$(mktemp -d) trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -# fixme: it does not work like this - the OCR-D GT repo uses different URL paths for different datasets -# this is merely the path for blumenbach_anatomie_1805.ocrd.zip 
-data_url="https://ocr-d-repo.scc.kit.edu/api/v1/dataresources/75ad9f94-dbaa-43e0-ab06-2ce24c497c61/data/" +data_url="http://hdl.handle.net/21.11156/6B119B3C-A24A-424C-AC3C-27E64B051780" function ocrd_cis_download_bagit() { - local url="$data_url/$1" - mkdir -p "$tmpdir/download" - wget -nc -P "$tmpdir/download" "$url" + local destdir="$tmpdir/download" + mkdir -p "$destdir" + local dest="$destdir/$1" + wget -nc -O $dest $data_url } function ocrd_cis_init_ws() { @@ -32,5 +31,5 @@ function ocrd_cis_align() { -P textequiv_level word -P model fraktur-jze.pyrnn.gz ocrd-cis-align -l DEBUG -m $tmpws/mets.xml \ -I OCR-D-CIS-OCR-1,OCR-D-CIS-OCR-2,$OCRD_CIS_FILEGRP \ - -O OCR-D-CIS-ALIGN + -O OCR-D-CIS-ALIGN } From 71e4e50965d4794b77c2f006b984ed3b299385de Mon Sep 17 00:00:00 2001 From: joschrew Date: Tue, 13 Feb 2024 15:11:12 +0100 Subject: [PATCH 052/194] Debug circleci tests --- .circleci/config.yml | 2 +- tests/test_lib.bash | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9f1b1685..1f709dd4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -9,7 +9,7 @@ jobs: steps: - checkout - run: make install - - run: make test + - run: make test V="" workflows: version: 2 build-and-test: diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 7e560fbe..d4f5162c 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -4,6 +4,8 @@ tmpdir=$(mktemp -d) trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" +# fixme: it does not work like this - the OCR-D GT repo uses different URL paths for different datasets +# this is merely the path for blumenbach_anatomie_1805.ocrd.zip data_url="http://hdl.handle.net/21.11156/6B119B3C-A24A-424C-AC3C-27E64B051780" function ocrd_cis_download_bagit() { local destdir="$tmpdir/download" From 1b5029532fd61dbe41cdc4455876b6fe2921c7d0 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Feb 2024 22:39:09 +0100 Subject: [PATCH 053/194] tests: use 
proper new OCR-D GT URL --- tests/test_lib.bash | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index d4f5162c..0603929c 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -4,14 +4,11 @@ tmpdir=$(mktemp -d) trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -# fixme: it does not work like this - the OCR-D GT repo uses different URL paths for different datasets -# this is merely the path for blumenbach_anatomie_1805.ocrd.zip -data_url="http://hdl.handle.net/21.11156/6B119B3C-A24A-424C-AC3C-27E64B051780" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/l1.1.20/" function ocrd_cis_download_bagit() { - local destdir="$tmpdir/download" - mkdir -p "$destdir" - local dest="$destdir/$1" - wget -nc -O $dest $data_url + local url="$data_url/$1" + mkdir -p "$tmpdir/download" + wget -nc -P "$tmpdir/download" "$url" } function ocrd_cis_init_ws() { From f214c5970628ffcf4e053ba26b811f19bfd6590d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 14 Feb 2024 02:19:55 +0100 Subject: [PATCH 054/194] tests: reuse downloaded bag files --- tests/test_lib.bash | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 0603929c..0ae12d56 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -7,13 +7,13 @@ OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" data_url="https://github.com/OCR-D/gt_structure_text/releases/download/l1.1.20/" function ocrd_cis_download_bagit() { local url="$data_url/$1" - mkdir -p "$tmpdir/download" - wget -nc -P "$tmpdir/download" "$url" + mkdir -p "$PWD/download" + wget -nc -P "$PWD/download" "$url" } function ocrd_cis_init_ws() { ocrd_cis_download_bagit "$1" - ocrd zip spill -d "$tmpdir" "$tmpdir/download/$1" + ocrd zip spill -d "$tmpdir" "$PWD/download/$1" tmpws="$tmpdir/${1%.ocrd.zip}" } From 68dcb9035cb4e17b98f5d7b206eeb72dc2bee381 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky 
Date: Wed, 14 Feb 2024 02:20:47 +0100 Subject: [PATCH 055/194] ocrolib.common.load_object: find ocrolib in sys.path --- ocrd_cis/ocropy/ocrolib/common.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ocrd_cis/ocropy/ocrolib/common.py b/ocrd_cis/ocropy/ocrolib/common.py index 1c0c3208..6741a676 100644 --- a/ocrd_cis/ocropy/ocrolib/common.py +++ b/ocrd_cis/ocropy/ocrolib/common.py @@ -445,6 +445,9 @@ class names that have changed.""" LOG.info("# loading object '%s'", fname) if zip==0 and fname.endswith(".gz"): zip = 1 + # most models will have been pickled with ocrolib at top level + # we therefore need to add ocrd_cis.ocropy to the search path + sys.path.append(os.path.dirname(os.path.dirname(__file__))) if zip>0: with gzip.GzipFile(fname,"rb") as stream: #with os.popen("gunzip < '%s'"%fname,"rb") as stream: From 4f3ea7c94c01e295a64c3150c8efb0b1749d1953 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Feb 2024 00:36:24 +0100 Subject: [PATCH 056/194] postcorrect: adapt processor to new OCR-D (mets:file with @LOCTYPE and only relative paths) --- ocrd_cis/postcorrect/cli.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index 42bedc04..dc3ee48e 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -37,6 +37,12 @@ def process(self): self.parameter, getLevelName(self.log.getEffectiveLevel())) p.exe() - # reload the mets file to prevent it from overriding the - # updated version from the java process - self.reload_mets() + # reload the mets file to prevent run_processor's save_mets + # from overriding the results from the Java process + self.workspace.reload_mets() + # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): + for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') + flocat.attrib['LOCTYPE'] = 'OTHER' + 
flocat.attrib['OTHERLOCTYPE'] = 'FILE' + output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) From 84e3acf16bfbd1ed0e6f2d02cfb36e2560126888 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Feb 2024 00:39:05 +0100 Subject: [PATCH 057/194] test_lib: update to fixed GT repo URL, don't remove workspace on failure --- tests/test_lib.bash | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 0ae12d56..e9df9985 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -1,10 +1,11 @@ #/bin/bash tmpdir=$(mktemp -d) +trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/l1.1.20/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/l1.1.19/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From 5e0660392434d7331afbf406d164831420d7626f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Feb 2024 00:41:27 +0100 Subject: [PATCH 058/194] tests: use downloadable recognizer models throughout, simplify scripts --- tests/run_add_zip_test.bash | 20 ++-------- tests/run_alignment_test.bash | 22 +++-------- tests/run_image_preprocessing_test.bash | 49 +++++-------------------- tests/run_ocr_test.bash | 31 +++++----------- tests/run_postcorrection_test.bash | 43 +++++++--------------- tests/run_training_test.bash | 17 ++------- tests/test_lib.bash | 19 +++++++--- 7 files changed, 58 insertions(+), 143 deletions(-) diff --git a/tests/run_add_zip_test.bash b/tests/run_add_zip_test.bash index 003c5e86..02de2db2 100644 --- a/tests/run_add_zip_test.bash +++ b/tests/run_add_zip_test.bash @@ -7,30 +7,18 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip pushd "$tmpws" found_files=0 for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do - if [[ ! 
-f "$file" ]]; then - echo "cannot find ground truth file: $file" - exit 1 - fi + [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi +(( found_files == 3 )) || fail "invalid number of files: $found_files" popd # test if there are 3 gt files pushd "$tmpws" found_files=0 for file in $(ocrd workspace find -G OCR-D-IMG); do - if [[ ! -f "$file" ]]; then - echo "cannot find ground truth file: $file" - exit 1 - fi + [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi +(( found_files == 3 )) || fail "invalid number of files: $found_files" popd diff --git a/tests/run_alignment_test.bash b/tests/run_alignment_test.bash index 1e9e3ea0..e8a3c79a 100644 --- a/tests/run_alignment_test.bash +++ b/tests/run_alignment_test.bash @@ -6,17 +6,11 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do - if [[ ! -f "$file" ]]; then - echo "cannot find ground truth file: $file" - exit 1 - fi +for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do + [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi +(( found_files == 3 )) || fail "invalid number of files: $found_files" popd ocrd_cis_align @@ -24,14 +18,8 @@ ocrd_cis_align pushd $tmpws found_files=0 for file in $(ocrd workspace find -G OCR-D-CIS-ALIGN); do - if [[ ! 
-f "$file" ]]; then - echo "cannot find aligned file group workspace" - exit 1 - fi + [[ -f "$file" ]] || fail "cannot find aligned file group workspace" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi +(( found_files == 3 )) || fail "invalid number of files: $found_files" popd diff --git a/tests/run_image_preprocessing_test.bash b/tests/run_image_preprocessing_test.bash index 4fd028e4..f80fc636 100644 --- a/tests/run_image_preprocessing_test.bash +++ b/tests/run_image_preprocessing_test.bash @@ -7,45 +7,16 @@ ocrd_cis_init_ws "blumenbach_anatomie_1805.ocrd.zip" # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do - if [[ ! -f "$file" ]]; then - echo "cannot find ground truth file: $file" - exit 1 - fi +for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do + [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi +(( found_files == 3 )) || fail "invalid number of files: $found_files" + +ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN +ocrd-cis-ocropy-clip -l DEBUG -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-IMG-CLIP +ocrd-cis-ocropy-denoise -l DEBUG -I OCR-D-CIS-IMG-CLIP -O OCR-D-CIS-IMG-DEN +ocrd-cis-ocropy-deskew -l DEBUG -I OCR-D-CIS-IMG-DEN -O OCR-D-CIS-IMG-DES +ocrd-cis-ocropy-dewarp -l DEBUG -I OCR-D-CIS-IMG-DES -O OCR-D-CIS-IMG-DEW +ocrd-cis-ocropy-segment -l DEBUG -I OCR-D-CIS-IMG-DEW -O OCR-D-CIS-IMG-SEG popd - -ocrd-cis-ocropy-binarize --log-level DEBUG \ - --input-file-grp OCR-D-GT-SEG-LINE \ - --output-file-grp OCR-D-CIS-IMG-BIN \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-clip --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-BIN \ - --output-file-grp OCR-D-CIS-IMG-CLIP \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-denoise --log-level 
DEBUG \ - --input-file-grp OCR-D-CIS-IMG-CLIP \ - --output-file-grp OCR-D-CIS-IMG-DEN \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-deskew --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-DEN \ - --output-file-grp OCR-D-CIS-IMG-DES \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-dewarp --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-DES \ - --output-file-grp OCR-D-CIS-IMG-DEW \ - --mets "$tmpws/mets.xml" - -ocrd-cis-ocropy-segment --log-level DEBUG \ - --input-file-grp OCR-D-CIS-IMG-DEW \ - --output-file-grp OCR-D-CIS-IMG-SEG \ - --mets "$tmpws/mets.xml" diff --git a/tests/run_ocr_test.bash b/tests/run_ocr_test.bash index 6de88a7b..b10f6f6d 100644 --- a/tests/run_ocr_test.bash +++ b/tests/run_ocr_test.bash @@ -6,31 +6,18 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do - if [[ ! -f "$file" ]]; then - echo "cannot find ground truth file: $file" - exit 1 - fi +for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do + [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done -if [[ $found_files != 3 ]]; then - echo "invalid number of files: $found_files" - exit 1 -fi -popd +(( $found_files == 3 )) || fail "invalid number of files: $found_files" # download ocr model -wget -P "$tmpdir/download" "http://cis.lmu.de/~finkf/fraktur1-00085000.pyrnn.gz" +ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz # run ocr -ocrd-cis-ocropy-recognize --log-level DEBUG \ - --input-file-grp "OCR-D-GT-SEG-LINE" \ - --output-file-grp OCR-D-CIS-OCR \ - --mets "$tmpws/mets.xml" \ - --parameter <(cat < /dev/null echo '{}' EOF chmod a+x "$tmpdir/bin/profiler.bash" -ocrd-cis-postcorrect --log-level DEBUG \ - -I OCR-D-CIS-ALIGN \ - -O OCR-D-CIS-POSTCORRECT \ - -m $tmpws/mets.xml \ - --parameter <(cat <&2 "$@" + false } From 05895c9ff1855674dd08cbc72b4ca13646c3bda4 Mon Sep 17 00:00:00 2001 From: Robert 
Sachunsky Date: Thu, 15 Feb 2024 00:58:29 +0100 Subject: [PATCH 059/194] tests: deactivate training (broken) --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index eebe029a..22f07508 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ docker-build: Dockerfile docker-push: docker-build docker push flobar/ocrd_cis:latest -TEST_SCRIPTS=$(sort $(wildcard tests/run_*.bash)) +TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) .PHONY: $(TEST_SCRIPTS) $(TEST_SCRIPTS): bash $@ $V From 29e6e3a6eea681c5aaa9b4f365e1f84c9e3dd250 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Feb 2024 00:59:05 +0100 Subject: [PATCH 060/194] makefile: allow setting different tag for docker build --- Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 22f07508..a040cf9d 100644 --- a/Makefile +++ b/Makefile @@ -2,6 +2,7 @@ PY ?= python3 PIP ?= pip3 V ?= > /dev/null 2>&1 PKG = ocrd_cis +TAG = flobar/ocrd_cis install: ${PIP} install --upgrade pip . @@ -14,14 +15,14 @@ docker-build: Dockerfile docker build \ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ - -t flobar/ocrd_cis:latest . + -t $(TAG):latest . 
docker-push: docker-build - docker push flobar/ocrd_cis:latest + docker push $(TAG):latest TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) .PHONY: $(TEST_SCRIPTS) $(TEST_SCRIPTS): bash $@ $V test: $(TEST_SCRIPTS) - echo $^ -.PHONY: install test + @echo $^ +.PHONY: install install-devel uninstall test docker-build docker-push From 07662f22a1d26639dea77a60d9854d2a949eb20a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 15 Feb 2024 00:59:31 +0100 Subject: [PATCH 061/194] CI: add CD --- .circleci/config.yml | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 1f709dd4..470197da 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -10,8 +10,32 @@ jobs: - checkout - run: make install - run: make test V="" + + deploy-docker: + docker: + - image: circleci/buildpack-deps:stretch + environment: + DOCKER_TAG: ocrd/cis + steps: + - checkout + - setup_remote_docker: # https://circleci.com/docs/2.0/building-docker-images/ + docker_layer_caching: true + - run: make docker TAG=$DOCKER_TAG + - run: + name: Login to Docker Hub + command: echo "$DOCKERHUB_PASS" | docker login --username "$DOCKERHUB_USER" --password-stdin + - run: docker push $DOCKER_TAG + workflows: version: 2 build-and-test: jobs: - test-python3 + deploy: + jobs: + - deploy-docker: + filters: + branches: + only: + - master + - fix-alpha-shape From 4673d9b342a25200313ed2ea31ab2b5796b4d4f8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 25 Apr 2024 14:47:07 +0200 Subject: [PATCH 062/194] re/segment join_polygons: fix rare case of adjacent rings --- ocrd_cis/ocropy/segment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index ac25a1fb..077363e1 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -908,7 +908,7 @@ def join_polygons(polygons, loc='', scale=20): dists = 
np.eye(npoly, dtype=float) for i, j in pairs: dist = polygons[i].distance(polygons[j]) - if dist == 0: + if dist < 1e-5: dist = 1e-5 # if pair merely touches, we still need to get an edge dists[i, j] = dist dists[j, i] = dist From 338b840e46e378e5427e37a2a95777d64a861332 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 25 Apr 2024 14:52:27 +0200 Subject: [PATCH 063/194] re/segment join_baselines: adapt to Shapely, improve --- ocrd_cis/ocropy/segment.py | 119 +++++++++++++++++++++++++++++-------- 1 file changed, 94 insertions(+), 25 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 077363e1..49cb6776 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -11,6 +11,7 @@ from shapely.prepared import prep from shapely.ops import unary_union, nearest_points from shapely.validation import explain_validity +from shapely import set_precision from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -931,37 +932,105 @@ def join_polygons(polygons, loc='', scale=20): def join_baselines(baselines, loc=''): LOG = getLogger('processor.OcropyResegment') - result = [] - def add_baseline(baseline): - nonlocal result - base_x = [pt[0] for pt in result] - base_left = min(base_x, default=0) - base_right = max(base_x, default=0) - left = baseline.bounds[0] - right = baseline.bounds[2] - if baseline.coords[0][0] > baseline.coords[-1][0]: - baseline.coords = list(baseline.coords[::-1]) - if left > base_right: - result.extend(baseline.coords) - elif right < base_left: - result = list(baseline.coords) + result - else: - LOG.warning("baseline part crosses existing x in %s", loc) - return - assert all(p1[0] < p2[0] for p1, p2 in zip(result[:-1], result[1:])), result + lines = [] for baseline in baselines: if (baseline.is_empty or baseline.geom_type in ['Point', 'MultiPoint']): continue - if (baseline.geom_type == 'GeometryCollection' or - baseline.geom_type.startswith('Multi')): + elif 
baseline.geom_type == 'MultiLineString': + lines.extend(baseline.geoms) + elif baseline.geom_type == 'LineString': + lines.append(baseline) + elif baseline.geom_type == 'GeometryCollection': for geom in baseline.geoms: - add_baseline(geom) - continue - add_baseline(baseline) - if len(result) < 2: + if geom.geom_type == 'LineString': + lines.append(geom) + elif geom.geom_type == 'MultiLineString': + lines.extend(geom) + else: + LOG.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) + else: + LOG.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) + nlines = len(lines) + if nlines == 0: + return None + elif nlines == 1: + return lines[0] + # Shapely cannot reorder: + #result = line_merge(MultiLineString([line.normalize() for line in lines])) + # find min-dist path through all lines (travelling salesman) + pairs = itertools.combinations(range(nlines), 2) + dists = np.eye(nlines, dtype=float) + for i, j in pairs: + dist = lines[i].distance(lines[j]) + if dist < 1e-5: + dist = 1e-5 # if pair merely touches, we still need to get an edge + dists[i, j] = dist + dists[j, i] = dist + dists = minimum_spanning_tree(dists, overwrite=True) + assert dists.nonzero()[0].size, dists + # get path + chains = [] + for prevl, nextl in zip(*dists.nonzero()): + foundchains = [] + for chain in chains: + if chain[0] == prevl: + found = chain, 0, nextl + elif chain[0] == nextl: + found = chain, 0, prevl + elif chain[-1] == prevl: + found = chain, -1, nextl + elif chain[-1] == nextl: + found = chain, -1, prevl + else: + continue + foundchains.append(found) + if len(foundchains): + assert len(foundchains) <= 2, foundchains + chain, pos, node = foundchains.pop() + if len(foundchains): + otherchain, otherpos, othernode = foundchains.pop() + assert node != othernode + assert chain[pos] == othernode + assert otherchain[otherpos] == node + if pos < 0 and otherpos < 0: + chain.extend(reversed(otherchain)) + chains.remove(otherchain) + elif pos < 0 and otherpos == 0: + 
chain.extend(otherchain) + chains.remove(otherchain) + elif pos == 0 and otherpos == 0: + otherchain.extend(reversed(chain)) + chains.remove(chain) + elif pos == 0 and otherpos < 0: + otherchain.extend(chain) + chains.remove(chain) + elif pos < 0: + chain.append(node) + else: + chain.insert(0, node) + else: + chains.append([prevl, nextl]) + if len(chains) > 1: + LOG.warning("baseline merge impossible (no spanning tree) in %s", loc) + return None + assert len(chains) == 1, chains + assert len(chains[0]) == nlines, chains[0] + path = chains[0] + # get points + coords = [] + for node in path: + line = lines[node] + coords.extend(line.normalize().coords) + result = LineString(coords) + if result.is_empty: + LOG.warning("baseline merge is empty in %s", loc) return None - return LineString(result) + assert result.geom_type == 'LineString', result.wkt + result = set_precision(result, 1.0) + if result.geom_type != 'LineString' or not result.is_valid: + result = LineString(np.round(line.coords)) + return result def page_get_reading_order(ro, rogroup): """Add all elements from the given reading order group to the given dictionary. 
From d2a52794789e7810b02de12f50b15b188f08c616 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 25 Apr 2024 15:09:48 +0200 Subject: [PATCH 064/194] resegment (lineest): fix/improve matching --- ocrd_cis/ocropy/resegment.py | 106 ++++++++++++++++++++--------------- 1 file changed, 61 insertions(+), 45 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index ad05792e..929edc3a 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -312,52 +312,68 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for i, new_line_poly in enumerate(new_line_polygons): for j, line_poly in enumerate(line_polygons): # too strict: .contains - if line_poly.intersects(new_line_poly): - inter = make_intersection(line_poly.context, new_line_poly) - if not inter: - continue - new_line_mask = (new_line_labels == i+1) & parent_bin - line_mask = line_labels[j] & parent_bin - inter_mask = new_line_mask & line_mask - if (not np.count_nonzero(inter_mask) or - not np.count_nonzero(new_line_mask) or - not np.count_nonzero(line_mask)): - continue - intersections[(i, j)] = inter - fits_bg[i, j] = inter.area / new_line_poly.area - covers_bg[i, j] = inter.area / line_poly.context.area - fits_fg[i, j] = np.count_nonzero(inter_mask) / np.count_nonzero(new_line_mask) - covers_fg[i, j] = np.count_nonzero(inter_mask) / np.count_nonzero(line_mask) - # LOG.debug("new %d old %d (%s): %.1f%% / %.1f%% bg, %.1f%% / %.1f%% fg", - # i, j, lines[j].id, - # fits_bg[i,j]*100, covers_bg[i,j]*100, - # fits_fg[i,j]*100, covers_fg[i,j]*100) - # assign new lines to existing lines, if possible - assignments = np.ones(len(new_line_polygons), int) * -1 - for i, new_line_poly in enumerate(new_line_polygons): - if not fits_bg[i].any(): - LOG.debug("new line %d fits no existing line's background", i) - continue - if not fits_fg[i].any(): - LOG.debug("new line %d fits no existing line's foreground", i) - continue - fits = (fits_bg[i] 
> 0.6) & (fits_fg[i] > 0.9) - if not fits.any(): - j = np.argmax(fits_bg[i] * fits_fg[i]) - LOG.debug("best fit '%s' for new line %d fits only %.1f%% bg / %.1f%% fg", - lines[j].id, i, fits_bg[i,j] * 100, fits_fg[i,j] * 100) + if not line_poly.intersects(new_line_poly): + continue + inter = make_intersection(line_poly.context, new_line_poly) + if not inter: + continue + new_line_mask = (new_line_labels == i+1) & parent_bin + line_mask = line_labels[j] & parent_bin + inter_mask = new_line_mask & line_mask + if (not np.count_nonzero(inter_mask) or + not np.count_nonzero(new_line_mask) or + not np.count_nonzero(line_mask)): + continue + intersections[(i, j)] = inter + fits_bg[i, j] = inter.area / new_line_poly.area + covers_bg[i, j] = inter.area / line_poly.context.area + fits_fg[i, j] = np.count_nonzero(inter_mask) / np.count_nonzero(new_line_mask) + covers_fg[i, j] = np.count_nonzero(inter_mask) / np.count_nonzero(line_mask) + # LOG.debug("new %d old %d (%s): %.1f%% / %.1f%% bg, %.1f%% / %.1f%% fg", + # i, j, lines[j].id, + # fits_bg[i,j]*100, covers_bg[i,j]*100, + # fits_fg[i,j]*100, covers_fg[i,j]*100) + # assign existing lines to new lines (1:n), if possible + # start from best matches (forced alignment) + dim1 = len(new_line_polygons) + dim2 = len(line_polygons) + idx1 = np.arange(dim1) + idx2 = np.arange(dim2) + keep1 = np.ones(dim1, bool) + keep2 = np.ones(dim2, bool) + assignments = -1 * np.ones(dim1, int) + for _ in range(dim1): + fit_bg_view = fits_bg[np.ix_(keep1, keep2)] + if not fit_bg_view.size: + break + cov_bg_view = covers_bg[np.ix_(keep1, keep2)] + fit_fg_view = fits_fg[np.ix_(keep1, keep2)] + cov_fg_view = covers_fg[np.ix_(keep1, keep2)] + priority = cov_fg_view * cov_bg_view + ind1, ind2 = np.unravel_index(np.argmax(priority, axis=None), priority.shape) + fit_fg = fit_fg_view[ind1, ind2] + fit_bg = fit_bg_view[ind1, ind2] + cov_fg = cov_fg_view[ind1, ind2] + cov_bg = cov_bg_view[ind1, ind2] + # return to full view and assign next + ind1 = 
idx1[keep1][ind1] + ind2 = idx2[keep2][ind2] + #new_poly = new_line_polygons[ind1] + #poly = line_polygons[ind2] + # assignment must be new + assert assignments[ind1] < 0 + assert keep1[ind1] + assert keep2[ind2] + # minimum threshold + if not (fit_bg > 0.6 and fit_fg > 0.7): + # skip next time + # LOG.debug("match for %s too large: %d%%fg / %d%%bg", lines[ind2].id, fit_fg*100, fit_bg*100) + covers_bg[ind1, ind2] = 0 + covers_fg[ind1, ind2] = 0 continue - covers = covers_bg[i] * covers_fg[i] * fits - j = np.argmax(covers) - line = lines[j] - inter_polygon = intersections[(i,j)] - new_line_polygon = new_line_polygons[i] - new_center = inter_polygon.centroid - center = new_line_polygon.centroid - # FIXME: apply reasonable threshold for centroid distance - LOG.debug("new line for '%s' has centroid distance %.2f", - line.id, center.distance(new_center)) - assignments[i] = j + assignments[ind1] = ind2 + keep1[ind1] = False + #keep2[ind2] = False # validate assignments retain enough area and do not loose unassigned matches line_polygons = [poly.context.buffer(-margin) for poly in line_polygons] for j, line in enumerate(lines): From 8a71d8e84af01ce769a2c44e3f02d6a702efedb7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 25 Apr 2024 15:12:41 +0200 Subject: [PATCH 065/194] resegment (lineest): use new polygons instead of intersections but ignore extend_margins --- ocrd_cis/ocrd-tool.json | 2 +- ocrd_cis/ocropy/resegment.py | 57 ++++++++++++++++++------------------ 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index 953ea1f8..be763142 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -225,7 +225,7 @@ "extend_margins": { "type": "number", "format": "integer", - "description": "number of pixels to extend the input polygons in all directions", + "description": "(ignored)", "default": 3 } } diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 
929edc3a..ddb8fcb5 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -183,7 +183,6 @@ def process(self): def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): LOG = getLogger('processor.OcropyResegment') threshold = self.parameter['min_fraction'] - margin = self.parameter['extend_margins'] method = self.parameter['method'] # prepare line segmentation parent_array = pil2array(parent_image) @@ -206,32 +205,34 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_labels = np.zeros_like(parent_bin, bool) line_labels = np.tile(line_labels[np.newaxis], (len(lines), 1, 1)) line_polygons = [] - for i, segment in enumerate(lines): - if self.parameter['baseline_only'] and segment.Baseline: - segment_baseline = baseline_of_segment(segment, parent_coords) - segment_polygon = polygon_from_baseline(segment_baseline, 30/zoom) + for i, line in enumerate(lines): + if self.parameter['baseline_only'] and line.Baseline: + line_base = baseline_of_segment(line, parent_coords) + line_poly = polygon_from_baseline(line_base, 30/zoom) else: - segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) - line_polygons.append(prep(segment_polygon)) - segment_polygon = np.array(segment_polygon.exterior.coords, int)[:-1] - # draw.polygon: If any segment_polygon lies outside of parent + line_poly = coordinates_of_segment(line, parent_image, parent_coords) + line_poly = make_valid(Polygon(line_poly)) + line_polygons.append(line_poly) + line_polygons = list(map(prep, line_polygons)) + for i, line_polygon in enumerate(line_polygons): + polygon = np.array(line_polygon.context.exterior.coords, int)[:-1] + # draw.polygon: If any line_polygon lies outside of parent # (causing negative/above-max indices), either fully or partially, # then this will silently ignore them. 
The caller does not need # to concern herself with this. - segment_y, segment_x = draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - parent_bin.shape) - line_labels[i, segment_y, segment_x] = True + line_y, line_x = draw.polygon(polygon[:, 1], + polygon[:, 0], + parent_bin.shape) + line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines - for i, segment in enumerate(set(line.parent_object_ for line in lines)): + for i, region in enumerate(set(line.parent_object_ for line in lines)): LOG.debug('unmasking area of text region "%s" for "%s"', - segment.id, page_id if fullpage else parent.id) - segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - segment_polygon = make_valid(Polygon(segment_polygon)).buffer(margin) - segment_polygon = np.array(segment_polygon.exterior.coords, int)[:-1] - ignore_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], + region.id, page_id if fullpage else parent.id) + region_polygon = coordinates_of_segment(region, parent_image, parent_coords) + region_polygon = make_valid(Polygon(region_polygon)) + region_polygon = np.array(region_polygon.exterior.coords, int)[:-1] + ignore_bin[draw.polygon(region_polygon[:, 1], + region_polygon[:, 0], parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): @@ -295,11 +296,10 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_line_polygons, new_line_labels = masks2polygons( new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), min_area=640/zoom/zoom) - DSAVE('line_labels', [np.mean(line_labels, axis=0), parent_bin]) + DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) - new_line_polygons, new_baselines = list(zip(*[ - (make_valid(Polygon(line_poly)), LineString(baseline)) - for _, line_poly, baseline in new_line_polygons])) or 
([], []) + new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) + for _, poly, base in new_line_polygons])) or ([], []) # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line @@ -375,7 +375,6 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l keep1[ind1] = False #keep2[ind2] = False # validate assignments retain enough area and do not loose unassigned matches - line_polygons = [poly.context.buffer(-margin) for poly in line_polygons] for j, line in enumerate(lines): new_lines = np.nonzero(assignments == j)[0] if not np.prod(new_lines.shape): @@ -404,9 +403,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # combine all assigned new lines to single outline polygon if len(new_lines) > 1: LOG.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) - new_polygon = join_polygons([intersections[(i, j)] + new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) - line_polygons[j] = new_polygon new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: @@ -422,6 +420,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_baseline = coordinates_for_segment(new_baseline.coords, parent_image, parent_coords) line.set_Baseline(BaselineType(points=points_from_polygon(new_baseline))) + line_polygons[j] = prep(new_polygon) # now also ensure the assigned lines do not overlap other existing lines for i in new_lines: for otherj in np.nonzero(fits_fg[i] > 0.1)[0]: @@ -429,7 +428,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l continue otherline = lines[otherj] LOG.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) - other_polygon = diff_polygons(line_polygons[otherj], 
new_polygon) + other_polygon = diff_polygons(line_polygons[otherj].context, new_polygon) if other_polygon.is_empty: continue # convert back to absolute (page) coordinates: From 6e95b3847ec5532c039062062129ce3c1c1a6bf7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 6 May 2024 13:15:25 +0200 Subject: [PATCH 066/194] tests: update data_url (after force-push upstream) --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 7e2824f2..199e2a7b 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/l1.1.19/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.4.3/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From b6c89572f5b5e78b181e5e28660597fef055ae3b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 29 May 2024 17:08:13 +0200 Subject: [PATCH 067/194] resegment: expose parameter spread (analogous to segment) --- ocrd_cis/ocrd-tool.json | 6 ++++++ ocrd_cis/ocropy/resegment.py | 17 +++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index be763142..7e5203c1 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -222,6 +222,12 @@ "description": "share of foreground pixels that must be retained by the output polygons", "default": 0.75 }, + "spread": { + "type": "number", + "format": "float", + "description": "distance in points (pt) from the foreground to project textline labels into the background for polygonal contours; if zero, project half a scale/capheight", + "default": 2.4 + }, "extend_margins": { "type": "number", "format": "integer", diff --git a/ocrd_cis/ocropy/resegment.py 
b/ocrd_cis/ocropy/resegment.py index ddb8fcb5..2495cadd 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -184,6 +184,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l LOG = getLogger('processor.OcropyResegment') threshold = self.parameter['min_fraction'] method = self.parameter['method'] + maxdist = self.parameter['spread']/zoom*300/72 # in pt # prepare line segmentation parent_array = pil2array(parent_image) #parent_array, _ = common.binarize(parent_array, maxskew=0) # just in case still raw @@ -273,19 +274,19 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_labels[line_labels[i]] = i + 1 continue line_baseline = baseline_of_segment(line, parent_coords) - line_polygon = polygon_from_baseline(line_baseline, scale) + line_polygon = polygon_from_baseline(line_baseline, maxdist or scale/2) line_polygon = np.array(line_polygon.exterior.coords, int)[:-1] line_y, line_x = draw.polygon(line_polygon[:, 1], line_polygon[:, 0], parent_bin.shape) new_labels[line_y, line_x] = i + 1 spread_dist(lines, line_labels, new_labels, parent_bin, components, parent_coords, - scale=scale, loc=parent.id, threshold=threshold) + maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) return try: new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( - parent_bin, seps=ignore_bin, zoom=zoom, fullpage=fullpage, - maxseps=0, maxcolseps=len(ignore), maximages=0) + parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, + fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: LOG.error('Cannot line-segment %s "%s": %s', tag, page_id if fullpage else parent.id, err) @@ -441,7 +442,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l otherline.get_Coords().set_points(points_from_polygon(other_polygon)) def spread_dist(lines, old_labels, new_labels, binarized, components, coords, - 
scale=43, loc='', threshold=0.9): + maxdist=43, loc='', threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" LOG = getLogger('processor.OcropyResegment') DSAVE('seeds', [new_labels, (components>0)]) @@ -452,13 +453,13 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=(components > 0)) DSAVE('propagated', new_labels2) # dilate/grow labels from connected components against each other and bg - new_labels = morph.spread_labels(new_labels2, maxdist=scale*2) + new_labels = morph.spread_labels(new_labels2, maxdist=maxdist) DSAVE('spread', new_labels) # now propagate again to catch smallest components like punctuation new_labels2 = morph.propagate_labels(binarized, new_labels, conflict=0) new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=binarized) DSAVE('propagated-again', [new_labels2, binarized & (new_labels2==0)]) - new_labels = morph.spread_labels(new_labels2, maxdist=scale/2) + new_labels = morph.spread_labels(new_labels2, maxdist=maxdist/4) DSAVE('spread-again', [new_labels, binarized]) # find polygon hull and modify line coords for i, line in enumerate(lines): @@ -496,7 +497,7 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, # get alpha shape poly = join_polygons([make_valid(Polygon(contour)) for contour in contours], - loc=line.id, scale=scale) + loc=line.id, scale=maxdist) poly = poly.exterior.coords[:-1] polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) From 3346b4e2b5181398676b5a476e18c3866dbb6306 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 30 May 2024 01:14:33 +0200 Subject: [PATCH 068/194] test assets: workaround for core#1189 / gt_structure_text#2 --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/test_lib.bash b/tests/test_lib.bash index 199e2a7b..f28acb1e 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.4.3/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.2.4/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From 38ce45bf016546b748cce65031cad3fe24a35c0d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 30 May 2024 08:23:40 +0200 Subject: [PATCH 069/194] CircleCI: install JRE --- .circleci/config.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 470197da..35f0a966 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -8,6 +8,7 @@ jobs: PYTHON: python3 steps: - checkout + - run: apt-get update && apt-get -y install default-jre-headless - run: make install - run: make test V="" @@ -20,7 +21,7 @@ jobs: - checkout - setup_remote_docker: # https://circleci.com/docs/2.0/building-docker-images/ docker_layer_caching: true - - run: make docker TAG=$DOCKER_TAG + - run: make docker-build TAG=$DOCKER_TAG - run: name: Login to Docker Hub command: echo "$DOCKERHUB_PASS" | docker login --username "$DOCKERHUB_USER" --password-stdin From 8d65708cc8ee6f42d00796d9dc1ed441b7cd7474 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 5 Jul 2024 13:26:38 +0200 Subject: [PATCH 070/194] resegment: fix 2 edge cases --- ocrd_cis/ocropy/resegment.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2495cadd..a337b5e0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -262,7 +262,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for i, label in 
enumerate(labels): distances[i] = morph.dist_labels(label.astype(np.uint8)) # normalize the distances of all lines so larger ones do not displace smaller ones - distances[i] = distances[i] / distances[i].max() * 255 + if distances[i].any(): + distances[i] = distances[i] / distances[i].max() * 255 # use depth to flatten overlapping lines as seed labels new_labels = np.argmax(distances, axis=0) else: @@ -496,7 +497,8 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, else: # get alpha shape poly = join_polygons([make_valid(Polygon(contour)) - for contour in contours], + for contour in contours + if len(contour) >= 4], loc=line.id, scale=maxdist) poly = poly.exterior.coords[:-1] polygon = coordinates_for_segment(poly, None, coords) From eb4efe1e7bd21e9a1cf5d7c18dcca4d868a92f66 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 5 Jul 2024 13:33:30 +0200 Subject: [PATCH 071/194] ocrd-tool.json: add Ocropy default model resources --- ocrd_cis/ocrd-tool.json | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index 7e5203c1..a93917da 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -308,7 +308,33 @@ "content-type": "application/gzip", "description": "ocropy model to apply (e.g. 
fraktur.pyrnn.gz)" } - } + }, + "resources": [ + { + "url": "https://github.com/zuphilip/ocropy-models/raw/master/en-default.pyrnn.gz", + "name": "en-default.pyrnn.gz", + "description": "Default ocropy model for English", + "size": 83826134 + }, + { + "url": "https://github.com/zuphilip/ocropy-models/raw/master/fraktur.pyrnn.gz", + "name": "fraktur.pyrnn.gz", + "description": "Default ocropy fraktur model", + "size": 43882365 + }, + { + "url": "https://github.com/jze/ocropus-model_fraktur/raw/master/fraktur.pyrnn.gz", + "name": "fraktur-jze.pyrnn.gz", + "description": "ocropy fraktur model by github.com/jze", + "size": 2961298 + }, + { + "url": "https://github.com/chreul/OCR_Testdata_EarlyPrintedBooks/raw/master/LatinHist-98000.pyrnn.gz", + "name": "LatinHist.pyrnn.gz", + "description": "ocropy historical latin model by github.com/chreul", + "size": 16989864 + } + ] }, "ocrd-cis-ocropy-segment": { "executable": "ocrd-cis-ocropy-segment", From 842b4c25e5cd1529aaa533dc0b5f552c16c53c1a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 5 Jul 2024 13:47:03 +0200 Subject: [PATCH 072/194] docker: adapt to core using /build already --- Dockerfile | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Dockerfile b/Dockerfile index 71e8b09f..efffa9d9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -24,14 +24,14 @@ RUN sed -i -e 's/# en_US.UTF-8 UTF-8/en_US.UTF-8 UTF-8/' /etc/locale.gen \ FROM base AS profiler RUN apt-get update \ && apt-get -y install --no-install-recommends cmake g++ libcppunit-dev libxerces-c-dev \ - && git clone ${GITURL}/Profiler --branch devel --single-branch /build \ - && cd /build \ + && git clone ${GITURL}/Profiler --branch devel --single-branch /build/Profiler \ + && pushd /build/Profiler \ && cmake -DCMAKE_BUILD_TYPE=release . 
\ && make compileFBDic trainFrequencyList runDictSearch profiler \ && mkdir /apps \ && cp bin/compileFBDic bin/trainFrequencyList bin/profiler bin/runDictSearch /apps/ \ - && cd / \ - && rm -rf /build + && popd \ + && rm -rf /build/Profiler FROM profiler AS languagemodel # install the profiler's language backend @@ -40,13 +40,13 @@ COPY --from=profiler /apps/trainFrequencyList /apps/ COPY --from=profiler /apps/runDictSearch /apps/ RUN apt-get update \ && apt-get -y install --no-install-recommends icu-devtools \ - && git clone ${GITURL}/Resources --branch master --single-branch /build \ - && cd /build/lexica \ + && git clone ${GITURL}/Resources --branch master --single-branch /build/Resources \ + && pushd /build/Resources/lexica \ && PATH=$PATH:/apps make \ && PATH=$PATH:/apps make test \ && PATH=$PATH:/apps make install \ - && cd / \ - && rm -rf /build + && popd \ + && rm -rf /build/Resources FROM base AS postcorrection # install ocrd_cis (python) @@ -56,13 +56,13 @@ COPY --from=profiler /apps/profiler /apps/ COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicuuc.so /usr/lib//x86_64-linux-gnu/ COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicudata.so /usr/lib//x86_64-linux-gnu/ COPY --from=profiler /usr/lib//x86_64-linux-gnu/libxerces-c-3.2.so /usr/lib//x86_64-linux-gnu/ -COPY . /build +COPY . /build/ocrd_cis RUN apt-get update \ && apt-get -y install --no-install-recommends gcc wget default-jre-headless \ - && cd /build \ + && pushd /build/ocrd_cis \ && make install \ # test always fail, resources not available for download. Resources should be made available # somewhere else, e.g. 
github.com/OCR-D/assets # && make test \ - && cd / \ - && rm -rf /build + && popd \ + && rm -rf /build/ocrd_cis From 53ae7d69fac017100bcdae2573d643a28c6a8f84 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 5 Jul 2024 17:28:25 +0200 Subject: [PATCH 073/194] use importlib instead of pkg_resources via ocrd_utils --- ocrd_cis/ocrd_tool.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocrd_tool.py b/ocrd_cis/ocrd_tool.py index 8cd184fb..0f06e55f 100644 --- a/ocrd_cis/ocrd_tool.py +++ b/ocrd_cis/ocrd_tool.py @@ -1,5 +1,5 @@ import json -from pkg_resources import resource_string +from ocrd_utils import resource_string def get_ocrd_tool(): diff --git a/setup.py b/setup.py index a5e19979..fcdf0a44 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=2.30', + 'ocrd>=2.47', 'click', 'scipy', 'numpy>=1.17.0', From fed84da7c731c7e2ed3840122df7f5345c465534 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 31 Jul 2024 15:59:17 +0200 Subject: [PATCH 074/194] fix 53ae7d69 (already str not bytes) --- ocrd_cis/ocrd_tool.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ocrd_cis/ocrd_tool.py b/ocrd_cis/ocrd_tool.py index 0f06e55f..36cb9d7e 100644 --- a/ocrd_cis/ocrd_tool.py +++ b/ocrd_cis/ocrd_tool.py @@ -3,5 +3,4 @@ def get_ocrd_tool(): - return json.loads( - resource_string(__name__, 'ocrd-tool.json').decode('utf8')) + return json.loads(resource_string(__name__, 'ocrd-tool.json')) From 5282092997ad6b2b53a3cef8c3c96fbb27066682 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 31 Jul 2024 16:03:48 +0200 Subject: [PATCH 075/194] recognize: replace python-levenshtein with rapidfuzz --- ocrd_cis/ocropy/recognize.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index e9259c6e..74d858ab 100644 --- 
a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -5,7 +5,7 @@ import numpy as np from PIL import Image -import Levenshtein +from rapidfuzz.distance import Levenshtein from ocrd_utils import ( getLogger, diff --git a/setup.py b/setup.py index fcdf0a44..6df9445c 100644 --- a/setup.py +++ b/setup.py @@ -47,7 +47,7 @@ 'scikit-image', 'networkx', 'opencv-python-headless', - 'python-Levenshtein' + 'rapidfuzz' ], extras_require={ 'debug': ['matplotlib>3.0.0'], From a382d6fbcb64f2890ba4f22a38b8d1484b88e3df Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 31 Jul 2024 16:21:34 +0200 Subject: [PATCH 076/194] fix+update dockerfile --- Dockerfile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index efffa9d9..e7b2249a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ocrd/core:latest AS base +FROM ocrd/core:v2.67.2 AS base ARG VCS_REF ARG BUILD_DATE LABEL \ @@ -7,10 +7,11 @@ LABEL \ org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_cis" \ org.label-schema.build-date=$BUILD_DATE -ENV VERSION="Di 12. 
Mai 13:26:35 CEST 2020" ENV GITURL="https://github.com/cisocrgroup" ENV DOWNLOAD_URL="http://cis.lmu.de/~finkf" +SHELL ["/bin/bash", "-c"] + # deps RUN apt-get update \ && apt-get -y install --no-install-recommends locales From 2ed2c4f89ab4611d24e0a9328479124f88750ca1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 10:41:03 +0200 Subject: [PATCH 077/194] add executable property --- ocrd_cis/ocropy/binarize.py | 10 ++++++---- ocrd_cis/ocropy/clip.py | 8 +++++--- ocrd_cis/ocropy/denoise.py | 8 +++++--- ocrd_cis/ocropy/deskew.py | 6 +++++- ocrd_cis/ocropy/dewarp.py | 10 ++++++---- ocrd_cis/ocropy/recognize.py | 10 ++++++---- ocrd_cis/ocropy/resegment.py | 8 +++++--- ocrd_cis/ocropy/segment.py | 8 +++++--- ocrd_cis/ocropy/train.py | 6 +++++- 9 files changed, 48 insertions(+), 26 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 872185c3..7429d14a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -28,8 +28,6 @@ #sys.path.append(os.path.dirname(os.path.abspath(__file__))) -TOOL = 'ocrd-cis-ocropy-binarize' - def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): LOG = getLogger('processor.OcropyBinarize') LOG.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) @@ -71,13 +69,17 @@ class OcropyBinarize(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyBinarize, self).__init__(*args, **kwargs) if hasattr(self, 'output_file_grp'): # processing context self.setup() - + + @property + def executable(self): + return 'ocrd-cis-ocropy-binarize' + def setup(self): self.logger = getLogger('processor.OcropyBinarize') if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy': diff --git 
a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index a305f09e..919b26b0 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -31,16 +31,18 @@ pil2array, array2pil ) -TOOL = 'ocrd-cis-ocropy-clip' - class OcropyClip(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyClip, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-clip' + def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index cbbdf8cf..ac3c4dc5 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -19,16 +19,18 @@ # binarize, remove_noise) -TOOL = 'ocrd-cis-ocropy-denoise' - class OcropyDenoise(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyDenoise, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-denoise' + def process(self): """Despeckle the pages / regions / lines of the workspace. 
diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 4ed04218..fe61fce3 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -34,10 +34,14 @@ class OcropyDeskew(Processor): def __init__(self, *args, **kwargs): ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] super(OcropyDeskew, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-deskew' + def process(self): """Deskew the pages or regions of the workspace. diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 7d3251bf..1bc4a805 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -24,8 +24,6 @@ #sys.path.append(os.path.dirname(os.path.abspath(__file__))) -TOOL = 'ocrd-cis-ocropy-dewarp' - class InvalidLine(Exception): """Line image does not allow dewarping and should be ignored.""" @@ -72,13 +70,17 @@ class OcropyDewarp(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyDewarp, self).__init__(*args, **kwargs) if hasattr(self, 'output_file_grp'): # processing context self.setup() - + + @property + def executable(self): + return 'ocrd-cis-ocropy-dewarp' + def setup(self): # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 74d858ab..5734aa92 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -30,8 +30,6 @@ check_line ) -TOOL = 'ocrd-cis-ocropy-recognize' - def resize_keep_ratio(image, baseheight=48): scale = baseheight / image.height wsize = round(image.width * scale) @@ -85,13 +83,17 @@ def __init__(self, *args, **kwargs): self.ocrd_tool = 
get_ocrd_tool() self.pad = 16 # ocropus-rpred default self.network = None # set in process - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropyRecognize, self).__init__(*args, **kwargs) if hasattr(self, 'output_file_grp'): # processing context self.setup() - + + @property + def executable(self): + return 'ocrd-cis-ocropy-recognize' + def setup(self): self.logger = getLogger('processor.OcropyRecognize') # from ocropus-rpred: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index a337b5e0..2b1f73c3 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -46,16 +46,18 @@ diff_polygons ) -TOOL = 'ocrd-cis-ocropy-resegment' - class OcropyResegment(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super().__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-resegment' + def process(self): """Resegment lines of the workspace. diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 49cb6776..1624597e 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -58,8 +58,6 @@ lines2regions ) -TOOL = 'ocrd-cis-ocropy-segment' - def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): """Convert label masks into polygon coordinates. 
@@ -248,10 +246,14 @@ class OcropySegment(Processor): def __init__(self, *args, **kwargs): self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][TOOL] + kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] super(OcropySegment, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-ocropy-segment' + def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index d257a61f..46e9d258 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -32,13 +32,17 @@ class OcropyTrain(Processor): def __init__(self, *args, **kwargs): self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-ocropy-train'] + kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] super(OcropyTrain, self).__init__(*args, **kwargs) if hasattr(self, 'input_file_grp'): # processing context self.setup() + @property + def executable(self): + return 'ocrd-cis-ocropy-train' + def setup(self): self.log = getLogger('processor.OcropyTrain') #print(self.parameter) From 61e6caf06ff479d4e6a8c59d85254d5a25fa79e4 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 10:54:46 +0200 Subject: [PATCH 078/194] add setup method if missing --- ocrd_cis/ocropy/binarize.py | 10 ++++++---- ocrd_cis/ocropy/clip.py | 5 +++++ ocrd_cis/ocropy/denoise.py | 5 +++++ ocrd_cis/ocropy/deskew.py | 5 +++++ ocrd_cis/ocropy/dewarp.py | 4 +++- ocrd_cis/ocropy/recognize.py | 4 +++- ocrd_cis/ocropy/resegment.py | 5 +++++ ocrd_cis/ocropy/segment.py | 5 +++++ ocrd_cis/ocropy/train.py | 2 +- 9 files changed, 38 insertions(+), 7 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7429d14a..f42ff2bd 100644 --- a/ocrd_cis/ocropy/binarize.py +++ 
b/ocrd_cis/ocropy/binarize.py @@ -68,6 +68,7 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo class OcropyBinarize(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyBinarize') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -81,10 +82,11 @@ def executable(self): return 'ocrd-cis-ocropy-binarize' def setup(self): - self.logger = getLogger('processor.OcropyBinarize') - if self.parameter['grayscale'] and self.parameter['method'] != 'ocropy': - self.logger.critical('requested method %s does not support grayscale normalized output', - self.parameter['method']) + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + method = self.parameter['method'] + if self.parameter['grayscale'] and method != 'ocropy': + self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise Exception('only method=ocropy allows grayscale=true') def process(self): diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 919b26b0..d11b8eae 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -34,6 +34,7 @@ class OcropyClip(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyClip') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -43,6 +44,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-clip' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. 
diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index ac3c4dc5..fc1b582e 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -22,6 +22,7 @@ class OcropyDenoise(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyDenoise') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -31,6 +32,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-denoise' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Despeckle the pages / regions / lines of the workspace. diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index fe61fce3..1ffaec62 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -33,6 +33,7 @@ def deskew(pil_image, maxskew=2): class OcropyDeskew(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyDeskew') ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] @@ -42,6 +43,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-deskew' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Deskew the pages or regions of the workspace. 
diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 1bc4a805..89a62e11 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -69,6 +69,7 @@ def padvert(image, range_): class OcropyDewarp(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyDewarp') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -82,6 +83,8 @@ def executable(self): return 'ocrd-cis-ocropy-dewarp' def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( params=(self.parameter['range'], @@ -91,7 +94,6 @@ def setup(self): # dependency between smoothness # and extra params) 0.3)) - self.logger = getLogger('processor.OcropyDewarp') def process(self): """Dewarp the lines of the workspace. diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 5734aa92..fdeaed27 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -80,6 +80,7 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyRecognize') self.ocrd_tool = get_ocrd_tool() self.pad = 16 # ocropus-rpred default self.network = None # set in process @@ -95,7 +96,8 @@ def executable(self): return 'ocrd-cis-ocropy-recognize' def setup(self): - self.logger = getLogger('processor.OcropyRecognize') + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) for x in self.network.walk(): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2b1f73c3..d9a92390 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -49,6 
+49,7 @@ class OcropyResegment(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropyResegment') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -58,6 +59,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-resegment' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Resegment lines of the workspace. diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 1624597e..7488eefe 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -245,6 +245,7 @@ def getx(xy): class OcropySegment(Processor): def __init__(self, *args, **kwargs): + self.logger = getLogger('processor.OcropySegment') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -254,6 +255,10 @@ def __init__(self, *args, **kwargs): def executable(self): return 'ocrd-cis-ocropy-segment' + def setup(self): + assert_file_grp_cardinality(self.input_file_grp, 1) + assert_file_grp_cardinality(self.output_file_grp, 1) + def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. 
diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 46e9d258..25317c4d 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -30,6 +30,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): def __init__(self, *args, **kwargs): + self.log = getLogger('processor.OcropyTrain') self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] @@ -44,7 +45,6 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): - self.log = getLogger('processor.OcropyTrain') #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] From a0965c2aa7d6315f001606bc1c6043a020095ef9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 14:02:55 +0200 Subject: [PATCH 079/194] add self.logger wherever missing --- ocrd_cis/ocropy/clip.py | 20 +++--- ocrd_cis/ocropy/denoise.py | 16 ++--- ocrd_cis/ocropy/deskew.py | 14 ++-- ocrd_cis/ocropy/resegment.py | 74 +++++++++---------- ocrd_cis/ocropy/segment.py | 136 ++++++++++++++++++----------------- 5 files changed, 129 insertions(+), 131 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index d11b8eae..4c0eebea 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -83,13 +83,12 @@ def process(self): # too. However, region-level clipping _must_ be run before region-level # deskewing, because that would make segments incomensurable with their # neighbours. 
- LOG = getLogger('processor.OcropyClip') level = self.parameter['level-of-operation'] assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -105,7 +104,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -127,7 +126,7 @@ def process(self): page.get_TableRegion() + page.get_UnknownRegion()) if not num_texts: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) background = ImageStat.Stat(page_image) # workaround for Pillow#4925 if len(background.bands) > 1: @@ -158,7 +157,7 @@ def process(self): if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). 
- LOG.warning('Page "%s" region "%s" already contains image data: skipping', + self.logger.warning('Page "%s" region "%s" already contains image data: skipping', page_id, region.id) continue shape = prep(shapes[i]) @@ -176,7 +175,7 @@ def process(self): # level == 'line': lines = region.get_TextLine() if not lines: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) continue region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords, feature_selector='binarized') @@ -194,7 +193,7 @@ def process(self): for j, line in enumerate(lines): if line.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). - LOG.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', + self.logger.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', page_id, region.id, line.id) continue shape = prep(shapes[j]) @@ -219,13 +218,12 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, page_id, file_id): - LOG = getLogger('processor.OcropyClip') # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( @@ -237,7 +235,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, segment_bbox = bbox_from_polygon(segment_polygon) for neighbour, neighbour_mask in neighbours: if not np.any(segment_mask > neighbour_mask): - LOG.info('Ignoring enclosing neighbour "%s" of segment "%s" on 
page "%s"', + self.logger.info('Ignoring enclosing neighbour "%s" of segment "%s" on page "%s"', neighbour.id, segment.id, page_id) continue # find connected components that (only) belong to the neighbour: @@ -247,7 +245,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: continue - LOG.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', + self.logger.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', segment.id, neighbour.id, num_intruders, num_foreground, page_id) # suppress in segment_mask so these intruders can stay in the neighbours # (are not removed from both sides) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index fc1b582e..d6a4f7ff 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -57,13 +57,12 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - LOG = getLogger('processor.OcropyDenoise') level = self.parameter['level-of-operation'] assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -80,7 +79,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -91,7 +90,7 @@ def process(self): else: regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, @@ -102,7 +101,7 @@ def process(self): continue lines = region.get_TextLine() if not lines: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, @@ -121,15 +120,14 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, file_id): - LOG = getLogger('processor.OcropyDenoise') if not segment_image.width or not 
segment_image.height: - LOG.warning("Skipping '%s' with zero size", file_id) + self.logger.warning("Skipping '%s' with zero size", file_id) return - LOG.info("About to despeckle '%s'", file_id) + self.logger.info("About to despeckle '%s'", file_id) bin_image = remove_noise(segment_image, maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt # update METS (add the image file): diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 1ffaec62..63bb6b97 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -65,13 +65,12 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropyDeskew') level = self.parameter['level-of-operation'] assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -95,7 +94,7 @@ def process(self): else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) for region in regions: # process region: region_image, region_coords = self.workspace.image_from_segment( @@ -118,23 +117,22 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id, file_id): - LOG = getLogger('processor.OcropyDeskew') if not 
segment_image.width or not segment_image.height: - LOG.warning("Skipping %s with zero size", segment_id) + self.logger.warning("Skipping %s with zero size", segment_id) return angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image - LOG.info("About to deskew %s", segment_id) + self.logger.info("About to deskew %s", segment_id) angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] segment.set_orientation(orientation) # also removes all deskewed AlternativeImages - LOG.info("Found angle for %s: %.1f", segment_id, angle) + self.logger.info("Found angle for %s: %.1f", segment_id, angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index d9a92390..2261cf3e 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -105,7 +105,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropyResegment') # This makes best sense for bad/coarse line segmentation, like current GT # or as postprocessing for bbox-only steps like Tesseract. 
# Most notably, it can convert rectangles to polygons (polygonalization), @@ -120,7 +119,7 @@ def process(self): assert_file_grp_cardinality(self.output_file_grp, 1) for n, input_file in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -136,7 +135,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -156,14 +155,14 @@ def process(self): page.get_CustomRegion()) regions = page.get_AllRegions(classes=['Text']) if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) elif level == 'page': lines = [line for region in regions for line in region.get_TextLine()] if lines: self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) else: - LOG.warning('Page "%s" contains no text regions with lines', page_id) + self.logger.warning('Page "%s" contains no text regions with lines', page_id) else: for region in regions: lines = region.get_TextLine() @@ -172,7 +171,7 @@ def process(self): region, page_image, page_coords, feature_selector='binarized') self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - LOG.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) # update METS (add the PAGE file): file_path = os.path.join(self.output_file_grp, file_id + '.xml') @@ -184,11 +183,10 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, 
content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): - LOG = getLogger('processor.OcropyResegment') threshold = self.parameter['min_fraction'] method = self.parameter['method'] maxdist = self.parameter['spread']/zoom*300/72 # in pt @@ -206,7 +204,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l fullpage = False report = check_region(parent_bin, zoom) if report: - LOG.warning('Invalid %s "%s": %s', tag, + self.logger.warning('Invalid %s "%s": %s', tag, page_id if fullpage else parent.id, report) return # get existing line labels: @@ -234,7 +232,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): - LOG.debug('unmasking area of text region "%s" for "%s"', + self.logger.debug('unmasking area of text region "%s" for "%s"', region.id, page_id if fullpage else parent.id) region_polygon = coordinates_of_segment(region, parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) @@ -244,14 +242,14 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): - LOG.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], + self.logger.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], segment.id, page_id if fullpage else parent.id) segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': - 
LOG.debug('calculating connected component and distance transforms for "%s"', parent.id) + self.logger.debug('calculating connected component and distance transforms for "%s"', parent.id) bin = parent_bin & ~ ignore_bin components, _ = morph.label(bin) # estimate glyph scale (roughly) @@ -260,7 +258,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l counts = np.sqrt(3 * counts) scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) components *= (counts > 15/zoom)[components] - LOG.debug("estimated scale: %d", scale) + self.logger.debug("estimated scale: %d", scale) else: scale = 43 if method == 'ccomps': @@ -278,7 +276,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_labels = np.zeros_like(parent_bin, np.uint8) for i, line in enumerate(lines): if line.Baseline is None: - LOG.warning("Skipping '%s' without baseline", line.id) + self.logger.warning("Skipping '%s' without baseline", line.id) new_labels[line_labels[i]] = i + 1 continue line_baseline = baseline_of_segment(line, parent_coords) @@ -289,22 +287,23 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape) new_labels[line_y, line_x] = i + 1 spread_dist(lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) + maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold, logger=self.logger) return try: + # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: - LOG.error('Cannot line-segment %s "%s": %s', + self.logger.error('Cannot line-segment %s "%s": %s', tag, page_id if fullpage else parent.id, err) return - LOG.info("Found %d new line 
labels for %d existing lines on %s '%s'", + self.logger.info("Found %d new line labels for %d existing lines on %s '%s'", new_line_labels.max(), len(lines), tag, parent.id) # polygonalize and prepare comparison new_line_polygons, new_line_labels = masks2polygons( new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), - min_area=640/zoom/zoom) + min_area=640/zoom/zoom, logger=self.logger) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) @@ -387,41 +386,41 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for j, line in enumerate(lines): new_lines = np.nonzero(assignments == j)[0] if not np.prod(new_lines.shape): - LOG.debug("no lines for '%s' match or fit", line.id) + self.logger.debug("no lines for '%s' match or fit", line.id) continue covers = np.sum(covers_bg[new_lines,j]) if covers < threshold / 3: - LOG.debug("new lines for '%s' only cover %.1f%% bg", + self.logger.debug("new lines for '%s' only cover %.1f%% bg", line.id, covers * 100) continue covers = np.sum(covers_fg[new_lines,j]) if covers < threshold: - LOG.debug("new lines for '%s' only cover %.1f%% fg", + self.logger.debug("new lines for '%s' only cover %.1f%% fg", line.id, covers * 100) continue looses = (assignments < 0) & (covers_bg[:,j] > 0.1) if looses.any(): covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) - LOG.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", + self.logger.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", line.id, np.count_nonzero(looses), covers * 100) continue line_count = np.count_nonzero(line_labels[j] & parent_bin) new_count = covers * line_count - LOG.debug('Black pixels before/after resegment of line "%s": %d/%d', + self.logger.debug('Black pixels before/after resegment of 
line "%s": %d/%d', line.id, line_count, new_count) # combine all assigned new lines to single outline polygon if len(new_lines) > 1: - LOG.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) + self.logger.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id) + for i in new_lines], loc=line.id, logger=self.logger) # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, line.parent_object_) if line_polygon is None: - LOG.warning("Ignoring extant new polygon for line '%s'", line.id) + self.logger.warning("Ignoring extant new polygon for line '%s'", line.id) return # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) @@ -436,7 +435,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if j == otherj: continue otherline = lines[otherj] - LOG.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) + self.logger.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) other_polygon = diff_polygons(line_polygons[otherj].context, new_polygon) if other_polygon.is_empty: continue @@ -445,14 +444,15 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: - LOG.warning("Ignoring extant new polygon for line '%s'", otherline.id) + self.logger.warning("Ignoring extant new polygon for line '%s'", otherline.id) continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) def spread_dist(lines, old_labels, 
new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9): + maxdist=43, loc='', threshold=0.9, logger = None): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" - LOG = getLogger('processor.OcropyResegment') + if not logger: + raise ValueError(f"Logger has not been passed by the caller") DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently # (ignoring smallest components like punctuation) @@ -477,29 +477,29 @@ def spread_dist(lines, old_labels, new_labels, binarized, components, coords, continue count = np.count_nonzero(old_label) if not count: - LOG.warning("skipping zero-area line '%s'", line.id) + logger.warning("skipping zero-area line '%s'", line.id) continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - LOG.debug("new line for '%s' only covers %.1f%% bg", + logger.debug("new line for '%s' only covers %.1f%% bg", line.id, covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: - LOG.warning("skipping binary-empty line '%s'", line.id) + logger.warning("skipping binary-empty line '%s'", line.id) continue covers = np.count_nonzero(new_label * binarized) / count if covers < threshold: - LOG.debug("new line for '%s' only covers %.1f%% fg", + logger.debug("new line for '%s' only covers %.1f%% fg", line.id, covers * 100) continue - LOG.debug('Black pixels before/after resegment of line "%s": %d/%d', + logger.debug('Black pixels before/after resegment of line "%s": %d/%d', line.id, count, covers * count) contours = [contour[:,::-1] # get x,y order again for contour, area in morph.find_contours(new_label)] #LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: - LOG.warning("no contours for %s - keeping", line.id) + logger.warning("no contours for %s - keeping", line.id) continue else: # get alpha shape @@ -511,7 +511,7 @@ def spread_dist(lines, old_labels, 
new_labels, binarized, components, coords, polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) if polygon is None: - LOG.warning("Ignoring extant line for %s", line.id) + logger.warning("Ignoring extant line for %s", line.id) continue line.get_Coords().set_points(points_from_polygon(polygon)) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 7488eefe..35f309b6 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -58,7 +58,7 @@ lines2regions ) -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): +def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True, logger=None): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -75,7 +75,8 @@ def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=N - these polygons as a list of label, polygon, baseline tuples, and - a Numpy array of new background labels for that list. 
""" - LOG = getLogger('processor.OcropySegment') + if not logger: + raise ValueError(f"Logger has not been passed by the caller") # find sharp baseline if baselines is not None: def getx(xy): @@ -92,7 +93,7 @@ def getx(xy): bg_mask = np.array(bg_labels == label, bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground - LOG.debug('skipping label %d in %s due to empty fg', + logger.debug('skipping label %d in %s due to empty fg', label, name) continue # simplify to convex hull @@ -101,7 +102,7 @@ def getx(xy): conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): - LOG.debug('Cannot simplify %d: convex hull would create additional intersections %s', + logger.debug('Cannot simplify %d: convex hull would create additional intersections %s', label, str(conflicts)) else: bg_mask = hull @@ -130,7 +131,7 @@ def getx(xy): if len(hole) < 3: idx_hole = hier[0, idx_hole, 0] continue - LOG.debug("label %d contour %d [%d pts] has hole %d [%d pts]", + logger.debug("label %d contour %d [%d pts] has hole %d [%d pts]", label, idx, len(contour), idx_hole, len(hole)) #plot_poly(hole, 'blue') # cut child from outside... 
@@ -172,7 +173,7 @@ def getx(xy): diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 + diff2 - LOG.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) + logger.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) contour = np.concatenate([contour[:contour_idx], cispoint1, @@ -181,7 +182,7 @@ def getx(xy): #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') - LOG.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) + logger.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) contours.append(contour) idx = hier[0, idx, 0] else: @@ -207,7 +208,7 @@ def getx(xy): contour = contours[i] area = areas[i] if min_area and area < min_area and area / total_area < 0.1: - LOG.warning('Label %d contour %d is too small (%d/%d) in %s', + logger.warning('Label %d contour %d is too small (%d/%d) in %s', label, i, area, total_area, name) continue # simplify shape: @@ -217,22 +218,22 @@ def getx(xy): # simplify and validate: polygon = Polygon(polygon) if not polygon.is_valid: - #LOG.debug(polygon.wkt) - LOG.debug(explain_validity(polygon)) + #logger.debug(polygon.wkt) + logger.debug(explain_validity(polygon)) polygon = make_valid(polygon) if not polygon.is_valid: #LOG.debug(polygon.wkt) - LOG.warning(explain_validity(polygon)) + logger.warning(explain_validity(polygon)) poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: - LOG.warning('Label %d contour %d for %s has less than 4 points', label, i, name) + logger.warning('Label %d contour %d for %s has less than 4 points', label, i, name) continue # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: base = 
join_baselines([baseline.intersection(polygon) for baseline in baselines - if baseline.intersects(polygon)], name) + if baseline.intersects(polygon)], name, logger) if base is not None: base = base.coords else: @@ -324,7 +325,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - LOG = getLogger('processor.OcropySegment') # FIXME: allow passing a-priori info on reading order / textline order # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture # of different scripts; also, vertical writing needs internal rotation @@ -339,7 +339,7 @@ def process(self): assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): - LOG.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) + self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) pcgts = page_from_file(self.workspace.download_file(input_file)) @@ -356,7 +356,7 @@ def process(self): dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 - LOG.info('Page "%s" uses %f DPI', page_id, dpi) + self.logger.info('Page "%s" uses %f DPI', page_id, dpi) zoom = 300.0/dpi else: zoom = 1 @@ -393,7 +393,7 @@ def process(self): if regions: # page is already region-segmented if overwrite_regions: - LOG.info('removing existing TextRegions in page "%s"', page_id) + self.logger.info('removing existing TextRegions in page "%s"', page_id) # we could remove all other region types as well, # but this is more flexible (for workflows with # specialized separator/image/table detectors): @@ -401,7 +401,7 @@ def process(self): page.set_ReadingOrder(None) ro = None else: - LOG.warning('keeping existing TextRegions in page "%s"', page_id) + self.logger.warning('keeping existing TextRegions in page "%s"', page_id) ignore.extend(regions) # create reading order if necessary if not ro or overwrite_order: @@ -425,20 +425,20 @@ 
def process(self): ignore.extend(page.get_TextRegion()) regions = list(page.get_TableRegion()) if not regions: - LOG.warning('Page "%s" contains no table regions', page_id) + self.logger.warning('Page "%s" contains no table regions', page_id) for region in regions: subregions = region.get_TextRegion() if subregions: # table is already cell-segmented if overwrite_regions: - LOG.info('removing existing TextRegions in table "%s"', region.id) + self.logger.info('removing existing TextRegions in table "%s"', region.id) region.set_TextRegion([]) roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(roelem) + reading_order[region.id] = page_subgroup_in_reading_order(roelem, self.logger) else: - LOG.warning('skipping table "%s" with existing TextRegions', region.id) + self.logger.warning('skipping table "%s" with existing TextRegions', region.id) continue # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -449,24 +449,24 @@ def process(self): # create reading order group if necessary roelem = reading_order.get(region.id) if not roelem: - LOG.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", + self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", page_id, region.id, "no target to add cells to") elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem) + roelem = page_subgroup_in_reading_order(roelem, self.logger) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - LOG.warning("Page '%s' table region '%s' already has an ordered group (%s)", + self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", page_id, region.id, "cells will be appended") elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - LOG.warning("Page '%s' table region '%s' already has an unordered group (%s)", + self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", page_id, region.id, "cells will not be appended") roelem = None else: # replace regionRef(Indexed) by group with same index and ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem) + roelem = page_subgroup_in_reading_order(roelem, self.logger) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) self._process_element(region, subignore, region_image, region_coords, @@ -488,14 +488,14 @@ def process(self): region.add_TextRegion(subregion) regions.append(subregion) if not regions: - LOG.warning('Page "%s" contains no text regions', page_id) + self.logger.warning('Page "%s" contains no text regions', page_id) for region in regions: if region.get_TextLine(): if 
overwrite_lines: - LOG.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) region.set_TextLine([]) else: - LOG.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) ignore.extend(region.get_TextLine()) # TODO: also allow grayscale_normalized (try/except?) region_image, region_coords = self.workspace.image_from_segment( @@ -519,7 +519,7 @@ def process(self): local_filename=file_path, mimetype=MIMETYPE_PAGE, content=to_xml(pcgts)) - LOG.info('created file ID: %s, file_grp: %s, path: %s', + self.logger.info('created file ID: %s, file_grp: %s, path: %s', file_id, self.output_file_grp, out.local_filename) def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): @@ -540,16 +540,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, in full page/table mode, then combine all separators among them with the newly detected separators to guide region segmentation. """ - LOG = getLogger('processor.OcropySegment') if not image.width or not image.height: - LOG.warning("Skipping '%s' with zero size", element_id) + self.logger.warning("Skipping '%s' with zero size", element_id) return element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - LOG.debug('masking foreground of %s "%s" for "%s"', + self.logger.debug('masking foreground of %s "%s" for "%s"', type(segment).__name__[:-4], segment.id, element_id) # mark these segments (e.g. 
separator regions, tables, images) # for workflows where they have been detected already; @@ -583,7 +582,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - LOG.info('computing line segmentation for %s "%s"', element_name, element_id) + self.logger.info('computing line segmentation for %s "%s"', element_name, element_id) # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -601,14 +600,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - LOG.error('Cannot line-segment region "%s": %s', element_id, err) + self.logger.error('Cannot line-segment region "%s": %s', element_id, err) # as a fallback, add a single text line comprising the whole region: element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) else: - LOG.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) + self.logger.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) return - LOG.info('Found %d text lines for %s "%s"', + self.logger.info('Found %d text lines for %s "%s"', len(np.unique(line_labels)) - 1, element_name, element_id) # post-process line labels @@ -631,11 +630,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - LOG.info('Found %d text regions for %s "%s"', + self.logger.info('Found %d text regions for %s "%s"', len(np.unique(region_labels)) - 1, element_name, element_id) except Exception as err: - LOG.error('Cannot region-segment %s "%s": %s', + self.logger.error('Cannot region-segment %s "%s": %s', element_name, element_id, err) region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), 
line_labels) @@ -669,7 +668,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - LOG.debug('Region label %d is for ignored region "%s"', + self.logger.debug('Region label %d is for ignored region "%s"', region_label, region.id) continue # normal case: new lines inside new regions @@ -685,11 +684,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, regions, _ = masks2polygons(region_mask * region_label, None, element_bin, '%s "%s"' % (element_name, element_id), min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin)) + simplify=ignore_labels * ~(sep_bin), + logger=self.logger) # find contours for lines (can be non-contiguous) lines, _ = masks2polygons(region_line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom) + min_area=640/zoom/zoom, + logger=self.logger) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -698,12 +699,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_polygon = coordinates_for_segment(region_polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for region label %d', region_label) + self.logger.warning('Ignoring extant region contour for region label %d', region_label) continue # annotate result: region_no += 1 region_id = element_id + "_region%04d" % region_no - LOG.debug('Region label %d becomes ID "%s"', region_label, region_id) + self.logger.debug('Region label %d becomes ID "%s"', region_label, region_id) region = TextRegionType( id=region_id, Coords=CoordsType( 
points=points_from_polygon(region_polygon))) @@ -717,13 +718,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, line_polygon = coordinates_for_segment(line_polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, region) if line_polygon is None: - LOG.warning('Ignoring extant line contour for region label %d line label %d', + self.logger.warning('Ignoring extant line contour for region label %d line label %d', region_label, line_label) continue # annotate result: line_no += 1 line_id = region_id + "_line%04d" % line_no - LOG.debug('Line label %d becomes ID "%s"', line_label, line_id) + self.logger.debug('Line label %d becomes ID "%s"', line_label, line_id) line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: @@ -733,22 +734,22 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - LOG.info('Added region "%s" with %d lines for %s "%s"', + self.logger.info('Added region "%s" with %d lines for %s "%s"', region_id, line_no, element_name, element_id) if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... 
- LOG.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) + self.logger.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): image_polygons, _ = masks2polygons(images, None, element_bin, - '%s "%s"' % (element_name, element_id)) + '%s "%s"' % (element_name, element_id), self.logger) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for image label %d', image_label) + self.logger.warning('Ignoring extant region contour for image label %d', image_label) continue region_no += 1 # annotate result: @@ -757,17 +758,17 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, id=region_id, Coords=CoordsType( points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - LOG.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) + self.logger.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): sep_polygons, _ = masks2polygons(seplines, None, element_bin, '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False) + open_holes=True, reorder=False, logger=self.logger) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - LOG.warning('Ignoring extant region contour for separator %d', sep_label) + self.logger.warning('Ignoring extant region contour for separator %d', sep_label) continue # annotate result: region_no += 1 
@@ -795,14 +796,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # find contours around labels (can be non-contiguous): line_polygons, _ = masks2polygons(line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom) + min_area=640/zoom/zoom, logger=self.logger) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, element) if line_polygon is None: - LOG.warning('Ignoring extant line contour for line label %d', + self.logger.warning('Ignoring extant line contour for line label %d', line_label) continue # annotate result: @@ -937,8 +938,9 @@ def join_polygons(polygons, loc='', scale=20): jointp = make_valid(jointp) return jointp -def join_baselines(baselines, loc=''): - LOG = getLogger('processor.OcropyResegment') +def join_baselines(baselines, loc='', logger = None): + if not logger: + raise ValueError(f"Logger has not been passed by the caller") lines = [] for baseline in baselines: if (baseline.is_empty or @@ -955,9 +957,9 @@ def join_baselines(baselines, loc=''): elif geom.geom_type == 'MultiLineString': lines.extend(geom) else: - LOG.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) + logger.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) else: - LOG.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) + logger.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) nlines = len(lines) if nlines == 0: return None @@ -1019,7 +1021,7 @@ def join_baselines(baselines, loc=''): else: chains.append([prevl, nextl]) if len(chains) > 1: - LOG.warning("baseline merge impossible (no spanning tree) in %s", loc) + logger.warning("baseline merge impossible (no spanning tree) in %s", loc) return None assert len(chains) == 1, chains assert len(chains[0]) == nlines, chains[0] @@ -1031,7 
+1033,7 @@ def join_baselines(baselines, loc=''): coords.extend(line.normalize().coords) result = LineString(coords) if result.is_empty: - LOG.warning("baseline merge is empty in %s", loc) + logger.warning("baseline merge is empty in %s", loc) return None assert result.geom_type == 'LineString', result.wkt result = set_precision(result, 1.0) @@ -1080,7 +1082,7 @@ def page_add_to_reading_order(rogroup, region_id, index=None): index += 1 return index -def page_subgroup_in_reading_order(roelem): +def page_subgroup_in_reading_order(roelem, logger = None): """Replace given RO element by an equivalent OrderedGroup. Given a ReadingOrder element ``roelem`` (of any type), @@ -1094,12 +1096,14 @@ def page_subgroup_in_reading_order(roelem): Return the new group object. """ - LOG = getLogger('processor.OcropySegment') + if not logger: + raise ValueError(f"Logger has not been passed by the caller") + if not roelem: - LOG.error('Cannot subgroup from empty ReadingOrder element') + logger.error('Cannot subgroup from empty ReadingOrder element') return roelem if not roelem.parent_object_: - LOG.error('Cannot subgroup from orphan ReadingOrder element') + logger.error('Cannot subgroup from orphan ReadingOrder element') return roelem if isinstance(roelem, (OrderedGroupType,OrderedGroupIndexedType)) and not ( roelem.get_OrderedGroupIndexed() or From dbccae58d9213d5df4e072502a7eae8484902ef6 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 13 Aug 2024 14:57:16 +0200 Subject: [PATCH 080/194] require core >= 3.0.0a1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6df9445c..38f09abd 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=2.47', + 'ocrd>=3.0.0a1', 'click', 'scipy', 'numpy>=1.17.0', From 8557a26dc75cf858f9e6819296389f71ab972cf3 Mon Sep 17 00:00:00 2001 From: kba Date: Tue, 13 Aug 2024 15:26:32 +0200 Subject: [PATCH 081/194] port part of 
binarize to core v3 --- ocrd_cis/ocropy/binarize.py | 157 ++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 87 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index f42ff2bd..c3b4cded 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,9 +1,13 @@ from __future__ import absolute_import +import logging import os.path +import PIL import cv2 import numpy as np from PIL import Image +from os.path import join +from ocrd_models import OcrdExif #import kraken.binarization @@ -15,11 +19,10 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType + OcrdPage, to_xml, AlternativeImageType ) from ocrd import Processor -from .. import get_ocrd_tool from . import common from .common import ( pil2array, array2pil, @@ -64,18 +67,20 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 +def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: + if dpi > 0: + zoom = 300.0/dpi + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi *= 2.54 + zoom = 300.0/dpi + else: + zoom = 1 + return zoom class OcropyBinarize(Processor): - - def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyBinarize') - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyBinarize, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() + logger : logging.Logger @property def executable(self): @@ -84,16 +89,16 @@ def executable(self): def setup(self): assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) + self.logger = 
getLogger('processor.OcropyBinarize') method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise Exception('only method=ocropy allows grayscale=true') - def process(self): + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested + THEN Iterate over the PAGE-XML element hierarchy down to the requested ``level-of-operation``. Next, for each file, crop each segment image according to the layout @@ -109,80 +114,61 @@ def process(self): Reference each new image in the AlternativeImage of the element. - Produce a new output file by serialising the resulting hierarchy. + Return a PAGE-XML with AlternativeImage and the arguments for ``workspace.save_image_file``. 
""" level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) + assert self.workspace + self.logger.debug(f'Level of operation: "{level}"') - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, feature_filter='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 - - if level == 'page': - self.process_page(page, page_image, page_xywh, zoom, - input_file.pageId, file_id) - else: - if level == 'table': - regions = page.get_TableRegion() - else: # region - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh, feature_filter='binarized') - if level == 'region': - self.process_region(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue - lines = region.get_TextLine() - if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', - page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - 
line, region_image, region_xywh, feature_filter='binarized') - self.process_line(line, line_image, line_xywh, zoom, - input_file.pageId, region.id, - file_id + '_' + region.id + '_' + line.id) + page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info('Page "%s" uses %f DPI', page_id, self.parameter['dpi']) + + ret = [pcgts] + if level == 'page': + try: + ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) + except ValueError as e: + self.logger.exception(e) + else: + # TODO + raise NotImplementedError + if level == 'table': + regions = page.get_TableRegion() + else: # region + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, feature_filter='binarized') + if level == 'region': + self.process_region(region, region_image, region_xywh, zoom, + input_file.pageId, file_id + '_' + region.id) + continue + lines = region.get_TextLine() + if not lines: + self.logger.warning('Page "%s" region "%s" contains no text lines', + page_id, region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh, feature_filter='binarized') + self.process_line(line, line_image, line_xywh, zoom, + input_file.pageId, region.id, + file_id + '_' + region.id + '_' + line.id) - # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, 
file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + return ret - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id): + def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> tuple[Image.Image, str, str]: if not page_image.width or not page_image.height: - self.logger.warning("Skipping page '%s' with zero size", page_id) - return + raise ValueError("Skipping page '%s' with zero size", page_id) self.logger.info("About to binarize page '%s'", page_id) + assert self.output_file_grp + features = page_xywh['features'] if 'angle' in page_xywh and page_xywh['angle']: # orientation has already been annotated (by previous deskewing), @@ -216,13 +202,10 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id): else: file_id += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) + bin_image_path = join(self.output_file_grp, f'{file_id}.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) + return (bin_image, file_id, bin_image_path) def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id): if not region_image.width or not region_image.height: From 278b706246e24ec0fc0b5030aff6d16673bad817 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:08:10 +0200 Subject: [PATCH 082/194] move: determine_zoom to common.py --- ocrd_cis/ocropy/binarize.py | 18 ++---------------- ocrd_cis/ocropy/clip.py | 15 +++------------ ocrd_cis/ocropy/common.py | 14 +++++++++++++- ocrd_cis/ocropy/denoise.py | 15 ++++----------- ocrd_cis/ocropy/deskew.py | 6 +----- ocrd_cis/ocropy/dewarp.py | 18 ++++-------------- ocrd_cis/ocropy/resegment.py | 14 ++++---------- ocrd_cis/ocropy/segment.py | 
13 +++---------- 8 files changed, 34 insertions(+), 79 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index c3b4cded..b5e2bc7e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -7,7 +7,6 @@ import numpy as np from PIL import Image from os.path import join -from ocrd_models import OcrdExif #import kraken.binarization @@ -25,9 +24,8 @@ from . import common from .common import ( - pil2array, array2pil, # binarize, - remove_noise) + array2pil, determine_zoom, pil2array, remove_noise) #sys.path.append(os.path.dirname(os.path.abspath(__file__))) @@ -67,18 +65,6 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 -def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: - if dpi > 0: - zoom = 300.0/dpi - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - zoom = 300.0/dpi - else: - zoom = 1 - return zoom - class OcropyBinarize(Processor): logger : logging.Logger @@ -126,7 +112,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info('Page "%s" uses %f DPI', page_id, self.parameter['dpi']) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") ret = [pcgts] if level == 'page': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 4c0eebea..3b854897 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -28,8 +28,7 @@ from .ocrolib import midrange, morph from .common import ( # binarize, - pil2array, array2pil -) + array2pil, determine_zoom, pil2array) class OcropyClip(Processor): @@ -98,16 +97,8 @@ def process(self): page_image, 
page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") # FIXME: what about text regions inside table regions? regions = list(page.get_TextRegion()) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 3cb9e4c4..1804c29d 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -10,7 +10,7 @@ from skimage.morphology import medial_axis import networkx as nx from PIL import Image - +from ocrd_models import OcrdExif from . import ocrolib from .ocrolib import morph, psegutils, sl # for decorators (type-checks etc): @@ -2102,3 +2102,15 @@ def find_topological(): # rlabels[region_hull] = region # DSAVE('rlabels_closed', rlabels) return rlabels + +def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: + if dpi > 0: + zoom = 300.0/dpi + elif page_image_info.resolution != 1: + dpi = page_image_info.resolution + if page_image_info.resolutionUnit == 'cm': + dpi *= 2.54 + zoom = 300.0/dpi + else: + zoom = 1 + return zoom diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index d6a4f7ff..d8554a3e 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -17,7 +17,7 @@ from .. 
import get_ocrd_tool from .common import ( # binarize, - remove_noise) + determine_zoom, remove_noise) class OcropyDenoise(Processor): @@ -73,16 +73,9 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized' if level == 'page' else '') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 63bb6b97..055ab27d 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -17,14 +17,10 @@ from .. import get_ocrd_tool from . import common -from .common import ( - pil2array -) +from .common import pil2array #sys.path.append(os.path.dirname(os.path.abspath(__file__))) -TOOL = 'ocrd-cis-ocropy-deskew' - def deskew(pil_image, maxskew=2): array = pil2array(pil_image) _, angle = common.binarize(array, maxskew=maxskew) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 89a62e11..4c9a1bdb 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -17,10 +17,7 @@ from .. 
import get_ocrd_tool from .ocrolib import lineest -from .common import ( - pil2array, array2pil, - check_line, -) +from .common import array2pil, check_line, determine_zoom, pil2array #sys.path.append(os.path.dirname(os.path.abspath(__file__))) @@ -128,16 +125,9 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2261cf3e..e4681b23 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -30,6 +30,7 @@ pil2array, odd, DSAVE, + determine_zoom, # binarize, check_page, check_region, @@ -129,16 +130,9 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 35f309b6..e13c3d71 100644 --- a/ocrd_cis/ocropy/segment.py +++ 
b/ocrd_cis/ocropy/segment.py @@ -53,6 +53,7 @@ pil2array, array2pil, check_page, check_region, + determine_zoom, hmerge_line_seeds, compute_segmentation, lines2regions @@ -350,16 +351,8 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - if self.parameter['dpi'] > 0: - zoom = 300.0/self.parameter['dpi'] - elif page_image_info.resolution != 1: - dpi = page_image_info.resolution - if page_image_info.resolutionUnit == 'cm': - dpi *= 2.54 - self.logger.info('Page "%s" uses %f DPI', page_id, dpi) - zoom = 300.0/dpi - else: - zoom = 1 + zoom = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 6beec175ed89e321cae93917dbe02bd2809cd83b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:14:31 +0200 Subject: [PATCH 083/194] move: logger init to setup() --- ocrd_cis/ocropy/binarize.py | 6 +++--- ocrd_cis/ocropy/clip.py | 4 +++- ocrd_cis/ocropy/denoise.py | 5 +++-- ocrd_cis/ocropy/deskew.py | 5 +++-- ocrd_cis/ocropy/dewarp.py | 5 +++-- ocrd_cis/ocropy/recognize.py | 5 +++-- ocrd_cis/ocropy/resegment.py | 5 +++-- ocrd_cis/ocropy/segment.py | 6 ++++-- ocrd_cis/ocropy/train.py | 5 +++-- 9 files changed, 28 insertions(+), 18 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index b5e2bc7e..cc34690e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,5 +1,5 @@ from __future__ import absolute_import -import logging +from logging import Logger import os.path import PIL @@ -66,16 +66,16 @@ def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zo return Image.fromarray(th), 0 class OcropyBinarize(Processor): - logger : logging.Logger + logger: Logger 
@property def executable(self): return 'ocrd-cis-ocropy-binarize' def setup(self): + self.logger = getLogger('processor.OcropyBinarize') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) - self.logger = getLogger('processor.OcropyBinarize') method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3b854897..1b7fb28b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from logging import Logger import os.path import numpy as np @@ -31,9 +32,9 @@ array2pil, determine_zoom, pil2array) class OcropyClip(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyClip') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -44,6 +45,7 @@ def executable(self): return 'ocrd-cis-ocropy-clip' def setup(self): + self.logger = getLogger('processor.OcropyClip') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index d8554a3e..34750a53 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path from ocrd_utils import ( @@ -20,9 +20,9 @@ determine_zoom, remove_noise) class OcropyDenoise(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyDenoise') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -33,6 +33,7 @@ def executable(self): return 
'ocrd-cis-ocropy-denoise' def setup(self): + self.logger = getLogger('processor.OcropyDenoise') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 055ab27d..2eb898ca 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path from ocrd_utils import ( @@ -27,9 +27,9 @@ def deskew(pil_image, maxskew=2): return angle class OcropyDeskew(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyDeskew') ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] @@ -40,6 +40,7 @@ def executable(self): return 'ocrd-cis-ocropy-deskew' def setup(self): + self.logger = getLogger('processor.OcropyDeskew') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 4c9a1bdb..cad280c6 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path import numpy as np @@ -64,9 +64,9 @@ def padvert(image, range_): return array2pil(line) class OcropyDewarp(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyDewarp') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -80,6 +80,7 @@ def executable(self): return 'ocrd-cis-ocropy-dewarp' def setup(self): + self.logger = getLogger('processor.OcropyDewarp') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) # defaults from ocrolib.lineest: diff --git 
a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index fdeaed27..8e147fea 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import sys import os.path import numpy as np @@ -78,9 +78,9 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyRecognize') self.ocrd_tool = get_ocrd_tool() self.pad = 16 # ocropus-rpred default self.network = None # set in process @@ -96,6 +96,7 @@ def executable(self): return 'ocrd-cis-ocropy-recognize' def setup(self): + self.logger = getLogger('processor.OcropyRecognize') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index e4681b23..1e920b0f 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import os.path import numpy as np from skimage import draw, segmentation @@ -48,9 +48,9 @@ ) class OcropyResegment(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropyResegment') self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -61,6 +61,7 @@ def executable(self): return 'ocrd-cis-ocropy-resegment' def setup(self): + self.logger = getLogger('processor.OcropyResegment') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index e13c3d71..3b89bda6 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - 
+from logging import Logger import os.path import itertools import numpy as np @@ -245,9 +245,10 @@ def getx(xy): class OcropySegment(Processor): + logger: Logger def __init__(self, *args, **kwargs): - self.logger = getLogger('processor.OcropySegment') + self.ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] kwargs['version'] = self.ocrd_tool['version'] @@ -258,6 +259,7 @@ def executable(self): return 'ocrd-cis-ocropy-segment' def setup(self): + self.logger = getLogger('processor.OcropySegment') assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 25317c4d..61a918c7 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,5 +1,5 @@ from __future__ import absolute_import - +from logging import Logger import sys import os import tempfile @@ -28,9 +28,9 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): + log: Logger def __init__(self, *args, **kwargs): - self.log = getLogger('processor.OcropyTrain') self.oldcwd = os.getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] @@ -45,6 +45,7 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): + self.log = getLogger('processor.OcropyTrain') #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] From 1b2fea3ed5b7c9d1a02f2dcabe0770aa3eb87da6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:16:55 +0200 Subject: [PATCH 084/194] refactor: log -> logger --- ocrd_cis/ocropy/train.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 61a918c7..9278da92 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -28,7 +28,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): - log: 
Logger + logger: Logger def __init__(self, *args, **kwargs): self.oldcwd = os.getcwd() @@ -45,7 +45,7 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): - self.log = getLogger('processor.OcropyTrain') + self.logger = getLogger('processor.OcropyTrain') #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] @@ -54,9 +54,9 @@ def setup(self): except SystemExit: ocropydir = os.path.dirname(os.path.abspath(__file__)) modelpath = os.path.join(ocropydir, 'models', model) - self.log.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) + self.logger.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) if not os.path.isfile(modelpath): - self.log.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", + self.logger.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", model, model) sys.exit(1) outputpath = os.path.join(self.oldcwd, 'output', model) @@ -78,18 +78,18 @@ def process(self): """ filelist = [] filepath = tempfile.mkdtemp(prefix='ocrd-cis-ocropy-train-') - #self.log.info("Using model %s in %s for recognition", model) + #self.logger.info("Using model %s in %s for recognition", model) for (n, input_file) in enumerate(self.input_files): - #self.log.info("INPUT FILE %i / %s", n, input_file) + #self.logger.info("INPUT FILE %i / %s", n, input_file) pcgts = page_from_file(self.workspace.download_file(input_file)) page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) page = pcgts.get_Page() page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - self.log.info("Extracting from page '%s'", page_id) + self.logger.info("Extracting from page '%s'", page_id) for region in page.get_AllRegions(classes=['Text']): textlines = region.get_TextLine() - self.log.info("Extracting %i lines from region '%s'", len(textlines), region.id) + self.logger.info("Extracting %i lines 
from region '%s'", len(textlines), region.id) for line in textlines: if self.parameter['textequiv_level'] == 'line': path = os.path.join(filepath, page_id + region.id + line.id) @@ -110,7 +110,7 @@ def process(self): if imgpath: filelist.append(imgpath) - self.log.info("Training %s from %s on %i file pairs", + self.logger.info("Training %s from %s on %i file pairs", self.outputpath, self.modelpath or 'scratch', len(filelist)) @@ -130,7 +130,7 @@ def extract_segment(self, path, segment, page_image, page_coords): with open(gtpath, "w", encoding='utf-8') as f: f.write(gt) - self.log.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) + self.logger.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) image, coords = self.workspace.image_from_segment(segment, page_image, page_coords) if 'binarized' not in coords['features'].split(','): From fe33494814e845cfd969a5f1a51234ceadb865a3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:32:17 +0200 Subject: [PATCH 085/194] remove: unused imports --- ocrd_cis/ocropy/binarize.py | 19 +++++++----------- ocrd_cis/ocropy/clip.py | 4 ++-- ocrd_cis/ocropy/denoise.py | 4 ++-- ocrd_cis/ocropy/deskew.py | 4 ++-- ocrd_cis/ocropy/dewarp.py | 4 ++-- ocrd_cis/ocropy/recognize.py | 20 +++++++++--------- ocrd_cis/ocropy/resegment.py | 9 +++------ ocrd_cis/ocropy/segment.py | 4 ++-- ocrd_cis/ocropy/train.py | 39 ++++++++++++++++++------------------ 9 files changed, 49 insertions(+), 58 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index cc34690e..5d3fc7c3 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,12 +1,12 @@ from __future__ import absolute_import from logging import Logger -import os.path -import PIL import cv2 import numpy as np from PIL import Image -from os.path import join +from os.path import abspath, dirname, join + +from typing import Tuple #import kraken.binarization @@ -16,18 +16,13 @@ 
assert_file_grp_cardinality, MIMETYPE_PAGE ) -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - OcrdPage, to_xml, AlternativeImageType -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml from ocrd import Processor from . import common -from .common import ( - # binarize, - array2pil, determine_zoom, pil2array, remove_noise) +from .common import array2pil, determine_zoom, pil2array, remove_noise -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) +#sys.path.append(dirname(abspath(__file__))) def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): LOG = getLogger('processor.OcropyBinarize') @@ -149,7 +144,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: return ret - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> tuple[Image.Image, str, str]: + def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: if not page_image.width or not page_image.height: raise ValueError("Skipping page '%s' with zero size", page_id) self.logger.info("About to binarize page '%s'", page_id) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 1b7fb28b..b70d1fb0 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,7 +1,7 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import numpy as np from PIL import Image, ImageStat, ImageOps from shapely.geometry import Polygon @@ -202,7 +202,7 @@ def process(self): input_file.pageId, file_id + '_' + region.id + '_' + line.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 
34750a53..7cf74727 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join from ocrd_utils import ( getLogger, @@ -105,7 +105,7 @@ def process(self): file_id + '_' + region.id + '_' + line.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 2eb898ca..bcd3be01 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join from ocrd_utils import ( getLogger, @@ -105,7 +105,7 @@ def process(self): file_id + '_' + region.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index cad280c6..6c27c5c6 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import numpy as np from ocrd_utils import ( @@ -172,7 +172,7 @@ def process(self): comments=line_xywh['features'] + ',dewarped')) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 8e147fea..f3ecf199 100644 --- a/ocrd_cis/ocropy/recognize.py +++ 
b/ocrd_cis/ocropy/recognize.py @@ -1,7 +1,7 @@ from __future__ import absolute_import from logging import Logger -import sys -import os.path +from sys import exit +from os.path import abspath, dirname, isfile, join import numpy as np from PIL import Image @@ -24,11 +24,9 @@ from ocrd import Processor from .. import get_ocrd_tool +from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange -from .common import ( - pil2array, - check_line -) + def resize_keep_ratio(image, baseheight=48): scale = baseheight / image.height @@ -112,20 +110,20 @@ def get_model(self): be resolved with OcrdResourceManager to a valid readeable file and returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" - canread = lambda p: os.path.isfile(p) and os.access(p, os.R_OK) + canread = lambda p: isfile(p) and os.access(p, os.R_OK) try: model = self.resolve_resource(self.parameter['model']) if canread(model): return model except SystemExit: - ocropydir = os.path.dirname(os.path.abspath(__file__)) - path = os.path.join(ocropydir, 'models', self.parameter['model']) + ocropydir = dirname(abspath(__file__)) + path = join(ocropydir, 'models', self.parameter['model']) self.logger.info("Failed to resolve model with OCR-D/core mechanism, trying %s", path) if canread(path): return path self.logger.error("Could not find model %s. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s", self.parameter['model'], self.parameter['model']) - sys.exit(1) + exit(1) def process(self): """Recognize lines / words / glyphs of the workspace. 
@@ -176,7 +174,7 @@ def process(self): # update METS (add the PAGE file): file_id = make_file_id(input_file, self.output_file_grp) - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 1e920b0f..329694d0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,16 +1,13 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import numpy as np from skimage import draw, segmentation from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from shapely.ops import unary_union from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, PageType, BaselineType -) +from ocrd_models.ocrd_page import BaselineType, PageType, to_xml from ocrd import Processor from ocrd_utils import ( getLogger, @@ -169,7 +166,7 @@ def process(self): self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 3b89bda6..446fc628 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -import os.path +from os.path import join import itertools import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree @@ -505,7 +505,7 @@ def process(self): input_file.pageId, zoom) # update METS (add the PAGE file): - file_path = os.path.join(self.output_file_grp, file_id + '.xml') + file_path = join(self.output_file_grp, 
file_id + '.xml') pcgts.set_pcGtsId(file_id) out = self.workspace.add_file( ID=file_id, diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 9278da92..ff460523 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,7 +1,8 @@ from __future__ import absolute_import from logging import Logger -import sys -import os +from sys import exit +from os import getcwd, makedirs, remove +from os.path import abspath, dirname, exists, join, isfile import tempfile from ocrd_modelfactory import page_from_file @@ -15,10 +16,10 @@ def deletefiles(filelist): for file in filelist: - if os.path.exists(file): - os.remove(file) - if os.path.exists(file[:-3]+'gt.txt'): - os.remove(file[:-3]+'gt.txt') + if exists(file): + remove(file) + if exists(file[:-3]+'gt.txt'): + remove(file[:-3]+'gt.txt') def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) @@ -31,7 +32,7 @@ class OcropyTrain(Processor): logger: Logger def __init__(self, *args, **kwargs): - self.oldcwd = os.getcwd() + self.oldcwd = getcwd() ocrd_tool = get_ocrd_tool() kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] kwargs['version'] = ocrd_tool['version'] @@ -52,22 +53,22 @@ def setup(self): try: modelpath = self.resolve_resource(model) except SystemExit: - ocropydir = os.path.dirname(os.path.abspath(__file__)) - modelpath = os.path.join(ocropydir, 'models', model) + ocropydir = dirname(abspath(__file__)) + modelpath = join(ocropydir, 'models', model) self.logger.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) - if not os.path.isfile(modelpath): + if not isfile(modelpath): self.logger.error("Could not find model '%s'. 
Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", model, model) - sys.exit(1) - outputpath = os.path.join(self.oldcwd, 'output', model) + exit(1) + outputpath = join(self.oldcwd, 'output', model) if 'outputpath' in self.parameter: - outputpath = os.path.join(self.parameter, model) + outputpath = join(self.parameter, model) else: modelpath = None - outputpath = os.path.join(self.oldcwd, 'output', 'lstm') + outputpath = join(self.oldcwd, 'output', 'lstm') if 'outputpath' in self.parameter: - outputpath = os.path.join(self.parameter, 'lstm') - os.makedirs(os.path.dirname(outputpath)) + outputpath = join(self.parameter, 'lstm') + makedirs(dirname(outputpath)) self.modelpath = modelpath self.outputpath = outputpath @@ -92,20 +93,20 @@ def process(self): self.logger.info("Extracting %i lines from region '%s'", len(textlines), region.id) for line in textlines: if self.parameter['textequiv_level'] == 'line': - path = os.path.join(filepath, page_id + region.id + line.id) + path = join(filepath, page_id + region.id + line.id) imgpath = self.extract_segment(path, line, page_image, page_coords) if imgpath: filelist.append(imgpath) continue for word in line.get_Word(): if self.parameter['textequiv_level'] == 'word': - path = os.path.join(filepath, page_id + region.id + line.id + word.id) + path = join(filepath, page_id + region.id + line.id + word.id) imgpath = self.extract_segment(path, word, page_image, page_coords) if imgpath: filelist.append(imgpath) continue for glyph in word.get_Glyph(): - path = os.path.join(filepath, page_id + region.id + line.id + glyph.id) + path = join(filepath, page_id + region.id + line.id + glyph.id) imgpath = self.extract_segment(path, glyph, page_image, page_coords) if imgpath: filelist.append(imgpath) From 3368a53e8341ab265ac5fa115a740cfc02bcc5ef Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:34:21 +0200 Subject: [PATCH 086/194] remove: file grp cardinality checks inside process() --- ocrd_cis/ocropy/clip.py | 2 
-- ocrd_cis/ocropy/denoise.py | 2 -- ocrd_cis/ocropy/deskew.py | 2 -- ocrd_cis/ocropy/dewarp.py | 2 -- ocrd_cis/ocropy/recognize.py | 2 -- ocrd_cis/ocropy/resegment.py | 2 -- ocrd_cis/ocropy/segment.py | 3 --- 7 files changed, 15 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index b70d1fb0..777b3d3d 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -85,8 +85,6 @@ def process(self): # deskewing, because that would make segments incomensurable with their # neighbours. level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 7cf74727..5d3b9d44 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -59,8 +59,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index bcd3be01..16b4bc81 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -63,8 +63,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 6c27c5c6..dbe512f2 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,8 +112,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index f3ecf199..4b5da4b1 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -150,8 +150,6 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) maxlevel = self.parameter['textequiv_level'] # self.logger.info("Using model %s in %s for recognition", model) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 329694d0..378c2fd3 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -114,8 +114,6 @@ def process(self): # accuracy crucially depends on a good estimate of the images' # pixel density (at least if source input is not 300 DPI). 
level = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) for n, input_file in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 446fc628..6feb6e29 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -338,9 +338,6 @@ def process(self): overwrite_order = self.parameter['overwrite_order'] oplevel = self.parameter['level-of-operation'] - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) - for (n, input_file) in enumerate(self.input_files): self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) file_id = make_file_id(input_file, self.output_file_grp) From ae97768ea73a900092f656c6ad42a64670525a11 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 13 Aug 2024 16:41:13 +0200 Subject: [PATCH 087/194] remove: constructors, adapt setup() --- ocrd_cis/ocropy/clip.py | 7 ------- ocrd_cis/ocropy/denoise.py | 7 ------- ocrd_cis/ocropy/deskew.py | 7 ------- ocrd_cis/ocropy/dewarp.py | 10 ---------- ocrd_cis/ocropy/recognize.py | 19 ++++++------------- ocrd_cis/ocropy/resegment.py | 7 ------- ocrd_cis/ocropy/segment.py | 8 -------- ocrd_cis/ocropy/train.py | 17 ++++------------- 8 files changed, 10 insertions(+), 72 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 777b3d3d..62f68fcf 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -25,7 +25,6 @@ MIMETYPE_PAGE ) -from .. 
import get_ocrd_tool from .ocrolib import midrange, morph from .common import ( # binarize, @@ -34,12 +33,6 @@ class OcropyClip(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyClip, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-clip' diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 5d3b9d44..a68e2e3c 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -14,7 +14,6 @@ ) from ocrd import Processor -from .. import get_ocrd_tool from .common import ( # binarize, determine_zoom, remove_noise) @@ -22,12 +21,6 @@ class OcropyDenoise(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyDenoise, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-denoise' diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 16b4bc81..e41a557d 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -15,7 +15,6 @@ ) from ocrd import Processor -from .. import get_ocrd_tool from . 
import common from .common import pil2array @@ -29,12 +28,6 @@ def deskew(pil_image, maxskew=2): class OcropyDeskew(Processor): logger: Logger - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] - kwargs['version'] = ocrd_tool['version'] - super(OcropyDeskew, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-deskew' diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index dbe512f2..bb9e4098 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -15,7 +15,6 @@ from ocrd import Processor from ocrd_utils import MIMETYPE_PAGE -from .. import get_ocrd_tool from .ocrolib import lineest from .common import array2pil, check_line, determine_zoom, pil2array @@ -66,15 +65,6 @@ def padvert(image, range_): class OcropyDewarp(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyDewarp, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() - @property def executable(self): return 'ocrd-cis-ocropy-dewarp' diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 4b5da4b1..5880675c 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,6 +1,8 @@ from __future__ import absolute_import from logging import Logger from sys import exit +from typing import Any +from os import access, R_OK from os.path import abspath, dirname, isfile, join import numpy as np from PIL import Image @@ -23,7 +25,6 @@ ) from ocrd import Processor -from .. 
import get_ocrd_tool from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange @@ -77,17 +78,8 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): logger: Logger - - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - self.pad = 16 # ocropus-rpred default - self.network = None # set in process - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropyRecognize, self).__init__(*args, **kwargs) - if hasattr(self, 'output_file_grp'): - # processing context - self.setup() + network: Any + pad: int @property def executable(self): @@ -95,6 +87,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyRecognize') + self.pad = 16 assert_file_grp_cardinality(self.input_file_grp, 1) assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: @@ -110,7 +103,7 @@ def get_model(self): be resolved with OcrdResourceManager to a valid readeable file and returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" - canread = lambda p: isfile(p) and os.access(p, os.R_OK) + canread = lambda p: isfile(p) and access(p, R_OK) try: model = self.resolve_resource(self.parameter['model']) if canread(model): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 378c2fd3..17b90f65 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -21,7 +21,6 @@ MIMETYPE_PAGE ) -from .. 
import get_ocrd_tool from .ocrolib import midrange, morph from .common import ( pil2array, @@ -47,12 +46,6 @@ class OcropyResegment(Processor): logger: Logger - def __init__(self, *args, **kwargs): - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super().__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-resegment' diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 6feb6e29..f886e1d1 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -46,7 +46,6 @@ MIMETYPE_PAGE ) -from .. import get_ocrd_tool from .ocrolib import midrange from .ocrolib import morph from .common import ( @@ -247,13 +246,6 @@ def getx(xy): class OcropySegment(Processor): logger: Logger - def __init__(self, *args, **kwargs): - - self.ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = self.ocrd_tool['tools'][self.executable] - kwargs['version'] = self.ocrd_tool['version'] - super(OcropySegment, self).__init__(*args, **kwargs) - @property def executable(self): return 'ocrd-cis-ocropy-segment' diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index ff460523..08b68693 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -8,7 +8,6 @@ from ocrd_modelfactory import page_from_file from ocrd import Processor from ocrd_utils import getLogger -from ocrd_cis import get_ocrd_tool from .ocropus_rtrain import * from .binarize import binarize @@ -30,16 +29,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): logger: Logger - - def __init__(self, *args, **kwargs): - self.oldcwd = getcwd() - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools'][self.executable] - kwargs['version'] = ocrd_tool['version'] - super(OcropyTrain, self).__init__(*args, **kwargs) - if hasattr(self, 'input_file_grp'): - # processing context - self.setup() + old_cwd: str @property def 
executable(self): @@ -47,6 +37,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyTrain') + self.old_cwd = getcwd() #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] @@ -60,12 +51,12 @@ def setup(self): self.logger.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", model, model) exit(1) - outputpath = join(self.oldcwd, 'output', model) + outputpath = join(self.old_cwd, 'output', model) if 'outputpath' in self.parameter: outputpath = join(self.parameter, model) else: modelpath = None - outputpath = join(self.oldcwd, 'output', 'lstm') + outputpath = join(self.old_cwd, 'output', 'lstm') if 'outputpath' in self.parameter: outputpath = join(self.parameter, 'lstm') makedirs(dirname(outputpath)) From 60d02d28040f5b1bc2b4f5497f5353d4f53d5c45 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 00:39:18 +0200 Subject: [PATCH 088/194] completed: OcropyBinarize --- ocrd_cis/ocropy/binarize.py | 138 +++++++++++++++++------------------- 1 file changed, 65 insertions(+), 73 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 5d3fc7c3..0728f852 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -116,38 +116,36 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: except ValueError as e: self.logger.exception(e) else: - # TODO - raise NotImplementedError if level == 'table': regions = page.get_TableRegion() else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f"Page '{page_id}' contains no text regions") for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') if level == 'region': - self.process_region(region, region_image, region_xywh, 
zoom, - input_file.pageId, file_id + '_' + region.id) - continue + try: + ret.append(self.process_region(region, region_image, region_xywh, zoom, page_id, file_id)) + except ValueError as e: + self.logger.exception(e) lines = region.get_TextLine() if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', - page_id, region.id) + self.logger.warning(f"Page '{page_id}' region '{region.id}' contains no text lines") for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') - self.process_line(line, line_image, line_xywh, zoom, - input_file.pageId, region.id, - file_id + '_' + region.id + '_' + line.id) - + try: + ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, file_id)) + except ValueError as e: + self.logger.exception(e) return ret def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: if not page_image.width or not page_image.height: - raise ValueError("Skipping page '%s' with zero size", page_id) - self.logger.info("About to binarize page '%s'", page_id) + raise ValueError(f"Skipping page '{page_id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}'") assert self.output_file_grp features = page_xywh['features'] @@ -157,18 +155,18 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T maxskew = 0 else: maxskew = self.parameter['maxskew'] - bin_image, angle = binarize(page_image, - method=self.parameter['method'], - maxskew=maxskew, - threshold=self.parameter['threshold'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + page_image, + method=self.parameter['method'], + maxskew=maxskew, + threshold=self.parameter['threshold'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' page_xywh['angle'] = angle if self.parameter['noise_maxsize']: - bin_image = 
remove_noise( - bin_image, maxsize=self.parameter['noise_maxsize']) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) features += ',despeckled' # annotate angle in PAGE (to allow consumers of the AlternativeImage # to do consistent coordinate transforms, and non-consumers @@ -176,43 +174,43 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T orientation = -page_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) - # update METS (add the image file): if self.parameter['grayscale']: file_id += '.IMG-NRM' features += ',grayscale_normalized' else: file_id += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{file_id}.png') + bin_image_id = f'{file_id}.IMG-BIN' + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return (bin_image, file_id, bin_image_path) + return bin_image, bin_image_id, bin_image_path - def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id): + def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: if not region_image.width or not region_image.height: - self.logger.warning("Skipping region '%s' with zero size", region.id) - return - self.logger.info("About to binarize page '%s' region '%s'", page_id, region.id) + raise ValueError(f"Skipping region '{region.id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") features = region_xywh['features'] if 'angle' in region_xywh and region_xywh['angle']: # orientation has already been annotated (by previous deskewing), # so skip deskewing here: - bin_image, _ = binarize(region_image, - method=self.parameter['method'], - maxskew=0, - nrm=self.parameter['grayscale'], - 
zoom=zoom) + bin_image, _ = binarize( + region_image, + method=self.parameter['method'], + maxskew=0, + nrm=self.parameter['grayscale'], + zoom=zoom) else: - bin_image, angle = binarize(region_image, - method=self.parameter['method'], - maxskew=self.parameter['maxskew'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + region_image, + method=self.parameter['method'], + maxskew=self.parameter['maxskew'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' region_xywh['angle'] = angle - bin_image = remove_noise(bin_image, - maxsize=self.parameter['noise_maxsize']) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' # annotate angle in PAGE (to allow consumers of the AlternativeImage @@ -221,33 +219,31 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] region.set_orientation(orientation) - # update METS (add the image file): + bin_image_id = f'{file_id}_{region.id}' if self.parameter['grayscale']: - file_id += '.IMG-NRM' + bin_image_id += '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + bin_image_id += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - region.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) + return bin_image, bin_image_id, bin_image_path - def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, file_id): + def process_line( + self, line, line_image, line_xywh, zoom, page_id, 
region_id, file_id + ) -> Tuple[Image.Image, str, str]: if not line_image.width or not line_image.height: - self.logger.warning("Skipping line '%s' with zero size", line.id) - return - self.logger.info("About to binarize page '%s' region '%s' line '%s'", - page_id, region_id, line.id) + raise ValueError(f"Skipping line '{line.id}' with zero size") + self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") features = line_xywh['features'] - bin_image, angle = binarize(line_image, - method=self.parameter['method'], - maxskew=self.parameter['maxskew'], - nrm=self.parameter['grayscale'], - zoom=zoom) + bin_image, angle = binarize( + line_image, + method=self.parameter['method'], + maxskew=self.parameter['maxskew'], + nrm=self.parameter['grayscale'], + zoom=zoom) if angle: features += ',deskewed' # annotate angle in PAGE (to allow consumers of the AlternativeImage @@ -256,23 +252,19 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id, fi #orientation = -angle #orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] #line.set_orientation(orientation) # does not exist on line level! 
- self.logger.warning("cannot add orientation %.2f to page '%s' region '%s' line '%s'", - -angle, page_id, region_id, line.id) - bin_image = remove_noise(bin_image, - maxsize=self.parameter['noise_maxsize']) + self.logger.warning(f"cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", + -angle) + bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - # update METS (add the image file): + bin_image_id = f'{file_id}_{region_id}_{line.id}' if self.parameter['grayscale']: - file_id += '.IMG-NRM' + bin_image_id += '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + bin_image_id += '.IMG-BIN' features += ',binarized' - file_path = self.workspace.save_image_file( - bin_image, file_id, self.output_file_grp, - page_id=page_id) + bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) + return bin_image, bin_image_id, bin_image_path From dcaccd4b5bb357c4f73356aaed04fd8a4483caa8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 00:46:34 +0200 Subject: [PATCH 089/194] remove file grp cardinality asserts --- ocrd_cis/ocropy/binarize.py | 3 --- ocrd_cis/ocropy/clip.py | 3 --- ocrd_cis/ocropy/denoise.py | 3 --- ocrd_cis/ocropy/deskew.py | 3 --- ocrd_cis/ocropy/dewarp.py | 3 --- ocrd_cis/ocropy/recognize.py | 3 --- ocrd_cis/ocropy/resegment.py | 3 --- ocrd_cis/ocropy/segment.py | 3 --- 8 files changed, 24 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 0728f852..746aba5e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -13,7 +13,6 @@ from ocrd_utils import ( getLogger, make_file_id, - 
assert_file_grp_cardinality, MIMETYPE_PAGE ) from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml @@ -69,8 +68,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyBinarize') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 62f68fcf..3e76157b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -15,7 +15,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, polygon_from_points, bbox_from_polygon, @@ -39,8 +38,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index a68e2e3c..24852f24 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -5,7 +5,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, MIMETYPE_PAGE ) from ocrd_modelfactory import page_from_file @@ -27,8 +26,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDenoise') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Despeckle the pages / regions / lines of the workspace. 
diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index e41a557d..616864e1 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -5,7 +5,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, MIMETYPE_PAGE ) from ocrd_modelfactory import page_from_file @@ -34,8 +33,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDeskew') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Deskew the pages or regions of the workspace. diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index bb9e4098..17b69bc5 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -6,7 +6,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( @@ -71,8 +70,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDewarp') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( params=(self.parameter['range'], diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 5880675c..40de2817 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -12,7 +12,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_for_segment, polygon_from_bbox, points_from_polygon, @@ -88,8 +87,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyRecognize') self.pad = 16 - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) for x in self.network.walk(): diff --git a/ocrd_cis/ocropy/resegment.py 
b/ocrd_cis/ocropy/resegment.py index 17b90f65..2483411d 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -12,7 +12,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -52,8 +51,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyResegment') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Resegment lines of the workspace. diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index f886e1d1..9a1b8e11 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -38,7 +38,6 @@ from ocrd_utils import ( getLogger, make_file_id, - assert_file_grp_cardinality, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -252,8 +251,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') - assert_file_grp_cardinality(self.input_file_grp, 1) - assert_file_grp_cardinality(self.output_file_grp, 1) def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. 
From b178227763b834802b1e775623402b7bb5cdf84c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:51:52 +0200 Subject: [PATCH 090/194] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 746aba5e..27a3667c 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -118,7 +118,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning(f"Page '{page_id}' contains no text regions") + self.logger.warning(f"Page '{page_id}' contains no regions") for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, feature_filter='binarized') From 67b6107e19c604063e9dae37473fcc48e04b4558 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:52:25 +0200 Subject: [PATCH 091/194] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 27a3667c..fea064af 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -76,7 +76,7 @@ def setup(self): def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. - THEN Iterate over the PAGE-XML element hierarchy down to the requested + Iterate over the PAGE-XML element hierarchy down to the requested ``level-of-operation``. 
Next, for each file, crop each segment image according to the layout From 06a98b1f601d80511e73b0c366a60f574e2a8e27 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:55:29 +0200 Subject: [PATCH 092/194] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index fea064af..7e355d73 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -71,7 +71,7 @@ def setup(self): method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') - raise Exception('only method=ocropy allows grayscale=true') + raise ValueError('only method=ocropy allows grayscale=true') def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. From 1e6cd7bd53547de5c41f2100cdad8adc1a2091ca Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 10:55:45 +0200 Subject: [PATCH 093/194] Update ocrd_cis/ocropy/binarize.py Co-authored-by: Konstantin Baierer --- ocrd_cis/ocropy/binarize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7e355d73..af60e613 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -21,7 +21,6 @@ from . 
import common from .common import array2pil, determine_zoom, pil2array, remove_noise -#sys.path.append(dirname(abspath(__file__))) def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): LOG = getLogger('processor.OcropyBinarize') From 71bb26d9c4f0b45498625b90c9e4cd136d8e667e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 11:04:12 +0200 Subject: [PATCH 094/194] fix: potentially wrong dpi in logs --- ocrd_cis/ocropy/binarize.py | 4 ++-- ocrd_cis/ocropy/clip.py | 4 ++-- ocrd_cis/ocropy/common.py | 4 ++-- ocrd_cis/ocropy/denoise.py | 4 ++-- ocrd_cis/ocropy/dewarp.py | 4 ++-- ocrd_cis/ocropy/resegment.py | 4 ++-- ocrd_cis/ocropy/segment.py | 4 ++-- 7 files changed, 14 insertions(+), 14 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index af60e613..61e959ca 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -102,8 +102,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") ret = [pcgts] if level == 'page': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3e76157b..3607399b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -87,8 +87,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") # FIXME: what about text regions inside table regions? regions = list(page.get_TextRegion()) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 1804c29d..49e8f248 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -2103,7 +2103,7 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: +def determine_zoom(dpi: float, page_image_info: OcrdExif) -> (float, float): if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: @@ -2113,4 +2113,4 @@ def determine_zoom(dpi: float, page_image_info: OcrdExif) -> float: zoom = 300.0/dpi else: zoom = 1 - return zoom + return zoom, dpi diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 24852f24..713af889 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,8 +63,8 @@ def process(self): page, page_id, feature_selector='binarized' if level == 'page' else '') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 17b69bc5..412724db 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,8 +112,8 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 2483411d..5bc9d008 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -117,8 +117,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 9a1b8e11..d171b6ed 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -339,8 +339,8 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) 
page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {self.parameter['dpi']} DPI") + zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) + self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 64f02a32f938a00e01d6d390993246a617cbab5e Mon Sep 17 00:00:00 2001 From: kba Date: Wed, 14 Aug 2024 11:14:31 +0200 Subject: [PATCH 095/194] binarize: don't conflate region/lines seg, pass output_file_id --- ocrd_cis/ocropy/binarize.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 61e959ca..817d4a8a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -123,7 +123,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region, page_image, page_xywh, feature_filter='binarized') if level == 'region': try: - ret.append(self.process_region(region, region_image, region_xywh, zoom, page_id, file_id)) + ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) + continue except ValueError as e: self.logger.exception(e) lines = region.get_TextLine() @@ -133,8 +134,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') try: - ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, file_id)) - except ValueError as e: + ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) + except alueError as e: self.logger.exception(e) return ret From 
d7c15c7738cdad474eb1999718c41371192e0e14 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 11:29:21 +0200 Subject: [PATCH 096/194] Update binarize.py --- ocrd_cis/ocropy/binarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 817d4a8a..064a733e 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -135,7 +135,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line, region_image, region_xywh, feature_filter='binarized') try: ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) - except alueError as e: + except ValueError as e: self.logger.exception(e) return ret From 19566c0567b5b23bdc4596384d3867601045ca57 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 13:53:35 +0200 Subject: [PATCH 097/194] try to migrate recognize --- ocrd_cis/ocropy/recognize.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 40de2817..140a3c83 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -115,6 +115,30 @@ def get_model(self): self.parameter['model'], self.parameter['model']) exit(1) + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + maxlevel = self.parameter['textequiv_level'] + assert self.workspace + self.logger.debug(f'Max level: "{maxlevel}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + ret = [pcgts] + + self.logger.info(f"Recognizing text in page '{page_id}'") + # region, line, word, or glyph level: + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning(f"Page '{page_id}' contains no text regions") + self.process_regions(regions, maxlevel, 
page_image, page_coords) + + file_path = join(self.output_file_grp, output_file_id + '.xml') + ret.append((output_file_id, file_path)) + return ret + + # TODO: remove when `process_page_pcgts` is validated to be correct def process(self): """Recognize lines / words / glyphs of the workspace. From 5f60976452011656fd05c1375055dd5ebd5f89d9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 13:59:33 +0200 Subject: [PATCH 098/194] fix: migrate recognize --- ocrd_cis/ocropy/recognize.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 140a3c83..9729b480 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -125,18 +125,13 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - ret = [pcgts] - self.logger.info(f"Recognizing text in page '{page_id}'") # region, line, word, or glyph level: regions = page.get_AllRegions(classes=['Text']) if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") self.process_regions(regions, maxlevel, page_image, page_coords) - - file_path = join(self.output_file_grp, output_file_id + '.xml') - ret.append((output_file_id, file_path)) - return ret + return [pcgts] # TODO: remove when `process_page_pcgts` is validated to be correct def process(self): From e8b26035f0d4bd84e689ce92f8da805cb0adaf13 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 14:35:53 +0200 Subject: [PATCH 099/194] fix: detect_zoom logging --- ocrd_cis/ocropy/binarize.py | 5 ++--- ocrd_cis/ocropy/clip.py | 4 ++-- ocrd_cis/ocropy/common.py | 5 +++-- ocrd_cis/ocropy/denoise.py | 3 +-- ocrd_cis/ocropy/dewarp.py | 3 +-- ocrd_cis/ocropy/resegment.py | 3 +-- ocrd_cis/ocropy/segment.py | 3 +-- 7 files changed, 11 insertions(+), 15 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py 
b/ocrd_cis/ocropy/binarize.py index 064a733e..387c51dc 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -102,9 +102,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") - + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + ret = [pcgts] if level == 'page': try: diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3607399b..dd0de012 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -87,8 +87,8 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + # TODO: zoom is not used anywhere, is it still useful to have this call here? + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) # FIXME: what about text regions inside table regions? 
regions = list(page.get_TextRegion()) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 49e8f248..095de5eb 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -2103,14 +2103,15 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(dpi: float, page_image_info: OcrdExif) -> (float, float): +def determine_zoom(logger: logging.Logger, dpi: float, page_image_info: OcrdExif) -> float: if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: dpi = page_image_info.resolution if page_image_info.resolutionUnit == 'cm': dpi *= 2.54 + logger.info(f"Page '{page_id}' uses {dpi} DPI.") zoom = 300.0/dpi else: zoom = 1 - return zoom, dpi + return zoom diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 713af889..78d11c28 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,8 +63,7 @@ def process(self): page, page_id, feature_selector='binarized' if level == 'page' else '') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 412724db..9dddae44 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,8 +112,7 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 5bc9d008..e8c52a69 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -117,8 +117,7 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index d171b6ed..c092718f 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -339,8 +339,7 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom, dpi = determine_zoom(self.parameter['dpi'], page_image_info) - self.logger.info(f"Page '{page_id}' uses {dpi} DPI. 
Determined zoom={zoom}") + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 7dfd4964be3f4e4db9bfe6ff548eda477ed36ae6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 14:38:05 +0200 Subject: [PATCH 100/194] update: test_lib base url --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index f28acb1e..c018d253 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.2.4/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/tag/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From 033c38ac3e3a6fdd9e74ab502d792878aad77439 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 15:07:22 +0200 Subject: [PATCH 101/194] logging exception -> error --- ocrd_cis/ocropy/binarize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 387c51dc..0ea170e4 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -109,7 +109,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: try: ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) except ValueError as e: - self.logger.exception(e) + self.logger.error(e) else: if level == 'table': regions = page.get_TableRegion() @@ -125,7 +125,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) continue except ValueError as e: - 
self.logger.exception(e) + self.logger.error(e) lines = region.get_TextLine() if not lines: self.logger.warning(f"Page '{page_id}' region '{region.id}' contains no text lines") @@ -135,7 +135,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: try: ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) except ValueError as e: - self.logger.exception(e) + self.logger.error(e) return ret def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: From 46d84d58b7474adc3cb9f9b756b215efebd495e3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 15:50:10 +0200 Subject: [PATCH 102/194] refactor: logger as a first positional argument --- ocrd_cis/ocropy/binarize.py | 9 +++++--- ocrd_cis/ocropy/resegment.py | 18 +++++++-------- ocrd_cis/ocropy/segment.py | 43 +++++++++++++++--------------------- 3 files changed, 32 insertions(+), 38 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 0ea170e4..8f7d8d3a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -22,9 +22,8 @@ from .common import array2pil, determine_zoom, pil2array, remove_noise -def binarize(pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): - LOG = getLogger('processor.OcropyBinarize') - LOG.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) +def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): + logger.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) if method == 'none': # useful if the images are already binary, # but lack image attribute `binarized` @@ -152,6 +151,7 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T else: maxskew = self.parameter['maxskew'] bin_image, angle = binarize( + self.logger, page_image, 
method=self.parameter['method'], maxskew=maxskew, @@ -191,6 +191,7 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ # orientation has already been annotated (by previous deskewing), # so skip deskewing here: bin_image, _ = binarize( + self.logger, region_image, method=self.parameter['method'], maxskew=0, @@ -198,6 +199,7 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ zoom=zoom) else: bin_image, angle = binarize( + self.logger, region_image, method=self.parameter['method'], maxskew=self.parameter['maxskew'], @@ -235,6 +237,7 @@ def process_line( self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") features = line_xywh['features'] bin_image, angle = binarize( + self.logger, line_image, method=self.parameter['method'], maxskew=self.parameter['maxskew'], diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index e8c52a69..b18c0b5e 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -265,8 +265,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_polygon[:, 0], parent_bin.shape) new_labels[line_y, line_x] = i + 1 - spread_dist(lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold, logger=self.logger) + spread_dist(self.logger, lines, line_labels, new_labels, parent_bin, components, parent_coords, + maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) return try: # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) @@ -280,9 +280,9 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l self.logger.info("Found %d new line labels for %d existing lines on %s '%s'", new_line_labels.max(), len(lines), tag, parent.id) # polygonalize and prepare comparison - new_line_polygons, new_line_labels = masks2polygons( + new_line_polygons, 
new_line_labels = masks2polygons(self.logger, new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), - min_area=640/zoom/zoom, logger=self.logger) + min_area=640/zoom/zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) @@ -392,8 +392,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l self.logger.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) - new_baseline = join_baselines([new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id, logger=self.logger) + new_baseline = join_baselines(self.logger, [new_polygon.intersection(new_baselines[i]) + for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) @@ -427,11 +427,9 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) -def spread_dist(lines, old_labels, new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9, logger = None): +def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, components, coords, + maxdist=43, loc='', threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" - if not logger: - raise ValueError(f"Logger has not been passed by the caller") DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently # (ignoring smallest components like punctuation) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 
c092718f..782425cc 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -57,7 +57,7 @@ lines2regions ) -def masks2polygons(bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True, logger=None): +def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -230,9 +230,9 @@ def getx(xy): # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: - base = join_baselines([baseline.intersection(polygon) + base = join_baselines(logger, [baseline.intersection(polygon) for baseline in baselines - if baseline.intersects(polygon)], name, logger) + if baseline.intersects(polygon)], name) if base is not None: base = base.coords else: @@ -416,7 +416,7 @@ def process(self): roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(roelem, self.logger) + reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: self.logger.warning('skipping table "%s" with existing TextRegions', region.id) continue @@ -434,7 +434,7 @@ def process(self): elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(roelem, self.logger) + roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", @@ -446,7 +446,7 @@ def process(self): else: # replace regionRef(Indexed) by group with same index and ref # (which can then take the cells as 
subregions) - roelem = page_subgroup_in_reading_order(roelem, self.logger) + roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) self._process_element(region, subignore, region_image, region_coords, @@ -661,16 +661,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(region_mask * region_label, None, element_bin, + regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, '%s "%s"' % (element_name, element_id), min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin), - logger=self.logger) + simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(region_line_labels, baselines, element_bin, + lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom, - logger=self.logger) + min_area=640/zoom/zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -722,8 +720,8 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (e.g. drop-capitals or images) ... 
self.logger.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(images, None, element_bin, - '%s "%s"' % (element_name, element_id), self.logger) + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, + '%s "%s"' % (element_name, element_id)) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -740,9 +738,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # split detected separator labels into separator regions: self.logger.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(seplines, None, element_bin, + sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False, logger=self.logger) + open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -774,9 +772,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(line_labels, baselines, element_bin, + line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, 'region "%s"' % element_id, - min_area=640/zoom/zoom, logger=self.logger) + min_area=640/zoom/zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: @@ -918,9 +916,7 @@ def join_polygons(polygons, loc='', scale=20): jointp = 
make_valid(jointp) return jointp -def join_baselines(baselines, loc='', logger = None): - if not logger: - raise ValueError(f"Logger has not been passed by the caller") +def join_baselines(logger: Logger, baselines, loc=''): lines = [] for baseline in baselines: if (baseline.is_empty or @@ -1062,7 +1058,7 @@ def page_add_to_reading_order(rogroup, region_id, index=None): index += 1 return index -def page_subgroup_in_reading_order(roelem, logger = None): +def page_subgroup_in_reading_order(logger: Logger, roelem): """Replace given RO element by an equivalent OrderedGroup. Given a ReadingOrder element ``roelem`` (of any type), @@ -1076,9 +1072,6 @@ def page_subgroup_in_reading_order(roelem, logger = None): Return the new group object. """ - if not logger: - raise ValueError(f"Logger has not been passed by the caller") - if not roelem: logger.error('Cannot subgroup from empty ReadingOrder element') return roelem From f6fe4cf4caaf056ded182b498b44a610349627fc Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 15:54:25 +0200 Subject: [PATCH 103/194] fix: test_lib.bash data url --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index c018d253..801be01a 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/tag/v1.5.0/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From aed0f95ccdc0dfe4cc26982258ef1c8acd613e1e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 16:33:31 +0200 Subject: [PATCH 104/194] fix: recognize OcrdPage import --- ocrd_cis/ocropy/recognize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 9729b480..ccb019eb 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -19,7 +19,7 @@ ) from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - to_xml, TextEquivType, + to_xml, TextEquivType, OcrdPage, CoordsType, GlyphType, WordType ) from ocrd import Processor From 804f031221eb4e64649e167c2f554d26555d5637 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Wed, 14 Aug 2024 18:10:00 +0200 Subject: [PATCH 105/194] try to migrate clip --- ocrd_cis/ocropy/clip.py | 178 +++++++++++++++++++++++++++++++--------- 1 file changed, 138 insertions(+), 40 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index dd0de012..0675257b 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -8,9 +8,7 @@ from shapely.prepared import prep from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml from ocrd import Processor from ocrd_utils import ( getLogger, @@ -39,6 +37,113 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') + def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + level = self.parameter['level-of-operation'] + assert self.workspace + self.logger.debug(f'Level of operation: "{level}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page + + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + # TODO: zoom is not used anywhere, is it still useful to have this call here? + zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + ret = [pcgts] + + # FIXME: what about text regions inside table regions? 
+ regions = list(page.get_TextRegion()) + num_texts = len(regions) + regions += ( + page.get_AdvertRegion() + + page.get_ChartRegion() + + page.get_ChemRegion() + + page.get_GraphicRegion() + + page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_MathsRegion() + + page.get_MusicRegion() + + page.get_NoiseRegion() + + page.get_SeparatorRegion() + + page.get_TableRegion() + + page.get_UnknownRegion()) + if not num_texts: + self.logger.warning('Page "%s" contains no text regions', page_id) + background = ImageStat.Stat(page_image) + # workaround for Pillow#4925 + if len(background.bands) > 1: + background = tuple(background.median) + else: + background = background.median[0] + if level == 'region': + background_image = Image.new(page_image.mode, page_image.size, background) + page_array = pil2array(page_image) + page_bin = np.array(page_array <= midrange(page_array), np.uint8) + # in absolute coordinates merely for comparison/intersection + shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] + # in relative coordinates for mask/cropping + polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] + for i, polygon in enumerate(polygons[num_texts:], num_texts): + # for non-text regions, extend mask by 3 pixels in each direction + # to ensure they do not leak components accidentally + # (accounts for bad cropping of such regions in GT): + polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open + polygons[i] = polygon + masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] + for i, region in enumerate(regions): + if i >= num_texts: + break # keep non-text regions unchanged + if level == 'region': + if region.get_AlternativeImage(): + # FIXME: This should probably be an exception (bad workflow configuration). 
+ self.logger.warning( + f'Page "{page_id}" region "{region.id}" already contains image data: skipping') + continue + shape = prep(shapes[i]) + neighbours = [(regionj, maskj) for shapej, regionj, maskj + in zip(shapes[:i] + shapes[i+1:], regions[:i] + regions[i+1:], masks[:i] + masks[i+1:]) + if shape.intersects(shapej)] + if neighbours: + segment_region_file_id = f"{output_file_id}_{region.id}" + ret.append(self.process_segment( + region, masks[i], polygons[i], neighbours, background_image, + page_image, page_coords, page_bin, page_id, segment_region_file_id)) + continue + # level == 'line': + lines = region.get_TextLine() + if not lines: + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') + continue + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + background_image = Image.new(region_image.mode, region_image.size, background) + region_array = pil2array(region_image) + region_bin = np.array(region_array <= midrange(region_array), np.uint8) + # in absolute coordinates merely for comparison/intersection + shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] + # in relative coordinates for mask/cropping + polygons = [coordinates_of_segment(line, region_image, region_coords) for line in lines] + masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] + for j, line in enumerate(lines): + if line.get_AlternativeImage(): + # FIXME: This should probably be an exception (bad workflow configuration). 
+ self.logger.warning( + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' + f'data: skipping') + continue + shape = prep(shapes[j]) + neighbours = [(linej, maskj) for shapej, linej, maskj + in zip(shapes[:j] + shapes[j+1:], lines[:j] + lines[j+1:], masks[:j] + masks[j+1:]) + if shape.intersects(shapej)] + if neighbours: + segment_line_file_id = f"{output_file_id}_{region.id}_{line.id}" + ret.append(self.process_segment( + line, masks[j], polygons[j], neighbours, background_image, + region_image, region_coords, region_bin, page_id, segment_line_file_id)) + return ret + + # TODO: remove when `process_page_pcgts` is validated to be correct def process(self): """Clip text regions / lines of the workspace at intersections with neighbours. @@ -119,27 +224,24 @@ def process(self): page_array = pil2array(page_image) page_bin = np.array(page_array <= midrange(page_array), np.uint8) # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(region.get_Coords().points)) - for region in regions] + shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) - for region in regions] + polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] for i, polygon in enumerate(polygons[num_texts:], num_texts): # for non-text regions, extend mask by 3 pixels in each direction # to ensure they do not leak components accidentally # (accounts for bad cropping of such regions in GT): polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open polygons[i] = polygon - masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) - for polygon in polygons] + masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] for i, region in enumerate(regions): if i >= num_texts: break # keep non-text 
regions unchanged if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). - self.logger.warning('Page "%s" region "%s" already contains image data: skipping', - page_id, region.id) + self.logger.warning( + f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue shape = prep(shapes[i]) neighbours = [(regionj, maskj) for shapej, regionj, maskj @@ -148,15 +250,15 @@ def process(self): masks[:i] + masks[i+1:]) if shape.intersects(shapej)] if neighbours: - self.process_segment(region, masks[i], polygons[i], - neighbours, background_image, - page_image, page_coords, page_bin, - input_file.pageId, file_id + '_' + region.id) + segment_region_file_id = f"{file_id}_{region.id}" + self.process_segment( + region, masks[i], polygons[i], neighbours, background_image, + page_image, page_coords, page_bin, input_file.pageId, segment_region_file_id) continue # level == 'line': lines = region.get_TextLine() if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') continue region_image, region_coords = self.workspace.image_from_segment( region, page_image, page_coords, feature_selector='binarized') @@ -164,18 +266,16 @@ def process(self): region_array = pil2array(region_image) region_bin = np.array(region_array <= midrange(region_array), np.uint8) # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(line.get_Coords().points)) - for line in lines] + shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(line, region_image, region_coords) - for line in lines] - masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) - for polygon in polygons] + polygons = 
[coordinates_of_segment(line, region_image, region_coords) for line in lines] + masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] for j, line in enumerate(lines): if line.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). - self.logger.warning('Page "%s" region "%s" line "%s" already contains image data: skipping', - page_id, region.id, line.id) + self.logger.warning( + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' + f'data: skipping') continue shape = prep(shapes[j]) neighbours = [(linej, maskj) for shapej, linej, maskj @@ -184,10 +284,10 @@ def process(self): masks[:j] + masks[j+1:]) if shape.intersects(shapej)] if neighbours: - self.process_segment(line, masks[j], polygons[j], - neighbours, background_image, - region_image, region_coords, region_bin, - input_file.pageId, file_id + '_' + region.id + '_' + line.id) + segment_line_file_id = f"{file_id}_{region.id}_{line.id}" + self.process_segment( + line, masks[j], polygons[j], neighbours, background_image, + region_image, region_coords, region_bin, input_file.pageId, segment_line_file_id) # update METS (add the PAGE file): file_path = join(self.output_file_grp, file_id + '.xml') @@ -204,7 +304,7 @@ def process(self): def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, - page_id, file_id): + page_id, file_id) -> Tuple[Image.Image, str, str]: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( @@ -216,8 +316,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, segment_bbox = bbox_from_polygon(segment_polygon) for neighbour, neighbour_mask in neighbours: if not np.any(segment_mask > neighbour_mask): - self.logger.info('Ignoring enclosing neighbour "%s" of segment "%s" on 
page "%s"', - neighbour.id, segment.id, page_id) + self.logger.info( + f'Ignoring enclosing neighbour "{neighbour.id}" of segment "{segment.id}" on page "{page_id}"') continue # find connected components that (only) belong to the neighbour: intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour @@ -226,8 +326,9 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: continue - self.logger.debug('segment "%s" vs neighbour "%s": suppressing %d of %d pixels on page "%s"', - segment.id, neighbour.id, num_intruders, num_foreground, page_id) + self.logger.debug( + f'segment "{segment.id}" vs neighbour "{neighbour.id}": suppressing {num_intruders} of ' + f'{num_foreground} pixels on page "{page_id}"') # suppress in segment_mask so these intruders can stay in the neighbours # (are not removed from both sides) segment_mask -= intruders @@ -241,11 +342,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): segment_image = crop_image(segment_image,box=segment_bbox) - # update METS (add the image file): - file_path = self.workspace.save_image_file( - segment_image, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + segment_image_id = file_id + '.IMG-CLIP' + segment_image_path = join(self.output_file_grp, f'{segment_image_id}.png') # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=features)) + segment.add_AlternativeImage(AlternativeImageType(filename=segment_image_path, comments=features)) + return segment_image, segment_image_id, segment_image_path From 7bdff31747ad2c9cdb834569b8b1adf8b90303d2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 11:51:42 +0200 Subject: [PATCH 
106/194] remove: process() methods --- ocrd_cis/ocropy/clip.py | 194 +++++++---------------------------- ocrd_cis/ocropy/recognize.py | 65 +++--------- 2 files changed, 50 insertions(+), 209 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 0675257b..9e6d8d19 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -37,7 +37,42 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') + # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + """Clip text regions / lines of the workspace at intersections with neighbours. + + Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the requested + ``level-of-operation``. + + Next, get each segment image according to the layout annotation (by cropping + via coordinates into the higher-level image), as well as all its neighbours', + binarize them (without deskewing), and make a connected component analysis. + (Segments must not already have AlternativeImage annotated, otherwise they + will be skipped.) + + Then, for each section of overlap with a neighbour, re-assign components + which are only contained in the neighbour by clipping them to white (background), + and export the (final) result as image file. + + Add the new image file to the workspace along with the output fileGrp, + and using a file ID with suffix ``.IMG-CLIP`` along with further + identification of the input element. + + Reference each new image in the AlternativeImage of the element. + + Produce a new output file by serialising the resulting hierarchy. + """ + # This makes best sense for overlapping segmentation, like current GT + # or Tesseract layout analysis. Most notably, it can suppress graphics + # and separators within or across a region or line. 
It _should_ ideally + # be run after binarization (on page level for region-level clipping, + # and on the region level for line-level clipping), because the + # connected component analysis after implicit binarization could be + # suboptimal, and the explicit binarization after clipping could be, + # too. However, region-level clipping _must_ be run before region-level + # deskewing, because that would make segments incomensurable with their + # neighbours. level = self.parameter['level-of-operation'] assert self.workspace self.logger.debug(f'Level of operation: "{level}"') @@ -143,165 +178,6 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region_image, region_coords, region_bin, page_id, segment_line_file_id)) return ret - # TODO: remove when `process_page_pcgts` is validated to be correct - def process(self): - """Clip text regions / lines of the workspace at intersections with neighbours. - - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested - ``level-of-operation``. - - Next, get each segment image according to the layout annotation (by cropping - via coordinates into the higher-level image), as well as all its neighbours', - binarize them (without deskewing), and make a connected component analysis. - (Segments must not already have AlternativeImage annotated, otherwise they - will be skipped.) - - Then, for each section of overlap with a neighbour, re-assign components - which are only contained in the neighbour by clipping them to white (background), - and export the (final) result as image file. - - Add the new image file to the workspace along with the output fileGrp, - and using a file ID with suffix ``.IMG-CLIP`` along with further - identification of the input element. - - Reference each new image in the AlternativeImage of the element. - - Produce a new output file by serialising the resulting hierarchy. 
- """ - # This makes best sense for overlapping segmentation, like current GT - # or Tesseract layout analysis. Most notably, it can suppress graphics - # and separators within or across a region or line. It _should_ ideally - # be run after binarization (on page level for region-level clipping, - # and on the region level for line-level clipping), because the - # connected component analysis after implicit binarization could be - # suboptimal, and the explicit binarization after clipping could be, - # too. However, region-level clipping _must_ be run before region-level - # deskewing, because that would make segments incomensurable with their - # neighbours. - level = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - # TODO: zoom is not used anywhere, is it still useful to have this call here? - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) - - # FIXME: what about text regions inside table regions? 
- regions = list(page.get_TextRegion()) - num_texts = len(regions) - regions += ( - page.get_AdvertRegion() + - page.get_ChartRegion() + - page.get_ChemRegion() + - page.get_GraphicRegion() + - page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_MathsRegion() + - page.get_MusicRegion() + - page.get_NoiseRegion() + - page.get_SeparatorRegion() + - page.get_TableRegion() + - page.get_UnknownRegion()) - if not num_texts: - self.logger.warning('Page "%s" contains no text regions', page_id) - background = ImageStat.Stat(page_image) - # workaround for Pillow#4925 - if len(background.bands) > 1: - background = tuple(background.median) - else: - background = background.median[0] - if level == 'region': - background_image = Image.new(page_image.mode, page_image.size, background) - page_array = pil2array(page_image) - page_bin = np.array(page_array <= midrange(page_array), np.uint8) - # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] - # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] - for i, polygon in enumerate(polygons[num_texts:], num_texts): - # for non-text regions, extend mask by 3 pixels in each direction - # to ensure they do not leak components accidentally - # (accounts for bad cropping of such regions in GT): - polygon = Polygon(polygon).buffer(3).exterior.coords[:-1] # keep open - polygons[i] = polygon - masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] - for i, region in enumerate(regions): - if i >= num_texts: - break # keep non-text regions unchanged - if level == 'region': - if region.get_AlternativeImage(): - # FIXME: This should probably be an exception (bad workflow configuration). 
- self.logger.warning( - f'Page "{page_id}" region "{region.id}" already contains image data: skipping') - continue - shape = prep(shapes[i]) - neighbours = [(regionj, maskj) for shapej, regionj, maskj - in zip(shapes[:i] + shapes[i+1:], - regions[:i] + regions[i+1:], - masks[:i] + masks[i+1:]) - if shape.intersects(shapej)] - if neighbours: - segment_region_file_id = f"{file_id}_{region.id}" - self.process_segment( - region, masks[i], polygons[i], neighbours, background_image, - page_image, page_coords, page_bin, input_file.pageId, segment_region_file_id) - continue - # level == 'line': - lines = region.get_TextLine() - if not lines: - self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') - continue - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - background_image = Image.new(region_image.mode, region_image.size, background) - region_array = pil2array(region_image) - region_bin = np.array(region_array <= midrange(region_array), np.uint8) - # in absolute coordinates merely for comparison/intersection - shapes = [Polygon(polygon_from_points(line.get_Coords().points)) for line in lines] - # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(line, region_image, region_coords) for line in lines] - masks = [pil2array(polygon_mask(region_image, polygon)).astype(np.uint8) for polygon in polygons] - for j, line in enumerate(lines): - if line.get_AlternativeImage(): - # FIXME: This should probably be an exception (bad workflow configuration). 
- self.logger.warning( - f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' - f'data: skipping') - continue - shape = prep(shapes[j]) - neighbours = [(linej, maskj) for shapej, linej, maskj - in zip(shapes[:j] + shapes[j+1:], - lines[:j] + lines[j+1:], - masks[:j] + masks[j+1:]) - if shape.intersects(shapej)] - if neighbours: - segment_line_file_id = f"{file_id}_{region.id}_{line.id}" - self.process_segment( - line, masks[j], polygons[j], neighbours, background_image, - region_image, region_coords, region_bin, input_file.pageId, segment_line_file_id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, page_id, file_id) -> Tuple[Image.Image, str, str]: diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index ccb019eb..389cf8db 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -115,26 +115,8 @@ def get_model(self): self.parameter['model'], self.parameter['model']) exit(1) + # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: - maxlevel = self.parameter['textequiv_level'] - assert self.workspace - self.logger.debug(f'Max level: "{maxlevel}"') - - pcgts = input_pcgts[0] - page = pcgts.get_Page() - assert page - - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - self.logger.info(f"Recognizing text in page '{page_id}'") - # region, line, word, 
or glyph level: - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning(f"Page '{page_id}' contains no text regions") - self.process_regions(regions, maxlevel, page_image, page_coords) - return [pcgts] - - # TODO: remove when `process_page_pcgts` is validated to be correct - def process(self): """Recognize lines / words / glyphs of the workspace. Open and deserialise each PAGE input file and its respective image, @@ -160,38 +142,21 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ maxlevel = self.parameter['textequiv_level'] + assert self.workspace + self.logger.debug(f'Max level: "{maxlevel}"') + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + assert page - # self.logger.info("Using model %s in %s for recognition", model) - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id) - - self.logger.info("Recognizing text in page '%s'", page_id) - # region, line, word, or glyph level: - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning("Page '%s' contains no text regions", page_id) - self.process_regions(regions, maxlevel, page_image, page_coords) - - # update METS (add the PAGE file): - file_id = make_file_id(input_file, self.output_file_grp) - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, 
self.output_file_grp, out.local_filename) + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + self.logger.info(f"Recognizing text in page '{page_id}'") + # region, line, word, or glyph level: + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning(f"Page '{page_id}' contains no text regions") + self.process_regions(regions, maxlevel, page_image, page_coords) + return [pcgts] def process_regions(self, regions, maxlevel, page_image, page_coords): edits = 0 From 03c2f158fa02ddeae40baa93cee686be1fd0ca09 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 11:57:36 +0200 Subject: [PATCH 107/194] adapt: docstring of process_page_pcgts --- ocrd_cis/ocropy/clip.py | 8 ++++---- ocrd_cis/ocropy/recognize.py | 17 ++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 9e6d8d19..a5f4f705 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -39,9 +39,9 @@ def setup(self): # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: - """Clip text regions / lines of the workspace at intersections with neighbours. + """Clip text regions / lines of a page at intersections with neighbours. - Open and deserialise PAGE input files and their respective images, + Open and deserialize PAGE input file and its respective image, then iterate over the element hierarchy down to the requested ``level-of-operation``. @@ -61,7 +61,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: Reference each new image in the AlternativeImage of the element. - Produce a new output file by serialising the resulting hierarchy. + Return the resulting OcrdPage. """ # This makes best sense for overlapping segmentation, like current GT # or Tesseract layout analysis. 
Most notably, it can suppress graphics @@ -71,7 +71,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: # connected component analysis after implicit binarization could be # suboptimal, and the explicit binarization after clipping could be, # too. However, region-level clipping _must_ be run before region-level - # deskewing, because that would make segments incomensurable with their + # deskewing, because that would make segments incommensurable with their # neighbours. level = self.parameter['level-of-operation'] assert self.workspace diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 389cf8db..69b374ec 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -115,18 +115,17 @@ def get_model(self): self.parameter['model'], self.parameter['model']) exit(1) - # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: - """Recognize lines / words / glyphs of the workspace. + """Recognize lines / words / glyphs of a page. - Open and deserialise each PAGE input file and its respective image, + Open and deserialize the PAGE input file and its respective image, then iterate over the element hierarchy down to the requested ``textequiv_level``. If any layout annotation below the line level already exists, then remove it (regardless of ``textequiv_level``). - Set up Ocropy to recognise each text line (via coordinates into + Set up Ocropy to recognize each text line (via coordinates into the higher-level image, or from the alternative image; the image - must have been binarised/grayscale-normalised, deskewed and dewarped + must have been binarized/grayscale-normalised, deskewed and dewarped already). Rescale and pad the image, then recognize. Create new elements below the line level, if necessary. 
@@ -139,11 +138,11 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: Levenshtein distance. Aggregate these scores for each file and print the line-wise and the total character error rates (CER). - Produce a new output file by serialising the resulting hierarchy. + Return the resulting OcrdPage. """ - maxlevel = self.parameter['textequiv_level'] + max_level = self.parameter['textequiv_level'] assert self.workspace - self.logger.debug(f'Max level: "{maxlevel}"') + self.logger.debug(f'Max level: "{max_level}"') pcgts = input_pcgts[0] page = pcgts.get_Page() @@ -155,7 +154,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: regions = page.get_AllRegions(classes=['Text']) if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") - self.process_regions(regions, maxlevel, page_image, page_coords) + self.process_regions(regions, max_level, page_image, page_coords) return [pcgts] def process_regions(self, regions, maxlevel, page_image, page_coords): From 90ac28e1f9c9b6c95492aac765aaf5183a045be2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 12:11:30 +0200 Subject: [PATCH 108/194] refactor: other small things --- ocrd_cis/ocropy/clip.py | 16 +++++------ ocrd_cis/ocropy/recognize.py | 52 +++++++++++++++--------------------- 2 files changed, 29 insertions(+), 39 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index a5f4f705..75b4123f 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -37,7 +37,6 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') - # TODO: Adapt the docstring comment to process_page_pcgts def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: """Clip text regions / lines of a page at intersections with neighbours. 
@@ -81,9 +80,9 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page = pcgts.get_Page() assert page - page_image, page_coords, page_image_info = self.workspace.image_from_page( + page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - # TODO: zoom is not used anywhere, is it still useful to have this call here? + # The zoom is not used anywhere zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) ret = [pcgts] @@ -104,7 +103,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page.get_TableRegion() + page.get_UnknownRegion()) if not num_texts: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') background = ImageStat.Stat(page_image) # workaround for Pillow#4925 if len(background.bands) > 1: @@ -118,7 +117,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: # in absolute coordinates merely for comparison/intersection shapes = [Polygon(polygon_from_points(region.get_Coords().points)) for region in regions] # in relative coordinates for mask/cropping - polygons = [coordinates_of_segment(region, page_image, page_coords) for region in regions] + polygons = [coordinates_of_segment(region, page_image, page_xywh) for region in regions] for i, polygon in enumerate(polygons[num_texts:], num_texts): # for non-text regions, extend mask by 3 pixels in each direction # to ensure they do not leak components accidentally @@ -143,7 +142,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: segment_region_file_id = f"{output_file_id}_{region.id}" ret.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, - page_image, page_coords, page_bin, page_id, segment_region_file_id)) + page_image, page_xywh, page_bin, page_id, segment_region_file_id)) continue 
# level == 'line': lines = region.get_TextLine() @@ -151,7 +150,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') continue region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') + region, page_image, page_xywh, feature_selector='binarized') background_image = Image.new(region_image.mode, region_image.size, background) region_array = pil2array(region_image) region_bin = np.array(region_array <= midrange(region_array), np.uint8) @@ -164,8 +163,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if line.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). self.logger.warning( - f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image ' - f'data: skipping') + f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image data: skipping') continue shape = prep(shapes[j]) neighbours = [(linej, maskj) for shapej, linej, maskj diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 69b374ec..b9fc453f 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -101,18 +101,19 @@ def get_model(self): returns it. 
If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" canread = lambda p: isfile(p) and access(p, R_OK) + p_model = self.parameter['model'] try: - model = self.resolve_resource(self.parameter['model']) + model = self.resolve_resource(p_model) if canread(model): return model except SystemExit: ocropydir = dirname(abspath(__file__)) - path = join(ocropydir, 'models', self.parameter['model']) - self.logger.info("Failed to resolve model with OCR-D/core mechanism, trying %s", path) + path = join(ocropydir, 'models', p_model) + self.logger.info(f"Failed to resolve model with OCR-D/core mechanism, trying {path}") if canread(path): return path - self.logger.error("Could not find model %s. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s", - self.parameter['model'], self.parameter['model']) + self.logger.error( + f"Could not find model {p_model}. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {p_model}") exit(1) def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: @@ -148,7 +149,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page = pcgts.get_Page() assert page - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + page_image, page_xywh, _ = self.workspace.image_from_page(page, page_id) self.logger.info(f"Recognizing text in page '{page_id}'") # region, line, word, or glyph level: regions = page.get_AllRegions(classes=['Text']) @@ -157,37 +158,32 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: self.process_regions(regions, max_level, page_image, page_coords) return [pcgts] - def process_regions(self, regions, maxlevel, page_image, page_coords): + def process_regions(self, regions, maxlevel, page_image, page_xywh): edits = 0 lengs = 0 for region in regions: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords) - - 
self.logger.info("Recognizing text in region '%s'", region.id) + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) + self.logger.info(f"Recognizing text in region '{region.id}'") textlines = region.get_TextLine() if not textlines: - self.logger.warning("Region '%s' contains no text lines", region.id) + self.logger.warning(f"Region '{region.id}' contains no text lines") else: - edits_, lengs_ = self.process_lines(textlines, maxlevel, region_image, region_coords) + edits_, lengs_ = self.process_lines(textlines, maxlevel, region_image, region_xywh) edits += edits_ lengs += lengs_ # update region text by concatenation for consistency - region_unicode = u'\n'.join(line.get_TextEquiv()[0].Unicode - if line.get_TextEquiv() - else u'' for line in textlines) + region_unicode = u'\n'.join( + line.get_TextEquiv()[0].Unicode if line.get_TextEquiv() else u'' for line in textlines) region.set_TextEquiv([TextEquivType(Unicode=region_unicode)]) if lengs > 0: self.logger.info('CER: %.1f%%', 100.0 * edits / lengs) - def process_lines(self, textlines, maxlevel, region_image, region_coords): + def process_lines(self, textlines, maxlevel, region_image, region_xywh): edits = 0 lengs = 0 for line in textlines: - line_image, line_coords = self.workspace.image_from_segment( - line, region_image, region_coords) - - self.logger.info("Recognizing text in line '%s'", line.id) + line_image, line_coords = self.workspace.image_from_segment(line, region_image, region_xywh) + self.logger.info(f"Recognizing text in line '{line.id}'") if line.get_TextEquiv(): linegt = line.TextEquiv[0].Unicode else: @@ -198,19 +194,18 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): line.set_Word([]) if line_image.size[1] < 16: - self.logger.debug("ERROR: bounding box is too narrow at line %s", line.id) + self.logger.debug(f"ERROR: bounding box is too narrow at line {line.id}") continue # resize image to 48 pixel height final_img, scale = 
resize_keep_ratio(line_image) # process ocropy: try: - linepred, clist, rlist, confidlist = recognize( - final_img, self.pad, self.network, check=True) + linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug('error processing line "%s": %s', line.id, err) + self.logger.debug(f'error processing line "{line.id}": {err}') continue - self.logger.debug("OCR '%s': '%s'", line.id, linepred) + self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) lengs += len(linegt) @@ -226,11 +221,9 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): found_char = True word_conf_list[w_no].append(confidlist[i]) word_r_list[w_no].append(rlist[i]) - if c == ' ' and found_char: if i == 0: word_r_list[0][0] = rlist[i] - elif i+1 <= len(clist)-1 and clist[i+1] != ' ': word_conf_list.append([]) word_r_list.append([rlist[i]]) @@ -244,8 +237,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_coords): # conf for the line line_conf = (min(wordsconf) + max(wordsconf))/2 # line text - line.add_TextEquiv(TextEquivType( - Unicode=linepred, conf=line_conf)) + line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf)) if maxlevel in ['word', 'glyph']: for word_no, word_str in enumerate(words): From f24f86b9e963e28f206662e464f8843c99deddf0 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 12:33:04 +0200 Subject: [PATCH 109/194] fix: determine_zoom --- ocrd_cis/ocropy/binarize.py | 2 +- ocrd_cis/ocropy/clip.py | 3 ++- ocrd_cis/ocropy/common.py | 2 +- ocrd_cis/ocropy/denoise.py | 2 +- ocrd_cis/ocropy/dewarp.py | 2 +- ocrd_cis/ocropy/recognize.py | 2 +- ocrd_cis/ocropy/resegment.py | 2 +- ocrd_cis/ocropy/segment.py | 2 +- 8 files changed, 9 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 8f7d8d3a..7478edb5 100644 --- a/ocrd_cis/ocropy/binarize.py 
+++ b/ocrd_cis/ocropy/binarize.py @@ -101,7 +101,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) ret = [pcgts] if level == 'page': diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 75b4123f..400e9b54 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,5 +1,6 @@ from __future__ import absolute_import from logging import Logger +from typing import Tuple from os.path import join import numpy as np @@ -83,7 +84,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') # The zoom is not used anywhere - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) ret = [pcgts] # FIXME: what about text regions inside table regions? 
diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 095de5eb..c6b7c49d 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -2103,7 +2103,7 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(logger: logging.Logger, dpi: float, page_image_info: OcrdExif) -> float: +def determine_zoom(logger: logging.Logger, page_id: str, dpi: float, page_image_info: OcrdExif) -> float: if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 78d11c28..cc622c24 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,7 +63,7 @@ def process(self): page, page_id, feature_selector='binarized' if level == 'page' else '') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) if level == 'page': self.process_segment(page, page_image, page_xywh, zoom, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 9dddae44..72efca45 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -112,7 +112,7 @@ def process(self): page_image, page_xywh, page_image_info = self.workspace.image_from_page( page, page_id) - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index b9fc453f..bbb8e415 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -155,7 +155,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: regions = page.get_AllRegions(classes=['Text']) if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") - 
self.process_regions(regions, max_level, page_image, page_coords) + self.process_regions(regions, max_level, page_image, page_xywh) return [pcgts] def process_regions(self, regions, maxlevel, page_image, page_xywh): diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index b18c0b5e..1e9f8c7f 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -117,7 +117,7 @@ def process(self): page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) ignore = (page.get_ImageRegion() + page.get_LineDrawingRegion() + diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 782425cc..57368fe8 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -339,7 +339,7 @@ def process(self): # TODO: also allow grayscale_normalized (try/except?) 
page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.logger, self.parameter['dpi'], page_image_info) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) # aggregate existing regions so their foreground can be ignored ignore = (page.get_ImageRegion() + From 5f8e1dfb337d78cd757f4a6b5aff968829c2d4a1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 13:19:08 +0200 Subject: [PATCH 110/194] add missing Levenshtein req in setup --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 38f09abd..e3ee8213 100644 --- a/setup.py +++ b/setup.py @@ -41,6 +41,7 @@ 'ocrd>=3.0.0a1', 'click', 'scipy', + 'python-Levenshtein>=0.25.1', 'numpy>=1.17.0', 'pillow>=7.1.2', 'shapely>=1.7.1', From 9a14e1dddf44515630dadbcc23b62e6951eccc5d Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 13:53:33 +0200 Subject: [PATCH 111/194] fix: remove version req for Levenshtein --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e3ee8213..6b75d3a3 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ 'ocrd>=3.0.0a1', 'click', 'scipy', - 'python-Levenshtein>=0.25.1', + 'python-Levenshtein', 'numpy>=1.17.0', 'pillow>=7.1.2', 'shapely>=1.7.1', From 4ca4d1417030e40818327a7cc3571b22ad4ccda9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 15 Aug 2024 13:59:33 +0200 Subject: [PATCH 112/194] fix: Levenshtein import --- ocrd_cis/align/cli.py | 2 +- setup.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index ffe53fd8..7747622e 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -2,7 +2,7 @@ import click import json import os -import Levenshtein +from rapidfuzz.distance import Levenshtein from ocrd import Processor from ocrd.decorators import ocrd_cli_options from 
ocrd.decorators import ocrd_cli_wrap_processor diff --git a/setup.py b/setup.py index 6b75d3a3..38f09abd 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,6 @@ 'ocrd>=3.0.0a1', 'click', 'scipy', - 'python-Levenshtein', 'numpy>=1.17.0', 'pillow>=7.1.2', 'shapely>=1.7.1', From fbaafcb4e3f982496aafdf561a4cd4713d859f5c Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 16:23:00 +0200 Subject: [PATCH 113/194] update ocrd-cis-binarize to be compatible with bertsky/core#8 --- ocrd_cis/ocropy/binarize.py | 70 ++++++++++++++++--------------------- ocrd_cis/ocropy/common.py | 3 +- 2 files changed, 33 insertions(+), 40 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7478edb5..3c9583f9 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,21 +1,15 @@ from __future__ import absolute_import from logging import Logger +from typing import Optional import cv2 import numpy as np from PIL import Image -from os.path import abspath, dirname, join -from typing import Tuple +from ocrd.processor.ocrd_page_result import OcrdPageResult, OcrdPageResultImage -#import kraken.binarization - -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor from . import common @@ -71,7 +65,7 @@ def setup(self): self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise ValueError('only method=ocropy allows grayscale=true') - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. 
Iterate over the PAGE-XML element hierarchy down to the requested @@ -97,16 +91,17 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: self.logger.debug(f'Level of operation: "{level}"') pcgts = input_pcgts[0] + assert pcgts page = pcgts.get_Page() assert page page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - ret = [pcgts] + result = OcrdPageResult(pcgts) if level == 'page': try: - ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) + result.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) except ValueError as e: self.logger.error(e) else: @@ -121,7 +116,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region, page_image, page_xywh, feature_filter='binarized') if level == 'region': try: - ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) + result.images.append(self.process_region(region, region_image, region_xywh, zoom, region.id)) continue except ValueError as e: self.logger.error(e) @@ -132,12 +127,12 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') try: - ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) + result.images.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id)) except ValueError as e: self.logger.error(e) - return ret + return result - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageResultImage: if not page_image.width or not page_image.height: raise 
ValueError(f"Skipping page '{page_id}' with zero size") self.logger.info(f"About to binarize page '{page_id}'") @@ -171,18 +166,17 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) if self.parameter['grayscale']: - file_id += '.IMG-NRM' + id_suffix = '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + id_suffix = '.IMG-BIN' features += ',binarized' - bin_image_id = f'{file_id}.IMG-BIN' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alternative_image = AlternativeImageType(comments=features) + page.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(bin_image, id_suffix, alternative_image) - def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_region(self, region, region_image, region_xywh, zoom, page_id) -> OcrdPageResultImage: if not region_image.width or not region_image.height: raise ValueError(f"Skipping region '{region.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") @@ -217,21 +211,19 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] region.set_orientation(orientation) - bin_image_id = f'{file_id}_{region.id}' + id_suffix = f'{region.id}' if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + id_suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + id_suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # 
update PAGE (reference the image file): - region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alternative_image = AlternativeImageType(comments=features) + region.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(bin_image, id_suffix, alternative_image) - def process_line( - self, line, line_image, line_xywh, zoom, page_id, region_id, file_id - ) -> Tuple[Image.Image, str, str]: + def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> OcrdPageResultImage: if not line_image.width or not line_image.height: raise ValueError(f"Skipping line '{line.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") @@ -256,14 +248,14 @@ def process_line( bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - bin_image_id = f'{file_id}_{region_id}_{line.id}' + id_suffix = f'{region_id}_{line.id}' if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + id_suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + id_suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alternative_image = AlternativeImageType(comments=features) + line.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(bin_image, id_suffix, alternative_image) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c6b7c49d..c5b56ed0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from typing import Optional import warnings import logging @@ 
-2103,7 +2104,7 @@ def find_topological(): # DSAVE('rlabels_closed', rlabels) return rlabels -def determine_zoom(logger: logging.Logger, page_id: str, dpi: float, page_image_info: OcrdExif) -> float: +def determine_zoom(logger: logging.Logger, page_id: Optional[str], dpi: float, page_image_info: OcrdExif) -> float: if dpi > 0: zoom = 300.0/dpi elif page_image_info.resolution != 1: From 516ce4ba4bd4f65dae975472b5632d8d3b6027c2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 00:58:16 +0200 Subject: [PATCH 114/194] binarize: use final v3 API --- ocrd_cis/ocropy/binarize.py | 69 +++++++++++++++++-------------------- 1 file changed, 32 insertions(+), 37 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 7478edb5..fa47e139 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -6,17 +6,15 @@ from PIL import Image from os.path import abspath, dirname, join -from typing import Tuple +from typing import Union, Optional #import kraken.binarization -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType +from ocrd_models import OcrdFile, ClientSideOcrdFile, OcrdPage from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from . 
import common from .common import array2pil, determine_zoom, pil2array, remove_noise @@ -71,7 +69,7 @@ def setup(self): self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise ValueError('only method=ocropy allows grayscale=true') - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[Union[OcrdFile, ClientSideOcrdFile]], page_id: str = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. Iterate over the PAGE-XML element hierarchy down to the requested @@ -90,7 +88,8 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: Reference each new image in the AlternativeImage of the element. - Return a PAGE-XML with AlternativeImage and the arguments for ``workspace.save_image_file``. + Return a PAGE-XML with new AlternativeImage(s) and the arguments + for ``workspace.save_image_file``. 
""" level = self.parameter['level-of-operation'] assert self.workspace @@ -103,10 +102,10 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - ret = [pcgts] + ret = OcrdPageResult(pcgts) if level == 'page': try: - ret.append(self.process_page(page, page_image, page_xywh, zoom, page_id, output_file_id)) + ret.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) except ValueError as e: self.logger.error(e) else: @@ -121,7 +120,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: region, page_image, page_xywh, feature_filter='binarized') if level == 'region': try: - ret.append(self.process_region(region, region_image, region_xywh, zoom, region.id, output_file_id)) + ret.images.append(self.process_region(region, region_image, region_xywh, zoom, region.id)) continue except ValueError as e: self.logger.error(e) @@ -132,16 +131,15 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_filter='binarized') try: - ret.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id, output_file_id)) + ret.images.append(self.process_line(line, line_image, line_xywh, zoom, page_id, region.id)) except ValueError as e: self.logger.error(e) return ret - def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageResultImage: if not page_image.width or not page_image.height: raise ValueError(f"Skipping page '{page_id}' with zero size") self.logger.info(f"About to binarize page '{page_id}'") - assert self.output_file_grp features = 
page_xywh['features'] if 'angle' in page_xywh and page_xywh['angle']: @@ -171,18 +169,17 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id, file_id) -> T orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) if self.parameter['grayscale']: - file_id += '.IMG-NRM' + suffix = '.IMG-NRM' features += ',grayscale_normalized' else: - file_id += '.IMG-BIN' + suffix = '.IMG-BIN' features += ',binarized' - bin_image_id = f'{file_id}.IMG-BIN' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - page.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alt_image = AlternativeImageType(comments=features) + page.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) - def process_region(self, region, region_image, region_xywh, zoom, page_id, file_id) -> Tuple[Image.Image, str, str]: + def process_region(self, region, region_image, region_xywh, zoom, page_id) -> OcrdPageResultImage: if not region_image.width or not region_image.height: raise ValueError(f"Skipping region '{region.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region.id}'") @@ -217,21 +214,19 @@ def process_region(self, region, region_image, region_xywh, zoom, page_id, file_ orientation = -region_xywh['angle'] orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] region.set_orientation(orientation) - bin_image_id = f'{file_id}_{region.id}' + suffix = region.id if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - 
region.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alt_image = AlternativeImageType(comments=features) + region.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) - def process_line( - self, line, line_image, line_xywh, zoom, page_id, region_id, file_id - ) -> Tuple[Image.Image, str, str]: + def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> OcrdPageResultImage: if not line_image.width or not line_image.height: raise ValueError(f"Skipping line '{line.id}' with zero size") self.logger.info(f"About to binarize page '{page_id}' region '{region_id}' line '{line.id}'") @@ -256,14 +251,14 @@ def process_line( bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' - bin_image_id = f'{file_id}_{region_id}_{line.id}' + suffix = f'{region_id}_{line.id}' if self.parameter['grayscale']: - bin_image_id += '.IMG-NRM' + suffix += '.IMG-NRM' features += ',grayscale_normalized' else: - bin_image_id += '.IMG-BIN' + suffix += '.IMG-BIN' features += ',binarized' - bin_image_path = join(self.output_file_grp, f'{bin_image_id}.png') # update PAGE (reference the image file): - line.add_AlternativeImage(AlternativeImageType(filename=bin_image_path, comments=features)) - return bin_image, bin_image_id, bin_image_path + alt_image = AlternativeImageType(comments=features) + line.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) From 2e4f26f04ec5b2070a0396015d4339493e365fa1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:05:17 +0200 Subject: [PATCH 115/194] binarize: use correct types --- ocrd_cis/ocropy/binarize.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py 
b/ocrd_cis/ocropy/binarize.py index fa47e139..ac499336 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -11,8 +11,7 @@ #import kraken.binarization from ocrd_utils import getLogger -from ocrd_models.ocrd_page import AlternativeImageType -from ocrd_models import OcrdFile, ClientSideOcrdFile, OcrdPage +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage @@ -69,7 +68,7 @@ def setup(self): self.logger.critical(f'Requested method {method} does not support grayscale normalized output') raise ValueError('only method=ocropy allows grayscale=true') - def process_page_pcgts(self, *input_pcgts: Optional[Union[OcrdFile, ClientSideOcrdFile]], page_id: str = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Binarize (and optionally deskew/despeckle) the pages/regions/lines of the workspace. 
Iterate over the PAGE-XML element hierarchy down to the requested From 21be94106ac55d001cb5729f21138fb9c7715bcb Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:12:04 +0200 Subject: [PATCH 116/194] clip: use final v3 API --- ocrd_cis/ocropy/clip.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 400e9b54..d0119544 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -8,19 +8,17 @@ from shapely.geometry import Polygon from shapely.prepared import prep -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, to_xml +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( getLogger, - make_file_id, coordinates_of_segment, polygon_from_points, bbox_from_polygon, image_from_polygon, polygon_mask, crop_image, - MIMETYPE_PAGE ) from .ocrolib import midrange, morph @@ -38,7 +36,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyClip') - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Clip text regions / lines of a page at intersections with neighbours. Open and deserialize PAGE input file and its respective image, @@ -85,7 +83,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: page, page_id, feature_selector='binarized') # The zoom is not used anywhere zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - ret = [pcgts] + ret = OcrdPageResult(pcgts) # FIXME: what about text regions inside table regions? 
regions = list(page.get_TextRegion()) @@ -141,9 +139,9 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if shape.intersects(shapej)] if neighbours: segment_region_file_id = f"{output_file_id}_{region.id}" - ret.append(self.process_segment( + ret.images.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, - page_image, page_xywh, page_bin, page_id, segment_region_file_id)) + page_image, page_xywh, page_bin, page_id)) continue # level == 'line': lines = region.get_TextLine() @@ -172,14 +170,14 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if shape.intersects(shapej)] if neighbours: segment_line_file_id = f"{output_file_id}_{region.id}_{line.id}" - ret.append(self.process_segment( + ret.images.append(self.process_segment( line, masks[j], polygons[j], neighbours, background_image, - region_image, region_coords, region_bin, page_id, segment_line_file_id)) + region_image, region_coords, region_bin, page_id)) return ret def process_segment(self, segment, segment_mask, segment_polygon, neighbours, background_image, parent_image, parent_coords, parent_bin, - page_id, file_id) -> Tuple[Image.Image, str, str]: + page_id) -> OcrdPageResultImage: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( @@ -217,8 +215,7 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): segment_image = crop_image(segment_image,box=segment_bbox) - segment_image_id = file_id + '.IMG-CLIP' - segment_image_path = join(self.output_file_grp, f'{segment_image_id}.png') # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType(filename=segment_image_path, comments=features)) - return segment_image, segment_image_id, 
segment_image_path + alternative_image = AlternativeImageType(comments=features) + segment.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(segment_image, '.IMG-CLIP', alternative_image) From 9539ac9620776e335bbe107e57e92742027f02b3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:12:51 +0200 Subject: [PATCH 117/194] clip: use correct types --- ocrd_cis/ocropy/clip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index d0119544..3ddd6a70 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -1,6 +1,6 @@ from __future__ import absolute_import from logging import Logger -from typing import Tuple +from typing import Optional from os.path import join import numpy as np From 734b5eb4ef9bfee2e24d8053966b17eaf6e9e1f9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 01:14:56 +0200 Subject: [PATCH 118/194] recognize: use final v3 API --- ocrd_cis/ocropy/recognize.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index bbb8e415..7e4f2957 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -11,18 +11,16 @@ from ocrd_utils import ( getLogger, - make_file_id, coordinates_for_segment, polygon_from_bbox, points_from_polygon, - MIMETYPE_PAGE ) -from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( - to_xml, TextEquivType, OcrdPage, + TextEquivType, OcrdPage, CoordsType, GlyphType, WordType ) from ocrd import Processor +from ocrd.processor import OcrdPageResult from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange @@ -116,7 +114,7 @@ def get_model(self): f"Could not find model {p_model}. 
Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {p_model}") exit(1) - def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Recognize lines / words / glyphs of a page. Open and deserialize the PAGE input file and its respective image, @@ -156,7 +154,7 @@ def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: if not regions: self.logger.warning(f"Page '{page_id}' contains no text regions") self.process_regions(regions, max_level, page_image, page_xywh) - return [pcgts] + return OcrdPageResult(pcgts) def process_regions(self, regions, maxlevel, page_image, page_xywh): edits = 0 From 039e052f0a4226341ce1bf3070de53495b2a550f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:18:53 +0200 Subject: [PATCH 119/194] test_lib.bash: update GT Github URL --- tests/test_lib.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.bash b/tests/test_lib.bash index f28acb1e..801be01a 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ -5,7 +5,7 @@ trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR trap "rm -rf $tmpdir" EXIT OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" -data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.2.4/" +data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" mkdir -p "$PWD/download" From 28ad585c94f9895b3f5011a72aabf36b73d71a8e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:20:58 +0200 Subject: [PATCH 120/194] recognize: fix typing import --- ocrd_cis/ocropy/recognize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/recognize.py 
b/ocrd_cis/ocropy/recognize.py index 7e4f2957..97fcc64d 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -1,7 +1,8 @@ from __future__ import absolute_import + from logging import Logger from sys import exit -from typing import Any +from typing import Any, Optional from os import access, R_OK from os.path import abspath, dirname, isfile, join import numpy as np From 9a7c10ab71f7df3783f44848536aa99dd9c8e483 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:31:27 +0200 Subject: [PATCH 121/194] denoise: adapt to final v3 API --- ocrd_cis/ocropy/denoise.py | 122 +++++++++++++++---------------------- 1 file changed, 49 insertions(+), 73 deletions(-) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index cc622c24..0f368fd5 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,17 +1,15 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType + AlternativeImageType, OcrdPage ) from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .common import ( # binarize, @@ -27,10 +25,10 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDenoise') - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested ``level-of-operation``. 
@@ -49,73 +47,51 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ level = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id, - feature_selector='binarized' if level == 'page' else '') - - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - if level == 'page': - self.process_segment(page, page_image, page_xywh, zoom, - input_file.pageId, file_id) - else: - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh, - feature_selector='binarized' if level == 'region' else '') - if level == 'region': - self.process_segment(region, region_image, region_xywh, zoom, - input_file.pageId, file_id + '_' + region.id) - continue - lines = region.get_TextLine() - if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, - feature_selector='binarized') - self.process_segment(line, line_image, line_xywh, zoom, - input_file.pageId, - file_id + '_' + region.id + '_' + line.id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - 
file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id, file_id): + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, + feature_selector='binarized' if level == 'page' else '') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + if level == 'page': + image = self.process_segment(page, page_image, page_xywh, zoom) + if image: + result.images.append(image) + else: + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh, + feature_selector='binarized' if level == 'region' else '') + if level == 'region': + image = self.process_segment(region, region_image, region_xywh, zoom) + if image: + result.images.append(image) + continue + lines = region.get_TextLine() + if not lines: + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh, + feature_selector='binarized') + image = self.process_segment(line, line_image, line_xywh, zoom) + if image: + result.images.append(image) + + def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning("Skipping '%s' with zero size", file_id) - return + return None self.logger.info("About to despeckle '%s'", 
file_id) bin_image = remove_noise(segment_image, maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt - # update METS (add the image file): - file_path = self.workspace.save_image_file( - bin_image, file_id + '.IMG-DESPECK', self.output_file_grp, - page_id=page_id) # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=segment_xywh['features'] + ',despeckled')) + alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') + segment.add_AlternativeImage(alt_image) + return OcrdPageResultImage(bin_image, segment.id + '.IMG-DESPECK', alt_image) From 7c9f39fa4516401fe17e24d3ca67799c5b85d308 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:40:41 +0200 Subject: [PATCH 122/194] deskew: adapt to final v3 API --- ocrd_cis/ocropy/deskew.py | 116 +++++++++++++++----------------------- 1 file changed, 47 insertions(+), 69 deletions(-) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 616864e1..fae0c90c 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,24 +1,21 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join -from ocrd_utils import ( - getLogger, - make_file_id, - MIMETYPE_PAGE -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( PageType, - to_xml, AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd import Processor +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from . 
import common from .common import pil2array -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) - def deskew(pil_image, maxskew=2): array = pil2array(pil_image) _, angle = common.binarize(array, maxskew=maxskew) @@ -34,10 +31,10 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDeskew') - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the TextRegion level. Next, for each file, crop each region image according to the layout @@ -53,62 +50,45 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. """ level = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_coords, _ = self.workspace.image_from_page( - page, page_id, + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_coords, _ = self.workspace.image_from_page( + page, page_id, + # image must not have been rotated already, + # (we will overwrite @orientation anyway,) + # abort if no such image can be produced: + feature_filter='deskewed' if level == 'page' else '') + if level == 'page': + image = self._process_segment(page, page_image, page_coords, "page '%s'" % page_id, page_id) + if image: + result.images.append(image) + return result + if level == 'table': + regions = 
page.get_TableRegion() + else: # region + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + # process region: + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, # image must not have been rotated already, # (we will overwrite @orientation anyway,) # abort if no such image can be produced: - feature_filter='deskewed' if level == 'page' else '') - if level == 'page': - self._process_segment(page, page_image, page_coords, - "page '%s'" % page_id, input_file.pageId, - file_id) - else: - if level == 'table': - regions = page.get_TableRegion() - else: # region - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - # process region: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, - # image must not have been rotated already, - # (we will overwrite @orientation anyway,) - # abort if no such image can be produced: - feature_filter='deskewed') - self._process_segment(region, region_image, region_coords, - "region '%s'" % region.id, input_file.pageId, - file_id + '_' + region.id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - - def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id, file_id): + feature_filter='deskewed') + image = self._process_segment(region, region_image, region_coords, + "region 
'%s'" % region.id, page_id) + if image: + result.images.append(image) + return result + + def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning("Skipping %s with zero size", segment_id) - return + return None angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image self.logger.info("About to deskew %s", segment_id) angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied @@ -123,20 +103,18 @@ def _process_segment(self, segment, segment_image, segment_coords, segment_id, p segment_image, segment_coords, _ = self.workspace.image_from_page( segment, page_id, fill='background', transparency=True) + suffix = '.IMG-DESKEW' else: segment_image, segment_coords = self.workspace.image_from_segment( segment, segment_image, segment_coords, fill='background', transparency=True) + suffix = segment.id + '.IMG-DESKEW' if not angle: # zero rotation does not change coordinates, # but assures consuming processors that the # workflow had deskewing segment_coords['features'] += ',deskewed' - # update METS (add the image file): - file_path = self.workspace.save_image_file( - segment_image, file_id + '.IMG-DESKEW', self.output_file_grp, - page_id=page_id) # update PAGE (reference the image file): - segment.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=segment_coords['features'])) + alternative = AlternativeImageType(comments=segment_coords['features']) + segment.add_AlternativeImage(alternative) + return OcrdPageResultImage(segment_image, suffix, alternative) From 669866857395544ed10c0fbda5ea03abd1b31f14 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 02:52:55 +0200 Subject: [PATCH 123/194] dewarp: adapt to final v3 API --- ocrd_cis/ocropy/dewarp.py | 129 
+++++++++++++++----------------------- 1 file changed, 50 insertions(+), 79 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 72efca45..a063a05e 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,24 +1,22 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join + import numpy as np -from ocrd_utils import ( - getLogger, - make_file_id, -) -from ocrd_modelfactory import page_from_file +from ocrd_utils import getLogger from ocrd_models.ocrd_page import ( - to_xml, AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd import Processor -from ocrd_utils import MIMETYPE_PAGE +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .ocrolib import lineest from .common import array2pil, check_line, determine_zoom, pil2array -#sys.path.append(os.path.dirname(os.path.abspath(__file__))) - class InvalidLine(Exception): """Line image does not allow dewarping and should be ignored.""" @@ -80,10 +78,10 @@ def setup(self): # and extra params) 0.3)) - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. - Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the TextLine level. Next, get each line image according to the layout annotation (from @@ -99,71 +97,44 @@ def process(self): Produce a new output file by serialising the resulting hierarchy. 
""" - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id) - - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - regions = page.get_AllRegions(classes=['Text'], order='reading-order') - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) - - lines = region.get_TextLine() - if not lines: - self.logger.warning('Region %s contains no text lines', region.id) - for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh) - - self.logger.info("About to dewarp page '%s' region '%s' line '%s'", - page_id, region.id, line.id) - try: - dew_image = dewarp(line_image, self.lnorm, check=True, - max_neighbour=self.parameter['max_neighbour'], - zoom=zoom) - except InvalidLine as err: - self.logger.error('cannot dewarp line "%s": %s', line.id, err) - continue - except InadequateLine as err: - self.logger.warning('cannot dewarp line "%s": %s', line.id, err) - # as a fallback, simply pad the image vertically - # (just as dewarping would do on average, so at least - # this line has similar margins as the others): - dew_image = padvert(line_image, self.parameter['range']) - # update METS (add the image file): - file_path = self.workspace.save_image_file( - dew_image, - file_id + '_' + region.id + '_' + line.id + '.IMG-DEWARP', - self.output_file_grp, - page_id=input_file.pageId) - # update PAGE (reference 
the image file): - alternative_image = line.get_AlternativeImage() - line.add_AlternativeImage(AlternativeImageType( - filename=file_path, - comments=line_xywh['features'] + ',dewarped')) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) + pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) + page = pcgts.get_Page() + + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id) + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + regions = page.get_AllRegions(classes=['Text'], order='reading-order') + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + region_image, region_xywh = self.workspace.image_from_segment( + region, page_image, page_xywh) + + lines = region.get_TextLine() + if not lines: + self.logger.warning('Region %s contains no text lines', region.id) + for line in lines: + line_image, line_xywh = self.workspace.image_from_segment( + line, region_image, region_xywh) + + self.logger.info("About to dewarp page '%s' region '%s' line '%s'", + page_id, region.id, line.id) + try: + dew_image = dewarp(line_image, self.lnorm, check=True, + max_neighbour=self.parameter['max_neighbour'], + zoom=zoom) + except InvalidLine as err: + self.logger.error('cannot dewarp line "%s": %s', line.id, err) + continue + except InadequateLine as err: + self.logger.warning('cannot dewarp line "%s": %s', line.id, err) + # as a fallback, simply pad the image vertically + # (just as dewarping would do on average, so at least + # this line has similar margins as the others): + 
dew_image = padvert(line_image, self.parameter['range']) + # update PAGE (reference the image file): + alt_image = AlternativeImageType(comments=line_xywh['features'] + ',dewarped') + line.add_AlternativeImage(alternative_image) + return OcrdPageResultImage(dew_image, region.id + '_' + line.id + '.IMG-DEWARP', alt_image) From 48a3146a4e510b14899aafc80c7f9f05da05fc48 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 03:07:40 +0200 Subject: [PATCH 124/194] resegment: adapt to final v3 API --- ocrd_cis/ocropy/resegment.py | 109 +++++++++++++++-------------------- 1 file changed, 45 insertions(+), 64 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 1e9f8c7f..05f17d4f 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -1,24 +1,25 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join + import numpy as np from skimage import draw, segmentation from shapely.geometry import Polygon, LineString from shapely.prepared import prep -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import BaselineType, PageType, to_xml -from ocrd import Processor from ocrd_utils import ( getLogger, - make_file_id, coordinates_of_segment, coordinates_for_segment, points_from_polygon, polygon_from_points, transform_coordinates, - MIMETYPE_PAGE ) +from ocrd_models.ocrd_page import BaselineType, PageType, OcrdPage +from ocrd import Processor +from ocrd.processor import OcrdPageResult from .ocrolib import midrange, morph from .common import ( @@ -52,10 +53,10 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyResegment') - def process(self): + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. 
- Open and deserialise PAGE input files and their respective images, + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the line level. Next, get the page image according to the layout annotation (from @@ -104,67 +105,47 @@ def process(self): # accuracy crucially depends on a good estimate of the images' # pixel density (at least if source input is not 300 DPI). level = self.parameter['level-of-operation'] + pcgts = input_pcgts[0] + page = pcgts.get_Page() - for n, input_file in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID - page = pcgts.get_Page() - - page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - ignore = (page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_GraphicRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_SeparatorRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - regions = page.get_AllRegions(classes=['Text']) - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - elif level == 'page': - lines = [line for region in regions - for line in region.get_TextLine()] + ignore = (page.get_ImageRegion() + + 
page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_ChartRegion() + + page.get_MapRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_SeparatorRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + regions = page.get_AllRegions(classes=['Text']) + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + elif level == 'page': + lines = [line for region in regions + for line in region.get_TextLine()] + if lines: + self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) + else: + self.logger.warning('Page "%s" contains no text regions with lines', page_id) + else: + for region in regions: + lines = region.get_TextLine() if lines: - self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - self.logger.warning('Page "%s" contains no text regions with lines', page_id) - else: - for region in regions: - lines = region.get_TextLine() - if lines: - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) - else: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: 
%s', - file_id, self.output_file_grp, out.local_filename) - + self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + return OcrdPageResult(pcgts) + def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): threshold = self.parameter['min_fraction'] method = self.parameter['method'] From 0dd6fbac1a63965d241203cdc1dda85ca1fa4728 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 11:04:23 +0200 Subject: [PATCH 125/194] ocropy_segment: implement process_page_pcgts --- ocrd_cis/ocropy/segment.py | 314 +++++++++++++++++++++++++++---------- 1 file changed, 229 insertions(+), 85 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 57368fe8..d2a7a727 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,6 +1,7 @@ from __future__ import absolute_import from logging import Logger from os.path import join +from typing import Optional import itertools import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree @@ -16,6 +17,7 @@ from ocrd_modelfactory import page_from_file from ocrd_models.ocrd_page import ( to_xml, CoordsType, + OcrdPage, TextLineType, TextRegionType, SeparatorRegionType, @@ -35,6 +37,7 @@ ReadingOrderType ) from ocrd import Processor +from ocrd.processor import OcrdPageResult from ocrd_utils import ( getLogger, make_file_id, @@ -252,6 +255,168 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + overwrite_lines = self.parameter['overwrite_lines'] + overwrite_regions = self.parameter['overwrite_regions'] + overwrite_separators = self.parameter['overwrite_separators'] + overwrite_order = self.parameter['overwrite_order'] + oplevel = self.parameter['level-of-operation'] + + pcgts = input_pcgts[0] + page = pcgts.get_Page() + + # TODO: also allow 
grayscale_normalized (try/except?) + page_image, page_coords, page_image_info = self.workspace.image_from_page( + page, page_id, feature_selector='binarized') + zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + + # aggregate existing regions so their foreground can be ignored + ignore = (page.get_ImageRegion() + + page.get_LineDrawingRegion() + + page.get_GraphicRegion() + + page.get_ChartRegion() + + page.get_MapRegion() + + page.get_MathsRegion() + + page.get_ChemRegion() + + page.get_MusicRegion() + + page.get_AdvertRegion() + + page.get_NoiseRegion() + + page.get_UnknownRegion() + + page.get_CustomRegion()) + if oplevel == 'page' and overwrite_separators: + page.set_SeparatorRegion([]) + else: + ignore.extend(page.get_SeparatorRegion()) + # prepare reading order + reading_order = dict() + ro = page.get_ReadingOrder() + if ro: + rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() + if rogroup: + page_get_reading_order(reading_order, rogroup) + # get segments to process / overwrite + if oplevel == 'page': + ignore.extend(page.get_TableRegion()) + regions = list(page.get_TextRegion()) + if regions: + # page is already region-segmented + if overwrite_regions: + self.logger.info(f'Removing existing TextRegions in page "{page_id}"') + # we could remove all other region types as well, + # but this is more flexible (for workflows with + # specialized separator/image/table detectors): + page.set_TextRegion([]) + page.set_ReadingOrder(None) + ro = None + else: + self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"') + ignore.extend(regions) + # create reading order if necessary + if not ro or overwrite_order: + ro = ReadingOrderType() + page.set_ReadingOrder(ro) + rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() + if not rogroup: + # new top-level group + rogroup = OrderedGroupType(id="reading-order") + ro.set_OrderedGroup(rogroup) + # go get TextRegions with TextLines (and SeparatorRegions): + 
self._process_element( + page, ignore, page_image, page_coords, page_id, file_id, page_id, zoom, rogroup=rogroup) + if (not rogroup.get_RegionRefIndexed() and + not rogroup.get_OrderedGroupIndexed() and + not rogroup.get_UnorderedGroupIndexed()): + # schema forbids empty OrderedGroup + ro.set_OrderedGroup(None) + elif oplevel == 'table': + ignore.extend(page.get_TextRegion()) + regions = list(page.get_TableRegion()) + if not regions: + self.logger.warning(f'Page "{page_id}" contains no table regions') + for region in regions: + subregions = region.get_TextRegion() + if subregions: + # table is already cell-segmented + if overwrite_regions: + self.logger.info(f'Removing existing TextRegions in table "{region.id}"') + region.set_TextRegion([]) + roelem = reading_order.get(region.id) + # replace by empty group with same index and ref + # (which can then take the cells as subregions) + reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) + else: + self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') + continue + # TODO: also allow grayscale_normalized (try/except?) 
+ region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + # ignore everything but the current table region + subignore = regions + ignore + subignore.remove(region) + # create reading order group if necessary + roelem = reading_order.get(region.id) + if not roelem: + self.logger.warning(f"Page '{page_id}' table region '{region.id}' is not referenced in reading " + f"order (no target to add cells to)") + elif overwrite_order: + # replace by empty ordered group with same (index and) ref + # (which can then take the cells as subregions) + roelem = page_subgroup_in_reading_order(self.logger, roelem) + reading_order[region.id] = roelem + elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): + self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an ordered " + f"group (cells will be appended)") + elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): + self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an unordered " + f"group (cells will not be appended)") + roelem = None + else: + # replace regionRef(Indexed) by group with same index and ref + # (which can then take the cells as subregions) + roelem = page_subgroup_in_reading_order(self.logger, roelem) + reading_order[region.id] = roelem + # go get TextRegions with TextLines (and SeparatorRegions) + self._process_element( + region, subignore, region_image, region_coords, region.id, file_id + '_' + region.id, + page_id, zoom, rogroup=roelem) + else: # 'region' + regions = list(page.get_TextRegion()) + # besides top-level text regions, line-segment any table cells, + # and for tables without any cells, add a pseudo-cell + for region in page.get_TableRegion(): + subregions = region.get_TextRegion() + if subregions: + regions.extend(subregions) + else: + subregion = TextRegionType( + id=region.id + '_text', + Coords=region.get_Coords(), + # as if generated from 
parser: + parent_object_=region) + region.add_TextRegion(subregion) + regions.append(subregion) + if not regions: + self.logger.warning('Page "%s" contains no text regions', page_id) + for region in regions: + if region.get_TextLine(): + if overwrite_lines: + self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') + region.set_TextLine([]) + else: + self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') + ignore.extend(region.get_TextLine()) + # TODO: also allow grayscale_normalized (try/except?) + region_image, region_coords = self.workspace.image_from_segment( + region, page_image, page_coords, feature_selector='binarized') + # if the region images have already been clipped against their neighbours specifically, + # then we don't need to suppress all neighbours' foreground generally here + if 'clipped' in region_coords['features'].split(','): + ignore = [] + # go get TextLines + self._process_element( + region, ignore, region_image, region_coords, region.id, file_id + '_' + region.id, page_id, zoom) + return OcrdPageResult(pcgts) + def process(self): """Segment pages into regions+lines, tables into cells+lines, or regions into lines. @@ -335,7 +500,7 @@ def process(self): self.add_metadata(pcgts) page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) page = pcgts.get_Page() - + # TODO: also allow grayscale_normalized (try/except?) page_image, page_coords, page_image_info = self.workspace.image_from_page( page, page_id, feature_selector='binarized') @@ -521,15 +686,14 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, newly detected separators to guide region segmentation. 
""" if not image.width or not image.height: - self.logger.warning("Skipping '%s' with zero size", element_id) + self.logger.warning(f"Skipping '{element_id}' with zero size") return element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - self.logger.debug('masking foreground of %s "%s" for "%s"', - type(segment).__name__[:-4], segment.id, element_id) + self.logger.debug(f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element_id}"') # mark these segments (e.g. separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -540,13 +704,11 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # negative/above-max indices), either fully or partially, # then this will silently ignore them. The caller does # not need to concern herself with this. 
+ sp_row = segment_polygon[:, 1] + sp_column = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): - sep_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - sep_bin.shape)] = True - ignore_labels[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - ignore_labels.shape)] = i+1 # mapped back for RO + sep_bin[draw.polygon(sp_row, sp_column, sep_bin.shape)] = True + ignore_labels[draw.polygon(sp_row, sp_column, ignore_labels.shape)] = i + 1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True @@ -562,7 +724,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - self.logger.info('computing line segmentation for %s "%s"', element_name, element_id) + self.logger.info(f'Computing line segmentation for {element_name} "{element_id}"') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -570,9 +732,9 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, line_labels, baselines, seplines, images, colseps, scale = compute_segmentation( # suppress separators and ignored regions for textline estimation # but keep them for h/v-line detection (in fullpage mode): - element_bin, seps=(sep_bin+ignore_labels)>0, + element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread']/zoom*300/72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -580,16 +742,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - self.logger.error('Cannot line-segment region "%s": %s', element_id, err) + 
self.logger.error(f'Cannot line-segment region "{element_id}": {err}') # as a fallback, add a single text line comprising the whole region: element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) else: - self.logger.error('Cannot line-segment %s "%s": %s', element_name, element_id, err) + self.logger.error(f'Cannot line-segment {element_name} "{element_id}": {err}') return - - self.logger.info('Found %d text lines for %s "%s"', - len(np.unique(line_labels)) - 1, - element_name, element_id) + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element_id}"') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -598,31 +757,28 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # i.e. identical line and region labels # to detect their reading order among the others # (these cannot be split or grouped together with other regions) - line_labels = np.where(line_labels, line_labels+len(ignore), ignore_labels) + line_labels = np.where(line_labels, line_labels + len(ignore), ignore_labels) # suppress separators/images in fg and try to use for partitioning slices sepmask = np.maximum(sep_bin, np.maximum(seplines > 0, images > 0)) region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - self.logger.info('Found %d text regions for %s "%s"', - len(np.unique(region_labels)) - 1, - element_name, element_id) + self.logger.info( + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element_id}"') except Exception as err: - 
self.logger.error('Cannot region-segment %s "%s": %s', - element_name, element_id, err) + self.logger.error(f'Cannot region-segment {element_name} "{element_id}": {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) - # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): index = 0 - # start counting from largest existing index + # start counting from the largest existing index for elem in (rogroup.get_RegionRefIndexed() + rogroup.get_OrderedGroupIndexed() + rogroup.get_UnorderedGroupIndexed()): @@ -634,7 +790,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_no = 0 for region_label in np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -643,13 +799,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - "region label %d has both existing regions and new lines (%s)" % ( - region_label, str(region_line_labels0)) + (f"Region label {region_label} has both existing regions and new lines " + f"({str(region_line_labels0)})") region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - self.logger.debug('Region label %d is for ignored region "%s"', - region_label, region.id) + self.logger.debug(f'Region label {region_label} is for ignored region "{region.id}"') continue # normal case: new lines inside new regions # remove binary-empty labels, and re-order locally @@ -657,18 +812,18 @@ def _process_element(self, element, ignore, 
image, coords, element_id, file_id, order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, - seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds( + element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, - '%s "%s"' % (element_name, element_id), - min_area=6000/zoom/zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons( + self.logger, region_mask * region_label, None, element_bin, + name=f'{element_name} "{element_id}"', min_area=6000 / zoom / zoom, + simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, - 'region "%s"' % element_id, - min_area=640/zoom/zoom) + lines, _ = masks2polygons( + self.logger, region_line_labels, baselines, element_bin, + name=f'region "{element_id}"', min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -677,34 +832,31 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_polygon = coordinates_for_segment(region_polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - self.logger.warning('Ignoring extant region contour for region label %d', region_label) + self.logger.warning(f'Ignoring extant region contour for region label {region_label}') continue # annotate result: region_no += 1 region_id = element_id + "_region%04d" % region_no - self.logger.debug('Region label %d becomes ID 
"%s"', region_label, region_id) - region = TextRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon))) + self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') + region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(line_polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, region) if line_polygon is None: - self.logger.warning('Ignoring extant line contour for region label %d line label %d', - region_label, line_label) + self.logger.warning( + f'Ignoring extant line contour for region label {region_label} line label {line_label}') continue # annotate result: line_no += 1 line_id = region_id + "_line%04d" % line_no - self.logger.debug('Line label %d becomes ID "%s"', line_label, line_id) - line = TextLineType(id=line_id, - Coords=CoordsType(points=points_from_polygon(line_polygon))) + self.logger.debug(f'Line label {line_label} becomes ID "{line_id}"') + line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: line_baseline = coordinates_for_segment(line_baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) @@ -712,95 +864,87 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - self.logger.info('Added region "%s" with %d lines for %s "%s"', - region_id, line_no, element_name, element_id) + self.logger.info( + f'Added region "{region_id}" 
with {line_no} lines for {element_name} "{element_id}"') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - self.logger.info('Found %d large image regions for %s "%s"', images.max(), element_name, element_id) + self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element_id}"') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, - '%s "%s"' % (element_name, element_id)) + image_polygons, _ = masks2polygons( + self.logger, images, None, element_bin, f'{element_name} "{element_id}"') for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - self.logger.warning('Ignoring extant region contour for image label %d', image_label) + self.logger.warning(f'Ignoring extant region contour for image label {image_label}') continue region_no += 1 # annotate result: region_id = element_id + "_image%04d" % region_no element.add_ImageRegion(ImageRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon)))) + id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - self.logger.info('Found %d separators for %s "%s"', seplines.max(), element_name, element_id) + self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element_id}"') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, - '%s "%s"' % (element_name, element_id), - open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons( + self.logger, seplines, None, element_bin, + 
name=f'{element_name} "{element_id}"', open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) region_polygon = polygon_for_parent(region_polygon, element) if region_polygon is None: - self.logger.warning('Ignoring extant region contour for separator %d', sep_label) + self.logger.warning(f'Ignoring extant region contour for separator {sep_label}') continue # annotate result: region_no += 1 region_id = element_id + "_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( - id=region_id, Coords=CoordsType( - points=points_from_polygon(region_polygon)))) + id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) element.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=coords['features'] + ',clipped')) else: - # get mask from region polygon: + # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, - 'region "%s"' % element_id, - min_area=640/zoom/zoom) + line_polygons, _ = masks2polygons( + self.logger, line_labels, 
baselines, element_bin, + name=f'region "{element_id}"', min_area=640 / zoom / zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: line_polygon = coordinates_for_segment(polygon, image, coords) line_polygon = polygon_for_parent(line_polygon, element) if line_polygon is None: - self.logger.warning('Ignoring extant line contour for line label %d', - line_label) + self.logger.warning(f'Ignoring extant line contour for line label {line_label}') continue # annotate result: line_no += 1 line_id = element_id + "_line%04d" % line_no - line = TextLineType(id=line_id, - Coords=CoordsType(points=points_from_polygon(line_polygon))) + line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return # no derived image + return # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, - page_id=page_id) + image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) # update PAGE (reference the image file): element.add_AlternativeImage(AlternativeImageType( filename=file_path, comments=coords['features'] + ',clipped')) From ad5ac7c4ab7f2b52bf313563456feca0094761ce Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 11:06:01 +0200 Subject: [PATCH 126/194] ocropy_segment: remove process --- ocrd_cis/ocropy/segment.py | 317 ++++++++----------------------------- 1 file changed, 67 insertions(+), 250 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py 
b/ocrd_cis/ocropy/segment.py index d2a7a727..94b6ab1f 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -256,6 +256,73 @@ def setup(self): self.logger = getLogger('processor.OcropySegment') def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + """Segment pages into regions+lines, tables into cells+lines, or regions into lines. + Open and deserialise PAGE input files and their respective images, + then iterate over the element hierarchy down to the requested level. + + Depending on ``level-of-operation``, consider existing segments: + - If ``overwrite_separators=True`` on ``page`` level, then + delete any SeparatorRegions. + - If ``overwrite_regions=True`` on ``page`` level, then + delete any top-level TextRegions (along with ReadingOrder). + - If ``overwrite_regions=True`` on ``table`` level, then + delete any TextRegions in TableRegions (along with their OrderGroup). + - If ``overwrite_lines=True`` on ``region`` level, then + delete any TextLines in TextRegions. + - If ``overwrite_order=True`` on ``page`` or ``table`` level, then + delete the reading order OrderedGroup entry corresponding + to the (page/table) segment. + + Next, get each element image according to the layout annotation (from + the alternative image of the page/region, or by cropping via coordinates + into the higher-level image) in binarized form, and represent it as an array + with non-text regions and (remaining) text neighbours suppressed. + + Then compute a text line segmentation for that array (as a label mask). + When ``level-of-operation`` is ``page`` or ``table``, this also entails + detecting + - up to ``maximages`` large foreground images, + - up to ``maxseps`` foreground line separators and + - up to ``maxcolseps`` background column separators + before text line segmentation itself, as well as aggregating text lines + to text regions afterwards. 
+ + Text regions are detected via a hybrid variant recursive X-Y cut algorithm + (RXYC): RXYC partitions the binarized image in top-down manner by detecting + horizontal or vertical gaps. This implementation uses the bottom-up text line + segmentation to guide the search, and also uses both pre-existing and newly + detected separators to alternatively partition the respective boxes into + non-rectangular parts. + + During line segmentation, suppress the foreground of all previously annotated + regions (of any kind) and lines, except if just removed due to ``overwrite``. + During region aggregation however, combine the existing separators with the + new-found separators to guide the column search. + + All detected segments (both text line and text region) are sorted according + to their reading order (assuming a top-to-bottom, left-to-right ordering). + When ``level-of-operation`` is ``page``, prefer vertical (column-first) + succession of regions. When it is ``table``, prefer horizontal (row-first) + succession of cells. + + Then for each resulting segment label, convert its background mask into + polygon outlines by finding the outer contours consistent with the element's + polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: + - If ``level-of-operation`` is ``region``, then append the new lines to the + parent region. + - If it is ``table``, then append the new lines to their respective regions, + and append the new regions to the parent table. + (Also, create an OrderedGroup for it as the parent's RegionRef.) + - If it is ``page``, then append the new lines to their respective regions, + and append the new regions to the page. + (Also, create an OrderedGroup for it in the ReadingOrder.) + + Produce a new output file by serialising the resulting hierarchy. 
+ """ + # FIXME: allow passing a-priori info on reading order / textline order + # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture + # of different scripts; also, vertical writing needs internal rotation + # because our line segmentation only works for horizontal writing) overwrite_lines = self.parameter['overwrite_lines'] overwrite_regions = self.parameter['overwrite_regions'] overwrite_separators = self.parameter['overwrite_separators'] @@ -417,256 +484,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional region, ignore, region_image, region_coords, region.id, file_id + '_' + region.id, page_id, zoom) return OcrdPageResult(pcgts) - def process(self): - """Segment pages into regions+lines, tables into cells+lines, or regions into lines. - - Open and deserialise PAGE input files and their respective images, - then iterate over the element hierarchy down to the requested level. - - Depending on ``level-of-operation``, consider existing segments: - - If ``overwrite_separators=True`` on ``page`` level, then - delete any SeparatorRegions. - - If ``overwrite_regions=True`` on ``page`` level, then - delete any top-level TextRegions (along with ReadingOrder). - - If ``overwrite_regions=True`` on ``table`` level, then - delete any TextRegions in TableRegions (along with their OrderGroup). - - If ``overwrite_lines=True`` on ``region`` level, then - delete any TextLines in TextRegions. - - If ``overwrite_order=True`` on ``page`` or ``table`` level, then - delete the reading order OrderedGroup entry corresponding - to the (page/table) segment. - - Next, get each element image according to the layout annotation (from - the alternative image of the page/region, or by cropping via coordinates - into the higher-level image) in binarized form, and represent it as an array - with non-text regions and (remaining) text neighbours suppressed. - - Then compute a text line segmentation for that array (as a label mask). 
- When ``level-of-operation`` is ``page`` or ``table``, this also entails - detecting - - up to ``maximages`` large foreground images, - - up to ``maxseps`` foreground line separators and - - up to ``maxcolseps`` background column separators - before text line segmentation itself, as well as aggregating text lines - to text regions afterwards. - - Text regions are detected via a hybrid variant recursive X-Y cut algorithm - (RXYC): RXYC partitions the binarized image in top-down manner by detecting - horizontal or vertical gaps. This implementation uses the bottom-up text line - segmentation to guide the search, and also uses both pre-existing and newly - detected separators to alternatively partition the respective boxes into - non-rectangular parts. - - During line segmentation, suppress the foreground of all previously annotated - regions (of any kind) and lines, except if just removed due to ``overwrite``. - During region aggregation however, combine the existing separators with the - new-found separators to guide the column search. - - All detected segments (both text line and text region) are sorted according - to their reading order (assuming a top-to-bottom, left-to-right ordering). - When ``level-of-operation`` is ``page``, prefer vertical (column-first) - succession of regions. When it is ``table``, prefer horizontal (row-first) - succession of cells. - - Then for each resulting segment label, convert its background mask into - polygon outlines by finding the outer contours consistent with the element's - polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: - - If ``level-of-operation`` is ``region``, then append the new lines to the - parent region. - - If it is ``table``, then append the new lines to their respective regions, - and append the new regions to the parent table. - (Also, create an OrderedGroup for it as the parent's RegionRef.) 
- - If it is ``page``, then append the new lines to their respective regions, - and append the new regions to the page. - (Also, create an OrderedGroup for it in the ReadingOrder.) - - Produce a new output file by serialising the resulting hierarchy. - """ - # FIXME: allow passing a-priori info on reading order / textline order - # (and then pass on as ``bt`` and ``rl``; however, there may be a mixture - # of different scripts; also, vertical writing needs internal rotation - # because our line segmentation only works for horizontal writing) - overwrite_lines = self.parameter['overwrite_lines'] - overwrite_regions = self.parameter['overwrite_regions'] - overwrite_separators = self.parameter['overwrite_separators'] - overwrite_order = self.parameter['overwrite_order'] - oplevel = self.parameter['level-of-operation'] - - for (n, input_file) in enumerate(self.input_files): - self.logger.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID) - file_id = make_file_id(input_file, self.output_file_grp) - - pcgts = page_from_file(self.workspace.download_file(input_file)) - self.add_metadata(pcgts) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - - # TODO: also allow grayscale_normalized (try/except?) 
- page_image, page_coords, page_image_info = self.workspace.image_from_page( - page, page_id, feature_selector='binarized') - zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) - - # aggregate existing regions so their foreground can be ignored - ignore = (page.get_ImageRegion() + - page.get_LineDrawingRegion() + - page.get_GraphicRegion() + - page.get_ChartRegion() + - page.get_MapRegion() + - page.get_MathsRegion() + - page.get_ChemRegion() + - page.get_MusicRegion() + - page.get_AdvertRegion() + - page.get_NoiseRegion() + - page.get_UnknownRegion() + - page.get_CustomRegion()) - if oplevel == 'page' and overwrite_separators: - page.set_SeparatorRegion([]) - else: - ignore.extend(page.get_SeparatorRegion()) - # prepare reading order - reading_order = dict() - ro = page.get_ReadingOrder() - if ro: - rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() - if rogroup: - page_get_reading_order(reading_order, rogroup) - - # get segments to process / overwrite - if oplevel == 'page': - ignore.extend(page.get_TableRegion()) - regions = list(page.get_TextRegion()) - if regions: - # page is already region-segmented - if overwrite_regions: - self.logger.info('removing existing TextRegions in page "%s"', page_id) - # we could remove all other region types as well, - # but this is more flexible (for workflows with - # specialized separator/image/table detectors): - page.set_TextRegion([]) - page.set_ReadingOrder(None) - ro = None - else: - self.logger.warning('keeping existing TextRegions in page "%s"', page_id) - ignore.extend(regions) - # create reading order if necessary - if not ro or overwrite_order: - ro = ReadingOrderType() - page.set_ReadingOrder(ro) - rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() - if not rogroup: - # new top-level group - rogroup = OrderedGroupType(id="reading-order") - ro.set_OrderedGroup(rogroup) - # go get TextRegions with TextLines (and SeparatorRegions): - self._process_element(page, ignore, 
page_image, page_coords, - page_id, file_id, - input_file.pageId, zoom, rogroup=rogroup) - if (not rogroup.get_RegionRefIndexed() and - not rogroup.get_OrderedGroupIndexed() and - not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup - ro.set_OrderedGroup(None) - elif oplevel == 'table': - ignore.extend(page.get_TextRegion()) - regions = list(page.get_TableRegion()) - if not regions: - self.logger.warning('Page "%s" contains no table regions', page_id) - for region in regions: - subregions = region.get_TextRegion() - if subregions: - # table is already cell-segmented - if overwrite_regions: - self.logger.info('removing existing TextRegions in table "%s"', region.id) - region.set_TextRegion([]) - roelem = reading_order.get(region.id) - # replace by empty group with same index and ref - # (which can then take the cells as subregions) - reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) - else: - self.logger.warning('skipping table "%s" with existing TextRegions', region.id) - continue - # TODO: also allow grayscale_normalized (try/except?) 
- region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - # ignore everything but the current table region - subignore = regions + ignore - subignore.remove(region) - # create reading order group if necessary - roelem = reading_order.get(region.id) - if not roelem: - self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", - page_id, region.id, "no target to add cells to") - elif overwrite_order: - # replace by empty ordered group with same (index and) ref - # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(self.logger, roelem) - reading_order[region.id] = roelem - elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", - page_id, region.id, "cells will be appended") - elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", - page_id, region.id, "cells will not be appended") - roelem = None - else: - # replace regionRef(Indexed) by group with same index and ref - # (which can then take the cells as subregions) - roelem = page_subgroup_in_reading_order(self.logger, roelem) - reading_order[region.id] = roelem - # go get TextRegions with TextLines (and SeparatorRegions) - self._process_element(region, subignore, region_image, region_coords, - region.id, file_id + '_' + region.id, - input_file.pageId, zoom, rogroup=roelem) - else: # 'region' - regions = list(page.get_TextRegion()) - # besides top-level text regions, line-segment any table cells, - # and for tables without any cells, add a pseudo-cell - for region in page.get_TableRegion(): - subregions = region.get_TextRegion() - if subregions: - regions.extend(subregions) - else: - subregion = TextRegionType(id=region.id + '_text', - Coords=region.get_Coords(), - # 
as if generated from parser: - parent_object_=region) - region.add_TextRegion(subregion) - regions.append(subregion) - if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) - for region in regions: - if region.get_TextLine(): - if overwrite_lines: - self.logger.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) - region.set_TextLine([]) - else: - self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) - ignore.extend(region.get_TextLine()) - # TODO: also allow grayscale_normalized (try/except?) - region_image, region_coords = self.workspace.image_from_segment( - region, page_image, page_coords, feature_selector='binarized') - # if the region images have already been clipped against their neighbours specifically, - # then we don't need to suppress all neighbours' foreground generally here - if 'clipped' in region_coords['features'].split(','): - ignore = [] - # go get TextLines - self._process_element(region, ignore, region_image, region_coords, - region.id, file_id + '_' + region.id, - input_file.pageId, zoom) - - # update METS (add the PAGE file): - file_path = join(self.output_file_grp, file_id + '.xml') - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=file_path, - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts)) - self.logger.info('created file ID: %s, file_grp: %s, path: %s', - file_id, self.output_file_grp, out.local_filename) - def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): """Add PAGE layout elements by segmenting an image. 
From 5d4007be9ec0e352520995302bd8b11e92e51aae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:41:01 +0200 Subject: [PATCH 127/194] segment: adapt to final v3 API --- ocrd_cis/ocropy/segment.py | 252 +++++++++++++++++++------------------ 1 file changed, 133 insertions(+), 119 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 94b6ab1f..bdeb40dd 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -1,8 +1,10 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from os.path import join -from typing import Optional import itertools + import numpy as np from scipy.sparse.csgraph import minimum_spanning_tree from skimage import draw @@ -14,15 +16,21 @@ from shapely.validation import explain_validity from shapely import set_precision -from ocrd_modelfactory import page_from_file +from ocrd_utils import ( + getLogger, + coordinates_of_segment, + coordinates_for_segment, + points_from_polygon, + polygon_from_points, +) from ocrd_models.ocrd_page import ( - to_xml, CoordsType, - OcrdPage, + CoordsType, TextLineType, TextRegionType, SeparatorRegionType, PageType, - AlternativeImageType + AlternativeImageType, + OcrdPage ) from ocrd_models.ocrd_page_generateds import ( BaselineType, @@ -37,16 +45,7 @@ ReadingOrderType ) from ocrd import Processor -from ocrd.processor import OcrdPageResult -from ocrd_utils import ( - getLogger, - make_file_id, - coordinates_of_segment, - coordinates_for_segment, - points_from_polygon, - polygon_from_points, - MIMETYPE_PAGE -) +from ocrd.processor import OcrdPageResult, OcrdPageResultImage from .ocrolib import midrange from .ocrolib import morph @@ -255,11 +254,12 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: + 
def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or regions into lines. - Open and deserialise PAGE input files and their respective images, + + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested level. - + Depending on ``level-of-operation``, consider existing segments: - If ``overwrite_separators=True`` on ``page`` level, then delete any SeparatorRegions. @@ -272,12 +272,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If ``overwrite_order=True`` on ``page`` or ``table`` level, then delete the reading order OrderedGroup entry corresponding to the (page/table) segment. - + Next, get each element image according to the layout annotation (from the alternative image of the page/region, or by cropping via coordinates into the higher-level image) in binarized form, and represent it as an array with non-text regions and (remaining) text neighbours suppressed. - + Then compute a text line segmentation for that array (as a label mask). When ``level-of-operation`` is ``page`` or ``table``, this also entails detecting @@ -286,25 +286,25 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - up to ``maxcolseps`` background column separators before text line segmentation itself, as well as aggregating text lines to text regions afterwards. - + Text regions are detected via a hybrid variant recursive X-Y cut algorithm (RXYC): RXYC partitions the binarized image in top-down manner by detecting horizontal or vertical gaps. This implementation uses the bottom-up text line segmentation to guide the search, and also uses both pre-existing and newly detected separators to alternatively partition the respective boxes into non-rectangular parts. 
- + During line segmentation, suppress the foreground of all previously annotated regions (of any kind) and lines, except if just removed due to ``overwrite``. During region aggregation however, combine the existing separators with the new-found separators to guide the column search. - + All detected segments (both text line and text region) are sorted according to their reading order (assuming a top-to-bottom, left-to-right ordering). When ``level-of-operation`` is ``page``, prefer vertical (column-first) succession of regions. When it is ``table``, prefer horizontal (row-first) succession of cells. - + Then for each resulting segment label, convert its background mask into polygon outlines by finding the outer contours consistent with the element's polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: @@ -316,7 +316,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If it is ``page``, then append the new lines to their respective regions, and append the new regions to the page. (Also, create an OrderedGroup for it in the ReadingOrder.) - + Produce a new output file by serialising the resulting hierarchy. """ # FIXME: allow passing a-priori info on reading order / textline order @@ -330,6 +330,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional oplevel = self.parameter['level-of-operation'] pcgts = input_pcgts[0] + result = OcrdPageResult(pcgts) page = pcgts.get_Page() # TODO: also allow grayscale_normalized (try/except?) 
@@ -361,14 +362,15 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional rogroup = ro.get_OrderedGroup() or ro.get_UnorderedGroup() if rogroup: page_get_reading_order(reading_order, rogroup) - # get segments to process / overwrite + + # get segments to process / overwrite if oplevel == 'page': ignore.extend(page.get_TableRegion()) regions = list(page.get_TextRegion()) if regions: # page is already region-segmented if overwrite_regions: - self.logger.info(f'Removing existing TextRegions in page "{page_id}"') + self.logger.info('removing existing TextRegions in page "%s"', page_id) # we could remove all other region types as well, # but this is more flexible (for workflows with # specialized separator/image/table detectors): @@ -376,7 +378,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page.set_ReadingOrder(None) ro = None else: - self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"') + self.logger.warning('keeping existing TextRegions in page "%s"', page_id) ignore.extend(regions) # create reading order if necessary if not ro or overwrite_order: @@ -387,32 +389,36 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # new top-level group rogroup = OrderedGroupType(id="reading-order") ro.set_OrderedGroup(rogroup) - # go get TextRegions with TextLines (and SeparatorRegions): - self._process_element( - page, ignore, page_image, page_coords, page_id, file_id, page_id, zoom, rogroup=rogroup) if (not rogroup.get_RegionRefIndexed() and - not rogroup.get_OrderedGroupIndexed() and - not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup + not rogroup.get_OrderedGroupIndexed() and + not rogroup.get_UnorderedGroupIndexed()): + # schema forbids empty OrderedGroup ro.set_OrderedGroup(None) - elif oplevel == 'table': + # go get TextRegions with TextLines (and SeparatorRegions): + image = self._process_element(page, ignore, page_image, 
page_coords, + zoom=zoom, rogroup=rogroup) + if image: + result.images.append(image) + return result + + if oplevel == 'table': ignore.extend(page.get_TextRegion()) regions = list(page.get_TableRegion()) if not regions: - self.logger.warning(f'Page "{page_id}" contains no table regions') + self.logger.warning('Page "%s" contains no table regions', page_id) for region in regions: subregions = region.get_TextRegion() if subregions: # table is already cell-segmented if overwrite_regions: - self.logger.info(f'Removing existing TextRegions in table "{region.id}"') + self.logger.info('removing existing TextRegions in table "%s"', region.id) region.set_TextRegion([]) roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: - self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') + self.logger.warning('skipping table "%s" with existing TextRegions', region.id) continue # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -423,19 +429,19 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional # create reading order group if necessary roelem = reading_order.get(region.id) if not roelem: - self.logger.warning(f"Page '{page_id}' table region '{region.id}' is not referenced in reading " - f"order (no target to add cells to)") + self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", + page_id, region.id, "no target to add cells to") elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an ordered " - f"group (cells will be appended)") + self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", + page_id, region.id, "cells will be appended") elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - self.logger.warning(f"Page '{page_id}' table region '{region.id}' already has an unordered " - f"group (cells will not be appended)") + self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", + page_id, region.id, "cells will not be appended") roelem = None else: # replace regionRef(Indexed) by group with same index and ref @@ -443,10 +449,11 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) - self._process_element( - region, subignore, region_image, region_coords, region.id, file_id + '_' + region.id, - page_id, zoom, rogroup=roelem) - else: # 'region' + image = self._process_element(region, 
subignore, region_image, region_coords, + zoom=zoom, rogroup=roelem) + if image: + result.images.append(image) + else: # 'region' regions = list(page.get_TextRegion()) # besides top-level text regions, line-segment any table cells, # and for tables without any cells, add a pseudo-cell @@ -455,11 +462,10 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if subregions: regions.extend(subregions) else: - subregion = TextRegionType( - id=region.id + '_text', - Coords=region.get_Coords(), - # as if generated from parser: - parent_object_=region) + subregion = TextRegionType(id=region.id + '_text', + Coords=region.get_Coords(), + # as if generated from parser: + parent_object_=region) region.add_TextRegion(subregion) regions.append(subregion) if not regions: @@ -467,10 +473,10 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional for region in regions: if region.get_TextLine(): if overwrite_lines: - self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') + self.logger.info('removing existing TextLines in page "%s" region "%s"', page_id, region.id) region.set_TextLine([]) else: - self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') + self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) ignore.extend(region.get_TextLine()) # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -480,11 +486,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if 'clipped' in region_coords['features'].split(','): ignore = [] # go get TextLines - self._process_element( - region, ignore, region_image, region_coords, region.id, file_id + '_' + region.id, page_id, zoom) - return OcrdPageResult(pcgts) + image = self._process_element(region, ignore, region_image, region_coords, zoom=zoom) + if image: + result.images.append(image) - def _process_element(self, element, ignore, image, coords, element_id, file_id, page_id, zoom=1.0, rogroup=None): + return result + + def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=None) -> Optional[OcrdPageResultImage]: """Add PAGE layout elements by segmenting an image. Given a PageType, TableRegionType or TextRegionType ``element``, and @@ -503,14 +511,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, newly detected separators to guide region segmentation. """ if not image.width or not image.height: - self.logger.warning(f"Skipping '{element_id}' with zero size") - return + self.logger.warning(f"Skipping '{element.id}' with zero size") + return None element_array = pil2array(image) element_bin = np.array(element_array <= midrange(element_array), bool) sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - self.logger.debug(f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element_id}"') + self.logger.debug(f'masking foreground of {type(segment).__name__[:-4]} ' + f'"{segment.id}" for "{element.id}"') # mark these segments (e.g. 
separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -522,14 +531,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # then this will silently ignore them. The caller does # not need to concern herself with this. sp_row = segment_polygon[:, 1] - sp_column = segment_polygon[:, 0] + sp_col = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): - sep_bin[draw.polygon(sp_row, sp_column, sep_bin.shape)] = True - ignore_labels[draw.polygon(sp_row, sp_column, ignore_labels.shape)] = i + 1 # mapped back for RO + sep_bin[draw.polygon(sp_row, sp_col, sep_bin.shape)] = True + ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i+1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True report = check_page(element_bin, zoom) + suffix = '.IMG-CLIP' elif isinstance(element, TableRegionType) or ( # sole/congruent text region of a table region? element.id.endswith('_text') and @@ -537,11 +547,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, element_name = 'table' fullpage = True report = check_region(element_bin, zoom) + suffix = element.id + '.IMG-CLIP' else: element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - self.logger.info(f'Computing line segmentation for {element_name} "{element_id}"') + suffix = element.id + '.IMG-CLIP' + self.logger.info(f'computing line segmentation for {element_name} "{element.id}"') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -551,7 +563,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # but keep them for h/v-line detection (in fullpage mode): element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), 
# in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -559,13 +571,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, csminheight=self.parameter['csminheight']) except Exception as err: if isinstance(element, TextRegionType): - self.logger.error(f'Cannot line-segment region "{element_id}": {err}') + self.logger.error(f'Cannot line-segment region "{element.id}": {err}') # as a fallback, add a single text line comprising the whole region: - element.add_TextLine(TextLineType(id=element_id + "_line", Coords=element.get_Coords())) + element.add_TextLine(TextLineType(id=element.id + "_line", Coords=element.get_Coords())) else: - self.logger.error(f'Cannot line-segment {element_name} "{element_id}": {err}') - return - self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element_id}"') + self.logger.error(f'Cannot line-segment {element_name} "{element.id}": {err}') + return None + + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines ' + f'for {element_name} "{element.id}"') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -580,17 +594,18 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - self.logger.info( - f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element_id}"') + self.logger.info(f'Found {len(np.unique(region_labels)) - 1} text regions ' + f'for {element_name} 
"{element.id}"') except Exception as err: - self.logger.error(f'Cannot region-segment {element_name} "{element_id}": {err}') + self.logger.error(f'Cannot region-segment {element_name} "{element.id}": {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) + # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): @@ -607,7 +622,7 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, region_no = 0 for region_label in np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -616,12 +631,12 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - (f"Region label {region_label} has both existing regions and new lines " - f"({str(region_line_labels0)})") + (f'region label "{region_label}" has both existing regions and new lines ' + f'({str(region_line_labels0)})') region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): index = page_add_to_reading_order(rogroup, region.id, index) - self.logger.debug(f'Region label {region_label} is for ignored region "{region.id}"') + self.logger.debug(f'Region label "{region_label}" is for ignored region "{region.id}"') continue # normal case: new lines inside new regions # remove binary-empty labels, and re-order locally @@ -629,18 +644,18 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - 
region_line_labels = hmerge_line_seeds( - element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, + seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons( - self.logger, region_mask * region_label, None, element_bin, - name=f'{element_name} "{element_id}"', min_area=6000 / zoom / zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, + name=f'{element_name} "{element.id}"', + min_area=6000 / zoom / zoom, + simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons( - self.logger, region_line_labels, baselines, element_bin, - name=f'region "{element_id}"', min_area=640 / zoom / zoom) + lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, + name=f'region "{element.id}"', + min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -653,13 +668,13 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue # annotate result: region_no += 1 - region_id = element_id + "_region%04d" % region_no + region_id = element.id + "_region%04d" % region_no self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) 
coordinates: @@ -681,16 +696,16 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, # if the region has received text lines, keep it if region.get_TextLine(): element.add_TextRegion(region) - self.logger.info( - f'Added region "{region_id}" with {line_no} lines for {element_name} "{element_id}"') + self.logger.info(f'Added region "{region_id}" with {line_no} lines ' + f'for {element_name} "{element.id}"') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element_id}"') + self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element.id}"') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons( - self.logger, images, None, element_bin, f'{element_name} "{element_id}"') + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, + name=f'{element_name} "{element.id}"') for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -700,15 +715,15 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue region_no += 1 # annotate result: - region_id = element_id + "_image%04d" % region_no + region_id = element.id + "_image%04d" % region_no element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element_id}"') + self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element.id}"') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons( - self.logger, seplines, 
None, element_bin, - name=f'{element_name} "{element_id}"', open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, + name=f'{element_name} "{element.id}"', + open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -718,27 +733,28 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue # annotate result: region_no += 1 - region_id = element_id + "_sep%04d" % region_no + region_id = element.id + "_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) - file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) - element.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=coords['features'] + ',clipped')) + image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') + element.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_clipped, suffix, image_ref) else: - # get mask from region polygon: + # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 1], + region_polygon[:, 0], + region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons( - self.logger, line_labels, baselines, element_bin, - name=f'region 
"{element_id}"', min_area=640 / zoom / zoom) + line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, + name=f'region "{element.id}"', + min_area=640 / zoom / zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: @@ -749,22 +765,20 @@ def _process_element(self, element, ignore, image, coords, element_id, file_id, continue # annotate result: line_no += 1 - line_id = element_id + "_line%04d" % line_no + line_id = element.id + "_line%04d" % line_no line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return # no derived image + return None # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) - file_path = self.workspace.save_image_file( - image_clipped, file_id + '.IMG-CLIP', self.output_file_grp, page_id=page_id) - # update PAGE (reference the image file): - element.add_AlternativeImage(AlternativeImageType( - filename=file_path, comments=coords['features'] + ',clipped')) + image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') + element.add_AlternativeImage(image_ref) + return OcrdPageResultImage(image_clipped, suffix, image_ref) def polygon_for_parent(polygon, parent): """Clip polygon to parent polygon range. 
From 83ba2f01e0cb210fa7777c7fc4f9ddc3233be633 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:54:14 +0200 Subject: [PATCH 128/194] CI: try testing in parallel --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 35f0a966..d5e18b9f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -10,7 +10,7 @@ jobs: - checkout - run: apt-get update && apt-get -y install default-jre-headless - run: make install - - run: make test V="" + - run: make -j test V="" deploy-docker: docker: From a2100c29c0f4f85803e0faa2dde9bdf84299c589 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 12:55:16 +0200 Subject: [PATCH 129/194] Updated config.yml --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index d5e18b9f..5825a4e0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -3,6 +3,7 @@ jobs: test-python3: docker: - image: ocrd/core + resource_class: large environment: PIP: pip3 PYTHON: python3 From df1c35cbe1325a8da5dabd2c9227a7246439fd15 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:42:57 +0200 Subject: [PATCH 130/194] train: adapt to final v3 API --- ocrd_cis/ocropy/train.py | 129 +++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 65 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 08b68693..5c57b2cf 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -1,12 +1,15 @@ from __future__ import absolute_import + +from typing import Optional from logging import Logger from sys import exit from os import getcwd, makedirs, remove from os.path import abspath, dirname, exists, join, isfile import tempfile -from ocrd_modelfactory import 
page_from_file -from ocrd import Processor +from ocrd_models import OcrdPage +from ocrd import Processor, Workspace +from ocrd.processor import OcrdPageResult from ocrd_utils import getLogger from .ocropus_rtrain import * @@ -37,80 +40,79 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyTrain') - self.old_cwd = getcwd() - #print(self.parameter) if 'model' in self.parameter: model = self.parameter['model'] try: - modelpath = self.resolve_resource(model) + self.modelpath = self.resolve_resource(model) except SystemExit: ocropydir = dirname(abspath(__file__)) - modelpath = join(ocropydir, 'models', model) - self.logger.info("Failed to resolve model '%s' path, trying '%s'", model, modelpath) + self.modelpath = join(ocropydir, 'models', model) + self.logger.error(f"Failed to resolve model '{model}' path, trying '{modelpath}'") if not isfile(modelpath): - self.logger.error("Could not find model '%s'. Try 'ocrd resmgr download ocrd-cis-ocropy-recognize %s'", - model, model) + self.logger.critical(f"Could not find model '{model}'.\n" + f"Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {model}'") exit(1) - outputpath = join(self.old_cwd, 'output', model) - if 'outputpath' in self.parameter: - outputpath = join(self.parameter, model) + self.outputpath = join(self.parameter.get('outputpath', 'output'), model) else: - modelpath = None - outputpath = join(self.old_cwd, 'output', 'lstm') - if 'outputpath' in self.parameter: - outputpath = join(self.parameter, 'lstm') - makedirs(dirname(outputpath)) - self.modelpath = modelpath - self.outputpath = outputpath - - def process(self): + self.modelpath = None + self.outputpath = join(self.parameter.get('outputpath', 'output'), 'lstm') + makedirs(dirname(self.outputpath)) + self.filelist = None + + def process_workspace(self, workspace: Workspace) -> None: """ Trains a new model on the text lines from the input fileGrp, - extracted as temporary image-text file pairs. 
+ extracted as image-text file pairs into the output fileGrp. + (If the output fileGrp already exists and these files should + be re-used, pass the `--overwrite` option when processing.) + + The model is written into `outputpath` (or just `output`) under + the same name as `model` (i.e. the start model, or just `lstm`). + """ + self.filelist = [] + super().process_workspace(workspace) + self.logger.info(f"Training {self.outputpath} from {self.modelpath or 'scratch'} " + f"on {len(self.filelist)} file pairs") + rtrain(self.filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) + # deletefiles(self.filelist) + + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + """ + Extracts pairs of plaintext and cropped image files for each text line + in the PAGE file (to be used during training). """ - filelist = [] - filepath = tempfile.mkdtemp(prefix='ocrd-cis-ocropy-train-') + pcgts = input_pcgts[0] #self.logger.info("Using model %s in %s for recognition", model) - for (n, input_file) in enumerate(self.input_files): - #self.logger.info("INPUT FILE %i / %s", n, input_file) - pcgts = page_from_file(self.workspace.download_file(input_file)) - page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id) - page = pcgts.get_Page() - page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) - - self.logger.info("Extracting from page '%s'", page_id) - for region in page.get_AllRegions(classes=['Text']): - textlines = region.get_TextLine() - self.logger.info("Extracting %i lines from region '%s'", len(textlines), region.id) - for line in textlines: - if self.parameter['textequiv_level'] == 'line': - path = join(filepath, page_id + region.id + line.id) - imgpath = self.extract_segment(path, line, page_image, page_coords) - if imgpath: - filelist.append(imgpath) + page = pcgts.get_Page() + page_image, page_coords, _ = self.workspace.image_from_page(page, page_id) + + 
self.logger.debug(f"Extracting from page '{page_id}'") + for region in page.get_AllRegions(classes=['Text']): + textlines = region.get_TextLine() + self.logger.debug(f"Extracting {len(textlines)} lines from region '{region.id}'") + for line in textlines: + if self.parameter['textequiv_level'] == 'line': + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}") + self.filelist.append(self.extract_segment(path, line, page_image, page_coords)) + continue + for word in line.get_Word(): + if self.parameter['textequiv_level'] == 'word': + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}_{word.id}") + self.filelist.append(self.extract_segment(path, word, page_image, page_coords)) continue - for word in line.get_Word(): - if self.parameter['textequiv_level'] == 'word': - path = join(filepath, page_id + region.id + line.id + word.id) - imgpath = self.extract_segment(path, word, page_image, page_coords) - if imgpath: - filelist.append(imgpath) - continue - for glyph in word.get_Glyph(): - path = join(filepath, page_id + region.id + line.id + glyph.id) - imgpath = self.extract_segment(path, glyph, page_image, page_coords) - if imgpath: - filelist.append(imgpath) - - self.logger.info("Training %s from %s on %i file pairs", - self.outputpath, - self.modelpath or 'scratch', - len(filelist)) - rtrain(filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) - deletefiles(filelist) + for glyph in word.get_Glyph(): + path = join(self.output_file_grp, f"{page_id}_{region.id}_{line.id}_{word.id}_{glyph.id}") + self.filelist.append(self.extract_segment(path, glyph, page_image, page_coords)) + # FIXME: PAGE-XML not really needed, find a way around this (raising special exception?) 
+ return OcrdPageResult(pcgts) def extract_segment(self, path, segment, page_image, page_coords): - #ground truth + gtpath = path + '.gt.txt' + imgpath = path + '.png' + if exists(gtpath) and exists(imgpath): + self.logger.debug(f"Reusing {segment.__class__.__name__} '{segment.id}' file pair") + return imgpath + gt = segment.TextEquiv if not gt: return None @@ -118,11 +120,10 @@ def extract_segment(self, path, segment, page_image, page_coords): if not gt or not gt.strip(): return None gt = gt.strip() - gtpath = path + '.gt.txt' with open(gtpath, "w", encoding='utf-8') as f: f.write(gt) - self.logger.debug("Extracting %s '%s'", segment.__class__.__name__, segment.id) + self.logger.debug(f"Extracting {segment.__class__.__name__} '{segment.id}' file pair") image, coords = self.workspace.image_from_segment(segment, page_image, page_coords) if 'binarized' not in coords['features'].split(','): @@ -132,8 +133,6 @@ def extract_segment(self, path, segment, page_image, page_coords): # resize image to 48 pixel height image = resize_keep_ratio(image) - #save temp image - imgpath = path + '.png' image.save(imgpath) return imgpath From c08b623f9b0ad9daf4f8dc858b5b416b1212e018 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:51:54 +0200 Subject: [PATCH 131/194] ocrd-tool.json: add v3 cardinalities --- ocrd_cis/ocrd-tool.json | 120 +++++++++++----------------------------- 1 file changed, 31 insertions(+), 89 deletions(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index a93917da..c2e20268 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -12,17 +12,9 @@ "preprocessing/optimization/grayscale_normalization", "preprocessing/optimization/deskewing" ], - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-IMG-BIN", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Binarize (and optionally deskew/despeckle) 
pages / regions / lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Binarize (and optionally deskew/despeckle) pages / regions / lines with Ocropy v1", "parameters": { "method": { "type": "string", @@ -75,15 +67,9 @@ "steps": [ "preprocessing/optimization/deskewing" ], - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Deskew regions with ocropy (by annotating orientation angle and adding AlternativeImage)", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Deskew regions with Ocropy v1 (by annotating orientation angle and adding AlternativeImage)", "parameters": { "maxskew": { "type": "number", @@ -106,17 +92,9 @@ "steps": [ "preprocessing/optimization/despeckling" ], - "input_file_grp": [ - "OCR-D-IMG", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-IMG-DESPECK", - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "description": "Despeckle pages / regions / lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Despeckle pages / regions / lines with Ocropy v1", "parameters": { "noise_maxsize": { "type": "number", @@ -147,14 +125,8 @@ "layout/segmentation/region", "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-BLOCK", - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "description": "Clip text regions / lines at intersections with neighbours", "parameters": { "level-of-operation": { @@ -185,12 +157,8 @@ "steps": [ "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "description": "Improve coordinates of text lines", 
"parameters": { "level-of-operation": { @@ -245,12 +213,8 @@ "preprocessing/optimization/dewarping" ], "description": "Dewarp line images with ocropy", - "input_file_grp": [ - "OCR-D-SEG-LINE" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "dpi": { "type": "number", @@ -286,15 +250,9 @@ "steps": [ "recognition/text-recognition" ], - "description": "Recognize text in (binarized+deskewed+dewarped) lines with ocropy", - "input_file_grp": [ - "OCR-D-SEG-LINE", - "OCR-D-SEG-WORD", - "OCR-D-SEG-GLYPH" - ], - "output_file_grp": [ - "OCR-D-OCR-OCRO" - ], + "description": "Recognize text in (binarized+deskewed+dewarped) lines with Ocropy v1", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "textequiv_level": { "type": "string", @@ -345,14 +303,9 @@ "layout/segmentation/region", "layout/segmentation/line" ], - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], - "output_file_grp": [ - "OCR-D-SEG-LINE" - ], - "description": "Segment pages into regions and lines, tables into cells and lines, or regions into lines with ocropy", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "Segment pages into regions and lines, tables into cells and lines, or regions into lines with Ocropy v1", "parameters": { "dpi": { "type": "number", @@ -444,11 +397,9 @@ "steps": [ "recognition/text-recognition" ], - "input_file_grp": [ - "OCR-D-GT-SEG-BLOCK", - "OCR-D-SEG-BLOCK" - ], - "description": "train model with ground truth from mets data", + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, + "description": "train Ocropy v1 text recognition model with PAGE ground truth from the input fileGrp extracted as file pairs into the output fileGrp", "parameters": { "textequiv_level": { "type": "string", @@ -470,7 +421,8 @@ }, "outputpath": { "type": "string", - "description": "(existing) path for the 
trained model" + "default": "output", + "description": "directory path for the trained model" } } }, @@ -482,15 +434,9 @@ "steps": [ "recognition/post-correction" ], - "input_file_grp": [ - "OCR-D-OCR-1", - "OCR-D-OCR-2", - "OCR-D-OCR-N" - ], - "output_file_grp": [ - "OCR-D-ALIGNED" - ], - "description": "Align multiple OCRs and/or GTs" + "input_file_grp_cardinality": [2, -1], + "output_file_grp_cardinality": 1, + "description": "Align multiple OCRs and/or GTs textually on line/word level" }, "ocrd-cis-postcorrect": { "executable": "ocrd-cis-postcorrect", @@ -501,12 +447,8 @@ "recognition/post-correction" ], "description": "Post correct OCR results", - "input_file_grp": [ - "OCR-D-LINE-ALIGNED" - ], - "output_file_grp": [ - "OCR-D-POST-CORRECTED" - ], + "input_file_grp_cardinality": 1, + "output_file_grp_cardinality": 1, "parameters": { "maxCandidates": { "description": "Maximum number of considered correction candidates per suspicious token", From a18307d4a8f50b0a4b081016c9d9db55cca63023 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 15:27:09 +0200 Subject: [PATCH 132/194] fix: ocropy train errors --- ocrd_cis/ocropy/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 5c57b2cf..f5d70d6a 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -47,8 +47,8 @@ def setup(self): except SystemExit: ocropydir = dirname(abspath(__file__)) self.modelpath = join(ocropydir, 'models', model) - self.logger.error(f"Failed to resolve model '{model}' path, trying '{modelpath}'") - if not isfile(modelpath): + self.logger.error(f"Failed to resolve model '{model}' path, trying '{self.modelpath}'") + if not isfile(self.modelpath): self.logger.critical(f"Could not find model '{model}'.\n" f"Try 'ocrd resmgr download ocrd-cis-ocropy-recognize {model}'") exit(1) @@ -128,7 +128,7 @@ def extract_segment(self, path, segment, page_image, page_coords): if 'binarized' 
not in coords['features'].split(','): # binarize with nlbin - image, _ = binarize(image, maxskew=0) + image, _ = binarize(self.logger, image, maxskew=0) # resize image to 48 pixel height image = resize_keep_ratio(image) From 0ba6839c849688431fa2259da4cd934963724cfb Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 15:39:09 +0200 Subject: [PATCH 133/194] remove: unused imports --- ocrd_cis/ocropy/binarize.py | 6 +----- ocrd_cis/ocropy/clip.py | 14 ++++++-------- ocrd_cis/ocropy/denoise.py | 10 ++-------- ocrd_cis/ocropy/deskew.py | 8 +------- ocrd_cis/ocropy/dewarp.py | 12 +++--------- ocrd_cis/ocropy/recognize.py | 12 ++---------- ocrd_cis/ocropy/resegment.py | 1 - ocrd_cis/ocropy/segment.py | 1 - ocrd_cis/ocropy/train.py | 9 +++++---- 9 files changed, 20 insertions(+), 53 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index ac499336..271f01fa 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -1,14 +1,10 @@ from __future__ import absolute_import from logging import Logger +from typing import Optional import cv2 import numpy as np from PIL import Image -from os.path import abspath, dirname, join - -from typing import Union, Optional - -#import kraken.binarization from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 3ddd6a70..36ee4eb3 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -2,7 +2,6 @@ from logging import Logger from typing import Optional -from os.path import join import numpy as np from PIL import Image, ImageStat, ImageOps from shapely.geometry import Polygon @@ -12,19 +11,18 @@ from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( - getLogger, - coordinates_of_segment, - polygon_from_points, bbox_from_polygon, + coordinates_of_segment, + crop_image, + getLogger, 
image_from_polygon, + polygon_from_points, polygon_mask, - crop_image, ) +from .common import array2pil, determine_zoom, pil2array from .ocrolib import midrange, morph -from .common import ( - # binarize, - array2pil, determine_zoom, pil2array) + class OcropyClip(Processor): logger: Logger diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 0f368fd5..72757e0c 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -1,19 +1,13 @@ from __future__ import absolute_import - from typing import Optional from logging import Logger -from os.path import join from ocrd_utils import getLogger -from ocrd_models.ocrd_page import ( - AlternativeImageType, OcrdPage -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage -from .common import ( - # binarize, - determine_zoom, remove_noise) +from .common import determine_zoom, remove_noise class OcropyDenoise(Processor): logger: Logger diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index fae0c90c..9f9f8b0a 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -1,15 +1,9 @@ from __future__ import absolute_import - from typing import Optional from logging import Logger -from os.path import join from ocrd_utils import getLogger -from ocrd_models.ocrd_page import ( - PageType, - AlternativeImageType, - OcrdPage -) +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, PageType from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index a063a05e..9902af95 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -1,18 +1,12 @@ from __future__ import absolute_import - -from typing import Optional from logging import Logger -from os.path import join - +from typing import Optional import numpy as np -from ocrd_utils import 
getLogger -from ocrd_models.ocrd_page import ( - AlternativeImageType, - OcrdPage -) from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd_utils import getLogger +from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from .ocrolib import lineest from .common import array2pil, check_line, determine_zoom, pil2array diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 97fcc64d..41576e43 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -10,16 +10,8 @@ from rapidfuzz.distance import Levenshtein -from ocrd_utils import ( - getLogger, - coordinates_for_segment, - polygon_from_bbox, - points_from_polygon, -) -from ocrd_models.ocrd_page import ( - TextEquivType, OcrdPage, - CoordsType, GlyphType, WordType -) +from ocrd_utils import coordinates_for_segment, getLogger, points_from_polygon, polygon_from_bbox +from ocrd_models.ocrd_page import CoordsType, GlyphType, OcrdPage, TextEquivType, WordType from ocrd import Processor from ocrd.processor import OcrdPageResult diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 05f17d4f..0ef64687 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -2,7 +2,6 @@ from typing import Optional from logging import Logger -from os.path import join import numpy as np from skimage import draw, segmentation diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index bdeb40dd..edb5751a 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -2,7 +2,6 @@ from typing import Optional from logging import Logger -from os.path import join import itertools import numpy as np diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index f5d70d6a..8f224b86 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -3,9 +3,8 @@ from typing import Optional from logging import Logger from sys import exit -from os import getcwd, makedirs, 
remove +from os import makedirs, remove from os.path import abspath, dirname, exists, join, isfile -import tempfile from ocrd_models import OcrdPage from ocrd import Processor, Workspace @@ -32,7 +31,9 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): logger: Logger + modelpath: str old_cwd: str + outputpath: str @property def executable(self): @@ -75,8 +76,8 @@ def process_workspace(self, workspace: Workspace) -> None: f"on {len(self.filelist)} file pairs") rtrain(self.filelist, self.modelpath, self.outputpath, self.parameter['ntrain']) # deletefiles(self.filelist) - - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """ Extracts pairs of plaintext and cropped image files for each text line in the PAGE file (to be used during training). From 6b06e8856addd3b4963961df6d6cb1fb29e126cf Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 15:48:42 +0200 Subject: [PATCH 134/194] Update binarize.py --- ocrd_cis/ocropy/binarize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 3e87cf8a..e82dbc16 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -98,6 +98,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + result = OcrdPageResult(pcgts) if level == 'page': try: result.images.append(self.process_page(page, page_image, page_xywh, zoom, page_id)) @@ -256,4 +257,4 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> # update PAGE (reference the image file): alt_image = 
AlternativeImageType(comments=features) line.add_AlternativeImage(alt_image) - return OcrdPageResultImage(bin_image, suffix, alt_image) \ No newline at end of file + return OcrdPageResultImage(bin_image, suffix, alt_image) From d1a14b704c0d2559685b8f33ddd23d60c65563a7 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:22:42 +0200 Subject: [PATCH 135/194] refactor: python strings v3 --- ocrd_cis/ocropy/binarize.py | 6 +-- ocrd_cis/ocropy/clip.py | 5 +-- ocrd_cis/ocropy/denoise.py | 8 ++-- ocrd_cis/ocropy/deskew.py | 7 ++-- ocrd_cis/ocropy/dewarp.py | 11 +++--- ocrd_cis/ocropy/recognize.py | 6 +-- ocrd_cis/ocropy/resegment.py | 72 +++++++++++++++------------------- ocrd_cis/ocropy/segment.py | 76 ++++++++++++++++++------------------ 8 files changed, 88 insertions(+), 103 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index e82dbc16..782dd578 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -16,7 +16,7 @@ def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0.5, nrm=False, zoom=1.0): - logger.debug('binarizing %dx%d image with method=%s', pil_image.width, pil_image.height, method) + logger.debug(f'Binarizing {pil_image.width}x{pil_image.height} image with method={method}') if method == 'none': # useful if the images are already binary, # but lack image attribute `binarized` @@ -242,8 +242,8 @@ def process_line(self, line, line_image, line_xywh, zoom, page_id, region_id) -> #orientation = -angle #orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] #line.set_orientation(orientation) # does not exist on line level! 
- self.logger.warning(f"cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", - -angle) + self.logger.warning( + f"Cannot add orientation %.2f to page '{page_id}' region '{region_id}' line '{line.id}'", -angle) bin_image = remove_noise(bin_image, maxsize=self.parameter['noise_maxsize']) if self.parameter['noise_maxsize']: features += ',despeckled' diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 36ee4eb3..7f40a214 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -128,15 +128,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = No if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). - self.logger.warning( - f'Page "{page_id}" region "{region.id}" already contains image data: skipping') + self.logger.warning(f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue shape = prep(shapes[i]) neighbours = [(regionj, maskj) for shapej, regionj, maskj in zip(shapes[:i] + shapes[i+1:], regions[:i] + regions[i+1:], masks[:i] + masks[i+1:]) if shape.intersects(shapej)] if neighbours: - segment_region_file_id = f"{output_file_id}_{region.id}" ret.images.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, page_image, page_xywh, page_bin, page_id)) @@ -167,7 +165,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = No in zip(shapes[:j] + shapes[j+1:], lines[:j] + lines[j+1:], masks[:j] + masks[j+1:]) if shape.intersects(shapej)] if neighbours: - segment_line_file_id = f"{output_file_id}_{region.id}_{line.id}" ret.images.append(self.process_segment( line, masks[j], polygons[j], neighbours, background_image, region_image, region_coords, region_bin, page_id)) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 72757e0c..b3c219fb 100644 --- a/ocrd_cis/ocropy/denoise.py +++ 
b/ocrd_cis/ocropy/denoise.py @@ -57,7 +57,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option else: regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh, @@ -69,7 +69,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option continue lines = region.get_TextLine() if not lines: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, @@ -80,9 +80,9 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: - self.logger.warning("Skipping '%s' with zero size", file_id) + self.logger.warning(f"Skipping '{segment.id}' with zero size") return None - self.logger.info("About to despeckle '%s'", file_id) + self.logger.info(f"About to despeckle '{segment.id}'") bin_image = remove_noise(segment_image, maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt # update PAGE (reference the image file): diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 9f9f8b0a..84475d81 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -73,8 +73,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option # (we will overwrite @orientation anyway,) # abort if no such image can be produced: feature_filter='deskewed') - image = self._process_segment(region, region_image, 
region_coords, - "region '%s'" % region.id, page_id) + image = self._process_segment(region, region_image, region_coords, f"region '{region.id}'", page_id) if image: result.images.append(image) return result @@ -84,14 +83,14 @@ def _process_segment(self, segment, segment_image, segment_coords, segment_id, p self.logger.warning("Skipping %s with zero size", segment_id) return None angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image - self.logger.info("About to deskew %s", segment_id) + self.logger.info(f"About to deskew {segment_id}") angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] segment.set_orientation(orientation) # also removes all deskewed AlternativeImages - self.logger.info("Found angle for %s: %.1f", segment_id, angle) + self.logger.info(f"Found angle for {segment_id}: %.1f", angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 9902af95..302cf2e0 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -101,29 +101,28 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: region_image, region_xywh = self.workspace.image_from_segment( region, page_image, page_xywh) lines = region.get_TextLine() if not lines: - self.logger.warning('Region %s contains no text lines', region.id) + 
self.logger.warning(f'Region {region.id} contains no text lines') for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh) - self.logger.info("About to dewarp page '%s' region '%s' line '%s'", - page_id, region.id, line.id) + self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: dew_image = dewarp(line_image, self.lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) except InvalidLine as err: - self.logger.error('cannot dewarp line "%s": %s', line.id, err) + self.logger.error(f'Cannot dewarp line "{line.id}": {err}') continue except InadequateLine as err: - self.logger.warning('cannot dewarp line "%s": %s', line.id, err) + self.logger.warning(f'cannot dewarp line "{line.id}": {err}') # as a fallback, simply pad the image vertically # (just as dewarping would do on average, so at least # this line has similar margins as the others): diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 41576e43..f0c4b520 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -179,13 +179,13 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): linegt = line.TextEquiv[0].Unicode else: linegt = '' - self.logger.debug("GT '%s': '%s'", line.id, linegt) + self.logger.debug(f"GT '{line.id}': '{linegt}'") # remove existing annotation below line level: line.set_TextEquiv([]) line.set_Word([]) if line_image.size[1] < 16: - self.logger.debug(f"ERROR: bounding box is too narrow at line {line.id}") + self.logger.debug(f"Error: bounding box is too narrow at line {line.id}") continue # resize image to 48 pixel height final_img, scale = resize_keep_ratio(line_image) @@ -194,7 +194,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): try: linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug(f'error 
processing line "{line.id}": {err}') + self.logger.debug(f'Error processing line "{line.id}": {err}') continue self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 0ef64687..d429c1de 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -126,14 +126,14 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option page.get_CustomRegion()) regions = page.get_AllRegions(classes=['Text']) if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') elif level == 'page': lines = [line for region in regions for line in region.get_TextLine()] if lines: self._process_segment(page, page_image, page_coords, page_id, zoom, lines, ignore) else: - self.logger.warning('Page "%s" contains no text regions with lines', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions with lines', ) else: for region in regions: lines = region.get_TextLine() @@ -142,7 +142,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option region, page_image, page_coords, feature_selector='binarized') self._process_segment(region, region_image, region_coords, page_id, zoom, lines, ignore) else: - self.logger.warning('Page "%s" region "%s" contains no text lines', page_id, region.id) + self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') return OcrdPageResult(pcgts) def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): @@ -163,8 +163,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l fullpage = False report = check_region(parent_bin, zoom) if report: - self.logger.warning('Invalid %s "%s": %s', tag, - page_id if fullpage else parent.id, report) + self.logger.warning(f'Invalid {tag} "{page_id 
if fullpage else parent.id}": {report}') return # get existing line labels: line_labels = np.zeros_like(parent_bin, bool) @@ -191,8 +190,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): - self.logger.debug('unmasking area of text region "%s" for "%s"', - region.id, page_id if fullpage else parent.id) + self.logger.debug(f'Unmasking area of text region "{region.id}" for "{page_id if fullpage else parent.id}"') region_polygon = coordinates_of_segment(region, parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) region_polygon = np.array(region_polygon.exterior.coords, int)[:-1] @@ -201,14 +199,14 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): - self.logger.debug('masking area of %s "%s" for "%s"', type(segment).__name__[:-4], - segment.id, page_id if fullpage else parent.id) + self.logger.debug(f'Masking area of {type(segment).__name__[:-4]} "{segment.id}" for ' + f'"{page_id if fullpage else parent.id}"') segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': - self.logger.debug('calculating connected component and distance transforms for "%s"', parent.id) + self.logger.debug(f'Calculating connected component and distance transforms for "{parent.id}"') bin = parent_bin & ~ ignore_bin components, _ = morph.label(bin) # estimate glyph scale (roughly) @@ -217,7 +215,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l counts = np.sqrt(3 * counts) scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) components *= (counts > 
15/zoom)[components] - self.logger.debug("estimated scale: %d", scale) + self.logger.debug(f"Estimated scale: {scale}") else: scale = 43 if method == 'ccomps': @@ -235,7 +233,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l new_labels = np.zeros_like(parent_bin, np.uint8) for i, line in enumerate(lines): if line.Baseline is None: - self.logger.warning("Skipping '%s' without baseline", line.id) + self.logger.warning(f"Skipping '{line.id}' without baseline") new_labels[line_labels[i]] = i + 1 continue line_baseline = baseline_of_segment(line, parent_coords) @@ -254,14 +252,13 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: - self.logger.error('Cannot line-segment %s "%s": %s', - tag, page_id if fullpage else parent.id, err) + self.logger.error(f'Cannot line-segment {tag} "{page_id if fullpage else parent.id}": {err}') return - self.logger.info("Found %d new line labels for %d existing lines on %s '%s'", - new_line_labels.max(), len(lines), tag, parent.id) + self.logger.info( + f"Found {new_line_labels.max()} new line labels for {len(lines)} existing lines on {tag} '{parent.id}'") # polygonalize and prepare comparison new_line_polygons, new_line_labels = masks2polygons(self.logger, - new_line_labels, new_baselines, parent_bin, '%s "%s"' % (tag, parent.id), + new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', min_area=640/zoom/zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) @@ -345,31 +342,29 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for j, line in enumerate(lines): new_lines = np.nonzero(assignments == j)[0] if not np.prod(new_lines.shape): - self.logger.debug("no 
lines for '%s' match or fit", line.id) + self.logger.debug(f"no lines for '{line.id}' match or fit", ) continue covers = np.sum(covers_bg[new_lines,j]) if covers < threshold / 3: - self.logger.debug("new lines for '%s' only cover %.1f%% bg", - line.id, covers * 100) + self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% bg", covers * 100) continue covers = np.sum(covers_fg[new_lines,j]) if covers < threshold: - self.logger.debug("new lines for '%s' only cover %.1f%% fg", - line.id, covers * 100) + self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% fg", covers * 100) continue looses = (assignments < 0) & (covers_bg[:,j] > 0.1) if looses.any(): covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) - self.logger.debug("new lines for '%s' would loose %d non-matching segments totalling %.1f%% bg", - line.id, np.count_nonzero(looses), covers * 100) + self.logger.debug( + f"new lines for '{line.id}' would loose {np.count_nonzero(looses)} non-matching segments " + f"totalling %.1f%% bg", covers * 100) continue line_count = np.count_nonzero(line_labels[j] & parent_bin) new_count = covers * line_count - self.logger.debug('Black pixels before/after resegment of line "%s": %d/%d', - line.id, line_count, new_count) + self.logger.debug(f'Black pixels before/after resegment of line "{line.id}": {line_count}/{new_count}') # combine all assigned new lines to single outline polygon if len(new_lines) > 1: - self.logger.debug("joining %d new line polygons for '%s'", len(new_lines), line.id) + self.logger.debug(f"joining {len(new_lines)} new line polygons for '{line.id}'") new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] for i in new_lines], loc=line.id, scale=scale) new_baseline = join_baselines(self.logger, [new_polygon.intersection(new_baselines[i]) @@ -379,7 +374,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, 
line.parent_object_) if line_polygon is None: - self.logger.warning("Ignoring extant new polygon for line '%s'", line.id) + self.logger.warning(f"Ignoring extant new polygon for line '{line.id}'") return # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) @@ -394,7 +389,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if j == otherj: continue otherline = lines[otherj] - self.logger.debug("subtracting new '%s' from overlapping '%s'", line.id, otherline.id) + self.logger.debug(f"subtracting new '{line.id}' from overlapping '{otherline.id}'") other_polygon = diff_polygons(line_polygons[otherj].context, new_polygon) if other_polygon.is_empty: continue @@ -403,7 +398,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: - self.logger.warning("Ignoring extant new polygon for line '%s'", otherline.id) + self.logger.warning(f"Ignoring extant new polygon for line '{otherline.id}'") continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) @@ -434,29 +429,26 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon continue count = np.count_nonzero(old_label) if not count: - logger.warning("skipping zero-area line '%s'", line.id) + logger.warning(f"skipping zero-area line '{line.id}'") continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - logger.debug("new line for '%s' only covers %.1f%% bg", - line.id, covers * 100) + logger.debug(f"new line for '%s' only covers %.1f%% bg", covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: - logger.warning("skipping binary-empty line '%s'", line.id) + logger.warning(f"skipping binary-empty line '{line.id}'") continue covers = np.count_nonzero(new_label * binarized) / count if covers < threshold: - 
logger.debug("new line for '%s' only covers %.1f%% fg", - line.id, covers * 100) + logger.debug(f"new line for '{line.id}' only covers %.1f%% fg", covers * 100) continue - logger.debug('Black pixels before/after resegment of line "%s": %d/%d', - line.id, count, covers * count) + logger.debug(f'Black pixels before/after resegment of line "{line.id}": {count}/{covers * count}') contours = [contour[:,::-1] # get x,y order again for contour, area in morph.find_contours(new_label)] #LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: - logger.warning("no contours for %s - keeping", line.id) + logger.warning(f"no contours for {line.id} - keeping") continue else: # get alpha shape @@ -468,7 +460,7 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) if polygon is None: - logger.warning("Ignoring extant line for %s", line.id) + logger.warning(f"Ignoring extant line for {line.id}") continue line.get_Coords().set_points(points_from_polygon(polygon)) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index edb5751a..e8c4a1ed 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -75,8 +75,6 @@ def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area= - these polygons as a list of label, polygon, baseline tuples, and - a Numpy array of new background labels for that list. 
""" - if not logger: - raise ValueError(f"Logger has not been passed by the caller") # find sharp baseline if baselines is not None: def getx(xy): @@ -93,8 +91,7 @@ def getx(xy): bg_mask = np.array(bg_labels == label, bool) if not np.count_nonzero(bg_mask * fg_bin): # ignore if missing foreground - logger.debug('skipping label %d in %s due to empty fg', - label, name) + logger.debug(f'Skipping label {label} in {name} due to empty fg') continue # simplify to convex hull if simplify is not None: @@ -102,8 +99,8 @@ def getx(xy): conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): - logger.debug('Cannot simplify %d: convex hull would create additional intersections %s', - label, str(conflicts)) + logger.debug( + f'Cannot simplify {label}: convex hull would create additional intersections {str(conflicts)}') else: bg_mask = hull if open_holes: @@ -131,8 +128,8 @@ def getx(xy): if len(hole) < 3: idx_hole = hier[0, idx_hole, 0] continue - logger.debug("label %d contour %d [%d pts] has hole %d [%d pts]", - label, idx, len(contour), idx_hole, len(hole)) + logger.debug( + f"Label {label} contour {idx} [{len(contour)} pts] has hole {idx_hole} [{len(hole)} pts]") #plot_poly(hole, 'blue') # cut child from outside... 
# first get nearest point on child @@ -173,7 +170,7 @@ def getx(xy): diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 + diff2 - logger.debug("stitching at interpolation pos %d hole pos %d", interpol_idx, hole_idx) + logger.debug(f"Stitching at interpolation pos {interpol_idx} hole pos {hole_idx}") # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) contour = np.concatenate([contour[:contour_idx], cispoint1, @@ -182,7 +179,7 @@ def getx(xy): #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') - logger.debug("adding label %d contour %d [%d pts]", label, idx, len(contour)) + logger.debug(f"Adding label {label} contour {idx} [{len(contour)} pts]") contours.append(contour) idx = hier[0, idx, 0] else: @@ -208,8 +205,7 @@ def getx(xy): contour = contours[i] area = areas[i] if min_area and area < min_area and area / total_area < 0.1: - logger.warning('Label %d contour %d is too small (%d/%d) in %s', - label, i, area, total_area, name) + logger.warning(f'Label {label} contour {i} is too small ({area}/{total_area}) in {name}') continue # simplify shape: # can produce invalid (self-intersecting) polygons: @@ -226,7 +222,7 @@ def getx(xy): logger.warning(explain_validity(polygon)) poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: - logger.warning('Label %d contour %d for %s has less than 4 points', label, i, name) + logger.warning(f'Label {label} contour {i} for {name} has less than 4 points') continue # get baseline segments intersecting with this line mask # and concatenate them from left to right @@ -369,7 +365,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option if regions: # page is already region-segmented if overwrite_regions: - self.logger.info('removing existing TextRegions in page "%s"', page_id) + 
self.logger.info(f'Removing existing TextRegions in page "{page_id}"', ) # we could remove all other region types as well, # but this is more flexible (for workflows with # specialized separator/image/table detectors): @@ -377,7 +373,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option page.set_ReadingOrder(None) ro = None else: - self.logger.warning('keeping existing TextRegions in page "%s"', page_id) + self.logger.warning(f'Keeping existing TextRegions in page "{page_id}"', ) ignore.extend(regions) # create reading order if necessary if not ro or overwrite_order: @@ -404,20 +400,20 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option ignore.extend(page.get_TextRegion()) regions = list(page.get_TableRegion()) if not regions: - self.logger.warning('Page "%s" contains no table regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no table regions') for region in regions: subregions = region.get_TextRegion() if subregions: # table is already cell-segmented if overwrite_regions: - self.logger.info('removing existing TextRegions in table "%s"', region.id) + self.logger.info(f'Removing existing TextRegions in table "{region.id}"') region.set_TextRegion([]) roelem = reading_order.get(region.id) # replace by empty group with same index and ref # (which can then take the cells as subregions) reading_order[region.id] = page_subgroup_in_reading_order(self.logger, roelem) else: - self.logger.warning('skipping table "%s" with existing TextRegions', region.id) + self.logger.warning(f'Skipping table "{region.id}" with existing TextRegions') continue # TODO: also allow grayscale_normalized (try/except?) 
region_image, region_coords = self.workspace.image_from_segment( @@ -428,19 +424,22 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option # create reading order group if necessary roelem = reading_order.get(region.id) if not roelem: - self.logger.warning("Page '%s' table region '%s' is not referenced in reading order (%s)", - page_id, region.id, "no target to add cells to") + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' is not referenced in reading order " + f"(no target to add cells to)") elif overwrite_order: # replace by empty ordered group with same (index and) ref # (which can then take the cells as subregions) roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem elif isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an ordered group (%s)", - page_id, region.id, "cells will be appended") + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' already has an ordered group " + f"(cells will be appended)") elif isinstance(roelem, (UnorderedGroupType, UnorderedGroupIndexedType)): - self.logger.warning("Page '%s' table region '%s' already has an unordered group (%s)", - page_id, region.id, "cells will not be appended") + self.logger.warning( + f"Page '{page_id}' table region '{region.id}' already has an unordered group " + f"(cells will not be appended)") roelem = None else: # replace regionRef(Indexed) by group with same index and ref @@ -468,14 +467,14 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option region.add_TextRegion(subregion) regions.append(subregion) if not regions: - self.logger.warning('Page "%s" contains no text regions', page_id) + self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: if region.get_TextLine(): if overwrite_lines: - self.logger.info('removing existing TextLines in page "%s" region 
"%s"', page_id, region.id) + self.logger.info(f'Removing existing TextLines in page "{page_id}" region "{region.id}"') region.set_TextLine([]) else: - self.logger.warning('keeping existing TextLines in page "%s" region "%s"', page_id, region.id) + self.logger.warning(f'Keeping existing TextLines in page "{page_id}" region "{region.id}"') ignore.extend(region.get_TextLine()) # TODO: also allow grayscale_normalized (try/except?) region_image, region_coords = self.workspace.image_from_segment( @@ -517,8 +516,8 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non sep_bin = np.zeros_like(element_bin, bool) ignore_labels = np.zeros_like(element_bin, int) for i, segment in enumerate(ignore): - self.logger.debug(f'masking foreground of {type(segment).__name__[:-4]} ' - f'"{segment.id}" for "{element.id}"') + self.logger.debug( + f'Masking foreground of {type(segment).__name__[:-4]} "{segment.id}" for "{element.id}"') # mark these segments (e.g. separator regions, tables, images) # for workflows where they have been detected already; # these will be: @@ -552,7 +551,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non fullpage = False report = check_region(element_bin, zoom) suffix = element.id + '.IMG-CLIP' - self.logger.info(f'computing line segmentation for {element_name} "{element.id}"') + self.logger.info(f'Computing line segmentation for {element_name} "{element.id}"') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -577,8 +576,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non self.logger.error(f'Cannot line-segment {element_name} "{element.id}": {err}') return None - self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines ' - f'for {element_name} "{element.id}"') + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element.id}"') # post-process line labels if isinstance(element, 
(PageType, TableRegionType)): # aggregate text lines to text regions @@ -599,8 +597,8 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) - self.logger.info(f'Found {len(np.unique(region_labels)) - 1} text regions ' - f'for {element_name} "{element.id}"') + self.logger.info( + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element.id}"') except Exception as err: self.logger.error(f'Cannot region-segment {element_name} "{element.id}": {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) @@ -630,7 +628,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # (no new region, no actual text lines) region_line_labels0 = np.intersect1d(region_line_labels0, ignore_labels) assert len(region_line_labels0) == 1, \ - (f'region label "{region_label}" has both existing regions and new lines ' + (f'Region label "{region_label}" has both existing regions and new lines ' f'({str(region_line_labels0)})') region = ignore[region_line_labels0[0] - 1] if rogroup and region.parent_object_ is element and not isinstance(region, SeparatorRegionType): @@ -907,9 +905,9 @@ def join_baselines(logger: Logger, baselines, loc=''): elif geom.geom_type == 'MultiLineString': lines.extend(geom) else: - logger.warning("ignoring baseline subtype %s in %s", geom.geom_type, loc) + logger.warning(f"Ignoring baseline subtype {geom.geom_type} in {loc}") else: - logger.warning("ignoring baseline type %s in %s", baseline.geom_type, loc) + logger.warning(f"Ignoring baseline type {baseline.geom_type} in {loc}") nlines = len(lines) if nlines == 0: return None @@ -971,7 +969,7 @@ def join_baselines(logger: Logger, baselines, loc=''): else: chains.append([prevl, nextl]) if len(chains) > 1: - logger.warning("baseline merge impossible (no spanning tree) in %s", loc) + 
logger.warning(f"Baseline merge impossible (no spanning tree) in {loc}") return None assert len(chains) == 1, chains assert len(chains[0]) == nlines, chains[0] @@ -983,7 +981,7 @@ def join_baselines(logger: Logger, baselines, loc=''): coords.extend(line.normalize().coords) result = LineString(coords) if result.is_empty: - logger.warning("baseline merge is empty in %s", loc) + logger.warning(f"Baseline merge is empty in {loc}") return None assert result.geom_type == 'LineString', result.wkt result = set_precision(result, 1.0) From d8542c20d5e39c1bf8670205a75c039f25198bf8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:39:43 +0200 Subject: [PATCH 136/194] spacing: train --- ocrd_cis/ocropy/train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 8f224b86..6c627231 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -19,8 +19,8 @@ def deletefiles(filelist): for file in filelist: if exists(file): remove(file) - if exists(file[:-3]+'gt.txt'): - remove(file[:-3]+'gt.txt') + if exists(file[:-3] + 'gt.txt'): + remove(file[:-3] + 'gt.txt') def resize_keep_ratio(image, baseheight=48): hpercent = (baseheight / float(image.size[1])) From d7859714ec6622a0b9294d9dc54d9f3e35f4606c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:39:54 +0200 Subject: [PATCH 137/194] spacing: segment --- ocrd_cis/ocropy/segment.py | 41 ++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index e8c4a1ed..75be2a11 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -58,7 +58,9 @@ lines2regions ) -def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, open_holes=False, reorder=True): + +def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area=None, simplify=None, 
open_holes=False, + reorder=True): """Convert label masks into polygon coordinates. Given a Numpy array of background labels ``bg_labels``, @@ -79,6 +81,7 @@ def masks2polygons(logger: Logger, bg_labels, baselines, fg_bin, name, min_area= if baselines is not None: def getx(xy): return xy[0] + baselines = [LineString(sorted([p[::-1] for p in line], key=getx)).simplify(5) for line in baselines if len(line) >= 2] @@ -96,8 +99,7 @@ def getx(xy): # simplify to convex hull if simplify is not None: hull = convex_hull_image(bg_mask.astype(np.uint8)).astype(bool) - conflicts = np.setdiff1d(hull * simplify, - bg_mask * simplify) + conflicts = np.setdiff1d(hull * simplify, bg_mask * simplify) if conflicts.any(): logger.debug( f'Cannot simplify {label}: convex hull would create additional intersections {str(conflicts)}') @@ -143,10 +145,10 @@ def getx(xy): contourtics = np.maximum(1, np.linalg.norm(contour2, axis=2).astype(int)[:,0] // 10) interpol = [] for i, ntics in enumerate(contourtics): - interpol.extend(np.array(contour[i:i+1] + - contour2[i:i+1] * - np.linspace(0, 1, ntics)[:,np.newaxis,np.newaxis], - int)) + interpol.extend(np.array( + contour[i:i + 1] + + contour2[i:i + 1] * + np.linspace(0, 1, ntics)[:, np.newaxis, np.newaxis], int)) interpol.append(contour[-1]) interpol = np.array(interpol) contourtics = np.insert(np.cumsum(contourtics), 0, 0) @@ -159,23 +161,24 @@ def getx(xy): contour_idx2 = contour_idx if contour_idx2 >= len(contour): contour_idx2 = 0 - cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx+1] + cispoint1 = cispoint2 = interpol[interpol_idx:interpol_idx + 1] if interpol_idx == 0: diff1 = (interpol[-1:] - cispoint1) // 5 else: - diff1 = (interpol[interpol_idx-1:interpol_idx] - cispoint1) // 5 + diff1 = (interpol[interpol_idx - 1: interpol_idx] - cispoint1) // 5 if interpol_idx + 1 >= len(interpol): diff2 = (interpol[0:1] - cispoint2) // 5 else: - diff2 = (interpol[interpol_idx+1:interpol_idx+2] - cispoint2) // 5 + diff2 = 
(interpol[interpol_idx + 1: interpol_idx + 2] - cispoint2) // 5 cispoint1 = cispoint1 + diff1 cispoint2 = cispoint2 + diff2 logger.debug(f"Stitching at interpolation pos {interpol_idx} hole pos {hole_idx}") # now stitch together outer (up to cision), inner (re-arranged around cision), outer (rest) # (this works, because inner contours have inverse direction) - contour = np.concatenate([contour[:contour_idx], cispoint1, - hole[hole_idx:], hole[:hole_idx], - cispoint2, contour[contour_idx:]]) + contour = np.concatenate( + [contour[:contour_idx], cispoint1, + hole[hole_idx:], hole[:hole_idx], + cispoint2, contour[contour_idx:]]) #plot_poly(contour, 'green') idx_hole = hier[0, idx_hole, 0] #plot_poly(contour, 'red') @@ -210,7 +213,7 @@ def getx(xy): # simplify shape: # can produce invalid (self-intersecting) polygons: #polygon = cv2.approxPolyDP(contour, 2, False)[:, 0, ::] # already ordered x,y - polygon = contour[:, 0, ::] # already ordered x,y + polygon = contour[:, 0, ::] # already ordered x,y # simplify and validate: polygon = Polygon(polygon) if not polygon.is_valid: @@ -220,22 +223,22 @@ def getx(xy): if not polygon.is_valid: #LOG.debug(polygon.wkt) logger.warning(explain_validity(polygon)) - poly = polygon.exterior.coords[:-1] # keep open + poly = polygon.exterior.coords[:-1] # keep open if len(poly) < 4: logger.warning(f'Label {label} contour {i} for {name} has less than 4 points') continue # get baseline segments intersecting with this line mask # and concatenate them from left to right if baselines is not None: - base = join_baselines(logger, [baseline.intersection(polygon) - for baseline in baselines - if baseline.intersects(polygon)], name) + base = join_baselines( + logger, + [baseline.intersection(polygon) for baseline in baselines if baseline.intersects(polygon)], name) if base is not None: base = base.coords else: base = None results.append((label, poly, base)) - result_labels[contour_labels == i+1] = len(results) + result_labels[contour_labels == i + 
1] = len(results) return results, result_labels From 7ca78a97db34559ebf1a8dd819ea08e5415ec8d9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:40:08 +0200 Subject: [PATCH 138/194] spacing: resegment --- ocrd_cis/ocropy/resegment.py | 94 +++++++++++++++++------------------- 1 file changed, 43 insertions(+), 51 deletions(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index d429c1de..48bb0d40 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -144,11 +144,11 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option else: self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') return OcrdPageResult(pcgts) - + def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, lines, ignore): threshold = self.parameter['min_fraction'] method = self.parameter['method'] - maxdist = self.parameter['spread']/zoom*300/72 # in pt + maxdist = self.parameter['spread'] / zoom * 300 / 72 # in pt # prepare line segmentation parent_array = pil2array(parent_image) #parent_array, _ = common.binarize(parent_array, maxskew=0) # just in case still raw @@ -172,7 +172,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l for i, line in enumerate(lines): if self.parameter['baseline_only'] and line.Baseline: line_base = baseline_of_segment(line, parent_coords) - line_poly = polygon_from_baseline(line_base, 30/zoom) + line_poly = polygon_from_baseline(line_base, 30 / zoom) else: line_poly = coordinates_of_segment(line, parent_image, parent_coords) line_poly = make_valid(Polygon(line_poly)) @@ -184,9 +184,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # (causing negative/above-max indices), either fully or partially, # then this will silently ignore them. The caller does not need # to concern herself with this. 
- line_y, line_x = draw.polygon(polygon[:, 1], - polygon[:, 0], - parent_bin.shape) + line_y, line_x = draw.polygon(polygon[:, 1], polygon[:, 0], parent_bin.shape) line_labels[i, line_y, line_x] = True # only text region(s) may contain new text lines for i, region in enumerate(set(line.parent_object_ for line in lines)): @@ -194,17 +192,13 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l region_polygon = coordinates_of_segment(region, parent_image, parent_coords) region_polygon = make_valid(Polygon(region_polygon)) region_polygon = np.array(region_polygon.exterior.coords, int)[:-1] - ignore_bin[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - parent_bin.shape)] = False + ignore_bin[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], parent_bin.shape)] = False # mask/ignore overlapping neighbours for i, segment in enumerate(ignore): self.logger.debug(f'Masking area of {type(segment).__name__[:-4]} "{segment.id}" for ' f'"{page_id if fullpage else parent.id}"') segment_polygon = coordinates_of_segment(segment, parent_image, parent_coords) - ignore_bin[draw.polygon(segment_polygon[:, 1], - segment_polygon[:, 0], - parent_bin.shape)] = True + ignore_bin[draw.polygon(segment_polygon[:, 1], segment_polygon[:, 0], parent_bin.shape)] = True if method != 'lineest': self.logger.debug(f'Calculating connected component and distance transforms for "{parent.id}"') bin = parent_bin & ~ ignore_bin @@ -213,8 +207,8 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l _, counts = np.unique(components, return_counts=True) if counts.shape[0] > 1: counts = np.sqrt(3 * counts) - scale = int(np.median(counts[(5/zoom < counts) & (counts < 100/zoom)])) - components *= (counts > 15/zoom)[components] + scale = int(np.median(counts[(5 / zoom < counts) & (counts < 100 / zoom)])) + components *= (counts > 15 / zoom)[components] self.logger.debug(f"Estimated scale: {scale}") else: scale = 43 @@ -244,12 +238,12 @@ 
def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l parent_bin.shape) new_labels[line_y, line_x] = i + 1 spread_dist(self.logger, lines, line_labels, new_labels, parent_bin, components, parent_coords, - maxdist=maxdist or scale/2, loc=parent.id, threshold=threshold) + maxdist=maxdist or scale / 2, loc=parent.id, threshold=threshold) return try: # TODO: 'scale' passed as a param may not be always defined (mehmedGIT) new_line_labels, new_baselines, _, _, _, scale = compute_segmentation( - parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale/2, + parent_bin, seps=ignore_bin, zoom=zoom, spread_dist=maxdist or scale / 2, fullpage=fullpage, maxseps=0, maxcolseps=len(ignore), maximages=0) except Exception as err: self.logger.error(f'Cannot line-segment {tag} "{page_id if fullpage else parent.id}": {err}') @@ -257,13 +251,13 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l self.logger.info( f"Found {new_line_labels.max()} new line labels for {len(lines)} existing lines on {tag} '{parent.id}'") # polygonalize and prepare comparison - new_line_polygons, new_line_labels = masks2polygons(self.logger, - new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', - min_area=640/zoom/zoom) + new_line_polygons, new_line_labels = masks2polygons( + self.logger, new_line_labels, new_baselines, parent_bin, name=f'{tag} "{parent.id}"', + min_area=640 / zoom / zoom) DSAVE('line_labels', [np.argmax(np.insert(line_labels, 0, 0, axis=0), axis=0), parent_bin]) DSAVE('new_line_labels', [new_line_labels, parent_bin]) - new_line_polygons, new_baselines = list(zip(*[(Polygon(poly), LineString(base)) - for _, poly, base in new_line_polygons])) or ([], []) + new_line_polygons, new_baselines = list(zip( + *[(Polygon(poly), LineString(base)) for _, poly, base in new_line_polygons])) or ([], []) # polygons for intersecting pairs intersections = dict() # ratio of overlap between intersection and new line @@ 
-281,12 +275,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l inter = make_intersection(line_poly.context, new_line_poly) if not inter: continue - new_line_mask = (new_line_labels == i+1) & parent_bin + new_line_mask = (new_line_labels == i + 1) & parent_bin line_mask = line_labels[j] & parent_bin inter_mask = new_line_mask & line_mask if (not np.count_nonzero(inter_mask) or - not np.count_nonzero(new_line_mask) or - not np.count_nonzero(line_mask)): + not np.count_nonzero(new_line_mask) or + not np.count_nonzero(line_mask)): continue intersections[(i, j)] = inter fits_bg[i, j] = inter.area / new_line_poly.area @@ -344,17 +338,17 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if not np.prod(new_lines.shape): self.logger.debug(f"no lines for '{line.id}' match or fit", ) continue - covers = np.sum(covers_bg[new_lines,j]) + covers = np.sum(covers_bg[new_lines, j]) if covers < threshold / 3: self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% bg", covers * 100) continue - covers = np.sum(covers_fg[new_lines,j]) + covers = np.sum(covers_fg[new_lines, j]) if covers < threshold: self.logger.debug(f"new lines for '{line.id}' only cover %.1f%% fg", covers * 100) continue - looses = (assignments < 0) & (covers_bg[:,j] > 0.1) + looses = (assignments < 0) & (covers_bg[:, j] > 0.1) if looses.any(): - covers = np.sum(covers_bg[np.nonzero(looses)[0],j]) + covers = np.sum(covers_bg[np.nonzero(looses)[0], j]) self.logger.debug( f"new lines for '{line.id}' would loose {np.count_nonzero(looses)} non-matching segments " f"totalling %.1f%% bg", covers * 100) @@ -365,13 +359,12 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # combine all assigned new lines to single outline polygon if len(new_lines) > 1: self.logger.debug(f"joining {len(new_lines)} new line polygons for '{line.id}'") - new_polygon = join_polygons([new_line_polygons[i] #intersections[(i, j)] - for i 
in new_lines], loc=line.id, scale=scale) - new_baseline = join_baselines(self.logger, [new_polygon.intersection(new_baselines[i]) - for i in new_lines], loc=line.id) + # intersections[(i, j)] + new_polygon = join_polygons([new_line_polygons[i] for i in new_lines], loc=line.id, scale=scale) + new_baseline = join_baselines( + self.logger, [new_polygon.intersection(new_baselines[i]) for i in new_lines], loc=line.id) # convert back to absolute (page) coordinates: - line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], - parent_image, parent_coords) + line_polygon = coordinates_for_segment(new_polygon.exterior.coords[:-1], parent_image, parent_coords) line_polygon = polygon_for_parent(line_polygon, line.parent_object_) if line_polygon is None: self.logger.warning(f"Ignoring extant new polygon for line '{line.id}'") @@ -379,8 +372,7 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l # annotate result: line.get_Coords().set_points(points_from_polygon(line_polygon)) if new_baseline is not None: - new_baseline = coordinates_for_segment(new_baseline.coords, - parent_image, parent_coords) + new_baseline = coordinates_for_segment(new_baseline.coords, parent_image, parent_coords) line.set_Baseline(BaselineType(points=points_from_polygon(new_baseline))) line_polygons[j] = prep(new_polygon) # now also ensure the assigned lines do not overlap other existing lines @@ -394,20 +386,22 @@ def _process_segment(self, parent, parent_image, parent_coords, page_id, zoom, l if other_polygon.is_empty: continue # convert back to absolute (page) coordinates: - other_polygon = coordinates_for_segment(other_polygon.exterior.coords[:-1], - parent_image, parent_coords) + other_polygon = coordinates_for_segment( + other_polygon.exterior.coords[:-1], parent_image, parent_coords) other_polygon = polygon_for_parent(other_polygon, otherline.parent_object_) if other_polygon is None: self.logger.warning(f"Ignoring extant new polygon for line 
'{otherline.id}'") continue otherline.get_Coords().set_points(points_from_polygon(other_polygon)) -def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, components, coords, - maxdist=43, loc='', threshold=0.9): + +def spread_dist( + logger: Logger, lines, old_labels, new_labels, binarized, components, coords, maxdist=43, loc='', + threshold=0.9): """redefine line coordinates by contourizing spread of connected components propagated from new labels""" DSAVE('seeds', [new_labels, (components>0)]) # allocate to connected components consistently - # (ignoring smallest components like punctuation) + # (ignoring the smallest components like punctuation) # but when there are conflicts, meet in the middle via watershed new_labels2 = morph.propagate_labels(components > 0, new_labels, conflict=0) new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=(components > 0)) @@ -415,7 +409,7 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon # dilate/grow labels from connected components against each other and bg new_labels = morph.spread_labels(new_labels2, maxdist=maxdist) DSAVE('spread', new_labels) - # now propagate again to catch smallest components like punctuation + # now propagate again to catch the smallest components like punctuation new_labels2 = morph.propagate_labels(binarized, new_labels, conflict=0) new_labels2 = segmentation.watershed(new_labels2, markers=new_labels, mask=binarized) DSAVE('propagated-again', [new_labels2, binarized & (new_labels2==0)]) @@ -444,7 +438,7 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon logger.debug(f"new line for '{line.id}' only covers %.1f%% fg", covers * 100) continue logger.debug(f'Black pixels before/after resegment of line "{line.id}": {count}/{covers * count}') - contours = [contour[:,::-1] # get x,y order again + contours = [contour[:, :: -1] # get x,y order again for contour, area in morph.find_contours(new_label)] 
#LOG.debug("joining %d subsegments for %s", len(contours), line.id) if len(contours) == 0: @@ -452,10 +446,9 @@ def spread_dist(logger: Logger, lines, old_labels, new_labels, binarized, compon continue else: # get alpha shape - poly = join_polygons([make_valid(Polygon(contour)) - for contour in contours - if len(contour) >= 4], - loc=line.id, scale=maxdist) + poly = join_polygons( + [make_valid(Polygon(contour)) for contour in contours if len(contour) >= 4], + loc=line.id, scale=maxdist) poly = poly.exterior.coords[:-1] polygon = coordinates_for_segment(poly, None, coords) polygon = polygon_for_parent(polygon, line.parent_object_) @@ -472,9 +465,8 @@ def baseline_of_segment(segment, coords): # zzz should go into core ocrd_utils def polygon_from_baseline(baseline, scale): - ltr = baseline[0,0] < baseline[-1,0] + ltr = baseline[0, 0] < baseline[-1, 0] # left-hand side if left-to-right, and vice versa - polygon = make_valid(join_polygons([LineString(baseline).buffer(scale * (-1) ** ltr, - single_sided=True)], - scale=scale)) + polygon = make_valid(join_polygons( + [LineString(baseline).buffer(scale * (-1) ** ltr, single_sided=True)], scale=scale)) return polygon From 1004b431e451be4288aa98054dff843bce3e306b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:52:51 +0200 Subject: [PATCH 139/194] spacing: rest --- ocrd_cis/ocropy/binarize.py | 11 ++++++----- ocrd_cis/ocropy/clip.py | 34 ++++++++++++++++++---------------- ocrd_cis/ocropy/denoise.py | 9 ++++----- ocrd_cis/ocropy/deskew.py | 22 +++++++++++----------- ocrd_cis/ocropy/dewarp.py | 21 ++++++++++----------- ocrd_cis/ocropy/recognize.py | 35 +++++++++++++++-------------------- ocrd_cis/ocropy/resegment.py | 2 +- ocrd_cis/ocropy/segment.py | 2 +- 8 files changed, 66 insertions(+), 70 deletions(-) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 782dd578..35b28c5a 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -38,14 +38,14 @@ def 
binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0. if method == 'global': # global thresholding - _, th = cv2.threshold(img,threshold*255,255,cv2.THRESH_BINARY) + _, th = cv2.threshold(img, threshold * 255, 255, cv2.THRESH_BINARY) elif method == 'otsu': # Otsu's thresholding - _, th = cv2.threshold(img,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, th = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) elif method == 'gauss-otsu': # Otsu's thresholding after Gaussian filtering blur = cv2.GaussianBlur(img, (5, 5), 0) - _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU) + _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) else: raise Exception('unknown binarization method %s' % method) return Image.fromarray(th), 0 @@ -95,7 +95,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page = pcgts.get_Page() assert page - page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id, feature_filter='binarized') + page_image, page_xywh, page_image_info = self.workspace.image_from_page( + page, page_id, feature_filter='binarized') zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) result = OcrdPageResult(pcgts) @@ -162,7 +163,7 @@ def process_page(self, page, page_image, page_xywh, zoom, page_id) -> OcrdPageRe # to do consistent coordinate transforms, and non-consumers # to redo the rotation themselves): orientation = -page_xywh['angle'] - orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] + orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] page.set_orientation(orientation) if self.parameter['grayscale']: suffix = '.IMG-NRM' diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index 7f40a214..f5390dde 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -124,16 +124,17 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], 
page_id: str = No masks = [pil2array(polygon_mask(page_image, polygon)).astype(np.uint8) for polygon in polygons] for i, region in enumerate(regions): if i >= num_texts: - break # keep non-text regions unchanged + break # keep non-text regions unchanged if level == 'region': if region.get_AlternativeImage(): # FIXME: This should probably be an exception (bad workflow configuration). self.logger.warning(f'Page "{page_id}" region "{region.id}" already contains image data: skipping') continue shape = prep(shapes[i]) - neighbours = [(regionj, maskj) for shapej, regionj, maskj - in zip(shapes[:i] + shapes[i+1:], regions[:i] + regions[i+1:], masks[:i] + masks[i+1:]) - if shape.intersects(shapej)] + neighbours = [ + (regionj, maskj) for shapej, regionj, maskj in + zip(shapes[:i] + shapes[i + 1:], regions[:i] + regions[i + 1:], masks[:i] + masks[i + 1:]) + if shape.intersects(shapej)] if neighbours: ret.images.append(self.process_segment( region, masks[i], polygons[i], neighbours, background_image, @@ -161,24 +162,25 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = No f'Page "{page_id}" region "{region.id}" line "{line.id}" already contains image data: skipping') continue shape = prep(shapes[j]) - neighbours = [(linej, maskj) for shapej, linej, maskj - in zip(shapes[:j] + shapes[j+1:], lines[:j] + lines[j+1:], masks[:j] + masks[j+1:]) - if shape.intersects(shapej)] + neighbours = [ + (linej, maskj) for shapej, linej, maskj in + zip(shapes[:j] + shapes[j + 1:], lines[:j] + lines[j + 1:], masks[:j] + masks[j + 1:]) + if shape.intersects(shapej)] if neighbours: ret.images.append(self.process_segment( line, masks[j], polygons[j], neighbours, background_image, region_image, region_coords, region_bin, page_id)) return ret - def process_segment(self, segment, segment_mask, segment_polygon, neighbours, - background_image, parent_image, parent_coords, parent_bin, - page_id) -> OcrdPageResultImage: + def process_segment( + self, segment, segment_mask, 
segment_polygon, neighbours, background_image, parent_image, parent_coords, + parent_bin, page_id + ) -> OcrdPageResultImage: # initialize AlternativeImage@comments classes from parent, except # for those operations that can apply on multiple hierarchy levels: features = ','.join( [feature for feature in parent_coords['features'].split(',') - if feature in ['binarized', 'grayscale_normalized', - 'despeckled', 'dewarped']]) + ',clipped' + if feature in ['binarized', 'grayscale_normalized', 'despeckled', 'dewarped']]) + ',clipped' # mask segment within parent image: segment_image = image_from_polygon(parent_image, segment_polygon) segment_bbox = bbox_from_polygon(segment_polygon) @@ -188,8 +190,8 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, f'Ignoring enclosing neighbour "{neighbour.id}" of segment "{segment.id}" on page "{page_id}"') continue # find connected components that (only) belong to the neighbour: - intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour - intruders = morph.remove_marked(intruders, segment_mask > neighbour_mask) # but exclusively + intruders = segment_mask * morph.keep_marked(parent_bin, neighbour_mask > 0) # overlaps neighbour + intruders = morph.remove_marked(intruders, segment_mask > neighbour_mask) # but exclusively num_intruders = np.count_nonzero(intruders) num_foreground = np.count_nonzero(segment_mask * parent_bin) if not num_intruders: @@ -202,14 +204,14 @@ def process_segment(self, segment, segment_mask, segment_polygon, neighbours, segment_mask -= intruders # suppress in derived image result to be annotated clip_mask = array2pil(intruders) - segment_image.paste(background_image, mask=clip_mask) # suppress in raw image + segment_image.paste(background_image, mask=clip_mask) # suppress in raw image if segment_image.mode in ['RGB', 'L', 'RGBA', 'LA']: # for consumers that do not have to rely on our # guessed background color, but can cope with 
transparency: segment_image.putalpha(ImageOps.invert(clip_mask)) # recrop segment into rectangle, just as image_from_segment would do # (and also clipping with background colour): - segment_image = crop_image(segment_image,box=segment_bbox) + segment_image = crop_image(segment_image, box=segment_bbox) # update PAGE (reference the image file): alternative_image = AlternativeImageType(comments=features) segment.add_AlternativeImage(alternative_image) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index b3c219fb..0dd14ef8 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -19,7 +19,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDenoise') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. 
Open and deserialise PAGE input file and its respective images, @@ -72,8 +72,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option self.logger.warning(f'Page "{page_id}" region "{region.id}" contains no text lines') for line in lines: line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh, - feature_selector='binarized') + line, region_image, region_xywh, feature_selector='binarized') image = self.process_segment(line, line_image, line_xywh, zoom) if image: result.images.append(image) @@ -83,8 +82,8 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optiona self.logger.warning(f"Skipping '{segment.id}' with zero size") return None self.logger.info(f"About to despeckle '{segment.id}'") - bin_image = remove_noise(segment_image, - maxsize=self.parameter['noise_maxsize']/zoom*300/72) # in pt + bin_image = remove_noise( + segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') segment.add_AlternativeImage(alt_image) diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 84475d81..7bdbba2d 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -25,7 +25,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyDeskew') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. 
Open and deserialise PAGE input file and its respective images, @@ -61,7 +61,7 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option return result if level == 'table': regions = page.get_TableRegion() - else: # region + else: # region regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: self.logger.warning('Page "%s" contains no text regions', page_id) @@ -78,29 +78,29 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option result.images.append(image) return result - def _process_segment(self, segment, segment_image, segment_coords, segment_id, page_id) -> Optional[OcrdPageResultImage]: + def _process_segment( + self, segment, segment_image, segment_coords, segment_id, page_id + ) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning("Skipping %s with zero size", segment_id) return None - angle0 = segment_coords['angle'] # deskewing (w.r.t. top image) already applied to segment_image + angle0 = segment_coords['angle'] # deskewing (w.r.t. 
top image) already applied to segment_image self.logger.info(f"About to deskew {segment_id}") - angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied + angle = deskew(segment_image, maxskew=self.parameter['maxskew']) # additional angle to be applied # segment angle: PAGE orientation is defined clockwise, # whereas PIL/ndimage rotation is in mathematical direction: orientation = -(angle + angle0) - orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] - segment.set_orientation(orientation) # also removes all deskewed AlternativeImages + orientation = 180 - (180 - orientation) % 360 # map to [-179.999,180] + segment.set_orientation(orientation) # also removes all deskewed AlternativeImages self.logger.info(f"Found angle for {segment_id}: %.1f", angle) # delegate reflection, rotation and re-cropping to core: if isinstance(segment, PageType): segment_image, segment_coords, _ = self.workspace.image_from_page( - segment, page_id, - fill='background', transparency=True) + segment, page_id, fill='background', transparency=True) suffix = '.IMG-DESKEW' else: segment_image, segment_coords = self.workspace.image_from_segment( - segment, segment_image, segment_coords, - fill='background', transparency=True) + segment, segment_image, segment_coords, fill='background', transparency=True) suffix = segment.id + '.IMG-DESKEW' if not angle: # zero rotation does not change coordinates, diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 302cf2e0..e06718c8 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -22,27 +22,27 @@ def dewarp(image, lnorm, check=True, max_neighbour=0.02, zoom=1.0): if not image.width or not image.height: raise InvalidLine('image size is zero') line = pil2array(image) - + if np.prod(line.shape) == 0: raise InvalidLine('image dimensions are zero') if np.amax(line) == np.amin(line): raise InvalidLine('image is blank') - - temp = np.amax(line)-line # inverse, 
zero-closed + + temp = np.amax(line) - line # inverse, zero-closed if check: report = check_line(temp, zoom=zoom) if report: raise InadequateLine(report) - - temp = temp * 1.0 / np.amax(temp) # normalized + + temp = temp * 1.0 / np.amax(temp) # normalized if check: report = lnorm.check(temp, max_ignore=max_neighbour) if report: raise InvalidLine(report) - lnorm.measure(temp) # find centerline + lnorm.measure(temp) # find centerline line = lnorm.dewarp(line, cval=np.amax(line)) - + return array2pil(line) # pad with white above and below (as a fallback for dewarp) @@ -72,7 +72,7 @@ def setup(self): # and extra params) 0.3)) - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. Open and deserialise PAGE input file and its respective images, @@ -115,9 +115,8 @@ def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Option self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: - dew_image = dewarp(line_image, self.lnorm, check=True, - max_neighbour=self.parameter['max_neighbour'], - zoom=zoom) + dew_image = dewarp( + line_image, self.lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) except InvalidLine as err: self.logger.error(f'Cannot dewarp line "{line.id}": {err}') continue diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index f0c4b520..02d29e7c 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -47,7 +47,7 @@ def recognize(image, pad, network, check=True): # getting confidence result = lstm.translate_back(network.outputs, pos=1) - scale = len(raw_line.T)*1.0/(len(network.outputs)-2*pad) + scale = len(raw_line.T) * 1.0 / (len(network.outputs) - 2 * pad) clist = [] rlist = [] @@ -57,7 +57,7 @@ def recognize(image, 
pad, network, check=True): if c != 0: confid = network.outputs[r, c] c = network.l2s([c]) - r = (r-pad)*scale + r = (r - pad) * scale confidlist.append(confid) clist.append(c) @@ -88,7 +88,7 @@ def setup(self): def get_model(self): """Search for the model file. First checks if parameter['model'] can - be resolved with OcrdResourceManager to a valid readeable file and + be resolved with OcrdResourceManager to a valid readable file and returns it. If not, it checks if the model can be found in the dirname(__file__)/models/ directory.""" canread = lambda p: isfile(p) and access(p, R_OK) @@ -202,8 +202,8 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): words = [x.strip() for x in linepred.split(' ') if x.strip()] - word_r_list = [[0]] # r-positions of every glyph in every word - word_conf_list = [[]] # confidences of every glyph in every word + word_r_list = [[0]] # r-positions of every glyph in every word + word_conf_list = [[]] # confidences of every glyph in every word if words != []: w_no = 0 found_char = False @@ -215,7 +215,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): if c == ' ' and found_char: if i == 0: word_r_list[0][0] = rlist[i] - elif i+1 <= len(clist)-1 and clist[i+1] != ' ': + elif i + 1 <= len(clist) - 1 and clist[i + 1] != ' ': word_conf_list.append([]) word_r_list.append([rlist[i]]) w_no += 1 @@ -224,9 +224,9 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): word_r_list = [[0, line_image.width]] # conf for each word - wordsconf = [(min(x)+max(x))/2 for x in word_conf_list] + wordsconf = [(min(x) + max(x)) / 2 for x in word_conf_list] # conf for the line - line_conf = (min(wordsconf) + max(wordsconf))/2 + line_conf = (min(wordsconf) + max(wordsconf)) / 2 # line text line.add_TextEquiv(TextEquivType(Unicode=linepred, conf=line_conf)) @@ -235,32 +235,27 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): word_points = points_from_polygon( 
coordinates_for_segment( np.array(polygon_from_bbox( - word_r_list[word_no][0] / scale, - 0, - word_r_list[word_no][-1] / scale, - 0 + line_image.height)), + word_r_list[word_no][0] / scale,0, + word_r_list[word_no][-1] / scale, 0 + line_image.height)), line_image, line_coords)) word_id = '%s_word%04d' % (line.id, word_no) word = WordType(id=word_id, Coords=CoordsType(word_points)) line.add_Word(word) - word.add_TextEquiv(TextEquivType( - Unicode=word_str, conf=wordsconf[word_no])) + word.add_TextEquiv(TextEquivType(Unicode=word_str, conf=wordsconf[word_no])) if maxlevel == 'glyph': for glyph_no, glyph_str in enumerate(word_str): glyph_points = points_from_polygon( coordinates_for_segment( np.array(polygon_from_bbox( - word_r_list[word_no][glyph_no] / scale, - 0, - word_r_list[word_no][glyph_no+1] / scale, - 0 + line_image.height)), + word_r_list[word_no][glyph_no] / scale, 0, + word_r_list[word_no][glyph_no + 1] / scale, 0 + line_image.height)), line_image, line_coords)) glyph_id = '%s_glyph%04d' % (word.id, glyph_no) glyph = GlyphType(id=glyph_id, Coords=CoordsType(glyph_points)) word.add_Glyph(glyph) - glyph.add_TextEquiv(TextEquivType( - Unicode=glyph_str, conf=word_conf_list[word_no][glyph_no])) + glyph.add_TextEquiv( + TextEquivType(Unicode=glyph_str, conf=word_conf_list[word_no][glyph_no])) return edits, lengs diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 48bb0d40..5a8c7e96 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -52,7 +52,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropyResegment') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. 
Open and deserialise PAGE input file and its respective images, diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 75be2a11..6dc75056 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -252,7 +252,7 @@ def executable(self): def setup(self): self.logger = getLogger('processor.OcropySegment') - def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or regions into lines. Open and deserialise PAGE input file and its respective images, From c5498a0e8d8bc9a8e3fe3bf0848df9b135bae69c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:55:44 +0200 Subject: [PATCH 140/194] spacing: dewarp --- ocrd_cis/ocropy/dewarp.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index e06718c8..89901efd 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -95,24 +95,19 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional result = OcrdPageResult(pcgts) page = pcgts.get_Page() - page_image, page_xywh, page_image_info = self.workspace.image_from_page( - page, page_id) + page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id) zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: self.logger.warning(f'Page "{page_id}" contains no text regions') for region in regions: - region_image, region_xywh = self.workspace.image_from_segment( - region, page_image, page_xywh) - + region_image, region_xywh = self.workspace.image_from_segment(region, page_image, page_xywh) lines = region.get_TextLine() if not lines: 
self.logger.warning(f'Region {region.id} contains no text lines') for line in lines: - line_image, line_xywh = self.workspace.image_from_segment( - line, region_image, region_xywh) - + line_image, line_xywh = self.workspace.image_from_segment(line, region_image, region_xywh) self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: dew_image = dewarp( From 31e124577faad71f2bb039a6b094900b6cdf9df1 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 16:58:52 +0200 Subject: [PATCH 141/194] fix: dewarp return --- ocrd_cis/ocropy/dewarp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 89901efd..17d0b4ce 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -123,5 +123,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional dew_image = padvert(line_image, self.parameter['range']) # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=line_xywh['features'] + ',dewarped') - line.add_AlternativeImage(alternative_image) - return OcrdPageResultImage(dew_image, region.id + '_' + line.id + '.IMG-DEWARP', alt_image) + line.add_AlternativeImage(alt_image) + suffix = f"{region.id}_{line.id}.IMG-DEWARP" + result.images.append(OcrdPageResultImage(dew_image, suffix, alt_image)) + return result From f86c99391e987d4918b6d626dbf1b2f990d7712b Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 17:21:14 +0200 Subject: [PATCH 142/194] improve str speed: precompute element_name_id --- ocrd_cis/ocropy/segment.py | 92 +++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 50 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 6dc75056..9daf59de 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -388,13 +388,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], 
page_id: Optional rogroup = OrderedGroupType(id="reading-order") ro.set_OrderedGroup(rogroup) if (not rogroup.get_RegionRefIndexed() and - not rogroup.get_OrderedGroupIndexed() and - not rogroup.get_UnorderedGroupIndexed()): - # schema forbids empty OrderedGroup + not rogroup.get_OrderedGroupIndexed() and + not rogroup.get_UnorderedGroupIndexed()): + # schema forbids empty OrderedGroup ro.set_OrderedGroup(None) # go get TextRegions with TextLines (and SeparatorRegions): - image = self._process_element(page, ignore, page_image, page_coords, - zoom=zoom, rogroup=rogroup) + image = self._process_element(page, ignore, page_image, page_coords, zoom=zoom, rogroup=rogroup) if image: result.images.append(image) return result @@ -450,11 +449,11 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional roelem = page_subgroup_in_reading_order(self.logger, roelem) reading_order[region.id] = roelem # go get TextRegions with TextLines (and SeparatorRegions) - image = self._process_element(region, subignore, region_image, region_coords, - zoom=zoom, rogroup=roelem) + image = self._process_element( + region, subignore, region_image, region_coords, zoom=zoom, rogroup=roelem) if image: result.images.append(image) - else: # 'region' + else: # 'region' regions = list(page.get_TextRegion()) # besides top-level text regions, line-segment any table cells, # and for tables without any cells, add a pseudo-cell @@ -463,10 +462,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional if subregions: regions.extend(subregions) else: - subregion = TextRegionType(id=region.id + '_text', - Coords=region.get_Coords(), - # as if generated from parser: - parent_object_=region) + subregion = TextRegionType( + id=f'{region.id}_text', Coords=region.get_Coords(), parent_object_=region) region.add_TextRegion(subregion) regions.append(subregion) if not regions: @@ -490,7 +487,6 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: 
Optional image = self._process_element(region, ignore, region_image, region_coords, zoom=zoom) if image: result.images.append(image) - return result def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=None) -> Optional[OcrdPageResultImage]: @@ -535,7 +531,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non sp_col = segment_polygon[:, 0] if isinstance(segment, SeparatorRegionType): sep_bin[draw.polygon(sp_row, sp_col, sep_bin.shape)] = True - ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i+1 # mapped back for RO + ignore_labels[draw.polygon(sp_row, sp_col, ignore_labels.shape)] = i + 1 # mapped back for RO if isinstance(element, PageType): element_name = 'page' fullpage = True @@ -555,6 +551,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non report = check_region(element_bin, zoom) suffix = element.id + '.IMG-CLIP' self.logger.info(f'Computing line segmentation for {element_name} "{element.id}"') + element_name_id = f'{element_name} "{element.id}"' # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -564,7 +561,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # but keep them for h/v-line detection (in fullpage mode): element_bin, seps=(sep_bin + ignore_labels) > 0, zoom=zoom, fullpage=fullpage, - spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt + spread_dist=round(self.parameter['spread'] / zoom * 300 / 72), # in pt # these are ignored when not in fullpage mode: maxcolseps=self.parameter['maxcolseps'], maxseps=self.parameter['maxseps'], @@ -576,10 +573,10 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # as a fallback, add a single text line comprising the whole region: element.add_TextLine(TextLineType(id=element.id + "_line", Coords=element.get_Coords())) else: - self.logger.error(f'Cannot line-segment {element_name} 
"{element.id}": {err}') + self.logger.error(f'Cannot line-segment {element_name_id}: {err}') return None - self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name} "{element.id}"') + self.logger.info(f'Found {len(np.unique(line_labels)) - 1} text lines for {element_name_id}') # post-process line labels if isinstance(element, (PageType, TableRegionType)): # aggregate text lines to text regions @@ -594,18 +591,18 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non region_labels = lines2regions( element_bin, line_labels, rlabels=ignore_labels, - sepmask=np.maximum(sepmask, colseps), # add bg + sepmask=np.maximum(sepmask, colseps), # add bg # decide horizontal vs vertical cut when gaps of similar size prefer_vertical=not isinstance(element, TableRegionType), gap_height=self.parameter['gap_height'], gap_width=self.parameter['gap_width'], scale=scale, zoom=zoom) self.logger.info( - f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name} "{element.id}"') + f'Found {len(np.unique(region_labels)) - 1} text regions for {element_name_id}') except Exception as err: - self.logger.error(f'Cannot region-segment {element_name} "{element.id}": {err}') + self.logger.error(f'Cannot region-segment {element_name_id}: {err}') region_labels = np.where(line_labels > len(ignore), 1 + len(ignore), line_labels) - + # prepare reading order group index if rogroup: if isinstance(rogroup, (OrderedGroupType, OrderedGroupIndexedType)): @@ -622,7 +619,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non region_no = 0 for region_label in np.unique(region_labels): if not region_label: - continue # no bg + continue # no bg region_mask = region_labels == region_label region_line_labels = line_labels * region_mask region_line_labels0 = np.setdiff1d(region_line_labels, [0]) @@ -644,18 +641,17 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non 
order[np.setdiff1d(region_line_labels0, element_bin * region_line_labels)] = 0 region_line_labels = order[region_line_labels] # avoid horizontal gaps - region_line_labels = hmerge_line_seeds(element_bin, region_line_labels, scale, - seps=np.maximum(sepmask, colseps)) + region_line_labels = hmerge_line_seeds( + element_bin, region_line_labels, scale, seps=np.maximum(sepmask, colseps)) region_mask |= region_line_labels > 0 # find contours for region (can be non-contiguous) - regions, _ = masks2polygons(self.logger, region_mask * region_label, None, element_bin, - name=f'{element_name} "{element.id}"', - min_area=6000 / zoom / zoom, - simplify=ignore_labels * ~(sep_bin)) + regions, _ = masks2polygons( + self.logger, region_mask * region_label, None, element_bin, name=element_name_id, + min_area=6000 / zoom / zoom, simplify=ignore_labels * ~(sep_bin)) # find contours for lines (can be non-contiguous) - lines, _ = masks2polygons(self.logger, region_line_labels, baselines, element_bin, - name=f'region "{element.id}"', - min_area=640 / zoom / zoom) + lines, _ = masks2polygons( + self.logger, region_line_labels, baselines, element_bin, name=f'region "{element.id}"', + min_area=640 / zoom / zoom) # create new lines in new regions (allocating by intersection) line_polys = [Polygon(polygon) for _, polygon, _ in lines] for _, region_polygon, _ in regions: @@ -674,7 +670,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # find out which line (contours) belong to which region (contours) line_no = 0 for i, line_poly in enumerate(line_polys): - if not region_poly.intersects(line_poly): # .contains + if not region_poly.intersects(line_poly): # .contains continue line_label, line_polygon, line_baseline = lines[i] # convert back to absolute (page) coordinates: @@ -696,16 +692,14 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # if the region has received text lines, keep it if region.get_TextLine(): 
element.add_TextRegion(region) - self.logger.info(f'Added region "{region_id}" with {line_no} lines ' - f'for {element_name} "{element.id}"') + self.logger.info(f'Added region "{region_id}" with {line_no} lines for {element_name_id}') if rogroup: index = page_add_to_reading_order(rogroup, region.id, index) # add additional image/non-text regions from compute_segmentation # (e.g. drop-capitals or images) ... - self.logger.info(f'Found {images.max()} large image regions for {element_name} "{element.id}"') + self.logger.info(f'Found {images.max()} large image regions for {element_name_id}') # find contours around region labels (can be non-contiguous): - image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, - name=f'{element_name} "{element.id}"') + image_polygons, _ = masks2polygons(self.logger, images, None, element_bin, name=element_name_id) for image_label, polygon, _ in image_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -719,11 +713,10 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: - self.logger.info(f'Found {seplines.max()} separators for {element_name} "{element.id}"') + self.logger.info(f'Found {seplines.max()} separators for {element_name_id}') # find contours around region labels (can be non-contiguous): - sep_polygons, _ = masks2polygons(self.logger, seplines, None, element_bin, - name=f'{element_name} "{element.id}"', - open_holes=True, reorder=False) + sep_polygons, _ = masks2polygons( + self.logger, seplines, None, element_bin, name=element_name_id, open_holes=True, reorder=False) for sep_label, polygon, _ in sep_polygons: # convert back to absolute (page) coordinates: region_polygon = coordinates_for_segment(polygon, image, coords) @@ -737,7 
+730,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non element.add_SeparatorRegion(SeparatorRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image - element_array[sepmask] = np.amax(element_array) # clip to white/bg + element_array[sepmask] = np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') element.add_AlternativeImage(image_ref) @@ -746,15 +739,14 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon(region_polygon[:, 1], - region_polygon[:, 0], - region_mask.shape)] = True + region_mask[draw.polygon( + region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): - line_polygons, _ = masks2polygons(self.logger, line_labels, baselines, element_bin, - name=f'region "{element.id}"', - min_area=640 / zoom / zoom) + line_polygons, _ = masks2polygons( + self.logger, line_labels, baselines, element_bin, + name=f'region "{element.id}"', min_area=640 / zoom / zoom) line_no = 0 for line_label, polygon, baseline in line_polygons: # convert back to absolute (page) coordinates: @@ -772,9 +764,9 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non line.set_Baseline(BaselineType(points=points_from_polygon(line_baseline))) element.add_TextLine(line) if not sep_bin.any(): - return None # no derived image + return None # no derived image # annotate a text/image-separated image - element_array[sep_bin] = np.amax(element_array) # clip to white/bg + element_array[sep_bin] = 
np.amax(element_array) # clip to white/bg image_clipped = array2pil(element_array) image_ref = AlternativeImageType(comments=coords['features'] + ',clipped') element.add_AlternativeImage(image_ref) From b8e3ad6207a832fad65bccf5ea4756c004bb1f96 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 22:26:33 +0200 Subject: [PATCH 143/194] fix: clip suffix --- ocrd_cis/ocropy/clip.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index f5390dde..b81c731c 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -213,6 +213,7 @@ def process_segment( # (and also clipping with background colour): segment_image = crop_image(segment_image, box=segment_bbox) # update PAGE (reference the image file): + suffix = f'{segment.id}.IMG_CLIP' alternative_image = AlternativeImageType(comments=features) segment.add_AlternativeImage(alternative_image) - return OcrdPageResultImage(segment_image, '.IMG-CLIP', alternative_image) + return OcrdPageResultImage(segment_image, suffix, alternative_image) From 02724f2db8c1d29f739282a42330c1a9b14e27d2 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 16 Aug 2024 22:30:11 +0200 Subject: [PATCH 144/194] fix: denoise return --- ocrd_cis/ocropy/denoise.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 0dd14ef8..4ae883fd 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -76,6 +76,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional image = self.process_segment(line, line_image, line_xywh, zoom) if image: result.images.append(image) + return result def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: From aac6fe0989ccb483626af6b238e98162b780aac5 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 00:50:08 
+0200 Subject: [PATCH 145/194] try to fix: ocropy denoise --- ocrd_cis/ocropy/denoise.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index 4ae883fd..fd9812f8 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -51,7 +51,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) if level == 'page': - image = self.process_segment(page, page_image, page_xywh, zoom) + image = self.process_segment(page, page_image, page_xywh, zoom, page_id) if image: result.images.append(image) else: @@ -63,7 +63,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional region, page_image, page_xywh, feature_selector='binarized' if level == 'region' else '') if level == 'region': - image = self.process_segment(region, region_image, region_xywh, zoom) + image = self.process_segment(region, region_image, region_xywh, zoom, page_id) if image: result.images.append(image) continue @@ -73,12 +73,12 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_selector='binarized') - image = self.process_segment(line, line_image, line_xywh, zoom) + image = self.process_segment(line, line_image, line_xywh, zoom, page_id) if image: result.images.append(image) return result - def process_segment(self, segment, segment_image, segment_xywh, zoom) -> Optional[OcrdPageResultImage]: + def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning(f"Skipping '{segment.id}' with zero size") return None @@ -87,5 +87,6 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom) -> 
Optiona segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') + suffix = f"{page_id}_{segment.id}.IMG-DESPECK" segment.add_AlternativeImage(alt_image) - return OcrdPageResultImage(bin_image, segment.id + '.IMG-DESPECK', alt_image) + return OcrdPageResultImage(bin_image, suffix, alt_image) From 5548d0e6043e32d7409fef9817775670b2d1b96f Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 00:58:12 +0200 Subject: [PATCH 146/194] fix: ocropy denoise --- ocrd_cis/ocropy/denoise.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index fd9812f8..eb3e7d23 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -63,7 +63,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional region, page_image, page_xywh, feature_selector='binarized' if level == 'region' else '') if level == 'region': - image = self.process_segment(region, region_image, region_xywh, zoom, page_id) + file_id = f"{page_id}_{region.id}" + image = self.process_segment(region, region_image, region_xywh, zoom, file_id) if image: result.images.append(image) continue @@ -73,12 +74,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional for line in lines: line_image, line_xywh = self.workspace.image_from_segment( line, region_image, region_xywh, feature_selector='binarized') - image = self.process_segment(line, line_image, line_xywh, zoom, page_id) + file_id = f"{page_id}_{region.id}_{line.id}" + image = self.process_segment(line, line_image, line_xywh, zoom, file_id) if image: result.images.append(image) return result - def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id) -> Optional[OcrdPageResultImage]: + def process_segment(self, segment, segment_image, 
segment_xywh, zoom, file_id) -> Optional[OcrdPageResultImage]: if not segment_image.width or not segment_image.height: self.logger.warning(f"Skipping '{segment.id}' with zero size") return None @@ -87,6 +89,6 @@ def process_segment(self, segment, segment_image, segment_xywh, zoom, page_id) - segment_image, maxsize=self.parameter['noise_maxsize'] / zoom * 300 / 72) # in pt # update PAGE (reference the image file): alt_image = AlternativeImageType(comments=segment_xywh['features'] + ',despeckled') - suffix = f"{page_id}_{segment.id}.IMG-DESPECK" + suffix = f"{file_id}.IMG-DESPECK" segment.add_AlternativeImage(alt_image) return OcrdPageResultImage(bin_image, suffix, alt_image) From c9f0f56787f2d34d718bc504ee3d07f7501dff75 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 01:26:54 +0200 Subject: [PATCH 147/194] fix: resegment --- ocrd_cis/ocropy/resegment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index 5a8c7e96..c1809569 100644 --- a/ocrd_cis/ocropy/resegment.py +++ b/ocrd_cis/ocropy/resegment.py @@ -427,7 +427,7 @@ def spread_dist( continue covers = np.count_nonzero(new_label) / count if covers < threshold / 3: - logger.debug(f"new line for '%s' only covers %.1f%% bg", covers * 100) + logger.debug(f"new line for '{line.id}' only covers %.1f%% bg", covers * 100) continue count = np.count_nonzero(old_label * binarized) if not count: From fff909746f1347fc9336f8413fd311ac4e3ce206 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 01:27:05 +0200 Subject: [PATCH 148/194] optimize segment --- ocrd_cis/ocropy/segment.py | 48 ++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 28 deletions(-) diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index 9daf59de..b363cbd2 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -544,14 +544,14 @@ def _process_element(self, element, ignore, image, 
coords, zoom=1.0, rogroup=Non element_name = 'table' fullpage = True report = check_region(element_bin, zoom) - suffix = element.id + '.IMG-CLIP' + suffix = f"{element.id}.IMG-CLIP" else: element_name = 'region' fullpage = False report = check_region(element_bin, zoom) - suffix = element.id + '.IMG-CLIP' - self.logger.info(f'Computing line segmentation for {element_name} "{element.id}"') + suffix = f"{element.id}.IMG-CLIP" element_name_id = f'{element_name} "{element.id}"' + self.logger.info(f'Computing line segmentation for {element_name_id}') # TODO: we should downscale if DPI is large enough to save time try: if report: @@ -571,7 +571,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non if isinstance(element, TextRegionType): self.logger.error(f'Cannot line-segment region "{element.id}": {err}') # as a fallback, add a single text line comprising the whole region: - element.add_TextLine(TextLineType(id=element.id + "_line", Coords=element.get_Coords())) + element.add_TextLine(TextLineType(id=f"{element.id}_line", Coords=element.get_Coords())) else: self.logger.error(f'Cannot line-segment {element_name_id}: {err}') return None @@ -664,7 +664,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: region_no += 1 - region_id = element.id + "_region%04d" % region_no + region_id = f"{element.id}_region%04d" % region_no self.logger.debug(f'Region label {region_label} becomes ID "{region_id}"') region = TextRegionType(id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon))) # find out which line (contours) belong to which region (contours) @@ -682,7 +682,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: line_no += 1 - line_id = region_id + "_line%04d" % line_no + line_id = f"{region_id}_line%04d" % line_no self.logger.debug(f'Line label {line_label} becomes ID "{line_id}"') line = 
TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if line_baseline: @@ -709,7 +709,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue region_no += 1 # annotate result: - region_id = element.id + "_image%04d" % region_no + region_id = f"{element.id}_image%04d" % region_no element.add_ImageRegion(ImageRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # split detected separator labels into separator regions: @@ -726,7 +726,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: region_no += 1 - region_id = element.id + "_sep%04d" % region_no + region_id = f"{element.id}_sep%04d" % region_no element.add_SeparatorRegion(SeparatorRegionType( id=region_id, Coords=CoordsType(points=points_from_polygon(region_polygon)))) # annotate a text/image-separated image @@ -739,8 +739,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non # get mask from region polygon: region_polygon = coordinates_of_segment(element, image, coords) region_mask = np.zeros_like(element_bin, bool) - region_mask[draw.polygon( - region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True + region_mask[draw.polygon(region_polygon[:, 1], region_polygon[:, 0], region_mask.shape)] = True # ensure the new line labels do not extrude from the region: line_labels = line_labels * region_mask # find contours around labels (can be non-contiguous): @@ -757,7 +756,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non continue # annotate result: line_no += 1 - line_id = element.id + "_line%04d" % line_no + line_id = f"{element.id}_line%04d" % line_no line = TextLineType(id=line_id, Coords=CoordsType(points=points_from_polygon(line_polygon))) if baseline: line_baseline = coordinates_for_segment(baseline, image, coords) @@ -868,11 +867,12 @@ def join_polygons(polygons, 
loc='', scale=20): dists[j, i] = dist dists = minimum_spanning_tree(dists, overwrite=True) # add bridge polygons (where necessary) + max_dist = max(1.0, scale / 5) for prevp, nextp in zip(*dists.nonzero()): prevp = polygons[prevp] nextp = polygons[nextp] nearest = nearest_points(prevp, nextp) - bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1) + bridgep = LineString(nearest).buffer(max_dist, resolution=1) polygons.append(bridgep) jointp = unary_union(polygons) assert jointp.geom_type == 'Polygon', jointp.wkt @@ -1017,11 +1017,9 @@ def page_add_to_reading_order(rogroup, region_id, index=None): """ if rogroup: if index is None: - rogroup.add_RegionRef(RegionRefType( - regionRef=region_id)) + rogroup.add_RegionRef(RegionRefType(regionRef=region_id)) else: - rogroup.add_RegionRefIndexed(RegionRefIndexedType( - regionRef=region_id, index=index)) + rogroup.add_RegionRefIndexed(RegionRefIndexedType(regionRef=region_id, index=index)) index += 1 return index @@ -1045,36 +1043,30 @@ def page_subgroup_in_reading_order(logger: Logger, roelem): if not roelem.parent_object_: logger.error('Cannot subgroup from orphan ReadingOrder element') return roelem - if isinstance(roelem, (OrderedGroupType,OrderedGroupIndexedType)) and not ( + if isinstance(roelem, (OrderedGroupType, OrderedGroupIndexedType)) and not ( roelem.get_OrderedGroupIndexed() or roelem.get_UnorderedGroupIndexed() or roelem.get_RegionRefIndexed()): # is already a group and still empty return roelem - if isinstance(roelem, (OrderedGroupType, - UnorderedGroupType, - RegionRefType)): + if isinstance(roelem, (OrderedGroupType, UnorderedGroupType, RegionRefType)): getattr(roelem.parent_object_, { OrderedGroupType: 'get_OrderedGroup', UnorderedGroupType: 'get_UnorderedGroup', RegionRefType: 'get_RegionRef', }.get(roelem.__class__))().remove(roelem) - roelem2 = OrderedGroupType(id=roelem.regionRef + '_group', - regionRef=roelem.regionRef) + roelem2 = OrderedGroupType(id=f"{roelem.regionRef}_group", 
regionRef=roelem.regionRef) roelem.parent_object_.add_OrderedGroup(roelem2) roelem2.parent_object_ = roelem.parent_object_ return roelem2 - if isinstance(roelem, (OrderedGroupIndexedType, - UnorderedGroupIndexedType, - RegionRefIndexedType)): + if isinstance(roelem, (OrderedGroupIndexedType, UnorderedGroupIndexedType, RegionRefIndexedType)): getattr(roelem.parent_object_, { OrderedGroupIndexedType: 'get_OrderedGroupIndexed', UnorderedGroupIndexedType: 'get_UnorderedGroupIndexed', RegionRefIndexedType: 'get_RegionRefIndexed' }.get(roelem.__class__))().remove(roelem) - roelem2 = OrderedGroupIndexedType(id=roelem.regionRef + '_group', - index=roelem.index, - regionRef=roelem.regionRef) + roelem2 = OrderedGroupIndexedType( + id=f"{roelem.regionRef}_group", index=roelem.index, regionRef=roelem.regionRef) roelem.parent_object_.add_OrderedGroupIndexed(roelem2) roelem2.parent_object_ = roelem.parent_object_ return roelem2 From 8b9283232a57b7c49a78420b32c915b32992ee9a Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 02:02:28 +0200 Subject: [PATCH 149/194] optimize ocropy common --- ocrd_cis/ocropy/common.py | 186 +++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c5b56ed0..a5806517 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -184,16 +184,19 @@ def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90): d0, d1 = flat.shape o0, o1 = int(bignore * d0), int(bignore * d1) est = flat[o0:d0 - o0, o1:d1 - o1] + if escale > 0: # by default, we use only regions that contain # significant variance; this makes the percentile # based low and high estimates more reliable e = escale - v = est - filters.gaussian_filter(est, e * 20.0) - v = filters.gaussian_filter(v ** 2, e * 20.0) ** 0.5 + e_20_0 = e * 20.0 + e_50 = int(e * 50) + v = est - filters.gaussian_filter(est, e_20_0) + v = filters.gaussian_filter(v ** 2, e_20_0) 
** 0.5 v = (v > 0.3 * np.amax(v)) - v = morphology.binary_dilation(v, structure=np.ones((int(e * 50), 1))) - v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50)))) + v = morphology.binary_dilation(v, structure=np.ones((e_50, 1))) + v = morphology.binary_dilation(v, structure=np.ones((1, e_50))) est = est[v] lo = stats.scoreatpercentile(est.ravel(), lo) hi = stats.scoreatpercentile(est.ravel(), hi) @@ -310,24 +313,24 @@ def check_line(binary, zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape)==0: return "image dimensions are zero" - if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) + if np.prod(binary.shape) == 0: return "image dimensions are zero" + if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)200/zoom: return "image too tall for a text line %s"%(binary.shape,) + if h<20/zoom: return f"image not tall enough for a text line {binary.shape}" + if h>200/zoom: return f"image too tall for a text line {binary.shape}" ##if w<1.5*h: return "line too short %s"%(binary.shape,) - if w<1.5*h and w<32/zoom: return "image too short for a line image %s"%(binary.shape,) - if w>4000/zoom: return "image too long for a line image %s"%(binary.shape,) + if w<1.5*h and w<32/zoom: return f"image too short for a line image {binary.shape}" + if w>4000/zoom: return f"image too long for a line image {binary.shape}" return None ratio = w*1.0/h _, ncomps = measurements.label(binary) lo = int(0.5*ratio+0.5) hi = int(4*ratio)+1 - if ncomps=%d)"%(ncomps,lo) - ##if ncomps>hi*ratio: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) - if ncomps>hi*ratio and ncomps>10: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) + if ncomps={lo})" + ##if ncomps>hi*ratio: return f"too many connected components (got {ncomps}, wanted <={hi})" + if ncomps>hi*ratio and ncomps>10: return 
f"too many connected components (got {ncomps}, wanted <={hi})" return None # inspired by ocropus-gpageseg check_page @@ -341,21 +344,21 @@ def check_region(binary, zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape)==0: return "image dimensions are zero" - if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) + if np.prod(binary.shape) == 0: return "image dimensions are zero" + if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)5000/zoom: return "image too tall for a region image %s"%(binary.shape,) - if w<100/zoom: return "image too narrow for a region image %s"%(binary.shape,) - if w>5000/zoom: return "image too wide for a region image %s"%(binary.shape,) + if h<45/zoom: return f"image not tall enough for a region image {binary.shape}" + if h>5000/zoom: return f"image too tall for a region image {binary.shape}" + if w<100/zoom: return f"image too narrow for a region image {binary.shape}" + if w>5000/zoom: return f"image too wide for a region image {binary.shape}" return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<5: return "too few connected components for a region image (got %d)"%(ncomps,) - if ncomps>slots and ncomps>10: return "too many connected components for a region image (%d > %d)"%(ncomps,slots) + if ncomps<5: return f"too few connected components for a region image (got {ncomps})" + if ncomps>slots and ncomps>10: return f"too many connected components for a region image ({ncomps} > {slots})" return None # from ocropus-gpageseg, but with zoom parameter @@ -369,21 +372,21 @@ def check_page(binary, zoom=1.0): Returns an error report, or None if valid. 
""" - if np.prod(binary.shape)==0: return "image dimensions are zero" - if len(binary.shape)==3: return "image not monochrome %s"%(binary.shape,) + if np.prod(binary.shape) == 0: return "image dimensions are zero" + if len(binary.shape) == 3: return f"image not monochrome {binary.shape}" if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)20000/zoom: return "image too tall for a page image %s"%(binary.shape,) - if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) - if w>20000/zoom: return "image too wide for a page image %s"%(binary.shape,) + if h<600/zoom: return f"image not tall enough for a page image {binary.shape}" + if h>20000/zoom: return f"image too tall for a page image {binary.shape}" + if w<600/zoom: return f"image too narrow for a page image {binary.shape}" + if w>20000/zoom: return f"image too wide for a page image {binary.shape}" return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<10: return "too few connected components for a page image (got %d)"%(ncomps,) - if ncomps>slots and ncomps>10: return "too many connected components for a page image (%d > %d)"%(ncomps,slots) + if ncomps<10: return f"too few connected components for a page image (got {ncomps})" + if ncomps>slots and ncomps>10: return f"too many connected components for a page image ({ncomps} > {slots})" return None def odd(num): @@ -476,8 +479,13 @@ def compute_images(binary, scale, maximages=5): #images = morph.rb_closing(images, (d0,d1)) #DSAVE('images1_closed', images+0.6*binary) # 1- filter largest connected components - images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) - DSAVE('images1_large', images+0.6*binary) + binary_0_6 = 0.6 * binary + odd_scale = odd(scale) + odd_half_scale = odd(scale / 2) + odd_doubled_scale = odd(2 * scale) + region_min = (4 * scale) ** 2 + images = 
morph.select_regions(images, sl.area, min=region_min, nbest=2 * maximages) + DSAVE('images1_large', images + binary_0_6) if not images.any(): return np.zeros_like(binary, int) # 2- open horizontally and vertically to suppress @@ -486,31 +494,31 @@ def compute_images(binary, scale, maximages=5): # single frame, because then the hull polygon # can cover/overlap large text/table parts which # we cannot discern from the actual image anymore - h_opened = morph.rb_opening(images, (1, odd(scale/2))) - DSAVE('images2_h-opened', h_opened+0.6*binary) - v_opened = morph.rb_opening(images, (odd(scale/2), 1)) - DSAVE('images2_v-opened', v_opened+0.6*binary) + h_opened = morph.rb_opening(images, (1, odd_half_scale)) + DSAVE('images2_h-opened', h_opened + binary_0_6) + v_opened = morph.rb_opening(images, (odd_half_scale, 1)) + DSAVE('images2_v-opened', v_opened + binary_0_6) # 3- close whatever remains - closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale),odd(2*scale))) - DSAVE('images3_closed', closed+0.6*binary) + closed = morph.rb_closing(h_opened&v_opened, (odd_doubled_scale, odd_doubled_scale)) + DSAVE('images3_closed', closed + binary_0_6) # 4- reconstruct the losses up to a certain distance # to avoid creeping into pure h/v-lines again but still # cover most of the large object #images = np.where(images, closed, 2) #images = morph.spread_labels(images, maxdist=scale) % 2 | closed images = morph.rb_reconstruction(closed, images, step=2, maxsteps=scale) - DSAVE('images4_reconstructed', images+0.6*binary) + DSAVE('images4_reconstructed', images + binary_0_6) # 5- select nbest - images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) - DSAVE('images5_selected', images+0.6*binary) + images = morph.select_regions(images, sl.area, min=region_min, nbest=maximages) + DSAVE('images5_selected', images + binary_0_6) if not images.any(): return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps - dilated = 
morph.r_dilation(images, (odd(scale),odd(scale))) + dilated = morph.r_dilation(images, (odd_scale, odd_scale)) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 images, _ = morph.label(images) - DSAVE('images6_dilated', images+0.6*binary) + DSAVE('images6_dilated', images + binary_0_6) # we could repeat reconstruct-dilate here... return images @@ -548,6 +556,7 @@ def compute_seplines(binary, scale, maxseps=0): sepsizes = [0] sepslices = [None] sepdists = [0] + doubled_scale = 2 * scale for label in range(1, nlabels + 1): labelslice = slices[label] labelmask = labels == label @@ -599,8 +608,8 @@ def compute_seplines(binary, scale, maxseps=0): binmask = sublabels == bin + 1 binlabels, nbinlabels = morph.label(binmask) _, binlabelcounts = np.unique(binlabels, return_counts=True) - largemask = (binlabelcounts > 2 * scale)[binlabels] - smallmask = (binlabelcounts <= 2 * scale)[binlabels] + largemask = (binlabelcounts > doubled_scale)[binlabels] + smallmask = (binlabelcounts <= doubled_scale)[binlabels] sublabels2[binmask & smallmask] = 1 if not np.any(binmask & largemask): continue @@ -1843,11 +1852,13 @@ def find_topological(): else: llab[box] = lbinary[box] # show projection at the sides - for i in range(int(scale/2)): - llab[box[0],box[1].start+i] = -10*np.log(y+1e-9) - llab[box[0],box[1].stop-1-i] = -10*np.log(y+1e-9) - llab[box[0].start+i,box[1]] = -10*np.log(x+1e-9) - llab[box[0].stop-1-i,box[1]] = -10*np.log(x+1e-9) + log_y = -10 * np.log(y + 1e-9) + log_x = -10 * np.log(x + 1e-9) + for i in range(int(scale / 2)): + llab[box[0], box[1].start + i] = log_y + llab[box[0], box[1].stop - 1 - i] = log_y + llab[box[0].start + i, box[1]] = log_x + llab[box[0].stop - 1 - i, box[1]] = log_x DSAVE('recursive_x_y_cut_' + (partition_type or 'sliced'), llab) gap_weights = list() for is_horizontal, profile in enumerate([y, x]): @@ -1877,19 +1888,19 @@ def find_topological(): weights = weights * (1 + 0.5 * 
props['peak_heights']/gap_height) gap_weights.append((gaps, weights)) if debug: - LOG.debug(' {} gaps {} {} weights {}'.format( - 'horizontal' if is_horizontal else 'vertical', - gaps, props, weights)) + orientation = 'horizontal' if is_horizontal else 'vertical' + LOG.debug(f' {orientation} gaps {gaps} {props} weights {weights}') if not gaps.shape[0]: continue + half_scale = int(scale / 2) for start, stop, height in sorted(zip( props['left_ips'].astype(int), props['right_ips'].astype(int), props['peak_heights']), key=lambda x: x[2]): if is_horizontal: - llab[box[0].start+int(scale/2):box[0].stop-int(scale/2),box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) + llab[box[0].start+half_scale:box[0].stop-half_scale,box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) else: - llab[box[0].start+start:box[0].start+stop,box[1].start+int(scale/2):box[1].stop-int(scale/2)] = -10*np.log(-height+1e-9) + llab[box[0].start+start:box[0].start+stop,box[1].start+half_scale:box[1].stop-half_scale] = -10*np.log(-height+1e-9) DSAVE('recursive_x_y_cut_gaps_' + ('h' if is_horizontal else 'v'), llab) # heuristic (not strict) decision on x or y cut, # factors to consider: @@ -1916,32 +1927,27 @@ def find_topological(): # are not allowed y_gaps, y_weights = gap_weights[0][0], gap_weights[0][1] x_gaps, x_weights = gap_weights[1][0], gap_weights[1][1] - if debug: LOG.debug(' all y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' all y_gaps {y_gaps} x_gaps {x_gaps}') # suppress cuts that significantly split any line labels + min_line_scale = min_line * scale y_allowed = [not(np.any(np.intersect1d( # significant line labels above - np.nonzero(np.bincount(lbin[:gap,:].flatten(), - minlength=len(objects))[1:] > min_line * scale)[0], + np.nonzero(np.bincount(lbin[:gap,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], # significant line labels below - np.nonzero(np.bincount(lbin[gap:,:].flatten(), - minlength=len(objects))[1:] > min_line 
* scale)[0], - assume_unique=True))) - for gap in y_gaps] + np.nonzero(np.bincount(lbin[gap:,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + assume_unique=True))) for gap in y_gaps] x_allowed = [not(np.any(np.intersect1d( # significant line labels left - np.nonzero(np.bincount(lbin[:,:gap].flatten(), - minlength=len(objects))[1:] > min_line * scale)[0], + np.nonzero(np.bincount(lbin[:,:gap].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], # significant line labels right - np.nonzero(np.bincount(lbin[:,gap:].flatten(), - minlength=len(objects))[1:] > min_line * scale)[0], - assume_unique=True))) - for gap in x_gaps] + np.nonzero(np.bincount(lbin[:,gap:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + assume_unique=True))) for gap in x_gaps] y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' allowed y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' allowed y_gaps {y_gaps} x_gaps {x_gaps}') y_prominence = np.amax(y_weights, initial=0) x_prominence = np.amax(x_weights, initial=0) - if debug: LOG.debug(' y_prominence {} x_prominence {}'.format(y_prominence, x_prominence)) + if debug: LOG.debug(f' y_prominence {y_prominence} x_prominence {x_prominence}') # suppress less prominent peaks (another heuristic...) 
# they must compete with the other direction next time # (when already new cuts or partitions will become visible) @@ -1949,33 +1955,30 @@ def find_topological(): x_allowed = x_weights > 0.8 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') if npartitions > 0: # TODO this can be avoided when backtracking below # suppress peaks creating fewer partitions than others -- # how large in our preferred direction will the new partitions # of sepmask in both slices created by each cut candidate # add up? - y_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label( - partitions[:gap,:]>0)[0]) + - morph.find_objects(morph.label( - partitions[gap:,:]>0)[0]))) - for gap in y_gaps] - x_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label( - partitions[:,:gap]>0)[0]) + - morph.find_objects(morph.label( - partitions[:,gap:]>0)[0]))) - for gap in x_gaps] - if debug: LOG.debug(' y_partitionscores {} x_partitionscores {}'.format( - y_partitionscores, x_partitionscores)) + y_partitionscores = [sum(map( + sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label(partitions[:gap, :] > 0)[0]) + + morph.find_objects(morph.label(partitions[gap:, :] > 0)[0]))) + for gap in y_gaps] + x_partitionscores = [sum(map( + sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label(partitions[:, : gap] > 0)[0]) + + morph.find_objects(morph.label(partitions[:, gap :] > 0)[0]))) + for gap in x_gaps] + if debug: LOG.debug(f' y_partitionscores {y_partitionscores} x_partitionscores {x_partitionscores}') # Now identify those gaps with the largest overall score y_allowed = y_partitionscores == np.max(y_partitionscores, initial=0) x_allowed = 
x_partitionscores == np.max(x_partitionscores, initial=0) y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' most partitioning y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' most partitioning y_gaps {y_gaps} x_gaps {x_gaps}') else: y_partitionscores = None x_partitionscores = None @@ -1986,7 +1989,7 @@ def find_topological(): x_allowed = x_weights > 0.9 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) + if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') # decide which direction, x or y # TODO: this most likely needs a backtracking mechanism @@ -2052,7 +2055,7 @@ def find_topological(): llab2[box] = partitions DSAVE('recursive_x_y_cut_partitions', llab2) for label in range(1, npartitions+1): - LOG.debug('next partition %d on %s', label, box) + LOG.debug(f'next partition %d on %s', label, box) recursive_x_y_cut(box, mask=partitions==label, partition_type=new_partition_type) return @@ -2060,10 +2063,9 @@ def find_topological(): # no gaps left finalize() return + orientation = 'vertical' if choose_vertical else 'horizontal' # otherwise: cut on gaps - LOG.debug('cutting %s on %s into %s', 'vertically' - if choose_vertical else 'horizontally', - box, gaps) + LOG.debug(f'cutting {orientation}ly on {box} into {gaps}') cuts = list(zip(np.insert(gaps, 0, 0), np.append(gaps, lim))) if choose_vertical: if rl: @@ -2078,9 +2080,7 @@ def find_topological(): sub = sl.box(0, len(y), start, stop) else: # "cut in horizontal direction" sub = sl.box(start, stop, 0, len(x)) - LOG.debug('next %s block on %s is %s', 'horizontal' - if choose_vertical else 'vertical', - box, sub) + LOG.debug(f'next {orientation} block on {box} is {sub}') recursive_x_y_cut(sl.compose(box,sub), mask=sl.cut(mask,sub) 
if isinstance(mask, np.ndarray) else None) From fceaffe4e928bff7ea70aece7baa3d3717c03cff Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 02:03:47 +0200 Subject: [PATCH 150/194] optimize ocrolib --- ocrd_cis/ocropy/ocrolib/morph.py | 18 ++++++++++-------- ocrd_cis/ocropy/ocrolib/toplevel.py | 26 ++++++++------------------ 2 files changed, 18 insertions(+), 26 deletions(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 7d6ffc85..b9619cca 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -292,8 +292,9 @@ def propagate_labels_majority(image,labels): with the largest overlap.""" rlabels,_ = label(image) cors = correspondences(rlabels,labels) - outputs = zeros(amax(rlabels)+1,'i') - counts = zeros(amax(rlabels)+1,'i') + amax_rlabels = amax(rlabels) + 1 + outputs = zeros(amax_rlabels,'i') + counts = zeros(amax_rlabels,'i') for rlabel, label_, count in cors.T: if not rlabel or not label_: # ignore background correspondences @@ -347,12 +348,13 @@ def all_neighbors(image, dist=1, bg=NaN): """Given an image with labels, find all pairs of labels that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``.""" q = 100000 - assert amax(image)<q - assert amin(image)>=0 - u = unique(q*image+shift(image,(dist,0),order=0,cval=bg)) - d = unique(q*image+shift(image,(-dist,0),order=0,cval=bg)) - l = unique(q*image+shift(image,(0,dist),order=0,cval=bg)) - r = unique(q*image+shift(image,(0,-dist),order=0,cval=bg)) + assert amax(image) < q + assert amin(image) >= 0 + q_image = q * image + u = unique(q_image + shift(image, (dist, 0), order=0, cval=bg)) + d = unique(q_image + shift(image, (-dist, 0), order=0, cval=bg)) + l = unique(q_image + shift(image, (0, dist), order=0, cval=bg)) + r = unique(q_image + shift(image, (0, -dist), order=0, cval=bg)) all = unique(r_[u,d,l,r]) all = all[all!=bg] all = c_[all//q,all%q] diff --git a/ocrd_cis/ocropy/ocrolib/toplevel.py
b/ocrd_cis/ocropy/ocrolib/toplevel.py index 87ed18c5..72e397af 100644 --- a/ocrd_cis/ocropy/ocrolib/toplevel.py +++ b/ocrd_cis/ocropy/ocrolib/toplevel.py @@ -125,14 +125,10 @@ def __init__(self,*args,**kw): self.fun = kw.get("fun","?") self.var = kw.get("var","?") self.description = " ".join([strc(x) for x in args]) + def __str__(self): - result = "\nCheckError for argument " - result += str(self.var) - result += " of function " - result += str(self.fun) - result += "\n" - result += self.description - return result + return f"\nCheckError for argument {str(self.var)} of function {str(self.fun)}\n{self.description}" + class CheckWarning(CheckError): def __init__(self,*args,**kw): @@ -142,14 +138,8 @@ def __init__(self,*args,**kw): CheckError.__init__(self, *args, **kw) def __str__(self): - result = "\nCheckWarning for argument " - result += str(self.var) - result += " of function " - result += str(self.fun) - result += "\n" - result += self.description - result += "(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n" - return result + return (f"\nCheckWarning for argument {str(self.var)} of function {str(self.fun)}\n{self.description} " + f"(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n") def checktype(value,type_): """Check value against the type spec. 
If everything @@ -211,7 +201,7 @@ def argument_checks(*args,**kw): e.var = var raise e except: - LOG.critical("unknown exception while checking function: '%s'", name) + LOG.critical(f"unknown exception while checking function: '{name}'") raise result = f(*args,**kw) checktype(result,kw.get("_",True)) @@ -225,9 +215,9 @@ def decorator(f): def wrapper(arg): if not f(arg): if warning: - raise CheckWarning(strc(arg)+" of type "+str(type(arg))+": "+str(message)) + raise CheckWarning(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") else: - raise CheckError(strc(arg)+" of type "+str(type(arg))+": "+str(message)) + raise CheckError(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") return wrapper return decorator From 3de2585787ea2b59126a4a1c39d9df3e42d18362 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Sat, 17 Aug 2024 02:03:58 +0200 Subject: [PATCH 151/194] optimize align cli --- ocrd_cis/align/cli.py | 85 ++++++++++++++++++------------------------- 1 file changed, 35 insertions(+), 50 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 7747622e..7d6599c2 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -57,16 +57,16 @@ def process(self): def align(self, alignments, ift): """align the alignment objects with the according input file tuples""" for t in ift: - self.log.debug("tuple %s", os.path.basename(t.input_file.url)) + self.log.debug(f"tuple {os.path.basename(t.input_file.url)}") pcgtst = self.open_input_file_tuples(ift) i = 0 for mi, mr in enumerate(pcgtst[0].get_Page().get_TextRegion()): for mj, _ in enumerate(mr.get_TextLine()): for iiii, u in enumerate(mr.get_TextLine()[mj].get_TextEquiv()): - self.log.debug("[%d] %s", iiii, u.Unicode) + self.log.debug(f"[{iiii}] {u.Unicode}") for xx in mr.get_TextLine()[mj].get_Word(): for iiii, u in enumerate(xx.get_TextEquiv()): - self.log.debug("[%d] %s", iiii, u.Unicode) + self.log.debug(f"[{iiii}] {u.Unicode}") lines = [] for ii, t in enumerate(ift): @@ -88,23 
+88,21 @@ def align_lines(self, lines): for i, line in enumerate(lines): if lines[0].region.get_TextEquiv() is None: lines[0].region.TextEquiv = [] - self.log.debug('line alignment: %s [%s - %s]', - get_textequiv_unicode(line.region), - line.region.get_id(), - line.input_file.input_file_group) - ddt = line.input_file.input_file_group + "/" + line.region.get_id() + self.log.debug(f'line alignment: {get_textequiv_unicode(line.region)} ' + f'[{line.region.get_id()} - {line.input_file.input_file_group}]') + ddt = f"{line.input_file.input_file_group}/{line.region.get_id()}" if i != 0: te = TextEquivType( Unicode=get_textequiv_unicode(line.region), conf=get_textequiv_conf(line.region), dataType="other", - dataTypeDetails="ocrd-cis-line-alignment:" + ddt) + dataTypeDetails=f"ocrd-cis-line-alignment:{ddt}") lines[0].region.add_TextEquiv(te) else: self.log.debug("len: %i, i: %i", len(lines[0].region.get_TextEquiv()), i) lines[0].region.get_TextEquiv()[i].set_dataType("other") lines[0].region.get_TextEquiv()[i].set_dataTypeDetails( - "ocrd-cis-line-alignment-master-ocr:" + ddt) + f"ocrd-cis-line-alignment-master-ocr:{ddt}") lines[0].region.get_TextEquiv()[i].set_index(i+1) self.align_words(lines) @@ -113,18 +111,18 @@ def align_words(self, lines): mregion = lines[0].region.get_Word() oregion = [lines[i].region.get_Word() for i in range(1, len(lines))] for word in lines[0].alignment['wordAlignments']: - self.log.debug("aligning word %s", word['master']) + self.log.debug(f"aligning word {word['master']}", ) master, rest = self.find_word([word['master']], mregion, "master") mregion = rest if master is None or len(master) != 1: - self.log.warn("cannot find {}; giving up".format(word['master'])) - # raise Exception("cannot find {}; giving up".format(word['master'])) + self.log.warn(f"cannot find {word['master']}; giving up") + # raise Exception(f"cannot find {word['master']}; giving up") return others = list() for i, other in enumerate(word['alignments']): match, rest = 
self.find_word(other, oregion[i]) if match is None: - self.log.warn("cannot find {}; giving up".format(other)) + self.log.warn(f"cannot find {other}; giving up") return others.append(match) oregion[i] = rest @@ -132,10 +130,7 @@ def align_words(self, lines): words.append( Alignment(lines[0].input_file, master, lines[0].alignment)) for i, other in enumerate(others): - words.append(Alignment( - lines[i+1].input_file, - other, - lines[i+1].alignment)) + words.append(Alignment(lines[i+1].input_file, other, lines[i+1].alignment)) self.align_word_regions(words) def align_word_regions(self, words): @@ -144,10 +139,8 @@ def te0(x): for i, word in enumerate(words): if not word.region: ifg = word.input_file.input_file_group - self.log.debug("(empty) word alignment: [%s]", ifg) - te = TextEquivType( - dataType="other", - dataTypeDetails="ocrd-cis-empty-word-alignment:" + ifg) + self.log.debug(f"(empty) word alignment: [{ifg}]") + te = TextEquivType(dataType="other", dataTypeDetails=f"ocrd-cis-empty-word-alignment:{ifg}") words[0].region[0].add_TextEquiv(te) words[0].region[0].get_TextEquiv()[i].set_index(i+1) continue @@ -157,46 +150,38 @@ def te0(x): ddt = word.input_file.input_file_group + "/" + _id # if conf is none it is most likely ground truth data conf = min([float(te0(x).get_conf() or "1.0") for x in word.region]) - self.log.debug("word alignment: %s [%s - %s]", _str, _id, ifg) + self.log.debug(f"word alignment: {_str} [{_id} - {ifg}]") if i != 0: te = TextEquivType( - Unicode=_str, - conf=conf, - dataType="other", - dataTypeDetails="ocrd-cis-word-alignment:" + ddt) + Unicode=_str, conf=conf, dataType="other", dataTypeDetails=f"ocrd-cis-word-alignment:{ddt}") words[0].region[0].add_TextEquiv(te) else: words[0].region[0].get_TextEquiv()[i].set_dataType("other") - words[0].region[0].get_TextEquiv()[i].set_dataTypeDetails( - "ocrd-cis-word-alignment-master-ocr:" + ddt) + 
words[0].region[0].get_TextEquiv()[i].set_dataTypeDetails(f"ocrd-cis-word-alignment-master-ocr:{ddt}") words[0].region[0].get_TextEquiv()[i].set_index(i+1) def find_word(self, tokens, regions, t="other"): - self.log.debug("tokens = %s [%s]", tokens, t) + tokens_str = f"tokens = {tokens} [{t}]" + self.log.debug(tokens_str) for i, _ in enumerate(regions): n = self.match_tokens(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again with levenshtein - self.log.warn( - "could not find tokens = %s [%s]; trying again", - tokens, t) + self.log.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_lev(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again to match token within another one - self.log.warn( - "could not find tokens = %s [%s]; trying again", - tokens, t) + self.log.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_within(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) - # nothing could be found return tuple([None, regions]) @@ -212,7 +197,7 @@ def match_tokens_lev(self, tokens, regions, i): def f(a, b): k = 3 # int(len(a)/3) d = Levenshtein.distance(a, b) - self.log.debug("lev %s <=> %s: %d (%d)", a, b, d, d) + self.log.debug(f"lev {a} <=> {b}: {d} ({d})") return d <= 1 or d <= k return self.match_tokens_lambda(tokens, regions, i, f) @@ -227,14 +212,15 @@ def match_tokens_lambda(self, tokens, regions, i, f): Returns 0 if nothing could be matched. 
""" for j, token in enumerate(tokens): - if j + i >= len(regions): + sum_i_j = j + i + if sum_i_j >= len(regions): return 0 - if not regions[i+j].get_TextEquiv()[0].Unicode: - self.log.warn("cannot find %s", token) + unicode = regions[sum_i_j].get_TextEquiv()[0].Unicode + if not unicode: + self.log.warn(f"cannot find {token}") return 0 - self.log.debug('checking %s with %s', token, - regions[i+j].get_TextEquiv()[0].Unicode) - if f(token, regions[i+j].get_TextEquiv()[0].Unicode): + self.log.debug(f'checking {token} with {unicode}') + if f(token, unicode): continue if j == 0: return 0 @@ -259,19 +245,18 @@ def zip_input_files(self, ifgs): """Zip files of the given input file groups""" files = list() for ifg in ifgs: - self.log.info("input file group: %s", ifg) + self.log.info(f"input file group: {ifg}") ifiles = sorted( self.workspace.mets.find_files(fileGrp=ifg), key=lambda ifile: ifile.url) for i in ifiles: - self.log.debug("sorted file: %s %s", - os.path.basename(i.url), i.ID) + self.log.debug(f"sorted file: {os.path.basename(i.url)} {i.ID}") ifiles = [FileAlignment(self.workspace, x, ifg) for x in ifiles] files.append(ifiles) return zip(*files) def read_lines_from_input_file(self, ifile): - self.log.info("reading input file: %s", ifile) + self.log.info(f"reading input file: {ifile}") lines = list() pcgts = ifile.open() for region in pcgts.get_Page().get_TextRegion(): @@ -286,7 +271,7 @@ def run_java_aligner(self, ifs): lines = zip(*lines) _input = [x.strip() for t in lines for x in t] for i in _input: - self.log.debug("input line: %s", i) + self.log.debug(f"input line: {i}") n = len(ifs) self.log.debug("starting java client") p = JavaAligner(n, getLevelName(self.log.getEffectiveLevel())) @@ -300,7 +285,7 @@ def __init__(self, workspace, ifile, ifg): self.log = getLogger('cis.FileAlignment') def open(self): - self.log.info("opening: %s", os.path.basename(self.input_file.url)) + self.log.info(f"opening: {os.path.basename(self.input_file.url)}") return 
page_from_file(self.workspace.download_file(self.input_file)) From 0949277dbe049c1cd6776b3c701980c48cf2ebc8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 02:21:34 +0200 Subject: [PATCH 152/194] align: use final v3 API --- ocrd_cis/align/cli.py | 229 ++++++++++++++++-------------------------- 1 file changed, 85 insertions(+), 144 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 7d6599c2..f85b7348 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -1,97 +1,71 @@ from __future__ import absolute_import +from __future__ import annotations + import click import json import os +from typing import Optional, List, Dict, Type + from rapidfuzz.distance import Levenshtein -from ocrd import Processor + +from ocrd import Processor, OcrdPage, OcrdPageResult from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor -from ocrd_utils import MIMETYPE_PAGE from ocrd_utils import getLogger from ocrd_utils import getLevelName -from ocrd_utils import make_file_id -from ocrd_modelfactory import page_from_file -from ocrd_models.ocrd_page import to_xml -from ocrd_models.ocrd_page_generateds import TextEquivType +from ocrd_models.ocrd_page import TextRegionType, TextEquivType from ocrd_cis import JavaAligner -from ocrd_cis import get_ocrd_tool @click.command() @ocrd_cli_options def ocrd_cis_align(*args, **kwargs): - return ocrd_cli_wrap_processor(Aligner, *args, **kwargs) + return ocrd_cli_wrap_processor(CISAligner, *args, **kwargs) -class Aligner(Processor): - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-align'] - kwargs['version'] = ocrd_tool['version'] - super(Aligner, self).__init__(*args, **kwargs) +class CISAligner(Processor): + @property + def executable(self): + return 'ocrd-cis-align' - if hasattr(self, 'workspace'): - self.log = getLogger('cis.Processor.Aligner') - - def process(self): - ifgs = 
self.input_file_grp.split(",") # input file groups - if len(ifgs) < 2: - raise Exception("need at least two input file groups to align") - ifts = self.zip_input_files(ifgs) # input file tuples - for _id, ift in enumerate(ifts): - alignments = json.loads(self.run_java_aligner(ift)) - pcgts = self.align(alignments, ift) - # keep the right part after OCR-D-...-filename - # and prepend output_file_grp - input_file = ift[0].input_file - file_id = make_file_id(input_file, self.output_file_grp) - pcgts.set_pcGtsId(file_id) - out = self.workspace.add_file( - ID=file_id, - file_grp=self.output_file_grp, - pageId=input_file.pageId, - local_filename=os.path.join(self.output_file_grp, file_id + '.xml'), - mimetype=MIMETYPE_PAGE, - content=to_xml(pcgts), - ) - self.log.info('created file %s', out) + def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult: + assert len(input_pcgts) >= 2 + alignments = json.loads(self.run_java_aligner(input_pcgts)) + pcgts = self.align(alignments, input_pcgts) + return OcrdPageResult(pcgts) - def align(self, alignments, ift): + def align(self, alignments: List[Dict], pcgts: List[OcrdPage]) -> OcrdPage: """align the alignment objects with the according input file tuples""" - for t in ift: - self.log.debug(f"tuple {os.path.basename(t.input_file.url)}") - pcgtst = self.open_input_file_tuples(ift) i = 0 - for mi, mr in enumerate(pcgtst[0].get_Page().get_TextRegion()): + file_groups = self.input_file_grp.split(',') + for mi, mr in enumerate(pcgts[0].get_Page().get_AllRegions(classes=['Text'])): for mj, _ in enumerate(mr.get_TextLine()): - for iiii, u in enumerate(mr.get_TextLine()[mj].get_TextEquiv()): - self.log.debug(f"[{iiii}] {u.Unicode}") - for xx in mr.get_TextLine()[mj].get_Word(): - for iiii, u in enumerate(xx.get_TextEquiv()): - self.log.debug(f"[{iiii}] {u.Unicode}") - lines = [] - for ii, t in enumerate(ift): + for ii, page in enumerate(pcgts): if i >= len(alignments): break - tr = 
pcgtst[ii].get_Page().get_TextRegion() + tr = page.get_Page().get_AllRegions(classes=['Text']) region = tr[mi].get_TextLine()[mj] - lines.append(Alignment(t, region, alignments[i])) + lines.append(Alignment(file_groups[ii], page, region, alignments[i])) self.align_lines(lines) i += 1 - return pcgtst[0] + return pcgts[0] - def align_lines(self, lines): + def align_lines(self, lines: List[Alignment]) -> None: """align the given line alignment with the lines""" if not lines: return - if len(lines[0].region.get_TextEquiv()) > 1: - del lines[0].region.get_TextEquiv()[1:] + if len(lines[0].region.TextEquiv) > 1: + del lines[0].region.TextEquiv[1:] for i, line in enumerate(lines): if lines[0].region.get_TextEquiv() is None: lines[0].region.TextEquiv = [] - self.log.debug(f'line alignment: {get_textequiv_unicode(line.region)} ' - f'[{line.region.get_id()} - {line.input_file.input_file_group}]') - ddt = f"{line.input_file.input_file_group}/{line.region.get_id()}" - if i != 0: + self.logger.debug( + 'line alignment: %s [%s - %s]', + get_textequiv_unicode(line.region), + line.region.get_id(), + line.file_grp + ) + ddt = line.file_grp + "/" + line.region.get_id() + if i > 0: te = TextEquivType( Unicode=get_textequiv_unicode(line.region), conf=get_textequiv_conf(line.region), @@ -99,58 +73,64 @@ def align_lines(self, lines): dataTypeDetails=f"ocrd-cis-line-alignment:{ddt}") lines[0].region.add_TextEquiv(te) else: - self.log.debug("len: %i, i: %i", len(lines[0].region.get_TextEquiv()), i) - lines[0].region.get_TextEquiv()[i].set_dataType("other") - lines[0].region.get_TextEquiv()[i].set_dataTypeDetails( - f"ocrd-cis-line-alignment-master-ocr:{ddt}") - lines[0].region.get_TextEquiv()[i].set_index(i+1) + self.logger.debug("len: %i, i: %i", len(lines[0].region.TextEquiv), i) + lines[0].region.TextEquiv[i].set_dataType("other") + lines[0].region.TextEquiv[i].set_dataTypeDetails( + "ocrd-cis-line-alignment-master-ocr:" + ddt) + lines[0].region.TextEquiv[i].set_index(i+1) 
self.align_words(lines) - def align_words(self, lines): - # self.log.info(json.dumps(lines[0].alignment)) + def align_words(self, lines: List[Alignment]) -> None: + # self.logger.info(json.dumps(lines[0].alignment)) mregion = lines[0].region.get_Word() oregion = [lines[i].region.get_Word() for i in range(1, len(lines))] for word in lines[0].alignment['wordAlignments']: - self.log.debug(f"aligning word {word['master']}", ) + self.logger.debug("aligning word %s", word['master']) master, rest = self.find_word([word['master']], mregion, "master") mregion = rest if master is None or len(master) != 1: - self.log.warn(f"cannot find {word['master']}; giving up") - # raise Exception(f"cannot find {word['master']}; giving up") + self.logger.warn("cannot find {}; giving up".format(word['master'])) + # raise Exception("cannot find {}; giving up".format(word['master'])) return others = list() for i, other in enumerate(word['alignments']): match, rest = self.find_word(other, oregion[i]) if match is None: - self.log.warn(f"cannot find {other}; giving up") + self.logger.warn(f"cannot find {other}; giving up") return others.append(match) oregion[i] = rest words = list() words.append( - Alignment(lines[0].input_file, master, lines[0].alignment)) + Alignment(lines[0].file_grp, lines[0].pcgts, master, lines[0].alignment)) for i, other in enumerate(others): - words.append(Alignment(lines[i+1].input_file, other, lines[i+1].alignment)) + words.append(Alignment( + lines[i+1].file_grp, + lines[i+1].pcgts, + other, + lines[i+1].alignment)) self.align_word_regions(words) - def align_word_regions(self, words): + def align_word_regions(self, words: List[Alignment]) -> None: def te0(x): - return x.get_TextEquiv()[0] + return x.TextEquiv[0] for i, word in enumerate(words): if not word.region: - ifg = word.input_file.input_file_group - self.log.debug(f"(empty) word alignment: [{ifg}]") - te = TextEquivType(dataType="other", dataTypeDetails=f"ocrd-cis-empty-word-alignment:{ifg}") + ifg = 
word.file_grp + self.logger.debug("(empty) word alignment: [%s]", ifg) + te = TextEquivType( + dataType="other", + dataTypeDetails="ocrd-cis-empty-word-alignment:" + ifg) words[0].region[0].add_TextEquiv(te) words[0].region[0].get_TextEquiv()[i].set_index(i+1) continue _str = " ".join([te0(x).Unicode for x in word.region]) _id = ",".join([x.get_id() for x in word.region]) - ifg = word.input_file.input_file_group - ddt = word.input_file.input_file_group + "/" + _id + ifg = word.file_grp + ddt = word.file_grp + "/" + _id # if conf is none it is most likely ground truth data conf = min([float(te0(x).get_conf() or "1.0") for x in word.region]) - self.log.debug(f"word alignment: {_str} [{_id} - {ifg}]") + self.logger.debug(f"word alignment: {_str} [{_id} - {ifg}]") if i != 0: te = TextEquivType( Unicode=_str, conf=conf, dataType="other", dataTypeDetails=f"ocrd-cis-word-alignment:{ddt}") @@ -162,21 +142,21 @@ def te0(x): def find_word(self, tokens, regions, t="other"): tokens_str = f"tokens = {tokens} [{t}]" - self.log.debug(tokens_str) + self.logger.debug(tokens_str) for i, _ in enumerate(regions): n = self.match_tokens(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again with levenshtein - self.log.warn(f"could not find {tokens_str}; trying again") + self.logger.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_lev(tokens, regions, i) if n == 0: continue return tuple([regions[i:n], regions[i:]]) # not found try again to match token within another one - self.log.warn(f"could not find {tokens_str}; trying again") + self.logger.warn(f"could not find {tokens_str}; trying again") for i, _ in enumerate(regions): n = self.match_tokens_within(tokens, regions, i) if n == 0: @@ -197,7 +177,7 @@ def match_tokens_lev(self, tokens, regions, i): def f(a, b): k = 3 # int(len(a)/3) d = Levenshtein.distance(a, b) - self.log.debug(f"lev {a} <=> {b}: {d} ({d})") + 
self.logger.debug(f"lev {a} <=> {b}: {d} ({d})") return d <= 1 or d <= k return self.match_tokens_lambda(tokens, regions, i, f) @@ -215,11 +195,11 @@ def match_tokens_lambda(self, tokens, regions, i, f): sum_i_j = j + i if sum_i_j >= len(regions): return 0 - unicode = regions[sum_i_j].get_TextEquiv()[0].Unicode + unicode = regions[sum_i_j].TextEquiv[0].Unicode if not unicode: - self.log.warn(f"cannot find {token}") + self.logger.warn(f"cannot find {token}") return 0 - self.log.debug(f'checking {token} with {unicode}') + self.logger.debug(f'checking {token} with {unicode}') if f(token, unicode): continue if j == 0: @@ -230,68 +210,29 @@ def match_tokens_lambda(self, tokens, regions, i, f): i += 1 return i + len(tokens) - def open_input_file_tuples(self, ift): - """ - opens all xml files of the given input file tuple - and returns them as tuples - """ - res = list() - for ifile in ift: - pcgts = ifile.open() - res.append(pcgts) - return tuple(res) - - def zip_input_files(self, ifgs): - """Zip files of the given input file groups""" - files = list() - for ifg in ifgs: - self.log.info(f"input file group: {ifg}") - ifiles = sorted( - self.workspace.mets.find_files(fileGrp=ifg), - key=lambda ifile: ifile.url) - for i in ifiles: - self.log.debug(f"sorted file: {os.path.basename(i.url)} {i.ID}") - ifiles = [FileAlignment(self.workspace, x, ifg) for x in ifiles] - files.append(ifiles) - return zip(*files) - - def read_lines_from_input_file(self, ifile): - self.log.info(f"reading input file: {ifile}") + def run_java_aligner(self, input_pcgts: List[OcrdPage]) -> str: lines = list() - pcgts = ifile.open() - for region in pcgts.get_Page().get_TextRegion(): - for line in region.get_TextLine(): - lines.append(get_textequiv_unicode(line)) - return lines - - def run_java_aligner(self, ifs): - lines = list() - for ifile in ifs: - lines.append(self.read_lines_from_input_file(ifile)) + for pcgts in input_pcgts: + lines.append([get_textequiv_unicode(line) + for line in 
pcgts.get_Page().get_AllTextLines()]) + # JavaAligner expects a strange input format lines = zip(*lines) _input = [x.strip() for t in lines for x in t] for i in _input: - self.log.debug(f"input line: {i}") - n = len(ifs) - self.log.debug("starting java client") - p = JavaAligner(n, getLevelName(self.log.getEffectiveLevel())) + self.logger.debug("input line: %s", i) + n = len(input_pcgts) + self.logger.debug("starting java client") + p = JavaAligner(n, getLevelName(self.logger.getEffectiveLevel())) return p.run("\n".join(_input)) -class FileAlignment: - def __init__(self, workspace, ifile, ifg): - self.workspace = workspace - self.input_file = ifile - self.input_file_group = ifg - self.log = getLogger('cis.FileAlignment') - - def open(self): - self.log.info(f"opening: {os.path.basename(self.input_file.url)}") - return page_from_file(self.workspace.download_file(self.input_file)) - - class Alignment: - def __init__(self, ifile, region, alignment): - self.input_file = ifile + file_grp: str + pcgts: OcrdPage + region: TextRegionType + alignment: Alignment + def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: Alignment): + self.file_grp = file_grp + self.pcgts = pcgts self.region = region self.alignment = alignment From d4f8483ffdefac50161e4376637b9f8e813c384f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 02:21:58 +0200 Subject: [PATCH 153/194] use ocrd_utils instead of pkg_resources --- ocrd_cis/data/__main__.py | 10 +++++----- ocrd_cis/javaprocess.py | 5 ++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/ocrd_cis/data/__main__.py b/ocrd_cis/data/__main__.py index 3d8ef735..8fdcddd6 100644 --- a/ocrd_cis/data/__main__.py +++ b/ocrd_cis/data/__main__.py @@ -1,18 +1,18 @@ -import pkg_resources import sys +from ocrd_utils import resource_filename def main(): usage = 'usage: ' + sys.argv[0] + ' -jar|-3gs|-model|-config' if '-h' in sys.argv: print(usage) elif '-jar' in sys.argv: - 
print(pkg_resources.resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) + print(resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) elif '-3gs' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/3gs.csv.gz')) + print(resource_filename('ocrd_cis', 'data/3gs.csv.gz')) elif '-model' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/model.zip')) + print(resource_filename('ocrd_cis', 'data/model.zip')) elif '-config' in sys.argv: - print(pkg_resources.resource_filename('ocrd_cis', 'data/config.json')) + print(resource_filename('ocrd_cis', 'data/config.json')) else: raise ValueError(usage) diff --git a/ocrd_cis/javaprocess.py b/ocrd_cis/javaprocess.py index ce2f6bfd..72915d68 100644 --- a/ocrd_cis/javaprocess.py +++ b/ocrd_cis/javaprocess.py @@ -1,12 +1,11 @@ import subprocess import json -import pkg_resources -from ocrd_utils import getLogger +from ocrd_utils import getLogger, resource_filename from pathlib import Path MAIN = "de.lmu.cis.ocrd.cli.Main" -JAR = pkg_resources.resource_filename('ocrd_cis', 'data/ocrd-cis.jar') +JAR = str(resource_filename('ocrd_cis', 'data/ocrd-cis.jar')) def JavaAligner(n, loglvl): """Create a java process that calls -c align -D '{"n":n}'""" From ecc44c0358354c0c3c3ba6000e7de7413dc9cef1 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 22 Aug 2024 13:31:09 +0200 Subject: [PATCH 154/194] postcorrect: use final v3 API --- ocrd_cis/align/cli.py | 1 + ocrd_cis/postcorrect/cli.py | 22 ++++++++++------------ 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index f85b7348..f5e47785 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -16,6 +16,7 @@ from ocrd_models.ocrd_page import TextRegionType, TextEquivType from ocrd_cis import JavaAligner + @click.command() @ocrd_cli_options def ocrd_cis_align(*args, **kwargs): diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index dc3ee48e..71fbaad1 100644 --- 
a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -1,14 +1,15 @@ from __future__ import absolute_import +import os + import click import json -import os + from ocrd import Processor -from ocrd.decorators import ocrd_cli_options -from ocrd.decorators import ocrd_cli_wrap_processor +from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import getLogger, getLevelName from ocrd_models.ocrd_mets import OcrdMets from ocrd_cis import JavaPostCorrector -from ocrd_cis import get_ocrd_tool + @click.command() @ocrd_cli_options @@ -16,26 +17,23 @@ def ocrd_cis_postcorrect(*args, **kwargs): return ocrd_cli_wrap_processor(PostCorrector, *args, **kwargs) class PostCorrector(Processor): - def __init__(self, *args, **kwargs): - ocrd_tool = get_ocrd_tool() - kwargs['ocrd_tool'] = ocrd_tool['tools']['ocrd-cis-postcorrect'] - kwargs['version'] = ocrd_tool['version'] - super(PostCorrector, self).__init__(*args, **kwargs) + @property + def executable(self): + return 'ocrd-cis-postcorrect' def process(self): - self.log = getLogger('processor.CISPostCorrector') profiler = {} profiler["path"] = self.parameter["profilerPath"] profiler["config"] = self.parameter["profilerConfig"] profiler["noCache"] = True self.parameter["profiler"] = profiler self.parameter["runDM"] = True - self.log.debug(json.dumps(self.parameter, indent=4)) + self.logger.debug(json.dumps(self.parameter, indent=4)) p = JavaPostCorrector(self.workspace.mets_target, self.input_file_grp, self.output_file_grp, self.parameter, - getLevelName(self.log.getEffectiveLevel())) + getLevelName(self.logger.getEffectiveLevel())) p.exe() # reload the mets file to prevent run_processor's save_mets # from overriding the results from the Java process From 2b310b4690b1a83be75cd93432ea38be7250ee35 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 23 Aug 2024 13:51:07 +0200 Subject: [PATCH 155/194] revert: ocropy.ocrolib changes --- ocrd_cis/ocropy/ocrolib/morph.py | 18 
++++++++---------- ocrd_cis/ocropy/ocrolib/toplevel.py | 26 ++++++++++++++++++-------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index b9619cca..f7ccdc31 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -292,9 +292,8 @@ def propagate_labels_majority(image,labels): with the largest overlap.""" rlabels,_ = label(image) cors = correspondences(rlabels,labels) - amax_rlabels = amax(rlabels) + 1 - outputs = zeros(amax_rlabels,'i') - counts = zeros(amax_rlabels,'i') + outputs = zeros(amax(rlabels)+1,'i') + counts = zeros(amax(rlabels)+1,'i') for rlabel, label_, count in cors.T: if not rlabel or not label_: # ignore background correspondences @@ -348,13 +347,12 @@ def all_neighbors(image, dist=1, bg=NaN): """Given an image with labels, find all pairs of labels that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``.""" q = 100000 - assert amax(image) < q - assert amin(image) >= 0 - q_image = q * image - u = unique(q_image + shift(image, (dist, 0), order=0, cval=bg)) - d = unique(q_image + shift(image, (-dist, 0), order=0, cval=bg)) - l = unique(q_image + shift(image, (0, dist), order=0, cval=bg)) - r = unique(q_image + shift(image, (0, -dist), order=0, cval=bg)) + assert amax(image)=0 + u = unique(q*image+shift(image, (dist, 0), order=0, cval=bg)) + d = unique(q*image+shift(image, (-dist, 0), order=0, cval=bg)) + l = unique(q*image+shift(image, (0, dist), order=0, cval=bg)) + r = unique(q*image+shift(image, (0, -dist), order=0, cval=bg)) all = unique(r_[u,d,l,r]) all = all[all!=bg] all = c_[all//q,all%q] diff --git a/ocrd_cis/ocropy/ocrolib/toplevel.py b/ocrd_cis/ocropy/ocrolib/toplevel.py index 72e397af..87ed18c5 100644 --- a/ocrd_cis/ocropy/ocrolib/toplevel.py +++ b/ocrd_cis/ocropy/ocrolib/toplevel.py @@ -125,10 +125,14 @@ def __init__(self,*args,**kw): self.fun = kw.get("fun","?") self.var = kw.get("var","?") 
self.description = " ".join([strc(x) for x in args]) - def __str__(self): - return f"\nCheckError for argument {str(self.var)} of function {str(self.fun)}\n{self.description}" - + result = "\nCheckError for argument " + result += str(self.var) + result += " of function " + result += str(self.fun) + result += "\n" + result += self.description + return result class CheckWarning(CheckError): def __init__(self,*args,**kw): @@ -138,8 +142,14 @@ def __init__(self,*args,**kw): CheckError.__init__(self, *args, **kw) def __str__(self): - return (f"\nCheckWarning for argument {str(self.var)} of function {str(self.fun)}\n{self.description} " - f"(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n") + result = "\nCheckWarning for argument " + result += str(self.var) + result += " of function " + result += str(self.fun) + result += "\n" + result += self.description + result += "(This can happen occasionally during normal operations and isn't necessarily a bug or problem.)\n" + return result def checktype(value,type_): """Check value against the type spec. 
If everything @@ -201,7 +211,7 @@ def argument_checks(*args,**kw): e.var = var raise e except: - LOG.critical(f"unknown exception while checking function: '{name}'") + LOG.critical("unknown exception while checking function: '%s'", name) raise result = f(*args,**kw) checktype(result,kw.get("_",True)) @@ -215,9 +225,9 @@ def decorator(f): def wrapper(arg): if not f(arg): if warning: - raise CheckWarning(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") + raise CheckWarning(strc(arg)+" of type "+str(type(arg))+": "+str(message)) else: - raise CheckError(f"{strc(arg)} of type {str(type(arg))}: {str(message)}") + raise CheckError(strc(arg)+" of type "+str(type(arg))+": "+str(message)) return wrapper return decorator From 4420c6fa246c81f1fc7c14e7a1cb6dc1d2460e5f Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 23 Aug 2024 15:06:41 +0200 Subject: [PATCH 156/194] revert: ocropy.common changes --- ocrd_cis/ocropy/common.py | 186 +++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index a5806517..c23e89b9 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -184,19 +184,16 @@ def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90): d0, d1 = flat.shape o0, o1 = int(bignore * d0), int(bignore * d1) est = flat[o0:d0 - o0, o1:d1 - o1] - if escale > 0: # by default, we use only regions that contain # significant variance; this makes the percentile # based low and high estimates more reliable e = escale - e_20_0 = e * 20.0 - e_50 = int(e * 50) - v = est - filters.gaussian_filter(est, e_20_0) - v = filters.gaussian_filter(v ** 2, e_20_0) ** 0.5 + v = est - filters.gaussian_filter(est, e*20.0) + v = filters.gaussian_filter(v ** 2, e*20.0) ** 0.5 v = (v > 0.3 * np.amax(v)) - v = morphology.binary_dilation(v, structure=np.ones((e_50, 1))) - v = morphology.binary_dilation(v, structure=np.ones((1, e_50))) + v = 
morphology.binary_dilation(v, structure=np.ones((int(e * 50), 1))) + v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50)))) est = est[v] lo = stats.scoreatpercentile(est.ravel(), lo) hi = stats.scoreatpercentile(est.ravel(), hi) @@ -313,24 +310,24 @@ def check_line(binary, zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape) == 0: return "image dimensions are zero" - if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" + if np.prod(binary.shape)==0: return "image dimensions are zero" + if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)200/zoom: return f"image too tall for a text line {binary.shape}" + if h<20/zoom: return "image not tall enough for a text line %s"%(binary.shape,) + if h>200/zoom: return "image too tall for a text line %s"%(binary.shape,) ##if w<1.5*h: return "line too short %s"%(binary.shape,) - if w<1.5*h and w<32/zoom: return f"image too short for a line image {binary.shape}" - if w>4000/zoom: return f"image too long for a line image {binary.shape}" + if w<1.5*h and w<32/zoom: return "image too short for a line image %s"%(binary.shape,) + if w>4000/zoom: return "image too long for a line image %s"%(binary.shape,) return None ratio = w*1.0/h _, ncomps = measurements.label(binary) lo = int(0.5*ratio+0.5) hi = int(4*ratio)+1 - if ncomps={lo})" - ##if ncomps>hi*ratio: return f"too many connected components (got {ncomps}, wanted <={hi})" - if ncomps>hi*ratio and ncomps>10: return f"too many connected components (got {ncomps}, wanted <={hi})" + if ncomps=%d)"%(ncomps,lo) + ##if ncomps>hi*ratio: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) + if ncomps>hi*ratio and ncomps>10: return "too many connected components (got %d, wanted <=%d)"%(ncomps,hi) return None # inspired by ocropus-gpageseg check_page @@ -344,21 +341,21 @@ def check_region(binary, 
zoom=1.0): Returns an error report, or None if valid. """ - if np.prod(binary.shape) == 0: return "image dimensions are zero" - if len(binary.shape) == 3: return f"image is not monochrome {binary.shape}" + if np.prod(binary.shape)==0: return "image dimensions are zero" + if len(binary.shape)==3: return "image is not monochrome %s"%(binary.shape,) if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)5000/zoom: return f"image too tall for a region image {binary.shape}" - if w<100/zoom: return f"image too narrow for a region image {binary.shape}" - if w>5000/zoom: return f"image too wide for a region image {binary.shape}" + if h<45/zoom: return "image not tall enough for a region image %s"%(binary.shape,) + if h>5000/zoom: return "image too tall for a region image %s"%(binary.shape,) + if w<100/zoom: return "image too narrow for a region image %s"%(binary.shape,) + if w>5000/zoom: return "image too wide for a region image %s"%(binary.shape,) return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<5: return f"too few connected components for a region image (got {ncomps})" - if ncomps>slots and ncomps>10: return f"too many connected components for a region image ({ncomps} > {slots})" + if ncomps<5: return "too few connected components for a region image (got %d)"%(ncomps,) + if ncomps>slots and ncomps>10: return "too many connected components for a region image (%d > %d)"%(ncomps,slots) return None # from ocropus-gpageseg, but with zoom parameter @@ -372,21 +369,21 @@ def check_page(binary, zoom=1.0): Returns an error report, or None if valid. 
""" - if np.prod(binary.shape) == 0: return "image dimensions are zero" - if len(binary.shape) == 3: return f"image not monochrome {binary.shape}" + if np.prod(binary.shape)==0: return "image dimensions are zero" + if len(binary.shape)==3: return "image not monochrome %s"%(binary.shape,) if np.amax(binary)==np.amin(binary): return "image is blank" if np.mean(binary)20000/zoom: return f"image too tall for a page image {binary.shape}" - if w<600/zoom: return f"image too narrow for a page image {binary.shape}" - if w>20000/zoom: return f"image too wide for a page image {binary.shape}" + if h<600/zoom: return "image not tall enough for a page image %s"%(binary.shape,) + if h>20000/zoom: return "image too tall for a page image %s"%(binary.shape,) + if w<600/zoom: return "image too narrow for a page image %s"%(binary.shape,) + if w>20000/zoom: return "image too wide for a page image %s"%(binary.shape,) return None # zoom factor (DPI relative) and 4 (against fragmentation from binarization) slots = int(w*h*1.0/(30*30)*zoom*zoom) * 4 _,ncomps = measurements.label(binary) - if ncomps<10: return f"too few connected components for a page image (got {ncomps})" - if ncomps>slots and ncomps>10: return f"too many connected components for a page image ({ncomps} > {slots})" + if ncomps<10: return "too few connected components for a page image (got %d)"%(ncomps,) + if ncomps>slots and ncomps>10: return "too many connected components for a page image (%d > %d)"%(ncomps,slots) return None def odd(num): @@ -479,13 +476,8 @@ def compute_images(binary, scale, maximages=5): #images = morph.rb_closing(images, (d0,d1)) #DSAVE('images1_closed', images+0.6*binary) # 1- filter largest connected components - binary_0_6 = 0.6 * binary - odd_scale = odd(scale) - odd_half_scale = odd(scale / 2) - odd_doubled_scale = odd(2 * scale) - region_min = (4 * scale) ** 2 - images = morph.select_regions(images, sl.area, min=region_min, nbest=2 * maximages) - DSAVE('images1_large', images + binary_0_6) + 
images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=2*maximages) + DSAVE('images1_large', images+0.6*binary) if not images.any(): return np.zeros_like(binary, int) # 2- open horizontally and vertically to suppress @@ -494,31 +486,31 @@ def compute_images(binary, scale, maximages=5): # single frame, because then the hull polygon # can cover/overlap large text/table parts which # we cannot discern from the actual image anymore - h_opened = morph.rb_opening(images, (1, odd_half_scale)) - DSAVE('images2_h-opened', h_opened + binary_0_6) - v_opened = morph.rb_opening(images, (odd_half_scale, 1)) - DSAVE('images2_v-opened', v_opened + binary_0_6) + h_opened = morph.rb_opening(images, (1, odd(scale/2))) + DSAVE('images2_h-opened', h_opened+0.6*binary) + v_opened = morph.rb_opening(images, (odd(scale/2), 1)) + DSAVE('images2_v-opened', v_opened+0.6*binary) # 3- close whatever remains - closed = morph.rb_closing(h_opened&v_opened, (odd_doubled_scale, odd_doubled_scale)) - DSAVE('images3_closed', closed + binary_0_6) + closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale), odd(2*scale))) + DSAVE('images3_closed', closed + 0.6*binary) # 4- reconstruct the losses up to a certain distance # to avoid creeping into pure h/v-lines again but still # cover most of the large object #images = np.where(images, closed, 2) #images = morph.spread_labels(images, maxdist=scale) % 2 | closed images = morph.rb_reconstruction(closed, images, step=2, maxsteps=scale) - DSAVE('images4_reconstructed', images + binary_0_6) + DSAVE('images4_reconstructed', images+0.6*binary) # 5- select nbest - images = morph.select_regions(images, sl.area, min=region_min, nbest=maximages) - DSAVE('images5_selected', images + binary_0_6) + images = morph.select_regions(images, sl.area, min=(4*scale)**2, nbest=maximages) + DSAVE('images5_selected', images+0.6*binary) if not images.any(): return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps - dilated = 
morph.r_dilation(images, (odd_scale, odd_scale)) + dilated = morph.r_dilation(images, (odd(scale), odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 images, _ = morph.label(images) - DSAVE('images6_dilated', images + binary_0_6) + DSAVE('images6_dilated', images+0.6*binary) # we could repeat reconstruct-dilate here... return images @@ -556,7 +548,6 @@ def compute_seplines(binary, scale, maxseps=0): sepsizes = [0] sepslices = [None] sepdists = [0] - doubled_scale = 2 * scale for label in range(1, nlabels + 1): labelslice = slices[label] labelmask = labels == label @@ -608,8 +599,8 @@ def compute_seplines(binary, scale, maxseps=0): binmask = sublabels == bin + 1 binlabels, nbinlabels = morph.label(binmask) _, binlabelcounts = np.unique(binlabels, return_counts=True) - largemask = (binlabelcounts > doubled_scale)[binlabels] - smallmask = (binlabelcounts <= doubled_scale)[binlabels] + largemask = (binlabelcounts > 2 * scale)[binlabels] + smallmask = (binlabelcounts <= 2 * scale)[binlabels] sublabels2[binmask & smallmask] = 1 if not np.any(binmask & largemask): continue @@ -1852,13 +1843,11 @@ def find_topological(): else: llab[box] = lbinary[box] # show projection at the sides - log_y = -10 * np.log(y + 1e-9) - log_x = -10 * np.log(x + 1e-9) - for i in range(int(scale / 2)): - llab[box[0], box[1].start + i] = log_y - llab[box[0], box[1].stop - 1 - i] = log_y - llab[box[0].start + i, box[1]] = log_x - llab[box[0].stop - 1 - i, box[1]] = log_x + for i in range(int(scale/2)): + llab[box[0],box[1].start+i] = -10*np.log(y+1e-9) + llab[box[0],box[1].stop-1-i] = -10*np.log(y+1e-9) + llab[box[0].start+i,box[1]] = -10*np.log(x+1e-9) + llab[box[0].stop-1-i,box[1]] = -10*np.log(x+1e-9) DSAVE('recursive_x_y_cut_' + (partition_type or 'sliced'), llab) gap_weights = list() for is_horizontal, profile in enumerate([y, x]): @@ -1888,19 +1877,19 @@ def find_topological(): weights = weights * (1 + 0.5 * 
props['peak_heights']/gap_height) gap_weights.append((gaps, weights)) if debug: - orientation = 'horizontal' if is_horizontal else 'vertical' - LOG.debug(f' {orientation} gaps {gaps} {props} weights {weights}') + LOG.debug(' {} gaps {} {} weights {}'.format( + 'horizontal' if is_horizontal else 'vertical', + gaps, props, weights)) if not gaps.shape[0]: continue - half_scale = int(scale / 2) for start, stop, height in sorted(zip( props['left_ips'].astype(int), props['right_ips'].astype(int), props['peak_heights']), key=lambda x: x[2]): if is_horizontal: - llab[box[0].start+half_scale:box[0].stop-half_scale,box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) + llab[box[0].start+int(scale/2):box[0].stop-int(scale/2),box[1].start+start:box[1].start+stop] = -10*np.log(-height+1e-9) else: - llab[box[0].start+start:box[0].start+stop,box[1].start+half_scale:box[1].stop-half_scale] = -10*np.log(-height+1e-9) + llab[box[0].start+start:box[0].start+stop,box[1].start+int(scale/2):box[1].stop-int(scale/2)] = -10*np.log(-height+1e-9) DSAVE('recursive_x_y_cut_gaps_' + ('h' if is_horizontal else 'v'), llab) # heuristic (not strict) decision on x or y cut, # factors to consider: @@ -1927,27 +1916,32 @@ def find_topological(): # are not allowed y_gaps, y_weights = gap_weights[0][0], gap_weights[0][1] x_gaps, x_weights = gap_weights[1][0], gap_weights[1][1] - if debug: LOG.debug(f' all y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' all y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) # suppress cuts that significantly split any line labels - min_line_scale = min_line * scale y_allowed = [not(np.any(np.intersect1d( # significant line labels above - np.nonzero(np.bincount(lbin[:gap,:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + np.nonzero(np.bincount(lbin[:gap,:].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], # significant line labels below - np.nonzero(np.bincount(lbin[gap:,:].flatten(), minlength=len(objects))[1:] > 
min_line_scale)[0], - assume_unique=True))) for gap in y_gaps] + np.nonzero(np.bincount(lbin[gap:,:].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], + assume_unique=True))) + for gap in y_gaps] x_allowed = [not(np.any(np.intersect1d( # significant line labels left - np.nonzero(np.bincount(lbin[:,:gap].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], + np.nonzero(np.bincount(lbin[:,:gap].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], # significant line labels right - np.nonzero(np.bincount(lbin[:,gap:].flatten(), minlength=len(objects))[1:] > min_line_scale)[0], - assume_unique=True))) for gap in x_gaps] + np.nonzero(np.bincount(lbin[:,gap:].flatten(), + minlength=len(objects))[1:] > min_line * scale)[0], + assume_unique=True))) + for gap in x_gaps] y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' allowed y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' allowed y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) y_prominence = np.amax(y_weights, initial=0) x_prominence = np.amax(x_weights, initial=0) - if debug: LOG.debug(f' y_prominence {y_prominence} x_prominence {x_prominence}') + if debug: LOG.debug(' y_prominence {} x_prominence {}'.format(y_prominence, x_prominence)) # suppress less prominent peaks (another heuristic...) 
# they must compete with the other direction next time # (when already new cuts or partitions will become visible) @@ -1955,30 +1949,33 @@ def find_topological(): x_allowed = x_weights > 0.8 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) if npartitions > 0: # TODO this can be avoided when backtracking below # suppress peaks creating fewer partitions than others -- # how large in our preferred direction will the new partitions # of sepmask in both slices created by each cut candidate # add up? - y_partitionscores = [sum(map( - sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label(partitions[:gap, :] > 0)[0]) + - morph.find_objects(morph.label(partitions[gap:, :] > 0)[0]))) - for gap in y_gaps] - x_partitionscores = [sum(map( - sl.height if prefer_vertical else sl.width, - morph.find_objects(morph.label(partitions[:, : gap] > 0)[0]) + - morph.find_objects(morph.label(partitions[:, gap :] > 0)[0]))) - for gap in x_gaps] - if debug: LOG.debug(f' y_partitionscores {y_partitionscores} x_partitionscores {x_partitionscores}') + y_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label( + partitions[:gap,:]>0)[0]) + + morph.find_objects(morph.label( + partitions[gap:,:]>0)[0]))) + for gap in y_gaps] + x_partitionscores = [sum(map(sl.height if prefer_vertical else sl.width, + morph.find_objects(morph.label( + partitions[:,:gap]>0)[0]) + + morph.find_objects(morph.label( + partitions[:,gap:]>0)[0]))) + for gap in x_gaps] + if debug: LOG.debug(' y_partitionscores {} x_partitionscores {}'.format( + y_partitionscores, x_partitionscores)) # Now identify those gaps with the largest overall score y_allowed = y_partitionscores == np.max(y_partitionscores, initial=0) x_allowed = 
x_partitionscores == np.max(x_partitionscores, initial=0) y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' most partitioning y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' most partitioning y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) else: y_partitionscores = None x_partitionscores = None @@ -1989,7 +1986,7 @@ def find_topological(): x_allowed = x_weights > 0.9 * x_prominence y_gaps, y_weights = y_gaps[y_allowed], y_weights[y_allowed] x_gaps, x_weights = x_gaps[x_allowed], x_weights[x_allowed] - if debug: LOG.debug(f' prominent y_gaps {y_gaps} x_gaps {x_gaps}') + if debug: LOG.debug(' prominent y_gaps {} x_gaps {}'.format(y_gaps, x_gaps)) # decide which direction, x or y # TODO: this most likely needs a backtracking mechanism @@ -2055,7 +2052,7 @@ def find_topological(): llab2[box] = partitions DSAVE('recursive_x_y_cut_partitions', llab2) for label in range(1, npartitions+1): - LOG.debug(f'next partition %d on %s', label, box) + LOG.debug('next partition %d on %s', label, box) recursive_x_y_cut(box, mask=partitions==label, partition_type=new_partition_type) return @@ -2063,9 +2060,10 @@ def find_topological(): # no gaps left finalize() return - orientation = 'vertical' if choose_vertical else 'horizontal' # otherwise: cut on gaps - LOG.debug(f'cutting {orientation}ly on {box} into {gaps}') + LOG.debug('cutting %s on %s into %s', 'vertically' + if choose_vertical else 'horizontally', + box, gaps) cuts = list(zip(np.insert(gaps, 0, 0), np.append(gaps, lim))) if choose_vertical: if rl: @@ -2080,7 +2078,9 @@ def find_topological(): sub = sl.box(0, len(y), start, stop) else: # "cut in horizontal direction" sub = sl.box(start, stop, 0, len(x)) - LOG.debug(f'next {orientation} block on {box} is {sub}') + LOG.debug('next %s block on %s is %s', 'horizontal' + if choose_vertical else 'vertical', + box, sub) recursive_x_y_cut(sl.compose(box,sub), mask=sl.cut(mask,sub) 
if isinstance(mask, np.ndarray) else None) From 2d8650ed51f5e9cc627d95ae5aea217b9f7bacb6 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Fri, 23 Aug 2024 15:15:50 +0200 Subject: [PATCH 157/194] remove whitespaces in ocropy.common and ocropy.ocrolib --- ocrd_cis/ocropy/common.py | 18 +++++++++--------- ocrd_cis/ocropy/ocrolib/morph.py | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c23e89b9..c5b56ed0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -189,8 +189,8 @@ def estimate_thresholds(flat, bignore=0.1, escale=1.0, lo=5, hi=90): # significant variance; this makes the percentile # based low and high estimates more reliable e = escale - v = est - filters.gaussian_filter(est, e*20.0) - v = filters.gaussian_filter(v ** 2, e*20.0) ** 0.5 + v = est - filters.gaussian_filter(est, e * 20.0) + v = filters.gaussian_filter(v ** 2, e * 20.0) ** 0.5 v = (v > 0.3 * np.amax(v)) v = morphology.binary_dilation(v, structure=np.ones((int(e * 50), 1))) v = morphology.binary_dilation(v, structure=np.ones((1, int(e * 50)))) @@ -491,8 +491,8 @@ def compute_images(binary, scale, maximages=5): v_opened = morph.rb_opening(images, (odd(scale/2), 1)) DSAVE('images2_v-opened', v_opened+0.6*binary) # 3- close whatever remains - closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale), odd(2*scale))) - DSAVE('images3_closed', closed + 0.6*binary) + closed = morph.rb_closing(h_opened&v_opened, (odd(2*scale),odd(2*scale))) + DSAVE('images3_closed', closed+0.6*binary) # 4- reconstruct the losses up to a certain distance # to avoid creeping into pure h/v-lines again but still # cover most of the large object @@ -501,12 +501,12 @@ def compute_images(binary, scale, maximages=5): images = morph.rb_reconstruction(closed, images, step=2, maxsteps=scale) DSAVE('images4_reconstructed', images+0.6*binary) # 5- select nbest - images = morph.select_regions(images, sl.area, 
min=(4*scale)**2, nbest=maximages) + images = morph.select_regions(images,sl.area,min=(4*scale)**2,nbest=maximages) DSAVE('images5_selected', images+0.6*binary) if not images.any(): return np.zeros_like(binary, int) # 6- dilate a little to get a smooth contour without gaps - dilated = morph.r_dilation(images, (odd(scale), odd(scale))) + dilated = morph.r_dilation(images, (odd(scale),odd(scale))) images = morph.propagate_labels_majority(binary, dilated+1) images = morph.spread_labels(images, maxdist=scale)==2 images, _ = morph.label(images) @@ -1969,7 +1969,7 @@ def find_topological(): partitions[:,gap:]>0)[0]))) for gap in x_gaps] if debug: LOG.debug(' y_partitionscores {} x_partitionscores {}'.format( - y_partitionscores, x_partitionscores)) + y_partitionscores, x_partitionscores)) # Now identify those gaps with the largest overall score y_allowed = y_partitionscores == np.max(y_partitionscores, initial=0) x_allowed = x_partitionscores == np.max(x_partitionscores, initial=0) @@ -2062,7 +2062,7 @@ def find_topological(): return # otherwise: cut on gaps LOG.debug('cutting %s on %s into %s', 'vertically' - if choose_vertical else 'horizontally', + if choose_vertical else 'horizontally', box, gaps) cuts = list(zip(np.insert(gaps, 0, 0), np.append(gaps, lim))) if choose_vertical: @@ -2079,7 +2079,7 @@ def find_topological(): else: # "cut in horizontal direction" sub = sl.box(start, stop, 0, len(x)) LOG.debug('next %s block on %s is %s', 'horizontal' - if choose_vertical else 'vertical', + if choose_vertical else 'vertical', box, sub) recursive_x_y_cut(sl.compose(box,sub), mask=sl.cut(mask,sub) if isinstance(mask, np.ndarray) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index f7ccdc31..7d6ffc85 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -349,10 +349,10 @@ def all_neighbors(image, dist=1, bg=NaN): q = 100000 assert amax(image)=0 - u = unique(q*image+shift(image, (dist, 0), order=0, cval=bg)) 
- d = unique(q*image+shift(image, (-dist, 0), order=0, cval=bg)) - l = unique(q*image+shift(image, (0, dist), order=0, cval=bg)) - r = unique(q*image+shift(image, (0, -dist), order=0, cval=bg)) + u = unique(q*image+shift(image,(dist,0),order=0,cval=bg)) + d = unique(q*image+shift(image,(-dist,0),order=0,cval=bg)) + l = unique(q*image+shift(image,(0,dist),order=0,cval=bg)) + r = unique(q*image+shift(image,(0,-dist),order=0,cval=bg)) all = unique(r_[u,d,l,r]) all = all[all!=bg] all = c_[all//q,all%q] From 9a153b079a3684bf875b306ba8eaad9e1637eeed Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 02:01:36 +0200 Subject: [PATCH 158/194] postcorrect: adapt to frozendict Processor.parameter in v3 --- ocrd_cis/__init__.py | 1 - ocrd_cis/align/cli.py | 1 - ocrd_cis/ocrd_tool.py | 6 ---- ocrd_cis/ocropy/binarize.py | 6 +--- ocrd_cis/ocropy/clip.py | 9 +----- ocrd_cis/ocropy/denoise.py | 8 +----- ocrd_cis/ocropy/deskew.py | 8 +----- ocrd_cis/ocropy/dewarp.py | 4 --- ocrd_cis/ocropy/recognize.py | 7 ++--- ocrd_cis/ocropy/resegment.py | 9 +----- ocrd_cis/ocropy/segment.py | 47 +++++++++++++++--------------- ocrd_cis/ocropy/train.py | 7 +---- ocrd_cis/postcorrect/cli.py | 55 +++++++++++++++++++++--------------- 13 files changed, 63 insertions(+), 105 deletions(-) delete mode 100644 ocrd_cis/ocrd_tool.py diff --git a/ocrd_cis/__init__.py b/ocrd_cis/__init__.py index 6f37f4f7..9d22fe3e 100644 --- a/ocrd_cis/__init__.py +++ b/ocrd_cis/__init__.py @@ -1,3 +1,2 @@ from .javaprocess import JavaAligner from .javaprocess import JavaPostCorrector -from .ocrd_tool import get_ocrd_tool diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index f5e47785..5706461e 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -11,7 +11,6 @@ from ocrd import Processor, OcrdPage, OcrdPageResult from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor -from ocrd_utils import getLogger from ocrd_utils import getLevelName from 
ocrd_models.ocrd_page import TextRegionType, TextEquivType from ocrd_cis import JavaAligner diff --git a/ocrd_cis/ocrd_tool.py b/ocrd_cis/ocrd_tool.py deleted file mode 100644 index 36cb9d7e..00000000 --- a/ocrd_cis/ocrd_tool.py +++ /dev/null @@ -1,6 +0,0 @@ -import json -from ocrd_utils import resource_string - - -def get_ocrd_tool(): - return json.loads(resource_string(__name__, 'ocrd-tool.json')) diff --git a/ocrd_cis/ocropy/binarize.py b/ocrd_cis/ocropy/binarize.py index 35b28c5a..9a55301d 100644 --- a/ocrd_cis/ocropy/binarize.py +++ b/ocrd_cis/ocropy/binarize.py @@ -8,8 +8,7 @@ from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from . import common from .common import array2pil, determine_zoom, pil2array, remove_noise @@ -51,14 +50,11 @@ def binarize(logger: Logger, pil_image, method='ocropy', maxskew=2, threshold=0. 
return Image.fromarray(th), 0 class OcropyBinarize(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-binarize' def setup(self): - self.logger = getLogger('processor.OcropyBinarize') method = self.parameter['method'] if self.parameter['grayscale'] and method != 'ocropy': self.logger.critical(f'Requested method {method} does not support grayscale normalized output') diff --git a/ocrd_cis/ocropy/clip.py b/ocrd_cis/ocropy/clip.py index b81c731c..18a0c115 100644 --- a/ocrd_cis/ocropy/clip.py +++ b/ocrd_cis/ocropy/clip.py @@ -8,13 +8,11 @@ from shapely.prepared import prep from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from ocrd_utils import ( bbox_from_polygon, coordinates_of_segment, crop_image, - getLogger, image_from_polygon, polygon_from_points, polygon_mask, @@ -25,15 +23,10 @@ class OcropyClip(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-clip' - def setup(self): - self.logger = getLogger('processor.OcropyClip') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: str = None) -> OcrdPageResult: """Clip text regions / lines of a page at intersections with neighbours. 
diff --git a/ocrd_cis/ocropy/denoise.py b/ocrd_cis/ocropy/denoise.py index eb3e7d23..eaed74df 100644 --- a/ocrd_cis/ocropy/denoise.py +++ b/ocrd_cis/ocropy/denoise.py @@ -4,21 +4,15 @@ from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from .common import determine_zoom, remove_noise class OcropyDenoise(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-denoise' - def setup(self): - self.logger = getLogger('processor.OcropyDenoise') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Despeckle the pages / regions / lines of the workspace. diff --git a/ocrd_cis/ocropy/deskew.py b/ocrd_cis/ocropy/deskew.py index 7bdbba2d..b02c69d5 100644 --- a/ocrd_cis/ocropy/deskew.py +++ b/ocrd_cis/ocropy/deskew.py @@ -4,8 +4,7 @@ from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage, PageType -from ocrd import Processor -from ocrd.processor import OcrdPageResult, OcrdPageResultImage +from ocrd import Processor, OcrdPageResult, OcrdPageResultImage from . import common from .common import pil2array @@ -16,15 +15,10 @@ def deskew(pil_image, maxskew=2): return angle class OcropyDeskew(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-deskew' - def setup(self): - self.logger = getLogger('processor.OcropyDeskew') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Deskew the pages or regions of the workspace. 
diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index 17d0b4ce..e33ce024 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -5,7 +5,6 @@ from ocrd import Processor from ocrd.processor import OcrdPageResult, OcrdPageResultImage -from ocrd_utils import getLogger from ocrd_models.ocrd_page import AlternativeImageType, OcrdPage from .ocrolib import lineest @@ -54,14 +53,11 @@ def padvert(image, range_): return array2pil(line) class OcropyDewarp(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-dewarp' def setup(self): - self.logger = getLogger('processor.OcropyDewarp') # defaults from ocrolib.lineest: self.lnorm = lineest.CenterNormalizer( params=(self.parameter['range'], diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 02d29e7c..85a76585 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -10,10 +10,9 @@ from rapidfuzz.distance import Levenshtein -from ocrd_utils import coordinates_for_segment, getLogger, points_from_polygon, polygon_from_bbox +from ocrd_utils import coordinates_for_segment, points_from_polygon, polygon_from_bbox from ocrd_models.ocrd_page import CoordsType, GlyphType, OcrdPage, TextEquivType, WordType -from ocrd import Processor -from ocrd.processor import OcrdPageResult +from ocrd import Processor, OcrdPageResult from .common import check_line, pil2array from .ocrolib import lstm, load_object, midrange @@ -67,7 +66,6 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): - logger: Logger network: Any pad: int @@ -76,7 +74,6 @@ def executable(self): return 'ocrd-cis-ocropy-recognize' def setup(self): - self.logger = getLogger('processor.OcropyRecognize') self.pad = 16 # from ocropus-rpred: self.network = load_object(self.get_model(), verbose=1) diff --git a/ocrd_cis/ocropy/resegment.py b/ocrd_cis/ocropy/resegment.py index c1809569..0fb133c0 100644 --- a/ocrd_cis/ocropy/resegment.py +++ 
b/ocrd_cis/ocropy/resegment.py @@ -9,7 +9,6 @@ from shapely.prepared import prep from ocrd_utils import ( - getLogger, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -17,8 +16,7 @@ transform_coordinates, ) from ocrd_models.ocrd_page import BaselineType, PageType, OcrdPage -from ocrd import Processor -from ocrd.processor import OcrdPageResult +from ocrd import Processor, OcrdPageResult from .ocrolib import midrange, morph from .common import ( @@ -43,15 +41,10 @@ ) class OcropyResegment(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-resegment' - def setup(self): - self.logger = getLogger('processor.OcropyResegment') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Resegment lines of the workspace. diff --git a/ocrd_cis/ocropy/segment.py b/ocrd_cis/ocropy/segment.py index b363cbd2..493deb30 100644 --- a/ocrd_cis/ocropy/segment.py +++ b/ocrd_cis/ocropy/segment.py @@ -16,7 +16,6 @@ from shapely import set_precision from ocrd_utils import ( - getLogger, coordinates_of_segment, coordinates_for_segment, points_from_polygon, @@ -243,21 +242,17 @@ def getx(xy): class OcropySegment(Processor): - logger: Logger - @property def executable(self): return 'ocrd-cis-ocropy-segment' - def setup(self): - self.logger = getLogger('processor.OcropySegment') - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Segment pages into regions+lines, tables into cells+lines, or regions into lines. - + Open and deserialise PAGE input file and its respective images, then iterate over the element hierarchy down to the requested level. - + + \b Depending on ``level-of-operation``, consider existing segments: - If ``overwrite_separators=True`` on ``page`` level, then delete any SeparatorRegions. 
@@ -270,12 +265,13 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If ``overwrite_order=True`` on ``page`` or ``table`` level, then delete the reading order OrderedGroup entry corresponding to the (page/table) segment. - + Next, get each element image according to the layout annotation (from the alternative image of the page/region, or by cropping via coordinates into the higher-level image) in binarized form, and represent it as an array with non-text regions and (remaining) text neighbours suppressed. - + + \b Then compute a text line segmentation for that array (as a label mask). When ``level-of-operation`` is ``page`` or ``table``, this also entails detecting @@ -284,25 +280,26 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - up to ``maxcolseps`` background column separators before text line segmentation itself, as well as aggregating text lines to text regions afterwards. - + Text regions are detected via a hybrid variant recursive X-Y cut algorithm (RXYC): RXYC partitions the binarized image in top-down manner by detecting horizontal or vertical gaps. This implementation uses the bottom-up text line segmentation to guide the search, and also uses both pre-existing and newly detected separators to alternatively partition the respective boxes into non-rectangular parts. - + During line segmentation, suppress the foreground of all previously annotated regions (of any kind) and lines, except if just removed due to ``overwrite``. During region aggregation however, combine the existing separators with the new-found separators to guide the column search. - + All detected segments (both text line and text region) are sorted according to their reading order (assuming a top-to-bottom, left-to-right ordering). When ``level-of-operation`` is ``page``, prefer vertical (column-first) succession of regions. When it is ``table``, prefer horizontal (row-first) succession of cells. 
- + + \b Then for each resulting segment label, convert its background mask into polygon outlines by finding the outer contours consistent with the element's polygon outline. Annotate the result by adding it as a new TextLine/TextRegion: @@ -314,7 +311,7 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional - If it is ``page``, then append the new lines to their respective regions, and append the new regions to the page. (Also, create an OrderedGroup for it in the ReadingOrder.) - + Produce a new output file by serialising the resulting hierarchy. """ # FIXME: allow passing a-priori info on reading order / textline order @@ -495,13 +492,13 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non Given a PageType, TableRegionType or TextRegionType ``element``, and a corresponding binarized PIL.Image object ``image`` with coordinate metadata ``coords``, run line segmentation with Ocropy. - + If operating on the full page (or table), then also detect horizontal and vertical separators, and aggregate the lines into text regions afterwards. - + Add the resulting sub-segments to the parent ``element``. - + If ``ignore`` is not empty, then first suppress all foreground components in any of those segments' coordinates during segmentation, and if also in full page/table mode, then combine all separators among them with the @@ -773,7 +770,7 @@ def _process_element(self, element, ignore, image, coords, zoom=1.0, rogroup=Non def polygon_for_parent(polygon, parent): """Clip polygon to parent polygon range. - + (Should be moved to ocrd_utils.coordinates_for_segment.) """ childp = Polygon(polygon) @@ -986,7 +983,7 @@ def join_baselines(logger: Logger, baselines, loc=''): def page_get_reading_order(ro, rogroup): """Add all elements from the given reading order group to the given dictionary. 
- + Given a dict ``ro`` from layout element IDs to ReadingOrder element objects, and an object ``rogroup`` with additional ReadingOrder element objects, add all references to the dict, traversing the group recursively. @@ -1006,10 +1003,10 @@ def page_get_reading_order(ro, rogroup): def page_add_to_reading_order(rogroup, region_id, index=None): """Add a region reference to an un/ordered RO group. - + Given a ReadingOrder group ``rogroup`` (of any type), append a reference to region ``region_id`` to it. - + If ``index`` is given, use that as position and return incremented by one. (This must be an integer if ``rogroup`` is an OrderedGroup(Indexed). @@ -1025,16 +1022,16 @@ def page_add_to_reading_order(rogroup, region_id, index=None): def page_subgroup_in_reading_order(logger: Logger, roelem): """Replace given RO element by an equivalent OrderedGroup. - + Given a ReadingOrder element ``roelem`` (of any type), first look up its parent group. Remove it from the respective member list (of its region refs or un/ordered groups), even if it already was an OrderedGroup(Indexed). - + Then instantiate an empty OrderedGroup(Indexed), referencing the same region as ``roelem`` (and using the same index, if any). Add that group to the parent instead. - + Return the new group object. 
""" if not roelem: diff --git a/ocrd_cis/ocropy/train.py b/ocrd_cis/ocropy/train.py index 6c627231..78302f12 100644 --- a/ocrd_cis/ocropy/train.py +++ b/ocrd_cis/ocropy/train.py @@ -7,9 +7,7 @@ from os.path import abspath, dirname, exists, join, isfile from ocrd_models import OcrdPage -from ocrd import Processor, Workspace -from ocrd.processor import OcrdPageResult -from ocrd_utils import getLogger +from ocrd import Processor, Workspace, OcrdPageResult from .ocropus_rtrain import * from .binarize import binarize @@ -30,9 +28,7 @@ def resize_keep_ratio(image, baseheight=48): class OcropyTrain(Processor): - logger: Logger modelpath: str - old_cwd: str outputpath: str @property @@ -40,7 +36,6 @@ def executable(self): return 'ocrd-cis-ocropy-train' def setup(self): - self.logger = getLogger('processor.OcropyTrain') if 'model' in self.parameter: model = self.parameter['model'] try: diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index 71fbaad1..6759b96a 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -4,10 +4,9 @@ import click import json -from ocrd import Processor +from ocrd import Processor, Workspace from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor -from ocrd_utils import getLogger, getLevelName -from ocrd_models.ocrd_mets import OcrdMets +from ocrd_utils import getLevelName, pushd_popd from ocrd_cis import JavaPostCorrector @@ -21,26 +20,38 @@ class PostCorrector(Processor): def executable(self): return 'ocrd-cis-postcorrect' - def process(self): + def setup(self): + # since ocrd v3.0 we cannot overwrite self.parameter anymore + # because that gets validated against the schema + # (so these additions would fail) + self.params = dict(self.parameter) profiler = {} profiler["path"] = self.parameter["profilerPath"] profiler["config"] = self.parameter["profilerConfig"] profiler["noCache"] = True - self.parameter["profiler"] = profiler - self.parameter["runDM"] = True - 
self.logger.debug(json.dumps(self.parameter, indent=4)) - p = JavaPostCorrector(self.workspace.mets_target, - self.input_file_grp, - self.output_file_grp, - self.parameter, - getLevelName(self.logger.getEffectiveLevel())) - p.exe() - # reload the mets file to prevent run_processor's save_mets - # from overriding the results from the Java process - self.workspace.reload_mets() - # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): - for output_file in self.workspace.find_files(file_grp=self.output_file_grp): - flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') - flocat.attrib['LOCTYPE'] = 'OTHER' - flocat.attrib['OTHERLOCTYPE'] = 'FILE' - output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + self.params["profiler"] = profiler + self.params["runDM"] = True + self.logger.debug(json.dumps(self.params, indent=4)) + + def process_workspace(self, workspace: Workspace): + with pushd_popd(workspace.directory): + self.workspace = workspace + self.verify() + # this CLI call mimics the OCR-D processor CLI itself + # we have no control over its interior + # (we get no page-wise error handling and input downloading) + p = JavaPostCorrector(self.workspace.mets_target, + self.input_file_grp, + self.output_file_grp, + self.params, + getLevelName(self.logger.getEffectiveLevel())) + p.exe() + # reload the mets file to prevent run_processor's save_mets + # from overriding the results from the Java process + self.workspace.reload_mets() + # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): + for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') + flocat.attrib['LOCTYPE'] = 'OTHER' + flocat.attrib['OTHERLOCTYPE'] = 'FILE' + output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) From bd0613a20fd4d7d88a466cc75f3e94be656f08bf Mon Sep 17 
00:00:00 2001 From: Robert Sachunsky Date: Mon, 26 Aug 2024 11:36:53 +0200 Subject: [PATCH 159/194] require ocrd>=3.0.0b1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 38f09abd..83cf28bb 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=3.0.0a1', + 'ocrd>=3.0.0b1', 'click', 'scipy', 'numpy>=1.17.0', From f6e437fc8d5ef7bbb51fa7b4f5d590a11c6fc627 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 14:46:41 +0200 Subject: [PATCH 160/194] add: simple github actions workflow --- .github/workflow/tests.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 .github/workflow/tests.yml diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml new file mode 100644 index 00000000..424409df --- /dev/null +++ b/.github/workflow/tests.yml @@ -0,0 +1,27 @@ +name: Test ocrd_cis installation and run tests + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + os: [ "ubuntu-22.04" ] + + steps: + - uses: actions/checkout@v4 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - name: Install ocrd_cis + run: make install + - name: Test ocrd_cis + run: make test From 403781a3c27e5fdb0cddcf311401dad1a24f83f8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 15:30:14 +0200 Subject: [PATCH 161/194] Update .github/workflow/tests.yml Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- .github/workflow/tests.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml index 424409df..24fa0bc7 100644 --- 
a/.github/workflow/tests.yml +++ b/.github/workflow/tests.yml @@ -2,9 +2,8 @@ name: Test ocrd_cis installation and run tests on: push: - branches: [ "master" ] pull_request: - branches: [ "master" ] + workflow_dispatch: jobs: build: From 97083bb71e724276385058bde9244cbdd21dce64 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 15:30:25 +0200 Subject: [PATCH 162/194] Update .github/workflow/tests.yml Co-authored-by: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> --- .github/workflow/tests.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml index 24fa0bc7..559297dd 100644 --- a/.github/workflow/tests.yml +++ b/.github/workflow/tests.yml @@ -20,7 +20,11 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: '11' - name: Install ocrd_cis run: make install - name: Test ocrd_cis - run: make test + run: make test V="" From 2b20e0c44da924a5b15379d86eb557acdf42b1f3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 15:49:11 +0200 Subject: [PATCH 163/194] fix: checkout ref --- .github/workflow/tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml index 559297dd..f95a09a4 100644 --- a/.github/workflow/tests.yml +++ b/.github/workflow/tests.yml @@ -15,7 +15,10 @@ jobs: os: [ "ubuntu-22.04" ] steps: - - uses: actions/checkout@v4 + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v5 with: From 86a08eb5cc471eef536bc2d050e80f768a728e43 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:08:48 +0200 Subject: [PATCH 164/194] Create GH Actions workflow: test.yml --- .github/workflows/test.yml | 33 
+++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 00000000..f95a09a4 --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,33 @@ +name: Test ocrd_cis installation and run tests + +on: + push: + pull_request: + workflow_dispatch: + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] + os: [ "ubuntu-22.04" ] + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - uses: actions/setup-java@v4 + with: + distribution: 'zulu' + java-version: '11' + - name: Install ocrd_cis + run: make install + - name: Test ocrd_cis + run: make test V="" From 1d7e9a0d5f72e66c92c07e15508ba330e130f6bb Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:18:40 +0200 Subject: [PATCH 165/194] delete: wrong path for workflows --- .github/workflow/tests.yml | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 .github/workflow/tests.yml diff --git a/.github/workflow/tests.yml b/.github/workflow/tests.yml deleted file mode 100644 index f95a09a4..00000000 --- a/.github/workflow/tests.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Test ocrd_cis installation and run tests - -on: - push: - pull_request: - workflow_dispatch: - -jobs: - build: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12" ] - os: [ "ubuntu-22.04" ] - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - ref: ${{ github.head_ref }} - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v5 - with: - python-version: ${{ 
matrix.python-version }} - - uses: actions/setup-java@v4 - with: - distribution: 'zulu' - java-version: '11' - - name: Install ocrd_cis - run: make install - - name: Test ocrd_cis - run: make test V="" From 224e86f5467c7506882792fa03397cbe032f69c9 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:27:55 +0200 Subject: [PATCH 166/194] fix: NaN error for python3.9+ --- ocrd_cis/ocropy/ocrolib/morph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 7d6ffc85..1ebfb204 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -343,7 +343,7 @@ def select_regions(binary,f,min=0,nbest=100000): return keep[labels] @checks(SEGMENTATION) -def all_neighbors(image, dist=1, bg=NaN): +def all_neighbors(image, dist=1, bg=float('nan')): """Given an image with labels, find all pairs of labels that are directly (up to ``dist``) neighboring each other, ignoring the label ``bg``.""" q = 100000 From a397531e549532675341c15b6c4a6fbef1f96818 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 16:29:37 +0200 Subject: [PATCH 167/194] fix: NaN in reading_order in morph.py --- ocrd_cis/ocropy/ocrolib/morph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/ocrolib/morph.py b/ocrd_cis/ocropy/ocrolib/morph.py index 1ebfb204..4b626e83 100644 --- a/ocrd_cis/ocropy/ocrolib/morph.py +++ b/ocrd_cis/ocropy/ocrolib/morph.py @@ -429,7 +429,7 @@ def reading_order(seg,rl=False,bt=False): segmap[1:] = 1 return segmap def pos(f,l): - return array([f(x) if x else nan for x in l]) + return array([f(x) if x else float('nan') for x in l]) ys = pos(sl.ycenter,objects) yorder = argsort(ys)[::-1 if bt else 1] groups = [[yorder[0]]] From 9cf83051b2f1875b0757eb1d81ff0a29b7f63047 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:18:36 +0200 Subject: [PATCH 168/194] fix type hints --- 
ocrd_cis/align/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index 5706461e..395f7b07 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -229,8 +229,8 @@ class Alignment: file_grp: str pcgts: OcrdPage region: TextRegionType - alignment: Alignment - def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: Alignment): + alignment: dict + def __init__(self, file_grp: str, pcgts: OcrdPage, region: TextRegionType, alignment: dict): self.file_grp = file_grp self.pcgts = pcgts self.region = region From a0c734dd3e357606bde1c121cd4e25c972087df6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:19:29 +0200 Subject: [PATCH 169/194] dewarp: make thread-safe --- ocrd_cis/ocropy/dewarp.py | 25 ++++++++++++------------- ocrd_cis/ocropy/ocrolib/lineest.py | 2 +- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/ocrd_cis/ocropy/dewarp.py b/ocrd_cis/ocropy/dewarp.py index e33ce024..a0d0ea5c 100644 --- a/ocrd_cis/ocropy/dewarp.py +++ b/ocrd_cis/ocropy/dewarp.py @@ -57,17 +57,6 @@ class OcropyDewarp(Processor): def executable(self): return 'ocrd-cis-ocropy-dewarp' - def setup(self): - # defaults from ocrolib.lineest: - self.lnorm = lineest.CenterNormalizer( - params=(self.parameter['range'], - self.parameter['smoothness'], - # let's not expose this for now - # (otherwise we must explain mutual - # dependency between smoothness - # and extra params) - 0.3)) - def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult: """Dewarp the lines of the workspace. 
@@ -94,6 +83,16 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional page_image, page_xywh, page_image_info = self.workspace.image_from_page(page, page_id) zoom = determine_zoom(self.logger, page_id, self.parameter['dpi'], page_image_info) + # defaults from ocrolib.lineest: + lnorm = lineest.CenterNormalizer( + params=(self.parameter['range'], + self.parameter['smoothness'], + # let's not expose this for now + # (otherwise we must explain mutual + # dependency between smoothness + # and extra params) + 0.3)) + regions = page.get_AllRegions(classes=['Text'], order='reading-order') if not regions: self.logger.warning(f'Page "{page_id}" contains no text regions') @@ -107,8 +106,8 @@ def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional self.logger.info(f"About to dewarp page '{page_id}' region '{region.id}' line '{line.id}'") try: dew_image = dewarp( - line_image, self.lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) - except InvalidLine as err: + line_image, lnorm, check=True, max_neighbour=self.parameter['max_neighbour'], zoom=zoom) + except (InvalidLine, AssertionError) as err: self.logger.error(f'Cannot dewarp line "{line.id}": {err}') continue except InadequateLine as err: diff --git a/ocrd_cis/ocropy/ocrolib/lineest.py b/ocrd_cis/ocropy/ocrolib/lineest.py index 42ef2237..392c7e4a 100644 --- a/ocrd_cis/ocropy/ocrolib/lineest.py +++ b/ocrd_cis/ocropy/ocrolib/lineest.py @@ -75,7 +75,7 @@ def measure(self,line): plt.plot(self.center) plt.ginput(1,1000) def dewarp(self,img,cval=0,dtype=np.dtype('f')): - assert img.shape==self.shape + assert img.shape==self.shape, f"input shape {img.shape} deviates from measured shape {self.shape}" h,w = img.shape # The actual image img is embedded into a larger image by # adding vertical space on top and at the bottom (padding) From 66baaf07f60532185a41ea606c31964ee046c8ba Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 
11:21:19 +0200 Subject: [PATCH 170/194] recognize: disallow multithreading (impossible with current lstm implementation) --- ocrd_cis/ocropy/recognize.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 85a76585..97bec8a7 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -45,7 +45,7 @@ def recognize(image, pad, network, check=True): pred = network.predictString(line) # getting confidence - result = lstm.translate_back(network.outputs, pos=1) + result = lstm.translate_back(network.outputs, pos=1) # raw positions scale = len(raw_line.T) * 1.0 / (len(network.outputs) - 2 * pad) clist = [] @@ -68,6 +68,8 @@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): network: Any pad: int + # lstm is not thread-safe (.outputs, .last_n as side effects etc) + max_workers = 1 @property def executable(self): @@ -191,7 +193,7 @@ def process_lines(self, textlines, maxlevel, region_image, region_xywh): try: linepred, clist, rlist, confidlist = recognize(final_img, self.pad, self.network, check=True) except Exception as err: - self.logger.debug(f'Error processing line "{line.id}": {err}') + self.logger.debug(f'Error processing line "{line.id}": {str(err) or err.__class__.__name__}') continue self.logger.debug(f"OCR '{line.id}': '{linepred}'") edits += Levenshtein.distance(linepred, linegt) From 32ce6560d9c1e10fdfd00055e567b0fe13187404 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:22:14 +0200 Subject: [PATCH 171/194] postcorrect: make work under METS Server --- ocrd_cis/postcorrect/cli.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/ocrd_cis/postcorrect/cli.py b/ocrd_cis/postcorrect/cli.py index 6759b96a..70918de7 100644 --- a/ocrd_cis/postcorrect/cli.py +++ b/ocrd_cis/postcorrect/cli.py @@ -1,12 +1,14 @@ from __future__ import absolute_import import os +import json 
import click -import json from ocrd import Processor, Workspace from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor from ocrd_utils import getLevelName, pushd_popd +from ocrd_models import OcrdMets + from ocrd_cis import JavaPostCorrector @@ -37,6 +39,8 @@ def process_workspace(self, workspace: Workspace): with pushd_popd(workspace.directory): self.workspace = workspace self.verify() + # ensure that input files are referenced in on-disk METS + self.workspace.save_mets() # this CLI call mimics the OCR-D processor CLI itself # we have no control over its interior # (we get no page-wise error handling and input downloading) @@ -46,12 +50,23 @@ def process_workspace(self, workspace: Workspace): self.params, getLevelName(self.logger.getEffectiveLevel())) p.exe() - # reload the mets file to prevent run_processor's save_mets - # from overriding the results from the Java process - self.workspace.reload_mets() # workaround for cisocrgroup/ocrd-postcorrection#13 (absolute paths in output): - for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + # We cannot do that with this method, because our self.workspace.mets might be + # a ClientSideOcrdMets, which does not allow modifying or removing files: + # for output_file in self.workspace.find_files(file_grp=self.output_file_grp): + # flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') + # flocat.attrib['LOCTYPE'] = 'OTHER' + # flocat.attrib['OTHERLOCTYPE'] = 'FILE' + # output_file.local_filename = os.path.relpath(output_file.local_filename, self.workspace.directory) + # So instead, let's post-process the local METS file result directly: + mets = OcrdMets(filename=self.workspace.mets_target) + for output_file in mets.find_files(fileGrp=self.output_file_grp): flocat = output_file._el.find('{http://www.loc.gov/METS/}FLocat') flocat.attrib['LOCTYPE'] = 'OTHER' flocat.attrib['OTHERLOCTYPE'] = 'FILE' output_file.local_filename = os.path.relpath(output_file.local_filename, 
self.workspace.directory) + with open(self.workspace.mets_target, 'w') as f: + f.write(mets.to_xml(xmllint=True).decode('utf-8')) + # reload the mets file to prevent run_processor's save_mets + # from overriding the results from the Java process + self.workspace.reload_mets() From c4a5999d905d23a8e347eed2b257363c0c2545af Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:24:41 +0200 Subject: [PATCH 172/194] tests: use METS Server if OCRD_MAX_PARALLEL_PAGES>1 --- tests/run_add_zip_test.bash | 5 +-- tests/run_alignment_test.bash | 5 +-- tests/run_image_preprocessing_test.bash | 15 +++++---- tests/run_ocr_test.bash | 7 ++-- tests/run_postcorrection_test.bash | 19 +++++------ tests/run_training_test.bash | 7 ++-- tests/test_lib.bash | 43 ++++++++++++++++++++----- 7 files changed, 68 insertions(+), 33 deletions(-) diff --git a/tests/run_add_zip_test.bash b/tests/run_add_zip_test.bash index 02de2db2..e2d44983 100644 --- a/tests/run_add_zip_test.bash +++ b/tests/run_add_zip_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-GT-SEG-LINE); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-GT-SEG-LINE); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -16,9 +16,10 @@ popd # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G OCR-D-IMG); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-IMG); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" popd + diff --git a/tests/run_alignment_test.bash b/tests/run_alignment_test.bash index e8a3c79a..7a82254b 100644 --- a/tests/run_alignment_test.bash +++ b/tests/run_alignment_test.bash @@ -6,7 
+6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -17,9 +17,10 @@ ocrd_cis_align pushd $tmpws found_files=0 -for file in $(ocrd workspace find -G OCR-D-CIS-ALIGN); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-CIS-ALIGN); do [[ -f "$file" ]] || fail "cannot find aligned file group workspace" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" popd + diff --git a/tests/run_image_preprocessing_test.bash b/tests/run_image_preprocessing_test.bash index f80fc636..7a66a57b 100644 --- a/tests/run_image_preprocessing_test.bash +++ b/tests/run_image_preprocessing_test.bash @@ -7,16 +7,17 @@ ocrd_cis_init_ws "blumenbach_anatomie_1805.ocrd.zip" # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done (( found_files == 3 )) || fail "invalid number of files: $found_files" -ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN -ocrd-cis-ocropy-clip -l DEBUG -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-IMG-CLIP -ocrd-cis-ocropy-denoise -l DEBUG -I OCR-D-CIS-IMG-CLIP -O OCR-D-CIS-IMG-DEN -ocrd-cis-ocropy-deskew -l DEBUG -I OCR-D-CIS-IMG-DEN -O OCR-D-CIS-IMG-DES -ocrd-cis-ocropy-dewarp -l DEBUG -I OCR-D-CIS-IMG-DES -O OCR-D-CIS-IMG-DEW -ocrd-cis-ocropy-segment -l DEBUG -I OCR-D-CIS-IMG-DEW -O OCR-D-CIS-IMG-SEG +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-ocropy-binarize 
${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN +ocrd-cis-ocropy-clip ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-IMG-CLIP +ocrd-cis-ocropy-denoise ${ARGS[*]} -I OCR-D-CIS-IMG-CLIP -O OCR-D-CIS-IMG-DEN +ocrd-cis-ocropy-deskew ${ARGS[*]} -I OCR-D-CIS-IMG-DEN -O OCR-D-CIS-IMG-DES +ocrd-cis-ocropy-dewarp ${ARGS[*]} -I OCR-D-CIS-IMG-DES -O OCR-D-CIS-IMG-DEW +ocrd-cis-ocropy-segment ${ARGS[*]} -I OCR-D-CIS-IMG-DEW -O OCR-D-CIS-IMG-SEG popd diff --git a/tests/run_ocr_test.bash b/tests/run_ocr_test.bash index b10f6f6d..f737ae43 100644 --- a/tests/run_ocr_test.bash +++ b/tests/run_ocr_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -16,8 +16,9 @@ done ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz # run ocr -ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN -ocrd-cis-ocropy-recognize -l DEBUG -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR \ +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN +ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR \ -P textequiv_level word -P model fraktur.pyrnn.gz popd diff --git a/tests/run_postcorrection_test.bash b/tests/run_postcorrection_test.bash index d7f34ace..859c8407 100644 --- a/tests/run_postcorrection_test.bash +++ b/tests/run_postcorrection_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" 
]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -15,25 +15,26 @@ popd ocrd_cis_align -mkdir "$tmpdir/bin" -cat > "$tmpdir/bin/profiler.bash" < "bin/profiler.bash" < /dev/null echo '{}' EOF -chmod a+x "$tmpdir/bin/profiler.bash" -ocrd-cis-postcorrect -l DEBUG \ +chmod a+x "bin/profiler.bash" + +ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) +ocrd-cis-postcorrect ${ARGS[*]} \ -I OCR-D-CIS-ALIGN \ -O OCR-D-CIS-POSTCORRECT \ - -m $tmpws/mets.xml \ - -P profilerPath $tmpdir/bin/profiler.bash \ + -P profilerPath bin/profiler.bash \ -P profilerConfig ignored \ -P model "$(ocrd-cis-data -model)" \ -P nOCR 2 -pushd $tmpws found_files=0 -for file in $(ocrd workspace find -G OCR-D-CIS-POSTCORRECT); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G OCR-D-CIS-POSTCORRECT); do [[ -f "$file" ]] || fail "$file: not a file" found_files=$((found_files + 1)) done diff --git a/tests/run_training_test.bash b/tests/run_training_test.bash index ade1b68e..5b96dc3e 100644 --- a/tests/run_training_test.bash +++ b/tests/run_training_test.bash @@ -6,7 +6,7 @@ ocrd_cis_init_ws blumenbach_anatomie_1805.ocrd.zip # test if there are 3 gt files pushd "$tmpws" found_files=0 -for file in $(ocrd workspace find -G $OCRD_CIS_FILEGRP); do +for file in $(ocrd ${OCRD_LOG_ARGS[*]} workspace ${OCRD_WS_ARGS[*]} find -G $OCRD_CIS_FILEGRP); do [[ -f "$file" ]] || fail "cannot find ground truth file: $file" found_files=$((found_files + 1)) done @@ -15,9 +15,12 @@ popd ocrd_cis_align +stopserver +OCRD_MAX_PARALLEL_PAGES=1 + # fix ocr for some entries (otherwise the training will fail) pushd $tmpws -for f in $(ocrd workspace find -G OCR-D-CIS-ALIGN); do +for f in $(ocrd ${OCRD_LOG_ARGS[*]} workspace find -G OCR-D-CIS-ALIGN); do sed -i -e 's#e.#Säugethiere.#' $f sed -i -e 's#E#Säugethieren#' $f done diff --git a/tests/test_lib.bash b/tests/test_lib.bash index 801be01a..76111d25 100644 --- a/tests/test_lib.bash +++ b/tests/test_lib.bash @@ 
-1,10 +1,27 @@ #/bin/bash tmpdir=$(mktemp -d) -trap "trap 'echo exiting without removing $tmpdir' EXIT" ERR -trap "rm -rf $tmpdir" EXIT +function stopserver() { + : +} +function failexit() { + stopserver +} +function cleanexit() { + stopserver + rm -rf $tmpdir +} +trap "trap failexit EXIT" ERR +trap cleanexit EXIT + +OCRD_LOG_ARGS=() +if test -v OCRD_OVERRIDE_LOGLEVEL; then + OCRD_LOG_ARGS+=(-l $OCRD_OVERRIDE_LOGLEVEL) +fi +OCRD_WS_ARGS=() # -m mets.xml OCRD_CIS_FILEGRP="OCR-D-GT-SEG-LINE" + data_url="https://github.com/OCR-D/gt_structure_text/releases/download/v1.5.0/" function ocrd_cis_download_bagit() { local url="$data_url/$1" @@ -16,22 +33,32 @@ function ocrd_cis_init_ws() { ocrd_cis_download_bagit "$1" ocrd zip spill -d "$tmpdir" "$PWD/download/$1" tmpws="$tmpdir/${1%.ocrd.zip}" + if ((${OCRD_MAX_PARALLEL_PAGES:-0} > 1)); then + echo starting METS server at $tmpws + ocrd workspace -d "$tmpws" -U "$tmpws/mets.sock" server start & + OCRD_WS_ARGS+=(-U "$tmpws/mets.sock") + sleep 1 + function stopserver() { + echo stopping METS server at $tmpws + ocrd workspace -d "$tmpws" -U "$tmpws/mets.sock" server stop || true + } + fi } + function ocrd_cis_align() { # download ocr models ocrd resmgr download ocrd-cis-ocropy-recognize fraktur.pyrnn.gz ocrd resmgr download ocrd-cis-ocropy-recognize fraktur-jze.pyrnn.gz # run ocr pushd $tmpws - ocrd-cis-ocropy-binarize -l DEBUG -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN - ocrd-cis-ocropy-recognize -l DEBUG \ - -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-1 \ + ARGS=(${OCRD_LOG_ARGS[*]} ${OCRD_WS_ARGS[*]}) + ocrd-cis-ocropy-binarize ${ARGS[*]} -I $OCRD_CIS_FILEGRP -O OCR-D-CIS-IMG-BIN + ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-1 \ -P textequiv_level word -P model fraktur.pyrnn.gz - ocrd-cis-ocropy-recognize -l DEBUG \ - -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-2 \ + ocrd-cis-ocropy-recognize ${ARGS[*]} -I OCR-D-CIS-IMG-BIN -O OCR-D-CIS-OCR-2 \ -P textequiv_level word -P model fraktur-jze.pyrnn.gz - 
ocrd-cis-align -l DEBUG -I OCR-D-CIS-OCR-1,OCR-D-CIS-OCR-2,$OCRD_CIS_FILEGRP \ + ocrd-cis-align ${ARGS[*]} -I OCR-D-CIS-OCR-1,OCR-D-CIS-OCR-2,$OCRD_CIS_FILEGRP \ -O OCR-D-CIS-ALIGN popd } From ae7dc671ab50104c0cf3f4dec6bf28fc3c1990ed Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 Sep 2024 11:25:35 +0200 Subject: [PATCH 173/194] make test: run serially and parallel, show times --- Makefile | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index a040cf9d..d1991df0 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,17 @@ docker-push: docker-build TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) .PHONY: $(TEST_SCRIPTS) $(TEST_SCRIPTS): - bash $@ $V + OCRD_MAX_PARALLEL_PAGES=1 /usr/bin/time -o test_serially.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V + OCRD_MAX_PARALLEL_PAGES=4 /usr/bin/time -o test_parallel.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V + +test: export OCRD_OVERRIDE_LOGLEVEL=DEBUG +test: export OCRD_MISSING_OUTPUT=ABORT +test: export OCRD_MAX_MISSING_OUTPUTS=-1 test: $(TEST_SCRIPTS) - @echo $^ + @echo =====single-threaded test results===== + @cat test_serially.log + @echo =====4-page-parallel test results===== + @cat test_parallel.log + @$(RM) test_serially.log test_parallel.log + .PHONY: install install-devel uninstall test docker-build docker-push From e540b108e0c7f14c1cfcf8579dd0722a41069ead Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 11:48:43 +0200 Subject: [PATCH 174/194] require ocrd>=3.0.0b4 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 83cf28bb..e8ea1cf3 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=3.0.0b1', + 'ocrd>=3.0.0b4', 'click', 'scipy', 'numpy>=1.17.0', From fe122ae4ac21e87e684af8c6b9aa02026bf0748c Mon Sep 17 
00:00:00 2001 From: Robert Sachunsky Date: Thu, 26 Sep 2024 01:28:50 +0000 Subject: [PATCH 175/194] segment: adapt to numpy deprecation --- ocrd_cis/ocropy/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index 3cb9e4c4..d78198e5 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -643,7 +643,7 @@ def compute_seplines(binary, scale, maxseps=0): sepdists.append(np.median(subdistances)) #LOG.debug("adding sublabel %d as sep %d (size %d [%s])", sublabel, numsep, sublabelsize, str(sublabelslice)) sepsizes = np.array(sepsizes) - sepslices = np.array(sepslices) + sepslices = np.array(sepslices, dtype=object) LOG.debug("detected %d separator candidates", numsep) DSAVE("seps-raw", sepmap[labels]) # now dilate+erode to link neighbouring candidates, From 99b348915bcf0c1d3ea0028ca43ac2448a0ee922 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 26 Sep 2024 01:28:50 +0000 Subject: [PATCH 176/194] segment: adapt to numpy deprecation --- ocrd_cis/ocropy/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocropy/common.py b/ocrd_cis/ocropy/common.py index c5b56ed0..bae4dac0 100644 --- a/ocrd_cis/ocropy/common.py +++ b/ocrd_cis/ocropy/common.py @@ -644,7 +644,7 @@ def compute_seplines(binary, scale, maxseps=0): sepdists.append(np.median(subdistances)) #LOG.debug("adding sublabel %d as sep %d (size %d [%s])", sublabel, numsep, sublabelsize, str(sublabelslice)) sepsizes = np.array(sepsizes) - sepslices = np.array(sepslices) + sepslices = np.array(sepslices, dtype=object) LOG.debug("detected %d separator candidates", numsep) DSAVE("seps-raw", sepmap[labels]) # now dilate+erode to link neighbouring candidates, From 56eaca7116dd5b21a2ebd456cd1b0237b8c09dc3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Thu, 10 Oct 2024 19:09:41 +0200 Subject: [PATCH 177/194] fix: levenshtein import --- ocrd_cis/align/cli.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/ocrd_cis/align/cli.py b/ocrd_cis/align/cli.py index ffe53fd8..7747622e 100644 --- a/ocrd_cis/align/cli.py +++ b/ocrd_cis/align/cli.py @@ -2,7 +2,7 @@ import click import json import os -import Levenshtein +from rapidfuzz.distance import Levenshtein from ocrd import Processor from ocrd.decorators import ocrd_cli_options from ocrd.decorators import ocrd_cli_wrap_processor From ca08c1af462769df84cf5e83aadf118da0d96865 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 11 Oct 2024 11:12:20 +0200 Subject: [PATCH 178/194] eval/stats: Levenshtein -> rapidfuzz.distance.Levenshtein --- ocrd_cis/div/eval.py | 2 +- ocrd_cis/div/stats.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/div/eval.py b/ocrd_cis/div/eval.py index 6efe90c6..f47682ff 100644 --- a/ocrd_cis/div/eval.py +++ b/ocrd_cis/div/eval.py @@ -1,6 +1,6 @@ import os from PIL import Image -from Levenshtein import distance +from rapidfuzz.distance.Levenshtein import distance path = '/mnt/c/Users/chris/Documents/projects/OCR-D/daten/gt/lines/' diff --git a/ocrd_cis/div/stats.py b/ocrd_cis/div/stats.py index ea385d98..6f9c9816 100644 --- a/ocrd_cis/div/stats.py +++ b/ocrd_cis/div/stats.py @@ -4,7 +4,7 @@ from ocrd import Processor from ocrd_cis import get_ocrd_tool from ocrd_models.ocrd_page_generateds import parse -from Levenshtein import distance +from rapidfuzz.distance import Levenshtein class Stats(Processor): @@ -81,7 +81,7 @@ def process(self): # print(line.get_TextEquiv()[2].dataType) unicodeline = line.get_TextEquiv()[i].Unicode - d[i] += distance(gtline, unicodeline) + d[i] += Levenshtein.distance(gtline, unicodeline) # words = line.get_Word() # for word in words: From dee1abf5c1cfcf3b8e111f4b3f8614e0f6fea214 Mon Sep 17 00:00:00 2001 From: kba Date: Fri, 11 Oct 2024 11:12:20 +0200 Subject: [PATCH 179/194] eval/stats: Levenshtein -> rapidfuzz.distance.Levenshtein --- ocrd_cis/div/eval.py | 2 +- ocrd_cis/div/stats.py | 4 ++-- 2 files changed, 
3 insertions(+), 3 deletions(-) diff --git a/ocrd_cis/div/eval.py b/ocrd_cis/div/eval.py index 6efe90c6..f47682ff 100644 --- a/ocrd_cis/div/eval.py +++ b/ocrd_cis/div/eval.py @@ -1,6 +1,6 @@ import os from PIL import Image -from Levenshtein import distance +from rapidfuzz.distance.Levenshtein import distance path = '/mnt/c/Users/chris/Documents/projects/OCR-D/daten/gt/lines/' diff --git a/ocrd_cis/div/stats.py b/ocrd_cis/div/stats.py index ea385d98..6f9c9816 100644 --- a/ocrd_cis/div/stats.py +++ b/ocrd_cis/div/stats.py @@ -4,7 +4,7 @@ from ocrd import Processor from ocrd_cis import get_ocrd_tool from ocrd_models.ocrd_page_generateds import parse -from Levenshtein import distance +from rapidfuzz.distance import Levenshtein class Stats(Processor): @@ -81,7 +81,7 @@ def process(self): # print(line.get_TextEquiv()[2].dataType) unicodeline = line.get_TextEquiv()[i].Unicode - d[i] += distance(gtline, unicodeline) + d[i] += Levenshtein.distance(gtline, unicodeline) # words = line.get_Word() # for word in words: From 817230b626a9d1c6d84fd868a05e77b4fa487005 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Feb 2025 13:36:52 +0100 Subject: [PATCH 180/194] require ocrd v3 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index e8ea1cf3..25dce03e 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ packages=find_packages(), include_package_data=True, install_requires=[ - 'ocrd>=3.0.0b4', + 'ocrd>=3.0.2', 'click', 'scipy', 'numpy>=1.17.0', From ec348fcf78a81506064e9d3fd1c83325d33a043c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Feb 2025 13:37:02 +0100 Subject: [PATCH 181/194] relax max_workers (now multiprocessing instead of multithreading) --- ocrd_cis/ocropy/recognize.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ocrd_cis/ocropy/recognize.py b/ocrd_cis/ocropy/recognize.py index 97bec8a7..55d91cc5 100644 --- a/ocrd_cis/ocropy/recognize.py +++ b/ocrd_cis/ocropy/recognize.py @@ -68,8 +68,6 
@@ def recognize(image, pad, network, check=True): class OcropyRecognize(Processor): network: Any pad: int - # lstm is not thread-safe (.outputs, .last_n as side effects etc) - max_workers = 1 @property def executable(self): From c022bba6ee4f0dc322cf2a1ac1f09ad7ee8490ab Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 12 Feb 2025 13:54:02 +0100 Subject: [PATCH 182/194] ocrd-tool.json: add dockerhub=ocrd/cis --- ocrd_cis/ocrd-tool.json | 1 + 1 file changed, 1 insertion(+) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index c2e20268..378d73ac 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -1,6 +1,7 @@ { "git_url": "https://github.com/cisocrgroup/ocrd_cis", "version": "0.1.5", + "dockerhub": "ocrd/cis", "tools": { "ocrd-cis-ocropy-binarize": { "executable": "ocrd-cis-ocropy-binarize", From ed8082c539c294385254f9455c4165c9b2a5c458 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 4 Mar 2025 07:32:18 +0100 Subject: [PATCH 183/194] make test: better stats --- .github/workflows/test.yml | 2 +- Makefile | 14 +++++++++++--- ...ssing_test.bash => run_preprocessing_test.bash} | 0 3 files changed, 12 insertions(+), 4 deletions(-) rename tests/{run_image_preprocessing_test.bash => run_preprocessing_test.bash} (100%) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f95a09a4..c50810f0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,4 +30,4 @@ jobs: - name: Install ocrd_cis run: make install - name: Test ocrd_cis - run: make test V="" + run: make test V= diff --git a/Makefile b/Makefile index d1991df0..3583e0d0 100644 --- a/Makefile +++ b/Makefile @@ -3,6 +3,7 @@ PIP ?= pip3 V ?= > /dev/null 2>&1 PKG = ocrd_cis TAG = flobar/ocrd_cis +SHELL = bash install: ${PIP} install --upgrade pip . 
@@ -20,18 +21,25 @@ docker-push: docker-build docker push $(TAG):latest TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) +INDENT != MAX=; for NAME in $(TEST_SCRIPTS:tests/%=%); do if test $${\#MAX} -lt $${\#NAME}; then MAX=$${NAME//?/_}; fi; done; echo $$MAX +indent = `WHAT=$1; WITH=$(INDENT); echo $$WHAT$${WITH:$${\#WHAT}}` +format_tr = "$(call indent,$1):\t%U\t%S\t%E\t%P\t(%Mk)" +format_th = "$(call indent)\tuser\tsystem\telapsed\tCPU\tmaxRSS" + .PHONY: $(TEST_SCRIPTS) $(TEST_SCRIPTS): - OCRD_MAX_PARALLEL_PAGES=1 /usr/bin/time -o test_serially.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V - OCRD_MAX_PARALLEL_PAGES=4 /usr/bin/time -o test_parallel.log -a -f "$@: %Uuser %Ssystem %Eelapsed %PCPU (%Mmax)k" bash $@ $V + OCRD_MAX_PARALLEL_PAGES=1 /usr/bin/time -o test_serially.log -a -f $(call format_tr,$(@F)) bash -x $@ $V + OCRD_MAX_PARALLEL_PAGES=4 /usr/bin/time -o test_parallel.log -a -f $(call format_tr,$(@F)) bash -x $@ $V test: export OCRD_OVERRIDE_LOGLEVEL=DEBUG test: export OCRD_MISSING_OUTPUT=ABORT test: export OCRD_MAX_MISSING_OUTPUTS=-1 test: $(TEST_SCRIPTS) - @echo =====single-threaded test results===== + @echo =====single-processing test results===== + @echo -e $(call format_th) @cat test_serially.log @echo =====4-page-parallel test results===== + @echo -e $(call format_th) @cat test_parallel.log @$(RM) test_serially.log test_parallel.log diff --git a/tests/run_image_preprocessing_test.bash b/tests/run_preprocessing_test.bash similarity index 100% rename from tests/run_image_preprocessing_test.bash rename to tests/run_preprocessing_test.bash From 2854820d0abf07b89707a97848cfff65b0817b8e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 14:42:31 +0100 Subject: [PATCH 184/194] Docker: shortcut ocrd-all-tool.json via ocrd-tool.json --- .circleci/config.yml | 43 ------------------------------ .github/workflows/docker.yml | 51 ++++++++++++++++++++++++++++++++++++ Dockerfile 
| 30 ++++++++++++++------- Makefile | 19 +++++++++----- 4 files changed, 84 insertions(+), 59 deletions(-) delete mode 100644 .circleci/config.yml create mode 100644 .github/workflows/docker.yml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 5825a4e0..00000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,43 +0,0 @@ -version: 2.1 -jobs: - test-python3: - docker: - - image: ocrd/core - resource_class: large - environment: - PIP: pip3 - PYTHON: python3 - steps: - - checkout - - run: apt-get update && apt-get -y install default-jre-headless - - run: make install - - run: make -j test V="" - - deploy-docker: - docker: - - image: circleci/buildpack-deps:stretch - environment: - DOCKER_TAG: ocrd/cis - steps: - - checkout - - setup_remote_docker: # https://circleci.com/docs/2.0/building-docker-images/ - docker_layer_caching: true - - run: make docker-build TAG=$DOCKER_TAG - - run: - name: Login to Docker Hub - command: echo "$DOCKERHUB_PASS" | docker login --username "$DOCKERHUB_USER" --password-stdin - - run: docker push $DOCKER_TAG - -workflows: - version: 2 - build-and-test: - jobs: - - test-python3 - deploy: - jobs: - - deploy-docker: - filters: - branches: - only: - - master - - fix-alpha-shape diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml new file mode 100644 index 00000000..a9f766d5 --- /dev/null +++ b/.github/workflows/docker.yml @@ -0,0 +1,51 @@ +name: Docker Image CI + +on: + workflow_dispatch: + push: + branches: [ "master", "fix-alpha-shape" ] + +env: + REPO_NAME: ${{ github.repository }} + +jobs: + + build: + + runs-on: ubuntu-latest + permissions: + packages: write + contents: read + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + # we need tags for docker version tagging + fetch-tags: true + fetch-depth: 0 + - # Activate cache export feature to reduce build time of images + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to GitHub Container 
Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Log in to Docker Hub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERIO_USERNAME }} + password: ${{ secrets.DOCKERIO_PASSWORD }} + - name: define image name from repo name + run: echo "IMAGE_NAME=ghcr.io/${REPO_NAME,,}" >> $GITHUB_ENV + - name: Build the Docker image + # build both tags at the same time + run: make docker-build DOCKER_TAG="docker.io/ocrd/cis -t ${{ env.IMAGE_NAME }}" + - name: Test the Docker image + run: docker run --rm ${{ env.IMAGE_NAME }} ocrd-cis-ocropy-segment -h + - name: Push to Dockerhub + run: docker push docker.io/ocrd/cis + - name: Push to Github Container Registry + run: docker push ${{ env.IMAGE_NAME }} + diff --git a/Dockerfile b/Dockerfile index e7b2249a..cffb7475 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,22 @@ -FROM ocrd/core:v2.67.2 AS base +ARG DOCKER_BASE_IMAGE +FROM $DOCKER_BASE_IMAGE AS base ARG VCS_REF ARG BUILD_DATE LABEL \ - maintainer="https://github.com/OCR-D/ocrd_cis/issues" \ + maintainer="https://github.com/cisocrgroup/ocrd_cis/issues" \ org.label-schema.vcs-ref=$VCS_REF \ - org.label-schema.vcs-url="https://github.com/OCR-D/ocrd_cis" \ - org.label-schema.build-date=$BUILD_DATE + org.label-schema.vcs-url="https://github.com/cisocrgroup/ocrd_cis" \ + org.label-schema.build-date=$BUILD_DATE \ + org.opencontainers.image.vendor="DFG-Funded Initiative for Optical Character Recognition Development" \ + org.opencontainers.image.title="ocrd_cis" \ + org.opencontainers.image.description="Ocropy OCR and CIS post-correction bindings" \ + org.opencontainers.image.source="https://github.com/cisocrgroup/ocrd_cis" \ + org.opencontainers.image.documentation="https://github.com/cisocrgroup/ocrd_cis/blob/${VCS_REF}/README.md" \ + org.opencontainers.image.revision=$VCS_REF \ + org.opencontainers.image.created=$BUILD_DATE \ + 
org.opencontainers.image.base.name=ocrd/core ENV GITURL="https://github.com/cisocrgroup" -ENV DOWNLOAD_URL="http://cis.lmu.de/~finkf" SHELL ["/bin/bash", "-c"] @@ -51,19 +59,23 @@ RUN apt-get update \ FROM base AS postcorrection # install ocrd_cis (python) -VOLUME ["/data"] +WORKDIR /build/ocrd_cis COPY --from=languagemodel /etc/profiler/languages /etc/profiler/languages COPY --from=profiler /apps/profiler /apps/ COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicuuc.so /usr/lib//x86_64-linux-gnu/ COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicudata.so /usr/lib//x86_64-linux-gnu/ COPY --from=profiler /usr/lib//x86_64-linux-gnu/libxerces-c-3.2.so /usr/lib//x86_64-linux-gnu/ -COPY . /build/ocrd_cis +COPY . . +# prepackage ocrd-tool.json as ocrd-all-tool.json +RUN ocrd ocrd-tool ocrd_cis/ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +# install everything and reduce image size RUN apt-get update \ && apt-get -y install --no-install-recommends gcc wget default-jre-headless \ - && pushd /build/ocrd_cis \ && make install \ # test always fail, resources not available for download. Resources should be made available # somewhere else, e.g. github.com/OCR-D/assets # && make test \ - && popd \ && rm -rf /build/ocrd_cis + +WORKDIR /data +VOLUME /data diff --git a/Makefile b/Makefile index 3583e0d0..1d3e9930 100644 --- a/Makefile +++ b/Makefile @@ -2,23 +2,28 @@ PY ?= python3 PIP ?= pip3 V ?= > /dev/null 2>&1 PKG = ocrd_cis -TAG = flobar/ocrd_cis +DOCKER_TAG = ocrd/cis +DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.1.0 SHELL = bash install: - ${PIP} install --upgrade pip . -install-devel: - ${PIP} install --upgrade pip -e . + ${PIP} install . + +install-devel install-dev: + ${PIP} install -e . 
+ uninstall: ${PIP} uninstall ${PKG} docker-build: Dockerfile docker build \ + --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ - -t $(TAG):latest . + -t $(DOCKER_TAG):latest . + docker-push: docker-build - docker push $(TAG):latest + docker push $(DOCKER_TAG):latest TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) INDENT != MAX=; for NAME in $(TEST_SCRIPTS:tests/%=%); do if test $${\#MAX} -lt $${\#NAME}; then MAX=$${NAME//?/_}; fi; done; echo $$MAX @@ -43,4 +48,4 @@ test: $(TEST_SCRIPTS) @cat test_parallel.log @$(RM) test_serially.log test_parallel.log -.PHONY: install install-devel uninstall test docker-build docker-push +.PHONY: install install-dev install-devel uninstall test docker-build docker-push From 6ee159230f48341d75a2fe0a6eb8e5ab0f441c97 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 15:15:16 +0100 Subject: [PATCH 185/194] =?UTF-8?q?setup=E2=86=92pyproject?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 ++ pyproject.toml | 107 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 10 +++++ setup.py | 74 -------------------------------- 4 files changed, 120 insertions(+), 74 deletions(-) create mode 100644 pyproject.toml create mode 100644 requirements.txt delete mode 100644 setup.py diff --git a/.gitignore b/.gitignore index fb28879b..aca5a739 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,6 @@ env-dir/* /venv* /build /dist +TAGS +*.log +download/ diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..133bbf51 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,107 @@ +[build-system] +requires = ["setuptools>=61.0.0", "wheel", "setuptools-ocrd"] + +[project] +name = "ocrd_cis" +authors = [ + {name = "Florian Fink", email = "finkf@cis.lmu.de"}, + {name = "Tobias 
Englmeier", email = "englmeier@cis.lmu.de"}, + {name = "Christoph Weber", email = "web_chris@msn.com"}, + {name = "Robert Sachunsky", email = "sachunsky@informatik.uni-leipzig.de"}, +] +description = "CIS OCR-D post-correction tools and improved Ocropy1" +readme = "README.md" +license = {text = "MIT"} +requires-python = ">=3.8" +keywords = ["ocr", "ocr-d", "ocropus-ocr", "post-correction"] + +dynamic = ["version", "dependencies"] + +# https://pypi.org/classifiers/ +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: Science/Research", + "Intended Audience :: Other Audience", + "License :: OSI Approved :: MIT Software License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Topic :: Text Processing", +] + +[project.scripts] +ocrd-cis-align = "ocrd_cis.align.cli:ocrd_cis_align" +ocrd-cis-postcorrect = "ocrd_cis.postcorrect.cli:ocrd_cis_postcorrect" +ocrd-cis-data = "ocrd_cis.data.__main__:main" +ocrd-cis-ocropy-train = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_train" +ocrd-cis-ocropy-recognize = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_recognize" +ocrd-cis-ocropy-segment = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_segment" +ocrd-cis-ocropy-resegment = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_resegment" +ocrd-cis-ocropy-clip = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_clip" +ocrd-cis-ocropy-dewarp = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_dewarp" +ocrd-cis-ocropy-deskew = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_deskew" +ocrd-cis-ocropy-denoise = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_denoise" +ocrd-cis-ocropy-binarize = "ocrd_cis.ocropy.cli:ocrd_cis_ocropy_binarize" + +[project.urls] +Homepage = "https://github.com/cisocrgroup/ocrd_cis" +Repository = "https://github.com/cisocrgroup/ocrd_cis.git" + +[project.optional-dependencies] +debug = ["matplotlib>3.0.0"] + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + +[tool.setuptools] +packages = ["ocrd_cis", 
"ocrd_cis.postcorrect", "ocrd_cis.aio", "ocrd_cis.data", "ocrd_cis.wer", "ocrd_cis.ocropy", "ocrd_cis.ocropy.ocrolib", "ocrd_cis.div", "ocrd_cis.align"] +package-data = {"*" = ["*.json", "*.jar", "model.zip", "3gs.csv.gz"]} + +[tool.pytest.ini_options] +minversion = 6.0 +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] + + +[tool.mypy] +plugins = ["numpy.typing.mypy_plugin"] + +ignore_missing_imports = true + + +strict = true + +disallow_subclassing_any = false +# ❗ error: Class cannot subclass "Processor" (has type "Any") +disallow_any_generics = false +disallow_untyped_defs = false +disallow_untyped_calls = false + + +[tool.ruff.lint] +select = ["E", "F", "I"] + + +[tool.coverage.run] +branch = true +source = [ + "ocrd_cis" +] +concurrency = [ + "thread", + "multiprocessing" +] + +[tool.coverage.report] +exclude_also = [ + "if self\\.debug", + "pragma: no cover", + "raise NotImplementedError", + "if __name__ == .__main__.:", +] +ignore_errors = true +omit = [ + "ocrd_cis/*/cli" +] diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..a57112af --- /dev/null +++ b/requirements.txt @@ -0,0 +1,10 @@ +ocrd>=3.0.2 +click +scipy +numpy>=1.17.0 +pillow>=7.1.2 +shapely>=2.0.0 +scikit-image +networkx +opencv-python-headless +rapidfuzz diff --git a/setup.py b/setup.py deleted file mode 100644 index 25dce03e..00000000 --- a/setup.py +++ /dev/null @@ -1,74 +0,0 @@ -""" -Installs: - - ocrd-cis-align - - ocrd-cis-postcorrect - - ocrd-cis-data - - ocrd-cis-ocropy-clip - - ocrd-cis-ocropy-denoise - - ocrd-cis-ocropy-deskew - - ocrd-cis-ocropy-binarize - - ocrd-cis-ocropy-resegment - - ocrd-cis-ocropy-segment - - ocrd-cis-ocropy-dewarp - - ocrd-cis-ocropy-recognize - - ocrd-cis-ocropy-train -""" - -import codecs -import json -from setuptools import setup -from setuptools import find_packages - -with codecs.open('README.md', encoding='utf-8') as f: - README = f.read() - -with open('./ocrd-tool.json', 'r') as f: - 
version = json.load(f)['version'] - -setup( - name='ocrd_cis', - version=version, - description='CIS OCR-D command line tools', - long_description=README, - long_description_content_type='text/markdown', - author='Florian Fink, Tobias Englmeier, Christoph Weber', - author_email='finkf@cis.lmu.de, englmeier@cis.lmu.de, web_chris@msn.com', - url='https://github.com/cisocrgroup/ocrd_cis', - license='MIT', - packages=find_packages(), - include_package_data=True, - install_requires=[ - 'ocrd>=3.0.2', - 'click', - 'scipy', - 'numpy>=1.17.0', - 'pillow>=7.1.2', - 'shapely>=1.7.1', - 'scikit-image', - 'networkx', - 'opencv-python-headless', - 'rapidfuzz' - ], - extras_require={ - 'debug': ['matplotlib>3.0.0'], - }, - package_data={ - '': ['*.json', '*.yml', '*.yaml', '*.csv.gz', '*.jar', '*.zip'], - }, - entry_points={ - 'console_scripts': [ - 'ocrd-cis-align=ocrd_cis.align.cli:ocrd_cis_align', - 'ocrd-cis-postcorrect=ocrd_cis.postcorrect.cli:ocrd_cis_postcorrect', - 'ocrd-cis-data=ocrd_cis.data.__main__:main', - 'ocrd-cis-ocropy-binarize=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_binarize', - 'ocrd-cis-ocropy-clip=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_clip', - 'ocrd-cis-ocropy-denoise=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_denoise', - 'ocrd-cis-ocropy-deskew=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_deskew', - 'ocrd-cis-ocropy-dewarp=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_dewarp', - 'ocrd-cis-ocropy-recognize=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_recognize', - 'ocrd-cis-ocropy-resegment=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_resegment', - 'ocrd-cis-ocropy-segment=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_segment', - 'ocrd-cis-ocropy-train=ocrd_cis.ocropy.cli:ocrd_cis_ocropy_train', - ] - }, -) From 89e046cd1ab55e007882d0ff3d68a7e2024267b2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 15:25:12 +0100 Subject: [PATCH 186/194] add 'build' and 'help' targets --- Makefile | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile 
index 1d3e9930..1f97072c 100644 --- a/Makefile +++ b/Makefile @@ -6,16 +6,36 @@ DOCKER_TAG = ocrd/cis DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.1.0 SHELL = bash +help: + @echo "" + @echo " Targets" + @echo "" + @echo " install Install ocrd_cis" + @echo " install-dev Install in editable mode" + @echo " build Build source and binary distribution" + @echo " docker Build Docker image" + @echo " test Run unit tests" + @echo "" + @echo " Variables" + @echo "" + @echo " DOCKER_TAG '$(DOCKER_TAG)'" + @echo " PY '$(PY)'" + @echo " PIP '$(PIP)'" + install: ${PIP} install . install-devel install-dev: ${PIP} install -e . +build: + ${PIP} install build + ${PY} -m build . + uninstall: ${PIP} uninstall ${PKG} -docker-build: Dockerfile +docker-build docker: Dockerfile docker build \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ @@ -48,4 +68,4 @@ test: $(TEST_SCRIPTS) @cat test_parallel.log @$(RM) test_serially.log test_parallel.log -.PHONY: install install-dev install-devel uninstall test docker-build docker-push +.PHONY: install install-dev install-devel build uninstall test docker docker-build docker-push From f31917b70dd54b7bdb0d6dbbefac0e4125c48544 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 15:25:34 +0100 Subject: [PATCH 187/194] :package: 0.2.0 --- ocrd_cis/ocrd-tool.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd_cis/ocrd-tool.json b/ocrd_cis/ocrd-tool.json index 378d73ac..472ea5ab 100644 --- a/ocrd_cis/ocrd-tool.json +++ b/ocrd_cis/ocrd-tool.json @@ -1,6 +1,6 @@ { "git_url": "https://github.com/cisocrgroup/ocrd_cis", - "version": "0.1.5", + "version": "0.2.0", "dockerhub": "ocrd/cis", "tools": { "ocrd-cis-ocropy-binarize": { From 61ed15a3c558e8038d55cc4beb5e4bed716ca020 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 17:11:10 +0100 Subject: [PATCH 188/194] add PyPI CD --- .github/workflows/pypi.yml | 29 +++++++++++++++++++++++++++++ 1 
file changed, 29 insertions(+) create mode 100644 .github/workflows/pypi.yml diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 00000000..1b239c0c --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,29 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: PyPI CD + +on: + release: + types: [published] + workflow_dispatch: + +jobs: + publish: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.8' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install setuptools wheel build twine + pip install -r requirements.txt + - name: Build and publish + env: + TWINE_USERNAME: __token__ + TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} + run: twine upload --verbose dist/ocrd*${{ github.ref_name }}*{tar.gz,whl} From 2cf3c85ced72df0de28573db9f2ea531beaf0a42 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 17:31:00 +0100 Subject: [PATCH 189/194] PyPI CD: strip 'v' prefix from git tag --- .github/workflows/pypi.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 1b239c0c..17860add 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -26,4 +26,6 @@ jobs: env: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} - run: twine upload --verbose dist/ocrd*${{ github.ref_name }}*{tar.gz,whl} + run: | + version=${{ github.ref_name }} + twine upload --verbose dist/ocrd*${version:1}*{tar.gz,whl} From a8210ed3592107072ec91d6a07d89fa211e7d1f9 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 20:27:48 +0100 Subject: [PATCH 190/194] PyPI CD: use whatever is in dist --- 
.github/workflows/pypi.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 17860add..79309e74 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -27,5 +27,5 @@ jobs: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} run: | - version=${{ github.ref_name }} - twine upload --verbose dist/ocrd*${version:1}*{tar.gz,whl} + ls -l dist + twine upload --verbose dist/ocrd*{tar.gz,whl} From f4a41ce8be655a80bd36ab5b9a9d199e29a931f3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 20:30:53 +0100 Subject: [PATCH 191/194] PyPI CD: forgot the actual build! --- .github/workflows/pypi.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml index 79309e74..54c46713 100644 --- a/.github/workflows/pypi.yml +++ b/.github/workflows/pypi.yml @@ -27,5 +27,6 @@ jobs: TWINE_USERNAME: __token__ TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }} run: | + python -m build . 
ls -l dist twine upload --verbose dist/ocrd*{tar.gz,whl} From 5cf22f5baa093ffaf0049e3c9756094116273598 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 7 Mar 2025 22:10:08 +0100 Subject: [PATCH 192/194] fix license classifier for PyPI --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 133bbf51..6432dd27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ classifiers = [ "Environment :: Console", "Intended Audience :: Science/Research", "Intended Audience :: Other Audience", - "License :: OSI Approved :: MIT Software License", + "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", "Topic :: Text Processing", From 027ebe3614da172b2ed7ebc1279e6d3088d9728a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 2 May 2025 00:26:01 +0200 Subject: [PATCH 193/194] docker: prepackage ocrd-all-module-dir.json --- Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index cffb7475..0fa98fb4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -66,8 +66,11 @@ COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicuuc.so /usr/lib//x86_64-linu COPY --from=profiler /usr/lib/x86_64-linux-gnu/libicudata.so /usr/lib//x86_64-linux-gnu/ COPY --from=profiler /usr/lib//x86_64-linux-gnu/libxerces-c-3.2.so /usr/lib//x86_64-linux-gnu/ COPY . . +COPY ocrd-tool.json . 
# prepackage ocrd-tool.json as ocrd-all-tool.json -RUN ocrd ocrd-tool ocrd_cis/ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +RUN ocrd ocrd-tool ocrd-tool.json dump-tools > $(dirname $(ocrd bashlib filename))/ocrd-all-tool.json +# prepackage ocrd-all-module-dir.json +RUN ocrd ocrd-tool ocrd-tool.json dump-module-dirs > $(dirname $(ocrd bashlib filename))/ocrd-all-module-dir.json # install everything and reduce image size RUN apt-get update \ && apt-get -y install --no-install-recommends gcc wget default-jre-headless \ From 025409becd6dc01c42770f218f561507887d7ff5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 2 May 2025 00:25:57 +0200 Subject: [PATCH 194/194] docker: use latest core base stage --- Makefile | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 1f97072c..ac2edacc 100644 --- a/Makefile +++ b/Makefile @@ -2,8 +2,9 @@ PY ?= python3 PIP ?= pip3 V ?= > /dev/null 2>&1 PKG = ocrd_cis -DOCKER_TAG = ocrd/cis -DOCKER_BASE_IMAGE = docker.io/ocrd/core:v3.1.0 +DOCKER_TAG ?= ocrd/cis +DOCKER_BASE_IMAGE ?= docker.io/ocrd/core:latest +DOCKER ?= docker SHELL = bash help: @@ -36,14 +37,14 @@ uninstall: ${PIP} uninstall ${PKG} docker-build docker: Dockerfile - docker build \ + $(DOCKER) build \ --build-arg DOCKER_BASE_IMAGE=$(DOCKER_BASE_IMAGE) \ --build-arg VCS_REF=$$(git rev-parse --short HEAD) \ --build-arg BUILD_DATE=$$(date -u +"%Y-%m-%dT%H:%M:%SZ") \ -t $(DOCKER_TAG):latest . docker-push: docker-build - docker push $(DOCKER_TAG):latest + $(DOCKER) push $(DOCKER_TAG):latest TEST_SCRIPTS=$(sort $(filter-out tests/run_training_test.bash, $(wildcard tests/run_*.bash))) INDENT != MAX=; for NAME in $(TEST_SCRIPTS:tests/%=%); do if test $${\#MAX} -lt $${\#NAME}; then MAX=$${NAME//?/_}; fi; done; echo $$MAX